diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 9f8dc87..536c935 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -251,7 +251,7 @@ namespace CUETools.Codecs.FLACCL if (_settings.DeviceType == OpenCLDeviceType.CPU) { _settings.GroupSize = 1; - _settings.GPUOnly = false; + //_settings.GPUOnly = true; _settings.MappedMemory = true; } eparams.flake_set_defaults(_compressionLevel, !_settings.GPUOnly); @@ -2657,11 +2657,18 @@ namespace CUETools.Codecs.FLACCL max_porder, frameSize >> max_porder); - openCLCQ.EnqueueNDRangeKernel( - clCalcPartition, - groupSize, 1, - 1 + ((1 << max_porder) - 1) / (groupSize / 16), - channels * frameCount); + if (openCLCQ.Device.DeviceType == DeviceType.CPU) + openCLCQ.EnqueueNDRangeKernel( + clCalcPartition, + groupSize, 1, + 1, + channels * frameCount); + else + openCLCQ.EnqueueNDRangeKernel( + clCalcPartition, + groupSize, 1, + 1 + ((1 << max_porder) - 1) / (groupSize / 16), + channels * frameCount); } if (max_porder > 0) @@ -2670,26 +2677,32 @@ namespace CUETools.Codecs.FLACCL clPartitions, max_porder); - openCLCQ.EnqueueNDRangeKernel( - clSumPartition, - 128, 1, - (Flake.MAX_RICE_PARAM + 1), - channels * frameCount); + if (openCLCQ.Device.DeviceType == DeviceType.CPU) + openCLCQ.EnqueueNDRangeKernel( + clSumPartition, + 1, 1, 1, + channels * frameCount); + else + openCLCQ.EnqueueNDRangeKernel( + clSumPartition, + 128, 1, + (Flake.MAX_RICE_PARAM + 1), + channels * frameCount); } clFindRiceParameter.SetArgs( + clBestResidualTasks, clRiceParams, clPartitions, max_porder); openCLCQ.EnqueueNDRangeKernel( clFindRiceParameter, - groupSize, 1, - Math.Max(1, (2 << max_porder) / groupSize), - channels * frameCount); + groupSize, channels * frameCount); //if (max_porder > 0) // need to run even if max_porder==0 just to calculate the final frame size clFindPartitionOrder.SetArgs( + clResidual, clBestRiceParams, clBestResidualTasks, clRiceParams, diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index f0e2fb2..63f2d44 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -874,7 +874,7 @@ void clSumPartition( const int pos = (15 << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1)); // fetch partition lengths - int2 pl = get_local_id(0) * 2 < (1 << max_porder) ? *(__global int2*)&partition_lengths[pos + get_local_id(0) * 2] : 0; + int2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0; data[get_local_id(0)] = pl.x + pl.y; barrier(CLK_LOCAL_MEM_FENCE); @@ -893,22 +893,19 @@ void clSumPartition( partition_lengths[pos + (1 << max_porder) + get_local_size(0) + get_local_id(0)] = data[get_local_size(0) + get_local_id(0)]; } -// Finds optimal rice parameter for several partitions at a time. -// get_group_id(0) == chunk index (chunk size is GROUP_SIZE, total task size is (2 << max_porder)) -// get_group_id(1) == task index +// Finds optimal rice parameter for each partition. +// get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void clFindRiceParameter( + __global FLACCLSubframeTask *tasks, __global int* rice_parameters, __global int* partition_lengths, int max_porder ) { - const int tid = get_local_id(0); - const int parts = min(GROUP_SIZE, 2 << max_porder); - const int pos = (15 << (max_porder + 1)) * get_group_id(1) + get_group_id(0) * GROUP_SIZE + tid; - - if (tid < parts) + for (int offs = get_local_id(0); offs < (2 << max_porder); offs += GROUP_SIZE) { + const int pos = (15 << (max_porder + 1)) * get_group_id(0) + offs; int best_l = partition_lengths[pos]; int best_k = 0; for (int k = 1; k <= 14; k++) @@ -919,15 +916,16 @@ void clFindRiceParameter( } // output rice parameter - rice_parameters[(get_group_id(1) << (max_porder + 2)) + get_group_id(0) * GROUP_SIZE + tid] = best_k; + rice_parameters[(get_group_id(0) << (max_porder + 2)) + offs] = best_k; // output length - rice_parameters[(get_group_id(1) << (max_porder + 2)) + (1 << (max_porder + 1)) + get_group_id(0) * GROUP_SIZE + tid] = best_l; + rice_parameters[(get_group_id(0) << (max_porder + 2)) + (1 << (max_porder + 1)) + offs] = best_l; } } // get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) void clFindPartitionOrder( + __global int *residual, __global int* best_rice_parameters, __global FLACCLSubframeTask *tasks, __global int* rice_parameters, @@ -973,9 +971,8 @@ void clFindPartitionOrder( task.type == Constant ? obits : obits * task.blocksize; } barrier(CLK_LOCAL_MEM_FENCE); - for (int offs = 0; offs < (1 << best_porder); offs += GROUP_SIZE) - if (offs + get_local_id(0) < (1 << best_porder)) - best_rice_parameters[(get_group_id(0) << max_porder) + offs + get_local_id(0)] = rice_parameters[pos - (2 << best_porder) + offs + get_local_id(0)]; + for (int offs = get_local_id(0); offs < (1 << best_porder); offs += GROUP_SIZE) + best_rice_parameters[(get_group_id(0) << max_porder) + offs] = rice_parameters[pos - (2 << best_porder) + offs]; // FIXME: should be bytes? } #endif diff --git a/CUETools.Codecs.FLACCL/flaccpu.cl b/CUETools.Codecs.FLACCL/flaccpu.cl index a5397fc..0aa06a1 100644 --- a/CUETools.Codecs.FLACCL/flaccpu.cl +++ b/CUETools.Codecs.FLACCL/flaccpu.cl @@ -131,6 +131,8 @@ void clFindWastedBits( } } +#define TEMPBLOCK 64 + // get_num_groups(0) == number of tasks // get_num_groups(1) == number of windows __kernel __attribute__((reqd_work_group_size(1, 1, 1))) @@ -145,43 +147,31 @@ void clComputeAutocor( FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data; int len = task.blocksize; int windowOffs = get_group_id(1) * len; - float data1[4096 + 32]; - - // TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32] - - for (int tid = 0; tid < len; tid++) - data1[tid] = samples[task.samplesOffs + tid] * window[windowOffs + tid]; - data1[len] = 0.0f; - __global float * pout = &output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1)]; - for (int l = 1; l < MAX_ORDER; l++) - data1[len + l] = 0.0f; - - // double ac0 = 0.0, ac1 = 0.0, ac2 = 0.0, ac3 = 0.0; - // for (int j = 0; j < len; j++) - // { - //float dj = data1[j]; - //ac0 += dj * dj; - //ac1 += dj * data1[j + 1]; - //ac2 += dj * data1[j + 2]; - //ac3 += dj * data1[j + 3]; - // } - // pout[0] = ac0; - // pout[1] = ac1; - // pout[2] = ac2; - // pout[3] = ac3; + float data[TEMPBLOCK + MAX_ORDER + 3]; + double ac[MAX_ORDER + 4]; + for (int i = 0; i <= MAX_ORDER; ++i) - { - double temp = 1.0; - double temp2 = 1.0; - float* finish = data1 + len - i; + ac[i] = 0.0; - for (float* pdata = data1; pdata < finish; pdata += 2) + for (int pos = 0; pos < len; pos += TEMPBLOCK) + { + for (int tid = 0; tid < TEMPBLOCK + MAX_ORDER + 3; tid++) + data[tid] = tid < len - pos ? samples[task.samplesOffs + pos + tid] * window[windowOffs + pos + tid] : 0.0f; + + for (int i = 0; i <= MAX_ORDER; i += 4) { - temp += pdata[i] * pdata[0]; - temp2 += pdata[i + 1] * pdata[1]; + float4 temp = 0.0; + for (int j = 0; j < min(TEMPBLOCK, len - pos); j++) + temp += data[j] * vload4(0, &data[j + i]); + ac[i] += temp.x; + ac[i+1] += temp.y; + ac[i+2] += temp.z; + ac[i+3] += temp.w; } - pout[i] = temp + temp2; } + __global float * pout = &output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1)]; + for (int i = 0; i <= MAX_ORDER; ++i) + pout[i] = ac[i]; } __kernel __attribute__((reqd_work_group_size(1, 1, 1))) @@ -319,13 +309,36 @@ void clQuantizeLPC( } } -#define ESTIMATE_N(ro,sum) for (int pos = ro; pos < bs; pos ++) { \ - __global int *ptr = data + pos - ro; \ - int t = clamp((data[pos] - ((sum) >> task.data.shift)) >> task.data.wbits, -0x7fffff, 0x7fffff); \ - len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31); \ - } +inline int calc_residual(__global int *ptr, int * coefs, int ro) +{ + int sum = 0; + for (int i = 0; i < ro; i++) + sum += ptr[i] * coefs[i]; + return sum; +} -// int sum = 0; for (int i = 0; i < ro; i++) sum += *(ptr++) * task.coefs[i]; +#define ENCODE_N(cro,action) for (int pos = cro; pos < bs; pos ++) { \ + int t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \ + action; \ + } +#define SWITCH_N(action) \ + switch (ro) \ + { \ + case 0: ENCODE_N(0, action) break; \ + case 1: ENCODE_N(1, action) break; \ + case 2: ENCODE_N(2, action) /*if (task.coefs[0] == -1 && task.coefs[1] == 2) ENCODE_N(2, 2 * ptr[1] - ptr[0], action) else*/ break; \ + case 3: ENCODE_N(3, action) break; \ + case 4: ENCODE_N(4, action) break; \ + case 5: ENCODE_N(5, action) break; \ + case 6: ENCODE_N(6, action) break; \ + case 7: ENCODE_N(7, action) break; \ + case 8: ENCODE_N(8, action) break; \ + case 9: ENCODE_N(9, action) break; \ + case 10: ENCODE_N(10, action) break; \ + case 11: ENCODE_N(11, action) break; \ + case 12: ENCODE_N(12, action) break; \ + default: ENCODE_N(ro, action) \ + } __kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(1, 1, 1))) void clEstimateResidual( @@ -337,72 +350,15 @@ void clEstimateResidual( int ro = task.data.residualOrder; int bs = task.data.blocksize; #define EPO 6 - int len[1 << EPO]; + int len[1 << EPO]; // blocksize / 64!!!! -#if 0 - //float data[4096 + 32]; - //float fcoef[32]; - - // TODO!!!!!!!!!!! if (bs > 4096) data1[bs + 32] - - for (int tid = 0; tid < bs; tid++) - data[tid] = (float)samples[task.data.samplesOffs + tid] / (1 << task.data.wbits); - for (int tid = 0; tid < 32; tid++) - fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro); - float4 c0 = vload4(0, &fcoef[0]); - float4 c1 = vload4(1, &fcoef[0]); - float4 c2 = vload4(2, &fcoef[0]); -#else __global int *data = &samples[task.data.samplesOffs]; - for (int i = ro; i < 32; i++) - task.coefs[i] = 0; -#endif + // for (int i = ro; i < 32; i++) + //task.coefs[i] = 0; for (int i = 0; i < 1 << EPO; i++) len[i] = 0; - switch (ro) - { - case 0: ESTIMATE_N(0, 0) break; - case 1: ESTIMATE_N(1, *ptr * task.coefs[0]) break; - case 2: ESTIMATE_N(2, *(ptr++) * task.coefs[0] + *ptr * task.coefs[1]) break; - case 3: ESTIMATE_N(3, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *ptr * task.coefs[2]) break; - case 4: ESTIMATE_N(4, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *ptr * task.coefs[3]) break; - case 5: ESTIMATE_N(5, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *ptr * task.coefs[4]) break; - case 6: ESTIMATE_N(6, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *ptr * task.coefs[5]) break; - case 7: ESTIMATE_N(7, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *ptr * task.coefs[6]) break; - case 8: ESTIMATE_N(8, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *ptr * task.coefs[7]) break; - case 9: ESTIMATE_N(9, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *ptr * task.coefs[8]) break; - case 10: ESTIMATE_N(10, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *ptr * task.coefs[9]) break; - case 11: ESTIMATE_N(11, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *ptr * task.coefs[10]) break; - case 12: ESTIMATE_N(12, *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *ptr * task.coefs[11]) break; - default: - for (int pos = ro; pos < bs; pos ++) - { - #if 0 - float sum = dot(vload4(0, data + pos - ro), c0) - + dot(vload4(1, data + pos - ro), c1) - + dot(vload4(2, data + pos - ro), c2) - ; - int t = convert_int_rte(data[pos] + sum); - #else - __global int *ptr = data + pos - ro; - int sum = - *(ptr++) * task.coefs[0] + *(ptr++) * task.coefs[1] + *(ptr++) * task.coefs[2] + *(ptr++) * task.coefs[3] - + *(ptr++) * task.coefs[4] + *(ptr++) * task.coefs[5] + *(ptr++) * task.coefs[6] + *(ptr++) * task.coefs[7] - + *(ptr++) * task.coefs[8] + *(ptr++) * task.coefs[9] + *(ptr++) * task.coefs[10] + *(ptr++) * task.coefs[11] - ; - for (int i = 12; i < ro; i++) - sum += *(ptr++) * task.coefs[i]; - int t = (data[pos] - (sum >> task.data.shift)) >> task.data.wbits; - #endif - // overflow protection - t = clamp(t, -0x7fffff, 0x7fffff); - // convert to unsigned - t = (t << 1) ^ (t >> 31); - len[pos >> (12 - EPO)] += t; - } - break; - } + SWITCH_N((t = clamp(t, -0x7fffff, 0x7fffff), len[pos >> (12 - EPO)] += (t << 1) ^ (t >> 31))) int total = 0; for (int i = 0; i < 1 << EPO; i++) @@ -497,4 +453,183 @@ void clCopyBestMethodStereo( tasks_out[2 * get_group_id(0) + 1].data.residualOffs = tasks[best_index[1]].data.residualOffs; } +// get_group_id(0) == task index +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clEncodeResidual( + __global int *residual, + __global int *samples, + __global FLACCLSubframeTask *tasks + ) +{ + FLACCLSubframeTask task = tasks[get_group_id(0)]; + int bs = task.data.blocksize; + int ro = task.data.residualOrder; + __global int *data = &samples[task.data.samplesOffs]; + SWITCH_N(residual[task.data.residualOffs + pos] = t); +} + +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clCalcPartition( + __global int *partition_lengths, + __global int *residual, + __global FLACCLSubframeTask *tasks, + int max_porder, // <= 8 + int psize // == task.blocksize >> max_porder? + ) +{ + FLACCLSubframeTask task = tasks[get_group_id(1)]; + int bs = task.data.blocksize; + int ro = task.data.residualOrder; + //int psize = bs >> max_porder; + __global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1); + + for (int p = 0; p < (1 << max_porder); p++) + pl[p] = 0; + + for (int pos = ro; pos < bs; pos ++) + { + int t = residual[task.data.residualOffs + pos]; + // overflow protection + t = clamp(t, -0x7fffff, 0x7fffff); + // convert to unsigned + t = (t << 1) ^ (t >> 31); + pl[pos / psize] += t; + } +} +// get_group_id(0) == task index +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clCalcPartition16( + __global int *partition_lengths, + __global int *residual, + __global int *samples, + __global FLACCLSubframeTask *tasks, + int max_porder // <= 8 + ) +{ + FLACCLSubframeTask task = tasks[get_group_id(0)]; + int bs = task.data.blocksize; + int ro = task.data.residualOrder; + __global int *data = &samples[task.data.samplesOffs]; + __global int *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(0); + for (int p = 0; p < (1 << max_porder); p++) + pl[p] = 0; + SWITCH_N((residual[task.data.residualOffs + pos] = t, t = clamp(t, -0x7fffff, 0x7fffff), t = (t << 1) ^ (t >> 31), pl[pos >> 4] += t)); +} + +// Sums partition lengths for a certain k == get_group_id(0) +// get_group_id(0) == k +// get_group_id(1) == task index +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clSumPartition( + __global int* partition_lengths, + int max_porder + ) +{ + if (get_group_id(0) != 0) // ignore k != 0 + return; + __global int * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1); + for (int i = max_porder - 1; i >= 0; i--) + { + for (int j = 0; j < (1 << i); j++) + { + sums[(2 << i) + j] = sums[2 * j] + sums[2 * j + 1]; + // if (get_group_id(1) == 0) + //printf("[%d][%d]: %d + %d == %d\n", i, j, sums[2 * j], sums[2 * j + 1], sums[2 * j] + sums[2 * j + 1]); + } + sums += 2 << i; + } +} + +// Finds optimal rice parameter for each partition. +// get_group_id(0) == task index +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clFindRiceParameter( + __global FLACCLSubframeTask *tasks, + __global int* rice_parameters, + __global int* partition_lengths, + int max_porder + ) +{ + __global FLACCLSubframeTask* task = tasks + get_group_id(0); + const int tid = get_local_id(0); + int lim = (2 << max_porder) - 1; + int psize = task->data.blocksize >> max_porder; + int bs = task->data.blocksize; + int ro = task->data.residualOrder; + for (int offs = 0; offs < lim; offs ++) + { + int pl = partition_lengths[(1 << (max_porder + 1)) * get_group_id(0) + offs]; + int porder = 31 - clz(lim - offs); + int ps = (bs >> porder) - select(0, ro, offs == lim + 1 - (2 << porder)); + //if (ps <= 0) + // printf("max_porder == %d, porder == %d, ro == %d\n", max_porder, porder, ro); + int k = clamp(31 - clz(pl / max(1, ps)), 0, 14); + int plk = ps * (k + 1) + (pl >> k); + + // output rice parameter + rice_parameters[(get_group_id(0) << (max_porder + 2)) + offs] = k; + // output length + rice_parameters[(get_group_id(0) << (max_porder + 2)) + (1 << (max_porder + 1)) + offs] = plk; + } +} + +// get_group_id(0) == task index +__kernel __attribute__((reqd_work_group_size(1, 1, 1))) +void clFindPartitionOrder( + __global int *residual, + __global int* best_rice_parameters, + __global FLACCLSubframeTask *tasks, + __global int* rice_parameters, + int max_porder + ) +{ + __global FLACCLSubframeTask* task = tasks + get_group_id(0); + int partlen[9]; + for (int p = 0; p < 9; p++) + partlen[p] = 0; + // fetch partition lengths + const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder); + int lim = (2 << max_porder) - 1; + for (int offs = 0; offs < lim; offs ++) + { + int len = rice_parameters[pos + offs]; + int porder = 31 - clz(lim - offs); + partlen[porder] += len; + } + + int best_length = partlen[0] + 4; + int best_porder = 0; + for (int porder = 1; porder <= max_porder; porder++) + { + int length = (4 << porder) + partlen[porder]; + best_porder = select(best_porder, porder, length < best_length); + best_length = min(best_length, length); + } + + best_length = (4 << best_porder) + task->data.blocksize - task->data.residualOrder; + int best_psize = task->data.blocksize >> best_porder; + int start = task->data.residualOffs + task->data.residualOrder; + int fin = task->data.residualOffs + best_psize; + for (int p = 0; p < (1 << best_porder); p++) + { + int k = rice_parameters[pos - (2 << best_porder) + p]; + best_length += k * (fin - start); + for (int i = start; i < fin; i++) + { + int t = residual[i]; + best_length += ((t << 1) ^ (t >> 31)) >> k; + } + start = fin; + fin += best_psize; + } + + int obits = task->data.obits - task->data.wbits; + task->data.porder = best_porder; + task->data.size = + task->data.type == Fixed ? task->data.residualOrder * obits + 6 + best_length : + task->data.type == LPC ? task->data.residualOrder * obits + 6 + best_length + 4 + 5 + task->data.residualOrder * task->data.cbits : + task->data.type == Constant ? obits : obits * task->data.blocksize; + for (int offs = 0; offs < (1 << best_porder); offs ++) + best_rice_parameters[(get_group_id(0) << max_porder) + offs] = rice_parameters[pos - (2 << best_porder) + offs]; +} #endif