diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 917a4cb..ad829c8 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -964,6 +964,10 @@ namespace CUETools.Codecs.FLACCL { switch (sub.best.type) { + case SubframeType.Constant: + return (uint)sub.obits; + case SubframeType.Verbatim: + return (uint)(sub.obits * frame.blocksize); case SubframeType.Fixed: return measure_subframe_fixed(frame, sub); case SubframeType.LPC: @@ -1214,99 +1218,10 @@ namespace CUETools.Codecs.FLACCL } } - unsafe void encode_residual(FLACCLTask task) + unsafe void encode_residual(FLACCLTask task, int channelsCount, int iFrame) { - bool unpacked = false; - unpack_samples(task, Math.Min(32, task.frameSize)); - for (int ch = 0; ch < channels; ch++) - { - switch (task.frame.subframes[ch].best.type) - { - case SubframeType.Constant: - break; - case SubframeType.Verbatim: - if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; - break; - case SubframeType.Fixed: - if (!task.UseGPUOnly) - { - if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; - encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, - task.frame.blocksize, task.frame.subframes[ch].best.order); + FlacFrame frame = task.frame; - int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); - int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); - uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 6; - task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0); - } - break; - case SubframeType.LPC: - fixed (int* coefs = task.frame.subframes[ch].best.coefs) - { - ulong csum = 0; - for (int i = task.frame.subframes[ch].best.order; i > 0; i--) - csum += (ulong)Math.Abs(coefs[i - 1]); - -#if DEBUG - // check size - if (task.UseGPUOnly && !task.UseGPURice) - { - uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]); - if (real_size != task.frame.subframes[ch].best.size) - throw new Exception("size reported incorrectly"); - } -#endif - - if ((((csum << task.frame.subframes[ch].obits) >= 1UL << 32) && PCM.BitsPerSample == 16) || !task.UseGPUOnly) - { - if (task.UseGPURice) -#if DEBUG -// throw new Exception("DoRice failed"); - break; -#else - break; -#endif - if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; - if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32) - lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); - else - lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); - int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); - int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); - uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 4 + 5 + (uint)task.frame.subframes[ch].best.order * (uint)task.frame.subframes[ch].best.cbits + 6; -#if KLJLKJLKJL - uint oldsize = task.frame.subframes[ch].best.size; - RiceContext rc1 = task.frame.subframes[ch].best.rc; - task.frame.subframes[ch].best.rc = new RiceContext(); -#endif - task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0); - task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]); -#if KJHKJH - // check size - if (task.UseGPUOnly && oldsize > task.frame.subframes[ch].best.size) - throw new Exception("unoptimal size reported"); -#endif - //if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize && - // oldsize <= task.frame.subframes[ch].obits * (uint)task.frame.blocksize) - // throw new Exception("oops"); - } - } - break; - } - if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * task.frame.blocksize) - { -#if DEBUG - throw new Exception("larger than verbatim"); -#endif - task.frame.subframes[ch].best.type = SubframeType.Verbatim; - task.frame.subframes[ch].best.size = (uint)(task.frame.subframes[ch].obits * task.frame.blocksize); - if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; - } - } - } - - unsafe void select_best_methods(FlacFrame frame, int channelsCount, int iFrame, FLACCLTask task) - { if (channelsCount == 4 && channels == 2 && frame.blocksize > 4) { if (task.BestResidualTasks[iFrame * 2].channel == 0 && task.BestResidualTasks[iFrame * 2 + 1].channel == 1) @@ -1325,49 +1240,106 @@ namespace CUETools.Codecs.FLACCL else frame.ch_mode = channels != 2 ? ChannelMode.NotStereo : ChannelMode.LeftRight; + // calculate wbits before unpacking samples. + for (int ch = 0; ch < channels; ch++) + { + int index = ch + iFrame * channels; + frame.subframes[ch].wbits = frame.blocksize > 4 + ? task.BestResidualTasks[index].wbits : 0; + } + unpack_samples(task, Math.Min(task.frameSize, eparams.max_prediction_order)); + for (int ch = 0; ch < channels; ch++) { int index = ch + iFrame * channels; frame.subframes[ch].best.residual = ((int*)task.clResidualPtr) + task.BestResidualTasks[index].residualOffs; frame.subframes[ch].best.type = SubframeType.Verbatim; frame.subframes[ch].best.size = (uint)(frame.subframes[ch].obits * frame.blocksize); - frame.subframes[ch].wbits = 0; - if (frame.blocksize <= Math.Max(4, eparams.max_prediction_order)) - continue; - if (task.BestResidualTasks[index].size < 0) - throw new Exception("internal error"); - if (frame.subframes[ch].best.size > task.BestResidualTasks[index].size - && (SubframeType)task.BestResidualTasks[index].type != SubframeType.Verbatim) + if (frame.blocksize > Math.Max(4, eparams.max_prediction_order)) { - frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type; - frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size; - frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder; - frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits; - frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift; - frame.subframes[ch].obits -= task.BestResidualTasks[index].wbits; - frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits; - for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) - frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; - frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder; - frame.subframes[ch].best.rc.coding_method = task.BestResidualTasks[index].coding_method; - if (task.UseGPUOnly && !task.UseGPURice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) - //if (task.UseGPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) + if (task.BestResidualTasks[index].size < 0) + throw new Exception("internal error"); + + if (frame.subframes[ch].best.size > task.BestResidualTasks[index].size + && (SubframeType)task.BestResidualTasks[index].type != SubframeType.Verbatim) { - int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder); - fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) - AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder)); - //for (int i = 0; i < (1 << frame.subframes[ch].best.rc.porder); i++) - // frame.subframes[ch].best.rc.rparams[i] = riceParams[i]; - uint real_size = measure_subframe(frame, frame.subframes[ch]); - if (real_size != task.frame.subframes[ch].best.size) + frame.subframes[ch].best.type = (SubframeType)task.BestResidualTasks[index].type; + frame.subframes[ch].best.size = (uint)task.BestResidualTasks[index].size; + frame.subframes[ch].best.order = task.BestResidualTasks[index].residualOrder; + frame.subframes[ch].best.cbits = task.BestResidualTasks[index].cbits; + frame.subframes[ch].best.shift = task.BestResidualTasks[index].shift; + frame.subframes[ch].obits -= task.BestResidualTasks[index].wbits; + frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits; + for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) + frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; + frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder; + frame.subframes[ch].best.rc.coding_method = task.BestResidualTasks[index].coding_method; + if (task.UseGPUOnly && !task.UseGPURice) + { + if (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC) + { + int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder); + fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) + AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder)); + } + uint real_size = measure_subframe(frame, frame.subframes[ch]); + if (real_size != task.frame.subframes[ch].best.size) + throw new Exception("size reported incorrectly"); + } + } + else + { + if (task.UseGPURice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size) throw new Exception("size reported incorrectly"); } } - else + + switch (task.frame.subframes[ch].best.type) { - if (task.UseGPURice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size) - throw new Exception("size reported incorrectly"); + case SubframeType.Constant: + break; + case SubframeType.Verbatim: + unpack_samples(task, task.frameSize); + break; + case SubframeType.Fixed: + if (!task.UseGPUOnly) + { + unpack_samples(task, task.frameSize); + encode_residual_fixed(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, + task.frame.blocksize, task.frame.subframes[ch].best.order); + + int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0); + } + break; + case SubframeType.LPC: + if (!task.UseGPUOnly) + { + unpack_samples(task, task.frameSize); + fixed (int* coefs = task.frame.subframes[ch].best.coefs) + { + if (PCM.BitsPerSample > 16) + lpc.encode_residual_long(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); + else + lpc.encode_residual(task.frame.subframes[ch].best.residual, task.frame.subframes[ch].samples, task.frame.blocksize, task.frame.subframes[ch].best.order, coefs, task.frame.subframes[ch].best.shift); + } + + int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order, PCM.BitsPerSample > 16 ? 1 : 0); + } + break; + } + if (!task.UseGPUOnly) + { + task.frame.subframes[ch].best.size = measure_subframe(task.frame, task.frame.subframes[ch]); + if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * task.frame.blocksize) + { + task.frame.subframes[ch].best.type = SubframeType.Verbatim; + task.frame.subframes[ch].best.size = (uint)(task.frame.subframes[ch].obits * task.frame.blocksize); + } } } } @@ -1577,9 +1549,7 @@ namespace CUETools.Codecs.FLACCL ((int*)task.clResidualPtr) + ch * task.channelSize + iFrame * task.frameSize, _pcm.BitsPerSample + (doMidside && ch == 3 ? 1 : 0), 0); - select_best_methods(task.frame, channelCount, iFrame, task); - //unpack_samples(task); - encode_residual(task); + encode_residual(task, channelCount, iFrame); //task.frame.writer.Reset(); task.frame.frame_number = current_frame_number; diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index af74e41..8375ca5 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -616,6 +616,7 @@ void clQuantizeLPC( { int bs = tasks[get_group_id(1) * taskCount].data.blocksize; int abits = tasks[get_group_id(1) * taskCount].data.abits; + int obits = tasks[get_group_id(1) * taskCount].data.obits; int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32; float error[MAX_ORDER]; int best_orders[MAX_ORDER]; @@ -667,7 +668,7 @@ void clQuantizeLPC( #if BITS_PER_SAMPLE > 16 int cbits = max(3, min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits)); #else - int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - abits)); + int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - obits)); #endif // calculate shift based on precision and number of leading zeroes in coeffs int shift = max(0,min(15, clz(tmpi) - 18 + cbits)); @@ -822,7 +823,7 @@ void clQuantizeLPC( #if BITS_PER_SAMPLE > 16 int cbits = max(3, min(min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), 15)); #else - int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.abits)); + int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (shared.task.blocksize <= 2304) - (shared.task.blocksize <= 1152) - (shared.task.blocksize <= 576), shared.task.abits), clz(order) + 1 - shared.task.obits)); #endif // calculate shift based on precision and number of leading zeroes in coeffs int shift = max(0,min(15, clz(shared.maxcoef[i]) - 18 + cbits)); @@ -1145,7 +1146,6 @@ void clEstimateResidual( atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t); #endif } -#if 1 if (pos < bs) { // fetch samples @@ -1192,7 +1192,6 @@ void clEstimateResidual( atom_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t); #endif } -#endif // calculate rice partition bit length for every 32 samples barrier(CLK_LOCAL_MEM_FENCE); @@ -1421,7 +1420,7 @@ void clEncodeResidual( int4 cptr2 = vload4(2, &task.coefs[0]); #endif - // We tweaked coeffs so that (task.cbits + task.abits + clz(ro) <= 32) + // We tweaked coeffs so that (task.cbits + task.obits + log2i(ro) <= 32) // when BITS_PER_SAMPLE == 16, so we don't need 64bit arithmetics. data[tid] = 0;