diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 07493fd..c525c71 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -754,6 +754,32 @@ namespace CUETools.Codecs.FLACCL frame.writer.writebits(8, crc); } + unsafe int measure_residual(FlacFrame frame, FlacSubframeInfo sub, int pos, int cnt, int k) + { + int q = 0; + for (int i = pos; i < pos + cnt; i++) + { + int v = sub.best.residual[i]; + v = (v << 1) ^ (v >> 31); + q += (v >> k); + } + return (k + 1) * cnt + q; + } + + unsafe int measure_residual(FlacFrame frame, FlacSubframeInfo sub) + { + // partition order + int porder = sub.best.rc.porder; + int psize = frame.blocksize >> porder; + //assert(porder >= 0); + int size = 6 + (4 << porder); + size += measure_residual(frame, sub, sub.best.order, psize - sub.best.order, sub.best.rc.rparams[0]); + // residual + for (int p = 1; p < (1 << porder); p++) + size += measure_residual(frame, sub, p * psize, psize, sub.best.rc.rparams[p]); + return size; + } + unsafe void output_residual(FlacFrame frame, FlacSubframeInfo sub) { // rice-encoded block @@ -806,6 +832,12 @@ namespace CUETools.Codecs.FLACCL output_residual(frame, sub); } + unsafe uint + measure_subframe_lpc(FlacFrame frame, FlacSubframeInfo sub) + { + return (uint)(sub.best.order * sub.obits + 9 + sub.best.order * sub.best.cbits + measure_residual(frame, sub)); + } + unsafe void output_subframe_lpc(FlacFrame frame, FlacSubframeInfo sub) { @@ -898,8 +930,8 @@ namespace CUETools.Codecs.FLACCL task.nResidualTasks = 0; task.nTasksPerWindow = Math.Min(32, eparams.orders_per_window); task.nResidualTasksPerChannel = _windowcount * task.nTasksPerWindow + 1 + (eparams.do_constant ? 1 : 0) + eparams.max_fixed_order - eparams.min_fixed_order; - if (task.nResidualTasksPerChannel >= 4) - task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7; + //if (task.nResidualTasksPerChannel >= 4) + // task.nResidualTasksPerChannel = (task.nResidualTasksPerChannel + 7) & ~7; task.nAutocorTasksPerChannel = _windowcount; for (int iFrame = 0; iFrame < nFrames; iFrame++) { @@ -973,20 +1005,20 @@ namespace CUETools.Codecs.FLACCL } task.nResidualTasks++; } - // Filler - while ((task.nResidualTasks % task.nResidualTasksPerChannel) != 0) - { - task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Verbatim; - task.ResidualTasks[task.nResidualTasks].channel = ch; - task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); - task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits; - task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; - task.ResidualTasks[task.nResidualTasks].residualOrder = 0; - task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize; - task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; - task.ResidualTasks[task.nResidualTasks].shift = 0; - task.nResidualTasks++; - } + //// Filler + //while ((task.nResidualTasks % task.nResidualTasksPerChannel) != 0) + //{ + // task.ResidualTasks[task.nResidualTasks].type = (int)SubframeType.Verbatim; + // task.ResidualTasks[task.nResidualTasks].channel = ch; + // task.ResidualTasks[task.nResidualTasks].obits = (int)bits_per_sample + (channels == 2 && ch == 3 ? 1 : 0); + // task.ResidualTasks[task.nResidualTasks].abits = task.ResidualTasks[task.nResidualTasks].obits; + // task.ResidualTasks[task.nResidualTasks].blocksize = blocksize; + // task.ResidualTasks[task.nResidualTasks].residualOrder = 0; + // task.ResidualTasks[task.nResidualTasks].samplesOffs = ch * FLACCLWriter.MAX_BLOCKSIZE + iFrame * blocksize; + // task.ResidualTasks[task.nResidualTasks].residualOffs = task.ResidualTasks[task.nResidualTasks].samplesOffs; + // task.ResidualTasks[task.nResidualTasks].shift = 0; + // task.nResidualTasks++; + //} } } if (sizeof(FLACCLSubframeTask) * task.nResidualTasks > task.residualTasksLen) @@ -1029,6 +1061,17 @@ namespace CUETools.Codecs.FLACCL ulong csum = 0; for (int i = task.frame.subframes[ch].best.order; i > 0; i--) csum += (ulong)Math.Abs(coefs[i - 1]); + +#if DEBUG + // check size + if (_settings.GPUOnly) + { + uint real_size = measure_subframe_lpc(task.frame, task.frame.subframes[ch]); + if (real_size != task.frame.subframes[ch].best.size) + throw new Exception("size reported incorrectly"); + } +#endif + if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly) { if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; @@ -1039,8 +1082,18 @@ namespace CUETools.Codecs.FLACCL int pmin = get_max_p_order(eparams.min_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); int pmax = get_max_p_order(eparams.max_partition_order, task.frame.blocksize, task.frame.subframes[ch].best.order); uint bits = (uint)(task.frame.subframes[ch].best.order * task.frame.subframes[ch].obits) + 4 + 5 + (uint)task.frame.subframes[ch].best.order * (uint)task.frame.subframes[ch].best.cbits + 6; - //uint oldsize = task.frame.subframes[ch].best.size; - task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); +#if KLJLKJLKJL + uint oldsize = task.frame.subframes[ch].best.size; + RiceContext rc1 = task.frame.subframes[ch].best.rc; + task.frame.subframes[ch].best.rc = new RiceContext(); +#endif + task.frame.subframes[ch].best.size = bits + calc_rice_params(task.frame.subframes[ch].best.rc, pmin, pmax, task.frame.subframes[ch].best.residual, (uint)task.frame.blocksize, (uint)task.frame.subframes[ch].best.order); + task.frame.subframes[ch].best.size = measure_subframe_lpc(task.frame, task.frame.subframes[ch]); +#if KJHKJH + // check size + if (_settings.GPUOnly && oldsize > task.frame.subframes[ch].best.size) + throw new Exception("unoptimal size reported"); +#endif //if (task.frame.subframes[ch].best.size > task.frame.subframes[ch].obits * (uint)task.frame.blocksize && // oldsize <= task.frame.subframes[ch].obits * (uint)task.frame.blocksize) // throw new Exception("oops"); @@ -1137,19 +1190,19 @@ namespace CUETools.Codecs.FLACCL task.cudaComputeLPC.SetArg(0, task.cudaResidualTasks); task.cudaComputeLPC.SetArg(1, task.cudaAutocorOutput); task.cudaComputeLPC.SetArg(2, task.cudaLPCData); - task.cudaComputeLPC.SetArg(3, (uint)task.nResidualTasksPerChannel); + task.cudaComputeLPC.SetArg(3, task.nResidualTasksPerChannel); task.cudaComputeLPC.SetArg(4, (uint)_windowcount); task.cudaQuantizeLPC.SetArg(0, task.cudaResidualTasks); task.cudaQuantizeLPC.SetArg(1, task.cudaLPCData); - task.cudaQuantizeLPC.SetArg(2, (uint)task.nResidualTasksPerChannel); + task.cudaQuantizeLPC.SetArg(2, task.nResidualTasksPerChannel); task.cudaQuantizeLPC.SetArg(3, (uint)task.nTasksPerWindow); task.cudaQuantizeLPC.SetArg(4, (uint)eparams.lpc_min_precision_search); task.cudaQuantizeLPC.SetArg(5, (uint)(eparams.lpc_max_precision_search - eparams.lpc_min_precision_search)); task.cudaCopyBestMethod.SetArg(0, task.cudaBestResidualTasks); task.cudaCopyBestMethod.SetArg(1, task.cudaResidualTasks); - task.cudaCopyBestMethod.SetArg(2, (uint)task.nResidualTasksPerChannel); + task.cudaCopyBestMethod.SetArg(2, task.nResidualTasksPerChannel); task.cudaCopyBestMethodStereo.SetArg(0, task.cudaBestResidualTasks); task.cudaCopyBestMethodStereo.SetArg(1, task.cudaResidualTasks); @@ -2384,8 +2437,8 @@ namespace CUETools.Codecs.FLACCL cudaComputeAutocor.SetArg(1, cudaSamples); cudaComputeAutocor.SetArg(2, cudaWindow); cudaComputeAutocor.SetArg(3, cudaResidualTasks); - cudaComputeAutocor.SetArg(4, (uint)nAutocorTasksPerChannel - 1); - cudaComputeAutocor.SetArg(5, (uint)nResidualTasksPerChannel); + cudaComputeAutocor.SetArg(4, nAutocorTasksPerChannel - 1); + cudaComputeAutocor.SetArg(5, nResidualTasksPerChannel); int workX = max_prediction_order / 4 + 1; int workY = nAutocorTasksPerChannel * channelsCount * frameCount; @@ -2406,7 +2459,7 @@ namespace CUETools.Codecs.FLACCL { cudaChooseBestMethod.SetArg(0, cudaResidualTasks); cudaChooseBestMethod.SetArg(1, cudaResidualOutput); - cudaChooseBestMethod.SetArg(2, (uint)nResidualTasksPerChannel); + cudaChooseBestMethod.SetArg(2, nResidualTasksPerChannel); openCLCQ.EnqueueNDRangeKernel(cudaChooseBestMethod, 2, null, new int[] { 32, channelsCount * frameCount }, new int[] { 32, 1 }); } diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 5ee17e6..d8e09d7 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -621,18 +621,18 @@ void cudaChooseBestMethod( tasks[tid + taskCount * get_group_id(1)].data.size = shared.length[tid]; int l1 = shared.length[tid]; - for (int sh = 4; sh > 0; sh --) + for (int l = 16; l > 0; l >>= 1) { - if (tid < (1 << sh)) + if (tid < l) { - int l2 = shared.length[tid + (1 << sh)]; - shared.index[tid] = shared.index[tid + ((l2 < l1) << sh)]; + int l2 = shared.length[tid + l]; + shared.index[tid] = shared.index[tid + select(0, l, l2 < l1)]; shared.length[tid] = l1 = min(l1, l2); } barrier(CLK_LOCAL_MEM_FENCE); } if (tid == 0) - tasks[taskCount * get_group_id(1)].data.best_index = taskCount * get_group_id(1) + shared.index[shared.length[1] < shared.length[0]]; + tasks[taskCount * get_group_id(1)].data.best_index = taskCount * get_group_id(1) + shared.index[0]; } __kernel __attribute__((reqd_work_group_size(64, 1, 1))) @@ -921,13 +921,14 @@ void cudaFindPartitionOrder( if (get_local_id(0) < 32) shared.index[get_local_id(0)] = get_local_id(0); barrier(CLK_LOCAL_MEM_FENCE); + //atom_min(shared.index[get_local_id(0)],); int l1 = get_local_id(0) <= max_porder ? shared.length[get_local_id(0)] : 0xfffffff; - for (int sh = 3; sh >= 0; sh --) + for (int l = 8; l > 0; l >>= 1) { - if (get_local_id(0) < (1 << sh)) + if (get_local_id(0) < l) { - int l2 = shared.length[get_local_id(0) + (1 << sh)]; - shared.index[get_local_id(0)] = shared.index[get_local_id(0) + ((l2 < l1) << sh)]; + int l2 = shared.length[get_local_id(0) + l]; + shared.index[get_local_id(0)] = shared.index[get_local_id(0) + select(0, l, l2 < l1)]; shared.length[get_local_id(0)] = l1 = min(l1, l2); } barrier(CLK_LOCAL_MEM_FENCE); diff --git a/CUETools.FLACCL.cmd/Program.cs b/CUETools.FLACCL.cmd/Program.cs index 252c930..87bdcea 100644 --- a/CUETools.FLACCL.cmd/Program.cs +++ b/CUETools.FLACCL.cmd/Program.cs @@ -65,6 +65,7 @@ namespace CUETools.FLACCL.cmd TextWriter stdout = Console.Out; Console.SetOut(Console.Error); + var settings = new FLACCLWriterSettings(); DateTime start = DateTime.Now; TimeSpan lastPrint = TimeSpan.FromMilliseconds(0); bool debug = false, quiet = false; @@ -79,9 +80,8 @@ namespace CUETools.FLACCL.cmd orders_per_window = -1, blocksize = -1; int level = -1, padding = -1, vbr_mode = -1; - bool do_md5 = true, do_seektable = true, do_verify = false, gpu_only = true; + bool do_seektable = true; bool buffered = false; - int cpu_threads = 0; bool ok = true; for (int arg = 0; arg < args.Length; arg++) @@ -93,17 +93,27 @@ namespace CUETools.FLACCL.cmd else if ((args[arg] == "-q" || args[arg] == "--quiet")) quiet = true; else if (args[arg] == "--verify") - do_verify = true; + settings.DoVerify = true; else if (args[arg] == "--no-seektable") do_seektable = false; else if (args[arg] == "--slow-gpu") - gpu_only = false; + settings.GPUOnly = false; else if (args[arg] == "--no-md5") - do_md5 = false; + settings.DoMD5 = false; else if (args[arg] == "--buffered") buffered = true; else if (args[arg] == "--cpu-threads") - ok = (++arg < args.Length) && int.TryParse(args[arg], out cpu_threads); + { + int val = settings.CPUThreads; + ok = (++arg < args.Length) && int.TryParse(args[arg], out val); + settings.CPUThreads = val; + } + else if (args[arg] == "--group-size") + { + int val = settings.GroupSize; + ok = (++arg < args.Length) && int.TryParse(args[arg], out val); + settings.GroupSize = val; + } else if ((args[arg] == "-o" || args[arg] == "--output") && ++arg < args.Length) output_file = args[arg]; else if ((args[arg] == "-s" || args[arg] == "--stereo") && ++arg < args.Length) @@ -141,7 +151,7 @@ namespace CUETools.FLACCL.cmd else if ((args[arg] == "-v" || args[arg] == "--vbr")) ok = (++arg < args.Length) && int.TryParse(args[arg], out vbr_mode); else if (args[arg] == "--orders-per-window") - ok = (++arg < args.Length) && int.TryParse(args[arg], out orders_per_window); + ok = (++arg < args.Length) && int.TryParse(args[arg], out orders_per_window); else if ((args[arg] == "-b" || args[arg] == "--blocksize") && ++arg < args.Length) ok = int.TryParse(args[arg], out blocksize); else if ((args[arg] == "-p" || args[arg] == "--padding") && ++arg < args.Length) @@ -202,10 +212,7 @@ namespace CUETools.FLACCL.cmd try { - (encoder.Settings as FLACCLWriterSettings).GPUOnly = gpu_only; - (encoder.Settings as FLACCLWriterSettings).CPUThreads = cpu_threads; - (encoder.Settings as FLACCLWriterSettings).DoVerify = do_verify; - (encoder.Settings as FLACCLWriterSettings).DoMD5 = do_md5; + encoder.Settings = settings; if (level >= 0) encoder.CompressionLevel = level; if (stereo_method != null) @@ -310,15 +317,15 @@ namespace CUETools.FLACCL.cmd if (debug) { Console.SetOut(stdout); - Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}..{5}\t{6}..{7}\t{8}..{9}\t{10}\t{11}", + Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6} ({7})\t{8}..{9}\t{10}\t{11}", encoder.TotalSize, encoder.UserProcessorTime.TotalSeconds > 0 ? encoder.UserProcessorTime.TotalSeconds : totalElapsed.TotalSeconds, encoder.StereoMethod.ToString().PadRight(15), encoder.WindowFunction.ToString().PadRight(15), - encoder.MinPartitionOrder, encoder.MaxPartitionOrder, - encoder.MinLPCOrder, + settings.GPUOnly ? "GPU" : "CPU", encoder.MaxLPCOrder, + encoder.OrdersPerWindow, encoder.MinPrecisionSearch, encoder.MaxPrecisionSearch, encoder.BlockSize,