diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 1fe461b..2561350 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -37,6 +37,7 @@ namespace CUETools.Codecs.FLACCL { this.DoVerify = false; this.GPUOnly = true; + this.DoRice = false; this.MappedMemory = false; this.DoMD5 = true; this.GroupSize = 128; @@ -57,6 +58,10 @@ namespace CUETools.Codecs.FLACCL [SRDescription(typeof(Properties.Resources), "DescriptionGPUOnly")] public bool GPUOnly { get; set; } + [DefaultValue(false)] + [SRDescription(typeof(Properties.Resources), "DescriptionDoRice")] + public bool DoRice { get; set; } + [DefaultValue(false)] [SRDescription(typeof(Properties.Resources), "DescriptionMappedMemory")] public bool MappedMemory { get; set; } @@ -837,7 +842,7 @@ namespace CUETools.Codecs.FLACCL //assert(porder >= 0); frame.writer.writebits(4, porder); - if (task.riceOnGPU) + if (_settings.DoRice) { if (task.BestResidualTasks[index].size != (int)sub.best.size) throw new Exception("Encoding offset mismatch"); @@ -846,6 +851,7 @@ namespace CUETools.Codecs.FLACCL if (task.BestResidualTasks[index].encodingOffset != frame.writer.BitLength) throw new Exception("Encoding offset mismatch"); int len = task.BestResidualTasks[index].size - task.BestResidualTasks[index].headerLen; + //Console.WriteLine("{0:x} => {1:x}", _totalSize + frame.writer.BitLength / 8, _totalSize + (frame.writer.BitLength + len) / 8); // task.BestResidualTasks[index].headerLen frame.writer.writeints(len, (byte*)task.clRiceOutputPtr); if (task.BestResidualTasks[index].encodingOffset + len != frame.writer.BitLength) @@ -1196,7 +1202,7 @@ namespace CUETools.Codecs.FLACCL #if DEBUG // check size - if (_settings.GPUOnly && !task.riceOnGPU) + if (_settings.GPUOnly && !_settings.DoRice) { uint real_size = measure_subframe(task.frame, task.frame.subframes[ch]); if (real_size != task.frame.subframes[ch].best.size) @@ -1204,7 +1210,7 @@ namespace CUETools.Codecs.FLACCL } #endif - if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly) && !task.riceOnGPU) + if (((csum << task.frame.subframes[ch].obits) >= 1UL << 32 || !_settings.GPUOnly) && !_settings.DoRice) { if (!unpacked) unpack_samples(task, task.frameSize); unpacked = true; if ((csum << task.frame.subframes[ch].obits) >= 1UL << 32) @@ -1289,10 +1295,10 @@ namespace CUETools.Codecs.FLACCL frame.subframes[ch].wbits = task.BestResidualTasks[index].wbits; for (int i = 0; i < task.BestResidualTasks[index].residualOrder; i++) frame.subframes[ch].best.coefs[i] = task.BestResidualTasks[index].coefs[task.BestResidualTasks[index].residualOrder - 1 - i]; - if (_settings.GPUOnly && !task.riceOnGPU && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) + frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder; + if (_settings.GPUOnly && !_settings.DoRice && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) //if (_settings.GPUOnly && (frame.subframes[ch].best.type == SubframeType.Fixed || frame.subframes[ch].best.type == SubframeType.LPC)) { - frame.subframes[ch].best.rc.porder = task.BestResidualTasks[index].porder; int* riceParams = ((int*)task.clBestRiceParamsPtr) + (index << task.max_porder); fixed (int* dstParams = frame.subframes[ch].best.rc.rparams) AudioSamples.MemCpy(dstParams, riceParams, (1 << frame.subframes[ch].best.rc.porder)); @@ -1305,7 +1311,7 @@ namespace CUETools.Codecs.FLACCL } else { - if (task.riceOnGPU) + if (_settings.DoRice && frame.subframes[ch].best.size != task.BestResidualTasks[index].size) throw new Exception("size reported incorrectly"); } } @@ -1579,6 +1585,8 @@ namespace CUETools.Codecs.FLACCL "#define MAX_ORDER " + eparams.max_prediction_order.ToString() + "\n" + "#define GROUP_SIZE " + groupSize.ToString() + "\n" + "#define FLACCL_VERSION \"" + vendor_string + "\"\n" + + (_settings.GPUOnly ? "#define DO_PARTITIONS\n" : "") + + (_settings.DoRice ? "#define DO_RICE\n" : "") + #if DEBUG "#define DEBUG\n" + #endif @@ -1802,7 +1810,7 @@ namespace CUETools.Codecs.FLACCL public string Path { get { return _path; } } - public static readonly string vendor_string = "FLACCL#0.2"; + public static readonly string vendor_string = "FLACCL#0.3"; int select_blocksize(int samplerate, int time_ms) { @@ -2362,8 +2370,6 @@ namespace CUETools.Codecs.FLACCL public int nWindowFunctions = 0; public int max_porder = 0; - public bool riceOnGPU = false; - public FlakeReader verify; public Thread workThread = null; @@ -2492,7 +2498,7 @@ namespace CUETools.Codecs.FLACCL clSumPartition = openCLProgram.CreateKernel("clSumPartition"); clFindRiceParameter = openCLProgram.CreateKernel("clFindRiceParameter"); clFindPartitionOrder = openCLProgram.CreateKernel("clFindPartitionOrder"); - if (riceOnGPU) + if (writer._settings.DoRice) { clCalcOutputOffsets = openCLProgram.CreateKernel("clCalcOutputOffsets"); clRiceEncoding = openCLProgram.CreateKernel("clRiceEncoding"); @@ -2545,7 +2551,7 @@ namespace CUETools.Codecs.FLACCL clSumPartition.Dispose(); clFindRiceParameter.Dispose(); clFindPartitionOrder.Dispose(); - if (riceOnGPU) + if (writer._settings.DoRice) { clCalcOutputOffsets.Dispose(); clRiceEncoding.Dispose(); @@ -2875,7 +2881,7 @@ namespace CUETools.Codecs.FLACCL groupSize, channels * frameCount); - if (riceOnGPU) + if (writer._settings.DoRice) { clCalcOutputOffsets.SetArgs( clResidual, @@ -2907,7 +2913,7 @@ namespace CUETools.Codecs.FLACCL if (!writer._settings.MappedMemory) { openCLCQ.EnqueueReadBuffer(clBestRiceParams, false, 0, sizeof(int) * (1 << max_porder) * channels * frameCount, clBestRiceParamsPtr); - if (riceOnGPU) + if (writer._settings.DoRice) openCLCQ.EnqueueReadBuffer(clRiceOutput, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clRiceOutputPtr); else openCLCQ.EnqueueReadBuffer(clResidual, false, 0, sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels, clResidualPtr); diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 0df9604..2ef8105 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -1007,6 +1007,7 @@ void clChooseBestMethod( tasks_out[get_global_id(0)] = tasks[best_no]; } +#ifdef DO_PARTITIONS #ifdef __CPU__ // get_group_id(0) == task index __kernel __attribute__((reqd_work_group_size(1, 1, 1))) @@ -1292,8 +1293,11 @@ void clCalcPartition16( s = sum.x + sum.y + sum.z + sum.w; const int lpos = (15 << (max_porder + 1)) * get_group_id(0) + (k << (max_porder + 1)) + offs / 16; - if (k <= 14) + if (k <= 14 && offs < bs) partition_lengths[lpos] = min(0x7fffff, s) + (16 - select(0, ro, offs < 16)) * (k + 1); + +// if (task.data.blocksize == 16 && x == 0 && k <= 14) +// printf("[%d] = %d = s:%d + %d * (k:%d + 1), ro=%d, offs=%d, lpos=%d\n", k, partition_lengths[lpos], s, (16 - select(0, ro, offs < 16)), k, ro, offs, lpos); } } #endif @@ -1531,20 +1535,30 @@ void clFindPartitionOrder( if (get_local_id(0) == 0) { - tasks[get_group_id(0)].data.porder = best_porder; + task.porder = best_porder; int obits = task.obits - task.wbits; - tasks[get_group_id(0)].data.size = - task.type == Fixed ? task.residualOrder * obits + 6 + best_length : - task.type == LPC ? task.residualOrder * obits + 6 + best_length + 4 + 5 + task.residualOrder * task.cbits : - task.type == Constant ? obits : obits * task.blocksize; + task.headerLen = + task.type == Fixed ? task.residualOrder * obits + 6 : + task.type == LPC ? task.residualOrder * obits + 6 + 4 + 5 + task.residualOrder * task.cbits : + task.type == Constant ? obits : + /* task.type == Verbatim ? */ obits * task.blocksize; + task.size = task.headerLen + select(0, best_length, task.type == Fixed || task.type == LPC); + if (task.size >= obits * task.blocksize) + { + task.headerLen = task.size = obits * task.blocksize; + task.type = Verbatim; + } } barrier(CLK_LOCAL_MEM_FENCE); + if (get_local_id(0) < sizeof(task) / sizeof(int)) + ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)] = ((__local int*)&task)[get_local_id(0)]; for (int offs = get_local_id(0); offs < (1 << best_porder); offs += GROUP_SIZE) best_rice_parameters[(get_group_id(0) << max_porder) + offs] = rice_parameters[pos - (2 << best_porder) + offs]; // FIXME: should be bytes? } #endif +#ifdef DO_RICE #ifdef __CPU__ typedef struct BitWriter_t { @@ -1720,6 +1734,100 @@ void clRiceEncoding( //if (get_group_id(0) == 0) printf("\n"); flush(&bw); } +#else + __local FLACCLSubframeData task; + __local int riceparams[256]; + __local int mypos[GROUP_SIZE]; + __local unsigned int data[GROUP_SIZE]; + __local int start; + + int tid = get_local_id(0); + if (tid < sizeof(task) / sizeof(int)) + ((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid]; + barrier(CLK_LOCAL_MEM_FENCE); + for (int offs = tid; offs < (1 << task.porder); offs += GROUP_SIZE) + riceparams[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs]; + if (tid == 0) + start = task.encodingOffset; + data[tid] = 0; + barrier(CLK_LOCAL_MEM_FENCE); + int bs = task.blocksize; + int partlen = bs >> task.porder; + for (int pos = 0; pos < bs; pos += GROUP_SIZE) + { + int offs = pos + tid; + int v = offs < bs ? residual[task.residualOffs + offs] : 0; + int k = offs < bs ? riceparams[offs / partlen] : 0; + int pstart = offs == task.residualOrder || (offs % partlen) == 0; + v = (v << 1) ^ (v >> 31); + int mylen = select(0, (v >> k) + 1 + k + select(0, 4, pstart), offs >= task.residualOrder && offs < bs); + mypos[tid] = mylen; + barrier(CLK_LOCAL_MEM_FENCE); + // Inclusive scan(+) + for (int offset = 1; offset < GROUP_SIZE; offset <<= 1) + { + int t = tid >= offset ? mypos[tid - offset] : 0; + barrier(CLK_LOCAL_MEM_FENCE); + mypos[tid] += t; + barrier(CLK_LOCAL_MEM_FENCE); + } + // make it exclusive + //if ((get_global_id(0) == 64 || get_global_id(0) == 63) && pos == 0) + // printf("v=%x,k=%d,mylen=%d,mypos=%d,pstart=%d,partlen=%d,start=%d\n", v, k, mylen, mypos[tid-1], pstart, partlen, start); + //barrier(CLK_LOCAL_MEM_FENCE); + mypos[tid] += start; + int start32 = start / 32; + barrier(CLK_LOCAL_MEM_FENCE); + //if ((get_global_id(0) == 64 || get_global_id(0) == 63) && pos == 0) + // printf("v=%x,k=%d,mylen=%d,mypos=%d,pstart=%d,partlen=%d\n", v, k, mylen, mypos[tid], pstart, partlen); + if (mylen > 0) + { + if (pstart) + { + int kpos = mypos[tid] - mylen; + unsigned int kval = (k << 28); + // if (get_group_id(0) == 0 && kpos / 32 - task.encodingOffset / 32 == 5 && pos == 0) + //printf("{%08X |= %08X}\n", data[kpos / 32 - start32], kval >> (kpos & 31)); + atom_or(&data[kpos / 32 - start32], kval >> (kpos & 31)); + if ((kpos & 31) != 0) + atom_or(&data[kpos / 32 - start32 + 1], kval << (32 - (kpos & 31))); + } + int qpos = mypos[tid] - k - 1; + unsigned int qval = (1U << 31) | (v << (31 - k)); + //if (get_group_id(0) == 0 && qpos / 32 - task.encodingOffset / 32 == 5 && pos == 0) + // printf("(%08X |= %08X) tid == %d, qpos == %d, qval == %08X\n", data[qpos / 32 - start32], qval >> (qpos & 31), tid, qpos, qval); + // if (get_group_id(0) == 0 && pos == 0) + // { + // printf("[%08X] (%08X |= %08X) qval==%08x qpos==%08x\n", qpos / 32 - start32, data[qpos / 32 - start32], qval >> (qpos & 31), qval, qpos); + //if (qval << (32 - (qpos & 31)) != 0) + // printf("[%08X] (%08X |= %08X)\n", qpos / 32 - start32 + 1, data[qpos / 32 - start32+1], qval << (32 - (qpos & 31))); + // } + atom_or(&data[qpos / 32 - start32], qval >> (qpos & 31)); + if ((qpos & 31) != 0) + atom_or(&data[qpos / 32 - start32 + 1], qval << (32 - (qpos & 31))); + } + if (tid == GROUP_SIZE - 1) + start = mypos[tid]; + //if (get_group_id(0) == 0 && pos == 0) + // printf("[%d] == %d\n", tid, mypos[tid]); + //if (get_group_id(0) == 0 && pos == 0) + // printf("%d == %d\n", (((qpos % 32) / 8) * 16 + 7 - qpos % 32), (((qpos << 1) & 48) + 7 - qpos & 31)); + barrier(CLK_LOCAL_MEM_FENCE); + unsigned int bb = data[tid]; + if ((start32 + tid) * 32 <= start) + output[start32 + tid] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000); + //if (get_group_id(0) == 0 && pos == 0 && bb != 0) + // printf("[%08x] == %08X\n", 0x2dc8 + (tid + start32) * 4, data[tid]); + int remainder = data[start / 32 - start32]; + barrier(CLK_LOCAL_MEM_FENCE); + data[tid] = select(0, remainder, tid == 0); + //if (start / 32 - start32 > GROUP_SIZE) + // printf("buffer overflow: %d > %d\n", start / 32 - start32, GROUP_SIZE); + } + // if (tid == 0 && start != task.encodingOffset - task.headerLen + task.size) + //printf("size mismatch: %d != %d\n", start, task.encodingOffset - task.headerLen + task.size); #endif } +#endif /* DO_RICE */ +#endif /* DO_PARTITIONS */ #endif diff --git a/CUETools.FLACCL.cmd/Program.cs b/CUETools.FLACCL.cmd/Program.cs index 377e2ef..5b2ba58 100644 --- a/CUETools.FLACCL.cmd/Program.cs +++ b/CUETools.FLACCL.cmd/Program.cs @@ -77,7 +77,7 @@ namespace CUETools.FLACCL.cmd min_lpc_order = -1, max_lpc_order = -1, min_fixed_order = -1, max_fixed_order = -1, min_precision = -1, max_precision = -1, - orders_per_window = -1, + orders_per_window = -1, orders_per_channel = -1, blocksize = -1; int level = -1, padding = -1, vbr_mode = -1; bool do_seektable = true; @@ -99,6 +99,8 @@ namespace CUETools.FLACCL.cmd do_seektable = false; else if (args[arg] == "--slow-gpu") settings.GPUOnly = false; + else if (args[arg] == "--do-rice") + settings.DoRice = true; else if (args[arg] == "--no-md5") settings.DoMD5 = false; else if (args[arg] == "--buffered") @@ -155,6 +157,8 @@ namespace CUETools.FLACCL.cmd ok = (++arg < args.Length) && int.TryParse(args[arg], out vbr_mode); else if (args[arg] == "--orders-per-window") ok = (++arg < args.Length) && int.TryParse(args[arg], out orders_per_window); + else if (args[arg] == "--orders-per-channel") + ok = (++arg < args.Length) && int.TryParse(args[arg], out orders_per_channel); else if ((args[arg] == "-b" || args[arg] == "--blocksize") && ++arg < args.Length) ok = int.TryParse(args[arg], out blocksize); else if ((args[arg] == "-p" || args[arg] == "--padding") && ++arg < args.Length) @@ -248,6 +252,8 @@ namespace CUETools.FLACCL.cmd encoder.VBRMode = vbr_mode; if (orders_per_window >= 0) encoder.OrdersPerWindow = orders_per_window; + if (orders_per_channel >= 0) + encoder.OrdersPerChannel = orders_per_channel; encoder.DoSeekTable = do_seektable; } catch (Exception ex) @@ -327,7 +333,7 @@ namespace CUETools.FLACCL.cmd Console.Out.WriteLine("{0}\t{1}\t{2}\t{3}\t{4} ({5})\t{6}/{7}+{12}{13}\t{8}..{9}\t{10}\t{11}", encoder.TotalSize, encoder.UserProcessorTime.TotalSeconds > 0 ? encoder.UserProcessorTime.TotalSeconds : totalElapsed.TotalSeconds, - encoder.StereoMethod.ToString().PadRight(15), + (encoder.StereoMethod.ToString() + (encoder.OrdersPerChannel == 32 ? "" : "(" + encoder.OrdersPerChannel.ToString() + ")")).PadRight(15), encoder.WindowFunction.ToString().PadRight(15), encoder.MaxPartitionOrder, settings.GPUOnly ? "GPU" : "CPU",