diff --git a/CUETools.Codecs.FLACCL/FLACCLWriter.cs b/CUETools.Codecs.FLACCL/FLACCLWriter.cs index 2561350..37eb84a 100644 --- a/CUETools.Codecs.FLACCL/FLACCLWriter.cs +++ b/CUETools.Codecs.FLACCL/FLACCLWriter.cs @@ -2399,7 +2399,7 @@ namespace CUETools.Codecs.FLACCL residualTasksLen = sizeof(FLACCLSubframeTask) * 32 * channelsCount * FLACCLWriter.maxFrames; bestResidualTasksLen = sizeof(FLACCLSubframeTask) * channels * FLACCLWriter.maxFrames; int samplesBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channelsCount; - int residualBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channelsCount; // *channels! but need to adjust residualOffser + int residualBufferLen = sizeof(int) * FLACCLWriter.MAX_BLOCKSIZE * channels; // need to adjust residualOffset? int partitionsLen = sizeof(int) * (30 << 8) * channels * FLACCLWriter.maxFrames; int riceParamsLen = sizeof(int) * (4 << 8) * channels * FLACCLWriter.maxFrames; int autocorLen = sizeof(float) * (MAX_ORDER + 1) * lpc.MAX_LPC_WINDOWS * channelsCount * FLACCLWriter.maxFrames; @@ -2407,7 +2407,7 @@ namespace CUETools.Codecs.FLACCL int resOutLen = sizeof(int) * channelsCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * FLACCLWriter.maxFrames; int wndLen = sizeof(float) * FLACCLWriter.MAX_BLOCKSIZE /** 2*/ * lpc.MAX_LPC_WINDOWS; int selectedLen = sizeof(int) * 32 * channelsCount * FLACCLWriter.maxFrames; - int riceLen = sizeof(int) * channelsCount * FLACCLWriter.MAX_BLOCKSIZE; + int riceLen = sizeof(int) * channels * FLACCLWriter.MAX_BLOCKSIZE; if (!writer._settings.MappedMemory) { diff --git a/CUETools.Codecs.FLACCL/flac.cl b/CUETools.Codecs.FLACCL/flac.cl index 2ef8105..eff3dc9 100644 --- a/CUETools.Codecs.FLACCL/flac.cl +++ b/CUETools.Codecs.FLACCL/flac.cl @@ -1780,32 +1780,25 @@ void clRiceEncoding( barrier(CLK_LOCAL_MEM_FENCE); //if ((get_global_id(0) == 64 || get_global_id(0) == 63) && pos == 0) // printf("v=%x,k=%d,mylen=%d,mypos=%d,pstart=%d,partlen=%d\n", v, k, mylen, mypos[tid], pstart, partlen); - if (mylen > 0) + if (pstart && mylen) { - if (pstart) - { - int kpos = mypos[tid] - mylen; - unsigned int kval = (k << 28); - // if (get_group_id(0) == 0 && kpos / 32 - task.encodingOffset / 32 == 5 && pos == 0) - //printf("{%08X |= %08X}\n", data[kpos / 32 - start32], kval >> (kpos & 31)); - atom_or(&data[kpos / 32 - start32], kval >> (kpos & 31)); - if ((kpos & 31) != 0) - atom_or(&data[kpos / 32 - start32 + 1], kval << (32 - (kpos & 31))); - } - int qpos = mypos[tid] - k - 1; - unsigned int qval = (1U << 31) | (v << (31 - k)); - //if (get_group_id(0) == 0 && qpos / 32 - task.encodingOffset / 32 == 5 && pos == 0) - // printf("(%08X |= %08X) tid == %d, qpos == %d, qval == %08X\n", data[qpos / 32 - start32], qval >> (qpos & 31), tid, qpos, qval); - // if (get_group_id(0) == 0 && pos == 0) - // { - // printf("[%08X] (%08X |= %08X) qval==%08x qpos==%08x\n", qpos / 32 - start32, data[qpos / 32 - start32], qval >> (qpos & 31), qval, qpos); - //if (qval << (32 - (qpos & 31)) != 0) - // printf("[%08X] (%08X |= %08X)\n", qpos / 32 - start32 + 1, data[qpos / 32 - start32+1], qval << (32 - (qpos & 31))); - // } - atom_or(&data[qpos / 32 - start32], qval >> (qpos & 31)); - if ((qpos & 31) != 0) - atom_or(&data[qpos / 32 - start32 + 1], qval << (32 - (qpos & 31))); + int kpos = mypos[tid] - mylen; + int kpos0 = (kpos >> 5) - start32; + int kpos1 = kpos & 31; + unsigned int kval = k << 28; + unsigned int kval0 = kval >> kpos1; + unsigned int kval1 = select(0, kval << (32 - kpos1), kpos1); + atom_or(&data[kpos0], kval0); + atom_or(&data[kpos0 + 1], kval1); } + int qpos = mypos[tid] - k - 1; + int qpos0 = (qpos >> 5) - start32; + int qpos1 = qpos & 31; + unsigned int qval = select(0, (1U << 31) | (v << (31 - k)), mylen); + unsigned int qval0 = qval >> qpos1; + unsigned int qval1= select(0, qval << (32 - qpos1), qpos1); + atom_or(&data[qpos0], qval0); + atom_or(&data[qpos0 + 1], qval1); if (tid == GROUP_SIZE - 1) start = mypos[tid]; //if (get_group_id(0) == 0 && pos == 0)