// cuetools.net/CUETools.Codecs.FLACCL/flac.cl
/**
* CUETools.FLACCL: FLAC audio encoder using OpenCL
* Copyright (c) 2010-2020 Gregory S. Chudov
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef _FLACCL_KERNEL_H_
#define _FLACCL_KERNEL_H_
#if defined(__WinterPark__) || defined(__BeaverCreek__) || defined(__Turks__) || defined(__Caicos__) || defined(__Tahiti__) || defined(__Pitcairn__) || defined(__Capeverde__)
#define AMD
#elif defined(__Cayman__) || defined(__Barts__) || defined(__Cypress__) || defined(__Juniper__) || defined(__Redwood__) || defined(__Cedar__)
#define AMD
#elif defined(__ATI_RV770__) || defined(__ATI_RV730__) || defined(__ATI_RV710__)
#define AMD
#endif
#define VENDOR_ID_INTEL 0x8086
#define VENDOR_ID_NVIDIA 0x10DE
#define VENDOR_ID_ATIAMD 0x1002
#ifndef FLACCL_CPU
#if VENDOR_ID == VENDOR_ID_INTEL
#define WARP_SIZE 16
#else
#define WARP_SIZE 32
#endif
#endif
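// Assumption: WARP_SIZE is the SIMD width relied upon by the intra-warp reductions below
// (treated as 16 on Intel GPUs, 32 elsewhere); within a warp partial sums are combined without extra barriers.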
#if defined(HAVE_cl_khr_fp64) || defined(HAVE_cl_amd_fp64)
#define HAVE_DOUBLE
#define ZEROD 0.0
//#define FAST_DOUBLE
#else
#define double float
#define double4 float4
#define ZEROD 0.0f
#endif
#if defined(HAVE_DOUBLE) && defined(FAST_DOUBLE)
#define fastdouble double
#define fastdouble4 double4
#define ZEROFD 0.0
#else
#define fastdouble float
#define fastdouble4 float4
#define ZEROFD 0.0f
#endif
#if BITS_PER_SAMPLE > 16
#define MAX_RICE_PARAM 30
#define RICE_PARAM_BITS 5
#else
#define MAX_RICE_PARAM 14
#define RICE_PARAM_BITS 4
#endif
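// FLAC stores the Rice parameter in 4 bits (escape value 15) for standard streams and in
// 5 bits (escape value 31) for the extended range used with >16-bit samples, hence the 14/30 maximums.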
typedef enum
{
Constant = 0,
Verbatim = 1,
Fixed = 8,
LPC = 32
} SubframeType;
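// These values mirror the FLAC subframe type codes: 0 = CONSTANT, 1 = VERBATIM,
// 8 + order = FIXED, 32 + (order - 1) = LPC.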
typedef struct
{
int residualOrder; // <= 32
int samplesOffs;
int shift;
int cbits;
int size;
int type;
int obits;
int blocksize;
int coding_method;
int channel;
int residualOffs;
int wbits;
int abits;
int porder;
int headerLen;
int encodingOffset;
} FLACCLSubframeData;
typedef struct
{
FLACCLSubframeData data;
int coefs[32]; // fixme: should be short?
} FLACCLSubframeTask;
#if 0
__kernel void clWindowRectangle(__global float* window, int windowOffset)
{
window[get_global_id(0)] = 1.0f;
}
__kernel void clWindowFlattop(__global float* window, int windowOffset)
{
float p = M_PI_F * get_global_id(0) / (get_global_size(0) - 1);
window[get_global_id(0)] = 1.0f
- 1.93f * cos(2 * p)
+ 1.29f * cos(4 * p)
- 0.388f * cos(6 * p)
+ 0.0322f * cos(8 * p);
}
__kernel void clWindowTukey(__global float* window, int windowOffset, float p)
{
int tid = get_global_id(0);
int Np = (int)(p / 2.0f * get_global_size(0)) - 1;
int Np2 = tid - (get_global_size(0) - Np - 1) + Np;
int n = select(max(Np, Np2), tid, tid <= Np);
window[tid] = 0.5f - 0.5f * cos(M_PI_F * n / Np);
}
#endif
#if BITS_PER_SAMPLE > 16
__kernel void clStereoDecorr(
__global int *samples,
__global unsigned char *src,
int offset
)
{
int pos = get_global_id(0);
int bpos = pos * 6;
int x = (((int)src[bpos] << 8) | ((int)src[bpos+1] << 16) | ((int)src[bpos+2] << 24)) >> 8;
int y = (((int)src[bpos+3] << 8) | ((int)src[bpos+4] << 16) | ((int)src[bpos+5] << 24)) >> 8;
samples[pos] = x;
samples[1 * offset + pos] = y;
samples[2 * offset + pos] = (x + y) >> 1;
samples[3 * offset + pos] = x - y;
}
__kernel void clChannelDecorr2(
__global int *samples,
__global unsigned char *src,
int offset
)
{
int pos = get_global_id(0);
int bpos = pos * 6;
samples[pos] = (((int)src[bpos] << 8) | ((int)src[bpos+1] << 16) | ((int)src[bpos+2] << 24)) >> 8;
samples[offset + pos] = (((int)src[bpos+3] << 8) | ((int)src[bpos+4] << 16) | ((int)src[bpos+5] << 24)) >> 8;
}
__kernel void clChannelDecorrX(
__global int *samples,
__global unsigned char *src,
int offset
)
{
int pos = get_global_id(0);
for (int ch = 0; ch < MAX_CHANNELS; ch++)
{
int bpos = 3 * (pos * MAX_CHANNELS + ch);
samples[offset * ch + pos] = (((int)src[bpos] << 8) | ((int)src[bpos+1] << 16) | ((int)src[bpos+2] << 24)) >> 8;
}
}
#else
__kernel void clStereoDecorr(
__global int4 *samples,
__global int4 *src,
int offset
)
{
int pos = get_global_id(0);
int4 s = src[pos];
int4 x = (s << 16) >> 16;
int4 y = s >> 16;
samples[pos] = x;
samples[1 * offset + pos] = y;
samples[2 * offset + pos] = (x + y) >> 1;
samples[3 * offset + pos] = x - y;
}
__kernel void clChannelDecorr2(
__global int4 *samples,
__global int4 *src,
int offset
)
{
int pos = get_global_id(0);
int4 s = src[pos];
samples[pos] = (s << 16) >> 16;
samples[offset + pos] = s >> 16;
}
__kernel void clChannelDecorrX(
__global int *samples,
__global short *src,
int offset
)
{
int pos = get_global_id(0);
for (int ch = 0; ch < MAX_CHANNELS; ch++)
{
int bpos = pos * MAX_CHANNELS + ch;
samples[offset * ch + pos] = src[bpos];
}
}
#endif
//__kernel void clChannelDecorr(
// int *samples,
// short *src,
// int offset
//)
//{
// int pos = get_global_id(0);
// if (pos < offset)
// samples[get_group_id(1) * offset + pos] = src[pos * get_num_groups(1) + get_group_id(1)];
//}
#define __ffs(a) (32 - clz(a & (-a)))
//#define __ffs(a) (33 - clz(~a & (a - 1)))
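// __ffs(a): 1-based position of the least significant set bit; yields 0 for a == 0 because
// OpenCL defines clz(0) == 32. Used below to count wasted (always-zero) low-order bits.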
#ifdef FLACCL_CPU
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clFindWastedBits(
__global FLACCLSubframeTask *tasks,
__global int *samples,
int tasksPerChannel
)
{
__global FLACCLSubframeTask* ptask = &tasks[get_group_id(0) * tasksPerChannel];
int w = 0, a = 0;
for (int pos = 0; pos < ptask->data.blocksize; pos ++)
{
int smp = samples[ptask->data.samplesOffs + pos];
w |= smp;
a |= smp ^ (smp >> 31);
}
w = max(0,__ffs(w) - 1);
a = 32 - clz(a) - w;
for (int i = 0; i < tasksPerChannel; i++)
{
ptask[i].data.wbits = w;
ptask[i].data.abits = a;
//ptask[i].data.size = ptask[i].data.obits * ptask[i].data.blocksize;
}
}
#else
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clFindWastedBits(
__global FLACCLSubframeTask *tasks,
__global int *samples,
int tasksPerChannel
)
{
__local int abits[GROUP_SIZE];
__local int wbits[GROUP_SIZE];
__local FLACCLSubframeData task;
int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0) * tasksPerChannel].data))[tid];
barrier(CLK_LOCAL_MEM_FENCE);
int w = 0, a = 0;
for (int pos = tid; pos < task.blocksize; pos += GROUP_SIZE)
{
int smp = samples[task.samplesOffs + pos];
w |= smp;
a |= smp ^ (smp >> 31);
}
wbits[tid] = w;
abits[tid] = a;
barrier(CLK_LOCAL_MEM_FENCE);
for (int s = GROUP_SIZE / 2; s > 0; s >>= 1)
{
if (tid < s)
{
wbits[tid] |= wbits[tid + s];
abits[tid] |= abits[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
w = max(0,__ffs(wbits[0]) - 1);
a = 32 - clz(abits[0]) - w;
if (tid < tasksPerChannel)
{
int i = get_group_id(0) * tasksPerChannel + tid;
tasks[i].data.wbits = w;
tasks[i].data.abits = a;
//tasks[i].data.size = tasks[i].data.obits * tasks[i].data.blocksize;
}
}
#endif
#ifdef FLACCL_CPU
#define TEMPBLOCK 512
#define STORE_AC(ro, val) if (ro <= MAX_ORDER) pout[ro] = val;
#define STORE_AC4(ro, val) STORE_AC(ro*4+0, val##ro.x) STORE_AC(ro*4+1, val##ro.y) STORE_AC(ro*4+2, val##ro.z) STORE_AC(ro*4+3, val##ro.w)
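// STORE_AC/STORE_AC4 write the lanes of the accumulated double4 sums back to the output
// autocorrelation array, dropping any lane whose lag exceeds MAX_ORDER.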
// get_num_groups(0) == number of tasks
// get_num_groups(1) == number of windows
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clComputeAutocor(
__global float *output,
__global const int *samples,
__global const float *window,
__global FLACCLSubframeTask *tasks,
const int taskCount // tasks per block
)
{
FLACCLSubframeData task = tasks[get_group_id(0) * taskCount].data;
int len = task.blocksize;
int windowOffs = get_group_id(1) * len;
float data[TEMPBLOCK + MAX_ORDER + 3];
double4 ac0 = 0.0, ac1 = 0.0, ac2 = 0.0, ac3 = 0.0, ac4 = 0.0, ac5 = 0.0, ac6 = 0.0, ac7 = 0.0, ac8 = 0.0;
for (int pos = 0; pos < len; pos += TEMPBLOCK)
{
for (int tid = 0; tid < TEMPBLOCK + MAX_ORDER + 3; tid++)
data[tid] = tid < len - pos ? samples[task.samplesOffs + pos + tid] * window[windowOffs + pos + tid] : 0.0f;
for (int j = 0; j < TEMPBLOCK;)
{
float4 temp0 = 0.0f, temp1 = 0.0f, temp2 = 0.0f, temp3 = 0.0f, temp4 = 0.0f, temp5 = 0.0f, temp6 = 0.0f, temp7 = 0.0f, temp8 = 0.0f;
for (int k = 0; k < 32; k++)
{
float d0 = data[j];
temp0 += d0 * vload4(0, &data[j]);
temp1 += d0 * vload4(1, &data[j]);
#if MAX_ORDER >= 8
temp2 += d0 * vload4(2, &data[j]);
#if MAX_ORDER >= 12
temp3 += d0 * vload4(3, &data[j]);
#if MAX_ORDER >= 16
temp4 += d0 * vload4(4, &data[j]);
temp5 += d0 * vload4(5, &data[j]);
temp6 += d0 * vload4(6, &data[j]);
temp7 += d0 * vload4(7, &data[j]);
temp8 += d0 * vload4(8, &data[j]);
#endif
#endif
#endif
j++;
}
ac0 += convert_double4(temp0);
ac1 += convert_double4(temp1);
#if MAX_ORDER >= 8
ac2 += convert_double4(temp2);
#if MAX_ORDER >= 12
ac3 += convert_double4(temp3);
#if MAX_ORDER >= 16
ac4 += convert_double4(temp4);
ac5 += convert_double4(temp5);
ac6 += convert_double4(temp6);
ac7 += convert_double4(temp7);
ac8 += convert_double4(temp8);
#endif
#endif
#endif
}
}
__global float * pout = &output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1)];
STORE_AC4(0, ac) STORE_AC4(1, ac) STORE_AC4(2, ac) STORE_AC4(3, ac)
STORE_AC4(4, ac) STORE_AC4(5, ac) STORE_AC4(6, ac) STORE_AC4(7, ac)
STORE_AC4(8, ac)
}
#else
// get_num_groups(0) == number of tasks
// get_num_groups(1) == number of windows
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clComputeAutocor(
__global float *output,
__global const int *samples,
__global const float *window,
__global FLACCLSubframeTask *tasks,
const int taskCount // tasks per block
)
{
__local fastdouble data[GROUP_SIZE * 2];
__local FLACCLSubframeData task;
const int tid = get_local_id(0);
// fetch task data
if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(tasks + taskCount * get_group_id(0)))[tid];
barrier(CLK_LOCAL_MEM_FENCE);
int bs = task.blocksize;
data[tid] = ZEROFD;
const int THREADS_FOR_ORDERS = MAX_ORDER < 8 ? 8 : MAX_ORDER < 16 ? 16 : MAX_ORDER < 32 ? 32 : 64;
int lag = tid & (THREADS_FOR_ORDERS - 1);
int tid1 = tid + GROUP_SIZE - lag;
int pos = 0;
const __global float * wptr = &window[get_group_id(1) * bs];
// const __global int * sptr = &samples[task.samplesOffs];
double corr = ZEROD;
for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
{
int off = pos + tid;
// fetch samples
fastdouble nextData = samples[task.samplesOffs + off] * wptr[off];
data[tid + GROUP_SIZE] = nextData;
barrier(CLK_LOCAL_MEM_FENCE);
fastdouble4 tmp = ZEROFD;
for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
tmp += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
corr += (tmp.x + tmp.y) + (tmp.w + tmp.z);
barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData;
}
{
int off = pos + tid;
// fetch samples
double nextData = off < bs ? samples[task.samplesOffs + off] * wptr[off] : ZEROD;
data[tid + GROUP_SIZE] = nextData;
barrier(CLK_LOCAL_MEM_FENCE);
fastdouble4 tmp = ZEROFD;
for (int i = 0; i < THREADS_FOR_ORDERS / 4; i++)
tmp += vload4(i, &data[tid1 - lag]) * vload4(i, &data[tid1]);
corr += (tmp.x + tmp.y) + (tmp.w + tmp.z);
}
data[tid] = corr;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = GROUP_SIZE / 2; i >= THREADS_FOR_ORDERS; i >>= 1)
{
if (tid < i)
data[tid] += data[tid + i];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid <= MAX_ORDER)
output[(get_group_id(0) * get_num_groups(1) + get_group_id(1)) * (MAX_ORDER + 1) + tid] = data[tid];
}
#endif
#ifdef FLACCL_CPU
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clComputeLPC(
__global float *pautoc,
__global float *lpcs,
int windowCount
)
{
int lpcOffs = (get_group_id(0) + get_group_id(1) * windowCount) * (MAX_ORDER + 1) * 32;
int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
volatile double ldr[32];
volatile double gen0[32];
volatile double gen1[32];
volatile double err[32];
__global float* autoc = pautoc + autocOffs;
for (int i = 0; i < MAX_ORDER; i++)
{
gen0[i] = gen1[i] = autoc[i + 1];
ldr[i] = 0.0;
}
// Compute LPC using Schur and Levinson-Durbin recursion
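// The Schur recursion derives reflection coefficients (reff) directly from the autocorrelation,
// while the interleaved Levinson-Durbin step converts them into direct-form LPC coefficients;
// err[] keeps the modelling error per order for later model-order selection.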
double error = autoc[0];
for (int order = 0; order < MAX_ORDER; order++)
{
// Schur recursion
double reff = -gen1[0] / error;
//error += gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
error *= (1 - reff * reff);
for (int j = 0; j < MAX_ORDER - 1 - order; j++)
{
gen1[j] = gen1[j + 1] + reff * gen0[j];
gen0[j] = gen1[j + 1] * reff + gen0[j];
}
err[order] = error;
// Levinson-Durbin recursion
ldr[order] = reff;
for (int j = 0; j < order / 2; j++)
{
double tmp = ldr[j];
ldr[j] += reff * ldr[order - 1 - j];
ldr[order - 1 - j] += reff * tmp;
}
if (0 != (order & 1))
ldr[order / 2] += ldr[order / 2] * reff;
// Output coeffs
for (int j = 0; j <= order; j++)
lpcs[lpcOffs + order * 32 + j] = -ldr[order - j];
}
// Output prediction error estimates
for (int j = 0; j < MAX_ORDER; j++)
lpcs[lpcOffs + MAX_ORDER * 32 + j] = err[j];
}
#else
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
void clComputeLPC(
__global float *autoc,
__global float *lpcs,
int windowCount
)
{
__local struct {
volatile double ldr[32];
volatile double gen1[32];
volatile float error[32];
volatile float autoc[33];
} shared;
const int tid = get_local_id(0);// + get_local_id(1) * 32;
int autocOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1);
int lpcOffs = autocOffs * 32;
shared.autoc[get_local_id(0)] = get_local_id(0) <= MAX_ORDER ? autoc[autocOffs + get_local_id(0)] : 0;
if (get_local_id(0) + get_local_size(0) <= MAX_ORDER)
shared.autoc[get_local_id(0) + get_local_size(0)] = autoc[autocOffs + get_local_id(0) + get_local_size(0)];
barrier(CLK_LOCAL_MEM_FENCE);
// Compute LPC using Schur and Levinson-Durbin recursion
double gen0 = shared.gen1[get_local_id(0)] = shared.autoc[get_local_id(0)+1];
shared.ldr[get_local_id(0)] = ZEROD;
double error = shared.autoc[0];
#ifdef DEBUGPRINT1
int magic = autocOffs == 0; // shared.autoc[0] == 177286873088.0f;
if (magic && get_local_id(0) <= MAX_ORDER)
printf("autoc[%d] == %f\n", get_local_id(0), shared.autoc[get_local_id(0)]);
#endif
barrier(CLK_LOCAL_MEM_FENCE);
for (int order = 0; order < MAX_ORDER; order++)
{
// Schur recursion
double reff = -shared.gen1[0] / error;
//error += shared.gen1[0] * reff; // Equivalent to error *= (1 - reff * reff);
error *= (1 - reff * reff);
double gen1;
if (get_local_id(0) < MAX_ORDER - 1 - order)
{
gen1 = shared.gen1[get_local_id(0) + 1] + reff * gen0;
gen0 += shared.gen1[get_local_id(0) + 1] * reff;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < MAX_ORDER - 1 - order)
shared.gen1[get_local_id(0)] = gen1;
#ifdef DEBUGPRINT1
if (magic && get_local_id(0) == 0)
printf("order == %d, reff == %f, error = %f\n", order, reff, error);
if (magic && get_local_id(0) <= MAX_ORDER)
printf("gen[%d] == %f, %f\n", get_local_id(0), gen0, gen1);
#endif
// Store prediction error
if (get_local_id(0) == 0)
shared.error[order] = error;
// Levinson-Durbin recursion
double ldr = shared.ldr[get_local_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < order)
shared.ldr[order - 1 - get_local_id(0)] += reff * ldr;
if (get_local_id(0) == order)
shared.ldr[get_local_id(0)] += reff;
barrier(CLK_LOCAL_MEM_FENCE);
// Output coeffs
if (get_local_id(0) <= order)
lpcs[lpcOffs + order * 32 + get_local_id(0)] = -shared.ldr[order - get_local_id(0)];
//if (get_local_id(0) <= order + 1 && fabs(-shared.ldr[0]) > 3000)
// printf("coef[%d] == %f, autoc == %f, error == %f\n", get_local_id(0), -shared.ldr[order - get_local_id(0)], shared.autoc[get_local_id(0)], shared.error[get_local_id(0)]);
}
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef DEBUGPRINT1
if (magic && get_local_id(0) < MAX_ORDER)
printf("error[%d] == %f\n", get_local_id(0), shared.error[get_local_id(0)]);
#endif
// Output prediction error estimates
if (get_local_id(0) < MAX_ORDER)
lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)] = shared.error[get_local_id(0)];
}
#endif
#ifdef FLACCL_CPU
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clQuantizeLPC(
__global FLACCLSubframeTask *tasks,
__global float*lpcs,
int taskCount, // tasks per block
int taskCountLPC, // tasks per set of coeffs (<= 32)
int minprecision,
int precisions
)
{
int bs = tasks[get_group_id(1) * taskCount].data.blocksize;
int abits = tasks[get_group_id(1) * taskCount].data.abits;
int obits = tasks[get_group_id(1) * taskCount].data.obits;
int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
float error[MAX_ORDER];
int best_orders[MAX_ORDER];
int best8 = 0;
// Load prediction error estimates based on Akaike's Criteria
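// The cost model appears to be AIC-like: expected residual cost grows with bs * log(error),
// and each extra coefficient adds roughly 4.12 * log(bs) bits of overhead
// (the 4.12 factor looks like an empirical tuning constant).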
for (int tid = 0; tid < MAX_ORDER; tid++)
{
error[tid] = bs * log(1.0f + lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)bs);
best_orders[tid] = tid;
if (tid < 8 && error[tid] < error[best8])
best8 = tid;
}
#if 0
for (int i = best8 + 1; i < MAX_ORDER; i++)
error[i] += 20.5f * log((float)bs);
#endif
// Select best orders
for (int i = 0; i < MAX_ORDER && i < taskCountLPC; i++)
{
for (int j = i + 1; j < MAX_ORDER; j++)
{
if (error[best_orders[j]] < error[best_orders[i]])
{
int tmp = best_orders[j];
best_orders[j] = best_orders[i];
best_orders[i] = tmp;
}
}
}
// Quantization
for (int i = 0; i < taskCountLPC; i ++)
{
int order = best_orders[i >> precisions];
int tmpi = 0;
for (int tid = 0; tid <= order; tid ++)
{
float lpc = lpcs[lpcOffs + order * 32 + tid];
// get 15 bits of each coeff
int c = convert_int_rte(lpc * (1 << 15));
// remove sign bits
tmpi |= c ^ (c >> 31);
}
// choose precision
//int cbits = max(3, min(10, 5 + (abits >> 1))); // - convert_int_rte(shared.PE[order - 1])
#if BITS_PER_SAMPLE > 16
int cbits = max(3, min(15 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits));
#else
int cbits = max(3, min(min(13 - minprecision + (i - ((i >> precisions) << precisions)) - (bs <= 2304) - (bs <= 1152) - (bs <= 576), abits), clz(order) + 1 - obits));
#endif
// calculate shift based on precision and number of leading zeroes in coeffs
int shift = max(0,min(15, clz(tmpi) - 18 + cbits));
int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
tmpi = 0;
for (int tid = 0; tid <= order; tid ++)
{
float lpc = lpcs[lpcOffs + order * 32 + tid];
// quantize coeffs with given shift
int c = convert_int_rte(clamp(lpc * (1 << shift), (float)((-1 << (cbits - 1)) + 1), (float)((1 << (cbits - 1)) - 1)));
// remove sign bits
tmpi |= c ^ (c >> 31);
tasks[taskNo].coefs[tid] = c;
}
// calculate actual number of bits (+1 for sign)
cbits = 1 + 32 - clz(tmpi);
// output shift, cbits, ro
tasks[taskNo].data.shift = shift;
tasks[taskNo].data.cbits = cbits;
tasks[taskNo].data.residualOrder = order + 1;
}
}
#else
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
void clQuantizeLPC(
__global FLACCLSubframeTask *tasks,
__global float*lpcs,
int taskCount, // tasks per block
int taskCountLPC, // tasks per set of coeffs (<= 32)
int minprecision,
int precisions
)
{
__local struct {
FLACCLSubframeData task;
volatile int index[64];
volatile float error[64];
volatile int maxcoef[32];
// volatile int best8;
} shared;
const int tid = get_local_id(0);
// fetch task data
if (tid < sizeof(shared.task) / sizeof(int))
((__local int*)&shared.task)[tid] = ((__global int*)(tasks + get_group_id(1) * taskCount))[tid];
barrier(CLK_LOCAL_MEM_FENCE);
const int lpcOffs = (get_group_id(0) + get_group_id(1) * get_num_groups(0)) * (MAX_ORDER + 1) * 32;
// Select best orders based on Akaike's Criteria
shared.index[tid] = min(MAX_ORDER - 1, tid);
shared.error[tid] = shared.task.blocksize * 64 + tid;
shared.index[32 + tid] = MAX_ORDER - 1;
shared.error[32 + tid] = shared.task.blocksize * 64 + tid + 32;
shared.maxcoef[tid] = 0;
// Load prediction error estimates
if (tid < MAX_ORDER)
shared.error[tid] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + tid]) + tid * 4.12f * log((float)shared.task.blocksize);
//shared.error[get_local_id(0)] = shared.task.blocksize * log(lpcs[lpcOffs + MAX_ORDER * 32 + get_local_id(0)]) + get_local_id(0) * 0.30f * (shared.task.abits + 1) * log(shared.task.blocksize);
#if 0
if (tid == 0)
{
int b8 = 0;
for (int i = 1; i < 8; i++)
if (shared.error[i] < shared.error[b8])
b8 = i;
shared.best8 = b8;
}
shared.error[tid] += select(0.0f, 20.5f * log((float)shared.task.blocksize), tid > shared.best8);
#endif
barrier(CLK_LOCAL_MEM_FENCE);
// Sort using bitonic sort
for(int size = 2; size < 64; size <<= 1){
//Bitonic merge
int ddd = (tid & (size / 2)) == 0;
for(int stride = size / 2; stride > 0; stride >>= 1){
int pos = 2 * tid - (tid & (stride - 1));
float e0 = shared.error[pos];
float e1 = shared.error[pos + stride];
int i0 = shared.index[pos];
int i1 = shared.index[pos + stride];
barrier(CLK_LOCAL_MEM_FENCE);
if ((e0 >= e1) == ddd)
{
shared.error[pos] = e1;
shared.error[pos + stride] = e0;
shared.index[pos] = i1;
shared.index[pos + stride] = i0;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
//ddd == dir for the last bitonic merge step
{
for(int stride = 32; stride > 0; stride >>= 1){
//barrier(CLK_LOCAL_MEM_FENCE);
int pos = 2 * tid - (tid & (stride - 1));
float e0 = shared.error[pos];
float e1 = shared.error[pos + stride];
int i0 = shared.index[pos];
int i1 = shared.index[pos + stride];
barrier(CLK_LOCAL_MEM_FENCE);
if (e0 >= e1)
{
shared.error[pos] = e1;
shared.error[pos + stride] = e0;
shared.index[pos] = i1;
shared.index[pos + stride] = i0;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
//shared.index[tid] = MAX_ORDER - 1;
//barrier(CLK_LOCAL_MEM_FENCE);
// Quantization
for (int i = 0; i < taskCountLPC; i ++)
{
int order = shared.index[i >> precisions];
float lpc = tid <= order ? lpcs[lpcOffs + order * 32 + tid] : 0.0f;
// get 15 bits of each coeff
int coef = convert_int_rte(lpc * (1 << 15));
// remove sign bits
atomic_or(shared.maxcoef + i, coef ^ (coef >> 31));
barrier(CLK_LOCAL_MEM_FENCE);
int cbits = min(51 - 2 * clz(shared.task.blocksize), shared.task.abits) - minprecision + (i - ((i >> precisions) << precisions));
#if BITS_PER_SAMPLE <= 16
// Limit cbits so that 32-bit arithmetic will be enough when calculating the residual
// (1 << (obits - 1)) * ((1 << (cbits - 1)) - 1) * (order + 1) < (1 << 31)
// (1 << (cbits - 1)) - 1 < (1 << (32 - obits)) / (order + 1)
// (1 << (cbits - 1)) <= (1 << (32 - obits)) / (order + 1)
// (1 << (cbits - 1)) <= (1 << (32 - obits - (32 - clz(order))) <= (1 << (32 - obits)) / (order + 1)
// (1 << (cbits - 1)) <= (1 << (clz(order) - obits))
// cbits - 1 <= clz(order) - obits
// cbits <= clz(order) - obits + 1
cbits = min(cbits, clz(order) + 1 - shared.task.obits);
#endif
cbits = clamp(cbits, 3, 15);
// Calculate shift based on precision and number of leading zeroes in coeffs.
// We know that if shifted by 15, coefs require
// 33 - clz(shared.maxcoef[i]) bits;
// So to get the desired cbits, we need to shift coefs by
// 15 + cbits - (33 - clz(shared.maxcoef[i]));
int shift = clamp(clz(shared.maxcoef[i]) - 18 + cbits, 0, 15);
int lim = (1 << (cbits - 1)) - 1;
coef = clamp(convert_int_rte(lpc * (1 << shift)), -lim, lim);
// output shift, cbits and output coeffs
int taskNo = get_group_id(1) * taskCount + get_group_id(0) * taskCountLPC + i;
if (tid == 0)
tasks[taskNo].data.shift = shift;
if (tid == 0)
tasks[taskNo].data.cbits = cbits;
if (tid == 0)
tasks[taskNo].data.residualOrder = order + 1;
if (tid <= order)
tasks[taskNo].coefs[tid] = coef;
}
}
#endif
#ifdef FLACCL_CPU
#define TEMPBLOCK1 TEMPBLOCK
__kernel __attribute__(( vec_type_hint (int4))) __attribute__((reqd_work_group_size(1, 1, 1)))
void clEstimateResidual(
__global int*samples,
__global int*selectedTasks,
__global FLACCLSubframeTask *tasks
)
{
int selectedTask = selectedTasks[get_group_id(0)];
FLACCLSubframeTask task = tasks[selectedTask];
int ro = task.data.residualOrder;
int bs = task.data.blocksize;
#define ERPARTS (MAX_BLOCKSIZE >> 6)
float len[ERPARTS]; // blocksize / 64!!!!
__global int *data = &samples[task.data.samplesOffs];
for (int i = 0; i < ERPARTS; i++)
len[i] = 0.0f;
if (ro <= 4)
{
float fcoef[4];
for (int tid = 0; tid < 4; tid++)
fcoef[tid] = tid + ro - 4 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 4]) / (1 << task.data.shift);
float4 fc0 = vload4(0, &fcoef[0]);
float fdata[4];
for (int pos = 0; pos < 4; pos++)
fdata[pos] = pos + ro - 4 < 0 ? 0.0f : (float)(data[pos + ro - 4] >> task.data.wbits);
float4 fd0 = vload4(0, &fdata[0]);
for (int pos = ro; pos < bs; pos ++)
{
float4 sum4 = fc0 * fd0;
float2 sum2 = sum4.s01 + sum4.s23;
fd0 = fd0.s1230;
fd0.s3 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd0.s3 + (sum2.x + sum2.y));
}
}
else if (ro <= 8)
{
float fcoef[8];
for (int tid = 0; tid < 8; tid++)
fcoef[tid] = tid + ro - 8 < 0 ? 0.0f : - ((float) task.coefs[tid + ro - 8]) / (1 << task.data.shift);
float8 fc0 = vload8(0, &fcoef[0]);
float fdata[8];
for (int pos = 0; pos < 8; pos++)
fdata[pos] = pos + ro - 8 < 0 ? 0.0f : (float)(data[pos + ro - 8] >> task.data.wbits);
float8 fd0 = vload8(0, &fdata[0]);
for (int pos = ro; pos < bs; pos ++)
{
float8 sum8 = fc0 * fd0;
float4 sum4 = sum8.s0123 + sum8.s4567;
float2 sum2 = sum4.s01 + sum4.s23;
fd0 = fd0.s12345670;
fd0.s7 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd0.s7 + (sum2.x + sum2.y));
}
}
else if (ro <= 12)
{
float fcoef[12];
for (int tid = 0; tid < 12; tid++)
fcoef[tid] = tid + ro - 12 >= 0 ? - ((float) task.coefs[tid + ro - 12]) / (1 << task.data.shift) : 0.0f;
float4 fc0 = vload4(0, &fcoef[0]);
float4 fc1 = vload4(1, &fcoef[0]);
float4 fc2 = vload4(2, &fcoef[0]);
float fdata[12];
for (int pos = 0; pos < 12; pos++)
fdata[pos] = pos + ro - 12 < 0 ? 0.0f : (float)(data[pos + ro - 12] >> task.data.wbits);
float4 fd0 = vload4(0, &fdata[0]);
float4 fd1 = vload4(1, &fdata[0]);
float4 fd2 = vload4(2, &fdata[0]);
for (int pos = ro; pos < bs; pos ++)
{
float4 sum4 = fc0 * fd0 + fc1 * fd1 + fc2 * fd2;
float2 sum2 = sum4.s01 + sum4.s23;
fd0 = fd0.s1230;
fd1 = fd1.s1230;
fd2 = fd2.s1230;
fd0.s3 = fd1.s3;
fd1.s3 = fd2.s3;
fd2.s3 = (float)(data[pos] >> task.data.wbits);
len[pos >> 6] += fabs(fd2.s3 + (sum2.x + sum2.y));
}
}
else
{
float fcoef[32];
for (int tid = 0; tid < 32; tid++)
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
float4 fc0 = vload4(0, &fcoef[0]);
float4 fc1 = vload4(1, &fcoef[0]);
float4 fc2 = vload4(2, &fcoef[0]);
float fdata[MAX_ORDER + TEMPBLOCK1 + 32];
for (int pos = 0; pos < MAX_ORDER; pos++)
fdata[pos] = 0.0f;
for (int pos = MAX_ORDER + TEMPBLOCK1; pos < MAX_ORDER + TEMPBLOCK1 + 32; pos++)
fdata[pos] = 0.0f;
for (int bpos = 0; bpos < bs; bpos += TEMPBLOCK1)
{
int end = min(bpos + TEMPBLOCK1, bs);
for (int pos = max(bpos - ro, 0); pos < max(bpos, ro); pos++)
fdata[MAX_ORDER + pos - bpos] = (float)(data[pos] >> task.data.wbits);
for (int pos = max(bpos, ro); pos < end; pos ++)
{
float next = (float)(data[pos] >> task.data.wbits);
float * dptr = fdata + pos - bpos;
dptr[MAX_ORDER] = next;
float4 sum
= fc0 * vload4(0, dptr)
+ fc1 * vload4(1, dptr)
#if MAX_ORDER > 8
+ fc2 * vload4(2, dptr)
#if MAX_ORDER > 12
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
#if MAX_ORDER > 16
+ vload4(4, &fcoef[0]) * vload4(4, dptr)
+ vload4(5, &fcoef[0]) * vload4(5, dptr)
+ vload4(6, &fcoef[0]) * vload4(6, dptr)
+ vload4(7, &fcoef[0]) * vload4(7, dptr)
#endif
#endif
#endif
;
next += sum.x + sum.y + sum.z + sum.w;
len[pos >> 6] += fabs(next);
}
}
}
int total = 0;
for (int i = 0; i < ERPARTS; i++)
{
int res = convert_int_sat_rte(len[i] * 2);
int k = clamp(31 - clz(res) - 6, 0, MAX_RICE_PARAM); // 25 - clz(res) == clz(64) - clz(res) == log2(res / 64)
total += (k << 6) + (res >> k);
}
int partLen = min(0x7ffffff, total) + (bs - ro);
int obits = task.data.obits - task.data.wbits;
tasks[selectedTask].data.size = min(obits * bs,
task.data.type == Fixed ? ro * obits + 6 + RICE_PARAM_BITS + partLen :
task.data.type == LPC ? ro * obits + 4 + 5 + ro * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + partLen :
task.data.type == Constant ? obits * select(1, bs, partLen != bs - ro) :
obits * bs);
}
#else
#define ESTPARTLOG 5
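// Residual magnitudes are accumulated per partition of (1 << ESTPARTLOG) = 32 samples;
// pairs of partitions are later merged to estimate the Rice-coded size.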
__kernel /*__attribute__(( vec_type_hint (int4)))*/ __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clEstimateResidual(
__global int*samples,
__global int*selectedTasks,
__global FLACCLSubframeTask *tasks
)
{
__local float data[GROUP_SIZE * 2 + 32];
#if !defined(AMD)
__local volatile uint idata[GROUP_SIZE + 16];
#endif
__local FLACCLSubframeTask task;
__local uint psum[MAX_BLOCKSIZE >> ESTPARTLOG];
__local float fcoef[32];
__local int selectedTask;
if (get_local_id(0) == 0)
selectedTask = selectedTasks[get_group_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
const int tid = get_local_id(0);
if (tid < sizeof(task)/sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[selectedTask]))[tid];
barrier(CLK_LOCAL_MEM_FENCE);
int ro = task.data.residualOrder;
int bs = task.data.blocksize;
if (tid < 32)
//fcoef[tid] = select(0.0f, - ((float) task.coefs[tid]) / (1 << task.data.shift), tid < ro);
fcoef[tid] = tid < MAX_ORDER && tid + ro - MAX_ORDER >= 0 ? - ((float) task.coefs[tid + ro - MAX_ORDER]) / (1 << task.data.shift) : 0.0f;
for (int offs = tid; offs < (MAX_BLOCKSIZE >> ESTPARTLOG); offs += GROUP_SIZE)
psum[offs] = 0;
2010-10-25 04:50:36 +00:00
data[tid] = 0.0f;
// need to initialize "extra" data, because NaNs can produce weird results even when multiplied by zero extra coefs
if (tid < 32)
data[GROUP_SIZE * 2 + tid] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
float4 fc0 = vload4(0, &fcoef[0]);
float4 fc1 = vload4(1, &fcoef[0]);
#if MAX_ORDER > 8
float4 fc2 = vload4(2, &fcoef[0]);
#endif
__global int * rptr = &samples[task.data.samplesOffs];
int wb = task.data.wbits;
int pos;
for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
{
// fetch samples
int offs = pos + tid;
float nextData = rptr[offs] >> wb;
data[tid + GROUP_SIZE] = nextData;
barrier(CLK_LOCAL_MEM_FENCE);
// compute residual
__local float* dptr = &data[tid + GROUP_SIZE - MAX_ORDER];
float4 sum4
= fc0 * vload4(0, dptr)
+ fc1 * vload4(1, dptr)
#if MAX_ORDER > 8
+ fc2 * vload4(2, dptr)
#if MAX_ORDER > 12
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
#if MAX_ORDER > 16
+ vload4(4, &fcoef[0]) * vload4(4, dptr)
+ vload4(5, &fcoef[0]) * vload4(5, dptr)
+ vload4(6, &fcoef[0]) * vload4(6, dptr)
+ vload4(7, &fcoef[0]) * vload4(7, dptr)
#endif
#endif
#endif
;
float2 sum2 = sum4.s01 + sum4.s23;
int it = convert_int_sat_rte(nextData + (sum2.s0 + sum2.s1));
// int t = (int)(nextData + sum.x + sum.y + sum.z + sum.w);
barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData;
// convert to unsigned
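// (zig-zag mapping 0,-1,1,-2,2,... -> 0,1,2,3,4,..., matching how FLAC Rice-codes signed residuals)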
uint t = (it << 1) ^ (it >> 31);
// ensure we're within frame bounds
t = select(0U, t, offs >= ro);
// overflow protection
t = min(t, 0x7ffffffU);
#if defined(AMD)
atomic_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
#else
idata[tid] = t;
#if WARP_SIZE <= (1 << (ESTPARTLOG - 1))
barrier(CLK_LOCAL_MEM_FENCE);
for (int l = 1 << (ESTPARTLOG - 1); l >= WARP_SIZE; l >>= 1) {
if (!(tid & l)) idata[tid] += idata[tid + l];
barrier(CLK_LOCAL_MEM_FENCE);
}
for (int l = WARP_SIZE / 2; l > 1; l >>= 1)
idata[tid] += idata[tid + l];
#else
for (int l = 1 << (ESTPARTLOG - 1); l > 1; l >>= 1)
idata[tid] += idata[tid + l];
#endif
if ((tid & (1 << ESTPARTLOG) - 1) == 0)
psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
#endif
}
if (pos < bs)
2010-09-20 05:32:05 +00:00
{
// fetch samples
int offs = pos + tid;
float nextData = offs < bs ? rptr[offs] >> wb : 0.0f;
data[tid + GROUP_SIZE] = nextData;
barrier(CLK_LOCAL_MEM_FENCE);
// compute residual
__local float* dptr = &data[tid + GROUP_SIZE - MAX_ORDER];
float4 sum
= fc0 * vload4(0, dptr)
+ fc1 * vload4(1, dptr)
#if MAX_ORDER > 8
+ fc2 * vload4(2, dptr)
#if MAX_ORDER > 12
+ vload4(3, &fcoef[0]) * vload4(3, dptr)
#if MAX_ORDER > 16
+ vload4(4, &fcoef[0]) * vload4(4, dptr)
+ vload4(5, &fcoef[0]) * vload4(5, dptr)
+ vload4(6, &fcoef[0]) * vload4(6, dptr)
+ vload4(7, &fcoef[0]) * vload4(7, dptr)
#endif
#endif
#endif
;
int it = convert_int_sat_rte(nextData + sum.x + sum.y + sum.z + sum.w);
barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData;
// convert to unsigned
uint t = (it << 1) ^ (it >> 31);
// ensure we're within frame bounds
t = select(0U, t, offs >= ro && offs < bs);
// overflow protection
t = min(t, 0x7ffffffU);
#if defined(AMD)
atomic_add(&psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG], t);
#else
idata[tid] = t;
#if WARP_SIZE <= (1 << (ESTPARTLOG - 1))
barrier(CLK_LOCAL_MEM_FENCE);
for (int l = 1 << (ESTPARTLOG - 1); l >= WARP_SIZE; l >>= 1) {
if (!(tid & l)) idata[tid] += idata[tid + l];
barrier(CLK_LOCAL_MEM_FENCE);
}
for (int l = WARP_SIZE / 2; l > 1; l >>= 1)
idata[tid] += idata[tid + l];
#else
for (int l = 1 << (ESTPARTLOG - 1); l > 1; l >>= 1)
idata[tid] += idata[tid + l];
#endif
if ((tid & (1 << ESTPARTLOG) - 1) == 0)
psum[min(MAX_BLOCKSIZE - 1, offs) >> ESTPARTLOG] = idata[tid] + idata[tid + 1];
#endif
}
// estimate rice partition bit length for every 64 samples (pairs of 32-sample partial sums)
barrier(CLK_LOCAL_MEM_FENCE);
#if (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) > GROUP_SIZE
#error MAX_BLOCKSIZE is too large for this GROUP_SIZE
#endif
uint pl = tid < (MAX_BLOCKSIZE >> (ESTPARTLOG + 1)) ? psum[tid * 2] + psum[tid * 2 + 1] : 0;
barrier(CLK_LOCAL_MEM_FENCE);
// for (int pos = 0; pos < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2; pos += GROUP_SIZE)
// {
//int offs = pos + tid;
//int pl = offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2 ? psum[offs * 2] + psum[offs * 2 + 1] : 0;
////int pl = psum[offs * 2] + psum[offs * 2 + 1];
//barrier(CLK_LOCAL_MEM_FENCE);
//if (offs < (MAX_BLOCKSIZE >> ESTPARTLOG) / 2)
// psum[offs] = pl;
// }
int k = clamp(31 - (int)clz(pl) - (ESTPARTLOG + 1), 0, MAX_RICE_PARAM); // 25 - clz(pl) == clz(64) - clz(pl) == log2(pl / 64), since pl covers 1 << (ESTPARTLOG + 1) samples
if (tid < MAX_BLOCKSIZE >> (ESTPARTLOG + 1))
psum[tid] = (k << (ESTPARTLOG + 1)) + (pl >> k);
barrier(CLK_LOCAL_MEM_FENCE);
for (int l = MAX_BLOCKSIZE >> (ESTPARTLOG + 2); l > 0; l >>= 1)
2010-10-10 23:28:38 +00:00
{
if (tid < l)
2010-10-25 04:50:36 +00:00
psum[tid] += psum[tid + l];
2010-10-10 23:28:38 +00:00
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid == 0)
{
int pl = (int)psum[0] + (bs - ro);
int obits = task.data.obits - task.data.wbits;
int len = min(obits * task.data.blocksize,
task.data.type == Fixed ? task.data.residualOrder * obits + 6 + RICE_PARAM_BITS + pl :
task.data.type == LPC ? task.data.residualOrder * obits + 4 + 5 + task.data.residualOrder * task.data.cbits + 6 + RICE_PARAM_BITS/* << porder */ + pl :
task.data.type == Constant ? obits * select(1, task.data.blocksize, pl != task.data.blocksize - task.data.residualOrder) :
obits * task.data.blocksize);
tasks[selectedTask].data.size = len;
}
}
#endif
__kernel
void clSelectStereoTasks(
2010-09-20 05:32:05 +00:00
__global FLACCLSubframeTask *tasks,
__global int*selectedTasks,
__global int*selectedTasksSecondEstimate,
__global int*selectedTasksBestMethod,
int tasksWindow,
int windowCount,
int tasksToSecondEstimate,
int taskCount,
int selectedCount
2010-09-20 05:32:05 +00:00
)
{
int best_size[4];
int best_wind[4];
for (int ch = 0; ch < 4; ch++)
{
int first_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount];
int best_len = tasks[first_no].data.size;
int best_wnd = 0;
for (int i = 1; i < selectedCount; i++)
{
int task_no = selectedTasks[(get_global_id(0) * 4 + ch) * selectedCount + i];
int task_len = tasks[task_no].data.size;
int task_wnd = (task_no - first_no) / tasksWindow;
task_wnd = select(0, task_wnd, task_wnd < windowCount);
best_wnd = select(best_wnd, task_wnd, task_len < best_len);
best_len = min(task_len, best_len);
}
best_size[ch] = best_len;
best_wind[ch] = best_wnd;
}
int bitsBest = best_size[2] + best_size[3]; // MidSide
int chMask = 2 | (3 << 2);
int bits = best_size[3] + best_size[1];
chMask = select(chMask, 3 | (1 << 2), bits < bitsBest); // RightSide
bitsBest = min(bits, bitsBest);
bits = best_size[0] + best_size[3];
chMask = select(chMask, 0 | (3 << 2), bits < bitsBest); // LeftSide
bitsBest = min(bits, bitsBest);
bits = best_size[0] + best_size[1];
chMask = select(chMask, 0 | (1 << 2), bits < bitsBest); // LeftRight
bitsBest = min(bits, bitsBest);
for (int ich = 0; ich < 2; ich++)
{
int ch = select(chMask & 3, chMask >> 2, ich > 0);
int roffs = tasks[(get_global_id(0) * 4 + ich) * taskCount].data.samplesOffs;
int nonSelectedNo = 0;
for (int j = taskCount - 1; j >= 0; j--)
2010-11-05 16:28:24 +00:00
{
int i = select(j, (j % windowCount) * tasksWindow + (j / windowCount), j < windowCount * tasksWindow);
int no = (get_global_id(0) * 4 + ch) * taskCount + i;
selectedTasksBestMethod[(get_global_id(0) * 2 + ich) * taskCount + i] = no;
tasks[no].data.residualOffs = roffs;
if (j >= selectedCount)
tasks[no].data.size = 0x7fffffff;
if (nonSelectedNo < tasksToSecondEstimate)
if (tasksToSecondEstimate == taskCount - selectedCount || best_wind[ch] == i / tasksWindow || i >= windowCount * tasksWindow)
selectedTasksSecondEstimate[(get_global_id(0) * 2 + ich) * tasksToSecondEstimate + nonSelectedNo++] = no;
}
}
}
__kernel
void clChooseBestMethod(
2010-09-20 05:32:05 +00:00
__global FLACCLSubframeTask *tasks_out,
__global FLACCLSubframeTask *tasks,
__global int*selectedTasks,
int taskCount
2010-09-20 05:32:05 +00:00
)
{
int best_no = selectedTasks[get_global_id(0) * taskCount];
int best_len = tasks[best_no].data.size;
for (int i = 1; i < taskCount; i++)
{
int task_no = selectedTasks[get_global_id(0) * taskCount + i];
int task_len = tasks[task_no].data.size;
best_no = select(best_no, task_no, task_len < best_len);
best_len = min(best_len, task_len);
}
tasks_out[get_global_id(0)] = tasks[best_no];
}
#ifdef DO_PARTITIONS
#if BITS_PER_SAMPLE > 16
#define residual_t long
#define convert_bps_sat convert_int_sat
#else
#define residual_t int
#define convert_bps_sat
#endif
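// With >16-bit samples the predictor accumulator can exceed 32 bits, so residual_t is widened to long;
// for 16-bit input the cbits limit applied in clQuantizeLPC keeps 32-bit arithmetic sufficient.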
#ifdef FLACCL_CPU
inline residual_t calc_residual(__global int *ptr, int * coefs, int ro)
{
residual_t sum = 0;
for (int i = 0; i < ro; i++)
sum += (residual_t)ptr[i] * coefs[i];
//sum += upsample(mul_hi(ptr[i], coefs[i]), as_uint(ptr[i] * coefs[i]));
return sum;
}
#define ENCODE_N(cro,action) for (int pos = cro; pos < bs; pos ++) { \
residual_t t = (data[pos] - (calc_residual(data + pos - cro, task.coefs, cro) >> task.data.shift)) >> task.data.wbits; \
action; \
}
#define SWITCH_N(action) \
switch (ro) \
{ \
case 0: ENCODE_N(0, action) break; \
case 1: ENCODE_N(1, action) break; \
case 2: ENCODE_N(2, action) /*if (task.coefs[0] == -1 && task.coefs[1] == 2) ENCODE_N(2, 2 * ptr[1] - ptr[0], action) else*/ break; \
case 3: ENCODE_N(3, action) break; \
case 4: ENCODE_N(4, action) break; \
case 5: ENCODE_N(5, action) break; \
case 6: ENCODE_N(6, action) break; \
case 7: ENCODE_N(7, action) break; \
case 8: ENCODE_N(8, action) break; \
case 9: ENCODE_N(9, action) break; \
case 10: ENCODE_N(10, action) break; \
case 11: ENCODE_N(11, action) break; \
case 12: ENCODE_N(12, action) break; \
default: ENCODE_N(ro, action) \
}
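// Dispatching on the concrete order presumably lets the compiler fully unroll calc_residual
// for the common small orders; the default case handles any order up to 32.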
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clEncodeResidual(
__global ulong *partition_lengths,
__global int *residual,
__global int *samples,
__global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
int psize // == task.blocksize >> max_porder?
)
{
FLACCLSubframeTask task = tasks[get_group_id(0)];
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
__global int *data = &samples[task.data.samplesOffs];
__global ulong *pl = partition_lengths + (1 << (max_porder + 1)) * get_group_id(0);
int r;
for (int p = 0; p < (1 << max_porder); p++)
pl[p] = 0UL;
__global int *rptr = residual + task.data.residualOffs;
if (psize == 16)
{
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos >> 4] += (uint)((r << 1) ^ (r >> 31))));
}
else
{
SWITCH_N((rptr[pos] = r = convert_bps_sat(t), pl[pos / psize] += (uint)((r << 1) ^ (r >> 31))));
}
}
#else
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clEncodeResidual(
__global int *partition_lengths,
__global int *output,
__global int *samples,
__global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
int psize // == task.blocksize >> max_porder?
)
{
__local FLACCLSubframeTask task;
__local int data[GROUP_SIZE * 2 + MAX_ORDER];
const int tid = get_local_id(0);
if (get_local_id(0) < sizeof(task) / sizeof(int))
((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)];
barrier(CLK_LOCAL_MEM_FENCE);
int bs = task.data.blocksize;
int ro = task.data.residualOrder;
if (tid < 32 && tid >= ro)
task.coefs[tid] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
int4 cptr0 = vload4(0, &task.coefs[0]);
int4 cptr1 = vload4(1, &task.coefs[0]);
#if MAX_ORDER > 8
int4 cptr2 = vload4(2, &task.coefs[0]);
#endif
// We tweaked coeffs so that (task.cbits + task.obits + log2i(ro) <= 32)
// when BITS_PER_SAMPLE == 16, so we don't need 64-bit arithmetic.
data[tid] = 0;
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
{
// fetch samples
int off = pos + tid;
int nextData = off < bs ? samples[task.data.samplesOffs + off] >> task.data.wbits : 0;
data[tid + GROUP_SIZE] = nextData;
barrier(CLK_LOCAL_MEM_FENCE);
// compute residual
__local int* dptr = &data[tid + GROUP_SIZE - ro];
#if BITS_PER_SAMPLE > 16
long4 sum
= upsample(mul_hi(cptr0, vload4(0, dptr)), as_uint4(cptr0 * vload4(0, dptr)))
+ upsample(mul_hi(cptr1, vload4(1, dptr)), as_uint4(cptr1 * vload4(1, dptr)))
#if MAX_ORDER > 8
+ upsample(mul_hi(cptr2, vload4(2, dptr)), as_uint4(cptr2 * vload4(2, dptr)))
#if MAX_ORDER > 12
+ upsample(mul_hi(vload4(3, &task.coefs[0]), vload4(3, dptr)), as_uint4(vload4(3, &task.coefs[0]) * vload4(3, dptr)))
#if MAX_ORDER > 16
+ upsample(mul_hi(vload4(4, &task.coefs[0]), vload4(4, dptr)), as_uint4(vload4(4, &task.coefs[0]) * vload4(4, dptr)))
+ upsample(mul_hi(vload4(5, &task.coefs[0]), vload4(5, dptr)), as_uint4(vload4(5, &task.coefs[0]) * vload4(5, dptr)))
+ upsample(mul_hi(vload4(6, &task.coefs[0]), vload4(6, dptr)), as_uint4(vload4(6, &task.coefs[0]) * vload4(6, dptr)))
+ upsample(mul_hi(vload4(7, &task.coefs[0]), vload4(7, dptr)), as_uint4(vload4(7, &task.coefs[0]) * vload4(7, dptr)))
#endif
#endif
#endif
#else
int4 sum
= cptr0 * vload4(0, dptr)
+ cptr1 * vload4(1, dptr)
#if MAX_ORDER > 8
+ cptr2 * vload4(2, dptr)
#if MAX_ORDER > 12
+ vload4(3, &task.coefs[0]) * vload4(3, dptr)
#if MAX_ORDER > 16
+ vload4(4, &task.coefs[0]) * vload4(4, dptr)
+ vload4(5, &task.coefs[0]) * vload4(5, dptr)
+ vload4(6, &task.coefs[0]) * vload4(6, dptr)
+ vload4(7, &task.coefs[0]) * vload4(7, dptr)
#endif
#endif
#endif
#endif
;
if (off >= ro && off < bs)
output[task.data.residualOffs + off] = convert_bps_sat(nextData - ((sum.x + sum.y + sum.z + sum.w) >> task.data.shift));
barrier(CLK_LOCAL_MEM_FENCE);
data[tid] = nextData;
}
}
#endif
#ifndef FLACCL_CPU
// get_group_id(0) == partition index / (GROUP_SIZE / 16)
// get_group_id(1) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clCalcPartition(
2010-10-06 11:16:41 +00:00
__global int *partition_lengths,
__global int *residual,
__global FLACCLSubframeTask *tasks,
int max_porder, // <= 8
int psize // == task.blocksize >> max_porder?
)
{
__local uint pl[(GROUP_SIZE / 16)][MAX_RICE_PARAM + 1];
__local FLACCLSubframeData task;
const int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(1)]))[tid];
if (tid < (GROUP_SIZE / 16))
{
for (int k = 0; k <= MAX_RICE_PARAM; k++)
pl[tid][k] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
int start = get_group_id(0) * psize * (GROUP_SIZE / 16);
int end = min(start + psize * (GROUP_SIZE / 16), task.blocksize);
for (int offs = start + tid; offs < end; offs += GROUP_SIZE)
2010-10-06 11:16:41 +00:00
{
// fetch residual
int s = (offs >= task.residualOrder && offs < end) ? residual[task.residualOffs + offs] : 0;
// convert to unsigned
uint t = (s << 1) ^ (s >> 31);
// calc number of unary bits for each residual sample with each rice parameter
int part = (offs - start) / psize;
// we must ensure that psize * (t >> k) doesn't overflow;
uint lim = 0x7fffffffU / (uint)psize;
for (int k = 0; k <= MAX_RICE_PARAM; k++)
atomic_add(&pl[part][k], min(lim, t >> k));
//pl[part][k] += s >> k;
}
barrier(CLK_LOCAL_MEM_FENCE);
int part = get_group_id(0) * (GROUP_SIZE / 16) + tid;
if (tid < (GROUP_SIZE / 16) && part < (1 << max_porder))
{
for (int k = 0; k <= MAX_RICE_PARAM; k++)
{
// output length
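// partition_lengths is laid out per task as [k][partition] with 2 << max_porder slots
// per k; the upper half of each row is filled later by clSumPartition with the pairwise
// sums used for the coarser partition orders.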
const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (k << (max_porder + 1));
uint plen = pl[tid][k];
partition_lengths[pos + part] = min(0x007fffffU, plen) + (uint)(psize - select(0, task.residualOrder, part == 0)) * (k + 1);
// if (get_group_id(1) == 0)
//printf("pl[%d][%d] == %d\n", k, part, min(0x7fffff, pl[k][tid]) + (psize - task.residualOrder * (part == 0)) * (k + 1));
}
}
}
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clCalcPartition16(
__global unsigned int *partition_lengths,
__global int *residual,
__global FLACCLSubframeTask *tasks,
int max_porder // <= 8
)
{
__local FLACCLSubframeData task;
__local unsigned int res[GROUP_SIZE];
__local unsigned int pl[GROUP_SIZE >> 4][MAX_RICE_PARAM + 1];
const int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
barrier(CLK_LOCAL_MEM_FENCE);
int bs = task.blocksize;
int ro = task.residualOrder;
barrier(CLK_LOCAL_MEM_FENCE);
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
{
int offs = pos + tid;
// fetch residual
int s = (offs >= ro && offs < bs) ? residual[task.residualOffs + offs] : 0;
// convert to unsigned
res[tid] = (s << 1) ^ (s >> 31);
barrier(CLK_LOCAL_MEM_FENCE);
// we must ensure that the sum over 16 samples of (t >> k) doesn't overflow
uint4 lim = 0x07ffffffU;
int x = tid >> 4;
__local uint * chunk = &res[x << 4];
for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
{
// calc number of unary bits for each group of 16 residual samples
// with each rice parameter.
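// 16 consecutive threads share one 16-sample chunk of res[]; each of them
// evaluates a different Rice parameter k for that chunk.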
int k = k0 + (tid & 15);
uint4 rsum
= min(lim, vload4(0,chunk) >> k)
+ min(lim, vload4(1,chunk) >> k)
+ min(lim, vload4(2,chunk) >> k)
+ min(lim, vload4(3,chunk) >> k)
;
uint rs = rsum.x + rsum.y + rsum.z + rsum.w;
// We can safely limit length here to 0x007fffffU, not causing length
// mismatch, because any such length would cause Verbatim frame anyway.
// And this limit protects us from overflows when calculating larger
// partitions, as we can have a maximum of 2^8 partitions, resulting
// in maximum partition length of 0x7fffffffU + change.
if (k <= MAX_RICE_PARAM) pl[x][k] = min(0x007fffffU, rs) + (uint)(16 - select(0, ro, offs < 16)) * (k + 1);
}
barrier(CLK_LOCAL_MEM_FENCE);
for (int k0 = 0; k0 <= MAX_RICE_PARAM; k0 += 16)
{
int k1 = k0 + (tid >> (GROUP_SIZE_LOG - 4)), x1 = tid & ((1 << (GROUP_SIZE_LOG - 4)) - 1);
if (k1 <= MAX_RICE_PARAM && (pos >> 4) + x1 < (1 << max_porder))
partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 4) + x1] = pl[x1][k1];
}
}
}
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clCalcPartition32(
__global unsigned int *partition_lengths,
__global int *residual,
__global FLACCLSubframeTask *tasks,
int max_porder // <= 8
)
{
__local FLACCLSubframeData task;
__local unsigned int res[GROUP_SIZE];
__local unsigned int pl[GROUP_SIZE >> 5][32];
const int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
barrier(CLK_LOCAL_MEM_FENCE);
int bs = task.blocksize;
int ro = task.residualOrder;
barrier(CLK_LOCAL_MEM_FENCE);
for (int pos = 0; pos < bs; pos += GROUP_SIZE)
{
int offs = pos + tid;
// fetch residual
int s = (offs >= ro && offs < bs) ? residual[task.residualOffs + offs] : 0;
// convert to unsigned
res[tid] = (s << 1) ^ (s >> 31);
barrier(CLK_LOCAL_MEM_FENCE);
// we must ensure that the sum over 32 samples of (t >> k) doesn't overflow
uint4 lim = 0x03ffffffU;
int x = tid >> 5;
__local uint * chunk = &res[x << 5];
// calc number of unary bits for each group of 32 residual samples
// with each rice parameter.
int k = tid & 31;
uint4 rsum
= min(lim, vload4(0,chunk) >> k)
+ min(lim, vload4(1,chunk) >> k)
+ min(lim, vload4(2,chunk) >> k)
+ min(lim, vload4(3,chunk) >> k)
+ min(lim, vload4(4,chunk) >> k)
+ min(lim, vload4(5,chunk) >> k)
+ min(lim, vload4(6,chunk) >> k)
+ min(lim, vload4(7,chunk) >> k)
;
uint rs = rsum.x + rsum.y + rsum.z + rsum.w;
// We can safely limit length here to 0x007fffffU, not causing length
// mismatch, because any such length would cause Verbatim frame anyway.
// And this limit protects us from overflows when calculating larger
// partitions, as we can have a maximum of 2^8 partitions, resulting
// in maximum partition length of 0x7fffffffU + change.
if (k <= MAX_RICE_PARAM) pl[x][k] = min(0x007fffffU, rs) + (uint)(32 - select(0, ro, offs < 32)) * (k + 1);
barrier(CLK_LOCAL_MEM_FENCE);
int k1 = (tid >> (GROUP_SIZE_LOG - 5)), x1 = tid & ((1 << (GROUP_SIZE_LOG - 5)) - 1);
if (k1 <= MAX_RICE_PARAM && (pos >> 5) + x1 < (1 << max_porder))
partition_lengths[((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + (k1 << (max_porder + 1)) + (pos >> 5) + x1] = pl[x1][k1];
}
}
#endif
#ifdef FLACCL_CPU
// Sums partition lengths for a certain k == get_group_id(0)
// get_group_id(0) == k
// get_group_id(1) == task index
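// Builds the coarser partition levels in place: level i (1 << i sums) is stored right
// after the 2 << i entries of the level below it, each entry being the sum of two
// adjacent children.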
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clSumPartition(
__global ulong* partition_lengths,
int max_porder
)
{
if (get_group_id(0) != 0) // ignore k != 0
return;
__global ulong * sums = partition_lengths + (1 << (max_porder + 1)) * get_group_id(1);
for (int i = max_porder - 1; i >= 0; i--)
{
for (int j = 0; j < (1 << i); j++)
{
sums[(2 << i) + j] = sums[2 * j] + sums[2 * j + 1];
// if (get_group_id(1) == 0)
//printf("[%d][%d]: %d + %d == %d\n", i, j, sums[2 * j], sums[2 * j + 1], sums[2 * j] + sums[2 * j + 1]);
}
sums += 2 << i;
}
}
#else
// Sums partition lengths for a certain k == get_group_id(0)
// Requires 128 threads
// get_group_id(0) == k
// get_group_id(1) == task index
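// Parallel reduction: the first load sums adjacent pairs, then each loop pass halves
// the number of active threads, appending every coarser level after the previous one
// so the layout matches the CPU path above.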
__kernel __attribute__((reqd_work_group_size(128, 1, 1)))
void clSumPartition(
__global uint* partition_lengths,
int max_porder
)
{
__local uint data[256]; // max_porder <= 8, data length <= 1 << 9.
const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(1) + (get_group_id(0) << (max_porder + 1));
// fetch partition lengths
uint2 pl = get_local_id(0) * 2 < (1 << max_porder) ? vload2(get_local_id(0),&partition_lengths[pos]) : 0;
data[get_local_id(0)] = pl.x + pl.y;
barrier(CLK_LOCAL_MEM_FENCE);
int in_pos = (get_local_id(0) << 1);
int out_pos = (1 << (max_porder - 1)) + get_local_id(0);
for (int bs = 1 << (max_porder - 2); bs > 0; bs >>= 1)
{
if (get_local_id(0) < bs) data[out_pos] = data[in_pos] + data[in_pos + 1];
in_pos += bs << 1;
out_pos += bs;
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_local_id(0) < (1 << max_porder))
partition_lengths[pos + (1 << max_porder) + get_local_id(0)] = data[get_local_id(0)];
if (get_local_size(0) + get_local_id(0) < (1 << max_porder))
partition_lengths[pos + (1 << max_porder) + get_local_size(0) + get_local_id(0)] = data[get_local_size(0) + get_local_id(0)];
}
#endif
#ifdef FLACCL_CPU
// Finds optimal rice parameter for each partition.
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clFindRiceParameter(
__global FLACCLSubframeTask *tasks,
__global int* rice_parameters,
__global ulong* partition_lengths,
int max_porder
)
{
__global FLACCLSubframeTask* task = tasks + get_group_id(0);
const int tid = get_local_id(0);
int lim = (2 << max_porder) - 1;
//int psize = task->data.blocksize >> max_porder;
int bs = task->data.blocksize;
int ro = task->data.residualOrder;
__global ulong* ppl = &partition_lengths[get_group_id(0) << (max_porder + 1)];
__global int* prp = &rice_parameters[get_group_id(0) << (max_porder + 2)];
__global int* pol = prp + (1 << (max_porder + 1));
for (int porder = max_porder; porder >= 0; porder--)
{
int pos = (2 << max_porder) - (2 << porder);
int fin = pos + (1 << porder);
ulong pl = ppl[pos];
int ps = (bs >> porder) - ro;
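// Estimate k as floor(log2(mean residual)): 63 - clz(x) is the index of the highest
// set bit of the 64-bit mean, and pl >> k (pl being the summed zigzag residuals of
// this partition) then approximates the total number of unary quotient bits.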
int k = clamp(63 - (int)clz(pl / max(1, ps)), 0, MAX_RICE_PARAM);
int plk = ps * (k + 1) + (int)(pl >> k);
// output rice parameter
prp[pos] = k;
// output length
pol[pos] = plk;
ps = (bs >> porder);
for (int offs = pos + 1; offs < fin; offs++)
{
pl = ppl[offs];
k = clamp(63 - (int)clz(pl / ps), 0, MAX_RICE_PARAM);
plk = ps * (k + 1) + (int)(pl >> k);
// output rice parameter
prp[offs] = k;
// output length
pol[offs] = plk;
}
}
}
#else
// Finds optimal rice parameter for each partition.
// get_group_id(0) == task index
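// For every partition at every order, pick the k whose precomputed length is smallest.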
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clFindRiceParameter(
__global FLACCLSubframeTask *tasks,
__global int* rice_parameters,
__global uint* partition_lengths,
int max_porder
)
{
for (int offs = get_local_id(0); offs < (2 << max_porder); offs += GROUP_SIZE)
{
const int pos = ((MAX_RICE_PARAM + 1) << (max_porder + 1)) * get_group_id(0) + offs;
uint best_l = partition_lengths[pos];
int best_k = 0;
for (int k = 1; k <= MAX_RICE_PARAM; k++)
{
uint l = partition_lengths[pos + (k << (max_porder + 1))];
best_k = select(best_k, k, l < best_l);
best_l = min(best_l, l);
}
// output rice parameter
rice_parameters[(get_group_id(0) << (max_porder + 2)) + offs] = best_k;
// output length
rice_parameters[(get_group_id(0) << (max_porder + 2)) + (1 << (max_porder + 1)) + offs] = best_l;
}
}
#endif
#ifdef FLACCL_CPU
// get_group_id(0) == task index
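// Sums the per-partition lengths for every partition order, picks the order with the
// smallest total (including RICE_PARAM_BITS per partition), then recomputes the exact
// bit count of the chosen layout from the residual itself.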
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clFindPartitionOrder(
__global int *residual,
__global int* best_rice_parameters,
__global FLACCLSubframeTask *tasks,
__global int* rice_parameters,
int max_porder
)
{
__global FLACCLSubframeTask* task = tasks + get_group_id(0);
int partlen[9];
for (int p = 0; p < 9; p++)
partlen[p] = 0;
// fetch partition lengths
const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
for (int porder = max_porder; porder >= 0; porder--)
{
int start = (2 << max_porder) - (2 << porder);
for (int offs = 0; offs < (1 << porder); offs ++)
partlen[porder] += rice_parameters[pos + start + offs];
}
int best_length = partlen[0] + RICE_PARAM_BITS;
int best_porder = 0;
for (int porder = 1; porder <= max_porder; porder++)
{
int length = (RICE_PARAM_BITS << porder) + partlen[porder];
best_porder = select(best_porder, porder, length < best_length);
best_length = min(best_length, length);
}
best_length = (RICE_PARAM_BITS << best_porder) + task->data.blocksize - task->data.residualOrder;
int best_psize = task->data.blocksize >> best_porder;
int start = task->data.residualOffs + task->data.residualOrder;
int fin = task->data.residualOffs + best_psize;
for (int p = 0; p < (1 << best_porder); p++)
{
int k = rice_parameters[pos - (2 << best_porder) + p];
best_length += k * (fin - start);
for (int i = start; i < fin; i++)
{
int t = residual[i];
best_length += ((t << 1) ^ (t >> 31)) >> k;
}
start = fin;
fin += best_psize;
}
int obits = task->data.obits - task->data.wbits;
task->data.porder = best_porder;
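// headerLen: warmup samples cost residualOrder * obits bits; the 6 covers the 2-bit
// residual coding method plus the 4-bit partition order; LPC subframes add 4 bits of
// coefficient precision, 5 bits of shift and the quantized coefficients themselves.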
task->data.headerLen =
task->data.type == Constant ? obits :
task->data.type == Verbatim ? obits * task->data.blocksize :
task->data.type == Fixed ? task->data.residualOrder * obits + 6 :
task->data.type == LPC ? task->data.residualOrder * obits + 6 + 4 + 5 + task->data.residualOrder * task->data.cbits : 0;
task->data.size =
task->data.headerLen + ((task->data.type == Fixed || task->data.type == LPC) ? best_length : 0);
if (task->data.size >= obits * task->data.blocksize)
{
task->data.headerLen = task->data.size = obits * task->data.blocksize;
task->data.type = Verbatim;
}
for (int offs = 0; offs < (1 << best_porder); offs ++)
best_rice_parameters[(get_group_id(0) << max_porder) + offs] = rice_parameters[pos - (2 << best_porder) + offs];
}
#else
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clFindPartitionOrder(
__global int *residual,
__global int* best_rice_parameters,
__global FLACCLSubframeTask *tasks,
__global int* rice_parameters,
int max_porder
)
{
__local int partlen[16];
__local FLACCLSubframeData task;
const int pos = (get_group_id(0) << (max_porder + 2)) + (2 << max_porder);
if (get_local_id(0) < sizeof(task) / sizeof(int))
((__local int*)&task)[get_local_id(0)] = ((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)];
if (get_local_id(0) < 16)
partlen[get_local_id(0)] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
// fetch partition lengths
int lim = (2 << max_porder) - 1;
for (int offs = get_local_id(0); offs < lim; offs += GROUP_SIZE)
{
int len = rice_parameters[pos + offs];
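// the entries are stored level by level (finest partitions first), so the partition
// order of this entry can be recovered from its position in the flattened tree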
int porder = 31 - clz(lim - offs);
atomic_add(&partlen[porder], len);
}
barrier(CLK_LOCAL_MEM_FENCE);
int best_length = partlen[0] + RICE_PARAM_BITS;
int best_porder = 0;
for (int porder = 1; porder <= max_porder; porder++)
{
int length = (RICE_PARAM_BITS << porder) + partlen[porder];
best_porder = select(best_porder, porder, length < best_length);
best_length = min(best_length, length);
}
if (get_local_id(0) == 0)
{
task.porder = best_porder;
int obits = task.obits - task.wbits;
task.headerLen =
task.type == Fixed ? task.residualOrder * obits + 6 :
task.type == LPC ? task.residualOrder * obits + 6 + 4 + 5 + task.residualOrder * task.cbits :
task.type == Constant ? obits :
/* task.type == Verbatim ? */ obits * task.blocksize;
task.size = task.headerLen + select(0, best_length, task.type == Fixed || task.type == LPC);
if (task.size >= obits * task.blocksize)
{
task.headerLen = task.size = obits * task.blocksize;
task.type = Verbatim;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < sizeof(task) / sizeof(int))
((__global int*)(&tasks[get_group_id(0)]))[get_local_id(0)] = ((__local int*)&task)[get_local_id(0)];
for (int offs = get_local_id(0); offs < (1 << best_porder); offs += GROUP_SIZE)
best_rice_parameters[(get_group_id(0) << max_porder) + offs] = rice_parameters[pos - (2 << best_porder) + offs];
// FIXME: should be bytes?
}
#endif
#ifdef DO_RICE
#ifdef FLACCL_CPU
typedef struct BitWriter_t
{
__global unsigned int *buffer;
unsigned int bit_buf;
int bit_left;
int buf_ptr;
} BitWriter;
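// Writes the low 'bits' bits of v MSB-first; whenever the 32-bit accumulator fills up
// it is byte-swapped to big-endian and stored to the output buffer.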
inline void writebits(BitWriter *bw, int bits, int v)
{
uint val = ((uint)v) & ((1 << bits) - 1);
if (bits < bw->bit_left)
{
bw->bit_buf = (bw->bit_buf << bits) | val;
bw->bit_left -= bits;
}
else
{
// if (bits >= 32) printf("\n\n\n\n-------------------------\n\n\n");
unsigned int bb = (bw->bit_buf << bw->bit_left) | (val >> (bits - bw->bit_left));
bw->buffer[bw->buf_ptr++] = (bb >> 24) | ((bb >> 8) & 0xff00) | ((bb << 8) & 0xff0000) | ((bb << 24) & 0xff000000);
bw->bit_left += (32 - bits);
bw->bit_buf = val;
// bw->bit_buf = val & ((1 << (32 - bw->bit_left)) - 1);
}
}
inline void flush(BitWriter *bw)
{
if (bw->bit_left < 32)
writebits(bw, bw->bit_left, 0);
}
#endif
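// Bit length of the UTF-8-style coded frame number: one byte for values that fit in
// 7 bits, then one extra byte for every further 5 significant bits
// (2 bytes up to 11 bits, 3 bytes up to 16 bits, and so on).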
inline int len_utf8(int n)
{
int bts = 31 - clz(n);
return select(8, 8 * ((bts + 4) / 5), bts > 6);
}
#ifdef FLACCL_CPU
// get_global_id(0) * channels == task index
__kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void clCalcOutputOffsets(
__global int *residual,
__global int *samples,
__global FLACCLSubframeTask *tasks,
int channels,
int frameCount,
int firstFrame
)
{
int offset = 0;
for (int iFrame = 0; iFrame < frameCount; iFrame++)
{
//printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame));
offset += 15 + 1 + 4 + 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame)
// + 8-16 // custom block size
// + 8-16 // custom sample rate
;
int bs = tasks[iFrame * channels].data.blocksize;
//public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 };
offset += select(0, select(8, 16, bs >= 256), bs != 4096 && bs != 4608); // TODO: check all other standard sizes
// assert (offset % 8) == 0
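// 8-bit CRC of the frame header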
offset += 8;
for (int ch = 0; ch < channels; ch++)
{
__global FLACCLSubframeTask* task = tasks + iFrame * channels + ch;
offset += 8 + task->data.wbits;
// Add 64 padding bits between frames so they cannot overlap if the header size estimate is too small
offset += 64;
task->data.encodingOffset = offset + task->data.headerLen;
offset += task->data.size;
}
offset = (offset + 7) & ~7;
offset += 16;
}
}
#else
// get_global_id(0) * channels == task index
__kernel __attribute__((reqd_work_group_size(32, 1, 1)))
void clCalcOutputOffsets(
__global int *residual,
__global int *samples,
__global FLACCLSubframeTask *tasks,
int channels1,
int frameCount,
int firstFrame
)
{
__local FLACCLSubframeData ltasks[MAX_CHANNELS];
__local volatile int mypos[MAX_CHANNELS];
int offset = 0;
for (int iFrame = 0; iFrame < frameCount; iFrame++)
{
if (get_local_id(0) < sizeof(ltasks[0]) / sizeof(int))
for (int ch = 0; ch < MAX_CHANNELS; ch++)
((__local int*)&ltasks[ch])[get_local_id(0)] = ((__global int*)(&tasks[iFrame * MAX_CHANNELS + ch]))[get_local_id(0)];
//printf("len_utf8(%d) == %d\n", firstFrame + iFrame, len_utf8(firstFrame + iFrame));
offset += 15 + 1 + 4 + 4 + 4 + 3 + 1 + len_utf8(firstFrame + iFrame)
// + 8-16 // custom block size
// + 8-16 // custom sample rate
;
int bs = ltasks[0].blocksize;
//public static readonly int[] flac_blocksizes = new int[15] { 0, 192, 576, 1152, 2304, 4608, 0, 0, 256, 512, 1024, 2048, 4096, 8192, 16384 };
offset += select(0, select(8, 16, bs >= 256), bs != 4096 && bs != 4608); // TODO: check all other standard sizes
// assert (offset % 8) == 0
offset += 8;
if (get_local_id(0) < MAX_CHANNELS)
{
int ch = get_local_id(0);
// Add 64 padding bits between frames so they cannot overlap if the header size estimate is too small
int mylen = 8 + ltasks[ch].wbits + 64 + ltasks[ch].size;
mypos[ch] = mylen;
for (int offset = 1; offset < WARP_SIZE && offset < MAX_CHANNELS; offset <<= 1)
if (ch >= offset) mypos[ch] += mypos[ch - offset];
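// note: the loop variable above is out of scope here, so this adds the outer
// frame bit offset to every channel's inclusive prefix sum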
mypos[ch] += offset;
tasks[iFrame * MAX_CHANNELS + ch].data.encodingOffset = mypos[ch] - ltasks[ch].size + ltasks[ch].headerLen;
}
offset = mypos[MAX_CHANNELS - 1];
offset = (offset + 7) & ~7;
offset += 16;
}
}
#endif
// get_group_id(0) == task index
__kernel __attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1)))
void clRiceEncoding(
__global int *residual,
__global int *samples,
__global int* best_rice_parameters,
__global FLACCLSubframeTask *tasks,
__global unsigned int* output,
int max_porder
)
{
#ifdef FLACCL_CPU
__global FLACCLSubframeTask* task = tasks + get_group_id(0);
if (task->data.type == Fixed || task->data.type == LPC)
{
int ro = task->data.residualOrder;
int bs = task->data.blocksize;
int porder = task->data.porder;
int psize = bs >> porder;
BitWriter bw;
bw.buffer = output;
bw.buf_ptr = task->data.encodingOffset / 32;
bw.bit_left = 32 - (task->data.encodingOffset & 31);
bw.bit_buf = 0;
//if (get_group_id(0) == 0) printf("%d\n", offs);
int res_cnt = psize - ro;
// residual
int j = ro;
__global int * kptr = &best_rice_parameters[get_group_id(0) << max_porder];
for (int p = 0; p < (1 << porder); p++)
{
int k = kptr[p];
writebits(&bw, RICE_PARAM_BITS, k);
//if (get_group_id(0) == 0) printf("[%x] ", k);
//if (get_group_id(0) == 0) printf("(%x) ", bw.bit_buf);
if (p == 1) res_cnt = psize;
int cnt = min(res_cnt, bs - j);
unsigned int kexp = 1U << k;
__global int *rptr = &residual[task->data.residualOffs + j];
for (int i = 0; i < cnt; i++)
{
int iv = rptr[i];
unsigned int v = (iv << 1) ^ (iv >> 31);
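// Rice code for v with parameter k: (v >> k) zero bits, a terminating 1,
// then the k low bits of v; e.g. k == 2, v == 13 -> "000" "1" "01" (6 bits).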
// write quotient in unary
int bits = k + (v >> k) + 1;
while (bits > 31)
{
int b = min(bits - 31, 31);
if (b < bw.bit_left)
{
bw.bit_buf <<= b;
bw.bit_left -= b;
}
else
{
unsigned int bb = bw.bit_buf << bw.bit_left;
bw.bit_buf = 0;
bw.bit_left += (32 - b);
bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx);
}
bits -= b;
}
unsigned int val = (v & (kexp - 1)) | kexp;
if (bits < bw.bit_left)
{
bw.bit_buf = (bw.bit_buf << bits) | val;
bw.bit_left -= bits;
}
else
{
unsigned int bb = (bw.bit_buf << bw.bit_left) | (val >> (bits - bw.bit_left));
bw.bit_buf = val;
bw.bit_left += (32 - bits);
bw.buffer[bw.buf_ptr++] = as_int(as_char4(bb).wzyx);
}
////if (get_group_id(0) == 0) printf("%x ", v);
//writebits(&bw, (v >> k) + 1, 1);
////if (get_group_id(0) == 0) printf("(%x) ", bw.bit_buf);
//writebits(&bw, k, v);
////if (get_group_id(0) == 0) printf("(%x) ", bw.bit_buf);
}
j += cnt;
}
//if (bw.buf_ptr * 32 + 32 - bw.bit_left != task->data.encodingOffset - task->data.headerLen + task->data.size)
// printf("bit length mismatch: encodingOffset == %d, headerLen == %d, size == %d, so should be %d, but is %d\n",
// task->data.encodingOffset, task->data.headerLen, task->data.size,
// task->data.encodingOffset - task->data.headerLen + task->data.size,
// bw.buf_ptr * 32 + 32 - bw.bit_left
// );
//if (get_group_id(0) == 0) printf("\n");
flush(&bw);
}
#else
__local uint data[GROUP_SIZE];
__local volatile int mypos[GROUP_SIZE+1];
#if 0
__local int brp[256];
#endif
__local volatile int warppos[WARP_SIZE];
__local FLACCLSubframeData task;
int tid = get_local_id(0);
if (tid < sizeof(task) / sizeof(int))
((__local int*)&task)[tid] = ((__global int*)(&tasks[get_group_id(0)]))[tid];
barrier(CLK_LOCAL_MEM_FENCE);
if (task.type != Fixed && task.type != LPC)
return;
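// GPU path: every thread computes the bit length of its own residual code, an inclusive
// scan turns those lengths into absolute bit positions, and the codes are OR-ed into the
// shared 32-bit word buffer 'data' before being flushed (byte-swapped) to global output;
// 'remainder' carries the partially filled last word into the next iteration.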
if (tid == 0)
mypos[GROUP_SIZE] = 0;
if (tid < WARP_SIZE)
warppos[tid] = 0;
#if 0
for (int offs = tid; offs < (1 << task.porder); offs ++)
brp[offs] = best_rice_parameters[(get_group_id(0) << max_porder) + offs];
#endif
data[tid] = 0;
barrier(CLK_LOCAL_MEM_FENCE);
const int bs = task.blocksize;
int start = task.encodingOffset;
int plen = bs >> task.porder;
//int plenoffs = 12 - task.porder;
uint remainder = 0U;
int pos;
for (pos = 0; pos + GROUP_SIZE - 1 < bs; pos += GROUP_SIZE)
{
int offs = pos + tid;
int iv = residual[task.residualOffs + offs];
int part = offs / plen;
//int part = offs >> plenoffs;
#if 0
int k = brp[part];
#else
int k = best_rice_parameters[(get_group_id(0) << max_porder) + part];
#endif
int pstart = offs == part * plen;
//int pstart = offs == part << plenoffs;
uint v = (iv << 1) ^ (iv >> 31);
int mylen = select(0, (int)(v >> k) + 1 + k, offs >= task.residualOrder && offs < bs) + select(0, RICE_PARAM_BITS, pstart);
mypos[tid] = mylen;
// Inclusive scan(+)
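// (each lane adds the value 'offset' lanes back; mypos[GROUP_SIZE] stays 0 so lanes
// with no partner read 0, then the per-warp totals are scanned in warppos)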
int lane = (tid & (WARP_SIZE - 1));
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
int mp = mypos[tid];
if ((tid & (WARP_SIZE - 1)) == WARP_SIZE - 1)
warppos[tid/WARP_SIZE] = mp;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < GROUP_SIZE/WARP_SIZE)
{
for (int offset = 1; offset < GROUP_SIZE/WARP_SIZE; offset <<= 1)
warppos[tid] += warppos[select(GROUP_SIZE/WARP_SIZE, tid - offset, tid >= offset)];
}
barrier(CLK_LOCAL_MEM_FENCE);
mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
int start32 = start >> 5;
start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
//if (start / 32 - start32 >= GROUP_SIZE - 3)
// tasks[get_group_id(0)].data.size = 1;
//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
// printf("Oops: %d\n", mypos[tid]);
data[tid] = select(0U, remainder, tid == 0);
barrier(CLK_LOCAL_MEM_FENCE);
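// OR the Rice parameter field and the code word into the shared buffer at their absolute
// bit positions (word index relative to start32 plus bit offset within the word);
// the (v >> k) leading zeros of the unary part are simply left as cleared bits.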
if (pstart)
{
int kpos = mp - mylen;
int kpos0 = (kpos >> 5) - start32;
int kpos1 = kpos & 31;
uint kval = (uint)k << (32 - RICE_PARAM_BITS);
uint kval0 = kval >> kpos1;
uint kval1 = kval << (32 - kpos1);
if (kval0) atomic_or(&data[kpos0], kval0);
if (kpos1 && kval1) atomic_or(&data[kpos0 + 1], kval1);
}
if (offs >= task.residualOrder && offs < bs)
{
int qpos = mp - k - 1;
int qpos0 = (qpos >> 5) - start32;
int qpos1 = qpos & 31;
uint qval = (1U << 31) | (v << (31 - k));
uint qval0 = qval >> qpos1;
uint qval1 = qval << (32 - qpos1);
if (qval0) atomic_or(&data[qpos0], qval0);
if (qpos1 && qval1) atomic_or(&data[qpos0 + 1], qval1);
}
barrier(CLK_LOCAL_MEM_FENCE);
if ((start32 + tid) * 32 <= start)
output[start32 + tid] = as_int(as_char4(data[tid]).wzyx);
remainder = data[start / 32 - start32];
}
if (pos < bs)
{
int offs = pos + tid;
int iv = offs < bs ? residual[task.residualOffs + offs] : 0;
int part = offs / plen; // >> plenoffs;
//int k = brp[min(255, part)];
int k = offs < bs ? best_rice_parameters[(get_group_id(0) << max_porder) + part] : 0;
int pstart = offs == part * plen;
uint v = (iv << 1) ^ (iv >> 31);
int mylen = select(0, (int)(v >> k) + 1 + k, offs >= task.residualOrder && offs < bs) + select(0, RICE_PARAM_BITS, pstart);
mypos[tid] = mylen;
// Inclusive scan(+)
int lane = (tid & (WARP_SIZE - 1));
for (int offset = 1; offset < WARP_SIZE; offset <<= 1)
mypos[tid] += mypos[select(GROUP_SIZE, tid - offset, lane >= offset)];
int mp = mypos[tid];
if ((tid & (WARP_SIZE - 1)) == WARP_SIZE - 1)
warppos[tid/WARP_SIZE] = mp;
barrier(CLK_LOCAL_MEM_FENCE);
if (tid < GROUP_SIZE/WARP_SIZE)
{
for (int offset = 1; offset < GROUP_SIZE/WARP_SIZE; offset <<= 1)
warppos[tid] += warppos[select(GROUP_SIZE/WARP_SIZE, tid - offset, tid >= offset)];
}
barrier(CLK_LOCAL_MEM_FENCE);
mp += start + select(0, warppos[tid / WARP_SIZE - 1], tid / WARP_SIZE > 0);
int start32 = start >> 5;
start += mypos[GROUP_SIZE - 1] + warppos[GROUP_SIZE / WARP_SIZE - 2];
//if (tid == GROUP_SIZE - 1 && mypos[tid] > (GROUP_SIZE/2) * 32)
// printf("Oops: %d\n", mypos[tid]);
data[tid] = select(0U, remainder, tid == 0);
barrier(CLK_LOCAL_MEM_FENCE);
if (pstart)
{
int kpos = mp - mylen;
int kpos0 = (kpos >> 5) - start32;
int kpos1 = kpos & 31;
uint kval = (uint)k << (32 - RICE_PARAM_BITS);
uint kval0 = kval >> kpos1;
uint kval1 = kval << (32 - kpos1);
if (kval0) atomic_or(&data[kpos0], kval0);
if (kpos1 && kval1) atomic_or(&data[kpos0 + 1], kval1);
}
if (offs >= task.residualOrder && offs < bs)
{
int qpos = mp - k - 1;
int qpos0 = (qpos >> 5) - start32;
int qpos1 = qpos & 31;
uint qval = (1U << 31) | (v << (31 - k));
uint qval0 = qval >> qpos1;
uint qval1 = qval << (32 - qpos1);
if (qval0) atomic_or(&data[qpos0], qval0);
if (qpos1 && qval1) atomic_or(&data[qpos0 + 1], qval1);
}
barrier(CLK_LOCAL_MEM_FENCE);
if ((start32 + tid) * 32 <= start)
output[start32 + tid] = as_int(as_char4(data[tid]).wzyx);
remainder = data[start / 32 - start32];
}
// if (tid == 0 && start != task.encodingOffset - task.headerLen + task.size)
//printf("size mismatch: %d != %d\n", start, task.encodingOffset - task.headerLen + task.size);
#endif
}
#endif /* DO_RICE */
#endif /* DO_PARTITIONS */
#endif