/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2014  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN

#include "private/stream_encoder.h"
#include "private/bitmath.h"

#ifdef FLAC__SSSE3_SUPPORTED

#include <stdlib.h>    /* for abs() */
#include <tmmintrin.h> /* SSSE3 */
#include "FLAC/assert.h"

/*
 * SSSE3 implementation of the partition-sum precomputation used by the
 * residual partitioner (SSSE3 is needed for _mm_abs_epi32/_mm_hadd_epi32).
 *
 * For each of the (1 << max_partition_order) partitions it sums the absolute
 * values of the residual samples belonging to that partition, then derives
 * the sums for every coarser order down to min_partition_order by pairwise
 * addition of the finer sums.  Results for all orders are written
 * consecutively into abs_residual_partition_sums[].
 *
 * residual[]                   input residual signal (residual_samples entries)
 * abs_residual_partition_sums  output; must hold the sums for all orders from
 *                              max_partition_order down to min_partition_order
 * predictor_order              the first partition is shorter by this many
 *                              samples (warm-up samples carry no residual),
 *                              hence `end` starts at -predictor_order below
 * bps                          sample precision; used only to pick the
 *                              accumulator width (see threshold test)
 */
FLAC__SSE_TARGET("ssse3")
void FLAC__precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
		unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps)
{
	const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
	unsigned partitions = 1u << max_partition_order;

	FLAC__ASSERT(default_partition_samples > predictor_order);

	/* first do max_partition_order */
	{
		/* with default_partition_samples addends of at most
		 * bps + FLAC__MAX_EXTRA_RESIDUAL_BPS bits each, a partition sum fits
		 * in 32 bits iff that width is below this threshold */
		const unsigned threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
		unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order);

		if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
			/* 32-bit accumulators cannot overflow: sum four lanes at once */
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m128i mm_sum = _mm_setzero_si128();
				unsigned e1, e3;
				end += default_partition_samples;
				/* e1/e3: bounds of the 4-aligned middle section of the partition */
				e1 = (residual_sample + 3) & ~3; e3 = end & ~3;
				if(e1 > end)
					e1 = end; /* try flac -l 1 -b 16 and you'll be here */

				/* scalar head until the index is 4-aligned */
				/* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */
				for( ; residual_sample < e1; residual_sample++) {
					__m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				/* vector body: 4 samples per iteration */
				for( ; residual_sample < e3; residual_sample += 4) {
					__m128i mm_res = _mm_abs_epi32(_mm_loadu_si128((const __m128i *)(residual + residual_sample)));
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				/* scalar tail */
				for( ; residual_sample < end; residual_sample++) {
					__m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				/* horizontal reduction of the four 32-bit lanes */
				mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
				mm_sum = _mm_hadd_epi32(mm_sum, mm_sum);
				abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
/* workaround for a bug in MSVC2015U2 - see https://connect.microsoft.com/VisualStudio/feedback/details/2659191/incorrect-code-generation-for-x86-64 */
#if (defined _MSC_VER) && (_MSC_FULL_VER == 190023918) && (defined FLAC__CPU_X86_64)
				abs_residual_partition_sums[partition] &= 0xFFFFFFFF;
#endif
			}
		}
		else { /* have to pessimistically use 64 bits for accumulator */
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m128i mm_sum = _mm_setzero_si128();
				unsigned e1, e3;
				end += default_partition_samples;
				/* e1/e3: bounds of the 2-aligned middle section of the partition */
				e1 = (residual_sample + 1) & ~1; e3 = end & ~1;
				FLAC__ASSERT(e1 <= end);

				/* scalar head until the index is 2-aligned */
				for( ; residual_sample < e1; residual_sample++) {
					__m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); /* 0 0 0 |r0|  ==  00 |r0_64| */
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				/* vector body: widen 2 samples to 64-bit lanes per iteration */
				for( ; residual_sample < e3; residual_sample += 2) {
					__m128i mm_res = _mm_abs_epi32(_mm_loadl_epi64((const __m128i *)(residual + residual_sample))); /* 0 0 |r1| |r0| */
					mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3, 1, 2, 0)); /* 0 |r1| 0 |r0|  ==  |r1_64| |r0_64| */
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				/* scalar tail */
				for( ; residual_sample < end; residual_sample++) {
					__m128i mm_res = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				/* horizontal reduction of the two 64-bit lanes */
				mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
				_mm_storel_epi64((__m128i *)(abs_residual_partition_sums + partition), mm_sum);
			}
		}
	}

	/* now merge partitions for lower orders */
	{
		unsigned from_partition = 0, to_partition = partitions;
		int partition_order;
		for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
			unsigned i;
			partitions >>= 1;
			for(i = 0; i < partitions; i++) {
				/* each coarser sum is the sum of two adjacent finer sums */
				abs_residual_partition_sums[to_partition++] =
					abs_residual_partition_sums[from_partition] +
					abs_residual_partition_sums[from_partition + 1];
				from_partition += 2;
			}
		}
	}
}

#endif /* FLAC__SSSE3_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */