%include "Tools64.inc"

;-----------------------------------------------------------------------
; SSE2/SSE3 kernels, NASM syntax, x86-64.
;
; Arguments arrive in rcx, rdx, r8, r9 (Microsoft x64 argument order).
; That convention treats xmm6-xmm15 as callee-saved, so every routine
; below restricts itself to xmm0-xmm5 and needs no save/restore code.
;
; `proc`, `endproc` and `segment_code` are macros from Tools64.inc.
; NOTE(review): the procedures that end in a bare `endproc` rely on it
; to emit the final `ret`; confirm against Tools64.inc.
;-----------------------------------------------------------------------

segment_code

;-----------------------------------------------------------------------
; AdaptAddAligned <op>
;   Applies the packed-word instruction <op> (e.g. paddw / psubw) to
;   r9d consecutive 32-byte blocks:  [rcx] = [rcx] <op> [rdx]
;
; In:    rcx = destination, 16-byte aligned (movdqa)
;        rdx = source,      16-byte aligned (movdqa)
;        r9d = number of 32-byte blocks (0 is allowed and does nothing)
; Clobb: rcx, rdx, r9d, xmm0-xmm3, flags
; Ends with `ret`, i.e. it returns from the enclosing procedure.
;-----------------------------------------------------------------------
%imacro AdaptAddAligned 1
        test    r9d, r9d                ; a zero count would otherwise wrap
        jz      short %%done            ; to 2^32 iterations in dec/jnz
        align   16
%%loop:
        movdqa  xmm0, [rcx]
        movdqa  xmm1, [rdx]
        %1      xmm0, xmm1
        movdqa  [rcx], xmm0

        movdqa  xmm2, [rcx + 16]
        movdqa  xmm3, [rdx + 16]
        %1      xmm2, xmm3
        movdqa  [rcx + 16], xmm2

        add     rcx, byte 32
        add     rdx, byte 32
        dec     r9d
        jnz     %%loop
%%done:
        emms                            ; NOTE(review): no MMX register is used here;
                                        ; kept only to match the original behaviour
        ret
%endmacro

;-----------------------------------------------------------------------
; AdaptAddUnaligned <op>
;   Same operation as AdaptAddAligned, but the source is read with
;   lddqu, so only the destination has to be aligned.
;   The main loop handles 64 bytes per iteration; one leftover 32-byte
;   block is handled afterwards when the block count is odd.
;
; In:    rcx = destination, 16-byte aligned (movdqa)
;        rdx = source, any alignment (lddqu)
;        r9d = number of 32-byte blocks
; Clobb: rcx, rdx, r8d, r9d, xmm0-xmm5, flags
; Ends with `ret`, i.e. it returns from the enclosing procedure.
;-----------------------------------------------------------------------
%imacro AdaptAddUnaligned 1
        mov     r8d, r9d
        and     r8d, 1                  ; r8d = 1 if an odd 32-byte block remains
        shr     r9d, 1                  ; r9d = number of 64-byte iterations
        jz      short %%tail            ; ZF comes from the shr result

        align   16
%%loop:
        movdqa  xmm0, [rcx]
        lddqu   xmm1, [rdx]
        %1      xmm0, xmm1
        movdqa  [rcx], xmm0

        movdqa  xmm2, [rcx + 16]
        lddqu   xmm3, [rdx + 16]
        %1      xmm2, xmm3
        movdqa  [rcx + 16], xmm2

        movdqa  xmm4, [rcx + 32]
        lddqu   xmm5, [rdx + 32]
        %1      xmm4, xmm5
        movdqa  [rcx + 32], xmm4

        ; xmm0/xmm1 are free again (already stored), so reuse them
        ; instead of touching the callee-saved xmm6/xmm7.
        movdqa  xmm0, [rcx + 48]
        lddqu   xmm1, [rdx + 48]
        %1      xmm0, xmm1
        movdqa  [rcx + 48], xmm0

        add     rcx, byte 64
        add     rdx, byte 64
        dec     r9d
        jnz     %%loop

%%tail:
        test    r8d, r8d
        jz      short %%done

        movdqa  xmm0, [rcx]
        lddqu   xmm1, [rdx]
        %1      xmm0, xmm1
        movdqa  [rcx], xmm0

        movdqa  xmm2, [rcx + 16]
        lddqu   xmm3, [rdx + 16]
        %1      xmm2, xmm3
        movdqa  [rcx + 16], xmm2

%%done:
        emms                            ; NOTE(review): no MMX register is used here;
                                        ; kept only to match the original behaviour
        ret
%endmacro

;-----------------------------------------------------------------------
; void Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
;   nDirection > 0 :  pM[i] += pAdapt[i]   (paddw, wrapping 16-bit add)
;   nDirection < 0 :  pM[i] -= pAdapt[i]   (psubw, wrapping 16-bit sub)
;   nDirection = 0 :  nothing is written
;
; In:    rcx = pM      (must be 16-byte aligned, accessed with movdqa)
;        rdx = pAdapt  (any alignment, read with lddqu)
;        r8d = nDirection
;        r9d = nOrder  (only whole groups of 16 shorts are processed;
;                       nOrder is shifted right by 4)
;        [rsp + 0] = return address
; Clobb: rcx, rdx, r8d, r9d, xmm0-xmm5, flags
;-----------------------------------------------------------------------
        align   16
proc Adapt
        shr     r9d, 4                  ; r9d = number of 32-byte (16-short) blocks
        cmp     r8d, byte 0             ; nDirection
        jle     AdaptSub1
        AdaptAddUnaligned paddw         ; returns from Adapt

AdaptSub1:
        je      AdaptDone1              ; flags still from the cmp above (jle leaves them intact)
        AdaptAddUnaligned psubw         ; returns from Adapt

AdaptDone1:
endproc

;-----------------------------------------------------------------------
; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
;   Returns the sum of pA[i] * pB[i] over the processed elements,
;   accumulated in 32-bit lanes (paddd, wrapping).
;
; In:    rcx = pA      (any alignment, read with lddqu)
;        rdx = pB      (must be 16-byte aligned: it is used as the
;                       memory operand of pmaddwd)
;        r8d = nOrder  (only whole groups of 16 shorts are processed;
;                       nOrder is shifted right by 4)
; Out:   eax = dot product
; Clobb: rcx, rdx, r8d, r9d, xmm0-xmm5, flags
;-----------------------------------------------------------------------
        align   16
proc CalculateDotProduct
        shr     r8d, 4                  ; r8d = number of 32-byte blocks
        pxor    xmm4, xmm4              ; xmm4 = four 32-bit partial sums
        mov     r9d, r8d
        and     r9d, 1                  ; r9d = 1 if an odd 32-byte block remains
        shr     r8d, 1                  ; r8d = number of 64-byte iterations
        jz      DotNonEven              ; ZF comes from the shr result

        align   16
LoopDotEven:
        lddqu   xmm0, [rcx]             ; pA
        pmaddwd xmm0, [rdx]             ; pB
        paddd   xmm4, xmm0

        lddqu   xmm1, [rcx + 16]
        pmaddwd xmm1, [rdx + 16]
        paddd   xmm4, xmm1

        lddqu   xmm2, [rcx + 32]
        pmaddwd xmm2, [rdx + 32]
        paddd   xmm4, xmm2

        lddqu   xmm3, [rcx + 48]
        pmaddwd xmm3, [rdx + 48]
        paddd   xmm4, xmm3

        add     rcx, byte 64
        add     rdx, byte 64
        dec     r8d
        jnz     short LoopDotEven

DotNonEven:
        test    r9d, r9d
        jz      DotFinal

        lddqu   xmm0, [rcx]             ; pA
        pmaddwd xmm0, [rdx]             ; pB
        paddd   xmm4, xmm0

        lddqu   xmm1, [rcx + 16]
        pmaddwd xmm1, [rdx + 16]
        paddd   xmm4, xmm1

DotFinal:
        ; Horizontal add of the four dword lanes s3:s2:s1:s0 in xmm4.
        movdqa  xmm5, xmm4
        psrldq  xmm5, 8                 ; xmm5 = 0:0:s3:s2
        paddd   xmm4, xmm5              ; lane0 = s0+s2, lane1 = s1+s3
        movshdup xmm5, xmm4             ; copy lane1 into lane0 of xmm5 (SSE3)
        paddd   xmm4, xmm5              ; lane0 = s0+s1+s2+s3
        movd    eax, xmm4
        emms                            ; NOTE(review): no MMX register is used here;
                                        ; kept only to match the original behaviour
endproc

;-----------------------------------------------------------------------
; BOOL GetMMXAvailable ( void );
;
;   Despite the name, this reports CPUID leaf 1, ECX bit 0, i.e. SSE3
;   support (needed for lddqu / movshdup above).
;
; Out:   eax = 1 if the bit is set, 0 otherwise
; Clobb: flags (rbx, rcx and rdx are saved and restored around CPUID)
;-----------------------------------------------------------------------
proc GetMMXAvailable
        push    rbx                     ; CPUID overwrites ebx (callee-saved)
        push    rcx
        push    rdx

        mov     eax, 1
        cpuid
        mov     eax, ecx
        and     eax, byte 1             ; keep only ECX bit 0 (SSE3)

        pop     rdx
        pop     rcx
        pop     rbx
endproc