Files
cuetools.net/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas
2008-10-14 17:13:34 +00:00

195 lines
4.3 KiB
Plaintext

%include "Tools64.inc"
segment_code
; AdaptAddAligned %1
;   %1 is a packed 16-bit SSE2 op (paddw or psubw). Applies
;   [rcx] %1= [rdx] over r9d iterations of 32 bytes (two 16-byte XMM
;   chunks per pass). BOTH pointers must be 16-byte aligned: movdqa is
;   used on source and destination. The macro ends in ret, so it
;   terminates the enclosing proc's code path.
;   NOTE(review): only XMM registers are touched here; emms clears MMX
;   state and looks vestigial — confirm before removing.
%imacro AdaptAddAligned 1
AdaptAddLoop0%1:
movdqa xmm0, [rcx]                      ; 8 words of destination
movdqa xmm1, [rdx]                      ; 8 words of source
%1 xmm0, xmm1                           ; dst[i] %1= src[i], 8 words at once
movdqa [rcx], xmm0
movdqa xmm2, [rcx + 16]                 ; second 16-byte chunk
movdqa xmm3, [rdx + 16]
%1 xmm2, xmm3
movdqa [rcx + 16], xmm2
add rcx, byte 32                        ; 'byte' forces the short imm8 encoding
add rdx, byte 32
dec r9d                                 ; r9d = remaining 32-byte blocks
jnz AdaptAddLoop0%1
emms                                    ; NOTE(review): likely unnecessary (no MMX regs used)
ret
%endmacro
; AdaptAddUnaligned %1
;   %1 is a packed 16-bit SSE2 op (paddw or psubw). Applies
;   [rcx] %1= [rdx] over r9d blocks of 32 bytes. The destination rcx
;   must be 16-byte aligned (movdqa); the source rdx may be unaligned
;   (lddqu — an SSE3 instruction). The block count is split:
;     r8d = r9d & 1   (one trailing 32-byte block, if odd)
;     r9d >>= 1       (main-loop iterations of 64 bytes each)
;   The macro ends in ret, terminating the enclosing proc's code path.
;   NOTE(review): no MMX registers are used; emms looks vestigial.
%imacro AdaptAddUnaligned 1
mov r8d, r9d
and r8d, 1                              ; r8d = 1 if an odd 32-byte block remains
shr r9d, 1                              ; r9d = number of 64-byte double blocks
cmp r9d, byte 0
je short AdaptAddLoopULast%1            ; no full double blocks: go straight to tail
AdaptAddLoopU%1:                        ; 64 bytes (32 words) per iteration
movdqa xmm0, [rcx]
lddqu xmm1, [rdx]                       ; unaligned source load (SSE3)
%1 xmm0, xmm1
movdqa [rcx], xmm0
movdqa xmm2, [rcx + 16]
lddqu xmm3, [rdx + 16]
%1 xmm2, xmm3
movdqa [rcx + 16], xmm2
movdqa xmm4, [rcx+32]
lddqu xmm5, [rdx+32]
%1 xmm4, xmm5
movdqa [rcx+32], xmm4
movdqa xmm6, [rcx + 48]
lddqu xmm7, [rdx + 48]
%1 xmm6, xmm7
movdqa [rcx + 48], xmm6
add rcx, byte 64                        ; 'byte' forces the short imm8 encoding
add rdx, byte 64
dec r9d
jnz AdaptAddLoopU%1
AdaptAddLoopULast%1:                    ; optional 32-byte tail block
cmp r8d, byte 0
je short AdaptAddLoopUEnd%1
movdqa xmm0, [rcx]
lddqu xmm1, [rdx]
%1 xmm0, xmm1
movdqa [rcx], xmm0
movdqa xmm2, [rcx + 16]
lddqu xmm3, [rdx + 16]
%1 xmm2, xmm3
movdqa [rcx + 16], xmm2
AdaptAddLoopUEnd%1:
emms                                    ; NOTE(review): likely unnecessary (no MMX regs used)
ret
%endmacro
;
; void Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
; Microsoft x64 calling convention:
; r9d nOrder      ; shr r9d,4 drops the low bits, so nOrder is assumed a
;                 ;   multiple of 16 — TODO confirm against callers
; r8d nDirection  ; >0: pM += pAdapt; <0: pM -= pAdapt; ==0: no-op
; rdx pAdapt      ; may be unaligned (read with lddqu)
; rcx pM          ; must be 16-byte aligned (written with movdqa)
; [rsp+ 0] Return Address
align 16
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
proc Adapt
shr r9d, 4                              ; r9d = nOrder / 16 = count of 32-byte blocks
cmp r8d, byte 0 ; nDirection            ; flags from this cmp are reused at AdaptSub1
jle AdaptSub1
AdaptAddUnaligned paddw                 ; nDirection > 0: add; macro ends in ret
align 16                                ; padding only — unreachable between expansions
nop
nop
nop
nop
nop
nop
nop
AdaptSub1: je AdaptDone1                ; ZF still set from cmp above (jle preserves
                                        ;   flags): nDirection == 0 means do nothing
AdaptAddUnaligned psubw                 ; nDirection < 0: subtract; macro ends in ret
AdaptDone1:
endproc
;
; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
; Microsoft x64 calling convention:
;   rcx = pA  (may be unaligned; read with lddqu)
;   rdx = pB  (must be 16-byte aligned: used as a pmaddwd memory operand,
;              which requires alignment for legacy SSE encodings)
;   r8d = nOrder  (assumed a multiple of 16 — shr drops the low bits;
;                  TODO confirm against callers)
; Returns the 32-bit dot product sum(pA[i]*pB[i]) in eax.
align 16
nop
nop
nop
nop
nop
proc CalculateDotProduct
shr r8d, 4                              ; r8d = nOrder / 16 = count of 32-byte blocks
pxor xmm7, xmm7                         ; xmm7 = four dword partial sums, zeroed
mov r9d, r8d
and r9d, 1                              ; r9d = 1 if an odd 32-byte block remains
shr r8d, 1                              ; r8d = number of 64-byte double blocks
cmp r8d, byte 0
je DotNonEven
LoopDotEven:                            ; 64 bytes (32 shorts) per iteration
lddqu xmm0, [rcx] ;pA
pmaddwd xmm0, [rdx] ;pB                 ; 8 word products -> 4 dword pair-sums
paddd xmm7, xmm0
lddqu xmm1, [rcx + 16]
pmaddwd xmm1, [rdx + 16]
paddd xmm7, xmm1
lddqu xmm2, [rcx + 32]
pmaddwd xmm2, [rdx + 32]
paddd xmm7, xmm2
lddqu xmm3, [rcx + 48]
pmaddwd xmm3, [rdx + 48]
add rcx, byte 64                        ; advance pointers before the last paddd
add rdx, byte 64
paddd xmm7, xmm3
dec r8d
jnz short LoopDotEven
DotNonEven:                             ; optional 32-byte tail block
cmp r9d, byte 0
je DotFinal
lddqu xmm0, [rcx] ;pA
pmaddwd xmm0, [rdx] ;pB
paddd xmm7, xmm0
lddqu xmm1, [rcx + 16]
pmaddwd xmm1, [rdx + 16]
paddd xmm7, xmm1
DotFinal:                               ; horizontal sum of xmm7's four dwords
movdqa xmm6, xmm7
psrldq xmm6, 8                          ; shift high qword down
paddd xmm7, xmm6                        ; 4 lanes -> 2
movshdup xmm6, xmm7                     ; SSE3: duplicate lane 1 into lane 0
paddd xmm7, xmm6                        ; 2 lanes -> 1
movd eax, xmm7                          ; return total in eax
emms                                    ; NOTE(review): likely unnecessary (no MMX regs used)
endproc
;
; BOOL GetMMXAvailable ( void );
;
; Despite the legacy name, this actually reports SSE3 support
; (CPUID leaf 1, ECX bit 0), since the routines above rely on the
; SSE3 instructions lddqu and movshdup.
; Returns eax = 1 if SSE3 is available, 0 otherwise.
; Preserves rbx (callee-saved; cpuid clobbers ebx) as well as rcx/rdx.
proc GetMMXAvailable
push rbx                                ; cpuid clobbers ebx
push rcx
push rdx
mov eax, 1                              ; CPUID leaf 1: feature flags
cpuid
xor eax, eax                            ; default result = 0
test ecx, 1                             ; CPUID.1:ECX[0] = SSE3 (PNI)
setnz al                                ; eax = 1 iff the bit is set
pop rdx
pop rcx
pop rbx
endproc