; cuetools.net/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas
%include "Tools64.inc"
segment_code
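; AdaptAddAligned: applies the packed-word op %1 (paddw or psubw) to pM (rcx)
; from pAdapt (rdx), 32 bytes (16 shorts) per iteration with aligned loads on
; both sides; r9d is the block count. Not instantiated in this file; Adapt
; below uses the unaligned variant.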
%imacro AdaptAddAligned 1
AdaptAddLoop0%1:
movdqa xmm0, [rcx]
movdqa xmm1, [rdx]
%1 xmm0, xmm1
movdqa [rcx], xmm0
movdqa xmm2, [rcx + 16]
movdqa xmm3, [rdx + 16]
%1 xmm2, xmm3
movdqa [rcx + 16], xmm2
add rcx, byte 32
add rdx, byte 32
dec r9d
jnz AdaptAddLoop0%1
emms
ret
%endmacro
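; AdaptAddUnaligned: same operation, but the pAdapt (rdx) side is loaded with
; lddqu (SSE3), so it need not be 16-byte aligned. r9d arrives as the number
; of 32-byte blocks and is split into pairs (64 bytes per loop iteration),
; with r8d holding an optional single-block tail.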
%imacro AdaptAddUnaligned 1
mov r8d, r9d
and r8d, 1 ; r8d = 1 if a single 32-byte block remains
shr r9d, 1 ; r9d = number of 64-byte iterations
cmp r9d, byte 0
je short AdaptAddLoopULast%1
AdaptAddLoopU%1: ; main loop: 64 bytes (32 shorts) per iteration
movdqa xmm0, [rcx]
lddqu xmm1, [rdx] ; lddqu tolerates the unaligned pAdapt pointer
%1 xmm0, xmm1
movdqa [rcx], xmm0
movdqa xmm2, [rcx + 16]
lddqu xmm3, [rdx + 16]
%1 xmm2, xmm3
movdqa [rcx + 16], xmm2
movdqa xmm4, [rcx + 32]
lddqu xmm5, [rdx + 32]
%1 xmm4, xmm5
movdqa [rcx + 32], xmm4
movdqa xmm6, [rcx + 48]
lddqu xmm7, [rdx + 48]
%1 xmm6, xmm7
movdqa [rcx + 48], xmm6
add rcx, byte 64
add rdx, byte 64
dec r9d
jnz AdaptAddLoopU%1
AdaptAddLoopULast%1: ; one remaining 32-byte block when the count was odd
cmp r8d, byte 0
je short AdaptAddLoopUEnd%1
movdqa xmm0, [rcx]
lddqu xmm1, [rdx]
%1 xmm0, xmm1
movdqa [rcx], xmm0
movdqa xmm2, [rcx + 16]
lddqu xmm3, [rdx + 16]
%1 xmm2, xmm3
movdqa [rcx + 16], xmm2
AdaptAddLoopUEnd%1:
emms
ret
%endmacro
;
; void Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
; r9d nOrder
; r8d nDirection
; rdx pAdapt
; rcx pM
; [rsp+ 0] Return Address
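;
; Roughly equivalent C (a hedged sketch, not part of the original source;
; nOrder is assumed to be a multiple of 16, as the shr by 4 below implies):
;
;   void Adapt(short* pM, const short* pAdapt, int nDirection, int nOrder)
;   {
;       if (nDirection > 0)
;           for (int i = 0; i < nOrder; i++) pM[i] += pAdapt[i]; /* paddw, wrapping */
;       else if (nDirection < 0)
;           for (int i = 0; i < nOrder; i++) pM[i] -= pAdapt[i]; /* psubw, wrapping */
;   }
;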
align 16
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
proc Adapt
shr r9d, 4 ; r9d = nOrder / 16 (32-byte blocks)
cmp r8d, byte 0 ; nDirection
jle AdaptSub1 ; nDirection <= 0: skip the add path
AdaptAddUnaligned paddw
align 16
nop
nop
nop
nop
nop
nop
nop
AdaptSub1: je AdaptDone1 ; nDirection == 0: nothing to adapt
AdaptAddUnaligned psubw
AdaptDone1:
endproc
;
; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
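; Win64 register mapping: rcx = pA, rdx = pB, r8d = nOrder.
;
; Roughly equivalent C (a hedged sketch, not part of the original source;
; nOrder is assumed to be a multiple of 16):
;
;   int CalculateDotProduct(const short* pA, const short* pB, int nOrder)
;   {
;       int nSum = 0;
;       for (int i = 0; i < nOrder; i++)
;           nSum += pA[i] * pB[i];
;       return nSum;
;   }
;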
align 16
nop
nop
nop
nop
nop
proc CalculateDotProduct
shr r8d, 4 ; r8d = nOrder / 16 (32-byte blocks)
pxor xmm7, xmm7 ; xmm7 accumulates four dword partial sums
mov r9d, r8d
and r9d, 1 ; r9d = 1 if a single 32-byte block remains
shr r8d, 1 ; r8d = number of 64-byte iterations
cmp r8d, byte 0
je DotNonEven
LoopDotEven: ; main loop: 64 bytes (32 shorts) per iteration
lddqu xmm0, [rcx] ; pA
pmaddwd xmm0, [rdx] ; pB
paddd xmm7, xmm0
lddqu xmm1, [rcx + 16]
pmaddwd xmm1, [rdx + 16]
paddd xmm7, xmm1
lddqu xmm2, [rcx + 32]
pmaddwd xmm2, [rdx + 32]
paddd xmm7, xmm2
lddqu xmm3, [rcx + 48]
pmaddwd xmm3, [rdx + 48]
add rcx, byte 64
add rdx, byte 64
paddd xmm7, xmm3
dec r8d
jnz short LoopDotEven
DotNonEven: ; one remaining 32-byte block when nOrder/16 was odd
cmp r9d, byte 0
je DotFinal
lddqu xmm0, [rcx] ; pA
pmaddwd xmm0, [rdx] ; pB
paddd xmm7, xmm0
lddqu xmm1, [rcx + 16]
pmaddwd xmm1, [rdx + 16]
paddd xmm7, xmm1
DotFinal:
; horizontal sum of the four dword partials in xmm7
movdqa xmm6, xmm7
psrldq xmm6, 8 ; add the high qword onto the low qword
paddd xmm7, xmm6
movshdup xmm6, xmm7 ; add element 1 onto element 0 (SSE3)
paddd xmm7, xmm6
movd eax, xmm7 ; return the 32-bit sum in eax
emms
endproc
;
; BOOL GetMMXAvailable ( void );
;
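; Despite the legacy name, this reports CPUID leaf 1, ECX bit 0, i.e. SSE3,
; which is what the lddqu/movshdup instructions above require.
;
; Roughly equivalent C (a hedged sketch using GCC/Clang <cpuid.h>, not part
; of the original source):
;
;   #include <cpuid.h>
;   int GetMMXAvailable(void)
;   {
;       unsigned int eax, ebx, ecx, edx;
;       if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
;           return 0;
;       return ecx & 1; /* bit 0 = SSE3 */
;   }
;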
proc GetMMXAvailable
push rbx ; CPUID clobbers rbx, which is callee-saved
push rcx
push rdx
mov eax, 1 ; CPUID leaf 1: feature flags
CPUID
test ecx, 1 ; ECX bit 0 = SSE3
setnz al
and eax, byte 1 ; return 0 or 1 in eax
pop rdx
pop rcx
pop rbx
endproc