mirror of
https://github.com/claunia/cuetools.net.git
synced 2025-12-16 18:14:25 +00:00
playing with SSE2
This commit is contained in:
@@ -95,10 +95,6 @@ endproc
|
||||
;
|
||||
; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
|
||||
;
|
||||
; r8d      nOrder
|
||||
; rdx      pB
|
||||
; rcx      pA
|
||||
; [rsp+ 0] Return Address
|
||||
|
||||
align 16
|
||||
nop
|
||||
@@ -118,90 +114,48 @@ endproc
|
||||
|
||||
;-----------------------------------------------------------------------
; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
; MMX dot product of two arrays of signed 16-bit samples.
;
; In:    rcx = pA, rdx = pB, r8d = nOrder
;        (these are the argument registers of the Microsoft x64 convention)
; Out:   eax = sum of pA[i] * pB[i] over the first (nOrder / 16) * 16
;        samples; any remainder past a multiple of 16 is ignored.
;        Partial sums are added with paddd and therefore wrap at 32 bits.
;        Returns 0 when nOrder < 16.
; Clobb: rcx, rdx, r8, mm0-mm3, mm6, mm7, flags
;        MMX state is cleared with emms before returning.
; Note:  relies on the `endproc` macro (defined elsewhere in the file)
;        to emit the return, exactly as the previous version did.
;-----------------------------------------------------------------------
proc CalculateDotProduct

        pxor    mm7, mm7                ; mm7 = two packed 32-bit partial sums
        shr     r8d, 4                  ; r8d = number of 16-sample (32-byte) blocks
        jz      doneDot                 ; no full block: skip the loop instead of
                                        ; letting dec/jnz wrap around 2^32 times

loopDot:
        movq    mm0, [rcx]              ; samples  0..3  of this block (pA)
        pmaddwd mm0, [rdx]              ; multiply by pB, add adjacent pairs
        paddd   mm7, mm0
        movq    mm1, [rcx + 8]          ; samples  4..7
        pmaddwd mm1, [rdx + 8]
        paddd   mm7, mm1
        movq    mm2, [rcx + 16]         ; samples  8..11
        pmaddwd mm2, [rdx + 16]
        paddd   mm7, mm2
        movq    mm3, [rcx + 24]         ; samples 12..15
        pmaddwd mm3, [rdx + 24]
        add     rcx, byte 32            ; advance both inputs by one block
        add     rdx, byte 32
        paddd   mm7, mm3
        dec     r8d
        jnz     loopDot

doneDot:
        movq    mm6, mm7                ; fold the two 32-bit lanes together:
        psrlq   mm7, 32                 ;   mm7 = high lane moved to low lane
        paddd   mm6, mm7                ;   mm6 low lane = high + low
        movd    eax, mm6                ; return value
        emms                            ; leave MMX state so x87 code can run
endproc
|
||||
|
||||
;
|
||||
; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
|
||||
;
|
||||
; r8d      nOrder
|
||||
; rdx      pB
|
||||
; rcx      pA
|
||||
; [rsp+ 0] Return Address
|
||||
|
||||
align 16
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
|
||||
;-----------------------------------------------------------------------
; int CalculateDotProductXMM ( const short* pA, const short* pB, int nOrder )
;
; SSE2 dot product of two arrays of signed 16-bit samples.
;
; In:    rcx = pA, rdx = pB, r8d = nOrder
;        (these are the argument registers of the Microsoft x64 convention)
; Out:   eax = sum of pA[i] * pB[i] over the first (nOrder / 16) * 16
;        samples; any remainder past a multiple of 16 is ignored.
;        Partial sums are added with paddd and therefore wrap at 32 bits.
;        Returns 0 when nOrder < 16.
; Clobb: rcx, rdx, r8, r9, xmm0-xmm5, flags
;        Only xmm0-xmm5 are used: xmm6 and up are callee-saved in the
;        Microsoft x64 convention and must not be modified without saving.
; Align: pA is read with movdqu and may be unaligned. pB is used as a
;        direct memory operand of pmaddwd, which faults unless the
;        address is 16-byte aligned.
; Note:  no emms is needed because no MMX register is touched.
;        Relies on the `endproc` macro (defined elsewhere in the file)
;        to emit the return, exactly as the previous version did.
;-----------------------------------------------------------------------
proc CalculateDotProductXMM

        shr     r8d, 4                  ; r8d = number of 16-sample (32-byte) blocks
        pxor    xmm4, xmm4              ; xmm4 = four packed 32-bit partial sums
        mov     r9d, r8d
        and     r9d, 1                  ; r9d = 1 if one odd block is left over
        shr     r8d, 1                  ; r8d = number of 2-block (64-byte) passes
        jz      DotOddBlock             ; no full pass (nOrder < 32): skip the loop
                                        ; instead of letting dec/jnz wrap around

LoopDotEven:
        movdqu  xmm0, [rcx]             ; samples  0..7  of this pass (pA)
        pmaddwd xmm0, [rdx]             ; multiply by pB, add adjacent pairs
        paddd   xmm4, xmm0
        movdqu  xmm1, [rcx + 16]        ; samples  8..15
        pmaddwd xmm1, [rdx + 16]
        paddd   xmm4, xmm1
        movdqu  xmm2, [rcx + 32]        ; samples 16..23
        pmaddwd xmm2, [rdx + 32]
        paddd   xmm4, xmm2
        movdqu  xmm3, [rcx + 48]        ; samples 24..31
        pmaddwd xmm3, [rdx + 48]
        add     rcx, byte 64            ; advance both inputs by one pass
        add     rdx, byte 64
        paddd   xmm4, xmm3
        dec     r8d
        jnz     short LoopDotEven

DotOddBlock:
        test    r9d, r9d                ; was the block count odd?
        jz      DotFinal

        movdqu  xmm0, [rcx]             ; remaining block, samples 0..7
        pmaddwd xmm0, [rdx]
        paddd   xmm4, xmm0
        movdqu  xmm1, [rcx + 16]        ; remaining block, samples 8..15
        pmaddwd xmm1, [rdx + 16]
        paddd   xmm4, xmm1

DotFinal:
        movdqa  xmm5, xmm4              ; horizontal add of the four lanes:
        psrldq  xmm5, 8                 ;   bring lanes 2,3 down onto lanes 0,1
        paddd   xmm4, xmm5
        movdqa  xmm5, xmm4
        psrldq  xmm5, 4                 ;   bring lane 1 down onto lane 0
        paddd   xmm4, xmm5
        movd    eax, xmm4               ; return value = lane 0
endproc
|
||||
|
||||
@@ -211,30 +165,5 @@ endproc
|
||||
;
|
||||
|
||||
;-----------------------------------------------------------------------
; GetMMXAvailable
;
; Out:   eax = 1, unconditionally.
; Clobb: rax only; flags and all other registers are left untouched.
;
; The previous body probed EFLAGS.ID and CPUID bit 23 (MMX) and then
; overwrote the outcome with `mov eax, 1`, so the probe had no effect on
; the return value; it also left EFLAGS.ID toggled. The probe is dropped.
; Returning a constant is sound here because this is 64-bit code and
; MMX is part of the x86-64 baseline.
; Relies on the `endproc` macro (defined elsewhere in the file) to emit
; the return, exactly as the previous version did.
;-----------------------------------------------------------------------
proc GetMMXAvailable
        mov     eax, 1                  ; MMX is always present in 64-bit mode
endproc
|
||||
|
||||
; end
|
||||
|
||||
Reference in New Issue
Block a user