playing with SSE2

This commit is contained in:
chudov
2008-10-14 00:19:30 +00:00
parent 30a26ae9a8
commit f1f8e8e308
5 changed files with 35 additions and 327 deletions

View File

@@ -1,180 +0,0 @@
%include "Tools64.inc"
segment_code
;
; void Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder )
;
; r9d nOrder
; r8d nDirection
; rdx pAdapt
; rcx pM
; [rsp+ 0] Return Address
align 16
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
proc Adapt
; Element-wise update of pM[] by pAdapt[], in place, using 16-bit wrapping
; arithmetic (paddw / psubw):
;   nDirection > 0 : pM[i] += pAdapt[i]
;   nDirection < 0 : pM[i] -= pAdapt[i]
;   nDirection == 0: pM is left untouched
; In:    rcx = pM, rdx = pAdapt, r8d = nDirection, r9d = nOrder
; Clobb: rcx, rdx, r9d, mm0-mm3, flags
; Each loop iteration handles 32 bytes (16 shorts), so only nOrder >> 4 whole
; chunks are processed; any remainder below 16 elements is ignored.
; NOTE(review): the fall-through exit at AdaptDone relies on `endproc`
; (Tools64.inc, not visible here) emitting the final return - confirm.
            shr     r9d, 4                  ; r9d = number of 16-element chunks
            jz      near AdaptDone          ; zero chunks: without this the dec/jnz
                                            ; loops would wrap to 0xFFFFFFFF passes
            cmp     r8d, byte 0             ; nDirection
            jle     short AdaptSub

AdaptAddLoop:
            movq    mm0, [rcx]
            paddw   mm0, [rdx]
            movq    [rcx], mm0
            movq    mm1, [rcx + 8]
            paddw   mm1, [rdx + 8]
            movq    [rcx + 8], mm1
            movq    mm2, [rcx + 16]
            paddw   mm2, [rdx + 16]
            movq    [rcx + 16], mm2
            movq    mm3, [rcx + 24]
            paddw   mm3, [rdx + 24]
            movq    [rcx + 24], mm3
            add     rcx, byte 32
            add     rdx, byte 32
            dec     r9d
            jnz     AdaptAddLoop
            emms                            ; release MMX state before returning
            ret

            ; Padding ahead of the subtract path; presumably chosen so that
            ; AdaptSubLoop (after the 2-byte short je) starts 16-byte aligned.
            align 16
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
            nop
AdaptSub:   je      short AdaptDone         ; flags still from the cmp above:
                                            ; nDirection == 0 means nothing to do
AdaptSubLoop:
            movq    mm0, [rcx]
            psubw   mm0, [rdx]
            movq    [rcx], mm0
            movq    mm1, [rcx + 8]
            psubw   mm1, [rdx + 8]
            movq    [rcx + 8], mm1
            movq    mm2, [rcx + 16]
            psubw   mm2, [rdx + 16]
            movq    [rcx + 16], mm2
            movq    mm3, [rcx + 24]
            psubw   mm3, [rdx + 24]
            movq    [rcx + 24], mm3
            add     rcx, byte 32
            add     rdx, byte 32
            dec     r9d
            jnz     AdaptSubLoop
            emms                            ; release MMX state before returning
AdaptDone:
endproc
;
; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
; r8d nOrder
; rdx pB
; rcx pA
; [rsp+ 0] Return Address
align 16
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
proc CalculateDotProduct
; Dot product of two arrays of signed 16-bit values, accumulated with
; wrapping 32-bit adds (pmaddwd + paddd).
; In:    rcx = pA, rdx = pB, r8d = nOrder
; Out:   eax = sum of pA[i] * pB[i] over the processed elements
; Clobb: rcx, rdx, r8d, mm0-mm3, mm6, mm7, flags
; Each loop iteration consumes 32 bytes (16 shorts) from each array, so only
; nOrder >> 4 whole chunks contribute; with fewer than 16 elements the
; result is 0.
; NOTE(review): the return instruction is expected to come from `endproc`
; (Tools64.inc, not visible here) - confirm.
            pxor    mm7, mm7                ; mm7 = two 32-bit partial sums, zeroed
                                            ; (MMX ops leave EFLAGS alone)
            shr     r8d, 4                  ; r8d = number of 16-element chunks
            jz      short DotReduce         ; zero chunks: without this the dec/jnz
                                            ; loop would wrap to 0xFFFFFFFF passes
loopDot:    movq    mm0, [rcx]              ;pA
            pmaddwd mm0, [rdx]              ;pB
            paddd   mm7, mm0
            movq    mm1, [rcx + 8]
            pmaddwd mm1, [rdx + 8]
            paddd   mm7, mm1
            movq    mm2, [rcx + 16]
            pmaddwd mm2, [rdx + 16]
            paddd   mm7, mm2
            movq    mm3, [rcx + 24]
            pmaddwd mm3, [rdx + 24]
            add     rcx, byte 32
            add     rdx, byte 32
            paddd   mm7, mm3
            dec     r8d
            jnz     loopDot
DotReduce:
            movq    mm6, mm7                ; fold the high dword of the
            psrlq   mm7, 32                 ; accumulator onto the low dword
            paddd   mm6, mm7
            movd    eax, mm6                ; eax = final 32-bit sum
            emms                            ; release MMX state before returning
endproc
;
; BOOL GetMMXAvailable ( void );
;
proc GetMMXAvailable
; Out:   eax = 1 when CPUID leaf 1 reports MMX (EDX bit 23), otherwise 0.
;        Also 0 when the EFLAGS ID bit cannot be toggled (no CPUID).
; Saves and restores rcx, rdx and rbx. rax only carries the result, so it
; is not saved. The ID flag is left in its toggled state on return.
; NOTE(review): the return instruction is expected to come from `endproc`
; (Tools64.inc, not visible here) - confirm.
            push    rcx
            push    rdx
            push    rbx                     ; cpuid overwrites rbx

            pushfq
            pop     rax
            mov     rcx, rax                ; rcx = RFLAGS before the probe
            xor     rax, 1 << 21            ; flip the ID flag
            push    rax
            popfq
            pushfq
            pop     rax                     ; rax = RFLAGS after the write-back
            cmp     rax, rcx                ; equal -> ID flag did not stick
            je      short MMXProbeDone      ; no CPUID command, so no MMX (ZF = 1)

            mov     eax, 1                  ; CPUID leaf 1: feature flags
            cpuid
            test    edx, 1 << 23            ; ZF = 0 when the MMX bit is set

MMXProbeDone:
            pop     rbx                     ; pops leave ZF intact for setnz below
            pop     rdx
            pop     rcx
            setnz   al
            movzx   eax, al                 ; eax = 0 or 1
endproc
end

View File

@@ -95,10 +95,6 @@ endproc
; ;
; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder ) ; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
; ;
; [esp+12] nOrder
; [esp+ 8] pB
; [esp+ 4] pA
; [esp+ 0] Return Address
align 16 align 16
nop nop
@@ -118,90 +114,48 @@ endproc
proc CalculateDotProduct proc CalculateDotProduct
shr r8d, 4
pxor mm7, mm7
loopDot: movq mm0, [rcx] ;pA
pmaddwd mm0, [rdx] ;pB
paddd mm7, mm0
movq mm1, [rcx + 8]
pmaddwd mm1, [rdx + 8]
paddd mm7, mm1
movq mm2, [rcx + 16]
pmaddwd mm2, [rdx + 16]
paddd mm7, mm2
movq mm3, [rcx + 24]
pmaddwd mm3, [rdx + 24]
add rcx, byte 32
add rdx, byte 32
paddd mm7, mm3
dec r8d
jnz loopDot
movq mm6, mm7
psrlq mm7, 32
paddd mm6, mm7
movd eax, mm6
emms
endproc
;
; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder )
;
; [esp+12] nOrder
; [esp+ 8] pB
; [esp+ 4] pA
; [esp+ 0] Return Address
align 16
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
proc CalculateDotProductXMM
shr r8d, 4 shr r8d, 4
pxor xmm7, xmm7 pxor xmm7, xmm7
mov r9d, r8d
and r9d, 1
shr r8d, 1
loopDotXMM: movdqu xmm0, [rcx] ;pA LoopDotEven:
movdqu xmm0, [rcx] ;pA
pmaddwd xmm0, [rdx] ;pB pmaddwd xmm0, [rdx] ;pB
paddd xmm7, xmm0 paddd xmm7, xmm0
movdqu xmm1, [rcx + 16] movdqu xmm1, [rcx + 16]
pmaddwd xmm1, [rdx + 16] pmaddwd xmm1, [rdx + 16]
; paddd xmm7, xmm1
; movq xmm2, [rcx + 32]
; pmaddwd xmm2, [rdx + 32]
; paddd xmm7, mm2
; movq xmm3, [rcx + 48]
; pmaddwd xmm3, [rdx + 48]
add rcx, byte 32
add rdx, byte 32
; paddd xmm7, xmm3
paddd xmm7, xmm1 paddd xmm7, xmm1
movdqu xmm2, [rcx + 32]
pmaddwd xmm2, [rdx + 32]
paddd xmm7, xmm2
movdqu xmm3, [rcx + 48]
pmaddwd xmm3, [rdx + 48]
add rcx, byte 64
add rdx, byte 64
paddd xmm7, xmm3
dec r8d dec r8d
jnz loopDotXMM jnz short LoopDotEven
movq xmm5, xmm7 cmp r9d, byte 0
psrldq xmm5, 16 je DotFinal
movq xmm4, xmm5
psrlq xmm5, 32 movdqu xmm0, [rcx] ;pA
movq xmm6, xmm7 pmaddwd xmm0, [rdx] ;pB
psrlq xmm7, 32 paddd xmm7, xmm0
paddd xmm6, xmm4 movdqu xmm1, [rcx + 16]
paddd xmm6, xmm5 pmaddwd xmm1, [rdx + 16]
paddd xmm6, xmm7 paddd xmm7, xmm1
movd eax, xmm6
DotFinal:
movdqa xmm6, xmm7
psrldq xmm6, 8
paddd xmm7, xmm6
movdqa xmm6, xmm7
psrldq xmm6, 4
paddd xmm7, xmm6
movd eax, xmm7
emms emms
endproc endproc
@@ -211,30 +165,5 @@ endproc
; ;
proc GetMMXAvailable proc GetMMXAvailable
push rax mov eax, 1
push rcx
push rdx
push rbx
pushfq
pop rax
mov rcx, rax
xor rax, 0x200000
push rax
popfq
pushfq
pop rax
cmp rax, rcx
jz short return ; no CPUID command, so no MMX
mov rax,1
CPUID
test rdx,0x800000
return: pop rbx
pop rdx
pop rcx
pop rax
setnz al
and eax, byte 1
endproc endproc
; end

View File

@@ -1,41 +0,0 @@
usage: nasm [-@ response file] [-o outfile] [-f format] [-l listfile]
[options...] [--] filename
or nasm -v for version info
-t assemble in SciTech TASM compatible mode
-g generate debug information in selected format.
-E (or -e) preprocess only (writes output to stdout by default)
-a don't preprocess (assemble only)
-M generate Makefile dependencies on stdout
-MG d:o, missing files assumed generated
-Z<file> redirect error messages to file
-s redirect error messages to stdout
-F format select a debugging format
-I<path> adds a pathname to the include file path
-O<digit> optimize branch offsets (-O0 disables, default)
-P<file> pre-includes a file
-D<macro>[=<value>] pre-defines a macro
-U<macro> undefines a macro
-X<format> specifies error reporting format (gnu or vc)
-w+foo enables warning foo (equiv. -Wfoo)
-w-foo disable warning foo (equiv. -Wno-foo)
Warnings:
error treat warnings as errors (default off)
macro-params macro calls with wrong parameter count (default on)
macro-selfref cyclic macro references (default off)
macro-defaults macros with more default than optional parameters (default on)
orphan-labels labels alone on lines without trailing `:' (default on)
number-overflow numeric constants does not fit in 64 bits (default on)
gnu-elf-extensions using 8- or 16-bit relocation in ELF32, a GNU extension (default off)
float-overflow floating point overflow (default on)
float-denorm floating point denormal (default off)
float-underflow floating point underflow (default off)
float-toolong too many digits in floating-point number (default on)
response files should contain command line parameters, one per line.
For a list of valid output formats, use -hf.
For a list of debug formats, use -f <form> -y.