diff --git a/MAC_SDK/Source/MACLib/Assembly/Assembly.obj b/MAC_SDK/Source/MACLib/Assembly/Assembly.obj index 0a8f2af..85de600 100644 Binary files a/MAC_SDK/Source/MACLib/Assembly/Assembly.obj and b/MAC_SDK/Source/MACLib/Assembly/Assembly.obj differ diff --git a/MAC_SDK/Source/MACLib/Assembly/Assembly64.bak b/MAC_SDK/Source/MACLib/Assembly/Assembly64.bak deleted file mode 100644 index 30f8adf..0000000 --- a/MAC_SDK/Source/MACLib/Assembly/Assembly64.bak +++ /dev/null @@ -1,180 +0,0 @@ - -%include "Tools64.inc" - -segment_code - -; -; void Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder ) -; -; r9d nOrder -; r8d nDirection -; rdx pAdapt -; rcx pM -; [esp+ 0] Return Address - - align 16 - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop -proc Adapt - - shr r9d, 4 - - cmp r8d, byte 0 ; nDirection - jle short AdaptSub - -AdaptAddLoop: - movq mm0, [rcx] - paddw mm0, [rdx] - movq [rcx], mm0 - movq mm1, [rcx + 8] - paddw mm1, [rdx + 8] - movq [rcx + 8], mm1 - movq mm2, [rcx + 16] - paddw mm2, [rdx + 16] - movq [rcx + 16], mm2 - movq mm3, [rcx + 24] - paddw mm3, [rdx + 24] - movq [rcx + 24], mm3 - add rcx, byte 32 - add rdx, byte 32 - dec r9d - jnz AdaptAddLoop - - emms - ret - - align 16 - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - -AdaptSub: je short AdaptDone - -AdaptSubLoop: - movq mm0, [rcx] - psubw mm0, [rdx] - movq [rcx], mm0 - movq mm1, [rcx + 8] - psubw mm1, [rdx + 8] - movq [rcx + 8], mm1 - movq mm2, [rcx + 16] - psubw mm2, [rdx + 16] - movq [rcx + 16], mm2 - movq mm3, [rcx + 24] - psubw mm3, [rdx + 24] - movq [rcx + 24], mm3 - add rcx, byte 32 - add rdx, byte 32 - dec r9d - jnz AdaptSubLoop - - emms -AdaptDone: - -endproc - -; -; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder ) -; -; [esp+12] nOrder -; [esp+ 8] pB -; [esp+ 4] pA -; [esp+ 0] Return Address - - align 16 - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - -proc CalculateDotProduct - - shr r8d, 4 - pxor mm7, mm7 - -loopDot: movq mm0, [rcx] ;pA - pmaddwd mm0, [rdx] ;pB - paddd mm7, mm0 - movq mm1, [rcx + 8] - pmaddwd mm1, [rdx + 8] - paddd mm7, mm1 - movq mm2, [rcx + 16] - pmaddwd mm2, [rdx + 16] - paddd mm7, mm2 - movq mm3, [rcx + 24] - pmaddwd mm3, [rdx + 24] - add rcx, byte 32 - add rdx, byte 32 - paddd mm7, mm3 - dec r8d - jnz loopDot - - movq mm6, mm7 - psrlq mm7, 32 - paddd mm6, mm7 - movd eax, mm6 - emms -endproc - - -; -; BOOL GetMMXAvailable ( void ); -; - -proc GetMMXAvailable - push rax - push rcx - push rdx - push rbx - pushfq - pop rax - mov rcx, rax - xor rax, 0x200000 - push rax - popfq - pushfq - pop rax - cmp rax, rcx - jz short return ; no CPUID command, so no MMX - - mov rax,1 - CPUID - test rdx,0x800000 -return: pop rbx - pop rdx - pop rcx - pop rax - setnz al - and eax, byte 1 -endproc - - end diff --git a/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas b/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas index ef18cf7..bfe5017 100644 --- a/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas +++ b/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas @@ -95,10 +95,6 @@ endproc ; ; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder ) ; -; [esp+12] nOrder -; [esp+ 8] pB -; [esp+ 4] pA -; [esp+ 0] Return Address align 16 nop @@ -118,90 +114,48 @@ endproc proc CalculateDotProduct - shr r8d, 4 - pxor mm7, mm7 - -loopDot: movq mm0, [rcx] ;pA - pmaddwd mm0, [rdx] ;pB - paddd mm7, mm0 - movq mm1, [rcx + 8] - pmaddwd mm1, [rdx + 8] - paddd mm7, mm1 - movq mm2, [rcx + 16] - pmaddwd mm2, [rdx + 16] - paddd mm7, mm2 - movq mm3, [rcx + 24] - pmaddwd mm3, [rdx + 24] - add rcx, byte 32 - add rdx, byte 32 - paddd mm7, mm3 - dec r8d - jnz loopDot - - movq mm6, mm7 - psrlq mm7, 32 - paddd mm6, mm7 - movd eax, mm6 - emms -endproc - -; -; int CalculateDotProduct ( const short* pA, const short* pB, int nOrder ) -; -; [esp+12] nOrder -; [esp+ 8] pB -; [esp+ 4] pA -; [esp+ 0] Return Address - - align 16 - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - nop - -proc CalculateDotProductXMM - shr r8d, 4 pxor xmm7, xmm7 + mov r9d, r8d + and r9d, 1 + shr r8d, 1 -loopDotXMM: movdqu xmm0, [rcx] ;pA +LoopDotEven: + movdqu xmm0, [rcx] ;pA pmaddwd xmm0, [rdx] ;pB paddd xmm7, xmm0 - movdqu xmm1, [rcx + 16] - pmaddwd xmm1, [rdx + 16] -; paddd xmm7, xmm1 -; movq xmm2, [rcx + 32] -; pmaddwd xmm2, [rdx + 32] -; paddd xmm7, mm2 -; movq xmm3, [rcx + 48] -; pmaddwd xmm3, [rdx + 48] - add rcx, byte 32 - add rdx, byte 32 -; paddd xmm7, xmm3 + movdqu xmm1, [rcx + 16] + pmaddwd xmm1, [rdx + 16] paddd xmm7, xmm1 + movdqu xmm2, [rcx + 32] + pmaddwd xmm2, [rdx + 32] + paddd xmm7, xmm2 + movdqu xmm3, [rcx + 48] + pmaddwd xmm3, [rdx + 48] + add rcx, byte 64 + add rdx, byte 64 + paddd xmm7, xmm3 dec r8d - jnz loopDotXMM + jnz short LoopDotEven - movq xmm5, xmm7 - psrldq xmm5, 16 - movq xmm4, xmm5 - psrlq xmm5, 32 - movq xmm6, xmm7 - psrlq xmm7, 32 - paddd xmm6, xmm4 - paddd xmm6, xmm5 - paddd xmm6, xmm7 - movd eax, xmm6 + cmp r9d, byte 0 + je DotFinal + + movdqu xmm0, [rcx] ;pA + pmaddwd xmm0, [rdx] ;pB + paddd xmm7, xmm0 + movdqu xmm1, [rcx + 16] + pmaddwd xmm1, [rdx + 16] + paddd xmm7, xmm1 + +DotFinal: + movdqa xmm6, xmm7 + psrldq xmm6, 8 + paddd xmm7, xmm6 + movdqa xmm6, xmm7 + psrldq xmm6, 4 + paddd xmm7, xmm6 + movd eax, xmm7 emms endproc @@ -211,30 +165,5 @@ endproc ; proc GetMMXAvailable - push rax - push rcx - push rdx - push rbx - pushfq - pop rax - mov rcx, rax - xor rax, 0x200000 - push rax - popfq - pushfq - pop rax - cmp rax, rcx - jz short return ; no CPUID command, so no MMX - - mov rax,1 - CPUID - test rdx,0x800000 -return: pop rbx - pop rdx - pop rcx - pop rax - setnz al - and eax, byte 1 + mov eax, 1 endproc - -; end diff --git a/MAC_SDK/Source/MACLib/Assembly/Assembly64.obj b/MAC_SDK/Source/MACLib/Assembly/Assembly64.obj index ebee012..5993c5d 100644 Binary files a/MAC_SDK/Source/MACLib/Assembly/Assembly64.obj and b/MAC_SDK/Source/MACLib/Assembly/Assembly64.obj differ diff --git a/MAC_SDK/Source/MACLib/Assembly/hhh b/MAC_SDK/Source/MACLib/Assembly/hhh deleted file mode 100644 index bad3af6..0000000 --- a/MAC_SDK/Source/MACLib/Assembly/hhh +++ /dev/null @@ -1,41 +0,0 @@ -usage: nasm [-@ response file] [-o outfile] [-f format] [-l listfile] - [options...] [--] filename - or nasm -v for version info - - -t assemble in SciTech TASM compatible mode - -g generate debug information in selected format. - -E (or -e) preprocess only (writes output to stdout by default) - -a don't preprocess (assemble only) - -M generate Makefile dependencies on stdout - -MG d:o, missing files assumed generated - - -Z redirect error messages to file - -s redirect error messages to stdout - - -F format select a debugging format - - -I adds a pathname to the include file path - -O optimize branch offsets (-O0 disables, default) - -P pre-includes a file - -D[=] pre-defines a macro - -U undefines a macro - -X specifies error reporting format (gnu or vc) - -w+foo enables warning foo (equiv. -Wfoo) - -w-foo disable warning foo (equiv. -Wno-foo) -Warnings: - error treat warnings as errors (default off) - macro-params macro calls with wrong parameter count (default on) - macro-selfref cyclic macro references (default off) - macro-defaults macros with more default than optional parameters (default on) - orphan-labels labels alone on lines without trailing `:' (default on) - number-overflow numeric constants does not fit in 64 bits (default on) - gnu-elf-extensions using 8- or 16-bit relocation in ELF32, a GNU extension (default off) - float-overflow floating point overflow (default on) - float-denorm floating point denormal (default off) - float-underflow floating point underflow (default off) - float-toolong too many digits in floating-point number (default on) - -response files should contain command line parameters, one per line. - -For a list of valid output formats, use -hf. -For a list of debug formats, use -f
-y.