From 3179fa7d840b8c3095826797a2886752323c5f2c Mon Sep 17 00:00:00 2001 From: chudov Date: Tue, 14 Oct 2008 04:16:26 +0000 Subject: [PATCH] More SSE2 optimizations --- MAC_SDK/Source/MACLib/Assembly/Assembly.obj | Bin 836 -> 836 bytes MAC_SDK/Source/MACLib/Assembly/Assembly64.nas | 114 +++++++++++------- MAC_SDK/Source/MACLib/Assembly/Assembly64.obj | Bin 825 -> 1237 bytes 3 files changed, 70 insertions(+), 44 deletions(-) diff --git a/MAC_SDK/Source/MACLib/Assembly/Assembly.obj b/MAC_SDK/Source/MACLib/Assembly/Assembly.obj index 65a09fdff51cc9a8d098ef24ffa92c003cd0356a..7f52bbaa7fd882a37bae6261b23166ca0525a399 100644 GIT binary patch delta 16 XcmX@Yc7%=HhmnzCgZP(??6%ARD9;3) delta 16 XcmX@Yc7%=Hhmnz?^7H47?6%AREO7-v diff --git a/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas b/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas index 9f4de5a..f316e1d 100644 --- a/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas +++ b/MAC_SDK/Source/MACLib/Assembly/Assembly64.nas @@ -3,6 +3,67 @@ segment_code +%imacro AdaptAddAligned 1 +AdaptAddLoop0%1: + movdqa xmm0, [rcx] + movdqa xmm1, [rdx] + %1 xmm0, xmm1 + movdqa [rcx], xmm0 + movdqa xmm2, [rcx + 16] + movdqa xmm3, [rdx + 16] + %1 xmm2, xmm3 + movdqa [rcx + 16], xmm2 + add rcx, byte 32 + add rdx, byte 32 + dec r9d + jnz AdaptAddLoop0%1 + emms + ret +%endmacro + +%imacro AdaptAddUnaligned 1 + mov r8d, r9d + and r8d, 1 + shr r9d, 1 + cmp r9d, byte 0 + je short AdaptAddLoopULast%1 +AdaptAddLoopU%1: + movdqa xmm0, [rcx] + movdqu xmm1, [rdx] + %1 xmm0, xmm1 + movdqa [rcx], xmm0 + movdqa xmm2, [rcx + 16] + movdqu xmm3, [rdx + 16] + %1 xmm2, xmm3 + movdqa [rcx + 16], xmm2 + movdqa xmm4, [rcx+32] + movdqu xmm5, [rdx+32] + %1 xmm4, xmm5 + movdqa [rcx+32], xmm4 + movdqa xmm6, [rcx + 48] + movdqu xmm7, [rdx + 48] + %1 xmm6, xmm7 + movdqa [rcx + 48], xmm6 + add rcx, byte 64 + add rdx, byte 64 + dec r9d + jnz AdaptAddLoopU%1 +AdaptAddLoopULast%1: + cmp r8d, byte 0 + je short AdaptAddLoopUEnd%1 + movdqa xmm0, [rcx] + movdqu xmm1, [rdx] + %1 xmm0, xmm1 + movdqa [rcx], xmm0 + movdqa xmm2, [rcx + 16] + movdqu xmm3, [rdx + 16] + %1 xmm2, xmm3 + movdqa [rcx + 16], xmm2 +AdaptAddLoopUEnd%1: + emms + ret +%endmacro + ; ; void Adapt ( short* pM, const short* pAdapt, int nDirection, int nOrder ) ; @@ -28,28 +89,9 @@ proc Adapt shr r9d, 4 cmp r8d, byte 0 ; nDirection - jle short AdaptSub + jle AdaptSub1 - - mov r8, rdx -; and edx, 0xfffffff0 - and r8b, 0xf -AdaptAddLoop: - movdqa xmm0, [rcx] - movdqu xmm1, [rdx] - paddw xmm0, xmm1 - movdqa [rcx], xmm0 - movdqa xmm2, [rcx + 16] - movdqu xmm3, [rdx + 16] - paddw xmm2, xmm3 - movdqa [rcx + 16], xmm2 - add rcx, byte 32 - add rdx, byte 32 - dec r9d - jnz AdaptAddLoop - - emms - ret +AdaptAddUnaligned paddw align 16 nop @@ -62,30 +104,10 @@ AdaptAddLoop: nop nop nop - nop - nop - nop - nop - -AdaptSub: je short AdaptDone - -AdaptSubLoop: - movdqa xmm0, [rcx] - movdqu xmm1, [rdx] - psubw xmm0, xmm1 - movdqa [rcx], xmm0 - movdqa xmm2, [rcx + 16] - movdqu xmm3, [rdx + 16] - psubw xmm2, xmm3 - movdqa [rcx + 16], xmm2 - add rcx, byte 32 - add rdx, byte 32 - dec r9d - jnz AdaptSubLoop - - emms -AdaptDone: +AdaptSub1: je AdaptDone1 +AdaptAddUnaligned psubw +AdaptDone1: endproc ; @@ -116,6 +138,9 @@ proc CalculateDotProduct and r9d, 1 shr r8d, 1 + cmp r8d, byte 0 + je DotNonEven + LoopDotEven: movdqu xmm0, [rcx] ;pA pmaddwd xmm0, [rdx] ;pB @@ -134,6 +159,7 @@ LoopDotEven: dec r8d jnz short LoopDotEven +DotNonEven: cmp r9d, byte 0 je DotFinal diff --git a/MAC_SDK/Source/MACLib/Assembly/Assembly64.obj b/MAC_SDK/Source/MACLib/Assembly/Assembly64.obj index 2b37dcd5f27a2c8c4947a66d4a72796dd1030221..3c04ea4c10c15be08ef3884c44e422a5719e05e5 100644 GIT binary patch literal 1237 zcmYdkV`Mlh@x{ZRiGe|k0Rr?&QY%UzOfV_T#K2&~z`y`f0~TOlU{GL4NSFWzjt5_| zI5z)a;O|?+z`)?zdBU;r0V4wggX6`Qj?F(AO2X3k^BF($=X0g;|2>$-U(cAvpC2gj znLj^DAdUa;OILxcEKw%HZ2t1YaJPaO{4G-nx$`H6JAY!j^CzY|e-hzNoM!Wv zA4YX3$gxoDc<=>F8vpzMuAK)R8y`YK_XRivoImsDGp6x>WlH1!_v15vzNbJM|5q=8 zH2!}-KlA4YDx~p$4N^$s|M%-Ne}1Gv8voZQcrd2%|NHIu|3v9ta5PEbHYklh|9cvL z@jZ?-{(t{q92N+t?gxMQ;T<3s9o9?B%t?i&k$?Xg7}6OXi;GiplX5D}O!V>+i$PMz z$qS^3nGux0Kr#rdmzY$XpHm7>h5!FEFfcR3JEkNSlrS)GF)%QIbTKhN1&~BQ`dJtl z7_^~km_Y&z;tUK7jVK~|3=9mbP()l97#Mb;i1;xuFkD6vNn~JP&}4+z4l%icfq@|i zMWl~`fguz{WF`XxLmpIw3GAMA7$S!l7#KRCY9J<`V_;x70~c}0FLBGvOU!u#7XcY` zlYxPOmkHt?h{>-|M1FxXASlorQ&N2L^9w=?5>ryjA;AU{@<}W%!6NLMmx3f3T$*Hv zs;!{7GzptYn8F~F5TY*md8vjB@y>}k$)!1oC8;j?B>_eGDW%CJ4A{g#u6D^UaV<;D gV{pkY@ypKxGvnP;OMHDJ9Lo|ja}twsQW?M5Aid41;3XZ*~c&y~ji z_uyy#dJsEM;4^=Ilt3E)-^-u*>jMQmnhz>?G#^rM{C~3a0)P4BiHs5^B}NF1KQT1^ z#HCSk@^VIr$$U&wlkYPwWAbI3+|J}zFT%jUz{tSBz|7#7l2}j@T$&UI7hzyvU=UIz;GC