; NASM assembly language code for PAQ7.
; (C) 2005, Matt Mahoney.
; This is free software under GPL, http://www.gnu.org/licenses/gpl.txt

; MINGW g++: nasm paq7asm.asm -f win32 --prefix _
; DJGPP g++:  nasm paq7asm.asm -f coff --prefix _
; Borland, Mars: nasm paq7asm.asm -f obj --prefix _
; Linux:      nasm paq7asm.asm -f elf

; For other Windows compilers try -f win32 or -f obj. Some old versions
; of Linux should use -f aout instead of -f elf.
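
; From C or C++ these routines are declared roughly as follows (a sketch;
; the exact prototypes in the PAQ7 sources may differ slightly, and the
; leading underscore expected by --prefix _ is added by the C compiler):
;
;   extern "C" int  dot_product(short* a, short* b, int n);
;   extern "C" void train(short* t, short* w, int n, int err);
;   extern "C" void do_emms();
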
; This code will only work on a Pentium-MMX or higher. It doesn't
; use extended (Katmai/SSE) instructions. It won't work
; in 64-bit mode.

section .text use32 class=CODE

; Reset after MMX
global do_emms
do_emms:
  emms
  ret

; Vector product a*b of n signed words, returning signed dword scaled
; down by 8 bits. n is rounded up to a multiple of 8.
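;
; Roughly equivalent C, for reference only (a sketch, not part of the
; build; assumes the arrays are padded to a multiple of 8 words):
;
;   int dot_product(short* a, short* b, int n) {
;     int sum = 0;
;     n = (n + 7) & -8;                          // round n up
;     for (int i = 0; i < n; i += 2)             // pmaddwd pairs products before the shift
;       sum += (a[i]*b[i] + a[i+1]*b[i+1]) >> 8;
;     return sum;
;   }
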
global dot_product              ; (short* a, short* b, int n)
align 16
dot_product:
  mov eax, [esp+4]              ; a
  mov edx, [esp+8]              ; b
  mov ecx, [esp+12]             ; n
  add ecx, 7                    ; round n up to a multiple of 8
  and ecx, -8
  jz .done
  sub eax, 8
  sub edx, 8
  pxor mm0, mm0                 ; sum = 0
.loop:                          ; each iteration sums 8 products
  movq mm1, [eax+ecx*2]         ; pairwise sums of products in mm1, mm2
  pmaddwd mm1, [edx+ecx*2]
  movq mm2, [eax+ecx*2-8]
  pmaddwd mm2, [edx+ecx*2-8]
  psrad mm1, 8
  psrad mm2, 8
  paddd mm0, mm1
  paddd mm0, mm2
  sub ecx, 8
  ja .loop
  movq mm1, mm0                 ; add 2 halves of mm0 and return in eax
  psrlq mm1, 32
  paddd mm0, mm1
  movd eax, mm0
  emms
.done:
  ret

; This should work on a Pentium 4 or higher in 32-bit mode,
; but it isn't much faster than the MMX version so I don't use it.
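;
; The same computation written with SSE2 intrinsics, for reference only
; (a sketch; assumes 16-byte aligned inputs, as movdqa below requires):
;
;   #include <emmintrin.h>
;   int dot_product_sse2_ref(const short* a, const short* b, int n) {
;     __m128i sum = _mm_setzero_si128();
;     n = (n + 7) & -8;
;     for (int i = 0; i < n; i += 8) {           // 8 words per iteration
;       __m128i p = _mm_madd_epi16(_mm_load_si128((const __m128i*)(a + i)),
;                                  _mm_load_si128((const __m128i*)(b + i)));
;       sum = _mm_add_epi32(sum, _mm_srai_epi32(p, 8));
;     }
;     sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 8));  // fold high qword into low
;     sum = _mm_add_epi32(sum, _mm_srli_si128(sum, 4));  // fold remaining dword
;     return _mm_cvtsi128_si32(sum);
;   }
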
global dot_product_sse2         ; (short* a, short* b, int n)
align 16
dot_product_sse2:
  mov eax, [esp+4]              ; a
  mov edx, [esp+8]              ; b
  mov ecx, [esp+12]             ; n
  add ecx, 7                    ; round n up to a multiple of 8
  and ecx, -8
  jz .done
  sub eax, 16
  sub edx, 16
  pxor xmm0, xmm0               ; sum = 0
.loop:                          ; each iteration sums 8 products
  movdqa xmm1, [eax+ecx*2]      ; partial sums of vector product in xmm1
  pmaddwd xmm1, [edx+ecx*2]
  psrad xmm1, 8
  paddd xmm0, xmm1
  sub ecx, 8
  ja .loop
  movdqa xmm1, xmm0             ; add 4 parts of xmm0 and return in eax
  psrldq xmm1, 8
  paddd xmm0, xmm1
  movdqa xmm1, xmm0
  psrldq xmm1, 4
  paddd xmm0, xmm1
  movd eax, xmm0
.done:
  ret

; Train n neural network weights w[n] on inputs t[n] and err.
; w[i] += (t[i]*err*2+1) >> 17, bounded to +-32K.
; n is rounded up to a multiple of 8.
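;
; Roughly equivalent C, for reference only (a sketch; assumes err fits in
; a signed 16-bit word and ignores the intermediate saturation of t[i]*2):
;
;   void train(short* t, short* w, int n, int err) {
;     n = (n + 7) & -8;
;     for (int i = 0; i < n; ++i) {
;       int dw = (((t[i] * err * 2) >> 16) + 1) >> 1;   // pmulhw high word, then round
;       int x = w[i] + dw;
;       if (x > 32767) x = 32767;                       // saturate like paddsw
;       if (x < -32768) x = -32768;
;       w[i] = x;
;     }
;   }
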
global train                    ; (short* t, short* w, int n, int err)
align 16
train:
  mov eax, [esp+16]             ; err
  and eax, 0xffff               ; put 4 copies of err in mm0
  movd mm0, eax
  movd mm1, eax
  psllq mm1, 16
  por mm0, mm1
  movq mm1, mm0
  psllq mm1, 32
  por mm0, mm1
  pcmpeqb mm1, mm1              ; 4 copies of 1 in mm1
  psrlw mm1, 15
  mov eax, [esp+4]              ; t
  mov edx, [esp+8]              ; w
  mov ecx, [esp+12]             ; n
  add ecx, 7                    ; round n up to a multiple of 8
  and ecx, -8
  jz .done                      ; test n here, before the subs clobber the flags
  sub eax, 8
  sub edx, 8
.loop:                          ; each iteration adjusts 8 weights
  movq mm2, [edx+ecx*2]         ; w[i]
  movq mm3, [eax+ecx*2]         ; t[i]
  movq mm4, [edx+ecx*2-8]       ; w[i]
  movq mm5, [eax+ecx*2-8]       ; t[i]
  paddsw mm3, mm3               ; t[i]*2, saturated
  paddsw mm5, mm5
  pmulhw mm3, mm0               ; t[i]*2*err >> 16
  pmulhw mm5, mm0
  paddsw mm3, mm1               ; + 1 for rounding
  paddsw mm5, mm1
  psraw mm3, 1                  ; >> 1
  psraw mm5, 1
  paddsw mm2, mm3               ; w[i] += delta, saturated to +-32K
  paddsw mm4, mm5
  movq [edx+ecx*2], mm2
  movq [edx+ecx*2-8], mm4
  sub ecx, 8
  ja .loop
.done:
  emms
  ret