2 ;;; mblockq_sad_mmxe.s:
4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblock
5 ;;; quads (2 by 2 squares of adjacent macroblocks)
7 ;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled
8 ;;; evaluates macroblock quads. A lot of memory accesses can be saved
9 ;;; if each quad is done together rather than each macroblock in the
10 ;;; quad handled individually.
12 ;;; TODO: Really there ought to be MMX versions and the function's
13 ;;; specification should be documented...
15 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
19 ; This program is free software; you can redistribute it and/or
20 ; modify it under the terms of the GNU General Public License
21 ; as published by the Free Software Foundation; either version 2
22 ; of the License, or (at your option) any later version.
24 ; This program is distributed in the hope that it will be useful,
25 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
26 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 ; GNU General Public License for more details.
29 ; You should have received a copy of the GNU General Public License
30 ; along with this program; if not, write to the Free Software
31 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
35 ;;; CURRENTLY not used but used in testing as reference for tweaks...
36 global mblockq_sad1_REF
; ----------------------------------------------------------------------
; Reference routine: accumulates four sums of absolute differences (SADs)
; between 16-byte-wide rows read through p1 (eax) and p2 (ebx), using
; psadbw on 8 bytes at a time.
;
; NOTE(review): the exported symbol is mblockq_sad1_REF but the prototype
; comment below names mblockq_dist1_REF -- confirm which name is intended.
; NOTE(review): the embedded line numbers jump, so parts of this routine
; (entry label, frame setup, loop label, row counter update, result
; stores, return) are not visible here. The comments below describe only
; the instructions that are visible.
;
; Stack arguments as read below (prototype order):
;   [ebp+8]   blk1      -> eax (p1)
;   [ebp+12]  blk2      -> ebx (p2)
;   [ebp+16]  lx        -> edx (added to p1 to step to the next row)
;   [ebp+20]  h         -> edi (rowsleft)
;   [ebp+24]  weightvec -> eax (loaded after the row loop)
;
; Accumulators (each is a paddd destination below):
;   mm2 = (0,+2) SAD, mm3 = (+2,+2) SAD, per the section comments;
;   mm0 / mm1 = presumably the (+0,+0) and (+2,+0) SADs -- TODO confirm.
;
; ebx and edi are overwritten; whether they are saved and restored is not
; visible in this excerpt.
; ----------------------------------------------------------------------
38 ; void mblockq_dist1_REF(char *blk1,char *blk2,int lx,int h,int *weightvec);
57 push ebp ; save caller's frame pointer
66 pxor mm0, mm0 ; zero accumulator mm0
70 mov eax, [ebp+8] ; eax = p1 (blk1); assumes ebp was set to esp by a line not shown
71 mov ebx, [ebp+12] ; ebx = p2 (blk2)
72 mov edx, [ebp+16] ; edx = lx
74 mov edi, [ebp+20] ; edi = rowsleft (h)
83 movq mm4, [eax] ; load 1st 8 bytes of the p1 row
86 psadbw mm4, mm5 ; SAD against mm5 (presumably 1st 8 bytes of p2; its load is not visible)
87 paddd mm0, mm4 ; mm0 += SAD
88 movq mm4, [eax+8] ; load 2nd 8 bytes of the p1 row
90 psadbw mm4, [ebx+8] ; SAD against 2nd 8 bytes at p2
91 paddd mm0, mm4 ; mm0 += SAD
99 psadbw mm6, [ebx] ; SAD of mm6 against 8 bytes at [ebx] (original note: next row of p2)
100 paddd mm2, mm6 ; mm2 += SAD
101 psadbw mm7, [ebx+8] ; SAD of mm7 against 8 bytes at [ebx+8]; ebx is the p2 pointer, not p1
112 psadbw mm4, mm5 ; SAD against mm5 (original note: 1st 8 bytes of p2)
113 paddd mm1, mm4 ; mm1 += SAD
116 psadbw mm4, [ebx+8] ; SAD against 2nd 8 bytes at p2
117 paddd mm1, mm4 ; mm1 += SAD
122 ;; Do the (+2, +2 ) SAD
124 psadbw mm6, [ebx] ; compare to 1st 8 bytes of prev p2
125 psadbw mm7, [ebx+8] ; compare to 2nd 8 bytes of prev p2
127 paddd mm3, mm6 ; mm3 += SAD
131 add eax, edx ; p1 += lx (advance to the next row)
; Loop back while the zero flag is clear; the instruction that sets the
; flag (presumably the rowsleft decrement) is not visible in this excerpt.
135 jnz near nextrow_block_d1
137 ;; Do the last row of the (0,+2) SAD
139 movq mm4, [eax] ; load 1st 8 bytes of the p1 row
140 movq mm5, [eax+8] ; load 2nd 8 bytes of the p1 row
142 psadbw mm4, [ebx] ; SAD against 1st 8 bytes at p2
143 psadbw mm5, [ebx+8] ; SAD against 2nd 8 bytes at p2 (ebx is the p2 pointer, not p1)
144 paddd mm2, mm4 ; mm2 += SAD
150 ;; Do the last row of the (+2, +2) SAD
151 psadbw mm4, [ebx] ; compare to 1st 8 bytes of prev p2
152 psadbw mm5, [ebx+8] ; compare to 2nd 8 bytes of prev p2
153 paddd mm3, mm4 ; mm3 += SAD
157 mov eax, [ebp+24] ; eax = weightvec (5th argument); eax no longer holds p1
176 global mblockq_dist1_mmxe
; ----------------------------------------------------------------------
; Accumulates four sums of absolute differences (SADs) between
; 16-byte-wide rows read through p1 (eax) and p2 (ebx), packing two sums
; into each of mm0 and mm1 (see the register notes below).
;
; NOTE(review): the embedded line numbers jump, so parts of this routine
; (entry label, frame setup, loop label, row counter update, result
; stores, return) are not visible here. The comments below describe only
; the instructions that are visible.
;
; Stack arguments as read below (prototype order):
;   [ebp+8]   blk1      -> eax (p1)
;   [ebp+12]  blk2      -> ebx (p2)
;   [ebp+16]  lx        -> edx (added to p1 to step to the next row)
;   [ebp+20]  h         -> edi (rowsleft)
;   [ebp+24]  weightvec -> eax (loaded after the row loop)
;
; pshufw immediate used throughout: 2*1 + 3*4 + 0*16 + 1*64 selects source
; words 2,3,0,1 for destination words 0,1,2,3, i.e. it swaps the low and
; high 32-bit halves of the register. psadbw leaves its result in the low
; word with the upper words zero, so the following paddd only changes the
; low half; swapping halves chooses which of the two packed sums receives
; the addition.
;
; ebx and edi are overwritten; whether they are saved and restored is not
; visible in this excerpt.
; ----------------------------------------------------------------------
178 ; void mblockq_dist1_mmxe(char *blk1,char *blk2,int lx,int h,int *weightvec);
187 ; mm0 = SAD (x+0,y+0),SAD (x+0,y+2)  (two sums, one per 32-bit half)
188 ; mm1 = SAD (x+2,y+0),SAD (x+2,y+2)  (two sums, one per 32-bit half)
197 push ebp ; save caller's frame pointer
206 mov eax, [ebp+8] ; eax = p1 (blk1); assumes ebp was set to esp by a line not shown
208 pxor mm0, mm0 ; zero accumulator mm0
210 mov ebx, [ebp+12] ; ebx = p2 (blk2)
211 mov edx, [ebp+16] ; edx = lx
213 mov edi, [ebp+20] ; edi = rowsleft (h)
220 ;; Do the (+0,+0) SAD
222 movq mm4, [eax] ; load 1st 8 bytes of the p1 row
225 psadbw mm4, mm5 ; SAD against mm5 (presumably 1st 8 bytes of p2; its load is not visible)
226 paddd mm0, mm4 ; mm0 low half += SAD
227 movq mm4, [eax+8] ; load 2nd 8 bytes of the p1 row
229 psadbw mm4, [ebx+8] ; SAD against 2nd 8 bytes at p2
230 paddd mm0, mm4 ; mm0 low half += SAD
238 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves: other packed sum now in low half
240 psadbw mm6, mm2 ; SAD of mm6 against mm2 (original note: next row of p2; loads not visible)
241 paddd mm0, mm6 ; mm0 low half += SAD
243 psadbw mm7, mm3 ; SAD of mm7 against mm3 (original note says p1 -- NOTE(review): likely p2, confirm)
246 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves back
254 psadbw mm4, mm5 ; SAD against mm5 (original note: 1st 8 bytes of p2)
255 paddd mm1, mm4 ; mm1 low half += SAD
260 psadbw mm4, [ebx+8] ; SAD against 2nd 8 bytes at p2
261 paddd mm1, mm4 ; mm1 low half += SAD
266 ;; Do the (+2, +2 ) SAD
268 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves: other packed sum now in low half
269 psadbw mm6, mm2 ; compare to 1st 8 bytes of prev p2
270 psadbw mm7, mm3 ; compare to 2nd 8 bytes of prev p2
272 paddd mm1, mm6 ; mm1 low half += SAD
274 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves back
277 add eax, edx ; p1 += lx (advance to the next row)
; Loop back while the zero flag is clear; the instruction that sets the
; flag (presumably the rowsleft decrement) is not visible in this excerpt.
281 jnz near nextrow_block_e1
283 ;; Do the last row of the (0,+2) SAD
284 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves: other packed sum now in low half
285 movq mm4, [eax] ; load 1st 8 bytes of the p1 row
286 movq mm5, [eax+8] ; load 2nd 8 bytes of the p1 row
288 psadbw mm4, [ebx] ; SAD against 1st 8 bytes at p2
289 psadbw mm5, [ebx+8] ; SAD against 2nd 8 bytes at p2 (ebx is the p2 pointer, not p1)
290 paddd mm0, mm4 ; mm0 low half += SAD
294 ;; Do the last row of the (+2, +2) SAD
295 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves: other packed sum now in low half
299 psadbw mm4, [ebx] ; compare to 1st 8 bytes of prev p2
300 psadbw mm5, [ebx+8] ; compare to 2nd 8 bytes of prev p2
301 paddd mm1, mm4 ; mm1 low half += SAD
305 mov eax, [ebp+24] ; eax = weightvec (5th argument); eax no longer holds p1
307 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm0
309 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm1
324 global mblockq_dist22_mmxe
326 ; void mblockq_dist22_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh, int* resvec);
333 ; mm0 = distance accumulator
334 ; mm1 = distance accumulator
335 ; mm2 = previous p1 row
336 ; mm3 = previous p1 displaced by 1 byte...
340 ; mm7 = temp / 0 if first row 0xff otherwise
345 push ebp ; save frame pointer
352 pxor mm0, mm0 ; zero acculumator
353 pxor mm1, mm1 ; zero acculumator
354 pxor mm2, mm2 ; zero acculumator
355 pxor mm3, mm3 ; zero acculumator
357 mov eax, [ebp+8] ; get p1
358 mov ebx, [ebp+12] ; get p2
359 mov edx, [ebp+16] ; get lx
362 movq mm3, [eax+edx+1]
366 movq mm5, [ebx] ; load previous row reference block
367 ; mm2 / mm3 contain the current row target block
369 psadbw mm2, mm5 ; Comparse (x+0,y+2)
372 psadbw mm3, mm5 ; Compare (x+2,y+2)
373 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
376 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
378 movq mm2, [eax] ; Load current row traget block into mm2 / mm3
386 psadbw mm6, mm5 ; Compare (x+0,y+0)
388 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64
389 psadbw mm7, mm5 ; Compare (x+2,y+0)
391 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64