r1009: Move the dependencies to newer package names
[cinelerra_cv/mob.git] / mpeg2enc / mblock_sad_mmxe.s
blob0aec5215814add1c8788e9af109e14f21534ceaf
1 ;;;
2 ;;; mblock_sad_mmxe.s:
3 ;;;
4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
5 ;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
7 ; dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
8 ; Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
10 ;; Yes, I tried prefetch-ing. It makes no difference or makes
11 ;; stuff *slower*.
14 ; This program is free software; you can redistribute it and/or
15 ; modify it under the terms of the GNU General Public License
16 ; as published by the Free Software Foundation; either version 2
17 ; of the License, or (at your option) any later version.
19 ; This program is distributed in the hope that it will be useful,
20 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ; GNU General Public License for more details.
24 ; You should have received a copy of the GNU General Public License
25 ; along with this program; if not, write to the Free Software
26 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
global dist1_00_mmxe

;-----------------------------------------------------------------------
; int dist1_00_mmxe(char *blk1, char *blk2, int lx, int h, int distlim);
;
; Sum of absolute differences (SAD) over h rows of 16 bytes each,
; comparing blk1 against blk2 with no interpolation.  Two rows are
; processed per loop pass.
;
; Stack arguments (relative to ebp once the frame is linked):
;   [ebp+8]  blk1
;   [ebp+12] blk2
;   [ebp+16] lx   - byte offset from one row to the next (both blocks)
;   [ebp+20] h    - number of rows
;   distlim is never read: testing it costs more than the occasional
;   early abort would save.
; Returns:
;   eax = low 32 bits of the SAD accumulator
;
; Register roles:
;   eax = p1, ebx = p2, ecx = rows left, edx = lx
;   mm0 = SAD accumulator, mm4/mm5 = per-row temporaries
;
; The row counter drops by 2 per pass and the loop ends only when it
; hits exactly zero, so h has to be a positive even number.
;
; dist1_00_0misalign is a secondary entry point: dist1_00_Ammxe jumps
; to it (same stack frame, eax = p1, mm0 = 0) when blk1 turns out to be
; 8-byte aligned already.
;-----------------------------------------------------------------------

align 32
dist1_00_mmxe:
        push    ebp
        mov     ebp, esp

        push    ebx
        push    ecx
        push    edx

        mov     eax, [ebp+8]            ; eax = p1
        pxor    mm0, mm0                ; SAD accumulator = 0
dist1_00_0misalign:
        mov     ecx, [ebp+20]           ; ecx = rows left
        mov     edx, [ebp+16]           ; edx = lx
        mov     ebx, [ebp+12]           ; ebx = p2
        jmp     .rowpair                ; hop over the alignment padding
align 32
.rowpair:
        ; --- first row of the pair ---
        movq    mm4, [eax]              ; p1 bytes 0..7
        movq    mm5, [eax+8]            ; p1 bytes 8..15
        psadbw  mm4, [ebx]              ; SAD against p2 bytes 0..7
        psadbw  mm5, [ebx+8]            ; SAD against p2 bytes 8..15
        add     eax, edx                ; p1 -> next row
        add     ebx, edx                ; p2 -> next row
        paddd   mm0, mm4
        paddd   mm0, mm5

        ; --- second row of the pair ---
        movq    mm4, [eax]
        movq    mm5, [eax+8]
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        add     eax, edx
        add     ebx, edx
        paddd   mm0, mm4
        paddd   mm0, mm5

        sub     ecx, 2                  ; two rows consumed
        jnz     .rowpair

        movd    eax, mm0                ; return value

        pop     edx
        pop     ecx
        pop     ebx

        pop     ebp
        emms                            ; leave the FPU/MMX state clean
        ret
global dist1_00_Ammxe

;-----------------------------------------------------------------------
; int dist1_00_Ammxe(char *blk1, char *blk2, int lx, int h, int distlim);
;
; Variant of dist1_00_mmxe that only issues 8-byte-aligned loads from
; blk1.  It was written to see whether that would be faster on a P-III;
; it turned out slower, so it is no longer used.
;
; Same stack arguments and return value as dist1_00_mmxe
; ([ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx, [ebp+20] h; distlim is
; never read).  Returns eax = low 32 bits of the SAD accumulator.
;
; If blk1 is already 8-byte aligned, control transfers into
; dist1_00_mmxe at dist1_00_0misalign (the stack frame built here has
; the same layout, eax = p1 and mm0 = 0 at that point).
;
; Otherwise p1 is rounded down to an 8-byte boundary and each row is
; rebuilt from three aligned quadwords with shift/or:
;   mm2 = 8 * misalignment        (right shift count, bits)
;   mm3 = 8 * (8 - misalignment)  (left shift count, bits)
; One row is processed per loop pass; the loop ends when the row count
; reaches exactly zero.
;
; Register roles:
;   eax = aligned p1, ebx = p2, ecx = rows left, edx = lx
;   mm0 = SAD accumulator, mm4..mm7 = temporaries
;-----------------------------------------------------------------------

align 32
dist1_00_Ammxe:
        push    ebp
        mov     ebp, esp

        push    ebx
        push    ecx
        push    edx

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = p1
        mov     ebx, eax
        and     ebx, 7                  ; ebx = misalignment; ZF set if none
        jz      near dist1_00_0misalign ; aligned: use the plain routine

        mov     ecx, 8
        sub     eax, ebx                ; round p1 down to an 8-byte boundary
        sub     ecx, ebx                ; ecx = 8 - misalignment
        shl     ebx, 3                  ; byte counts -> bit counts
        shl     ecx, 3
        movd    mm2, ebx                ; right-shift amount
        movd    mm3, ecx                ; left-shift amount

        mov     ebx, [ebp+12]           ; ebx = p2
        mov     edx, [ebp+16]           ; edx = lx
        mov     ecx, [ebp+20]           ; ecx = rows left
        jmp     .row                    ; hop over the alignment padding
align 32
.row:
        movq    mm4, [eax]              ; aligned quadword 0
        movq    mm5, [eax+8]            ; aligned quadword 1
        movq    mm7, [eax+16]           ; aligned quadword 2
        movq    mm6, mm5                ; quadword 1 feeds both halves
        add     eax, edx                ; p1 -> next row

        psrlq   mm4, mm2                ; drop the bytes before p1
        psllq   mm5, mm3                ; bring in the low bytes of qword 1
        por     mm4, mm5                ; mm4 = first 8 bytes of p1 proper

        psrlq   mm6, mm2
        psllq   mm7, mm3
        por     mm6, mm7                ; mm6 = second 8 bytes of p1 proper

        psadbw  mm4, [ebx]              ; SAD against p2 bytes 0..7
        psadbw  mm6, [ebx+8]            ; SAD against p2 bytes 8..15
        add     ebx, edx                ; p2 -> next row
        paddd   mm0, mm4
        paddd   mm0, mm6

        sub     ecx, 1
        jnz     .row

        movd    eax, mm0                ; return value

        pop     edx
        pop     ecx
        pop     ebx

        pop     ebp
        emms
        ret
global dist1_01_mmxe

;-----------------------------------------------------------------------
; int dist1_01_mmxe(char *blk1, char *blk2, int lx, int h);
;
; SAD over h rows of 16 bytes with horizontal interpolation: every
; blk1 byte is first averaged (pavgb) with the byte to its right, and
; the result is compared against blk2.  Each row therefore reads 17
; bytes of blk1.  Two rows are processed per loop pass.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx (row
; stride in bytes), [ebp+20] h (row count).
; Returns: eax = low 32 bits of the SAD accumulator.
;
; Register roles:
;   eax = p1, ebx = p2, ecx = rows left, edx = lx
;   mm0 = SAD accumulator, mm4/mm5 = per-row temporaries
;
; The row counter drops by 2 per pass and the loop ends only when it
; hits exactly zero, so h has to be a positive even number.
;-----------------------------------------------------------------------

align 32
dist1_01_mmxe:
        push    ebp
        mov     ebp, esp

        push    ebx
        push    ecx
        push    edx

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     ecx, [ebp+20]           ; ecx = rows left
        mov     edx, [ebp+16]           ; edx = lx
        mov     ebx, [ebp+12]           ; ebx = p2
        mov     eax, [ebp+8]            ; eax = p1
        jmp     .rowpair                ; hop over the alignment padding
align 32
.rowpair:
        ; --- first row of the pair ---
        movq    mm4, [eax]              ; p1 bytes 0..7
        movq    mm5, [eax+8]            ; p1 bytes 8..15
        pavgb   mm4, [eax+1]            ; average with right neighbours
        pavgb   mm5, [eax+9]
        psadbw  mm4, [ebx]              ; SAD against p2 bytes 0..7
        psadbw  mm5, [ebx+8]            ; SAD against p2 bytes 8..15
        paddd   mm0, mm4
        paddd   mm0, mm5
        add     eax, edx                ; p1 -> next row
        add     ebx, edx                ; p2 -> next row

        ; --- second row of the pair ---
        movq    mm4, [eax]
        movq    mm5, [eax+8]
        pavgb   mm4, [eax+1]
        pavgb   mm5, [eax+9]
        psadbw  mm4, [ebx]
        psadbw  mm5, [ebx+8]
        paddd   mm0, mm4
        paddd   mm0, mm5
        add     eax, edx
        add     ebx, edx

        sub     ecx, 2                  ; two rows consumed
        jnz     .rowpair

        movd    eax, mm0                ; return value

        pop     edx
        pop     ecx
        pop     ebx

        pop     ebp
        emms                            ; leave the FPU/MMX state clean
        ret
global dist1_10_mmxe

;-----------------------------------------------------------------------
; int dist1_10_mmxe(char *blk1, char *blk2, int lx, int h);
;
; SAD over h rows of 16 bytes with vertical interpolation: every blk1
; byte is first averaged (pavgb) with the byte one row below it, and
; the result is compared against blk2.  h+1 rows of blk1 are therefore
; read.  Two rows are processed per loop pass.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx (row
; stride in bytes), [ebp+20] h (row count).
; Returns: eax = low 32 bits of the SAD accumulator.
;
; Register roles:
;   eax = p1, ebx = p2, ecx = rows left, edx = lx, edi = p1 + lx
;   mm0 = SAD accumulator, mm4..mm7 = per-row temporaries
;
; The row counter lives in ecx only.  An earlier "psubd mm2, mm3" in
; the loop decremented an MMX copy of the counter that was never
; initialised or read; it has been removed.
;
; The row counter drops by 2 per pass and the loop ends only when it
; hits exactly zero, so h has to be a positive even number.
;-----------------------------------------------------------------------

align 32
dist1_10_mmxe:
        push    ebp
        mov     ebp, esp

        push    ebx
        push    ecx
        push    edx
        push    edi

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = p1
        mov     ebx, [ebp+12]           ; ebx = p2
        mov     edx, [ebp+16]           ; edx = lx
        lea     edi, [eax+edx]          ; edi = row below p1
        mov     ecx, [ebp+20]           ; ecx = rows left
        jmp     .rowpair                ; hop over the alignment padding
align 32
.rowpair:
        ; --- first row of the pair ---
        movq    mm4, [eax]              ; p1 bytes 0..7
        pavgb   mm4, [edi]              ; average with the row below
        psadbw  mm4, [ebx]              ; SAD against p2 bytes 0..7
        paddd   mm0, mm4

        movq    mm5, [eax+8]            ; p1 bytes 8..15
        pavgb   mm5, [edi+8]
        psadbw  mm5, [ebx+8]            ; SAD against p2 bytes 8..15
        paddd   mm0, mm5

        add     eax, edx                ; all three pointers -> next row
        add     ebx, edx
        add     edi, edx

        ; --- second row of the pair ---
        movq    mm6, [eax]
        pavgb   mm6, [edi]
        psadbw  mm6, [ebx]
        paddd   mm0, mm6

        movq    mm7, [eax+8]
        pavgb   mm7, [edi+8]
        psadbw  mm7, [ebx+8]
        paddd   mm0, mm7

        add     eax, edx
        add     ebx, edx
        add     edi, edx

        sub     ecx, 2                  ; two rows consumed
        jnz     .rowpair

        movd    eax, mm0                ; return value

        pop     edi
        pop     edx
        pop     ecx
        pop     ebx

        pop     ebp
        emms                            ; leave the FPU/MMX state clean
        ret
global dist1_11_mmxe

;-----------------------------------------------------------------------
; int dist1_11_mmxe(char *blk1, char *blk2, int lx, int h);
;
; SAD over h rows of 16 bytes with interpolation in both directions.
; For each position the four blk1 bytes at (x,y), (x,y+1), (x+1,y) and
; (x+1,y+1) are combined with three pavgb steps:
;     pavgb( pavgb(p[y][x],   p[y+1][x]),
;            pavgb(p[y][x+1], p[y+1][x+1]) )
; and the result is compared against blk2.  h+1 rows of 17 bytes of
; blk1 are read.  Two rows are processed per loop pass.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] lx (row
; stride in bytes), [ebp+20] h (row count).
; Returns: eax = low 32 bits of the SAD accumulator.
;
; Register roles:
;   eax = p1, ebx = p2, ecx = rows left, edx = lx, edi = p1 + lx
;   mm0 = SAD accumulator, mm4..mm7 = per-row temporaries
;
; The row counter drops by 2 per pass and the loop ends only when it
; hits exactly zero, so h has to be a positive even number.
;-----------------------------------------------------------------------

align 32
dist1_11_mmxe:
        push    ebp
        mov     ebp, esp

        push    ebx
        push    ecx
        push    edx
        push    edi

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = p1
        mov     edx, [ebp+16]           ; edx = lx
        mov     ebx, [ebp+12]           ; ebx = p2
        mov     ecx, [ebp+20]           ; ecx = rows left
        lea     edi, [eax+edx]          ; edi = row below p1
        jmp     .rowpair                ; hop over the alignment padding
align 32
.rowpair:
        ; --- first row of the pair, bytes 0..7 ---
        movq    mm4, [eax]
        movq    mm5, [eax+1]
        pavgb   mm4, [edi]              ; column x:   this row / row below
        pavgb   mm5, [edi+1]            ; column x+1: this row / row below
        pavgb   mm4, mm5                ; combine the two columns
        psadbw  mm4, [ebx]
        paddd   mm0, mm4

        ; --- first row of the pair, bytes 8..15 ---
        movq    mm6, [eax+8]
        movq    mm7, [eax+9]
        pavgb   mm6, [edi+8]
        pavgb   mm7, [edi+9]
        pavgb   mm6, mm7
        psadbw  mm6, [ebx+8]
        paddd   mm0, mm6

        add     eax, edx                ; all three pointers -> next row
        add     ebx, edx
        add     edi, edx

        ; --- second row of the pair, bytes 0..7 ---
        movq    mm4, [eax]
        movq    mm5, [eax+1]
        pavgb   mm4, [edi]
        pavgb   mm5, [edi+1]
        pavgb   mm4, mm5
        psadbw  mm4, [ebx]
        paddd   mm0, mm4

        ; --- second row of the pair, bytes 8..15 ---
        movq    mm6, [eax+8]
        movq    mm7, [eax+9]
        pavgb   mm6, [edi+8]
        pavgb   mm7, [edi+9]
        pavgb   mm6, mm7
        psadbw  mm6, [ebx+8]
        paddd   mm0, mm6

        add     eax, edx
        add     ebx, edx
        add     edi, edx

        sub     ecx, 2                  ; two rows consumed
        jnz     near .rowpair

        movd    eax, mm0                ; return value

        pop     edi
        pop     edx
        pop     ecx
        pop     ebx

        pop     ebp
        emms                            ; leave the FPU/MMX state clean
        ret
global dist22_mmxe

;-----------------------------------------------------------------------
; int dist22_mmxe(unsigned char *blk1, unsigned char *blk2, int flx, int fh);
;
; SAD over fh rows of 8 bytes each, comparing blk1 against blk2 (used
; for the 2*2 sub-sampled image, per the file header).  Two rows are
; processed per loop pass.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] flx (row
; stride in bytes), [ebp+20] fh (row count).
; Returns: eax = low 32 bits of the SAD accumulator.
;
; Register roles:
;   eax = p1, ebx = p2, ecx = rows left, edx = flx
;   mm0 = SAD accumulator, mm4/mm6 = per-row temporaries
;
; The row counter drops by 2 per pass and the loop ends only when it
; hits exactly zero, so fh has to be a positive even number.
;
; The routine must end in its own ret: without one, execution runs
; through the alignment padding that follows and into the next routine
; in the file, whose result would be returned instead.
;-----------------------------------------------------------------------

align 32
dist22_mmxe:
        push    ebp
        mov     ebp, esp

        push    ebx
        push    ecx
        push    edx

        pxor    mm0, mm0                ; SAD accumulator = 0

        mov     eax, [ebp+8]            ; eax = p1
        mov     ebx, [ebp+12]           ; ebx = p2
        mov     edx, [ebp+16]           ; edx = flx
        mov     ecx, [ebp+20]           ; ecx = rows left
        jmp     .rowpair                ; hop over the alignment padding
align 32
.rowpair:
        movq    mm4, [eax]              ; row 1: 8 bytes of p1
        add     eax, edx                ; p1 -> next row
        psadbw  mm4, [ebx]              ; SAD against 8 bytes of p2
        add     ebx, edx                ; p2 -> next row
        paddd   mm0, mm4

        movq    mm6, [eax]              ; row 2: 8 bytes of p1
        add     eax, edx
        psadbw  mm6, [ebx]
        add     ebx, edx
        paddd   mm0, mm6

        sub     ecx, 2                  ; two rows consumed
        jnz     .rowpair

        movd    eax, mm0                ; return value

        pop     edx
        pop     ecx
        pop     ebx

        pop     ebp

        emms                            ; leave the FPU/MMX state clean
        ret                             ; was missing: fell through into the next routine
global dist44_mmxe

;-----------------------------------------------------------------------
; int dist44_mmxe(unsigned char *blk1, unsigned char *blk2, int qlx, int qh);
;
; SAD over qh rows of 4 bytes each, comparing blk1 against blk2 (used
; for the 4*4 sub-sampled image, per the file header).  One row is
; processed per loop pass.
;
; Stack arguments: [ebp+8] blk1, [ebp+12] blk2, [ebp+16] qlx (row
; stride in bytes), [ebp+20] qh (row count).
; Returns: eax = low 32 bits of the SAD accumulator.
;
; Register roles:
;   eax = p1, ebx = p2, edx = qlx, esi = rows left
;   mm0 = SAD accumulator, mm2 = 0 (unpack partner)
;   mm4 = p1 bytes 0..3, mm5 = p2 bytes 0..3 (each widened to words)
;
; Only the left 4-byte block is measured.  Earlier revisions also
; loaded the neighbouring 4 bytes of blk1 and kept a second accumulator
; for a "right block" result, but that computation was commented out;
; the leftover register setup has been removed, so each row now reads
; exactly 4 bytes from each block and ecx is no longer touched.
;
; The loop ends only when the row count reaches exactly zero, so qh has
; to be positive.
;-----------------------------------------------------------------------

align 32
dist44_mmxe:
        push    ebp
        mov     ebp, esp

        push    ebx
        push    edx
        push    esi

        pxor    mm0, mm0                ; SAD accumulator = 0
        pxor    mm2, mm2                ; zero partner for the unpacks

        mov     eax, [ebp+8]            ; eax = p1
        mov     ebx, [ebp+12]           ; ebx = p2
        mov     edx, [ebp+16]           ; edx = qlx
        mov     esi, [ebp+20]           ; esi = rows left
        jmp     .row                    ; hop over the alignment padding
align 32
.row:
        movd    mm4, [eax]              ; 4 bytes of p1
        movd    mm5, [ebx]              ; 4 bytes of p2
        add     eax, edx                ; p1 -> next row
        add     ebx, edx                ; p2 -> next row
        punpcklbw mm4, mm2              ; spread bytes 0..3 into words
        punpcklbw mm5, mm2
        psadbw  mm4, mm5                ; zero high bytes add nothing to the sum
        paddd   mm0, mm4

        sub     esi, 1
        jnz     .row

        movd    eax, mm0                ; return value

        pop     esi
        pop     edx
        pop     ebx

        pop     ebp
        emms                            ; leave the FPU/MMX state clean
        ret