2 ; dist2_mmx.s: MMX optimized squared distance sum
4 ; Original believed to be Copyright (C) 2000 Brent Byeler
6 ; This program is free software; you can redistribute it and/or
7 ; modify it under the terms of the GNU General Public License
8 ; as published by the Free Software Foundation; either version 2
9 ; of the License, or (at your option) any later version.
11 ; This program is distributed in the hope that it will be useful,
12 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ; GNU General Public License for more details.
16 ; You should have received a copy of the GNU General Public License
17 ; along with this program; if not, write to the Free Software
18 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ; total squared difference between two (16*h) blocks
22 ; including optional half pel interpolation of blk1 (hx,hy)
23 ; blk1,blk2: addresses of top left pels of both blocks
24 ; lx: distance (in bytes) of vertically adjacent pels
25 ; hx,hy: flags for horizontal and/or vertical interpolation
26 ; h: height of block (usually 8 or 16)
30 ; int dist2_mmx(unsigned char *blk1, unsigned char *blk2,
31 ; int lx, int hx, int hy, int h)
43 ;; private constants needed
56 push ebp ; save frame pointer
64 mov esi
, [ebp+
16] ;
lx
65 mov eax
, [ebp+
20] ; hx
66 mov edx
, [ebp+
24] ; hy
70 test edi
, edi ; h
= 0?
73 pxor mm7
, mm7 ; get zeros in mm7
75 test eax
, eax ; hx
!= 0?
77 test edx
, edx ; hy
!= 0?
120 ;; Accumulate sum in edx.
.. we use mm5
142 mov eax
, [ebp+
8] ; blk1
143 mov ebx
, [ebp+
12] ; blk2
145 pxor mm6
, mm6 ; mm6
= 0 and isn
't changed anyplace in the loop..
162 paddw mm0, mm6 ; here we add mm6 = 0.... weird...
206 ; Accumulate mm0 sum on edx... we'll use mm5 for this
and add up at the end
227 mov eax
, [ebp+
8] ; blk1
228 mov edx
, [ebp+
12] ; blk2
230 add ebx
, esi ; blk1
+ lx
234 psubw mm6
, mm1 ; mm6
= 1
296 ;; Accumulate in
"s" - we use mm5 for the purpose
305 ;; Originally this moved
306 mov eax
, ebx ; eax
= eax
+ lx
307 add edx
, esi ; edx
= edx
+ lx
308 add ebx
, esi ; ebx
= ebx
+ lx
314 mov eax
, [ebp+
8] ; blk1
315 mov edx
, [ebp+
12] ; blk2
317 add ebx
, esi ; ebx
= blk1
+ lx
344 ;pxor mm6
, mm6 ; mm6
= 0
345 ;pcmpeqw mm5
, mm5 ; mm5
= -1
346 ;psubw mm6
, mm5 ; mm6
= 1
347 ;paddw mm6
, mm6 ; mm6
= 2
349 paddw mm0
, mm6 ; round mm0
350 paddw mm1
, mm6 ; round mm1
393 ;pxor mm6
, mm6 ; Zero mm6
394 ;pcmpeqw mm5
, mm5 ; mm5
= -1
395 ;psubw mm6
, mm5 ; mm6
= 1
396 ;paddw mm6
, mm6 ; mm6
= 2
397 ;paddw mm1
, mm6 ; round mm1
and mm2
420 ;; Accumulate the result in
"s" we use mm6 for the purpose.
..
428 mov eax
, ebx ; ahem ebx
= eax at start of loop
and wasn
't changed...
436 ;; Put the final sum in eax for return...
448 pop ebp ; restore frame pointer
450 emms ; empty MMX state so later x87 FPU code works
454 ; total squared difference between two (8*h) blocks
455 ; blk1,blk2: addresses of top left pels of both blocks
456 ; lx: distance (in bytes) of vertically adjacent pels
457 ; h: height of block (usually 4 or 8)
461 ; int dist2_22_mmx(unsigned char *blk1, unsigned char *blk2,
475 push ebp ; save frame pointer
483 mov esi, [ebp+16] ; esi = lx
484 mov edi, [ebp+20] ; edi = h
487 test edi, edi ; h == 0?
490 pxor mm7, mm7 ; get zeros in mm7
492 mov eax, [ebp+8] ; eax = blk1
493 mov ebx, [ebp+12] ; ebx = blk2
522 ; total squared difference between interpolation of two (8*h) blocks and
524 ; blk1f,blk1b,blk2: addresses of top left pels of the blocks
525 ; lx: distance (in bytes) of vertically adjacent pels
526 ; h: height of block (usually 4 or 8)
530 ; int bdist2_22_mmx(unsigned char *blk1f, unsigned char*blk1b,
531 ; unsigned char *blk2,
545 push ebp ; save frame pointer
553 mov esi, [ebp+20] ; esi = lx
554 mov edi, [ebp+24] ; edi = h
557 test edi, edi ; h == 0?
560 pxor mm7, mm7 ; get zeros in mm7
562 mov eax, [ebp+8] ; eax = blk1f
563 mov ebx, [ebp+12] ; ebx = blk1b
564 mov ecx, [ebp+16] ; ecx = blk2