mpeg2enc/mblockq_sad_mmxe.s

   1 ;;;
   2 ;;;  mblockq_sad_mmxe.s:
   3 ;;;
   4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblock
   5 ;;; quads (2 by 2 squares of adjacent macroblocks)
   6
   7 ;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled
   8 ;;; evaluates macroblock quads.  A lot of memory accesses can be saved
   9 ;;; if each quad is done together rather than each macroblock in the
  10 ;;; quad handled individually.
  11
  12 ;;; TODO:               Really there ought to be MMX versions and the function's
  13 ;;; specification should be documented...
  14 ;
  15 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
  16
  17
  18 ;
  19 ;  This program is free software; you can redistribute it and/or
  20 ;  modify it under the terms of the GNU General Public License
  21 ;  as published by the Free Software Foundation; either version 2
  22 ;  of the License, or (at your option) any later version.
  23 ;
  24 ;  This program is distributed in the hope that it will be useful,
  25 ;  but WITHOUT ANY WARRANTY; without even the implied warranty of
  26 ;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  27 ;  GNU General Public License for more details.
  28 ;
  29 ;  You should have received a copy of the GNU General Public License
  30 ;  along with this program; if not, write to the Free Software
  31 ;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  32 ;
  33 ;
  34
  ;;; Currently not used by the encoder; kept as the plain reference version
  ;;; against which tweaks to the optimised quad SAD routine are tested.
  global mblockq_dist1_REF

  ; void mblockq_dist1_REF(char *blk1, char *blk2, int lx, int h, int *weightvec);
  ;
  ; Arguments are read from the stack ([ebp+8] .. [ebp+24]).  Rows are 16
  ; bytes wide and lx bytes apart.  Four sums of absolute differences are
  ; accumulated (the (x+2,..) / (..,y+2) names are the file's own; in the
  ; code they are a one-byte / one-row displacement of blk1):
  ;
  ;   weightvec[0] = SAD (x+0,y+0): blk1 row r          vs blk2 row r,   r = 0..h-1
  ;   weightvec[1] = SAD (x+2,y+0): blk1 row r, +1 byte vs blk2 row r,   r = 0..h-1
  ;   weightvec[2] = SAD (x+0,y+2): blk1 row r          vs blk2 row r-1, r = 1..h
  ;   weightvec[3] = SAD (x+2,y+2): blk1 row r, +1 byte vs blk2 row r-1, r = 1..h
  ;
  ; blk1 is therefore read for h+1 rows of 17 bytes, blk2 for h rows of 16.
  ; h must be at least 1: the row counter is only tested after a row is done.
  ; Every integer register that is touched is restored (flags are not);
  ; emms is executed before returning.
  ;
  ; eax = p1 (cursor into blk1)
  ; ebx = p2 (cursor into blk2)
  ; ecx = unused
  ; edx = lx
  ; edi = rows left
  ; esi = h (compared with edi to recognise the first row)
  ;
  ; mm0 = SAD (x+0,y+0)
  ; mm1 = SAD (x+2,y+0)
  ; mm2 = SAD (x+0,y+2)
  ; mm3 = SAD (x+2,y+2)
  ; mm4..mm7 = temporaries

  align 32
  mblockq_dist1_REF:
          push    ebp                     ; save frame pointer
          mov     ebp, esp                ; link
          push    eax
          push    ebx
          push    ecx
          push    edx
          push    edi
          push    esi

          pxor    mm0, mm0                ; clear the four accumulators
          pxor    mm1, mm1
          pxor    mm2, mm2
          pxor    mm3, mm3
          mov     eax, [ebp+8]            ; p1 = blk1
          mov     ebx, [ebp+12]           ; p2 = blk2
          mov     edx, [ebp+16]           ; lx

          mov     edi, [ebp+20]           ; rows left = h
          mov     esi, edi                ; keep h for the first-row test

          jmp     .next_row               ; go straight to the aligned loop head
  align 32
  .next_row:

          ;; (x+0,y+0): p1 row against p2 row
          movq    mm4, [eax]              ; p1[0..7]
          movq    mm6, mm4                ; keep a copy for the y+2 sum
          movq    mm5, [ebx]              ; p2[0..7], reused for (x+2,y+0)
          psadbw  mm4, mm5
          paddd   mm0, mm4
          movq    mm4, [eax+8]            ; p1[8..15]
          movq    mm7, mm4                ; keep a copy for the y+2 sum
          psadbw  mm4, [ebx+8]
          paddd   mm0, mm4

          cmp     edi, esi                ; first row: there is no previous p2 row
          jz      .skip_y2

          ;; (x+0,y+2): p1 row against the previous p2 row
          sub     ebx, edx                ; step p2 back one row ...
          psadbw  mm6, [ebx]
          paddd   mm2, mm6
          psadbw  mm7, [ebx+8]
          add     ebx, edx                ; ... and restore it
          paddd   mm2, mm7

  .skip_y2:

          ;; (x+2,y+0): p1 row displaced by one byte against p2 row
          movq    mm4, [eax+1]
          movq    mm6, mm4                ; copy for the (x+2,y+2) sum
          psadbw  mm4, mm5                ; mm5 still holds p2[0..7]
          paddd   mm1, mm4
          movq    mm4, [eax+9]
          movq    mm7, mm4                ; copy for the (x+2,y+2) sum
          psadbw  mm4, [ebx+8]
          paddd   mm1, mm4

          cmp     edi, esi                ; first row again skips the y+2 part
          jz      .skip_xy2

          ;; (x+2,y+2): displaced p1 row against the previous p2 row
          sub     ebx, edx
          psadbw  mm6, [ebx]
          psadbw  mm7, [ebx+8]
          add     ebx, edx
          paddd   mm3, mm6
          paddd   mm3, mm7

  .skip_xy2:

          add     eax, edx                ; both cursors advance one row
          add     ebx, edx

          sub     edi, 1
          jnz     near .next_row

          ;; Tail: p1 is now on row h, so finish the two y+2 sums against
          ;; the last p2 row (row h-1).
          movq    mm4, [eax]
          movq    mm5, [eax+8]
          sub     ebx, edx                ; p2 back to row h-1
          psadbw  mm4, [ebx]
          psadbw  mm5, [ebx+8]
          paddd   mm2, mm4
          paddd   mm2, mm5

          movq    mm4, [eax+1]
          movq    mm5, [eax+9]
          psadbw  mm4, [ebx]
          psadbw  mm5, [ebx+8]
          paddd   mm3, mm4
          paddd   mm3, mm5

          mov     eax, [ebp+24]           ; weightvec
          movd    [eax+0], mm0
          movd    [eax+4], mm1
          movd    [eax+8], mm2
          movd    [eax+12], mm3

          pop     esi
          pop     edi
          pop     edx
          pop     ecx
          pop     ebx
          pop     eax

          pop     ebp
          emms
          ret
 173
 174
 175
 global mblockq_dist1_mmxe

 ; void mblockq_dist1_mmxe(char *blk1, char *blk2, int lx, int h, int *weightvec);
 ;
 ; Arguments are read from the stack ([ebp+8] .. [ebp+24]).  Rows are 16
 ; bytes wide and lx bytes apart.  Four sums of absolute differences are
 ; accumulated (the (x+2,..) / (..,y+2) names are the file's own; in the
 ; code they are a one-byte / one-row displacement of blk1):
 ;
 ;   weightvec[0] = SAD (x+0,y+0): blk1 row r          vs blk2 row r,   r = 0..h-1
 ;   weightvec[1] = SAD (x+2,y+0): blk1 row r, +1 byte vs blk2 row r,   r = 0..h-1
 ;   weightvec[2] = SAD (x+0,y+2): blk1 row r          vs blk2 row r-1, r = 1..h
 ;   weightvec[3] = SAD (x+2,y+2): blk1 row r, +1 byte vs blk2 row r-1, r = 1..h
 ;
 ; blk1 is therefore read for h+1 rows of 17 bytes, blk2 for h rows of 16.
 ; h must be at least 1: the row counter is only tested after a row is done.
 ; Every integer register that is touched is restored (flags are not);
 ; emms is executed before returning.
 ;
 ; eax = p1 (cursor into blk1)
 ; ebx = p2 (cursor into blk2)
 ; ecx = unused
 ; edx = lx
 ; edi = rows left
 ; esi = h (compared with edi to recognise the first row)
 ;
 ; mm0 = low dword SAD (x+0,y+0), high dword SAD (x+0,y+2)
 ; mm1 = low dword SAD (x+2,y+0), high dword SAD (x+2,y+2)
 ; mm2 = previous p2 row, bytes 0..7  (loaded on every row except the first)
 ; mm3 = previous p2 row, bytes 8..15 (ditto)
 ; mm4..mm7 = temporaries
 ;
 ; psadbw leaves its sum in the low word of the destination, so paddd can
 ; only feed the low dword of an accumulator.  The y+2 sums live in the
 ; high dwords; the accumulator halves are swapped before and after each
 ; such update.

 ; pshufw selector that exchanges the low and high dwords of an MMX register
 %define SWAP_DWORDS (2*1 + 3*4 + 0*16 + 1*64)

 align 32
 mblockq_dist1_mmxe:
         push    ebp                     ; save frame pointer
         mov     ebp, esp                ; link
         push    eax
         push    ebx
         push    ecx
         push    edx
         push    edi
         push    esi

         mov     eax, [ebp+8]            ; p1 = blk1
         prefetcht0 [eax]
         pxor    mm0, mm0                ; clear both accumulator pairs
         pxor    mm1, mm1
         mov     ebx, [ebp+12]           ; p2 = blk2
         mov     edx, [ebp+16]           ; lx

         mov     edi, [ebp+20]           ; rows left = h
         mov     esi, edi                ; keep h for the first-row test

         jmp     .next_row               ; go straight to the aligned loop head
 align 32
 .next_row:

         ;; (x+0,y+0): p1 row against p2 row
         prefetcht0 [eax+edx]            ; hint for the next p1 row
         movq    mm4, [eax]              ; p1[0..7]
         movq    mm6, mm4                ; keep a copy for the y+2 sum
         movq    mm5, [ebx]              ; p2[0..7], reused for (x+2,y+0)
         psadbw  mm4, mm5
         paddd   mm0, mm4
         movq    mm4, [eax+8]            ; p1[8..15]
         movq    mm7, mm4                ; keep a copy for the y+2 sum
         psadbw  mm4, [ebx+8]
         paddd   mm0, mm4

         cmp     edi, esi                ; first row: there is no previous p2 row
         jz      .skip_y2

         ;; (x+0,y+2): p1 row against the previous p2 row
         sub     ebx, edx                ; step p2 back one row ...
         pshufw  mm0, mm0, SWAP_DWORDS   ; bring the y+2 half into the low dword
         movq    mm2, [ebx]              ; previous p2 row, kept for (x+2,y+2)
         psadbw  mm6, mm2
         paddd   mm0, mm6
         movq    mm3, [ebx+8]
         psadbw  mm7, mm3
         add     ebx, edx                ; ... and restore it
         paddd   mm0, mm7
         pshufw  mm0, mm0, SWAP_DWORDS   ; y+0 half back in the low dword
 .skip_y2:

         ;; (x+2,y+0): p1 row displaced by one byte against p2 row
         movq    mm4, [eax+1]
         movq    mm6, mm4                ; copy for the (x+2,y+2) sum

         psadbw  mm4, mm5                ; mm5 still holds p2[0..7]
         paddd   mm1, mm4

         movq    mm4, [eax+9]
         movq    mm7, mm4                ; copy for the (x+2,y+2) sum

         psadbw  mm4, [ebx+8]
         paddd   mm1, mm4

         cmp     edi, esi                ; first row again skips the y+2 part
         jz      .skip_xy2

         ;; (x+2,y+2): displaced p1 row against the previous p2 row, which
         ;; is already in mm2/mm3 -- no memory operand, so ebx is left alone.
         pshufw  mm1, mm1, SWAP_DWORDS
         psadbw  mm6, mm2
         psadbw  mm7, mm3
         paddd   mm1, mm6
         paddd   mm1, mm7
         pshufw  mm1, mm1, SWAP_DWORDS
 .skip_xy2:

         add     eax, edx                ; both cursors advance one row
         add     ebx, edx

         sub     edi, 1
         jnz     near .next_row

         ;; Tail: p1 is now on row h, so finish the two y+2 sums against
         ;; the last p2 row (row h-1).  The accumulators are left swapped
         ;; (y+2 sum in the low dword) for the store sequence below.
         pshufw  mm0, mm0, SWAP_DWORDS
         movq    mm4, [eax]
         movq    mm5, [eax+8]
         sub     ebx, edx                ; p2 back to row h-1
         psadbw  mm4, [ebx]
         psadbw  mm5, [ebx+8]
         paddd   mm0, mm4
         paddd   mm0, mm5

         pshufw  mm1, mm1, SWAP_DWORDS
         movq    mm4, [eax+1]
         movq    mm5, [eax+9]

         psadbw  mm4, [ebx]
         psadbw  mm5, [ebx+8]
         paddd   mm1, mm4
         paddd   mm1, mm5

         mov     eax, [ebp+24]           ; weightvec
         movd    [eax+8], mm0            ; low dword currently holds SAD (x+0,y+2)
         pshufw  mm0, mm0, SWAP_DWORDS
         movd    [eax+12], mm1           ; low dword currently holds SAD (x+2,y+2)
         pshufw  mm1, mm1, SWAP_DWORDS
         movd    [eax+0], mm0            ; SAD (x+0,y+0)
         movd    [eax+4], mm1            ; SAD (x+2,y+0)

         pop     esi
         pop     edi
         pop     edx
         pop     ecx
         pop     ebx
         pop     eax

         pop     ebp
         emms
         ret
 323
 global mblockq_dist22_mmxe

 ; void mblockq_dist22_mmxe(unsigned char *blk1, unsigned char *blk2,
 ;                          int flx, int fh, int *resvec);
 ;
 ; Arguments are read from the stack ([ebp+8] .. [ebp+24]).  Rows are 8
 ; bytes wide and flx bytes apart.  Both cursors are stepped *back* by flx
 ; after every row, so the routine works from the rows passed in towards
 ; lower addresses.  Calling the rows passed in "row 0", and with
 ; r = 0, -1, ..., -(fh-1):
 ;
 ;   resvec[0] = SAD (x+0,y+0): blk1 row r            vs blk2 row r
 ;   resvec[1] = SAD (x+2,y+0): blk1 row r,   +1 byte vs blk2 row r
 ;   resvec[2] = SAD (x+0,y+2): blk1 row r+1          vs blk2 row r
 ;   resvec[3] = SAD (x+2,y+2): blk1 row r+1, +1 byte vs blk2 row r
 ;
 ; (the (x+2,..) / (..,y+2) names are the file's own; in the code they are
 ; a one-byte / one-row displacement of blk1).  blk1 is read for rows
 ; +1 .. -(fh-1), 9 bytes each; blk2 for rows 0 .. -(fh-1), 8 bytes each.
 ; fh must be at least 1: the row counter is only tested after a row is done.
 ; Every integer register that is touched is restored (flags are not);
 ; emms is executed before returning.
 ;
 ; eax = p1 (cursor into blk1)
 ; ebx = p2 (cursor into blk2)
 ; ecx = rows left
 ; edx = flx
 ;
 ; mm0 = low dword SAD (x+0,y+0), high dword SAD (x+2,y+0)
 ; mm1 = low dword SAD (x+0,y+2), high dword SAD (x+2,y+2)
 ; mm2 = blk1 row loaded on the previous pass (row +1 on entry)
 ; mm3 = the same row displaced by one byte
 ; mm5 = current blk2 row
 ; mm6, mm7 = temporaries
 ;
 ; psadbw leaves its sum in the low word of the destination, so paddd can
 ; only feed the low dword of an accumulator; the halves are swapped around
 ; each update of a high dword.

 ; pshufw selector that exchanges the low and high dwords of an MMX register
 %define SWAP_DWORDS (2*1 + 3*4 + 0*16 + 1*64)

 align 32
 mblockq_dist22_mmxe:
         push    ebp                     ; save frame pointer
         mov     ebp, esp
         push    eax
         push    ebx
         push    ecx
         push    edx

         pxor    mm0, mm0                ; clear both accumulator pairs
         pxor    mm1, mm1

         mov     eax, [ebp+8]            ; p1 = blk1
         mov     ebx, [ebp+12]           ; p2 = blk2
         mov     edx, [ebp+16]           ; flx
         mov     ecx, [ebp+20]           ; rows left = fh
         movq    mm2, [eax+edx]          ; prime mm2/mm3 with blk1 row +1
         movq    mm3, [eax+edx+1]
         jmp     .next_row               ; go straight to the aligned loop head
 align 32
 .next_row:
         movq    mm5, [ebx]              ; current p2 row; mm2/mm3 hold the
                                         ; p1 row from one row further down

         psadbw  mm2, mm5                ; (x+0,y+2)
         paddd   mm1, mm2

         psadbw  mm3, mm5                ; (x+2,y+2)
         pshufw  mm1, mm1, SWAP_DWORDS
         paddd   mm1, mm3

         pshufw  mm1, mm1, SWAP_DWORDS

         movq    mm2, [eax]              ; current p1 row into mm2/mm3; they
         movq    mm6, mm2                ; double as "previous row" next pass
         movq    mm3, [eax+1]
         sub     eax, edx                ; both cursors move back one row
         sub     ebx, edx
         prefetcht0 [eax]                ; hint for the p1 row of the next pass
         movq    mm7, mm3

         psadbw  mm6, mm5                ; (x+0,y+0)
         paddd   mm0, mm6
         pshufw  mm0, mm0, SWAP_DWORDS
         psadbw  mm7, mm5                ; (x+2,y+0)
         paddd   mm0, mm7
         pshufw  mm0, mm0, SWAP_DWORDS

         sub     ecx, 1
         jnz     .next_row

         mov     eax, [ebp+24]           ; resvec
         movq    [eax+0], mm0            ; resvec[0], resvec[1]
         movq    [eax+8], mm1            ; resvec[2], resvec[3]
         pop     edx
         pop     ecx
         pop     ebx
         pop     eax
         pop     ebp

         emms
         ret
 407
 408
 409
 410
 411