libavcodec/x86/motion_est_mmx.c

   1 /*
   2  * MMX optimized motion estimation
   3  * Copyright (c) 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer
   5  *
   6  * mostly by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavutil/x86_cpu.h"
  26 #include "libavcodec/dsputil.h"
  27
   /* Per-word rounding constants for the plain-MMX averaging code: entry i
    * holds the value i in each of its four 16-bit words.  Entry 1 is loaded
    * into %mm5 by the x2/y2 functions generated by PIX_SAD; sad8_4_mmx reads
    * entry 2 through the byte offset 16+round_tab. */
   28 DECLARE_ASM_CONST(8, uint64_t, round_tab[3])={
   29 0x0000000000000000ULL,
   30 0x0001000100010001ULL,
   31 0x0002000200020002ULL,
   32 };
  33
   /* 0x01 in every byte.  sad8_4_mmx2 loads it into %mm5 and subtracts it,
    * with unsigned saturation, from an averaged row before averaging again. */
   34 DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
  35
   /**
    * Add the absolute differences of two 8-byte-wide blocks to the four 16-bit
    * word lanes of %mm6 (plain MMX version).
    *
    * The function neither clears %mm6 nor sets up %mm7: it relies on %mm7 being
    * zero for the byte-to-word unpacking (the functions generated by PIX_SAD
    * clear both registers first) and leaves its partial sums in %mm6.
    * %mm0-%mm5 are overwritten and no clobbers are declared.
    *
    * @param blk1   first block
    * @param blk2   second block
    * @param stride byte offset between consecutive rows, used for both blocks
    * @param h      number of rows; two rows are consumed per loop iteration, so
    *               an even value is expected
    */
   36 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
   37 {
   38     x86_reg len= -(stride*h); /* negative byte offset of row 0 from the end of the block */
   39     __asm__ volatile(
   40         ASMALIGN(4)
   41         "1:                             \n\t"
   42         "movq (%1, %%"REG_a"), %%mm0    \n\t" /* mm0 = blk1 row */
   43         "movq (%2, %%"REG_a"), %%mm2    \n\t" /* mm2 = mm4 = blk2 row */
   44         "movq (%2, %%"REG_a"), %%mm4    \n\t"
   45         "add %3, %%"REG_a"              \n\t" /* step to the second row of the pair */
   46         "psubusb %%mm0, %%mm2           \n\t" /* mm2 = saturate(blk2 - blk1) */
   47         "psubusb %%mm4, %%mm0           \n\t" /* mm0 = saturate(blk1 - blk2) */
   48         "movq (%1, %%"REG_a"), %%mm1    \n\t" /* same three loads for the second row */
   49         "movq (%2, %%"REG_a"), %%mm3    \n\t"
   50         "movq (%2, %%"REG_a"), %%mm5    \n\t"
   51         "psubusb %%mm1, %%mm3           \n\t"
   52         "psubusb %%mm5, %%mm1           \n\t"
   53         "por %%mm2, %%mm0               \n\t" /* mm0 = |blk1 - blk2|, first row */
   54         "por %%mm1, %%mm3               \n\t" /* mm3 = |blk1 - blk2|, second row */
   55         "movq %%mm0, %%mm1              \n\t"
   56         "movq %%mm3, %%mm2              \n\t"
   57         "punpcklbw %%mm7, %%mm0         \n\t" /* widen bytes to words; needs mm7 == 0 */
   58         "punpckhbw %%mm7, %%mm1         \n\t"
   59         "punpcklbw %%mm7, %%mm3         \n\t"
   60         "punpckhbw %%mm7, %%mm2         \n\t"
   61         "paddw %%mm1, %%mm0             \n\t"
   62         "paddw %%mm3, %%mm2             \n\t"
   63         "paddw %%mm2, %%mm0             \n\t"
   64         "paddw %%mm0, %%mm6             \n\t" /* accumulate both rows into mm6 */
   65         "add %3, %%"REG_a"              \n\t"
   66         " js 1b                         \n\t" /* loop while the offset is still negative */
   67         : "+a" (len)
   68         : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) /* pointers biased by -len so that (ptr, len) addresses row 0 */
   69     );
   70 }
  71
   /**
    * Add the sum of absolute differences of two 8-byte-wide blocks to %mm6
    * using psadbw.
    *
    * %mm6 is not cleared here; the functions generated by PIX_SAD zero it
    * before calling.  %mm0 and %mm1 are overwritten and no clobbers are
    * declared.
    *
    * @param blk1   first block
    * @param blk2   second block
    * @param stride byte offset between consecutive rows, used for both blocks
    * @param h      number of rows; two rows are handled per iteration and the
    *               loop body always runs at least once
    */
   72 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
   73 {
   74     __asm__ volatile(
   75         ASMALIGN(4)
   76         "1:                             \n\t"
   77         "movq (%1), %%mm0               \n\t" /* blk1 row n */
   78         "movq (%1, %3), %%mm1           \n\t" /* blk1 row n+1 */
   79         "psadbw (%2), %%mm0             \n\t" /* mm0 = sum of |blk1 - blk2| over the 8 bytes of row n */
   80         "psadbw (%2, %3), %%mm1         \n\t"
   81         "paddw %%mm0, %%mm6             \n\t"
   82         "paddw %%mm1, %%mm6             \n\t"
   83         "lea (%1,%3,2), %1              \n\t" /* advance both pointers by two rows */
   84         "lea (%2,%3,2), %2              \n\t"
   85         "sub $2, %0                     \n\t"
   86         " jg 1b                         \n\t" /* repeat while rows remain */
   87         : "+r" (h), "+r" (blk1), "+r" (blk2)
   88         : "r" ((x86_reg)stride)
   89     );
   90 }
  91
  92 static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
  93 {
  94     int ret;
  95     __asm__ volatile(
  96         "pxor %%xmm6, %%xmm6            \n\t"
  97         ASMALIGN(4)
  98         "1:                             \n\t"
  99         "movdqu (%1), %%xmm0            \n\t"
 100         "movdqu (%1, %3), %%xmm1        \n\t"
 101         "psadbw (%2), %%xmm0            \n\t"
 102         "psadbw (%2, %3), %%xmm1        \n\t"
 103         "paddw %%xmm0, %%xmm6           \n\t"
 104         "paddw %%xmm1, %%xmm6           \n\t"
 105         "lea (%1,%3,2), %1              \n\t"
 106         "lea (%2,%3,2), %2              \n\t"
 107         "sub $2, %0                     \n\t"
 108         " jg 1b                         \n\t"
 109         : "+r" (h), "+r" (blk1), "+r" (blk2)
 110         : "r" ((x86_reg)stride)
 111     );
 112     __asm__ volatile(
 113         "movhlps %%xmm6, %%xmm0         \n\t"
 114         "paddw   %%xmm0, %%xmm6         \n\t"
 115         "movd    %%xmm6, %0             \n\t"
 116         : "=r"(ret)
 117     );
 118     return ret;
 119 }
 120
  /**
   * Like sad8_1_mmx2(), but each blk1 byte is first averaged with its
   * right-hand neighbour (pavgb, which rounds up) before the difference
   * against blk2 is taken.  The result is added to %mm6, which is not cleared
   * here.  %mm0 and %mm1 are overwritten and no clobbers are declared.
   *
   * @param blk1   block whose bytes x and x+1 are averaged; 9 bytes per row
   *               are read
   * @param blk2   block compared against the averaged rows
   * @param stride byte offset between consecutive rows, used for both blocks
   * @param h      number of rows; two rows are handled per iteration and the
   *               loop body always runs at least once
   */
  121 static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
  122 {
  123     __asm__ volatile(
  124         ASMALIGN(4)
  125         "1:                             \n\t"
  126         "movq (%1), %%mm0               \n\t"
  127         "movq (%1, %3), %%mm1           \n\t"
  128         "pavgb 1(%1), %%mm0             \n\t" /* mm0 = (blk1[x] + blk1[x+1] + 1) >> 1, row n */
  129         "pavgb 1(%1, %3), %%mm1         \n\t" /* same for row n+1 */
  130         "psadbw (%2), %%mm0             \n\t"
  131         "psadbw (%2, %3), %%mm1         \n\t"
  132         "paddw %%mm0, %%mm6             \n\t"
  133         "paddw %%mm1, %%mm6             \n\t"
  134         "lea (%1,%3,2), %1              \n\t"
  135         "lea (%2,%3,2), %2              \n\t"
  136         "sub $2, %0                     \n\t"
  137         " jg 1b                         \n\t"
  138         : "+r" (h), "+r" (blk1), "+r" (blk2)
  139         : "r" ((x86_reg)stride)
  140     );
  141 }
 142
  /**
   * Like sad8_1_mmx2(), but each blk1 row is first averaged with the row below
   * it (pavgb, which rounds up) before the difference against blk2 is taken.
   * The result is added to %mm6, which is not cleared here.  %mm0-%mm2 are
   * overwritten and no clobbers are declared.
   *
   * Because every compared row needs the row below it, blk1 is read one row
   * further down than blk2.
   *
   * @param blk1   block whose vertically adjacent rows are averaged
   * @param blk2   block compared against the averaged rows
   * @param stride byte offset between consecutive rows, used for both blocks
   * @param h      number of rows; two rows are handled per iteration and the
   *               loop body always runs at least once
   */
  143 static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
  144 {
  145     __asm__ volatile(
  146         "movq (%1), %%mm0               \n\t" /* mm0 = blk1 row 0, carried into the loop */
  147         "add %3, %1                     \n\t"
  148         ASMALIGN(4)
  149         "1:                             \n\t"
  150         "movq (%1), %%mm1               \n\t" /* next two blk1 rows */
  151         "movq (%1, %3), %%mm2           \n\t"
  152         "pavgb %%mm1, %%mm0             \n\t" /* mm0 = avg(carried row, row below) */
  153         "pavgb %%mm2, %%mm1             \n\t" /* mm1 = avg(that row, the one below it) */
  154         "psadbw (%2), %%mm0             \n\t"
  155         "psadbw (%2, %3), %%mm1         \n\t"
  156         "paddw %%mm0, %%mm6             \n\t"
  157         "paddw %%mm1, %%mm6             \n\t"
  158         "movq %%mm2, %%mm0              \n\t" /* last row loaded becomes the carried row */
  159         "lea (%1,%3,2), %1              \n\t"
  160         "lea (%2,%3,2), %2              \n\t"
  161         "sub $2, %0                     \n\t"
  162         " jg 1b                         \n\t"
  163         : "+r" (h), "+r" (blk1), "+r" (blk2)
  164         : "r" ((x86_reg)stride)
  165     );
  166 }
 167
  /**
   * SAD of blk2 against a blk1 block in which every byte is combined with its
   * right, lower and lower-right neighbours.  The result is added to %mm6,
   * which is not cleared here.
   *
   * The four-way average is built from cascaded pavgb operations (horizontal
   * first, then vertical) with one saturating subtraction of `bone` in
   * between, rather than from an exact (a+b+c+d+2)>>2 as in sad8_4_mmx().
   * The two versions are therefore not guaranteed to agree bit for bit;
   * dsputil_init_pix_mmx() only installs the MMX2 xy2 functions when
   * CODEC_FLAG_BITEXACT is not set.
   *
   * %mm0-%mm2 and %mm5 are overwritten and no clobbers are declared.
   *
   * @param blk1   block to be interpolated; 9 bytes per row and one row beyond
   *               the compared rows are read
   * @param blk2   block compared against the interpolated rows
   * @param stride byte offset between consecutive rows, used for both blocks
   * @param h      number of rows; two rows are handled per iteration and the
   *               loop body always runs at least once
   */
  168 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
  169 {
  170     __asm__ volatile(
  171         "movq "MANGLE(bone)", %%mm5     \n\t" /* mm5 = 0x01 in every byte */
  172         "movq (%1), %%mm0               \n\t"
  173         "pavgb 1(%1), %%mm0             \n\t" /* mm0 = horizontal average of row 0, carried into the loop */
  174         "add %3, %1                     \n\t"
  175         ASMALIGN(4)
  176         "1:                             \n\t"
  177         "movq (%1), %%mm1               \n\t"
  178         "movq (%1,%3), %%mm2            \n\t"
  179         "pavgb 1(%1), %%mm1             \n\t" /* horizontal averages of the next two rows */
  180         "pavgb 1(%1,%3), %%mm2          \n\t"
  181         "psubusb %%mm5, %%mm1           \n\t" /* subtract 1 from each byte (saturating) before the vertical averages */
  182         "pavgb %%mm1, %%mm0             \n\t" /* vertical average: carried row with the row below */
  183         "pavgb %%mm2, %%mm1             \n\t" /* vertical average: that row with the one below it */
  184         "psadbw (%2), %%mm0             \n\t"
  185         "psadbw (%2,%3), %%mm1          \n\t"
  186         "paddw %%mm0, %%mm6             \n\t"
  187         "paddw %%mm1, %%mm6             \n\t"
  188         "movq %%mm2, %%mm0              \n\t" /* last horizontal average becomes the carried row */
  189         "lea (%1,%3,2), %1              \n\t"
  190         "lea (%2,%3,2), %2              \n\t"
  191         "sub $2, %0                     \n\t"
  192         " jg 1b                         \n\t"
  193         : "+r" (h), "+r" (blk1), "+r" (blk2)
  194         : "r" ((x86_reg)stride)
  195     );
  196 }
 197
  /**
   * Average two 8-byte-wide blocks byte by byte and add the absolute
   * differences between that average and a third block to the four 16-bit
   * word lanes of %mm6 (plain MMX version).
   *
   * Register contract, none of which is set up here: %mm7 must be zero (used
   * for byte-to-word unpacking), %mm5 holds the per-word value added before
   * the shift by one (the functions generated by PIX_SAD load round_tab[1]),
   * and %mm6 is the accumulator.  %mm0-%mm4 are overwritten and no clobbers
   * are declared.
   *
   * @param blk1a  first block to average
   * @param blk1b  second block to average
   * @param blk2   block compared against the average
   * @param stride byte offset between consecutive rows, used for all blocks
   * @param h      number of rows; one row is handled per iteration
   */
  198 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
  199 {
  200     x86_reg len= -(stride*h); /* negative byte offset of row 0 from the end of the block */
  201     __asm__ volatile(
  202         ASMALIGN(4)
  203         "1:                             \n\t"
  204         "movq (%1, %%"REG_a"), %%mm0    \n\t" /* blk1a row, loaded twice: low and high halves */
  205         "movq (%2, %%"REG_a"), %%mm1    \n\t" /* blk1b row, likewise */
  206         "movq (%1, %%"REG_a"), %%mm2    \n\t"
  207         "movq (%2, %%"REG_a"), %%mm3    \n\t"
  208         "punpcklbw %%mm7, %%mm0         \n\t"
  209         "punpcklbw %%mm7, %%mm1         \n\t"
  210         "punpckhbw %%mm7, %%mm2         \n\t"
  211         "punpckhbw %%mm7, %%mm3         \n\t"
  212         "paddw %%mm0, %%mm1             \n\t" /* mm1/mm3 = blk1a + blk1b as words */
  213         "paddw %%mm2, %%mm3             \n\t"
  214         "movq (%3, %%"REG_a"), %%mm4    \n\t" /* blk2 row, two copies for the absolute difference */
  215         "movq (%3, %%"REG_a"), %%mm2    \n\t"
  216         "paddw %%mm5, %%mm1             \n\t" /* add the rounding term supplied in mm5 */
  217         "paddw %%mm5, %%mm3             \n\t"
  218         "psrlw $1, %%mm1                \n\t"
  219         "psrlw $1, %%mm3                \n\t"
  220         "packuswb %%mm3, %%mm1          \n\t" /* mm1 = averaged row, back to bytes */
  221         "psubusb %%mm1, %%mm4           \n\t"
  222         "psubusb %%mm2, %%mm1           \n\t"
  223         "por %%mm4, %%mm1               \n\t" /* mm1 = |average - blk2| */
  224         "movq %%mm1, %%mm0              \n\t"
  225         "punpcklbw %%mm7, %%mm0         \n\t"
  226         "punpckhbw %%mm7, %%mm1         \n\t"
  227         "paddw %%mm1, %%mm0             \n\t"
  228         "paddw %%mm0, %%mm6             \n\t" /* accumulate into mm6 */
  229         "add %4, %%"REG_a"              \n\t"
  230         " js 1b                         \n\t" /* loop while the offset is still negative */
  231         : "+a" (len)
  232         : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
  233     );
  234 }
 235
  /**
   * SAD of blk2 against a blk1 block in which every byte is replaced by
   * (a + b + c + d + 2) >> 2 of itself and its right, lower and lower-right
   * neighbours (plain MMX version).  The absolute differences are added to
   * the four 16-bit word lanes of %mm6.
   *
   * %mm7 must be zero and %mm6 is the accumulator; neither is set up here.
   * %mm0-%mm5 are overwritten and no clobbers are declared.
   *
   * The horizontal pair sums of the current row are kept in %mm0/%mm1 across
   * iterations, so every blk1 row is loaded and widened only once.
   *
   * @param blk1   block to be interpolated; 9 bytes per row and one row beyond
   *               the compared rows are read
   * @param blk2   block compared against the interpolated rows
   * @param stride byte offset between consecutive rows, used for both blocks
   * @param h      number of rows; one row is handled per iteration
   */
  236 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
  237 {
  238     x86_reg len= -(stride*h); /* negative byte offset of row 0 from the end of the block */
  239     __asm__ volatile(
  240         "movq (%1, %%"REG_a"), %%mm0    \n\t" /* prologue: row 0 and row 0 shifted by one byte */
  241         "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
  242         "movq %%mm0, %%mm1              \n\t"
  243         "movq %%mm2, %%mm3              \n\t"
  244         "punpcklbw %%mm7, %%mm0         \n\t"
  245         "punpckhbw %%mm7, %%mm1         \n\t"
  246         "punpcklbw %%mm7, %%mm2         \n\t"
  247         "punpckhbw %%mm7, %%mm3         \n\t"
  248         "paddw %%mm2, %%mm0             \n\t" /* mm0/mm1 = row[x] + row[x+1], low/high words */
  249         "paddw %%mm3, %%mm1             \n\t"
  250         ASMALIGN(4)
  251         "1:                             \n\t"
  252         "movq (%2, %%"REG_a"), %%mm2    \n\t" /* %2 is %1 + stride: the row below the carried one */
  253         "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
  254         "movq %%mm2, %%mm3              \n\t"
  255         "movq %%mm4, %%mm5              \n\t"
  256         "punpcklbw %%mm7, %%mm2         \n\t"
  257         "punpckhbw %%mm7, %%mm3         \n\t"
  258         "punpcklbw %%mm7, %%mm4         \n\t"
  259         "punpckhbw %%mm7, %%mm5         \n\t"
  260         "paddw %%mm4, %%mm2             \n\t" /* mm2/mm3 = pair sums of the lower row */
  261         "paddw %%mm5, %%mm3             \n\t"
  262         "movq 16+"MANGLE(round_tab)", %%mm5 \n\t" /* round_tab[2]: 2 in every word */
  263         "paddw %%mm2, %%mm0             \n\t" /* four-sample sum ... */
  264         "paddw %%mm3, %%mm1             \n\t"
  265         "paddw %%mm5, %%mm0             \n\t" /* ... plus rounding ... */
  266         "paddw %%mm5, %%mm1             \n\t"
  267         "movq (%3, %%"REG_a"), %%mm4    \n\t" /* blk2 row, two copies for the absolute difference */
  268         "movq (%3, %%"REG_a"), %%mm5    \n\t"
  269         "psrlw $2, %%mm0                \n\t" /* ... divided by four */
  270         "psrlw $2, %%mm1                \n\t"
  271         "packuswb %%mm1, %%mm0          \n\t"
  272         "psubusb %%mm0, %%mm4           \n\t"
  273         "psubusb %%mm5, %%mm0           \n\t"
  274         "por %%mm4, %%mm0               \n\t" /* mm0 = |interpolated - blk2| */
  275         "movq %%mm0, %%mm4              \n\t"
  276         "punpcklbw %%mm7, %%mm0         \n\t"
  277         "punpckhbw %%mm7, %%mm4         \n\t"
  278         "paddw %%mm0, %%mm6             \n\t" /* accumulate into mm6 */
  279         "paddw %%mm4, %%mm6             \n\t"
  280         "movq  %%mm2, %%mm0             \n\t" /* lower row's pair sums become the carried row */
  281         "movq  %%mm3, %%mm1             \n\t"
  282         "add %4, %%"REG_a"              \n\t"
  283         " js 1b                         \n\t" /* loop while the offset is still negative */
  284         : "+a" (len)
  285         : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
  286     );
  287 }
 288
  /**
   * Add the four 16-bit word lanes of %mm6 together and return the result.
   *
   * Reads the accumulator left in %mm6 by the preceding asm statements; %mm0
   * and %mm6 are overwritten.
   *
   * @return the lane sum, truncated to 16 bits
   */
  289 static inline int sum_mmx(void)
  290 {
  291     int ret;
  292     __asm__ volatile(
  293         "movq %%mm6, %%mm0              \n\t"
  294         "psrlq $32, %%mm6               \n\t"
  295         "paddw %%mm0, %%mm6             \n\t" /* lanes 0,1 = w0+w2, w1+w3 */
  296         "movq %%mm6, %%mm0              \n\t"
  297         "psrlq $16, %%mm6               \n\t"
  298         "paddw %%mm0, %%mm6             \n\t" /* lane 0 = w0+w1+w2+w3 */
  299         "movd %%mm6, %0                 \n\t"
  300         : "=r" (ret)
  301     );
  302     return ret&0xFFFF; /* the upper word moved out by movd is not part of the sum */
  303 }
 304
  /**
   * Return the low 32 bits of %mm6.
   *
   * The MMX2 helpers in this file accumulate psadbw results into %mm6 with
   * paddw, so no horizontal add across lanes is done here.
   */
  305 static inline int sum_mmx2(void)
  306 {
  307     int ret;
  308     __asm__ volatile(
  309         "movd %%mm6, %0                 \n\t"
  310         : "=r" (ret)
  311     );
  312     return ret;
  313 }
 314
 315 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 316 {
 317     sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
 318 }
 319 static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 320 {
 321     sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
 322 }
 323
 324
  /*
   * PIX_SAD(suf) generates the eight comparison functions for one instruction
   * set suffix (mmx or mmx2):
   *
   *   sad8_suf,  sad8_x2_suf,  sad8_y2_suf,  sad8_xy2_suf
   *   sad16_suf, sad16_x2_suf, sad16_y2_suf, sad16_xy2_suf
   *
   * Every generated function
   *   - takes (void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h); v is
   *     not used, and note that blk2 comes before blk1 in the parameter list
   *     while the helpers are called as helper(blk1, blk2, ...);
   *   - clears %mm7 (zero for unpacking) and %mm6 (accumulator) in its own asm
   *     statement, and for the x2/y2 variants also loads round_tab[1] into
   *     %mm5;
   *   - calls the matching sad8_*_suf helper, which adds into %mm6;
   *   - returns sum_suf(), which reads %mm6 back.
   *
   * The 8-wide functions assert h==8 and always pass 8 to the helper.  The
   * 16-wide functions call the helper twice, once for bytes 0-7 and once for
   * bytes 8-15 of each row, passing h through.
   *
   * The register state is handed from one asm statement to the next without
   * being declared to the compiler.
   */
  325 #define PIX_SAD(suf)\
  326 static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
  327 {\
  328     assert(h==8);\
  329     __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
  330                  "pxor %%mm6, %%mm6     \n\t":);\
  331 \
  332     sad8_1_ ## suf(blk1, blk2, stride, 8);\
  333 \
  334     return sum_ ## suf();\
  335 }\
  336 static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
  337 {\
  338     assert(h==8);\
  339     __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
  340                  "pxor %%mm6, %%mm6     \n\t"\
  341                  "movq %0, %%mm5        \n\t"\
  342                  :: "m"(round_tab[1]) \
  343                  );\
  344 \
  345     sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
  346 \
  347     return sum_ ## suf();\
  348 }\
  349 \
  350 static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
  351 {\
  352     assert(h==8);\
  353     __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
  354                  "pxor %%mm6, %%mm6     \n\t"\
  355                  "movq %0, %%mm5        \n\t"\
  356                  :: "m"(round_tab[1]) \
  357                  );\
  358 \
  359     sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
  360 \
  361     return sum_ ## suf();\
  362 }\
  363 \
  364 static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
  365 {\
  366     assert(h==8);\
  367     __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
  368                  "pxor %%mm6, %%mm6     \n\t"\
  369                  ::);\
  370 \
  371     sad8_4_ ## suf(blk1, blk2, stride, 8);\
  372 \
  373     return sum_ ## suf();\
  374 }\
  375 \
  376 static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
  377 {\
  378     __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
  379                  "pxor %%mm6, %%mm6     \n\t":);\
  380 \
  381     sad8_1_ ## suf(blk1  , blk2  , stride, h);\
  382     sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
  383 \
  384     return sum_ ## suf();\
  385 }\
  386 static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
  387 {\
  388     __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
  389                  "pxor %%mm6, %%mm6     \n\t"\
  390                  "movq %0, %%mm5        \n\t"\
  391                  :: "m"(round_tab[1]) \
  392                  );\
  393 \
  394     sad8_x2a_ ## suf(blk1  , blk2  , stride, h);\
  395     sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
  396 \
  397     return sum_ ## suf();\
  398 }\
  399 static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
  400 {\
  401     __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
  402                  "pxor %%mm6, %%mm6     \n\t"\
  403                  "movq %0, %%mm5        \n\t"\
  404                  :: "m"(round_tab[1]) \
  405                  );\
  406 \
  407     sad8_y2a_ ## suf(blk1  , blk2  , stride, h);\
  408     sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
  409 \
  410     return sum_ ## suf();\
  411 }\
  412 static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
  413 {\
  414     __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
  415                  "pxor %%mm6, %%mm6     \n\t"\
  416                  ::);\
  417 \
  418     sad8_4_ ## suf(blk1  , blk2  , stride, h);\
  419     sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
  420 \
  421     return sum_ ## suf();\
  422 }\
  423
  424 PIX_SAD(mmx) /* sad8_mmx ... sad16_xy2_mmx, built on the plain-MMX helpers and sum_mmx() */
  425 PIX_SAD(mmx2) /* sad8_mmx2 ... sad16_xy2_mmx2, built on the psadbw/pavgb helpers and sum_mmx2() */
 426
 427 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
 428 {
 429     if (mm_flags & FF_MM_MMX) {
 430         c->pix_abs[0][0] = sad16_mmx;
 431         c->pix_abs[0][1] = sad16_x2_mmx;
 432         c->pix_abs[0][2] = sad16_y2_mmx;
 433         c->pix_abs[0][3] = sad16_xy2_mmx;
 434         c->pix_abs[1][0] = sad8_mmx;
 435         c->pix_abs[1][1] = sad8_x2_mmx;
 436         c->pix_abs[1][2] = sad8_y2_mmx;
 437         c->pix_abs[1][3] = sad8_xy2_mmx;
 438
 439         c->sad[0]= sad16_mmx;
 440         c->sad[1]= sad8_mmx;
 441     }
 442     if (mm_flags & FF_MM_MMX2) {
 443         c->pix_abs[0][0] = sad16_mmx2;
 444         c->pix_abs[1][0] = sad8_mmx2;
 445
 446         c->sad[0]= sad16_mmx2;
 447         c->sad[1]= sad8_mmx2;
 448
 449         if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
 450             c->pix_abs[0][1] = sad16_x2_mmx2;
 451             c->pix_abs[0][2] = sad16_y2_mmx2;
 452             c->pix_abs[0][3] = sad16_xy2_mmx2;
 453             c->pix_abs[1][1] = sad8_x2_mmx2;
 454             c->pix_abs[1][2] = sad8_y2_mmx2;
 455             c->pix_abs[1][3] = sad8_xy2_mmx2;
 456         }
 457     }
 458     if ((mm_flags & FF_MM_SSE2) && !(mm_flags & FF_MM_3DNOW) && avctx->codec_id != CODEC_ID_SNOW) {
 459         c->sad[0]= sad16_sse2;
 460     }
 461 }