gfx/ycbcr/yuv_row_win.cpp

   1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "yuv_row.h"
   6 #include "mozilla/SSE.h"
   7
   8 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
   9 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
     // The two macros above are deliberately left unparenthesized: in this file
     // they are only expanded inside inline-asm address expressions such as
     // [kCoefficientsRgbU + 8 * eax], where the "+ 2048" / "+ 4096" becomes a
     // byte displacement from kCoefficientsRgbY.
     // The asm indexes each table with a zero-extended byte scaled by 8, i.e.
     // 256 entries * 8 bytes = 2048 bytes per table, which matches the offsets.
     // NOTE(review): the table itself is defined outside this file; the
     // Y-then-U-then-V layout is assumed from these offsets -- confirm there.
  10
  11 extern "C" {
  12
  13 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  14 #if defined(__clang__)
  15 // clang-cl has a bug where it doesn't mangle names in inline asm,
  16 // so the mangling is done in the preprocessor instead (ugh): every later
  17 // use of kCoefficientsRgbY is rewritten to the underscore-prefixed symbol.
     // Because kCoefficientsRgbU and kCoefficientsRgbV expand to
     // kCoefficientsRgbY, they pick up the same rewrite.
     // The extern below is only a dummy declaration so the parser accepts the
     // rewritten name; the asm in this file uses it purely as an address.
  18 extern void* _kCoefficientsRgbY;
  19 #define kCoefficientsRgbY _kCoefficientsRgbY
  20 #endif
  21
  22 __declspec(naked)
     // Converts one row of Y, U and V samples (three separate buffers) into
     // 4-byte output pixels, two pixels per loop iteration, using MMX registers
     // and the kCoefficientsRgbY/U/V lookup tables.
     //
     //   y_buf    one byte consumed per output pixel
     //   u_buf    one byte consumed per pair of output pixels
     //   v_buf    one byte consumed per pair of output pixels
     //   rgb_buf  destination, 4 bytes written per pixel
     //   width    number of output pixels; an odd trailing pixel is converted
     //            after the loop (the loop logic assumes width >= 0)
     //
     // Each sample selects an 8-byte table entry. The Y, U and V entries are
     // added as four signed 16-bit lanes with saturation, shifted right
     // arithmetically by 6 and packed to unsigned bytes with saturation.
     //
     // The function is naked, so arguments are addressed relative to esp:
     // pushad stores eight 32-bit registers (32 bytes) and the return address
     // occupies 4 more, hence the [esp + 32 + N] operands.
     // NOTE(review): no emms is executed and the stores are non-temporal
     // (movntq) with no fence here; presumably the caller takes care of both --
     // confirm.
  23 void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
  24                                   const uint8_t* u_buf,
  25                                   const uint8_t* v_buf,
  26                                   uint8_t* rgb_buf,
  27                                   int width) {
  28   __asm {
  29     pushad                          // save all general registers
  30     mov       edx, [esp + 32 + 4]   // Y
  31     mov       edi, [esp + 32 + 8]   // U
  32     mov       esi, [esp + 32 + 12]  // V
  33     mov       ebp, [esp + 32 + 16]  // rgb
  34     mov       ecx, [esp + 32 + 20]  // width
  35     jmp       convertend            // test the counter before the first pass
  36
  37  convertloop :
  38     movzx     eax, byte ptr [edi]   // eax = U sample
  39     add       edi, 1
  40     movzx     ebx, byte ptr [esi]   // ebx = V sample
  41     add       esi, 1
  42     movq      mm0, [kCoefficientsRgbU + 8 * eax]
  43     movzx     eax, byte ptr [edx]   // eax = first Y sample
  44     paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U + V term, shared by both pixels
  45     movzx     ebx, byte ptr [edx + 1]  // ebx = second Y sample
  46     movq      mm1, [kCoefficientsRgbY + 8 * eax]
  47     add       edx, 2
  48     movq      mm2, [kCoefficientsRgbY + 8 * ebx]
  49     paddsw    mm1, mm0              // pixel 0 = Y0 + UV
  50     paddsw    mm2, mm0              // pixel 1 = Y1 + UV
  51     psraw     mm1, 6
  52     psraw     mm2, 6
  53     packuswb  mm1, mm2              // low 4 bytes = pixel 0, high 4 bytes = pixel 1
  54     movntq    [ebp], mm1            // non-temporal 8-byte store
  55     add       ebp, 8
  56  convertend :
  57     sub       ecx, 2                // two pixels per pass
  58     jns       convertloop           // continue while the remaining count is >= 0
  59
     // Here ecx is -1 if one pixel is left and -2 if none, so bit 0 decides.
  60     and       ecx, 1  // odd number of pixels?
  61     jz        convertdone
  62
     // Trailing pixel: same computation for a single Y sample, 4-byte store.
  63     movzx     eax, byte ptr [edi]
  64     movq      mm0, [kCoefficientsRgbU + 8 * eax]
  65     movzx     eax, byte ptr [esi]
  66     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
  67     movzx     eax, byte ptr [edx]
  68     movq      mm1, [kCoefficientsRgbY + 8 * eax]
  69     paddsw    mm1, mm0
  70     psraw     mm1, 6
  71     packuswb  mm1, mm1
  72     movd      [ebp], mm1            // store the low 4 bytes only
  73  convertdone :
  74
  75     popad                           // restore registers saved by pushad
  76     ret
  77   }
  78 }
  79
  80 __declspec(naked)
     // Same conversion as the fixed-stride row routine above the dispatchers, but
     // every source pointer advances by a caller-supplied byte step instead of 1.
     //
     //   y_buf    advanced by `step` after each Y sample (twice per loop pass)
     //   u_buf    advanced by `step` once per loop pass
     //   v_buf    advanced by `step` once per loop pass
     //   rgb_buf  destination, 4 bytes written per pixel, always contiguous
     //   width    number of output pixels; an odd trailing pixel is converted
     //            after the loop (the loop logic assumes width >= 0)
     //   step     byte increment applied to all three source pointers
     //
     // Table entries for Y, U and V are added as saturating signed 16-bit lanes,
     // shifted right by 6 and packed to unsigned bytes.
     // Naked function: arguments are read at [esp + 32 + N] after pushad.
     // NOTE(review): no emms and no fence after movntq in this routine;
     // presumably handled by the caller -- confirm.
  81 void ConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
  82                               const uint8_t* u_buf,
  83                               const uint8_t* v_buf,
  84                               uint8_t* rgb_buf,
  85                               int width,
  86                               int step) {
  87   __asm {
  88     pushad
  89     mov       edx, [esp + 32 + 4]   // Y
  90     mov       edi, [esp + 32 + 8]   // U
  91     mov       esi, [esp + 32 + 12]  // V
  92     mov       ebp, [esp + 32 + 16]  // rgb
  93     mov       ecx, [esp + 32 + 20]  // width
  94     mov       ebx, [esp + 32 + 24]  // step
  95     jmp       wend                  // test the counter before the first pass
  96
  97  wloop :
  98     movzx     eax, byte ptr [edi]   // U sample
  99     add       edi, ebx
 100     movq      mm0, [kCoefficientsRgbU + 8 * eax]
 101     movzx     eax, byte ptr [esi]   // V sample
 102     add       esi, ebx
 103     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V term for both pixels
 104     movzx     eax, byte ptr [edx]   // first Y sample
 105     add       edx, ebx
 106     movq      mm1, [kCoefficientsRgbY + 8 * eax]
 107     movzx     eax, byte ptr [edx]   // second Y sample, `step` bytes further on
 108     add       edx, ebx
 109     movq      mm2, [kCoefficientsRgbY + 8 * eax]
 110     paddsw    mm1, mm0
 111     paddsw    mm2, mm0
 112     psraw     mm1, 6
 113     psraw     mm2, 6
 114     packuswb  mm1, mm2              // two 4-byte pixels in one quadword
 115     movntq    [ebp], mm1
 116     add       ebp, 8
 117  wend :
 118     sub       ecx, 2
 119     jns       wloop                 // loop while at least two pixels remained
 120
     // ecx is now -1 (one pixel left) or -2 (none); bit 0 tells them apart.
 121     and       ecx, 1  // odd number of pixels?
 122     jz        wdone
 123
     // Trailing pixel: one Y sample with the current U and V samples.
 124     movzx     eax, byte ptr [edi]
 125     movq      mm0, [kCoefficientsRgbU + 8 * eax]
 126     movzx     eax, byte ptr [esi]
 127     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
 128     movzx     eax, byte ptr [edx]
 129     movq      mm1, [kCoefficientsRgbY + 8 * eax]
 130     paddsw    mm1, mm0
 131     psraw     mm1, 6
 132     packuswb  mm1, mm1
 133     movd      [ebp], mm1            // 4-byte store
 134  wdone :
 135
 136     popad
 137     ret
 138   }
 139 }
 140
 141 __declspec(naked)
     // Row conversion with independent byte steps for the luma and chroma
     // source pointers.
     //
     //   y_buf    advanced by `ystep` after each Y sample (twice per loop pass)
     //   u_buf    advanced by `uvstep` once per loop pass
     //   v_buf    advanced by `uvstep` once per loop pass
     //   rgb_buf  destination, 4 bytes written per pixel, always contiguous
     //   width    number of output pixels; an odd trailing pixel is converted
     //            after the loop (the loop logic assumes width >= 0)
     //   ystep    byte increment for y_buf
     //   uvstep   byte increment for u_buf and v_buf
     //
     // ebx is the only register left over for the steps, so uvstep and ystep
     // are reloaded from the stack on every pass.
     // Naked function: arguments are read at [esp + 32 + N] after pushad.
     // NOTE(review): no emms and no fence after movntq in this routine;
     // presumably handled by the caller -- confirm.
 142 void RotateConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
 143                                     const uint8_t* u_buf,
 144                                     const uint8_t* v_buf,
 145                                     uint8_t* rgb_buf,
 146                                     int width,
 147                                     int ystep,
 148                                     int uvstep) {
 149   __asm {
 150     pushad
 151     mov       edx, [esp + 32 + 4]   // Y
 152     mov       edi, [esp + 32 + 8]   // U
 153     mov       esi, [esp + 32 + 12]  // V
 154     mov       ebp, [esp + 32 + 16]  // rgb
 155     mov       ecx, [esp + 32 + 20]  // width
 156     jmp       wend                  // test the counter before the first pass
 157
 158  wloop :
 159     movzx     eax, byte ptr [edi]   // U sample
 160     mov       ebx, [esp + 32 + 28]  // uvstep
 161     add       edi, ebx
 162     movq      mm0, [kCoefficientsRgbU + 8 * eax]
 163     movzx     eax, byte ptr [esi]   // V sample
 164     add       esi, ebx
 165     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V term for both pixels
 166     movzx     eax, byte ptr [edx]   // first Y sample
 167     mov       ebx, [esp + 32 + 24]  // ystep
 168     add       edx, ebx
 169     movq      mm1, [kCoefficientsRgbY + 8 * eax]
 170     movzx     eax, byte ptr [edx]   // second Y sample
 171     add       edx, ebx
 172     movq      mm2, [kCoefficientsRgbY + 8 * eax]
 173     paddsw    mm1, mm0
 174     paddsw    mm2, mm0
 175     psraw     mm1, 6
 176     psraw     mm2, 6
 177     packuswb  mm1, mm2              // two 4-byte pixels in one quadword
 178     movntq    [ebp], mm1
 179     add       ebp, 8
 180  wend :
 181     sub       ecx, 2
 182     jns       wloop                 // loop while at least two pixels remained
 183
     // ecx is now -1 (one pixel left) or -2 (none); bit 0 tells them apart.
 184     and       ecx, 1  // odd number of pixels?
 185     jz        wdone
 186
     // Trailing pixel: one Y sample with the current U and V samples.
 187     movzx     eax, byte ptr [edi]
 188     movq      mm0, [kCoefficientsRgbU + 8 * eax]
 189     movzx     eax, byte ptr [esi]
 190     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
 191     movzx     eax, byte ptr [edx]
 192     movq      mm1, [kCoefficientsRgbY + 8 * eax]
 193     paddsw    mm1, mm0
 194     psraw     mm1, 6
 195     packuswb  mm1, mm1
 196     movd      [ebp], mm1            // 4-byte store
 197  wdone :
 198
 199     popad
 200     ret
 201   }
 202 }
 203
 204 __declspec(naked)
     // Converts a row while doubling it horizontally: every converted pixel is
     // written twice, so one loop pass consumes two Y samples plus one U and one
     // V sample and writes four output pixels (16 bytes).
     //
     //   y_buf    two bytes consumed per loop pass
     //   u_buf    one byte consumed per loop pass
     //   v_buf    one byte consumed per loop pass
     //   rgb_buf  destination, 4 bytes written per output pixel
     //   width    number of OUTPUT pixels (the counter drops by 4 per pass);
     //            the loop logic assumes width >= 0
     //
     // When 1 to 3 output pixels remain after the loop, a single pixel is
     // converted from the current Y, U and V samples and stored that many times.
     // Naked function: arguments are read at [esp + 32 + N] after pushad.
     // NOTE(review): no emms and no fence after movntq in this routine;
     // presumably handled by the caller -- confirm.
 205 void DoubleYUVToRGB32Row_SSE(const uint8_t* y_buf,
 206                              const uint8_t* u_buf,
 207                              const uint8_t* v_buf,
 208                              uint8_t* rgb_buf,
 209                              int width) {
 210   __asm {
 211     pushad
 212     mov       edx, [esp + 32 + 4]   // Y
 213     mov       edi, [esp + 32 + 8]   // U
 214     mov       esi, [esp + 32 + 12]  // V
 215     mov       ebp, [esp + 32 + 16]  // rgb
 216     mov       ecx, [esp + 32 + 20]  // width
 217     jmp       wend                  // test the counter before the first pass
 218
 219  wloop :
 220     movzx     eax, byte ptr [edi]   // U sample
 221     add       edi, 1
 222     movzx     ebx, byte ptr [esi]   // V sample
 223     add       esi, 1
 224     movq      mm0, [kCoefficientsRgbU + 8 * eax]
 225     movzx     eax, byte ptr [edx]   // first Y sample
 226     paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U + V term
 227     movq      mm1, [kCoefficientsRgbY + 8 * eax]
 228     paddsw    mm1, mm0
 229     psraw     mm1, 6
 230     packuswb  mm1, mm1
 231     punpckldq mm1, mm1              // duplicate the 4-byte pixel into both halves
 232     movntq    [ebp], mm1            // output pixels 0 and 1
 233
 234     movzx     ebx, byte ptr [edx + 1]  // second Y sample
 235     add       edx, 2
 236     paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // reuses mm0; the U + V term is not needed afterwards
 237     psraw     mm0, 6
 238     packuswb  mm0, mm0
 239     punpckldq mm0, mm0              // duplicate the 4-byte pixel into both halves
 240     movntq    [ebp+8], mm0          // output pixels 2 and 3
 241     add       ebp, 16
 242  wend :
 243     sub       ecx, 4                // four output pixels per pass
 244     jns       wloop
 245
 246     add       ecx, 4                // undo the last subtraction: ecx = 0..3 pixels left
 247     jz        wdone
 248
     // Remainder: convert one pixel, then store it ecx times.
 249     movzx     eax, byte ptr [edi]
 250     movq      mm0, [kCoefficientsRgbU + 8 * eax]
 251     movzx     eax, byte ptr [esi]
 252     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
 253     movzx     eax, byte ptr [edx]
 254     movq      mm1, [kCoefficientsRgbY + 8 * eax]
 255     paddsw    mm1, mm0
 256     psraw     mm1, 6
 257     packuswb  mm1, mm1
 258     jmp       wend1
 259
 260  wloop1 :
 261     movd      [ebp], mm1            // 4-byte store of the same pixel
 262     add       ebp, 4
 263  wend1 :
 264     sub       ecx, 1
 265     jns       wloop1
 266  wdone :
 267     popad
 268     ret
 269   }
 270 }
 271
 272 // This version does general purpose scaling by any amount, up or down.
 273 // The only thing it cannot do is rotation by 90 or 270.
 274 // For performance the chroma is under-sampled, reducing cost of a 3x
 275 // 1080p scale from 8.4 ms to 5.4 ms.
     //
     // Sampling is nearest-sample (no blending). The source position x is held
     // in ebx, starts at 0 and grows by source_dx per output pixel; the Y index
     // is x >> 16 and the U/V index is x >> 17, i.e. x is treated as a fixed-point
     // value with 16 fractional bits and chroma is indexed at half the luma rate.
     // U and V are looked up once per pair of output pixels, at the position
     // of the first pixel of the pair.
     //
     //   y_buf, u_buf, v_buf  source sample buffers (indexed, not advanced)
     //   rgb_buf              destination, 4 bytes written per pixel
     //   width                number of output pixels; an odd trailing pixel is
     //                        converted after the loop (assumes width >= 0)
     //   source_dx            per-pixel increment of x
     //
     // Naked function: arguments are read at [esp + 32 + N] after pushad.
     // NOTE(review): no emms and no fence after movntq in this routine;
     // presumably handled by the caller -- confirm.
 276 __declspec(naked)
 277 void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
 278                             const uint8_t* u_buf,
 279                             const uint8_t* v_buf,
 280                             uint8_t* rgb_buf,
 281                             int width,
 282                             int source_dx) {
 283   __asm {
 284     pushad
 285     mov       edx, [esp + 32 + 4]   // Y
 286     mov       edi, [esp + 32 + 8]   // U
 287     mov       esi, [esp + 32 + 12]  // V
 288     mov       ebp, [esp + 32 + 16]  // rgb
 289     mov       ecx, [esp + 32 + 20]  // width
 290     xor       ebx, ebx              // x
 291     jmp       scaleend              // test the counter before the first pass
 292
 293  scaleloop :
 294     mov       eax, ebx
 295     sar       eax, 17               // chroma index = x >> 17
 296     movzx     eax, byte ptr [edi + eax]
 297     movq      mm0, [kCoefficientsRgbU + 8 * eax]
 298     mov       eax, ebx
 299     sar       eax, 17
 300     movzx     eax, byte ptr [esi + eax]
 301     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V term for both pixels
 302     mov       eax, ebx
 303     add       ebx, [esp + 32 + 24]  // x += source_dx
 304     sar       eax, 16               // luma index = x >> 16 (x before the increment)
 305     movzx     eax, byte ptr [edx + eax]
 306     movq      mm1, [kCoefficientsRgbY + 8 * eax]
 307     mov       eax, ebx
 308     add       ebx, [esp + 32 + 24]  // x += source_dx
 309     sar       eax, 16
 310     movzx     eax, byte ptr [edx + eax]
 311     movq      mm2, [kCoefficientsRgbY + 8 * eax]
 312     paddsw    mm1, mm0
 313     paddsw    mm2, mm0
 314     psraw     mm1, 6
 315     psraw     mm2, 6
 316     packuswb  mm1, mm2              // two 4-byte pixels in one quadword
 317     movntq    [ebp], mm1
 318     add       ebp, 8
 319  scaleend :
 320     sub       ecx, 2
 321     jns       scaleloop             // loop while at least two pixels remained
 322
     // ecx is now -1 (one pixel left) or -2 (none); bit 0 tells them apart.
 323     and       ecx, 1  // odd number of pixels?
 324     jz        scaledone
 325
     // Trailing pixel, sampled at the current x.
 326     mov       eax, ebx
 327     sar       eax, 17
 328     movzx     eax, byte ptr [edi + eax]
 329     movq      mm0, [kCoefficientsRgbU + 8 * eax]
 330     mov       eax, ebx
 331     sar       eax, 17
 332     movzx     eax, byte ptr [esi + eax]
 333     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
 334     mov       eax, ebx
 335     sar       eax, 16
 336     movzx     eax, byte ptr [edx + eax]
 337     movq      mm1, [kCoefficientsRgbY + 8 * eax]
 338     paddsw    mm1, mm0
 339     psraw     mm1, 6
 340     packuswb  mm1, mm1
 341     movd      [ebp], mm1            // 4-byte store
 342
 343  scaledone :
 344     popad
 345     ret
 346   }
 347 }
 348
 349 __declspec(naked)
     // Scales a row horizontally with linear blending between neighbouring
     // samples, then converts to 4-byte pixels through the coefficient tables.
     //
     //   y_buf, u_buf, v_buf  source sample buffers (indexed, not advanced)
     //   rgb_buf              destination, 4 bytes written per pixel
     //   width                number of output pixels
     //   source_dx            per-pixel increment of the source position x
     //
     // x lives in ebx. It starts at 0, or at 0x8000 when source_dx >= 0x20000,
     // and the loop runs while x < width * source_dx (signed compare). That
     // product is written back over the `width` argument slot on the stack.
     //
     // Luma:   index x >> 16, fraction f = x & 0xffff,
     //         value = (y[i] * (0xffff - f) + y[i + 1] * f) >> 16
     // Chroma: index x >> 17, fraction f = x & 0x1fffe,
     //         value = (c[i] * (0x1fffe - f) + c[i + 1] * f) >> 17
     // U and V are blended once per pair of output pixels.
     //
     // ecx and esi are used as scratch, so the V pointer is reloaded from the
     // stack on every pass.
     // Naked function: arguments are read at [esp + 32 + N] after pushad.
     // NOTE(review): each blend reads the sample at index + 1, so a byte past
     // the last sampled position must be readable -- confirm with callers.
     // NOTE(review): width * source_dx is a 32-bit multiply and would wrap for
     // large inputs -- confirm callers bound these values.
     // NOTE(review): no emms and no fence after movntq in this routine;
     // presumably handled by the caller -- confirm.
 350 void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
 351                                   const uint8_t* u_buf,
 352                                   const uint8_t* v_buf,
 353                                   uint8_t* rgb_buf,
 354                                   int width,
 355                                   int source_dx) {
 356   __asm {
 357     pushad
 358     mov       edx, [esp + 32 + 4]  // Y
 359     mov       edi, [esp + 32 + 8]  // U
 360                 // [esp + 32 + 12] // V (left on the stack; esi is scratch)
 361     mov       ebp, [esp + 32 + 16] // rgb
 362     mov       ecx, [esp + 32 + 20] // width
 363     imul      ecx, [esp + 32 + 24] // source_dx
 364     mov       [esp + 32 + 20], ecx // source_width = width * source_dx
 365     mov       ecx, [esp + 32 + 24] // source_dx
 366     xor       ebx, ebx             // x = 0
 367     cmp       ecx, 0x20000
 368     jl        lscaleend            // source_dx < 0x20000: keep x = 0
 369     mov       ebx, 0x8000          // x = 0.5 for 1/2 or less
 370     jmp       lscaleend
 371 lscaleloop:
     // --- U: blend u[x >> 17] and u[(x >> 17) + 1] ---
 372     mov       eax, ebx
 373     sar       eax, 0x11
 374
 375     movzx     ecx, byte ptr [edi + eax]      // ecx = left sample
 376     movzx     esi, byte ptr [edi + eax + 1]  // esi = right sample
 377     mov       eax, ebx
 378     and       eax, 0x1fffe         // eax = fraction f
 379     imul      esi, eax             // right * f
 380     xor       eax, 0x1fffe         // eax = 0x1fffe - f
 381     imul      ecx, eax             // left * (0x1fffe - f)
 382     add       ecx, esi
 383     shr       ecx, 17              // blended U sample
 384     movq      mm0, [kCoefficientsRgbU + 8 * ecx]
 385
     // --- V: same blend; the V pointer is fetched from its stack slot ---
 386     mov       esi, [esp + 32 + 12]
 387     mov       eax, ebx
 388     sar       eax, 0x11
 389
 390     movzx     ecx, byte ptr [esi + eax]
 391     movzx     esi, byte ptr [esi + eax + 1]
 392     mov       eax, ebx
 393     and       eax, 0x1fffe
 394     imul      esi, eax
 395     xor       eax, 0x1fffe
 396     imul      ecx, eax
 397     add       ecx, esi
 398     shr       ecx, 17              // blended V sample
 399     paddsw    mm0, [kCoefficientsRgbV + 8 * ecx]  // mm0 = U + V term for both pixels
 400
     // --- first Y: blend y[x >> 16] and y[(x >> 16) + 1] ---
 401     mov       eax, ebx
 402     sar       eax, 0x10
 403     movzx     ecx, byte ptr [edx + eax]
 404     movzx     esi, byte ptr [1 + edx + eax]
 405     mov       eax, ebx
 406     add       ebx, [esp + 32 + 24] // x += source_dx
 407     and       eax, 0xffff          // eax = fraction f (of x before the increment)
 408     imul      esi, eax
 409     xor       eax, 0xffff          // eax = 0xffff - f
 410     imul      ecx, eax
 411     add       ecx, esi
 412     shr       ecx, 16              // blended Y sample
 413     movq      mm1, [kCoefficientsRgbY + 8 * ecx]
 414
     // If x has reached source_width, only one pixel of this pair is emitted.
 415     cmp       ebx, [esp + 32 + 20]
 416     jge       lscalelastpixel
 417
     // --- second Y: same blend at the advanced x ---
 418     mov       eax, ebx
 419     sar       eax, 0x10
 420     movzx     ecx, byte ptr [edx + eax]
 421     movzx     esi, byte ptr [edx + eax + 1]
 422     mov       eax, ebx
 423     add       ebx, [esp + 32 + 24] // x += source_dx
 424     and       eax, 0xffff
 425     imul      esi, eax
 426     xor       eax, 0xffff
 427     imul      ecx, eax
 428     add       ecx, esi
 429     shr       ecx, 16
 430     movq      mm2, [kCoefficientsRgbY + 8 * ecx]
 431
 432     paddsw    mm1, mm0
 433     paddsw    mm2, mm0
 434     psraw     mm1, 0x6
 435     psraw     mm2, 0x6
 436     packuswb  mm1, mm2             // two 4-byte pixels in one quadword
 437     movntq    [ebp], mm1
 438     add       ebp, 0x8
 439
 440 lscaleend:
 441     cmp       ebx, [esp + 32 + 20] // x < source_width ?
 442     jl        lscaleloop
 443     popad
 444     ret
 445
 446 lscalelastpixel:
     // Final single pixel: 4-byte store, then return.
 447     paddsw    mm1, mm0
 448     psraw     mm1, 6
 449     packuswb  mm1, mm1
 450     movd      [ebp], mm1
 451     popad
 452     ret
 453   };
 454 }
 455 #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
 456
 457 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
 458                               const uint8_t* u_buf,
 459                               const uint8_t* v_buf,
 460                               uint8_t* rgb_buf,
 461                               int width) {
 462 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
 463   if (mozilla::supports_sse()) {
 464     FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
 465     return;
 466   }
 467 #endif
 468
 469   FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
 470 }
 471
 472 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
 473                         const uint8_t* u_buf,
 474                         const uint8_t* v_buf,
 475                         uint8_t* rgb_buf,
 476                         int width,
 477                         int source_dx) {
 478
 479 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
 480   if (mozilla::supports_sse()) {
 481     ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
 482     return;
 483   }
 484 #endif
 485
 486   ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
 487 }
 488
 489 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
 490                               const uint8_t* u_buf,
 491                               const uint8_t* v_buf,
 492                               uint8_t* rgb_buf,
 493                               int width,
 494                               int source_dx) {
 495 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
 496   if (mozilla::supports_sse()) {
 497     LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
 498                                  source_dx);
 499     return;
 500   }
 501 #endif
 502
 503   LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
 504 }
 505
 506 } // extern "C"