jpeg/simd/jdsammmx.asm

   1 ;
   2 ; jdsammmx.asm - upsampling (MMX)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ;
   6 ; Based on
   7 ; x86 SIMD extension for IJG JPEG library
   8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
   9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10 ;
  11 ; This file should be assembled with NASM (Netwide Assembler); it
  12 ; can *not* be assembled with Microsoft's MASM or any compatible
  13 ; assembler (including Borland's Turbo Assembler).
  14 ; NASM is available from http://nasm.sourceforge.net/ or
  15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16 ;
  17 ; [TAB8]
  18
  19 %include "jsimdext.inc"
  20
  21 ; --------------------------------------------------------------------------
  22         SECTION SEG_CONST
  23
     ; Packed-word constants used by the two "fancy" (triangle filter)
     ; upsamplers in this file.  Each entry is one 16-bit value repeated four
     ; times, i.e. the same value in every word lane of an MMX register.
     ; The whole table is exported under one global label; the code addresses
     ; the individual entries through GOTOFF(ebx,...).
  24         alignz  16
  25         global  EXTN(jconst_fancy_upsample_mmx)
  26
  27 EXTN(jconst_fancy_upsample_mmx):
  28
  29 PW_ONE          times 4 dw  1      ; h2v1: rounding term, even outputs (>>2)
  30 PW_TWO          times 4 dw  2      ; h2v1: rounding term, odd outputs (>>2)
  31 PW_THREE        times 4 dw  3      ; weight of the nearer sample (pmullw)
  32 PW_SEVEN        times 4 dw  7      ; h2v2: rounding term, odd outputs (>>4)
  33 PW_EIGHT        times 4 dw  8      ; h2v2: rounding term, even outputs (>>4)
  34
  35         alignz  16
  36
  37 ; --------------------------------------------------------------------------
  38         SECTION SEG_TEXT
  39         BITS    32
  40 ;
  41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  42 ;
  43 ; The upsampling algorithm is linear interpolation between pixel centers,
  44 ; also known as a "triangle filter".  This is a good compromise between
  45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
  46 ; of the way between input pixel centers.
  47 ;
  48 ; GLOBAL(void)
  49 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
  50 ;                                JDIMENSION downsampled_width,
  51 ;                                JSAMPARRAY input_data,
  52 ;                                JSAMPARRAY * output_data_ptr);
  53 ;
     ; Computation, for each input sample in[i] of a row:
     ;   out[2*i]   = (3*in[i] + in[i-1] + 1) >> 2
     ;   out[2*i+1] = (3*in[i] + in[i+1] + 2) >> 2
     ; The first sample acts as its own left neighbour and the last sample as
     ; its own right neighbour.
     ;
     ; Each row is handled in blocks of SIZEOF_MMWORD input samples, producing
     ; 2*SIZEOF_MMWORD output samples per block.  The column count is rounded
     ; up to a whole number of blocks, so whole blocks are read and written
     ; even when downsampled_width is not a multiple of SIZEOF_MMWORD; in that
     ; case one extra sample is also stored into the input row just past its
     ; last sample (see the code before .skip).
     ; NOTE(review): this presumably relies on the caller padding the row
     ; buffers - confirm against the code that allocates them.
     ;
     ; Returns immediately if downsampled_width or max_v_samp_factor is zero.
     ;
     ; Register roles:
     ;   eax = colctr (input samples left in the row, rounded up to a block)
     ;   ecx = rowctr
     ;   ebx = GOT address, used by GOTOFF() to reach the PW_* constants
     ;   esi = input_data, and inptr while inside a row
     ;   edi = output_data, and outptr while inside a row
     ;   mm0 = all zeros (high bytes for byte->word unpacking)
     ;   mm7 = left neighbour of the current block, in the lowest byte
     ;   mm6 = right neighbour of the current block, in the highest byte
  54
  55 %define max_v_samp(b)           (b)+8                   ; int max_v_samp_factor
  56 %define downsamp_width(b)       (b)+12  ; JDIMENSION downsampled_width
  57 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
  58 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
  59
  60         align   16
  61         global  EXTN(jsimd_h2v1_fancy_upsample_mmx)
  62
  63 EXTN(jsimd_h2v1_fancy_upsample_mmx):
  64         push    ebp
  65         mov     ebp,esp
             ; pushpic/poppic and get_GOT are macros from jsimdext.inc.
             ; NOTE(review): presumably they only emit code in PIC builds.
  66         pushpic ebx
  67 ;       push    ecx             ; need not be preserved
  68 ;       push    edx             ; need not be preserved
  69         push    esi
  70         push    edi
  71
  72         get_GOT ebx             ; get GOT address
  73
             ; Nothing to do for an empty row or a zero row count.
  74         mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
  75         test    eax,eax
  76         jz      near .return
  77
  78         mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
  79         test    ecx,ecx
  80         jz      near .return
  81
  82         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
  83         mov     edi, POINTER [output_data_ptr(ebp)]
  84         mov     edi, JSAMPARRAY [edi]                   ; output_data
  85         alignx  16,7
  86 .rowloop:
             ; Save the per-call values; eax/esi/edi are consumed by the
             ; column loop and restored after the last block of the row.
  87         push    eax                     ; colctr
  88         push    edi
  89         push    esi
  90
  91         mov     esi, JSAMPROW [esi]     ; inptr
  92         mov     edi, JSAMPROW [edi]     ; outptr
  93
             ; If the width is not a whole number of blocks, copy the last
             ; real sample to the position right after it, so that it sees
             ; itself as its right neighbour.
  94         test    eax, SIZEOF_MMWORD-1
  95         jz      short .skip
  96         mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
  97         mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
  98 .skip:
  99         pxor    mm0,mm0                 ; mm0=(all 0's)
             ; mm7 = first sample of the row in the lowest byte; it is the
             ; "sample -1" of the first block.
 100         pcmpeqb mm7,mm7
 101         psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT
 102         pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD]
 103
             ; Round colctr up to a multiple of SIZEOF_MMWORD.  More than one
             ; block left -> .columnloop, exactly one -> .columnloop_last.
 104         add     eax, byte SIZEOF_MMWORD-1
 105         and     eax, byte -SIZEOF_MMWORD
 106         cmp     eax, byte SIZEOF_MMWORD
 107         ja      short .columnloop
 108         alignx  16,7
 109
 110 .columnloop_last:
             ; Last block: there is no following block, so the block's own
             ; highest byte is used as "sample 8".
 111         pcmpeqb mm6,mm6
 112         psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
 113         pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
 114         jmp     short .upsample
 115         alignx  16,7
 116
 117 .columnloop:
             ; Not the last block: "sample 8" is the first byte of the next
             ; block, moved into the highest byte.
 118         movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
 119         psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
 120
 121 .upsample:
             ; Build the left-shifted and right-shifted neighbour vectors
             ; from the current block and the two edge bytes in mm7/mm6.
 122         movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
 123         movq    mm2,mm1
 124         movq    mm3,mm1                 ; mm1=( 0 1 2 3 4 5 6 7)
 125         psllq   mm2,BYTE_BIT            ; mm2=( - 0 1 2 3 4 5 6)
 126         psrlq   mm3,BYTE_BIT            ; mm3=( 1 2 3 4 5 6 7 -)
 127
 128         por     mm2,mm7                 ; mm2=(-1 0 1 2 3 4 5 6)
 129         por     mm3,mm6                 ; mm3=( 1 2 3 4 5 6 7 8)
 130
             ; Keep this block's last sample as the left neighbour of the
             ; next block.
 131         movq    mm7,mm1
 132         psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7 - - - - - - -)
 133
             ; Widen bytes to words so the weighted sums cannot overflow.
 134         movq      mm4,mm1
 135         punpcklbw mm1,mm0               ; mm1=( 0 1 2 3)
 136         punpckhbw mm4,mm0               ; mm4=( 4 5 6 7)
 137         movq      mm5,mm2
 138         punpcklbw mm2,mm0               ; mm2=(-1 0 1 2)
 139         punpckhbw mm5,mm0               ; mm5=( 3 4 5 6)
 140         movq      mm6,mm3
 141         punpcklbw mm3,mm0               ; mm3=( 1 2 3 4)
 142         punpckhbw mm6,mm0               ; mm6=( 5 6 7 8)
 143
             ; mm1/mm4 = 3*in[i]; the neighbours get their rounding term
             ; (+1 for the left/even side, +2 for the right/odd side).
 144         pmullw  mm1,[GOTOFF(ebx,PW_THREE)]
 145         pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
 146         paddw   mm2,[GOTOFF(ebx,PW_ONE)]
 147         paddw   mm5,[GOTOFF(ebx,PW_ONE)]
 148         paddw   mm3,[GOTOFF(ebx,PW_TWO)]
 149         paddw   mm6,[GOTOFF(ebx,PW_TWO)]
 150
 151         paddw   mm2,mm1
 152         paddw   mm5,mm4
 153         psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
 154         psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
 155         paddw   mm3,mm1
 156         paddw   mm6,mm4
 157         psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
 158         psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)
 159
             ; Interleave: each word holds the even output in its low byte
             ; and the odd output in its high byte.
 160         psllw   mm3,BYTE_BIT
 161         psllw   mm6,BYTE_BIT
 162         por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
 163         por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)
 164
 165         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
 166         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5
 167
             ; Advance one input block / two output mmwords.  colctr is a
             ; multiple of SIZEOF_MMWORD here, so it reaches exactly zero.
 168         sub     eax, byte SIZEOF_MMWORD
 169         add     esi, byte 1*SIZEOF_MMWORD       ; inptr
 170         add     edi, byte 2*SIZEOF_MMWORD       ; outptr
 171         cmp     eax, byte SIZEOF_MMWORD
 172         ja      near .columnloop
 173         test    eax,eax
 174         jnz     near .columnloop_last
 175
 176         pop     esi
 177         pop     edi
 178         pop     eax
 179
             ; Next row: one input row pointer and one output row pointer.
 180         add     esi, byte SIZEOF_JSAMPROW       ; input_data
 181         add     edi, byte SIZEOF_JSAMPROW       ; output_data
 182         dec     ecx                             ; rowctr
 183         jg      near .rowloop
 184
 185         emms            ; empty MMX state
 186
 187 .return:
 188         pop     edi
 189         pop     esi
 190 ;       pop     edx             ; need not be preserved
 191 ;       pop     ecx             ; need not be preserved
 192         poppic  ebx
 193         pop     ebp
 194         ret
 195
 196 ; --------------------------------------------------------------------------
 197 ;
 198 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 199 ; Again a triangle filter; see comments for h2v1 case, above.
 200 ;
 201 ; GLOBAL(void)
 202 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
 203 ;                                JDIMENSION downsampled_width,
 204 ;                                JSAMPARRAY input_data,
 205 ;                                JSAMPARRAY * output_data_ptr);
 206 ;
     ; Every input row yields two output rows.  With row[0] the current input
     ; row and row[-1] / row[+1] the rows whose pointers sit just before and
     ; after it in input_data, the vertical pass forms 16-bit intermediates
     ;   Int0[i] = 3*row[0][i] + row[-1][i]      (upper output row)
     ;   Int1[i] = 3*row[0][i] + row[+1][i]      (lower output row)
     ; and the horizontal pass then computes, separately for Int0 and Int1,
     ;   out[2*i]   = (3*Int[i] + Int[i-1] + 8) >> 4
     ;   out[2*i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
     ; The first and last intermediate of a row act as their own missing
     ; neighbour.
     ; NOTE(review): input_data[-1] is dereferenced for the first row, so the
     ; caller presumably supplies context rows above and below - confirm.
     ;
     ; The intermediates of a block are parked in the output rows themselves
     ; (SIZEOF_MMWORD samples as words occupy exactly the 2*SIZEOF_MMWORD
     ; bytes that the block's output will later overwrite).
     ;
     ; As in the h2v1 case the column count is rounded up to whole blocks and
     ; a dummy sample is stored after the last sample of each of the three
     ; input rows when the width is not a multiple of SIZEOF_MMWORD.
     ;
     ; Returns immediately if downsampled_width or max_v_samp_factor is zero.
     ;
     ; Stack frame (ebp is aligned to SIZEOF_MMWORD):
     ;   [ebp+0]        unaligned frame address ("original ebp"); arguments
     ;                  are read relative to it through edx during setup
     ;   wk(0), wk(1)   left-neighbour word for the upper / lower row
     ;   wk(2), wk(3)   right-neighbour word for the upper / lower row
     ;   gotptr         saved GOT address, one pointer below wk(0)
     ;
     ; Registers inside a row:
     ;   eax = colctr, ecx/ebx/esi = inptr1(above)/inptr0/inptr1(below),
     ;   edx/edi = outptr0/outptr1.  ebx is temporarily replaced by the GOT
     ;   address (pushpic/movpic ... poppic) around code that uses GOTOFF().
 207
 208 %define max_v_samp(b)           (b)+8                   ; int max_v_samp_factor
 209 %define downsamp_width(b)       (b)+12  ; JDIMENSION downsampled_width
 210 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
 211 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
 212
 213 %define original_ebp    ebp+0
 214 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
 215 %define WK_NUM          4
 216 %define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
 217
 218         align   16
 219         global  EXTN(jsimd_h2v2_fancy_upsample_mmx)
 220
 221 EXTN(jsimd_h2v2_fancy_upsample_mmx):
             ; Build the aligned frame: remember the unaligned frame address
             ; in eax and at [ebp], then reserve WK_NUM mmwords below ebp.
 222         push    ebp
 223         mov     eax,esp                         ; eax = original ebp
 224         sub     esp, byte 4
 225         and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
 226         mov     [esp],eax
 227         mov     ebp,esp                         ; ebp = aligned ebp
 228         lea     esp, [wk(0)]
 229         pushpic eax             ; make room for the GOT address
 230         push    ebx
 231 ;       push    ecx             ; need not be preserved
 232 ;       push    edx             ; need not be preserved
 233         push    esi
 234         push    edi
 235
 236         get_GOT ebx                     ; get GOT address
 237         movpic  POINTER [gotptr], ebx   ; save GOT address
 238
             ; ebp no longer points at the arguments, so they are read via edx.
 239         mov     edx,eax                         ; edx = original ebp
 240         mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
 241         test    eax,eax
 242         jz      near .return
 243
 244         mov     ecx, INT [max_v_samp(edx)]      ; rowctr
 245         test    ecx,ecx
 246         jz      near .return
 247
 248         mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
 249         mov     edi, POINTER [output_data_ptr(edx)]
 250         mov     edi, JSAMPARRAY [edi]                   ; output_data
 251         alignx  16,7
 252 .rowloop:
             ; Save colctr, rowctr and the two array pointers; all of these
             ; registers are reused as row pointers / counters below.
 253         push    eax                                     ; colctr
 254         push    ecx
 255         push    edi
 256         push    esi
 257
 258         mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
 259         mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
 260         mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
 261         mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
 262         mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
 263
             ; Width not a whole number of blocks: duplicate the last sample
             ; of each of the three input rows into the position after it.
             ; edx (outptr0) is saved because dl is used as the scratch byte.
 264         test    eax, SIZEOF_MMWORD-1
 265         jz      short .skip
 266         push    edx
 267         mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
 268         mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
 269         mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
 270         mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
 271         mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
 272         mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
 273         pop     edx
 274 .skip:
 275         ; -- process the first column block
 276
 277         movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
 278         movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
 279         movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]
 280
             ; From here until the matching poppic, ebx holds the GOT address
             ; instead of inptr0.
 281         pushpic ebx
 282         movpic  ebx, POINTER [gotptr]   ; load GOT address
 283
 284         pxor      mm3,mm3               ; mm3=(all 0's)
 285         movq      mm4,mm0
 286         punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
 287         punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
 288         movq      mm5,mm1
 289         punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
 290         punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
 291         movq      mm6,mm2
 292         punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
 293         punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)
 294
 295         pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
 296         pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
 297
             ; mm7 = mask selecting the lowest word only.
 298         pcmpeqb mm7,mm7
 299         psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
 300
 301         paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
 302         paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
 303         paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
 304         paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
 305
 306         movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
 307         movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
 308         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
 309         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6
 310
             ; First intermediate of each row doubles as its left neighbour.
 311         pand    mm1,mm7                 ; mm1=( 0 - - -)
 312         pand    mm2,mm7                 ; mm2=( 0 - - -)
 313
 314         movq    MMWORD [wk(0)], mm1
 315         movq    MMWORD [wk(1)], mm2
 316
 317         poppic  ebx
 318
             ; Round colctr up to a multiple of SIZEOF_MMWORD.  More than one
             ; block left -> .columnloop, exactly one -> .columnloop_last.
 319         add     eax, byte SIZEOF_MMWORD-1
 320         and     eax, byte -SIZEOF_MMWORD
 321         cmp     eax, byte SIZEOF_MMWORD
 322         ja      short .columnloop
 323         alignx  16,7
 324
 325 .columnloop_last:
 326         ; -- process the last column block
 327
 328         pushpic ebx
 329         movpic  ebx, POINTER [gotptr]   ; load GOT address
 330
             ; No following block: the right neighbour is this block's own
             ; highest intermediate, kept in the highest word position.
 331         pcmpeqb mm1,mm1
 332         psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
 333         movq    mm2,mm1
 334
 335         pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
 336         pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)
 337
 338         movq    MMWORD [wk(2)], mm1
 339         movq    MMWORD [wk(3)], mm2
 340
 341         jmp     short .upsample
 342         alignx  16,7
 343
 344 .columnloop:
 345         ; -- process the next column block
 346
             ; Vertical pass for the block after the current one; its first
             ; intermediate is the right neighbour of the current block.
 347         movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
 348         movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
 349         movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]
 350
 351         pushpic ebx
 352         movpic  ebx, POINTER [gotptr]   ; load GOT address
 353
 354         pxor      mm3,mm3               ; mm3=(all 0's)
 355         movq      mm4,mm0
 356         punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
 357         punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
 358         movq      mm5,mm1
 359         punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
 360         punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
 361         movq      mm6,mm2
 362         punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
 363         punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)
 364
 365         pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
 366         pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
 367
 368         paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
 369         paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
 370         paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
 371         paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)
 372
 373         movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
 374         movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
 375         movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
 376         movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6
 377
 378         psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
 379         psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)
 380
 381         movq    MMWORD [wk(2)], mm1
 382         movq    MMWORD [wk(3)], mm2
 383
 384 .upsample:
 385         ; -- process the upper row
 386
             ; Horizontal pass over the current block's Int0 words.  Shifts
             ; are by whole words (2*BYTE_BIT) because lanes are 16 bits.
 387         movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
 388         movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)
 389
 390         movq    mm0,mm7
 391         movq    mm4,mm3
 392         psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
 393         psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
 394         movq    mm5,mm7
 395         movq    mm6,mm3
 396         psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
 397         psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)
 398
 399         por     mm0,mm4                         ; mm0=( 1 2 3 4)
 400         por     mm5,mm6                         ; mm5=( 3 4 5 6)
 401
 402         movq    mm1,mm7
 403         movq    mm2,mm3
 404         psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
 405         psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
 406         movq    mm4,mm3
 407         psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)
 408
 409         por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
 410         por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)
 411
             ; This block's last intermediate becomes the left neighbour of
             ; the next block.
 412         movq    MMWORD [wk(0)], mm4
 413
             ; 3*Int[i] plus neighbour plus rounding term (+8 for the even /
             ; left side, +7 for the odd / right side), then >>4.
 414         pmullw  mm7,[GOTOFF(ebx,PW_THREE)]
 415         pmullw  mm3,[GOTOFF(ebx,PW_THREE)]
 416         paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
 417         paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
 418         paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]
 419         paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]
 420
 421         paddw   mm1,mm7
 422         paddw   mm5,mm3
 423         psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
 424         psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
 425         paddw   mm0,mm7
 426         paddw   mm2,mm3
 427         psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
 428         psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)
 429
             ; Interleave even (low byte) and odd (high byte) outputs.
 430         psllw   mm0,BYTE_BIT
 431         psllw   mm2,BYTE_BIT
 432         por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
 433         por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)
 434
             ; Overwrites the parked Int0 words of this block with the
             ; final samples.
 435         movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
 436         movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5
 437
 438         ; -- process the lower row
 439
             ; Same horizontal pass for Int1, using wk(1)/wk(3) as the edges.
 440         movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
 441         movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)
 442
 443         movq    mm7,mm6
 444         movq    mm3,mm4
 445         psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
 446         psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
 447         movq    mm0,mm6
 448         movq    mm2,mm4
 449         psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
 450         psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)
 451
 452         por     mm7,mm3                         ; mm7=( 1 2 3 4)
 453         por     mm0,mm2                         ; mm0=( 3 4 5 6)
 454
 455         movq    mm1,mm6
 456         movq    mm5,mm4
 457         psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
 458         psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
 459         movq    mm3,mm4
 460         psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)
 461
 462         por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
 463         por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)
 464
 465         movq    MMWORD [wk(1)], mm3
 466
 467         pmullw  mm6,[GOTOFF(ebx,PW_THREE)]
 468         pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
 469         paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
 470         paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
 471         paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]
 472         paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]
 473
 474         paddw   mm1,mm6
 475         paddw   mm0,mm4
 476         psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
 477         psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
 478         paddw   mm7,mm6
 479         paddw   mm5,mm4
 480         psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
 481         psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)
 482
 483         psllw   mm7,BYTE_BIT
 484         psllw   mm5,BYTE_BIT
 485         por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
 486         por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)
 487
 488         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
 489         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0
 490
             ; ebx is inptr0 again from here on.
 491         poppic  ebx
 492
             ; Advance one input block / two output mmwords.  colctr is a
             ; multiple of SIZEOF_MMWORD here, so it reaches exactly zero.
 493         sub     eax, byte SIZEOF_MMWORD
 494         add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
 495         add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
 496         add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
 497         add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
 498         add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
 499         cmp     eax, byte SIZEOF_MMWORD
 500         ja      near .columnloop
 501         test    eax,eax
 502         jnz     near .columnloop_last
 503
 504         pop     esi
 505         pop     edi
 506         pop     ecx
 507         pop     eax
 508
             ; One input row was consumed and two output rows were produced,
             ; so rowctr drops by two.
 509         add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
 510         add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
 511         sub     ecx, byte 2                     ; rowctr
 512         jg      near .rowloop
 513
 514         emms            ; empty MMX state
 515
 516 .return:
 517         pop     edi
 518         pop     esi
 519 ;       pop     edx             ; need not be preserved
 520 ;       pop     ecx             ; need not be preserved
 521         pop     ebx
             ; Reloading esp from ebp also discards wk[] and the GOT slot;
             ; [ebp] holds the frame address saved by the prologue.
 522         mov     esp,ebp         ; esp <- aligned ebp
 523         pop     esp             ; esp <- original ebp
 524         pop     ebp
 525         ret
 526
 527 ; --------------------------------------------------------------------------
 528 ;
 529 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
 530 ; It's still a box filter.
 531 ;
 532 ; GLOBAL(void)
 533 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
 534 ;                          JDIMENSION output_width,
 535 ;                          JSAMPARRAY input_data,
 536 ;                          JSAMPARRAY * output_data_ptr);
 537 ;
     ; Every input sample is written twice: out[2*i] = out[2*i+1] = in[i].
     ;
     ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD and that
     ; many output samples are stored per row (half as many input samples are
     ; read), so rows may be written past output_width.
     ; NOTE(review): this presumably relies on padded row buffers - confirm
     ; against the code that allocates them.
     ;
     ; Returns immediately if the rounded-up width or max_v_samp_factor is
     ; zero.
     ;
     ; Register roles:
     ;   edx = rounded-up output width (constant for the whole call)
     ;   eax = colctr (output samples still to be written in this row)
     ;   ecx = rowctr
     ;   esi = input_data, and inptr while inside a row
     ;   edi = output_data, and outptr while inside a row
 538
 539 %define max_v_samp(b)           (b)+8                   ; int max_v_samp_factor
 540 %define output_width(b) (b)+12          ; JDIMENSION output_width
 541 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
 542 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
 543
 544         align   16
 545         global  EXTN(jsimd_h2v1_upsample_mmx)
 546
 547 EXTN(jsimd_h2v1_upsample_mmx):
 548         push    ebp
 549         mov     ebp,esp
 550 ;       push    ebx             ; unused
 551 ;       push    ecx             ; need not be preserved
 552 ;       push    edx             ; need not be preserved
 553         push    esi
 554         push    edi
 555
             ; Round the width up; the flags of the "and" decide the early
             ; return.
 556         mov     edx, JDIMENSION [output_width(ebp)]
 557         add     edx, byte (2*SIZEOF_MMWORD)-1
 558         and     edx, byte -(2*SIZEOF_MMWORD)
 559         jz      short .return
 560
 561         mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
 562         test    ecx,ecx
 563         jz      short .return
 564
 565         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
 566         mov     edi, POINTER [output_data_ptr(ebp)]
 567         mov     edi, JSAMPARRAY [edi]                   ; output_data
 568         alignx  16,7
 569 .rowloop:
 570         push    edi
 571         push    esi
 572
 573         mov     esi, JSAMPROW [esi]             ; inptr
 574         mov     edi, JSAMPROW [edi]             ; outptr
 575         mov     eax,edx                         ; colctr
 576         alignx  16,7
 577 .columnloop:
             ; The loop body is unrolled twice.  colctr is always a multiple
             ; of 2*SIZEOF_MMWORD, so each "sub" lands exactly on zero when
             ; the row is finished and "jz" is a sufficient exit test.
 578
 579         movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
 580
             ; Interleaving a register with itself doubles every byte.
 581         movq      mm1,mm0
 582         punpcklbw mm0,mm0
 583         punpckhbw mm1,mm1
 584
 585         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
 586         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
 587
 588         sub     eax, byte 2*SIZEOF_MMWORD
 589         jz      short .nextrow
 590
             ; Second half of the unrolled body, at fixed offsets from the
             ; still-unadvanced pointers.
 591         movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]
 592
 593         movq      mm3,mm2
 594         punpcklbw mm2,mm2
 595         punpckhbw mm3,mm3
 596
 597         movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
 598         movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
 599
 600         sub     eax, byte 2*SIZEOF_MMWORD
 601         jz      short .nextrow
 602
 603         add     esi, byte 2*SIZEOF_MMWORD       ; inptr
 604         add     edi, byte 4*SIZEOF_MMWORD       ; outptr
 605         jmp     short .columnloop
 606         alignx  16,7
 607
 608 .nextrow:
 609         pop     esi
 610         pop     edi
 611
 612         add     esi, byte SIZEOF_JSAMPROW       ; input_data
 613         add     edi, byte SIZEOF_JSAMPROW       ; output_data
 614         dec     ecx                             ; rowctr
 615         jg      short .rowloop
 616
 617         emms            ; empty MMX state
 618
 619 .return:
 620         pop     edi
 621         pop     esi
 622 ;       pop     edx             ; need not be preserved
 623 ;       pop     ecx             ; need not be preserved
 624 ;       pop     ebx             ; unused
 625         pop     ebp
 626         ret
 627
 628 ; --------------------------------------------------------------------------
 629 ;
 630 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
 631 ; It's still a box filter.
 632 ;
 633 ; GLOBAL(void)
 634 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
 635 ;                          JDIMENSION output_width,
 636 ;                          JSAMPARRAY input_data,
 637 ;                          JSAMPARRAY * output_data_ptr);
 638 ;
     ; Every input sample is written twice horizontally, and the resulting
     ; row is stored into two consecutive output rows.
     ;
     ; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD and that
     ; many output samples are stored per output row, so rows may be written
     ; past output_width.
     ; NOTE(review): this presumably relies on padded row buffers - confirm
     ; against the code that allocates them.
     ;
     ; Returns immediately if the rounded-up width or max_v_samp_factor is
     ; zero.
     ;
     ; Register roles:
     ;   edx = rounded-up output width (constant for the whole call)
     ;   eax = colctr (output samples still to be written per output row)
     ;   ecx = rowctr (decremented by two per input row)
     ;   esi = input_data, and inptr while inside a row
     ;   ebx = outptr0, edi = output_data and then outptr1
 639
 640 %define max_v_samp(b)           (b)+8                   ; int max_v_samp_factor
 641 %define output_width(b) (b)+12          ; JDIMENSION output_width
 642 %define input_data(b)           (b)+16          ; JSAMPARRAY input_data
 643 %define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr
 644
 645         align   16
 646         global  EXTN(jsimd_h2v2_upsample_mmx)
 647
 648 EXTN(jsimd_h2v2_upsample_mmx):
 649         push    ebp
 650         mov     ebp,esp
 651         push    ebx
 652 ;       push    ecx             ; need not be preserved
 653 ;       push    edx             ; need not be preserved
 654         push    esi
 655         push    edi
 656
             ; Round the width up; the flags of the "and" decide the early
             ; return.
 657         mov     edx, JDIMENSION [output_width(ebp)]
 658         add     edx, byte (2*SIZEOF_MMWORD)-1
 659         and     edx, byte -(2*SIZEOF_MMWORD)
 660         jz      near .return
 661
 662         mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
 663         test    ecx,ecx
 664         jz      short .return
 665
 666         mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
 667         mov     edi, POINTER [output_data_ptr(ebp)]
 668         mov     edi, JSAMPARRAY [edi]                   ; output_data
 669         alignx  16,7
 670 .rowloop:
 671         push    edi
 672         push    esi
 673
 674         mov     esi, JSAMPROW [esi]                     ; inptr
 675         mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
 676         mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
 677         mov     eax,edx                                 ; colctr
 678         alignx  16,7
 679 .columnloop:
             ; The loop body is unrolled twice.  colctr is always a multiple
             ; of 2*SIZEOF_MMWORD, so each "sub" lands exactly on zero when
             ; the row is finished and "jz" is a sufficient exit test.
 680
 681         movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
 682
             ; Interleaving a register with itself doubles every byte.
 683         movq      mm1,mm0
 684         punpcklbw mm0,mm0
 685         punpckhbw mm1,mm1
 686
             ; The same doubled data goes to both output rows.
 687         movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0
 688         movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
 689         movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
 690         movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1
 691
 692         sub     eax, byte 2*SIZEOF_MMWORD
 693         jz      short .nextrow
 694
             ; Second half of the unrolled body, at fixed offsets from the
             ; still-unadvanced pointers.
 695         movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]
 696
 697         movq      mm3,mm2
 698         punpcklbw mm2,mm2
 699         punpckhbw mm3,mm3
 700
 701         movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2
 702         movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
 703         movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
 704         movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3
 705
 706         sub     eax, byte 2*SIZEOF_MMWORD
 707         jz      short .nextrow
 708
 709         add     esi, byte 2*SIZEOF_MMWORD       ; inptr
 710         add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
 711         add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
 712         jmp     short .columnloop
 713         alignx  16,7
 714
 715 .nextrow:
 716         pop     esi
 717         pop     edi
 718
             ; One input row was consumed and two output rows were produced,
             ; so rowctr drops by two.
 719         add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
 720         add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
 721         sub     ecx, byte 2                     ; rowctr
 722         jg      short .rowloop
 723
 724         emms            ; empty MMX state
 725
 726 .return:
 727         pop     edi
 728         pop     esi
 729 ;       pop     edx             ; need not be preserved
 730 ;       pop     ecx             ; need not be preserved
 731         pop     ebx
 732         pop     ebp
 733         ret
 734
 735 ; For some reason, the OS X linker does not honor the request to align the
 736 ; segment unless we do this.
     ; The directive pads the end of this file's code to a 16-byte boundary.
 737         align   16