jpeg/simd/jdsamss2-64.asm

   1 ;
   2 ; jdsamss2-64.asm - upsampling (64-bit SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright 2009 D. R. Commander
   6 ;
   7 ; Based on
   8 ; x86 SIMD extension for IJG JPEG library
   9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11 ;
  12 ; This file should be assembled with NASM (Netwide Assembler),
  13 ; can *not* be assembled with Microsoft's MASM or any compatible
  14 ; assembler (including Borland's Turbo Assembler).
  15 ; NASM is available from http://nasm.sourceforge.net/ or
  16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17 ;
  18 ; [TAB8]
  19
  20 %include "jsimdext.inc"
  21
  22 ; --------------------------------------------------------------------------
  23         SECTION SEG_CONST
  24 ; Word constants for the fancy upsamplers; each entry fills one 16-byte XMM operand.
  25         alignz  16
  26         global  EXTN(jconst_fancy_upsample_sse2)
  27 ; Exported label placed at the start of the constant table.
  28 EXTN(jconst_fancy_upsample_sse2):
  29 ; Eight identical 16-bit lanes per constant.
  30 PW_ONE:         dw  1, 1, 1, 1, 1, 1, 1, 1      ; h2v1 fancy: bias added to even outputs
  31 PW_TWO:         dw  2, 2, 2, 2, 2, 2, 2, 2      ; h2v1 fancy: bias added to odd outputs
  32 PW_THREE:       dw  3, 3, 3, 3, 3, 3, 3, 3      ; weight of the nearer sample (both filters)
  33 PW_SEVEN:       dw  7, 7, 7, 7, 7, 7, 7, 7      ; h2v2 fancy: bias added to odd outputs
  34 PW_EIGHT:       dw  8, 8, 8, 8, 8, 8, 8, 8      ; h2v2 fancy: bias added to even outputs
  35
  36         alignz  16
  37
  38 ; --------------------------------------------------------------------------
  39         SECTION SEG_TEXT
  40         BITS    64
  41 ;
  42 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  43 ;
  44 ; The upsampling algorithm is linear interpolation between pixel centers,
  45 ; also known as a "triangle filter".  This is a good compromise between
  46 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
  47 ; of the way between input pixel centers.
  48 ;
  49 ; GLOBAL(void)
  50 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
  51 ;                                 JDIMENSION downsampled_width,
  52 ;                                 JSAMPARRAY input_data,
  53 ;                                 JSAMPARRAY * output_data_ptr);
  54 ;
  55 ; Registers as read below, after collect_args (macro from jsimdext.inc):
  56 ; r10 = int max_v_samp_factor           (32-bit value; upper half not trusted)
  57 ; r11 = JDIMENSION downsampled_width    (32-bit value; upper half not trusted)
  58 ; r12 = JSAMPARRAY input_data
  59 ; r13 = JSAMPARRAY * output_data_ptr
  60 ; Per sample s[i]: out[2i] = (3*s[i]+s[i-1]+1)>>2, out[2i+1] = (3*s[i]+s[i+1]+2)>>2
  61         align   16
  62         global  EXTN(jsimd_h2v1_fancy_upsample_sse2)
  63
  64 EXTN(jsimd_h2v1_fancy_upsample_sse2):
  65         push    rbp
  66         mov     rax,rsp         ; rax = rsp after the push (set up ahead of collect_args)
  67         mov     rbp,rsp
  68         collect_args
  69
  70         mov     eax, r11d       ; colctr; zero-extend, bits 63:32 of r11 are unspecified
  71         test    rax,rax
  72         jz      near .return
  73
  74         movsxd  rcx, r10d       ; rowctr; sign-extend the 32-bit int argument
  75         test    rcx,rcx
  76         jz      near .return
  77
  78         mov     rsi, r12        ; input_data
  79         mov     rdi, r13
  80         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
  81 .rowloop:
  82         push    rax                     ; colctr
  83         push    rdi
  84         push    rsi
  85
  86         mov     rsi, JSAMPROW [rsi]     ; inptr
  87         mov     rdi, JSAMPROW [rdi]     ; outptr
  88         ; width not a multiple of SIZEOF_XMMWORD: copy the last sample one slot past the end
  89         test    rax, SIZEOF_XMMWORD-1
  90         jz      short .skip
  91         mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
  92         mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
  93 .skip:
  94         pxor    xmm0,xmm0               ; xmm0=(all 0's)
  95         pcmpeqb xmm7,xmm7               ; xmm7=(all 1's)
  96         psrldq  xmm7,(SIZEOF_XMMWORD-1) ; keep only the lowest byte of the mask
  97         pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm7=( 0 -- -- ... --): left edge
  98         ; round colctr up to a whole number of xmmwords
  99         add     rax, byte SIZEOF_XMMWORD-1
 100         and     rax, byte -SIZEOF_XMMWORD
 101         cmp     rax, byte SIZEOF_XMMWORD
 102         ja      short .columnloop
 103
 104 .columnloop_last:                       ; final block: right neighbour is sample 15 itself
 105         pcmpeqb xmm6,xmm6
 106         pslldq  xmm6,(SIZEOF_XMMWORD-1)
 107         pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm6=(-- -- ... -- 15)
 108         jmp     short .upsample
 109
 110 .columnloop:                            ; inner block: right neighbour comes from next block
 111         movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 112         pslldq  xmm6,(SIZEOF_XMMWORD-1) ; xmm6=(-- -- ... -- 16)
 113
 114 .upsample:
 115         movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 116         movdqa  xmm2,xmm1
 117         movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
 118         pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
 119         psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)
 120
 121         por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
 122         por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)
 123
 124         movdqa  xmm7,xmm1
 125         psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)
 126         ; widen bytes to words (xmm0 supplies the zero high bytes)
 127         movdqa    xmm4,xmm1
 128         punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
 129         punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
 130         movdqa    xmm5,xmm2
 131         punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
 132         punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
 133         movdqa    xmm6,xmm3
 134         punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
 135         punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)
 136
 137         pmullw  xmm1,[rel PW_THREE]     ; 3 * current sample
 138         pmullw  xmm4,[rel PW_THREE]
 139         paddw   xmm2,[rel PW_ONE]       ; left neighbour + 1
 140         paddw   xmm5,[rel PW_ONE]
 141         paddw   xmm3,[rel PW_TWO]       ; right neighbour + 2
 142         paddw   xmm6,[rel PW_TWO]
 143
 144         paddw   xmm2,xmm1
 145         paddw   xmm5,xmm4
 146         psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
 147         psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
 148         paddw   xmm3,xmm1
 149         paddw   xmm6,xmm4
 150         psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
 151         psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
 152         ; interleave: odd results go to the high byte of each word
 153         psllw   xmm3,BYTE_BIT
 154         psllw   xmm6,BYTE_BIT
 155         por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
 156         por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)
 157
 158         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
 159         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
 160
 161         sub     rax, byte SIZEOF_XMMWORD
 162         add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
 163         add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
 164         cmp     rax, byte SIZEOF_XMMWORD
 165         ja      near .columnloop
 166         test    rax,rax                 ; exactly one block left?
 167         jnz     near .columnloop_last
 168
 169         pop     rsi
 170         pop     rdi
 171         pop     rax
 172
 173         add     rsi, byte SIZEOF_JSAMPROW       ; input_data
 174         add     rdi, byte SIZEOF_JSAMPROW       ; output_data
 175         dec     rcx                             ; rowctr
 176         jg      near .rowloop
 177
 178 .return:
 179         uncollect_args
 180         pop     rbp
 181         ret
 182
 183 ; --------------------------------------------------------------------------
 184 ;
 185 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 186 ; Again a triangle filter; see comments for h2v1 case, above.
 187 ; Vertical pass: Int0 = 3*row[0] + row[-1], Int1 = 3*row[0] + row[+1] (16-bit words).
 188 ; GLOBAL(void)
 189 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
 190 ;                                 JDIMENSION downsampled_width,
 191 ;                                 JSAMPARRAY input_data,
 192 ;                                 JSAMPARRAY * output_data_ptr);
 193 ; Horizontal pass: even = (3*Int + left + 8) >> 4, odd = (3*Int + right + 7) >> 4.
 194 ; Registers as read below, after collect_args (macro from jsimdext.inc):
 195 ; r10 = int max_v_samp_factor           (32-bit value; upper half not trusted)
 196 ; r11 = JDIMENSION downsampled_width    (32-bit value; upper half not trusted)
 197 ; r12 = JSAMPARRAY input_data
 198 ; r13 = JSAMPARRAY * output_data_ptr
 199 ; wk(0)/wk(1): left-neighbour carry (upper/lower row); wk(2)/wk(3): right neighbour
 200 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
 201 %define WK_NUM          4
 202
 203         align   16
 204         global  EXTN(jsimd_h2v2_fancy_upsample_sse2)
 205
 206 EXTN(jsimd_h2v2_fancy_upsample_sse2):
 207         push    rbp
 208         mov     rax,rsp                         ; rax = rsp after 'push rbp'
 209         sub     rsp, byte 4                     ; step below the saved rbp before aligning
 210         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
 211         mov     [rsp],rax                       ; keep the unaligned rsp for the epilogue
 212         mov     rbp,rsp                         ; rbp = aligned rbp
 213         lea     rsp, [wk(0)]                    ; reserve wk[0..WK_NUM-1] below rbp
 214         collect_args
 215         push    rbx
 216
 217         mov     eax, r11d       ; colctr; zero-extend, bits 63:32 of r11 are unspecified
 218         test    rax,rax
 219         jz      near .return
 220
 221         movsxd  rcx, r10d       ; rowctr; sign-extend the 32-bit int argument
 222         test    rcx,rcx
 223         jz      near .return
 224
 225         mov     rsi, r12        ; input_data
 226         mov     rdi, r13
 227         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
 228 .rowloop:
 229         push    rax                                     ; colctr
 230         push    rcx
 231         push    rdi
 232         push    rsi
 233
 234         mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
 235         mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
 236         mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
 237         mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
 238         mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
 239         ; width not a multiple of SIZEOF_XMMWORD: pad all three input rows by one sample
 240         test    rax, SIZEOF_XMMWORD-1
 241         jz      short .skip
 242         push    rdx                                     ; dl is used as scratch below
 243         mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
 244         mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
 245         mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
 246         mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
 247         mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
 248         mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
 249         pop     rdx
 250 .skip:
 251         ; -- process the first column block
 252
 253         movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
 254         movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
 255         movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]
 256
 257         pxor      xmm3,xmm3             ; xmm3=(all 0's)
 258         movdqa    xmm4,xmm0
 259         punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
 260         punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
 261         movdqa    xmm5,xmm1
 262         punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
 263         punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
 264         movdqa    xmm6,xmm2
 265         punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
 266         punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
 267
 268         pmullw  xmm0,[rel PW_THREE]
 269         pmullw  xmm4,[rel PW_THREE]
 270
 271         pcmpeqb xmm7,xmm7
 272         psrldq  xmm7,(SIZEOF_XMMWORD-2) ; xmm7=mask selecting the lowest word only
 273
 274         paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
 275         paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
 276         paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
 277         paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
 278
 279         movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
 280         movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
 281         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
 282         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
 283
 284         pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
 285         pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)
 286
 287         movdqa  XMMWORD [wk(0)], xmm1   ; left edge: column 0 is its own left neighbour
 288         movdqa  XMMWORD [wk(1)], xmm2
 289         ; round colctr up to a whole number of xmmwords
 290         add     rax, byte SIZEOF_XMMWORD-1
 291         and     rax, byte -SIZEOF_XMMWORD
 292         cmp     rax, byte SIZEOF_XMMWORD
 293         ja      short .columnloop
 294
 295 .columnloop_last:
 296         ; -- process the last column block
 297
 298         pcmpeqb xmm1,xmm1
 299         pslldq  xmm1,(SIZEOF_XMMWORD-2) ; mask selecting the highest word only
 300         movdqa  xmm2,xmm1
 301
 302         pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
 303         pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
 304
 305         movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
 306         movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
 307
 308         jmp     near .upsample
 309
 310 .columnloop:
 311         ; -- process the next column block
 312
 313         movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
 314         movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
 315         movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]
 316
 317         pxor      xmm3,xmm3             ; xmm3=(all 0's)
 318         movdqa    xmm4,xmm0
 319         punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
 320         punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
 321         movdqa    xmm5,xmm1
 322         punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
 323         punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
 324         movdqa    xmm6,xmm2
 325         punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
 326         punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
 327
 328         pmullw  xmm0,[rel PW_THREE]
 329         pmullw  xmm4,[rel PW_THREE]
 330
 331         paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
 332         paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
 333         paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
 334         paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
 335
 336         movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
 337         movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
 338         movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 339         movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
 340
 341         pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
 342         pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)
 343
 344         movdqa  XMMWORD [wk(2)], xmm1   ; right neighbour = first column of the next block
 345         movdqa  XMMWORD [wk(3)], xmm2
 346
 347 .upsample:
 348         ; -- process the upper row
 349
 350         movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
 351         movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
 352
 353         movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
 354         movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
 355         psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
 356         pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
 357         movdqa  xmm5,xmm7
 358         movdqa  xmm6,xmm3
 359         psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
 360         pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)
 361
 362         por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
 363         por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)
 364
 365         movdqa  xmm1,xmm7
 366         movdqa  xmm2,xmm3
 367         pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
 368         psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
 369         movdqa  xmm4,xmm3
 370         psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)
 371
 372         por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
 373         por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)
 374
 375         movdqa  XMMWORD [wk(0)], xmm4   ; column 15 becomes the next block's left neighbour
 376
 377         pmullw  xmm7,[rel PW_THREE]
 378         pmullw  xmm3,[rel PW_THREE]
 379         paddw   xmm1,[rel PW_EIGHT]
 380         paddw   xmm5,[rel PW_EIGHT]
 381         paddw   xmm0,[rel PW_SEVEN]
 382         paddw   xmm2,[rel PW_SEVEN]
 383
 384         paddw   xmm1,xmm7
 385         paddw   xmm5,xmm3
 386         psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
 387         psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
 388         paddw   xmm0,xmm7
 389         paddw   xmm2,xmm3
 390         psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
 391         psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
 392         ; interleave: odd results go to the high byte of each word
 393         psllw   xmm0,BYTE_BIT
 394         psllw   xmm2,BYTE_BIT
 395         por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
 396         por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)
 397
 398         movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; final bytes replace the saved words
 399         movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
 400
 401         ; -- process the lower row
 402
 403         movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
 404         movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
 405
 406         movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
 407         movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
 408         psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
 409         pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
 410         movdqa  xmm0,xmm6
 411         movdqa  xmm2,xmm4
 412         psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
 413         pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)
 414
 415         por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
 416         por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)
 417
 418         movdqa  xmm1,xmm6
 419         movdqa  xmm5,xmm4
 420         pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
 421         psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
 422         movdqa  xmm3,xmm4
 423         psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)
 424
 425         por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
 426         por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)
 427
 428         movdqa  XMMWORD [wk(1)], xmm3   ; column 15 becomes the next block's left neighbour
 429
 430         pmullw  xmm6,[rel PW_THREE]
 431         pmullw  xmm4,[rel PW_THREE]
 432         paddw   xmm1,[rel PW_EIGHT]
 433         paddw   xmm0,[rel PW_EIGHT]
 434         paddw   xmm7,[rel PW_SEVEN]
 435         paddw   xmm5,[rel PW_SEVEN]
 436
 437         paddw   xmm1,xmm6
 438         paddw   xmm0,xmm4
 439         psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
 440         psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
 441         paddw   xmm7,xmm6
 442         paddw   xmm5,xmm4
 443         psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
 444         psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
 445
 446         psllw   xmm7,BYTE_BIT
 447         psllw   xmm5,BYTE_BIT
 448         por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
 449         por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)
 450
 451         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
 452         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
 453
 454         sub     rax, byte SIZEOF_XMMWORD
 455         add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
 456         add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
 457         add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
 458         add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
 459         add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
 460         cmp     rax, byte SIZEOF_XMMWORD
 461         ja      near .columnloop
 462         test    rax,rax                 ; exactly one block left?
 463         jnz     near .columnloop_last
 464
 465         pop     rsi
 466         pop     rdi
 467         pop     rcx
 468         pop     rax
 469
 470         add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
 471         add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
 472         sub     rcx, byte 2                     ; rowctr
 473         jg      near .rowloop
 474
 475 .return:
 476         pop     rbx
 477         uncollect_args
 478         mov     rsp,rbp         ; rsp <- aligned rbp
 479         pop     rsp             ; rsp <- value stored by the prologue (rsp after 'push rbp')
 480         pop     rbp
 481         ret
 482
 483 ; --------------------------------------------------------------------------
 484 ;
 485 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
 486 ; It's still a box filter.
 487 ; Every input sample is written twice in a row to the output.
 488 ; GLOBAL(void)
 489 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
 490 ;                           JDIMENSION output_width,
 491 ;                           JSAMPARRAY input_data,
 492 ;                           JSAMPARRAY * output_data_ptr);
 493 ;
 494 ; Registers as read below, after collect_args (macro from jsimdext.inc):
 495 ; r10 = int max_v_samp_factor           (32-bit value; upper half not trusted)
 496 ; r11 = JDIMENSION output_width         (32-bit value; upper half not trusted)
 497 ; r12 = JSAMPARRAY input_data
 498 ; r13 = JSAMPARRAY * output_data_ptr
 499
 500         align   16
 501         global  EXTN(jsimd_h2v1_upsample_sse2)
 502
 503 EXTN(jsimd_h2v1_upsample_sse2):
 504         push    rbp
 505         mov     rax,rsp         ; rax = rsp after the push (set up ahead of collect_args)
 506         mov     rbp,rsp
 507         collect_args
 508
 509         mov     edx, r11d       ; zero-extend, bits 63:32 of r11 are unspecified
 510         add     rdx, byte (2*SIZEOF_XMMWORD)-1
 511         and     rdx, byte -(2*SIZEOF_XMMWORD)   ; rdx = output width rounded up to 2 xmmwords
 512         jz      near .return
 513
 514         movsxd  rcx, r10d       ; rowctr; sign-extend the 32-bit int argument
 515         test    rcx,rcx
 516         jz      short .return
 517
 518         mov     rsi, r12 ; input_data
 519         mov     rdi, r13
 520         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
 521 .rowloop:
 522         push    rdi
 523         push    rsi
 524
 525         mov     rsi, JSAMPROW [rsi]             ; inptr
 526         mov     rdi, JSAMPROW [rdi]             ; outptr
 527         mov     rax,rdx                         ; colctr
 528 .columnloop:
 529         ; first half: 1 input xmmword -> 2 output xmmwords
 530         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 531
 532         movdqa    xmm1,xmm0
 533         punpcklbw xmm0,xmm0             ; duplicate each of the low 8 samples
 534         punpckhbw xmm1,xmm1             ; duplicate each of the high 8 samples
 535
 536         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 537         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
 538
 539         sub     rax, byte 2*SIZEOF_XMMWORD
 540         jz      short .nextrow
 541         ; second half: same operation on the following input xmmword
 542         movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 543
 544         movdqa    xmm3,xmm2
 545         punpcklbw xmm2,xmm2
 546         punpckhbw xmm3,xmm3
 547
 548         movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 549         movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
 550
 551         sub     rax, byte 2*SIZEOF_XMMWORD
 552         jz      short .nextrow
 553
 554         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
 555         add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
 556         jmp     short .columnloop
 557
 558 .nextrow:
 559         pop     rsi
 560         pop     rdi
 561
 562         add     rsi, byte SIZEOF_JSAMPROW       ; input_data
 563         add     rdi, byte SIZEOF_JSAMPROW       ; output_data
 564         dec     rcx                             ; rowctr
 565         jg      short .rowloop
 566
 567 .return:
 568         uncollect_args
 569         pop     rbp
 570         ret
 571
 572 ; --------------------------------------------------------------------------
 573 ;
 574 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
 575 ; It's still a box filter.
 576 ; Every input sample is written twice in a row, into two output rows.
 577 ; GLOBAL(void)
 578 ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
 579 ;                           JDIMENSION output_width,
 580 ;                           JSAMPARRAY input_data,
 581 ;                           JSAMPARRAY * output_data_ptr);
 582 ;
 583 ; Registers as read below, after collect_args (macro from jsimdext.inc):
 584 ; r10 = int max_v_samp_factor           (32-bit value; upper half not trusted)
 585 ; r11 = JDIMENSION output_width         (32-bit value; upper half not trusted)
 586 ; r12 = JSAMPARRAY input_data
 587 ; r13 = JSAMPARRAY * output_data_ptr
 588
 589         align   16
 590         global  EXTN(jsimd_h2v2_upsample_sse2)
 591
 592 EXTN(jsimd_h2v2_upsample_sse2):
 593         push    rbp
 594         mov     rax,rsp         ; rax = rsp after the push (set up ahead of collect_args)
 595         mov     rbp,rsp
 596         collect_args
 597         push    rbx             ; rbx is used as outptr0 below
 598
 599         mov     edx, r11d       ; zero-extend, bits 63:32 of r11 are unspecified
 600         add     rdx, byte (2*SIZEOF_XMMWORD)-1
 601         and     rdx, byte -(2*SIZEOF_XMMWORD)   ; rdx = output width rounded up to 2 xmmwords
 602         jz      near .return
 603
 604         movsxd  rcx, r10d       ; rowctr; sign-extend the 32-bit int argument
 605         test    rcx,rcx
 606         jz      near .return
 607
 608         mov     rsi, r12        ; input_data
 609         mov     rdi, r13
 610         mov     rdi, JSAMPARRAY [rdi]                   ; output_data
 611 .rowloop:
 612         push    rdi
 613         push    rsi
 614
 615         mov     rsi, JSAMPROW [rsi]                     ; inptr
 616         mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
 617         mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
 618         mov     rax,rdx                                 ; colctr
 619 .columnloop:
 620         ; first half: 1 input xmmword -> 2 output xmmwords in each of the two rows
 621         movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 622
 623         movdqa    xmm1,xmm0
 624         punpcklbw xmm0,xmm0             ; duplicate each of the low 8 samples
 625         punpckhbw xmm1,xmm1             ; duplicate each of the high 8 samples
 626
 627         movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
 628         movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
 629         movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 630         movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
 631
 632         sub     rax, byte 2*SIZEOF_XMMWORD
 633         jz      short .nextrow
 634         ; second half: same operation on the following input xmmword
 635         movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 636
 637         movdqa    xmm3,xmm2
 638         punpcklbw xmm2,xmm2
 639         punpckhbw xmm3,xmm3
 640
 641         movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
 642         movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
 643         movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 644         movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
 645
 646         sub     rax, byte 2*SIZEOF_XMMWORD
 647         jz      short .nextrow
 648
 649         add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
 650         add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
 651         add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
 652         jmp     short .columnloop
 653
 654 .nextrow:
 655         pop     rsi
 656         pop     rdi
 657
 658         add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
 659         add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
 660         sub     rcx, byte 2                     ; rowctr
 661         jg      near .rowloop
 662
 663 .return:
 664         pop     rbx
 665         uncollect_args
 666         pop     rbp
 667         ret
 668
 669 ; For some reason, the OS X linker does not honor the request to align the
 670 ; segment unless we do this.
 671         align   16