jpeg/simd/jcqnts2i.asm

   1 ;
   2 ; jcqnts2i.asm - sample data conversion and quantization (SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ;
   6 ; Based on
   7 ; x86 SIMD extension for IJG JPEG library
   8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
   9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10 ;
  11 ; This file should be assembled with NASM (Netwide Assembler); it
  12 ; can *not* be assembled with Microsoft's MASM or any compatible
  13 ; assembler (including Borland's Turbo Assembler).
  14 ; NASM is available from http://nasm.sourceforge.net/ or
  15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16 ;
  17 ; [TAB8]
  18
  19 %include "jsimdext.inc"
  20 %include "jdct.inc"
  21
  22 ; --------------------------------------------------------------------------
  23         SECTION SEG_TEXT
  24         BITS    32
  25 ;
  26 ; Load data into workspace, applying unsigned->signed conversion
  27 ;
  28 ; GLOBAL(void)
  29 ; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
  30 ;                      DCTELEM * workspace);
  31 ;
     ; For each row pointer taken from sample_data[], the 8 sample bytes
     ; starting at column start_col are loaded, zero-extended to 16-bit
     ; words, offset by adding 0xFF80 (-128) to every word, and written to
     ; workspace with one 16-byte store per row.  Each loop pass handles
     ; 4 rows; the pass count is DCTSIZE/4.
     ;
     ; Calling convention: 32-bit code, arguments are read from the stack
     ; through ebp (see the %defines below).
     ; The stores use movdqa, so the addressed workspace rows must be
     ; 16-byte aligned.
     ; Saved/restored: ebx, esi, edi (and ebp).
     ; Overwritten:    eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags.
     ;
     ; NOTE(review): XMMBLOCK, EXTN, alignx and the SIZEOF_* / DCTSIZE
     ; constants come from the included files and are not visible here;
     ; comments below that depend on them are hedged accordingly.
     ;
  32
  33 %define sample_data     ebp+8           ; JSAMPARRAY sample_data
  34 %define start_col       ebp+12          ; JDIMENSION start_col
  35 %define workspace       ebp+16          ; DCTELEM * workspace
  36
  37         align   16
  38         global  EXTN(jsimd_convsamp_sse2)
  39
  40 EXTN(jsimd_convsamp_sse2):
  41         push    ebp
  42         mov     ebp,esp                 ; arguments are addressed as ebp+8, +12, +16
  43         push    ebx                     ; ebx is used for row pointers below
  44 ;       push    ecx             ; need not be preserved
  45 ;       push    edx             ; need not be preserved
  46         push    esi
  47         push    edi
  48
     ; Loop-invariant constants.
  49         pxor    xmm6,xmm6               ; xmm6=(all 0's), high bytes for the byte->word unpack
  50         pcmpeqw xmm7,xmm7               ; xmm7=(all 1's)
  51         psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}  (-128 in every word)
  52
  53         mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
  54         mov     eax, JDIMENSION [start_col]     ; eax = column index applied to every row
  55         mov     edi, POINTER [workspace]        ; (DCTELEM *)
  56         mov     ecx, DCTSIZE/4                  ; ecx = passes remaining (4 rows per pass)
  57         alignx  16,7                            ; presumably pads so .convloop is 16-byte aligned -- TODO confirm macro
  58 .convloop:
     ; Rows 0 and 1 of this pass: fetch the row pointers, then 8 bytes each.
  59         mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
  60         mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
  61
  62         movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
  63         movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
  64
     ; Rows 2 and 3: ebx/edx are reused now that rows 0/1 have been loaded.
  65         mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
  66         mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
  67
  68         movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
  69         movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
  70
     ; Widen each byte to a word (interleave with zero), then add -128.
  71         punpcklbw xmm0,xmm6             ; xmm0=(01234567)
  72         punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
  73         paddw     xmm0,xmm7             ; xmm0 += 0xFF80 per word (sample - 128)
  74         paddw     xmm1,xmm7             ; xmm1 += 0xFF80 per word
  75         punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
  76         punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
  77         paddw     xmm2,xmm7             ; xmm2 += 0xFF80 per word
  78         paddw     xmm3,xmm7             ; xmm3 += 0xFF80 per word
  79
     ; Aligned 16-byte stores of the four converted rows (XMMBLOCK rows 0..3
     ; relative to edi; exact addressing is defined by the XMMBLOCK macro).
  80         movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
  81         movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
  82         movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
  83         movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
  84
  85         add     esi, byte 4*SIZEOF_JSAMPROW             ; step past the 4 row pointers just used
  86         add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; step workspace by 4 rows of DCTSIZE elements (imm8 encoding)
  87         dec     ecx
  88         jnz     short .convloop                         ; repeat until all DCTSIZE/4 passes are done
  89
  90         pop     edi
  91         pop     esi
  92 ;       pop     edx             ; need not be preserved
  93 ;       pop     ecx             ; need not be preserved
  94         pop     ebx
  95         pop     ebp
  96         ret
  97
  98 ; --------------------------------------------------------------------------
  99 ;
 100 ; Quantize/descale the coefficients, and store into coef_block
 101 ;
 102 ; This implementation is based on an algorithm described in
 103 ;   "How to optimize for the Pentium family of microprocessors"
 104 ;   (http://www.agner.org/assem/).
 105 ;
 106 ; GLOBAL(void)
 107 ; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
 108 ;                      DCTELEM * workspace);
 109 ;
     ; Per 16-bit word w read from workspace, the loop computes
     ;     t = |w| + correction
     ;     t = high 16 bits of (t * reciprocal)      (unsigned, pmulhuw)
     ;     t = high 16 bits of (t * scale)           (unsigned, pmulhuw)
     ; and stores t with the sign of w restored into coef_block.
     ; correction, reciprocal and scale are read from divisors through the
     ; CORRECTION / RECIPROCAL / SCALE macros below.
     ;
     ; Each loop pass handles 4 XMM words (the pointers advance by 32
     ; elements); the pass count is DCTSIZE2/32.
     ;
     ; Calling convention: 32-bit code, arguments are read from the stack
     ; through ebp (see the %defines below).
     ; workspace and coef_block are accessed with movdqa, and divisors is
     ; used as a direct memory operand of paddw/pmulhuw, so all addressed
     ; locations must be 16-byte aligned.
     ; Saved/restored: esi, edi (and ebp).
     ; Overwritten:    eax, edx, xmm0-xmm7, flags.  ebx and ecx are not used.
     ;
 110
     ; Table selectors: XMMBLOCK rows DCTSIZE*0+m, DCTSIZE*1+m and DCTSIZE*2+m
     ; relative to base b.  Presumably three DCTSIZE-row tables stored back to
     ; back in divisors -- confirm against XMMBLOCK in the include file.
 111 %define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
 112 %define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
 113 %define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
 114
 115 %define coef_block      ebp+8           ; JCOEFPTR coef_block
 116 %define divisors        ebp+12          ; DCTELEM * divisors
 117 %define workspace       ebp+16          ; DCTELEM * workspace
 118
 119         align   16
 120         global  EXTN(jsimd_quantize_sse2)
 121
 122 EXTN(jsimd_quantize_sse2):
 123         push    ebp
 124         mov     ebp,esp                 ; arguments are addressed as ebp+8, +12, +16
 125 ;       push    ebx             ; unused
 126 ;       push    ecx             ; unused
 127 ;       push    edx             ; need not be preserved
 128         push    esi
 129         push    edi
 130
 131         mov     esi, POINTER [workspace]        ; esi = source coefficients
 132         mov     edx, POINTER [divisors]         ; edx = base for RECIPROCAL/CORRECTION/SCALE
 133         mov     edi, JCOEFPTR [coef_block]      ; edi = destination
 134         mov     eax, DCTSIZE2/32                ; eax = passes remaining (32 elements per pass)
 135         alignx  16,7                            ; presumably pads so .quantloop is 16-byte aligned -- TODO confirm macro
 136 .quantloop:
     ; Load 4 XMM words of coefficients; xmm4-7 will become the sign masks,
     ; xmm0-3 the working copies.
 137         movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
 138         movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
 139         movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
 140         movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
 141         movdqa  xmm0,xmm4
 142         movdqa  xmm1,xmm5
 143         movdqa  xmm2,xmm6
 144         movdqa  xmm3,xmm7
     ; Arithmetic shift by WORD_BIT-1 replicates the sign bit: each word
     ; becomes 0 or all 1's (assuming WORD_BIT is the word width, 16).
 145         psraw   xmm4,(WORD_BIT-1)
 146         psraw   xmm5,(WORD_BIT-1)
 147         psraw   xmm6,(WORD_BIT-1)
 148         psraw   xmm7,(WORD_BIT-1)
     ; (x XOR mask) - mask negates x where mask is all 1's, else leaves it.
 149         pxor    xmm0,xmm4
 150         pxor    xmm1,xmm5
 151         pxor    xmm2,xmm6
 152         pxor    xmm3,xmm7
 153         psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
 154         psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
 155         psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
 156         psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
 157
     ; Magnitude pipeline: add, then two unsigned multiply-high steps.
 158         paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
 159         paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
 160         paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
 161         paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]
 162         pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal (keeps high 16 bits of the product)
 163         pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
 164         pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
 165         pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
 166         pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]  ; scale (keeps high 16 bits of the product)
 167         pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
 168         pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
 169         pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
 170
     ; Re-apply the original signs with the same XOR/subtract trick.
 171         pxor    xmm0,xmm4
 172         pxor    xmm1,xmm5
 173         pxor    xmm2,xmm6
 174         pxor    xmm3,xmm7
 175         psubw   xmm0,xmm4
 176         psubw   xmm1,xmm5
 177         psubw   xmm2,xmm6
 178         psubw   xmm3,xmm7
     ; NOTE(review): these stores index with SIZEOF_DCTELEM while edi is
     ; advanced in SIZEOF_JCOEF units below; presumably the two sizes are
     ; equal -- confirm in the include files.
 179         movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
 180         movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
 181         movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
 182         movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
 183
     ; All three pointers move forward by 32 elements ("byte" requests the
     ; 8-bit immediate encoding, so each stride must fit in a signed byte).
 184         add     esi, byte 32*SIZEOF_DCTELEM
 185         add     edx, byte 32*SIZEOF_DCTELEM
 186         add     edi, byte 32*SIZEOF_JCOEF
 187         dec     eax
 188         jnz     near .quantloop         ; near form requested explicitly; presumably the body is beyond short-branch range
 189
 190         pop     edi
 191         pop     esi
 192 ;       pop     edx             ; need not be preserved
 193 ;       pop     ecx             ; unused
 194 ;       pop     ebx             ; unused
 195         pop     ebp
 196         ret
 197
 198 ; For some reason, the OS X linker does not honor the request to align the
 199 ; segment unless we do this.
 200         align   16