; jcqnts2i.asm - sample data conversion and quantization (SSE2)
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
19 %include "jsimdext.inc"
; --------------------------------------------------------------------------
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                      DCTELEM * workspace);
;
; ABI:   32-bit, stack-passed arguments addressed through ebp (cdecl style).
; In:    sample_data = table of row pointers; DCTSIZE rows are consumed
;        start_col   = index of the first sample to read in every row
;        workspace   = destination block; written with movdqa, so it must
;                      be 16-byte aligned
; Out:   workspace receives each sample widened to 16 bits with the
;        0xFF80 (-128) bias added.
; Clobb: eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags
;        (ebx, esi, edi and ebp are saved and restored)
; Notes: Each movq fetches 8 samples from a row, and each pass of the loop
;        handles 4 rows, so the loop runs DCTSIZE/4 times.
;        NOTE(review): this presumes DCTSIZE == 8 and 8-bit JSAMPLEs -
;        confirm against the project's configuration include.
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; DCTELEM * workspace

        align   16
        global  EXTN(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
        push    ebp
        mov     ebp, esp                ; arguments are addressed via ebp
        push    ebx
;       push    ecx                     ; need not be preserved
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        pxor    xmm6, xmm6              ; xmm6=(all 0's)
        pcmpeqw xmm7, xmm7              ; xmm7=(all 1's); must precede the shift
        psllw   xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4                  ; 4 rows per iteration

.convloop:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)

        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
        movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)

        ; Zero-extend bytes to words, then add the bias to make them signed.
        punpcklbw xmm0, xmm6            ; xmm0=(01234567)
        punpcklbw xmm1, xmm6            ; xmm1=(89ABCDEF)
        paddw     xmm0, xmm7
        paddw     xmm1, xmm7
        punpcklbw xmm2, xmm6            ; xmm2=(GHIJKLMN)
        punpcklbw xmm3, xmm6            ; xmm3=(OPQRSTUV)
        paddw     xmm2, xmm7
        paddw     xmm3, xmm7

        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        add     esi, byte 4*SIZEOF_JSAMPROW             ; next 4 row pointers
        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; next 4 output rows
        dec     ecx
        jnz     short .convloop

        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
        pop     ebx
        pop     ebp
        ret
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
;                      DCTELEM * workspace);
;
; ABI:   32-bit, stack-passed arguments addressed through ebp (cdecl style).
; In:    workspace  = input coefficients (read with movdqa: 16-byte aligned)
;        divisors   = three consecutive tables selected by the RECIPROCAL,
;                     CORRECTION and SCALE macros below; used as SSE memory
;                     operands, so they must be 16-byte aligned as well
;        coef_block = output coefficients (written with movdqa: 16-byte
;                     aligned)
; Clobb: eax, edx, xmm0-xmm7, flags (esi, edi and ebp are saved and restored)
;
; Per 16-bit lane, for DCTSIZE*DCTSIZE coefficients, 32 at a time:
;   sign = x >> 15 (arithmetic)         ; 0x0000 or 0xFFFF
;   a    = (x ^ sign) - sign            ; |x|
;   a    = hi16((a + correction) * reciprocal)
;   a    = hi16(a * scale)
;   out  = (a ^ sign) - sign            ; sign of x re-applied
;
; NOTE(review): the table offsets assume the first XMMBLOCK argument is a
; row index - confirm against the include file that defines XMMBLOCK.
;

%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; DCTELEM * divisors
%define workspace       ebp+16          ; DCTELEM * workspace

        align   16
        global  EXTN(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
        push    ebp
        mov     ebp, esp                ; arguments are addressed via ebp
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     eax, DCTSIZE*DCTSIZE/32         ; 32 coefficients per iteration

.quantloop:
        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]

        ; Keep a working copy in xmm0-3; xmm4-7 become the sign masks.
        movdqa  xmm0, xmm4
        movdqa  xmm1, xmm5
        movdqa  xmm2, xmm6
        movdqa  xmm3, xmm7
        psraw   xmm4, (WORD_BIT-1)      ; 0xFFFF where negative, else 0
        psraw   xmm5, (WORD_BIT-1)
        psraw   xmm6, (WORD_BIT-1)
        psraw   xmm7, (WORD_BIT-1)
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        pxor    xmm3, xmm7
        psubw   xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
        psubw   xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
        psubw   xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
        psubw   xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

        paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]     ; correction + roundfactor
        paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
        paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
        paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]     ; reciprocal
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
        pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]          ; scale
        pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]

        ; Re-apply the original sign to the unsigned quotient.
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        pxor    xmm3, xmm7
        psubw   xmm0, xmm4
        psubw   xmm1, xmm5
        psubw   xmm2, xmm6
        psubw   xmm3, xmm7

        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        add     esi, byte 32*SIZEOF_DCTELEM
        add     edx, byte 32*SIZEOF_DCTELEM
        add     edi, byte 32*SIZEOF_JCOEF
        dec     eax
        jnz     near .quantloop         ; loop body exceeds short-jump range

        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
        pop     ebp
        ret
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.