; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
19 %include "jsimdext.inc"
; --------------------------------------------------------------------------
;
; Load data into workspace, applying unsigned->signed conversion
;
; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT * workspace);
;
; Stack arguments (addressed relative to ebp through the %defines below):
;   sample_data  array of row pointers to the input samples
;   start_col    column offset applied to every row
;   workspace    destination for the converted FAST_FLOAT values
;
; Register roles in the code below:
;   esi = pointer to the current pair of row pointers
;   eax = start_col
;   edi = current output position in workspace
;   ebx, edx = the two sample rows being converted
;   mm7 = per-byte bias subtracted from the samples
;
; One pass reads 8 bytes from each of two rows, subtracts the bias, widens
; every byte to a sign-extended dword, converts to float and stores four
; XMM words to workspace.
;
; NOTE(review): this text was recovered from a numbered listing whose line
; numbers skip in several places.  The gaps are marked below; no instructions
; have been invented for them.  Compare against the complete file before
; assembling.
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        global  EXTN(jsimd_convsamp_float_sse)

EXTN(jsimd_convsamp_float_sse):
        ; NOTE(review): listing lines 41-43 are absent.  Nothing visible sets
        ; up ebp, yet the argument loads below use [ebp+N] - confirm the
        ; prologue.
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        ; NOTE(review): listing lines 46-50 are absent.  mm7 is not
        ; initialised by any visible instruction before the pack below -
        ; confirm how the bias constant is generated.

        packsswb mm7,mm7                ; mm7 = PB_CENTERJSAMPLE (0x808080..)

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
        ; NOTE(review): listing lines 56-58 are absent (presumably loop
        ; counter setup and the loop label - confirm).

        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; Remove the bias so each byte becomes a signed sample.
        psubb   mm0,mm7                 ; mm0=(01234567)
        psubb   mm1,mm7                 ; mm1=(89ABCDEF)

        ; Unpacking with the data as the *source* operand puts each sample in
        ; the high byte of a word; the low byte (shown as '*') is leftover
        ; destination content and is discarded by the shifts further down.
        punpcklbw mm2,mm0               ; mm2=(*0*1*2*3)
        punpckhbw mm0,mm0               ; mm0=(*4*5*6*7)
        punpcklbw mm3,mm1               ; mm3=(*8*9*A*B)
        punpckhbw mm1,mm1               ; mm1=(*C*D*E*F)

        ; Same trick one level up: the sample ends in the top byte of a dword.
        punpcklwd mm4,mm2               ; mm4=(***0***1)
        punpckhwd mm2,mm2               ; mm2=(***2***3)
        punpcklwd mm5,mm0               ; mm5=(***4***5)
        punpckhwd mm0,mm0               ; mm0=(***6***7)

        ; The arithmetic right shift moves the top byte down and sign-extends
        ; it; cvtpi2ps then converts the two dwords to floats in the low half
        ; of the XMM register.
        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(01)
        psrad     mm2,(DWORD_BIT-BYTE_BIT)      ; mm2=(23)
        cvtpi2ps  xmm0,mm4                      ; xmm0=(01**)
        cvtpi2ps  xmm1,mm2                      ; xmm1=(23**)
        psrad     mm5,(DWORD_BIT-BYTE_BIT)      ; mm5=(45)
        psrad     mm0,(DWORD_BIT-BYTE_BIT)      ; mm0=(67)
        cvtpi2ps  xmm2,mm5                      ; xmm2=(45**)
        cvtpi2ps  xmm3,mm0                      ; xmm3=(67**)

        ; Second row: mm4 is reused now that its first-row value is converted.
        punpcklwd mm6,mm3               ; mm6=(***8***9)
        punpckhwd mm3,mm3               ; mm3=(***A***B)
        punpcklwd mm4,mm1               ; mm4=(***C***D)
        punpckhwd mm1,mm1               ; mm1=(***E***F)

        psrad     mm6,(DWORD_BIT-BYTE_BIT)      ; mm6=(89)
        psrad     mm3,(DWORD_BIT-BYTE_BIT)      ; mm3=(AB)
        cvtpi2ps  xmm4,mm6                      ; xmm4=(89**)
        cvtpi2ps  xmm5,mm3                      ; xmm5=(AB**)
        psrad     mm4,(DWORD_BIT-BYTE_BIT)      ; mm4=(CD)
        psrad     mm1,(DWORD_BIT-BYTE_BIT)      ; mm1=(EF)
        cvtpi2ps  xmm6,mm4                      ; xmm6=(CD**)
        cvtpi2ps  xmm7,mm1                      ; xmm7=(EF**)

        ; Merge the two-float halves into full four-float vectors.
        movlhps   xmm0,xmm1             ; xmm0=(0123)
        movlhps   xmm2,xmm3             ; xmm2=(4567)
        movlhps   xmm4,xmm5             ; xmm4=(89AB)
        movlhps   xmm6,xmm7             ; xmm6=(CDEF)

        ; movaps requires a 16-byte-aligned address, so workspace is
        ; presumably 16-byte aligned - confirm against XMMBLOCK and the caller.
        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

        ; Advance to the next two input rows / two output rows.
        add     esi, byte 2*SIZEOF_JSAMPROW
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        ; NOTE(review): listing lines 113-115 are absent.  The pointer
        ; advances above imply a loop, but no counter update or branch is
        ; visible - confirm.

        emms            ; empty MMX state

        ; NOTE(review): listing lines 117-119 are absent (presumably register
        ; restores - confirm).
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        ; NOTE(review): listing lines 122-126 are absent.  No epilogue or ret
        ; is visible for this function - confirm before assembling.
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                           FAST_FLOAT * workspace);
;
; Stack arguments (addressed relative to ebp through the %defines below):
;   coef_block   destination for the quantized coefficients
;   divisors     per-coefficient factors; the code multiplies by them
;   workspace    input FAST_FLOAT values
;
; Register roles in the code below:
;   esi = current read position in workspace
;   edx = current read position in divisors
;   edi = current write position in coef_block
;
; One pass multiplies four XMM words from workspace element-wise by the
; matching four from divisors, then stores four MMX words to coef_block.
;
; NOTE(review): this text was recovered from a numbered listing whose line
; numbers skip in several places.  The gaps are marked below; no instructions
; have been invented for them.  Compare against the complete file before
; assembling.
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT * divisors
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        global  EXTN(jsimd_quantize_float_sse)

EXTN(jsimd_quantize_float_sse):
        ; NOTE(review): listing lines 144-147 are absent.  Nothing visible
        ; sets up ebp, yet the argument loads below use [ebp+N] - confirm the
        ; prologue.
;       push    edx             ; need not be preserved
        ; NOTE(review): listing lines 149-151 are absent (presumably register
        ; saves - confirm).

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        ; NOTE(review): listing lines 155-157 are absent (presumably loop
        ; counter setup and the loop label - confirm).

        ; movaps/mulps with memory operands require 16-byte-aligned
        ; addresses, so workspace and divisors are presumably 16-byte
        ; aligned - confirm against XMMBLOCK and the callers.
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

        ; NOTE(review): listing lines 166-187 are absent.  The products are
        ; in xmm0-xmm3, but the stores below read mm0-mm3 and no visible
        ; instruction converts or moves the data between them.  The
        ; float->integer conversion must be restored from the complete file.

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

        ; Advance all three pointers by 16 elements.
        add     esi, byte 16*SIZEOF_FAST_FLOAT
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     edi, byte 16*SIZEOF_JCOEF
        ; NOTE(review): listing lines 196-198 are absent.  The pointer
        ; advances above imply a loop, but no counter update or branch is
        ; visible - confirm.

        emms            ; empty MMX state

        ; NOTE(review): listing lines 200-202 are absent (presumably register
        ; restores - confirm).
;       pop     edx             ; need not be preserved
        ; NOTE(review): listing lines 204-208 are absent.  No epilogue or ret
        ; is visible for this function - confirm before assembling.
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.