; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see jidctflt.c for more details.
23 %include "jsimdext.inc"
26 ; --------------------------------------------------------------------------
; Constant table used by jsimd_idct_float_3dnow.  The code addresses every
; entry through GOTOFF(ebx, <label>).
;
; Each PD_* entry holds the same single-precision value twice, so one
; 64-bit MMX register load yields a pair of identical floats.
;
;   PD_1_414         sqrt(2)
;   PD_1_847         2 * cos(pi/8)
;   PD_1_082         2 * (cos(pi/8) - cos(3*pi/8))
;   PD_2_613         2 * (cos(pi/8) + cos(3*pi/8))
;   PD_RNDINT_MAGIC  1.5 * 2^26.  Floats in [2^26, 2^27) have a mantissa
;                    step of 8, so adding this constant leaves
;                    round(x / 8) in the low mantissa bits.
;   PB_CENTERJSAMP   eight bytes of CENTERJSAMPLE (defined outside this file).

        global  EXTN(jconst_idct_float_3dnow)

EXTN(jconst_idct_float_3dnow):

PD_1_414        times 2 dd  1.414213562373095048801689
PD_1_847        times 2 dd  1.847759065022573512256366
PD_1_082        times 2 dd  1.082392200292393968799446
PD_2_613        times 2 dd  2.613125929752753055713286
PD_RNDINT_MAGIC times 2 dd  100663296.0     ; (float)(0x00C00000 << 3)
PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE
43 ; --------------------------------------------------------------------------
; Perform dequantization and inverse DCT on one block of coefficients.
;
; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block,
;                         JSAMPARRAY output_buf, JDIMENSION output_col)
; Stack-frame layout macros for jsimd_idct_float_3dnow.
;
; The four argument macros take a base register (b) and yield the address
; expression of the corresponding argument relative to it.
%define dct_table(b)    (b)+8           ; void * dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

; Locals live below the aligned frame pointer kept in ebp:
;   original_ebp  slot at [ebp+0]
;   wk(i)         i-th MMWORD scratch slot, WK_NUM slots directly below ebp
;   workspace     DCTSIZE2 FAST_FLOAT values directly below wk(0)
;
; NOTE(review): WK_NUM is not defined anywhere in the visible text of this
; file.  The function only uses wk(0) and wk(1), so it must be at least 2;
; confirm where it is defined before assembling.
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]
; --------------------------------------------------------------------------
; jsimd_idct_float_3dnow
;
; Dequantize one block of coefficients and apply the floating-point inverse
; DCT, using MMX registers and 3DNow! packed-float instructions.
;
; Arguments are read from the stack through the dct_table(), coef_block(),
; output_buf() and output_col() offset macros.
;
;   Pass 1  reads coefficients through esi and multipliers through edx,
;           and stores FAST_FLOAT results, transposed, into the stack
;           workspace through edi.
;   Pass 2  reads the workspace through esi, rounds and packs the results,
;           and stores them through the row pointers found at edi, at byte
;           offset eax (output_col) within each row.
;
; Each statement below is one logical line: macro invocations that had been
; split across several lines are rejoined, and stray listing numbers at the
; start of lines are removed.  The order of instructions is unchanged.
;
; NOTE(review): the instruction stream in this file is incomplete.  It is
; not self-consistent as it stands:
;   * nothing stores the incoming esp/ebp into the aligned frame, yet Pass 2
;     reads [original_ebp] and the epilogue executes `pop esp`;
;   * the +8.. argument offsets imply a pushed ebp that is not present;
;   * ecx is loaded with a loop count twice but is never decremented or
;     tested, and there are no loop labels or branches;
;   * the OR results of the zero-column test are never branched on;
;   * mm1 (zero-column path) and mm4..mm7 (both even parts) are consumed
;     before any visible instruction writes them;
;   * no integer-to-float conversion precedes the pfmul dequantization;
;   * psrld is used on mm3/mm4 to form a mask, but no all-ones value is
;     loaded into them first;
;   * PB_CENTERJSAMP is loaded into mm2 and then never used;
;   * callee-saved ebx/esi/edi are modified without being saved;
;   * there is no ret.
; Restore the missing instructions from the upstream source before
; assembling.  They are deliberately not reconstructed here.
; --------------------------------------------------------------------------

        global  EXTN(jsimd_idct_float_3dnow)

EXTN(jsimd_idct_float_3dnow):

        mov     eax, esp                        ; eax = esp on entry (upstream comment: "original ebp")
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     ebp, esp                        ; ebp = aligned ebp
;       push    ecx                             ; need not be preserved
;       push    edx                             ; need not be preserved

        get_GOT ebx                             ; get GOT address

        ; ---- Pass 1: process columns from input, store into work array.

;       mov     eax, [original_ebp]
        mov     edx, POINTER [dct_table(eax)]   ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)] ; inptr
        lea     edi, [workspace]                ; FAST_FLOAT * wsptr
        mov     ecx, DCTSIZE/2                  ; ctr

%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
        ; OR together rows 1..7 of the current dword column pair.
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]

        pushpic ebx                             ; save GOT address
        mov     ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
        mov     eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
        or      ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
        or      ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
        poppic  ebx                             ; restore GOT address

        ; -- AC terms all zero

        movd    mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]

        psrad   mm0, (DWORD_BIT-WORD_BIT)

        pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        ; Rows 0 and 1 of the workspace are filled with mm0 and mm1.
        ; NOTE(review): mm1 is not written by any visible instruction.
        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
%endif  ; NOTE(review): no %endif was visible for the %ifndef above; it is
        ; closed here, at the end of the zero-column path.  Confirm placement.

        ; Even-numbered input rows 0, 2, 4, 6.
        movd    mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movd    mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movd    mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movd    mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]

        psrad   mm0, (DWORD_BIT-WORD_BIT)
        psrad   mm1, (DWORD_BIT-WORD_BIT)

        pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        pfmul   mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        psrad   mm2, (DWORD_BIT-WORD_BIT)
        psrad   mm3, (DWORD_BIT-WORD_BIT)

        pfmul   mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        pfmul   mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        ; NOTE(review): mm4 and mm5 are accumulated into below without a
        ; visible initialising copy; the same applies to mm6/mm7 further on.
        pfsub   mm0, mm2                        ; mm0=tmp11
        pfadd   mm4, mm2                        ; mm4=tmp10
        pfadd   mm5, mm3                        ; mm5=tmp13

        pfmul   mm1, [GOTOFF(ebx,PD_1_414)]
        pfsub   mm1, mm5                        ; mm1=tmp12

        pfsub   mm4, mm5                        ; mm4=tmp3
        pfsub   mm0, mm1                        ; mm0=tmp2
        pfadd   mm6, mm5                        ; mm6=tmp0
        pfadd   mm7, mm1                        ; mm7=tmp1

        ; Spill tmp3/tmp2 to the two scratch slots; reloaded in the output stage.
        movq    MMWORD [wk(1)], mm4             ; tmp3
        movq    MMWORD [wk(0)], mm0             ; tmp2

        ; Odd-numbered input rows 1, 3, 5, 7.
        movd    mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movd    mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movd    mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movd    mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]

        psrad   mm2, (DWORD_BIT-WORD_BIT)
        psrad   mm3, (DWORD_BIT-WORD_BIT)

        pfmul   mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        pfmul   mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        psrad   mm5, (DWORD_BIT-WORD_BIT)
        psrad   mm1, (DWORD_BIT-WORD_BIT)

        pfmul   mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        pfmul   mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        pfadd   mm2, mm1                        ; mm2=z11
        pfadd   mm5, mm3                        ; mm5=z13
        pfsub   mm4, mm1                        ; mm4=z12
        pfsub   mm0, mm3                        ; mm0=z10

        pfadd   mm1, mm5                        ; mm1=tmp7

        pfmul   mm2, [GOTOFF(ebx,PD_1_414)]     ; mm2=tmp11

        pfmul   mm0, [GOTOFF(ebx,PD_1_847)]     ; mm0=z5
        pfmul   mm3, [GOTOFF(ebx,PD_2_613)]     ; mm3=(z10 * 2.613125930)
        pfmul   mm4, [GOTOFF(ebx,PD_1_082)]     ; mm4=(z12 * 1.082392200)
        pfsubr  mm3, mm0                        ; mm3=tmp12
        pfsub   mm4, mm0                        ; mm4=tmp10

        ; -- Final output stage

        pfsub   mm3, mm1                        ; mm3=tmp6

        pfadd   mm6, mm1                        ; mm6=data0=(00 01)
        pfadd   mm7, mm3                        ; mm7=data1=(10 11)
        pfsub   mm5, mm1                        ; mm5=data7=(70 71)
        pfsub   mm0, mm3                        ; mm0=data6=(60 61)
        pfsub   mm2, mm3                        ; mm2=tmp5

        movq      mm1, mm6                      ; transpose coefficients
        punpckldq mm6, mm7                      ; mm6=(00 10)
        punpckhdq mm1, mm7                      ; mm1=(01 11)
        movq      mm3, mm0                      ; transpose coefficients
        punpckldq mm0, mm5                      ; mm0=(60 70)
        punpckhdq mm3, mm5                      ; mm3=(61 71)

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3

        movq    mm7, MMWORD [wk(0)]             ; mm7=tmp2
        movq    mm5, MMWORD [wk(1)]             ; mm5=tmp3

        pfadd   mm4, mm2                        ; mm4=tmp4

        pfadd   mm7, mm2                        ; mm7=data2=(20 21)
        pfadd   mm5, mm4                        ; mm5=data4=(40 41)
        pfsub   mm6, mm2                        ; mm6=data5=(50 51)
        pfsub   mm1, mm4                        ; mm1=data3=(30 31)

        movq      mm0, mm7                      ; transpose coefficients
        punpckldq mm7, mm1                      ; mm7=(20 30)
        punpckhdq mm0, mm1                      ; mm0=(21 31)
        movq      mm3, mm5                      ; transpose coefficients
        punpckldq mm5, mm6                      ; mm5=(40 50)
        punpckhdq mm3, mm6                      ; mm3=(41 51)

        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3

        ; Advance all three pointers by one column pair.
        add     esi, byte 2*SIZEOF_JCOEF                ; coef_block
        add     edx, byte 2*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr

        ; -- Prefetch the next coefficient block

        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.

        mov     eax, [original_ebp]
        lea     esi, [workspace]                        ; FAST_FLOAT * wsptr
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]
        mov     ecx, DCTSIZE/2                          ; ctr

        ; Even-numbered workspace rows 0, 2, 4, 6.
        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

        pfsub   mm0, mm2                        ; mm0=tmp11
        pfadd   mm4, mm2                        ; mm4=tmp10
        pfadd   mm5, mm3                        ; mm5=tmp13

        pfmul   mm1, [GOTOFF(ebx,PD_1_414)]
        pfsub   mm1, mm5                        ; mm1=tmp12

        pfsub   mm4, mm5                        ; mm4=tmp3
        pfsub   mm0, mm1                        ; mm0=tmp2
        pfadd   mm6, mm5                        ; mm6=tmp0
        pfadd   mm7, mm1                        ; mm7=tmp1

        movq    MMWORD [wk(1)], mm4             ; tmp3
        movq    MMWORD [wk(0)], mm0             ; tmp2

        ; Odd-numbered workspace rows 1, 3, 5, 7.
        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

        pfadd   mm2, mm1                        ; mm2=z11
        pfadd   mm5, mm3                        ; mm5=z13
        pfsub   mm4, mm1                        ; mm4=z12
        pfsub   mm0, mm3                        ; mm0=z10

        pfadd   mm1, mm5                        ; mm1=tmp7

        pfmul   mm2, [GOTOFF(ebx,PD_1_414)]     ; mm2=tmp11

        pfmul   mm0, [GOTOFF(ebx,PD_1_847)]     ; mm0=z5
        pfmul   mm3, [GOTOFF(ebx,PD_2_613)]     ; mm3=(z10 * 2.613125930)
        pfmul   mm4, [GOTOFF(ebx,PD_1_082)]     ; mm4=(z12 * 1.082392200)
        pfsubr  mm3, mm0                        ; mm3=tmp12
        pfsub   mm4, mm0                        ; mm4=tmp10

        ; -- Final output stage

        pfsub   mm3, mm1                        ; mm3=tmp6

        pfadd   mm6, mm1                        ; mm6=data0=(00 10)
        pfadd   mm7, mm3                        ; mm7=data1=(01 11)
        pfsub   mm5, mm1                        ; mm5=data7=(07 17)
        pfsub   mm0, mm3                        ; mm0=data6=(06 16)
        pfsub   mm2, mm3                        ; mm2=tmp5

        ; Adding PD_RNDINT_MAGIC leaves the rounded integer in the low
        ; 16 bits of each dword; the mask / shift / OR sequence then
        ; packs two such results into one dword.
        ; NOTE(review): mm3 holds tmp6 here, not all-ones, so the psrld
        ; below cannot produce the mask its comment describes.
        movq    mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; mm1=[PD_RNDINT_MAGIC]
        psrld   mm3, WORD_BIT                   ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}

        pfadd   mm6, mm1                        ; mm6=roundint(data0/8)=(00 ** 10 **)
        pfadd   mm7, mm1                        ; mm7=roundint(data1/8)=(01 ** 11 **)
        pfadd   mm0, mm1                        ; mm0=roundint(data6/8)=(06 ** 16 **)
        pfadd   mm5, mm1                        ; mm5=roundint(data7/8)=(07 ** 17 **)

        pand    mm6, mm3                        ; mm6=(00 -- 10 --)
        pslld   mm7, WORD_BIT                   ; mm7=(-- 01 -- 11)
        pand    mm0, mm3                        ; mm0=(06 -- 16 --)
        pslld   mm5, WORD_BIT                   ; mm5=(-- 07 -- 17)
        por     mm6, mm7                        ; mm6=(00 01 10 11)
        por     mm0, mm5                        ; mm0=(06 07 16 17)

        movq    mm1, MMWORD [wk(0)]             ; mm1=tmp2
        movq    mm3, MMWORD [wk(1)]             ; mm3=tmp3

        pfadd   mm4, mm2                        ; mm4=tmp4

        pfadd   mm1, mm2                        ; mm1=data2=(02 12)
        pfadd   mm3, mm4                        ; mm3=data4=(04 14)
        pfsub   mm7, mm2                        ; mm7=data5=(05 15)
        pfsub   mm5, mm4                        ; mm5=data3=(03 13)

        ; NOTE(review): as above, mm4 holds tmp4 here rather than all-ones.
        movq    mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; mm2=[PD_RNDINT_MAGIC]
        psrld   mm4, WORD_BIT                   ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}

        pfadd   mm3, mm2                        ; mm3=roundint(data4/8)=(04 ** 14 **)
        pfadd   mm7, mm2                        ; mm7=roundint(data5/8)=(05 ** 15 **)
        pfadd   mm1, mm2                        ; mm1=roundint(data2/8)=(02 ** 12 **)
        pfadd   mm5, mm2                        ; mm5=roundint(data3/8)=(03 ** 13 **)

        pand    mm3, mm4                        ; mm3=(04 -- 14 --)
        pslld   mm7, WORD_BIT                   ; mm7=(-- 05 -- 15)
        pand    mm1, mm4                        ; mm1=(02 -- 12 --)
        pslld   mm5, WORD_BIT                   ; mm5=(-- 03 -- 13)
        por     mm3, mm7                        ; mm3=(04 05 14 15)
        por     mm1, mm5                        ; mm1=(02 03 12 13)

        ; NOTE(review): mm2 is loaded here but no visible instruction uses it.
        movq    mm2, [GOTOFF(ebx,PB_CENTERJSAMP)]       ; mm2=[PB_CENTERJSAMP]

        packsswb  mm6, mm3                      ; mm6=(00 01 10 11 04 05 14 15)
        packsswb  mm1, mm0                      ; mm1=(02 03 12 13 06 07 16 17)

        movq      mm4, mm6                      ; transpose coefficients(phase 2)
        punpcklwd mm6, mm1                      ; mm6=(00 01 02 03 10 11 12 13)
        punpckhwd mm4, mm1                      ; mm4=(04 05 06 07 14 15 16 17)

        movq      mm7, mm6                      ; transpose coefficients(phase 3)
        punpckldq mm6, mm4                      ; mm6=(00 01 02 03 04 05 06 07)
        punpckhdq mm7, mm4                      ; mm7=(10 11 12 13 14 15 16 17)

        ; ebx is borrowed as the second row pointer, so the GOT address is
        ; saved around the two stores.
        pushpic ebx                             ; save GOT address

        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7

        poppic  ebx                             ; restore GOT address

        add     esi, byte 2*SIZEOF_FAST_FLOAT   ; wsptr
        add     edi, byte 2*SIZEOF_JSAMPROW

        femms                                   ; empty MMX/3DNow! state

;       pop     edx                             ; need not be preserved
;       pop     ecx                             ; need not be preserved

        ; NOTE(review): no visible instruction stored anything at [ebp], so
        ; the value popped into esp is undefined in this listing.
        mov     esp, ebp                        ; esp <- aligned ebp
        pop     esp                             ; esp <- original ebp
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.