;
; jimmxfst.asm - fast integer IDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see the jidctfst.c
; for more details.
;
%include "jsimdext.inc"
%include "jdct.inc"
; --------------------------------------------------------------------------
;
; Fixed-point configuration for the fast integer IDCT.
;
; CONST_BITS  number of fractional bits carried by the F_x_xxx multipliers,
;             i.e. F_x_xxx = round(x * 2^CONST_BITS).
; PASS1_BITS  extra fractional bits kept in the pass-1 results; they are
;             removed by the final "psraw ...,(PASS1_BITS+3)" descale.
;             The dequantization table must be pre-scaled by the same
;             amount, which is what the IFAST_SCALE_BITS check enforces.

%define CONST_BITS      8       ; 14 is also OK.
%define PASS1_BITS      2

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

; Exactly one of the two sets of definitions below may be active: "equ"
; symbols cannot be defined twice.

%if CONST_BITS == 8
; Pre-computed values for the common CONST_BITS == 8 case.
F_1_082 equ     277                     ; FIX(1.082392200)
F_1_414 equ     362                     ; FIX(1.414213562)
F_1_847 equ     473                     ; FIX(1.847759065)
F_2_613 equ     669                     ; FIX(2.613125930)
F_1_613 equ     (F_2_613 - 256)         ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the same factors scaled by 2^30; DESCALE rounds
; them down to CONST_BITS fractional bits.
%define DESCALE(x,n)    (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
%endif

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; Read-only multiplier table used by jsimd_idct_ifast_mmx.
;
; Every PW_* entry is one 64-bit MMX operand holding four identical words,
; so it can be used directly as the memory operand of pmulhw.
;
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; pmulhw keeps only the high 16 bits of each product; the data is shifted
; left by PRE_MULTIPLY_SCALE_BITS and the constant by CONST_SHIFT so that
; the discarded low 16 bits are exactly the fractional part.

%define PRE_MULTIPLY_SCALE_BITS 2
%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

        alignz  16
        global  EXTN(jconst_idct_ifast_mmx)

EXTN(jconst_idct_ifast_mmx):

PW_F1414        times 4 dw       F_1_414 << CONST_SHIFT
PW_F1847        times 4 dw       F_1_847 << CONST_SHIFT
PW_MF1613       times 4 dw      -F_1_613 << CONST_SHIFT
PW_F1082        times 4 dw       F_1_082 << CONST_SHIFT
PB_CENTERJSAMP  times 8 db      CENTERJSAMPLE   ; added to each output byte

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_mmx (void * dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; ABI:    32-bit, all arguments on the stack (offsets below are relative to
;         the caller's frame as captured in eax / [original_ebp]).
; In:     dct_table   multiplier table, one word per coefficient
;         coef_block  8x8 block of JCOEF input coefficients
;         output_buf  array of row pointers; rows 0..7 are written
;         output_col  column offset applied to every output row
; Out:    8 samples are stored to each of output_buf[0..7] + output_col.
; Saves:  ebx, esi, edi, ebp (restored before returning).
; Clobb:  eax, ecx, edx, mm0-mm7, flags.  emms is executed before return.
; Stack:  an 8-byte aligned frame holding wk[WK_NUM] and a DCTSIZE2-entry
;         JCOEF workspace is carved out below the aligned ebp.
;

%define dct_table(b)    (b)+8           ; void * dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          2
%define workspace       wk(0)-DCTSIZE2*SIZEOF_JCOEF
                                        ; JCOEF workspace[DCTSIZE2]

        align   16
        global  EXTN(jsimd_idct_ifast_mmx)

EXTN(jsimd_idct_ifast_mmx):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax                       ; [original_ebp] = eax
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [workspace]                ; reserve wk[] + workspace
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        ; ---- Pass 1: process columns from input, store into work array.
        ;
        ; Each iteration handles four columns (one mmword per row) and
        ; writes them transposed, so pass 2 can again work on mmwords.

;       mov     eax, [original_ebp]
        mov     edx, POINTER [dct_table(eax)]           ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
        lea     edi, [workspace]                        ; JCOEF * wsptr
        mov     ecx, DCTSIZE/4                          ; ctr
        alignx  16,7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
        ; Cheap early-out first: two columns of rows 1 and 2 in a GPR.
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz     short .columnDCT

        ; Full test: OR together rows 1..7 of all four columns.
        movq    mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        por     mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        por     mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        por     mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        por     mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        por     mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por     mm1,mm0
        packsswb mm1,mm1                ; squeeze 4 words into the low dword
        movd    eax,mm1
        test    eax,eax
        jnz     short .columnDCT

        ; -- AC terms all zero
        ;
        ; The IDCT of a DC-only column is the dequantized DC value
        ; replicated into all eight outputs.

        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movq      mm2,mm0               ; mm0=in0=(00 01 02 03)
        punpcklwd mm0,mm0               ; mm0=(00 00 01 01)
        punpckhwd mm2,mm2               ; mm2=(02 02 03 03)

        movq      mm1,mm0
        punpckldq mm0,mm0               ; mm0=(00 00 00 00)
        punpckhdq mm1,mm1               ; mm1=(01 01 01 01)
        movq      mm3,mm2
        punpckldq mm2,mm2               ; mm2=(02 02 02 02)
        punpckhdq mm3,mm3               ; mm3=(03 03 03 03)

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
        jmp     near .nextcolumn
        alignx  16,7
%endif
.columnDCT:

        ; -- Even part

        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        pmullw  mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        pmullw  mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movq    mm4,mm0
        movq    mm5,mm1
        psubw   mm0,mm2                 ; mm0=tmp11
        psubw   mm1,mm3
        paddw   mm4,mm2                 ; mm4=tmp10
        paddw   mm5,mm3                 ; mm5=tmp13

        psllw   mm1,PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
        psubw   mm1,mm5                 ; mm1=tmp12

        movq    mm6,mm4
        movq    mm7,mm0
        psubw   mm4,mm5                 ; mm4=tmp3
        psubw   mm0,mm1                 ; mm0=tmp2
        paddw   mm6,mm5                 ; mm6=tmp0
        paddw   mm7,mm1                 ; mm7=tmp1

        ; All eight MMX registers are needed for the odd part; spill.
        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2

        ; -- Odd part

        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        pmullw  mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        pmullw  mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movq    mm4,mm2
        movq    mm0,mm5
        psubw   mm2,mm1                 ; mm2=z12
        psubw   mm5,mm3                 ; mm5=z10
        paddw   mm4,mm1                 ; mm4=z11
        paddw   mm0,mm3                 ; mm0=z13

        movq    mm1,mm5                 ; mm1=z10(unscaled)
        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
        psllw   mm5,PRE_MULTIPLY_SCALE_BITS

        movq    mm3,mm4
        psubw   mm4,mm0
        paddw   mm3,mm0                 ; mm3=tmp7

        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;

        movq    mm0,mm5
        paddw   mm5,mm2
        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]
        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]
        psubw   mm0,mm1                 ; the "- z10" term, unscaled
        psubw   mm2,mm5                 ; mm2=tmp10
        paddw   mm0,mm5                 ; mm0=tmp12

        ; -- Final output stage

        psubw   mm0,mm3                 ; mm0=tmp6
        movq    mm1,mm6
        movq    mm5,mm7
        paddw   mm6,mm3                 ; mm6=data0=(00 01 02 03)
        paddw   mm7,mm0                 ; mm7=data1=(10 11 12 13)
        psubw   mm1,mm3                 ; mm1=data7=(70 71 72 73)
        psubw   mm5,mm0                 ; mm5=data6=(60 61 62 63)
        psubw   mm4,mm0                 ; mm4=tmp5

        movq      mm3,mm6               ; transpose coefficients(phase 1)
        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
        punpckhwd mm3,mm7               ; mm3=(02 12 03 13)
        movq      mm0,mm5               ; transpose coefficients(phase 1)
        punpcklwd mm5,mm1               ; mm5=(60 70 61 71)
        punpckhwd mm0,mm1               ; mm0=(62 72 63 73)

        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp2
        movq    mm1, MMWORD [wk(1)]     ; mm1=tmp3

        movq    MMWORD [wk(0)], mm5     ; wk(0)=(60 70 61 71)
        movq    MMWORD [wk(1)], mm0     ; wk(1)=(62 72 63 73)

        paddw   mm2,mm4                 ; mm2=tmp4
        movq    mm5,mm7
        movq    mm0,mm1
        paddw   mm7,mm4                 ; mm7=data2=(20 21 22 23)
        paddw   mm1,mm2                 ; mm1=data4=(40 41 42 43)
        psubw   mm5,mm4                 ; mm5=data5=(50 51 52 53)
        psubw   mm0,mm2                 ; mm0=data3=(30 31 32 33)

        movq      mm4,mm7               ; transpose coefficients(phase 1)
        punpcklwd mm7,mm0               ; mm7=(20 30 21 31)
        punpckhwd mm4,mm0               ; mm4=(22 32 23 33)
        movq      mm2,mm1               ; transpose coefficients(phase 1)
        punpcklwd mm1,mm5               ; mm1=(40 50 41 51)
        punpckhwd mm2,mm5               ; mm2=(42 52 43 53)

        movq      mm0,mm6               ; transpose coefficients(phase 2)
        punpckldq mm6,mm7               ; mm6=(00 10 20 30)
        punpckhdq mm0,mm7               ; mm0=(01 11 21 31)
        movq      mm5,mm3               ; transpose coefficients(phase 2)
        punpckldq mm3,mm4               ; mm3=(02 12 22 32)
        punpckhdq mm5,mm4               ; mm5=(03 13 23 33)

        movq    mm7, MMWORD [wk(0)]     ; mm7=(60 70 61 71)
        movq    mm4, MMWORD [wk(1)]     ; mm4=(62 72 63 73)

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5

        movq      mm6,mm1               ; transpose coefficients(phase 2)
        punpckldq mm1,mm7               ; mm1=(40 50 60 70)
        punpckhdq mm6,mm7               ; mm6=(41 51 61 71)
        movq      mm0,mm2               ; transpose coefficients(phase 2)
        punpckldq mm2,mm4               ; mm2=(42 52 62 72)
        punpckhdq mm0,mm4               ; mm0=(43 53 63 73)

        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0

.nextcolumn:
        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
        add     edx, byte 4*SIZEOF_IFAST_MULT_TYPE      ; quantptr
        add     edi, byte 4*DCTSIZE*SIZEOF_JCOEF        ; wsptr
        dec     ecx                                     ; ctr
        jnz     near .columnloop

        ; ---- Pass 2: process rows from work array, store into output array.
        ;
        ; Each iteration produces four complete output rows.  No
        ; dequantization here: the workspace already holds scaled values.

        mov     eax, [original_ebp]
        lea     esi, [workspace]                        ; JCOEF * wsptr
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]
        mov     ecx, DCTSIZE/4                          ; ctr
        alignx  16,7
.rowloop:

        ; -- Even part

        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

        movq    mm4,mm0
        movq    mm5,mm1
        psubw   mm0,mm2                 ; mm0=tmp11
        psubw   mm1,mm3
        paddw   mm4,mm2                 ; mm4=tmp10
        paddw   mm5,mm3                 ; mm5=tmp13

        psllw   mm1,PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm1,[GOTOFF(ebx,PW_F1414)]
        psubw   mm1,mm5                 ; mm1=tmp12

        movq    mm6,mm4
        movq    mm7,mm0
        psubw   mm4,mm5                 ; mm4=tmp3
        psubw   mm0,mm1                 ; mm0=tmp2
        paddw   mm6,mm5                 ; mm6=tmp0
        paddw   mm7,mm1                 ; mm7=tmp1

        movq    MMWORD [wk(1)], mm4     ; wk(1)=tmp3
        movq    MMWORD [wk(0)], mm0     ; wk(0)=tmp2

        ; -- Odd part

        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

        movq    mm4,mm2
        movq    mm0,mm5
        psubw   mm2,mm1                 ; mm2=z12
        psubw   mm5,mm3                 ; mm5=z10
        paddw   mm4,mm1                 ; mm4=z11
        paddw   mm0,mm3                 ; mm0=z13

        movq    mm1,mm5                 ; mm1=z10(unscaled)
        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
        psllw   mm5,PRE_MULTIPLY_SCALE_BITS

        movq    mm3,mm4
        psubw   mm4,mm0
        paddw   mm3,mm0                 ; mm3=tmp7

        psllw   mm4,PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm4,[GOTOFF(ebx,PW_F1414)]      ; mm4=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;

        movq    mm0,mm5
        paddw   mm5,mm2
        pmulhw  mm5,[GOTOFF(ebx,PW_F1847)]      ; mm5=z5
        pmulhw  mm0,[GOTOFF(ebx,PW_MF1613)]
        pmulhw  mm2,[GOTOFF(ebx,PW_F1082)]
        psubw   mm0,mm1                 ; the "- z10" term, unscaled
        psubw   mm2,mm5                 ; mm2=tmp10
        paddw   mm0,mm5                 ; mm0=tmp12

        ; -- Final output stage

        psubw   mm0,mm3                 ; mm0=tmp6
        movq    mm1,mm6
        movq    mm5,mm7
        paddw   mm6,mm3                 ; mm6=data0=(00 10 20 30)
        paddw   mm7,mm0                 ; mm7=data1=(01 11 21 31)
        psraw   mm6,(PASS1_BITS+3)      ; descale
        psraw   mm7,(PASS1_BITS+3)      ; descale
        psubw   mm1,mm3                 ; mm1=data7=(07 17 27 37)
        psubw   mm5,mm0                 ; mm5=data6=(06 16 26 36)
        psraw   mm1,(PASS1_BITS+3)      ; descale
        psraw   mm5,(PASS1_BITS+3)      ; descale
        psubw   mm4,mm0                 ; mm4=tmp5

        ; Signed saturation to bytes doubles as the range limit.
        packsswb  mm6,mm5               ; mm6=(00 10 20 30 06 16 26 36)
        packsswb  mm7,mm1               ; mm7=(01 11 21 31 07 17 27 37)

        movq    mm3, MMWORD [wk(0)]     ; mm3=tmp2
        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp3

        paddw   mm2,mm4                 ; mm2=tmp4
        movq    mm5,mm3
        movq    mm1,mm0
        paddw   mm3,mm4                 ; mm3=data2=(02 12 22 32)
        paddw   mm0,mm2                 ; mm0=data4=(04 14 24 34)
        psraw   mm3,(PASS1_BITS+3)      ; descale
        psraw   mm0,(PASS1_BITS+3)      ; descale
        psubw   mm5,mm4                 ; mm5=data5=(05 15 25 35)
        psubw   mm1,mm2                 ; mm1=data3=(03 13 23 33)
        psraw   mm5,(PASS1_BITS+3)      ; descale
        psraw   mm1,(PASS1_BITS+3)      ; descale

        movq    mm4,[GOTOFF(ebx,PB_CENTERJSAMP)]        ; mm4=[PB_CENTERJSAMP]

        packsswb  mm3,mm0               ; mm3=(02 12 22 32 04 14 24 34)
        packsswb  mm1,mm5               ; mm1=(03 13 23 33 05 15 25 35)

        ; Shift the signed results into the unsigned sample range.
        paddb   mm6,mm4
        paddb   mm7,mm4
        paddb   mm3,mm4
        paddb   mm1,mm4

        movq      mm2,mm6               ; transpose coefficients(phase 1)
        punpcklbw mm6,mm7               ; mm6=(00 01 10 11 20 21 30 31)
        punpckhbw mm2,mm7               ; mm2=(06 07 16 17 26 27 36 37)
        movq      mm0,mm3               ; transpose coefficients(phase 1)
        punpcklbw mm3,mm1               ; mm3=(02 03 12 13 22 23 32 33)
        punpckhbw mm0,mm1               ; mm0=(04 05 14 15 24 25 34 35)

        movq      mm5,mm6               ; transpose coefficients(phase 2)
        punpcklwd mm6,mm3               ; mm6=(00 01 02 03 10 11 12 13)
        punpckhwd mm5,mm3               ; mm5=(20 21 22 23 30 31 32 33)
        movq      mm4,mm0               ; transpose coefficients(phase 2)
        punpcklwd mm0,mm2               ; mm0=(04 05 06 07 14 15 16 17)
        punpckhwd mm4,mm2               ; mm4=(24 25 26 27 34 35 36 37)

        movq      mm7,mm6               ; transpose coefficients(phase 3)
        punpckldq mm6,mm0               ; mm6=(00 01 02 03 04 05 06 07)
        punpckhdq mm7,mm0               ; mm7=(10 11 12 13 14 15 16 17)
        movq      mm1,mm5               ; transpose coefficients(phase 3)
        punpckldq mm5,mm4               ; mm5=(20 21 22 23 24 25 26 27)
        punpckhdq mm1,mm4               ; mm1=(30 31 32 33 34 35 36 37)

        ; ebx is borrowed as a second row pointer below, so the GOT
        ; address has to be preserved around the stores.
        pushpic ebx                     ; save GOT address

        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
        mov     edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1

        poppic  ebx                     ; restore GOT address

        add     esi, byte 4*SIZEOF_JCOEF        ; wsptr
        add     edi, byte 4*SIZEOF_JSAMPROW
        dec     ecx                             ; ctr
        jnz     near .rowloop

        emms                            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16