2 ; jcclrmmx.asm - colorspace conversion (MMX)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ; This file should be assembled with NASM (Netwide Assembler),
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
19 %include "jcolsamp.inc"
21 ; --------------------------------------------------------------------------
25 ; Convert some rows of samples to the output colorspace.
28 ; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
29 ; JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
30 ; JDIMENSION output_row, int num_rows);
; Argument accessors. Each macro takes a base register b and expands to the
; address expression of one argument, at fixed offsets +8 .. +24 from b.
; In the function below they are applied to eax, which receives a copy of
; esp at the top of the function.
33 %define img_width
(b
) (b
)+8 ; JDIMENSION img_width
34 %define input_buf
(b
) (b
)+12 ; JSAMPARRAY input_buf
35 %define output_buf
(b
) (b
)+16 ; JSAMPIMAGE output_buf
36 %define output_row
(b
) (b
)+20 ; JDIMENSION output_row
37 %define num_rows
(b
) (b
)+24 ; int num_rows
; Locals addressed from the aligned frame pointer (ebp):
;   original_ebp - the slot at ebp+0
;   wk(i)        - i-th of WK_NUM mmword-sized scratch slots, located below ebp
;   gotptr       - pointer-sized slot immediately below wk(0)
39 %define original_ebp
ebp+0
40 %define wk
(i
) ebp-(WK_NUM
-(i
))*SIZEOF_MMWORD
; mmword wk[WK_NUM]
42 %define gotptr wk
(0)-SIZEOF_POINTER
; void * gotptr
; --------------------------------------------------------------------------
; jsimd_rgb_ycc_convert_mmx
;
; Converts interleaved RGB samples (RGB_PIXELSIZE of 3 or 4 bytes per pixel)
; into separate Y, Cb and Cr sample rows using MMX.
;
; Arguments are read through eax (a copy of esp taken at the top):
;   img_width, input_buf, output_buf, output_row, num_rows.
;
; Register roles established by the loads below:
;   esi = inptr          (current input row)
;   edi = outptr0        (receives Y)
;   ebx = outptr1        (receives Cb)
;   edx = outptr2        (receives Cr)
;   eax = GOT address while the conversion constants are being addressed
;   ecx = column counter compared against SIZEOF_MMWORD
;
; Scratch: wk(0)..wk(7) hold spilled inputs and partial Y sums.
;
; NOTE(review): the labels .columnloop, .column_ld16 and .rgb_ycc_cnv are
; referenced but have no visible definition here, and several register
; setup steps (e.g. clearing of the unpack partner registers, register
; copies between the shuffle steps) are not visible either. Confirm
; against the complete file before relying on the section comments below.
; --------------------------------------------------------------------------
45 global EXTN
(jsimd_rgb_ycc_convert_mmx
)
47 EXTN
(jsimd_rgb_ycc_convert_mmx
):
; Build an mmword-aligned frame. eax keeps the pre-alignment stack pointer;
; all arguments are fetched through it further down.
49 mov eax,esp ; eax = original ebp
51 and esp, byte (-SIZEOF_MMWORD
) ; align to 64 bits
53 mov ebp,esp ; ebp = aligned ebp
55 pushpic
eax ; make a room for GOT address
57 ; push ecx ; need not be preserved
58 ; push edx ; need not be preserved
; Fetch the GOT address into ebx and park it in the gotptr slot, because
; ebx is reused as an output pointer below.
62 get_GOT
ebx ; get GOT address
63 movpic POINTER
[gotptr
], ebx ; save GOT address
65 mov ecx, JDIMENSION
[img_width
(eax)] ; num_cols
; Locate the output rows: esi = output_buf, ecx = output_row, then
; edi/ebx/edx = address of entry 'output_row' in component arrays 0/1/2.
; NOTE(review): ecx (num_cols) is overwritten here with no visible save --
; presumably a push/pop pair is missing from this view; confirm.
71 mov esi, JSAMPIMAGE
[output_buf
(eax)]
72 mov ecx, JDIMENSION
[output_row
(eax)]
73 mov edi, JSAMPARRAY
[esi+0*SIZEOF_JSAMPARRAY
]
74 mov ebx, JSAMPARRAY
[esi+1*SIZEOF_JSAMPARRAY
]
75 mov edx, JSAMPARRAY
[esi+2*SIZEOF_JSAMPARRAY
]
76 lea edi, [edi+ecx*SIZEOF_JSAMPROW
]
77 lea ebx, [ebx+ecx*SIZEOF_JSAMPROW
]
78 lea edx, [edx+ecx*SIZEOF_JSAMPROW
]
; esi = input_buf (array of row pointers); eax = num_rows. After this point
; eax no longer addresses the arguments.
82 mov esi, JSAMPARRAY
[input_buf
(eax)]
83 mov eax, INT [num_rows
(eax)]
; Dereference the row-pointer arrays to get the actual sample pointers.
95 mov esi, JSAMPROW
[esi] ; inptr
96 mov edi, JSAMPROW
[edi] ; outptr0
97 mov ebx, JSAMPROW
[ebx] ; outptr1
98 mov edx, JSAMPROW
[edx] ; outptr2
99 movpic
eax, POINTER
[gotptr
] ; load GOT address (eax)
; With at least SIZEOF_MMWORD columns left, take the full-width path at
; .columnloop; otherwise fall through to the partial-load code.
101 cmp ecx, byte SIZEOF_MMWORD
102 jae short .columnloop
105 %if RGB_PIXELSIZE
== 3 ; ---------------
; 3-byte pixels. Partial block: ecx becomes a byte count (ecx*3) and the
; remaining bytes are gathered in pieces of byte, word, dword and mmword
; size, with ecx stepped down as each piece is read.
110 lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE
113 sub ecx, byte SIZEOF_BYTE
115 mov al, BYTE [esi+ecx]
119 sub ecx, byte SIZEOF_WORD
121 mov dx, WORD [esi+ecx]
128 test cl, SIZEOF_DWORD
130 sub ecx, byte SIZEOF_DWORD
131 movd mmG
, DWORD [esi+ecx]
135 test cl, SIZEOF_MMWORD
136 jz short .column_ld16
138 movq mmA
, MMWORD
[esi+0*SIZEOF_MMWORD
]
139 mov ecx, SIZEOF_MMWORD
140 jmp short .rgb_ycc_cnv
142 test cl, 2*SIZEOF_MMWORD
143 mov ecx, SIZEOF_MMWORD
144 jz short .rgb_ycc_cnv
146 movq mmA
, MMWORD
[esi+0*SIZEOF_MMWORD
]
147 movq mmG
, MMWORD
[esi+1*SIZEOF_MMWORD
]
148 jmp short .rgb_ycc_cnv
; Full block: three mmwords hold eight 3-byte pixels (see the map below,
; where each entry is <component><pixel>).
152 movq mmA
, MMWORD
[esi+0*SIZEOF_MMWORD
]
153 movq mmG
, MMWORD
[esi+1*SIZEOF_MMWORD
]
154 movq mmF
, MMWORD
[esi+2*SIZEOF_MMWORD
]
157 ; mmA=(00 10 20 01 11 21 02 12)
158 ; mmG=(22 03 13 23 04 14 24 05)
159 ; mmF=(15 25 06 16 26 07 17 27)
; De-interleave: alternating 4-byte shifts and byte unpacks regroup the
; samples until each register holds one component for even or odd pixels.
; The trailing comments track the register contents after every step.
162 psllq mmA
,4*BYTE_BIT
; mmA=(-- -- -- -- 00 10 20 01)
163 psrlq mmD
,4*BYTE_BIT
; mmD=(11 21 02 12 -- -- -- --)
165 punpckhbw mmA
,mmG
; mmA=(00 04 10 14 20 24 01 05)
166 psllq mmG
,4*BYTE_BIT
; mmG=(-- -- -- -- 22 03 13 23)
168 punpcklbw mmD
,mmF
; mmD=(11 15 21 25 02 06 12 16)
169 punpckhbw mmG
,mmF
; mmG=(22 26 03 07 13 17 23 27)
172 psllq mmA
,4*BYTE_BIT
; mmA=(-- -- -- -- 00 04 10 14)
173 psrlq mmE
,4*BYTE_BIT
; mmE=(20 24 01 05 -- -- -- --)
175 punpckhbw mmA
,mmD
; mmA=(00 02 04 06 10 12 14 16)
176 psllq mmD
,4*BYTE_BIT
; mmD=(-- -- -- -- 11 15 21 25)
178 punpcklbw mmE
,mmG
; mmE=(20 22 24 26 01 03 05 07)
179 punpckhbw mmD
,mmG
; mmD=(11 13 15 17 21 23 25 27)
; Widen to four samples per register by unpacking against mmH.
; NOTE(review): presumably mmH is zero at this point (byte -> word
; zero-extension); the instruction clearing it is not visible -- confirm.
184 punpcklbw mmA
,mmH
; mmA=(00 02 04 06)
185 punpckhbw mmC
,mmH
; mmC=(10 12 14 16)
188 punpcklbw mmE
,mmH
; mmE=(20 22 24 26)
189 punpckhbw mmB
,mmH
; mmB=(01 03 05 07)
192 punpcklbw mmD
,mmH
; mmD=(11 13 15 17)
193 punpckhbw mmF
,mmH
; mmF=(21 23 25 27)
195 %else
; RGB_PIXELSIZE == 4 ; -----------
; 4-byte pixels. Partial block: ecx stays a pixel count and is scaled by
; RGB_PIXELSIZE inside the address; pieces of SIZEOF_MMWORD/8, /4 and /2
; pixels are tested for in turn.
198 test cl, SIZEOF_MMWORD
/8
200 sub ecx, byte SIZEOF_MMWORD
/8
201 movd mmA
, DWORD [esi+ecx*RGB_PIXELSIZE
]
203 test cl, SIZEOF_MMWORD
/4
205 sub ecx, byte SIZEOF_MMWORD
/4
207 movq mmA
, MMWORD
[esi+ecx*RGB_PIXELSIZE
]
209 test cl, SIZEOF_MMWORD
/2
210 mov ecx, SIZEOF_MMWORD
211 jz short .rgb_ycc_cnv
214 movq mmA
, MMWORD
[esi+0*SIZEOF_MMWORD
]
215 movq mmF
, MMWORD
[esi+1*SIZEOF_MMWORD
]
216 jmp short .rgb_ycc_cnv
; Full block: four mmwords hold eight 4-byte pixels (map below).
220 movq mmA
, MMWORD
[esi+0*SIZEOF_MMWORD
]
221 movq mmF
, MMWORD
[esi+1*SIZEOF_MMWORD
]
222 movq mmD
, MMWORD
[esi+2*SIZEOF_MMWORD
]
223 movq mmC
, MMWORD
[esi+3*SIZEOF_MMWORD
]
226 ; mmA=(00 10 20 30 01 11 21 31)
227 ; mmF=(02 12 22 32 03 13 23 33)
228 ; mmD=(04 14 24 34 05 15 25 35)
229 ; mmC=(06 16 26 36 07 17 27 37)
; De-interleave with byte unpacks followed by word unpacks; the trailing
; comments track the register contents after every step.
232 punpcklbw mmA
,mmF
; mmA=(00 02 10 12 20 22 30 32)
233 punpckhbw mmB
,mmF
; mmB=(01 03 11 13 21 23 31 33)
236 punpcklbw mmD
,mmC
; mmD=(04 06 14 16 24 26 34 36)
237 punpckhbw mmG
,mmC
; mmG=(05 07 15 17 25 27 35 37)
240 punpcklwd mmA
,mmD
; mmA=(00 02 04 06 10 12 14 16)
241 punpckhwd mmE
,mmD
; mmE=(20 22 24 26 30 32 34 36)
244 punpcklwd mmB
,mmG
; mmB=(01 03 05 07 11 13 15 17)
245 punpckhwd mmH
,mmG
; mmH=(21 23 25 27 31 33 35 37)
; Widen to four samples per register by unpacking against mmF.
; NOTE(review): presumably mmF is zero at this point; the instruction
; clearing it is not visible -- confirm.
250 punpcklbw mmA
,mmF
; mmA=(00 02 04 06)
251 punpckhbw mmC
,mmF
; mmC=(10 12 14 16)
254 punpcklbw mmB
,mmF
; mmB=(01 03 05 07)
255 punpckhbw mmD
,mmF
; mmD=(11 13 15 17)
258 punpcklbw mmE
,mmF
; mmE=(20 22 24 26)
259 punpckhbw mmG
,mmF
; mmG=(30 32 34 36)
263 psrlw mmF
,BYTE_BIT
; mmF=(21 23 25 27)
264 psrlw mmH
,BYTE_BIT
; mmH=(31 33 35 37)
266 %endif
; RGB_PIXELSIZE ; ---------------
268 ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
269 ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
272 ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
273 ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
274 ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
276 ; (This implementation)
277 ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
278 ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
279 ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
; Spill the R and B inputs to wk(0)..wk(3); mm0/mm1/mm4/mm5 are reused as
; accumulators below and the inputs are reloaded from these slots.
281 movq MMWORD
[wk
(0)], mm0
; wk(0)=RE
282 movq MMWORD
[wk
(1)], mm1
; wk(1)=RO
283 movq MMWORD
[wk
(2)], mm4
; wk(2)=BE
284 movq MMWORD
[wk
(3)], mm5
; wk(3)=BO
; ---- Odd pixels: R/G terms of Y and Cb ----
; Each pmaddwd multiplies word pairs by a constant pair and sums them into
; dwords; the L/H suffixes in the comments denote the low/high halves.
291 pmaddwd mm1
,[GOTOFF
(eax,PW_F0299_F0337
)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
292 pmaddwd mm6
,[GOTOFF
(eax,PW_F0299_F0337
)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
293 pmaddwd mm7
,[GOTOFF
(eax,PW_MF016_MF033
)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
294 pmaddwd mm4
,[GOTOFF
(eax,PW_MF016_MF033
)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
; Keep the partial Y sums; the B/G terms are added to them further down.
296 movq MMWORD
[wk
(4)], mm1
; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
297 movq MMWORD
[wk
(5)], mm6
; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
; Blue contribution to Cb (odd pixels), formed by unpack + shift-by-1
; instead of a multiply.
301 punpcklwd mm1
,mm5
; mm1=BOL
302 punpckhwd mm6
,mm5
; mm6=BOH
303 psrld mm1
,1 ; mm1=BOL*FIX(0.500)
304 psrld mm6
,1 ; mm6=BOH*FIX(0.500)
306 movq mm5
,[GOTOFF
(eax,PD_ONEHALFM1_CJ
)] ; mm5=[PD_ONEHALFM1_CJ]
; Drop the SCALEBITS fraction bits and pack both halves back to words.
312 psrld mm7
,SCALEBITS
; mm7=CbOL
313 psrld mm4
,SCALEBITS
; mm4=CbOH
314 packssdw mm7
,mm4
; mm7=CbO
316 movq mm1
, MMWORD
[wk
(2)] ; mm1=BE
; ---- Even pixels: R/G terms of Y and Cb (same steps as for odd) ----
323 pmaddwd mm0
,[GOTOFF
(eax,PW_F0299_F0337
)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
324 pmaddwd mm6
,[GOTOFF
(eax,PW_F0299_F0337
)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
325 pmaddwd mm5
,[GOTOFF
(eax,PW_MF016_MF033
)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
326 pmaddwd mm4
,[GOTOFF
(eax,PW_MF016_MF033
)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
328 movq MMWORD
[wk
(6)], mm0
; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
329 movq MMWORD
[wk
(7)], mm6
; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
333 punpcklwd mm0
,mm1
; mm0=BEL
334 punpckhwd mm6
,mm1
; mm6=BEH
335 psrld mm0
,1 ; mm0=BEL*FIX(0.500)
336 psrld mm6
,1 ; mm6=BEH*FIX(0.500)
338 movq mm1
,[GOTOFF
(eax,PD_ONEHALFM1_CJ
)] ; mm1=[PD_ONEHALFM1_CJ]
344 psrld mm5
,SCALEBITS
; mm5=CbEL
345 psrld mm4
,SCALEBITS
; mm4=CbEH
346 packssdw mm5
,mm4
; mm5=CbE
; Store one mmword of Cb through outptr1.
350 movq MMWORD
[ebx], mm5
; Save Cb
; Reload the spilled inputs for the Y (B/G part) and Cr computations.
352 movq mm0
, MMWORD
[wk
(3)] ; mm0=BO
353 movq mm6
, MMWORD
[wk
(2)] ; mm6=BE
354 movq mm1
, MMWORD
[wk
(1)] ; mm1=RO
; ---- Odd pixels: B/G terms of Y and Cr ----
361 pmaddwd mm0
,[GOTOFF
(eax,PW_F0114_F0250
)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
362 pmaddwd mm4
,[GOTOFF
(eax,PW_F0114_F0250
)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
363 pmaddwd mm7
,[GOTOFF
(eax,PW_MF008_MF041
)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
364 pmaddwd mm5
,[GOTOFF
(eax,PW_MF008_MF041
)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
366 movq mm3
,[GOTOFF
(eax,PD_ONEHALF
)] ; mm3=[PD_ONEHALF]
; Complete Y for odd pixels by adding the R/G partial sums saved in
; wk(4)/wk(5), then scale and pack.
368 paddd mm0
, MMWORD
[wk
(4)]
369 paddd mm4
, MMWORD
[wk
(5)]
372 psrld mm0
,SCALEBITS
; mm0=YOL
373 psrld mm4
,SCALEBITS
; mm4=YOH
374 packssdw mm0
,mm4
; mm0=YO
; Red contribution to Cr (odd pixels), again via unpack + shift-by-1.
378 punpcklwd mm3
,mm1
; mm3=ROL
379 punpckhwd mm4
,mm1
; mm4=ROH
380 psrld mm3
,1 ; mm3=ROL*FIX(0.500)
381 psrld mm4
,1 ; mm4=ROH*FIX(0.500)
383 movq mm1
,[GOTOFF
(eax,PD_ONEHALFM1_CJ
)] ; mm1=[PD_ONEHALFM1_CJ]
389 psrld mm7
,SCALEBITS
; mm7=CrOL
390 psrld mm5
,SCALEBITS
; mm5=CrOH
391 packssdw mm7
,mm5
; mm7=CrO
393 movq mm3
, MMWORD
[wk
(0)] ; mm3=RE
; ---- Even pixels: B/G terms of Y and Cr ----
400 pmaddwd mm6
,[GOTOFF
(eax,PW_F0114_F0250
)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
401 pmaddwd mm4
,[GOTOFF
(eax,PW_F0114_F0250
)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
402 pmaddwd mm1
,[GOTOFF
(eax,PW_MF008_MF041
)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
403 pmaddwd mm5
,[GOTOFF
(eax,PW_MF008_MF041
)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
405 movq mm2
,[GOTOFF
(eax,PD_ONEHALF
)] ; mm2=[PD_ONEHALF]
; Complete Y for even pixels with the partial sums from wk(6)/wk(7).
407 paddd mm6
, MMWORD
[wk
(6)]
408 paddd mm4
, MMWORD
[wk
(7)]
411 psrld mm6
,SCALEBITS
; mm6=YEL
412 psrld mm4
,SCALEBITS
; mm4=YEH
413 packssdw mm6
,mm4
; mm6=YE
; Store one mmword of Y through outptr0.
417 movq MMWORD
[edi], mm6
; Save Y
; Red contribution to Cr (even pixels).
421 punpcklwd mm2
,mm3
; mm2=REL
422 punpckhwd mm4
,mm3
; mm4=REH
423 psrld mm2
,1 ; mm2=REL*FIX(0.500)
424 psrld mm4
,1 ; mm4=REH*FIX(0.500)
426 movq mm0
,[GOTOFF
(eax,PD_ONEHALFM1_CJ
)] ; mm0=[PD_ONEHALFM1_CJ]
432 psrld mm1
,SCALEBITS
; mm1=CrEL
433 psrld mm5
,SCALEBITS
; mm5=CrEH
434 packssdw mm1
,mm5
; mm1=CrE
; Store one mmword of Cr through outptr2.
438 movq MMWORD
[edx], mm1
; Save Cr
; Advance to the next group of columns: the counter drops by
; SIZEOF_MMWORD, each output pointer moves one mmword, and the input
; pointer moves RGB_PIXELSIZE mmwords.
440 sub ecx, byte SIZEOF_MMWORD
441 add esi, byte RGB_PIXELSIZE
*SIZEOF_MMWORD
; inptr
442 add edi, byte SIZEOF_MMWORD
; outptr0
443 add ebx, byte SIZEOF_MMWORD
; outptr1
444 add edx, byte SIZEOF_MMWORD
; outptr2
445 cmp ecx, byte SIZEOF_MMWORD
; Step every row-pointer array to its next entry.
; NOTE(review): presumably esi/edi/ebx/edx hold the row-pointer arrays
; again here (restored from saved copies not visible in this view).
457 add esi, byte SIZEOF_JSAMPROW
; input_buf
458 add edi, byte SIZEOF_JSAMPROW
459 add ebx, byte SIZEOF_JSAMPROW
460 add edx, byte SIZEOF_JSAMPROW
; MMX registers were used, so the MMX state is cleared before leaving.
464 emms
; empty MMX state
469 ; pop edx ; need not be preserved
470 ; pop ecx ; need not be preserved
; Tear down the aligned frame set up at the top of the function.
472 mov esp,ebp ; esp <- aligned ebp
473 pop esp ; esp <- original ebp
477 ; For some reason, the OS X linker does not honor the request to align the
478 ; segment unless we do this.