2 ; jcsamss2.asm - downsampling (SSE2)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ; This file should be assembled with NASM (Netwide Assembler),
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
19 %include "jsimdext.inc"
21 ; --------------------------------------------------------------------------
25 ; Downsample pixel values of a single component.
26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
30 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
31 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
32 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
; Stack-argument accessors for jsimd_h2v1_downsample_sse2.
; Each macro takes a base register b and expands to the address expression of
; one argument; the function body uses them as [name(ebp)].  Offsets start at
; +8 and advance by 4, following the argument order of the prototype above.
; NASM only treats "(b)" as a parameter list when it follows the macro name
; with no intervening whitespace, so each definition must stay on one line.
%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
; ---------------------------------------------------------------------------
; jsimd_h2v1_downsample_sse2 -- exported entry point for the 2:1 horizontal,
; 1:1 vertical downsampler described in the header comment above.
;
; All six arguments are read relative to ebp through the accessor macros
; defined above (img_width .. output_data), so a frame pointer must have been
; established before the first load.
;
; NOTE(review): this text appears to be an excerpt of a numbered listing.
;   * Statement lines start with a decimal number and several operands are
;     wrapped onto following lines; neither is valid NASM as written.
;   * Those numbers skip, so instructions may be missing between the visible
;     ones.  In particular no prologue/epilogue is visible, and the labels
;     .columnloop, .columnloop_r8 and .downsample are jumped to but never
;     defined in the visible text.
;   Confirm every gap against the pristine file before relying on the
;   per-instruction notes below.
; ---------------------------------------------------------------------------
43 global EXTN
(jsimd_h2v1_downsample_sse2
)
45 EXTN
(jsimd_h2v1_downsample_sse2
):
; ecx and edx are used as scratch registers below without being saved.
49 ; push ecx ; need not be preserved
50 ; push edx ; need not be preserved
; ecx = width_blocks << 3, i.e. the number of output columns per row.
54 mov ecx, JDIMENSION
[width_blks
(ebp)]
55 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
; edx = image_width.
58 mov edx, JDIMENSION
[img_width
(ebp)]
; Right-edge expansion phase.
; NOTE(review): only the loads of this phase are visible here; the
; instructions that derive the pad count, store the replicated sample and
; loop over rows are not in this excerpt -- confirm against the full file.
60 ; -- expand_right_edge
; ecx = output_cols * 2 (presumably the input width the 2:1 downsampler
; consumes -- TODO confirm).
63 shl ecx,1 ; output_cols * 2
; eax = max_v_samp_factor (presumably the number of rows to pad -- TODO
; confirm; the loop that would use it as a counter is not visible).
67 mov eax, INT [max_v_samp
(ebp)]
; esi walks the input_data array of row pointers.
72 mov esi, JSAMPARRAY
[input_data
(ebp)] ; input_data
; edi = row pointer stored at the current array entry.
78 mov edi, JSAMPROW
[esi]
; al = the sample one byte before edi.
80 mov al, JSAMPLE
[edi-1]
; Step esi to the next row-pointer entry.
87 add esi, byte SIZEOF_JSAMPROW
; Downsampling phase: eax = v_samp_factor, used as the row counter.
96 mov eax, JDIMENSION
[v_samp
(ebp)] ; rowctr
; edx = 0x00010000: 16-bit halves 0 (low) and 1 (high), matching the
; {0, 1, ...} word pattern named in the xmm7 comment below.
100 mov edx, 0x00010000 ; bias pattern
; pshufd with selector 0x00 copies dword 0 of xmm7 into all four dwords.
; NOTE(review): the instruction that transfers edx into xmm7 is not visible.
103 pshufd xmm7
,xmm7
,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
; Logical right shift of every 16-bit word of xmm6 by BYTE_BIT.  The
; documented result (0x00FF per word) implies xmm6 held all-ones beforehand;
; NOTE(review): that initialisation is not visible in this excerpt.
104 psrlw xmm6
,BYTE_BIT
; xmm6={0xFF 0x00 0xFF 0x00 ..}
; Reload the row-pointer arrays for the per-row loop.
106 mov esi, JSAMPARRAY
[input_data
(ebp)] ; input_data
107 mov edi, JSAMPARRAY
[output_data
(ebp)] ; output_data
; Dereference the current array entries: esi = input row, edi = output row.
; NOTE(review): esi/edi are overwritten here and later advanced as array
; cursors, so they are presumably saved and restored around the column loop;
; those pushes/pops are not visible.
114 mov esi, JSAMPROW
[esi] ; inptr
115 mov edi, JSAMPROW
[edi] ; outptr
; Unsigned test: with at least SIZEOF_XMMWORD columns left, take the
; full-width path.
117 cmp ecx, byte SIZEOF_XMMWORD
118 jae short .columnloop
; Fall-through (presumably the .columnloop_r8 remainder path): load only the
; first input XMMWORD.  movdqa is the aligned form, so the input row pointer
; must satisfy its alignment requirement.
122 movdqa xmm0
, XMMWORD
[esi+0*SIZEOF_XMMWORD
]
; Force ecx to SIZEOF_XMMWORD so the subtraction after the store leaves zero.
124 mov ecx, SIZEOF_XMMWORD
125 jmp short .downsample
; Full-width path (presumably at .columnloop): load two consecutive input
; XMMWORDs.
129 movdqa xmm0
, XMMWORD
[esi+0*SIZEOF_XMMWORD
]
130 movdqa xmm1
, XMMWORD
[esi+1*SIZEOF_XMMWORD
]
; NOTE(review): the arithmetic that combines the loaded samples (presumably
; starting at .downsample) is not visible in this excerpt.
; Store one XMMWORD of output (aligned store).
150 movdqa XMMWORD
[edi+0*SIZEOF_XMMWORD
], xmm0
; Bookkeeping: one XMMWORD of output consumed two XMMWORDs of input.
152 sub ecx, byte SIZEOF_XMMWORD
; outcol
153 add esi, byte 2*SIZEOF_XMMWORD
; inptr
154 add edi, byte 1*SIZEOF_XMMWORD
; outptr
; Repeat the full-width path while at least SIZEOF_XMMWORD columns remain.
155 cmp ecx, byte SIZEOF_XMMWORD
156 jae short .columnloop
; NOTE(review): as listed, this jnz tests the flags of the cmp above, which
; can never report equality once jae has fallen through.  A flag-setting
; instruction on ecx appears to be missing before it -- confirm.
158 jnz short .columnloop_r8
; Advance both row-pointer array cursors to the next row.
164 add esi, byte SIZEOF_JSAMPROW
; input_data
165 add edi, byte SIZEOF_JSAMPROW
; output_data
; NOTE(review): the row-counter decrement, loop branch and return sequence
; are not visible in this excerpt.
172 ; pop edx ; need not be preserved
173 ; pop ecx ; need not be preserved
178 ; --------------------------------------------------------------------------
180 ; Downsample pixel values of a single component.
181 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
185 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
186 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
187 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
; Stack-argument accessors for jsimd_h2v2_downsample_sse2.
; Each macro takes a base register b and expands to the address expression of
; one argument; the function body uses them as [name(ebp)].  Offsets start at
; +8 and advance by 4, following the argument order of the prototype above.
; NASM only treats "(b)" as a parameter list when it follows the macro name
; with no intervening whitespace, so each definition must stay on one line.
%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
; ---------------------------------------------------------------------------
; jsimd_h2v2_downsample_sse2 -- exported entry point for the 2:1 horizontal,
; 2:1 vertical downsampler described in the header comment above.
;
; All six arguments are read relative to ebp through the accessor macros
; defined above (img_width .. output_data), so a frame pointer must have been
; established before the first load.  Each output row is produced from two
; consecutive input rows (inptr0 / inptr1 below).
;
; NOTE(review): this text appears to be an excerpt of a numbered listing.
;   * Statement lines start with a decimal number and several operands are
;     wrapped onto following lines; neither is valid NASM as written.
;   * Those numbers skip, so instructions may be missing between the visible
;     ones.  In particular no prologue/epilogue is visible, "pop ecx" has no
;     visible matching push, and the labels .expand_end, .columnloop,
;     .columnloop_r8 and .downsample are jumped to but never defined in the
;     visible text.
;   Confirm every gap against the pristine file before relying on the
;   per-instruction notes below.
; ---------------------------------------------------------------------------
198 global EXTN
(jsimd_h2v2_downsample_sse2
)
200 EXTN
(jsimd_h2v2_downsample_sse2
):
; ecx and edx are used as scratch registers below without being saved.
204 ; push ecx ; need not be preserved
205 ; push edx ; need not be preserved
; ecx = width_blocks << 3, i.e. the number of output columns per row.
209 mov ecx, JDIMENSION
[width_blks
(ebp)]
210 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
; edx = image_width.
213 mov edx, JDIMENSION
[img_width
(ebp)]
; Right-edge expansion phase.
; NOTE(review): the instructions that store the replicated sample and loop
; over rows are not visible in this excerpt.
215 ; -- expand_right_edge
; ecx = output_cols * 2 (presumably the input width the 2:1 downsampler
; consumes -- TODO confirm).
218 shl ecx,1 ; output_cols * 2
; NOTE(review): as listed, this jle tests the flags left by the shl above.
; The listing numbers skip a line here, so a subtract/compare against
; image_width may be missing -- confirm.
220 jle short .expand_end
; eax = max_v_samp_factor.
222 mov eax, INT [max_v_samp
(ebp)]
; NOTE(review): mov does not modify flags, so as listed this jle re-tests the
; same flags as the previous one; a flag-setting instruction on eax appears
; to be missing -- confirm.
224 jle short .expand_end
; esi walks the input_data array of row pointers.
227 mov esi, JSAMPARRAY
[input_data
(ebp)] ; input_data
; edi = row pointer stored at the current array entry.
233 mov edi, JSAMPROW
[esi]
; al = the sample one byte before edi.
235 mov al, JSAMPLE
[edi-1]
; Step esi to the next row-pointer entry.
242 add esi, byte SIZEOF_JSAMPROW
; Restore output_cols into ecx.
; NOTE(review): the push that saved it is not visible in this excerpt.
247 pop ecx ; output_cols
; Downsampling phase: eax = v_samp_factor, used as the row counter.
251 mov eax, JDIMENSION
[v_samp
(ebp)] ; rowctr
; edx = 0x00020001: 16-bit halves 1 (low) and 2 (high), matching the
; {1, 2, ...} word pattern named in the xmm7 comment below.
255 mov edx, 0x00020001 ; bias pattern
; pshufd with selector 0x00 copies dword 0 of xmm7 into all four dwords.
; NOTE(review): the instruction that transfers edx into xmm7 is not visible.
258 pshufd xmm7
,xmm7
,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
; Logical right shift of every 16-bit word of xmm6 by BYTE_BIT.  The
; documented result (0x00FF per word) implies xmm6 held all-ones beforehand;
; NOTE(review): that initialisation is not visible in this excerpt.
259 psrlw xmm6
,BYTE_BIT
; xmm6={0xFF 0x00 0xFF 0x00 ..}
; Reload the row-pointer arrays for the per-row loop.
261 mov esi, JSAMPARRAY
[input_data
(ebp)] ; input_data
262 mov edi, JSAMPARRAY
[output_data
(ebp)] ; output_data
; Fetch the two input rows that feed this output row.  edx must be loaded
; before esi is overwritten, because both loads index off esi.
; NOTE(review): esi/edi are overwritten here and later advanced as array
; cursors, so they are presumably saved and restored around the column loop;
; those pushes/pops are not visible.
269 mov edx, JSAMPROW
[esi+0*SIZEOF_JSAMPROW
] ; inptr0
270 mov esi, JSAMPROW
[esi+1*SIZEOF_JSAMPROW
] ; inptr1
271 mov edi, JSAMPROW
[edi] ; outptr
; Unsigned test: with at least SIZEOF_XMMWORD columns left, take the
; full-width path.
273 cmp ecx, byte SIZEOF_XMMWORD
274 jae short .columnloop
; Fall-through (presumably the .columnloop_r8 remainder path): load only the
; first XMMWORD of each input row.  movdqa is the aligned form, so both row
; pointers must satisfy its alignment requirement.
278 movdqa xmm0
, XMMWORD
[edx+0*SIZEOF_XMMWORD
]
279 movdqa xmm1
, XMMWORD
[esi+0*SIZEOF_XMMWORD
]
; Force ecx to SIZEOF_XMMWORD so the subtraction after the store leaves zero.
282 mov ecx, SIZEOF_XMMWORD
283 jmp short .downsample
; Full-width path (presumably at .columnloop): load two consecutive XMMWORDs
; from each of the two input rows.
287 movdqa xmm0
, XMMWORD
[edx+0*SIZEOF_XMMWORD
]
288 movdqa xmm1
, XMMWORD
[esi+0*SIZEOF_XMMWORD
]
289 movdqa xmm2
, XMMWORD
[edx+1*SIZEOF_XMMWORD
]
290 movdqa xmm3
, XMMWORD
[esi+1*SIZEOF_XMMWORD
]
; NOTE(review): the arithmetic that combines the loaded samples (presumably
; starting at .downsample) is not visible in this excerpt.
; Store one XMMWORD of output (aligned store).
320 movdqa XMMWORD
[edi+0*SIZEOF_XMMWORD
], xmm0
; Bookkeeping: one XMMWORD of output consumed two XMMWORDs from each input
; row.
322 sub ecx, byte SIZEOF_XMMWORD
; outcol
323 add edx, byte 2*SIZEOF_XMMWORD
; inptr0
324 add esi, byte 2*SIZEOF_XMMWORD
; inptr1
325 add edi, byte 1*SIZEOF_XMMWORD
; outptr
; NOTE(review): the branch back to .columnloop that would normally consume
; this compare is not visible, and as listed the jnz below tests this cmp's
; flags (ecx != SIZEOF_XMMWORD).  The listing numbers skip here, so
; instructions appear to be missing -- confirm.
326 cmp ecx, byte SIZEOF_XMMWORD
329 jnz near .columnloop_r8
; Advance the array cursors: two input rows are consumed per output row.
335 add esi, byte 2*SIZEOF_JSAMPROW
; input_data
336 add edi, byte 1*SIZEOF_JSAMPROW
; output_data
; NOTE(review): the row-counter decrement, loop branch and return sequence
; are not visible in this excerpt.
343 ; pop edx ; need not be preserved
344 ; pop ecx ; need not be preserved
349 ; For some reason, the OS X linker does not honor the request to align the
350 ; segment unless we do this.