2 ; jcsamss2-64.asm - downsampling (64-bit SSE2)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright 2009 D. R. Commander
8 ; x86 SIMD extension for IJG JPEG library
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
12 ; This file should be assembled with NASM (Netwide Assembler),
13 ; can *not* be assembled with Microsoft's MASM or any compatible
14 ; assembler (including Borland's Turbo Assembler).
15 ; NASM is available from http://nasm.sourceforge.net/ or
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
20 %include "jsimdext.inc"
22 ; --------------------------------------------------------------------------
26 ; Downsample pixel values of a single component.
27 ; This version handles the common case of 2:1 horizontal and 1:1 vertical.
31 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
32 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
33 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
36 ; r10 = JDIMENSION image_width
37 ; r11 = int max_v_samp_factor
38 ; r12 = JDIMENSION v_samp_factor
39 ; r13 = JDIMENSION width_blocks
40 ; r14 = JSAMPARRAY input_data
41 ; r15 = JSAMPARRAY output_data
; jsimd_h2v1_downsample_sse2
; Exported entry point for the 2:1 horizontal / 1:1 vertical downsampler
; described in the comment block preceding this definition.
; NOTE(review): this listing looks incomplete -- the jump targets
; .columnloop, .downsample and .columnloop_r8 are referenced below but no
; label definitions are visible, every statement carries a stray numeric
; prefix, and statements are split across lines. Confirm against a pristine
; copy of the file before assembling.
44 global EXTN
(jsimd_h2v1_downsample_sse2
)
46 EXTN
(jsimd_h2v1_downsample_sse2
):
; rcx <<= 3. Presumably rcx was loaded from width_blocks (r13) just before
; this -- that load is not visible here, TODO confirm.
53 shl rcx
,3 ; imul rcx,DCTSIZE (rcx = output_cols)
58 ; -- expand_right_edge
; rcx = output_cols * 2; with 2:1 horizontal downsampling this is presumably
; the number of input columns each row has to supply -- TODO confirm.
61 shl rcx
,1 ; output_cols * 2
; rsi = base of the input_data row-pointer array.
70 mov rsi
, r14
; input_data
; rdi = row pointer fetched from the current input_data entry.
75 mov rdi
, JSAMPROW
[rsi
]
; al = the sample stored immediately before rdi. Presumably rdi has been
; advanced past the last real sample by this point so that al is the value
; to replicate into the padding -- the adjustment and the fill itself are
; not visible here, TODO confirm.
77 mov al, JSAMPLE
[rdi
-1]
; Step rsi to the next entry of the row-pointer array.
84 add rsi
, byte SIZEOF_JSAMPROW
; 0x00010000 read as two little-endian 16-bit words is {0x0000, 0x0001},
; matching the xmm7 word pattern documented below. The transfer of this
; value into xmm7 is not visible here.
97 mov rdx
, 0x00010000 ; bias pattern
; Replicate dword 0 of xmm7 into all four dword lanes.
100 pshufd xmm7
,xmm7
,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
; Logical right shift of every 16-bit lane of xmm6 by BYTE_BIT. The mask
; quoted below assumes xmm6 held all-ones beforehand -- that setup is not
; visible here, TODO confirm.
101 psrlw xmm6
,BYTE_BIT
; xmm6={0xFF 0x00 0xFF 0x00 ..}
; Reload both row-pointer array bases for the downsampling pass.
103 mov rsi
, r14
; input_data
104 mov rdi
, r15
; output_data
; Dereference the current array entries to get the actual sample rows.
; NOTE(review): this overwrites rsi/rdi while the row-advance code at the
; end of the block adds SIZEOF_JSAMPROW to them as array cursors; presumably
; a save/restore around the column loop is missing from this listing.
110 mov rsi
, JSAMPROW
[rsi
] ; inptr
111 mov rdi
, JSAMPROW
[rdi
] ; outptr
; If at least SIZEOF_XMMWORD output columns remain (unsigned compare),
; branch to .columnloop; otherwise fall through to the short-tail load.
113 cmp rcx
, byte SIZEOF_XMMWORD
114 jae short .columnloop
; Short tail: load a single input XMMWORD and force the column counter to
; SIZEOF_XMMWORD before jumping to .downsample.
; movdqa is an aligned access, so rsi must be 16-byte aligned here.
117 movdqa xmm0
, XMMWORD
[rsi
+0*SIZEOF_XMMWORD
]
119 mov rcx
, SIZEOF_XMMWORD
120 jmp short .downsample
; Presumably the body of .columnloop (label not visible): load two
; consecutive input XMMWORDs with aligned loads.
123 movdqa xmm0
, XMMWORD
[rsi
+0*SIZEOF_XMMWORD
]
124 movdqa xmm1
, XMMWORD
[rsi
+1*SIZEOF_XMMWORD
]
; Aligned store of one output XMMWORD (rdi must be 16-byte aligned).
; NOTE(review): the arithmetic that turns the loaded inputs into the value
; stored from xmm0 is not visible in this listing.
144 movdqa XMMWORD
[rdi
+0*SIZEOF_XMMWORD
], xmm0
; Per iteration: SIZEOF_XMMWORD output columns done, input pointer advanced
; by two XMMWORDs, output pointer by one.
146 sub rcx
, byte SIZEOF_XMMWORD
; outcol
147 add rsi
, byte 2*SIZEOF_XMMWORD
; inptr
148 add rdi
, byte 1*SIZEOF_XMMWORD
; outptr
; Loop again while a full XMMWORD of output columns remains.
149 cmp rcx
, byte SIZEOF_XMMWORD
150 jae short .columnloop
; NOTE(review): as listed, this jnz tests the flags left by the cmp above;
; presumably a flag-setting instruction on rcx is missing in between --
; TODO confirm.
152 jnz short .columnloop_r8
; Advance both row-pointer array cursors by one entry (one input row per
; output row).
158 add rsi
, byte SIZEOF_JSAMPROW
; input_data
159 add rdi
, byte SIZEOF_JSAMPROW
; output_data
168 ; --------------------------------------------------------------------------
170 ; Downsample pixel values of a single component.
171 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical.
175 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
176 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
177 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
180 ; r10 = JDIMENSION image_width
181 ; r11 = int max_v_samp_factor
182 ; r12 = JDIMENSION v_samp_factor
183 ; r13 = JDIMENSION width_blocks
184 ; r14 = JSAMPARRAY input_data
185 ; r15 = JSAMPARRAY output_data
; jsimd_h2v2_downsample_sse2
; Exported entry point for the 2:1 horizontal / 2:1 vertical downsampler
; described in the comment block preceding this definition.
; NOTE(review): this listing looks incomplete -- the jump targets
; .expand_end, .columnloop, .downsample and .columnloop_r8 are referenced
; below but no label definitions are visible, every statement carries a
; stray numeric prefix, and statements are split across lines. Confirm
; against a pristine copy of the file before assembling.
188 global EXTN
(jsimd_h2v2_downsample_sse2
)
190 EXTN
(jsimd_h2v2_downsample_sse2
):
; rcx <<= 3. Presumably rcx was loaded from width_blocks (r13) just before
; this -- that load is not visible here, TODO confirm.
197 shl rcx
,3 ; imul rcx,DCTSIZE (rcx = output_cols)
202 ; -- expand_right_edge
; rcx = output_cols * 2; with 2:1 horizontal downsampling this is presumably
; the number of input columns each row has to supply -- TODO confirm.
205 shl rcx
,1 ; output_cols * 2
; Two signed "less or equal" exits to .expand_end.
; NOTE(review): the instructions that set the flags for each of these
; branches are not visible here; as listed, both test the result of the
; shl above -- TODO confirm.
207 jle short .expand_end
211 jle short .expand_end
; rsi = base of the input_data row-pointer array.
214 mov rsi
, r14
; input_data
; rdi = row pointer fetched from the current input_data entry.
219 mov rdi
, JSAMPROW
[rsi
]
; al = the sample stored immediately before rdi. Presumably rdi has been
; advanced past the last real sample by this point so that al is the value
; to replicate into the padding -- the adjustment and the fill itself are
; not visible here, TODO confirm.
221 mov al, JSAMPLE
[rdi
-1]
; Step rsi to the next entry of the row-pointer array.
228 add rsi
, byte SIZEOF_JSAMPROW
; Restore rcx from the stack.
; NOTE(review): the matching push is not visible in this listing.
233 pop rcx
; output_cols
; rax = v_samp_factor (r12), used as the row counter per the comment below.
237 mov rax
, r12
; rowctr
; 0x00020001 read as two little-endian 16-bit words is {0x0001, 0x0002},
; matching the xmm7 word pattern documented below. The transfer of this
; value into xmm7 is not visible here.
241 mov rdx
, 0x00020001 ; bias pattern
; Replicate dword 0 of xmm7 into all four dword lanes.
244 pshufd xmm7
,xmm7
,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
; Logical right shift of every 16-bit lane of xmm6 by BYTE_BIT. The mask
; quoted below assumes xmm6 held all-ones beforehand -- that setup is not
; visible here, TODO confirm.
245 psrlw xmm6
,BYTE_BIT
; xmm6={0xFF 0x00 0xFF 0x00 ..}
; Reload both row-pointer array bases for the downsampling pass.
247 mov rsi
, r14
; input_data
248 mov rdi
, r15
; output_data
; Fetch two adjacent input rows and one output row. rdx is loaded first
; because the second load overwrites rsi, the array cursor it indexes from.
; NOTE(review): rsi/rdi are overwritten here while the row-advance code at
; the end of the block adds SIZEOF_JSAMPROW multiples to them as array
; cursors; presumably a save/restore around the column loop is missing
; from this listing.
254 mov rdx
, JSAMPROW
[rsi
+0*SIZEOF_JSAMPROW
] ; inptr0
255 mov rsi
, JSAMPROW
[rsi
+1*SIZEOF_JSAMPROW
] ; inptr1
256 mov rdi
, JSAMPROW
[rdi
] ; outptr
; If at least SIZEOF_XMMWORD output columns remain (unsigned compare),
; branch to .columnloop; otherwise fall through to the short-tail load.
258 cmp rcx
, byte SIZEOF_XMMWORD
259 jae short .columnloop
; Short tail: load one XMMWORD from each input row and force the column
; counter to SIZEOF_XMMWORD before jumping to .downsample.
; movdqa is an aligned access, so rdx and rsi must be 16-byte aligned here.
262 movdqa xmm0
, XMMWORD
[rdx
+0*SIZEOF_XMMWORD
]
263 movdqa xmm1
, XMMWORD
[rsi
+0*SIZEOF_XMMWORD
]
266 mov rcx
, SIZEOF_XMMWORD
267 jmp short .downsample
; Presumably the body of .columnloop (label not visible): load two
; consecutive XMMWORDs from each of the two input rows with aligned loads.
270 movdqa xmm0
, XMMWORD
[rdx
+0*SIZEOF_XMMWORD
]
271 movdqa xmm1
, XMMWORD
[rsi
+0*SIZEOF_XMMWORD
]
272 movdqa xmm2
, XMMWORD
[rdx
+1*SIZEOF_XMMWORD
]
273 movdqa xmm3
, XMMWORD
[rsi
+1*SIZEOF_XMMWORD
]
; Aligned store of one output XMMWORD (rdi must be 16-byte aligned).
; NOTE(review): the arithmetic that turns the loaded inputs into the value
; stored from xmm0 is not visible in this listing.
303 movdqa XMMWORD
[rdi
+0*SIZEOF_XMMWORD
], xmm0
; Per iteration: SIZEOF_XMMWORD output columns done, both input pointers
; advanced by two XMMWORDs, output pointer by one.
305 sub rcx
, byte SIZEOF_XMMWORD
; outcol
306 add rdx
, byte 2*SIZEOF_XMMWORD
; inptr0
307 add rsi
, byte 2*SIZEOF_XMMWORD
; inptr1
308 add rdi
, byte 1*SIZEOF_XMMWORD
; outptr
; Compare the remaining column count against one full XMMWORD.
; NOTE(review): the branch back to .columnloop that presumably follows this
; cmp is not visible, and as listed the jnz tests this cmp's flags --
; TODO confirm.
309 cmp rcx
, byte SIZEOF_XMMWORD
312 jnz near .columnloop_r8
; Advance the input cursor by two row-pointer entries and the output cursor
; by one (two input rows per output row).
318 add rsi
, byte 2*SIZEOF_JSAMPROW
; input_data
319 add rdi
, byte 1*SIZEOF_JSAMPROW
; output_data
328 ; For some reason, the OS X linker does not honor the request to align the
329 ; segment unless we do this.