2 ; jdsammmx.asm - upsampling (MMX)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ; This file should be assembled with NASM (Netwide Assembler),
12 ; and can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
19 %include "jsimdext.inc"
21 ; --------------------------------------------------------------------------
; Exported label jconst_fancy_upsample_mmx.
; NOTE(review): the data that follows this label is not visible in this
; excerpt. Presumably it holds the PW_ONE / PW_TWO / PW_THREE / PW_SEVEN /
; PW_EIGHT word constants that the fancy-upsampling routines below read
; through GOTOFF(ebx, ...) -- confirm against the full file.
25 global EXTN
(jconst_fancy_upsample_mmx
)
27 EXTN
(jconst_fancy_upsample_mmx
):
37 ; --------------------------------------------------------------------------
41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
43 ; The upsampling algorithm is linear interpolation between pixel centers,
44 ; also known as a "triangle filter". This is a good compromise between
45 ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4
46 ; of the way between input pixel centers.
49 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
50 ; JDIMENSION downsampled_width,
51 ; JSAMPARRAY input_data,
52 ; JSAMPARRAY * output_data_ptr);
; Argument accessors: each macro expands to the address expression of one
; stack argument relative to a frame base register b (+8, +12, +16, +20).
; NOTE(review): this excerpt does not show the prologue, the loop labels,
; most of the branches, several register copies, or the epilogue. The
; comments added in this block describe only the instructions that are
; visible.
55 %define max_v_samp
(b
) (b
)+8 ; int max_v_samp_factor
56 %define downsamp_width
(b
) (b
)+12 ; JDIMENSION downsampled_width
57 %define input_data
(b
) (b
)+16 ; JSAMPARRAY input_data
58 %define output_data_ptr
(b
) (b
)+20 ; JSAMPARRAY * output_data_ptr
; Register roles established by the visible loads:
;   ebx = GOT address (get_GOT), base for the GOTOFF(ebx, PW_*) operands
;   eax = downsampled_width, used as the column counter
;   ecx = max_v_samp_factor, used as the row counter
;   esi = input_data, then the current input row (inptr)
;   edi = *output_data_ptr, then the current output row (outptr)
;   mm0 = all zeros, second operand of the punpck?bw byte->word unpacks
61 global EXTN
(jsimd_h2v1_fancy_upsample_mmx
)
63 EXTN
(jsimd_h2v1_fancy_upsample_mmx
):
67 ; push ecx ; need not be preserved
68 ; push edx ; need not be preserved
72 get_GOT
ebx ; get GOT address
74 mov eax, JDIMENSION
[downsamp_width
(ebp)] ; colctr
78 mov ecx, INT [max_v_samp
(ebp)] ; rowctr
82 mov esi, JSAMPARRAY
[input_data
(ebp)] ; input_data
83 mov edi, POINTER
[output_data_ptr
(ebp)]
84 mov edi, JSAMPARRAY
[edi] ; output_data
91 mov esi, JSAMPROW
[esi] ; inptr
92 mov edi, JSAMPROW
[edi] ; outptr
; ZF is set when the column count is already a multiple of SIZEOF_MMWORD.
; The branch that consumes these flags is not visible in this excerpt.
94 test eax, SIZEOF_MMWORD
-1
; Copy the last sample of the row into the slot just past its end.
96 mov dl, JSAMPLE
[esi+(eax-1)*SIZEOF_JSAMPLE
]
97 mov JSAMPLE
[esi+eax*SIZEOF_JSAMPLE
], dl ; insert a dummy sample
99 pxor mm0
,mm0
; mm0=(all 0's)
; NOTE(review): the initialisation of mm7 before this shift is not visible
; here; the shift/mask pair keeps a single byte lane of the first MMWORD.
101 psrlq mm7
,(SIZEOF_MMWORD
-1)*BYTE_BIT
102 pand mm7
, MMWORD
[esi+0*SIZEOF_MMWORD
]
; Round the column count up to a whole number of MMWORDs, then compare it
; with one MMWORD (the conditional jump that follows is not visible).
104 add eax, byte SIZEOF_MMWORD
-1
105 and eax, byte -SIZEOF_MMWORD
106 cmp eax, byte SIZEOF_MMWORD
112 psllq mm6
,(SIZEOF_MMWORD
-1)*BYTE_BIT
113 pand mm6
, MMWORD
[esi+0*SIZEOF_MMWORD
]
118 movq mm6
, MMWORD
[esi+1*SIZEOF_MMWORD
]
119 psllq mm6
,(SIZEOF_MMWORD
-1)*BYTE_BIT
122 movq mm1
, MMWORD
[esi+0*SIZEOF_MMWORD
]
124 movq mm3
,mm1
; mm1=( 0 1 2 3 4 5 6 7)
125 psllq mm2
,BYTE_BIT
; mm2=( - 0 1 2 3 4 5 6)
126 psrlq mm3
,BYTE_BIT
; mm3=( 1 2 3 4 5 6 7 -)
128 por mm2
,mm7
; mm2=(-1 0 1 2 3 4 5 6)
129 por mm3
,mm6
; mm3=( 1 2 3 4 5 6 7 8)
132 psrlq mm7
,(SIZEOF_MMWORD
-1)*BYTE_BIT
; mm7=( 7 - - - - - - -)
; Widen bytes to words by interleaving with the zero register mm0.
135 punpcklbw mm1
,mm0
; mm1=( 0 1 2 3)
136 punpckhbw mm4
,mm0
; mm4=( 4 5 6 7)
138 punpcklbw mm2
,mm0
; mm2=(-1 0 1 2)
139 punpckhbw mm5
,mm0
; mm5=( 3 4 5 6)
141 punpcklbw mm3
,mm0
; mm3=( 1 2 3 4)
142 punpckhbw mm6
,mm0
; mm6=( 5 6 7 8)
; Centre samples are scaled by PW_THREE; the left/right neighbour words get
; the PW_ONE / PW_TWO bias before the >>2 below.
; NOTE(review): the adds that fold the scaled centre into mm2/mm5/mm3/mm6
; are not visible in this excerpt.
144 pmullw mm1
,[GOTOFF
(ebx,PW_THREE
)]
145 pmullw mm4
,[GOTOFF
(ebx,PW_THREE
)]
146 paddw mm2
,[GOTOFF
(ebx,PW_ONE
)]
147 paddw mm5
,[GOTOFF
(ebx,PW_ONE
)]
148 paddw mm3
,[GOTOFF
(ebx,PW_TWO
)]
149 paddw mm6
,[GOTOFF
(ebx,PW_TWO
)]
153 psrlw mm2
,2 ; mm2=OutLE=( 0 2 4 6)
154 psrlw mm5
,2 ; mm5=OutHE=( 8 10 12 14)
157 psrlw mm3
,2 ; mm3=OutLO=( 1 3 5 7)
158 psrlw mm6
,2 ; mm6=OutHO=( 9 11 13 15)
162 por mm2
,mm3
; mm2=OutL=( 0 1 2 3 4 5 6 7)
163 por mm5
,mm6
; mm5=OutH=( 8 9 10 11 12 13 14 15)
165 movq MMWORD
[edi+0*SIZEOF_MMWORD
], mm2
166 movq MMWORD
[edi+1*SIZEOF_MMWORD
], mm5
; Advance: one input MMWORD was consumed, two output MMWORDs were written.
168 sub eax, byte SIZEOF_MMWORD
169 add esi, byte 1*SIZEOF_MMWORD
; inptr
170 add edi, byte 2*SIZEOF_MMWORD
; outptr
171 cmp eax, byte SIZEOF_MMWORD
174 jnz near .columnloop_last
; Step both row-pointer arrays to their next entry.
180 add esi, byte SIZEOF_JSAMPROW
; input_data
181 add edi, byte SIZEOF_JSAMPROW
; output_data
185 emms
; empty MMX state
190 ; pop edx ; need not be preserved
191 ; pop ecx ; need not be preserved
196 ; --------------------------------------------------------------------------
198 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
199 ; Again a triangle filter; see comments for h2v1 case, above.
202 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
203 ; JDIMENSION downsampled_width,
204 ; JSAMPARRAY input_data,
205 ; JSAMPARRAY * output_data_ptr);
; Argument accessors: each macro expands to the address expression of one
; stack argument relative to a frame base register b (+8, +12, +16, +20).
; NOTE(review): this excerpt does not show the full prologue, the loop
; labels, most of the branches, several register copies, or the final
; epilogue instructions. The comments added in this block describe only the
; instructions that are visible.
208 %define max_v_samp
(b
) (b
)+8 ; int max_v_samp_factor
209 %define downsamp_width
(b
) (b
)+12 ; JDIMENSION downsampled_width
210 %define input_data
(b
) (b
)+16 ; JSAMPARRAY input_data
211 %define output_data_ptr
(b
) (b
)+20 ; JSAMPARRAY * output_data_ptr
; Locals addressed from the aligned ebp: wk(i) is an MMWORD scratch slot
; below ebp and gotptr is a pointer-sized slot just below wk(0).
; NOTE(review): the definition of WK_NUM is not visible in this excerpt.
213 %define original_ebp
ebp+0
214 %define wk
(i
) ebp-(WK_NUM
-(i
))*SIZEOF_MMWORD
; mmword wk[WK_NUM]
216 %define gotptr wk
(0)-SIZEOF_POINTER
; void * gotptr
; Register roles established by the visible loads:
;   eax, then edx = stack pointer value captured at entry; the arguments
;                   are addressed from it
;   ebp = esp aligned down to SIZEOF_MMWORD, base for wk() and gotptr
;   eax = downsampled_width, used as the column counter
;   ecx = max_v_samp_factor (rowctr), later the input row above (inptr1)
;   ebx = GOT address, later the current input row (inptr0); it is reloaded
;         from gotptr before the GOTOFF operands are used
;   esi = input_data, later the input row below (inptr1)
;   edx = output row 0 (outptr0), edi = output row 1 (outptr1)
219 global EXTN
(jsimd_h2v2_fancy_upsample_mmx
)
221 EXTN
(jsimd_h2v2_fancy_upsample_mmx
):
223 mov eax,esp ; eax = original ebp
225 and esp, byte (-SIZEOF_MMWORD
) ; align to 64 bits
227 mov ebp,esp ; ebp = aligned ebp
229 pushpic
eax ; make a room for GOT address
231 ; push ecx ; need not be preserved
232 ; push edx ; need not be preserved
236 get_GOT
ebx ; get GOT address
237 movpic POINTER
[gotptr
], ebx ; save GOT address
239 mov edx,eax ; edx = original ebp
240 mov eax, JDIMENSION
[downsamp_width
(edx)] ; colctr
244 mov ecx, INT [max_v_samp
(edx)] ; rowctr
248 mov esi, JSAMPARRAY
[input_data
(edx)] ; input_data
249 mov edi, POINTER
[output_data_ptr
(edx)]
250 mov edi, JSAMPARRAY
[edi] ; output_data
; Fetch the three input rows (above, current, below) and the two output
; rows for this iteration.
258 mov ecx, JSAMPROW
[esi-1*SIZEOF_JSAMPROW
] ; inptr1(above)
259 mov ebx, JSAMPROW
[esi+0*SIZEOF_JSAMPROW
] ; inptr0
260 mov esi, JSAMPROW
[esi+1*SIZEOF_JSAMPROW
] ; inptr1(below)
261 mov edx, JSAMPROW
[edi+0*SIZEOF_JSAMPROW
] ; outptr0
262 mov edi, JSAMPROW
[edi+1*SIZEOF_JSAMPROW
] ; outptr1
; ZF is set when the column count is already a multiple of SIZEOF_MMWORD.
; The branch that consumes these flags is not visible in this excerpt.
264 test eax, SIZEOF_MMWORD
-1
; Copy the last sample of each of the three input rows into the slot just
; past the end of that row.
; NOTE(review): dl is the low byte of edx, which holds outptr0 at this
; point; any save/restore of edx around these moves is not visible here.
267 mov dl, JSAMPLE
[ecx+(eax-1)*SIZEOF_JSAMPLE
]
268 mov JSAMPLE
[ecx+eax*SIZEOF_JSAMPLE
], dl
269 mov dl, JSAMPLE
[ebx+(eax-1)*SIZEOF_JSAMPLE
]
270 mov JSAMPLE
[ebx+eax*SIZEOF_JSAMPLE
], dl
271 mov dl, JSAMPLE
[esi+(eax-1)*SIZEOF_JSAMPLE
]
272 mov JSAMPLE
[esi+eax*SIZEOF_JSAMPLE
], dl ; insert a dummy sample
275 ; -- process the first column block
277 movq mm0
, MMWORD
[ebx+0*SIZEOF_MMWORD
] ; mm0=row[ 0][0]
278 movq mm1
, MMWORD
[ecx+0*SIZEOF_MMWORD
] ; mm1=row[-1][0]
279 movq mm2
, MMWORD
[esi+0*SIZEOF_MMWORD
] ; mm2=row[+1][0]
282 movpic
ebx, POINTER
[gotptr
] ; load GOT address
284 pxor mm3
,mm3
; mm3=(all 0's)
; Widen bytes to words by interleaving with the zero register mm3.
286 punpcklbw mm0
,mm3
; mm0=row[ 0][0]( 0 1 2 3)
287 punpckhbw mm4
,mm3
; mm4=row[ 0][0]( 4 5 6 7)
289 punpcklbw mm1
,mm3
; mm1=row[-1][0]( 0 1 2 3)
290 punpckhbw mm5
,mm3
; mm5=row[-1][0]( 4 5 6 7)
292 punpcklbw mm2
,mm3
; mm2=row[+1][0]( 0 1 2 3)
293 punpckhbw mm6
,mm3
; mm6=row[+1][0]( 4 5 6 7)
; Vertical pass: Int0 = 3*row[0] + row[-1], Int1 = 3*row[0] + row[+1],
; kept as unrounded 16-bit words.
295 pmullw mm0
,[GOTOFF
(ebx,PW_THREE
)]
296 pmullw mm4
,[GOTOFF
(ebx,PW_THREE
)]
299 psrlq mm7
,(SIZEOF_MMWORD
-2)*BYTE_BIT
301 paddw mm1
,mm0
; mm1=Int0L=( 0 1 2 3)
302 paddw mm5
,mm4
; mm5=Int0H=( 4 5 6 7)
303 paddw mm2
,mm0
; mm2=Int1L=( 0 1 2 3)
304 paddw mm6
,mm4
; mm6=Int1H=( 4 5 6 7)
; The output rows double as scratch space for the intermediate words.
306 movq MMWORD
[edx+0*SIZEOF_MMWORD
], mm1
; temporarily save
307 movq MMWORD
[edx+1*SIZEOF_MMWORD
], mm5
; the intermediate data
308 movq MMWORD
[edi+0*SIZEOF_MMWORD
], mm2
309 movq MMWORD
[edi+1*SIZEOF_MMWORD
], mm6
311 pand mm1
,mm7
; mm1=( 0 - - -)
312 pand mm2
,mm7
; mm2=( 0 - - -)
; wk(0) / wk(1) carry the left-neighbour word for the upper / lower row.
314 movq MMWORD
[wk
(0)], mm1
315 movq MMWORD
[wk
(1)], mm2
; Round the column count up to a whole number of MMWORDs, then compare it
; with one MMWORD (the conditional jump that follows is not visible).
319 add eax, byte SIZEOF_MMWORD
-1
320 and eax, byte -SIZEOF_MMWORD
321 cmp eax, byte SIZEOF_MMWORD
326 ; -- process the last column block
329 movpic
ebx, POINTER
[gotptr
] ; load GOT address
332 psllq mm1
,(SIZEOF_MMWORD
-2)*BYTE_BIT
335 pand mm1
, MMWORD
[edx+1*SIZEOF_MMWORD
] ; mm1=( - - - 7)
336 pand mm2
, MMWORD
[edi+1*SIZEOF_MMWORD
] ; mm2=( - - - 7)
; wk(2) / wk(3) carry the right-neighbour word for the upper / lower row.
338 movq MMWORD
[wk
(2)], mm1
339 movq MMWORD
[wk
(3)], mm2
345 ; -- process the next column block
347 movq mm0
, MMWORD
[ebx+1*SIZEOF_MMWORD
] ; mm0=row[ 0][1]
348 movq mm1
, MMWORD
[ecx+1*SIZEOF_MMWORD
] ; mm1=row[-1][1]
349 movq mm2
, MMWORD
[esi+1*SIZEOF_MMWORD
] ; mm2=row[+1][1]
352 movpic
ebx, POINTER
[gotptr
] ; load GOT address
354 pxor mm3
,mm3
; mm3=(all 0's)
356 punpcklbw mm0
,mm3
; mm0=row[ 0][1]( 0 1 2 3)
357 punpckhbw mm4
,mm3
; mm4=row[ 0][1]( 4 5 6 7)
359 punpcklbw mm1
,mm3
; mm1=row[-1][1]( 0 1 2 3)
360 punpckhbw mm5
,mm3
; mm5=row[-1][1]( 4 5 6 7)
362 punpcklbw mm2
,mm3
; mm2=row[+1][1]( 0 1 2 3)
363 punpckhbw mm6
,mm3
; mm6=row[+1][1]( 4 5 6 7)
; Same vertical pass as above, applied to the following input MMWORD.
365 pmullw mm0
,[GOTOFF
(ebx,PW_THREE
)]
366 pmullw mm4
,[GOTOFF
(ebx,PW_THREE
)]
368 paddw mm1
,mm0
; mm1=Int0L=( 0 1 2 3)
369 paddw mm5
,mm4
; mm5=Int0H=( 4 5 6 7)
370 paddw mm2
,mm0
; mm2=Int1L=( 0 1 2 3)
371 paddw mm6
,mm4
; mm6=Int1H=( 4 5 6 7)
373 movq MMWORD
[edx+2*SIZEOF_MMWORD
], mm1
; temporarily save
374 movq MMWORD
[edx+3*SIZEOF_MMWORD
], mm5
; the intermediate data
375 movq MMWORD
[edi+2*SIZEOF_MMWORD
], mm2
376 movq MMWORD
[edi+3*SIZEOF_MMWORD
], mm6
378 psllq mm1
,(SIZEOF_MMWORD
-2)*BYTE_BIT
; mm1=( - - - 0)
379 psllq mm2
,(SIZEOF_MMWORD
-2)*BYTE_BIT
; mm2=( - - - 0)
381 movq MMWORD
[wk
(2)], mm1
382 movq MMWORD
[wk
(3)], mm2
385 ; -- process the upper row
387 movq mm7
, MMWORD
[edx+0*SIZEOF_MMWORD
] ; mm7=Int0L=( 0 1 2 3)
388 movq mm3
, MMWORD
[edx+1*SIZEOF_MMWORD
] ; mm3=Int0H=( 4 5 6 7)
; NOTE(review): the register copies that seed mm0/mm4/mm5/mm6/mm1/mm2
; before the shifts below are not visible in this excerpt.
392 psrlq mm0
,2*BYTE_BIT
; mm0=( 1 2 3 -)
393 psllq mm4
,(SIZEOF_MMWORD
-2)*BYTE_BIT
; mm4=( - - - 4)
396 psrlq mm5
,(SIZEOF_MMWORD
-2)*BYTE_BIT
; mm5=( 3 - - -)
397 psllq mm6
,2*BYTE_BIT
; mm6=( - 4 5 6)
399 por mm0
,mm4
; mm0=( 1 2 3 4)
400 por mm5
,mm6
; mm5=( 3 4 5 6)
404 psllq mm1
,2*BYTE_BIT
; mm1=( - 0 1 2)
405 psrlq mm2
,2*BYTE_BIT
; mm2=( 5 6 7 -)
407 psrlq mm4
,(SIZEOF_MMWORD
-2)*BYTE_BIT
; mm4=( 7 - - -)
409 por mm1
, MMWORD
[wk
(0)] ; mm1=(-1 0 1 2)
410 por mm2
, MMWORD
[wk
(2)] ; mm2=( 5 6 7 8)
; Word 7 of this block is saved in wk(0) for later use as a left neighbour.
412 movq MMWORD
[wk
(0)], mm4
; Horizontal pass on the intermediate words: centre words are scaled by
; PW_THREE, neighbour words get the PW_EIGHT / PW_SEVEN bias, and the
; results are shifted right by 4 below.
; NOTE(review): the adds that fold the scaled centre into the neighbour
; registers are not visible in this excerpt.
414 pmullw mm7
,[GOTOFF
(ebx,PW_THREE
)]
415 pmullw mm3
,[GOTOFF
(ebx,PW_THREE
)]
416 paddw mm1
,[GOTOFF
(ebx,PW_EIGHT
)]
417 paddw mm5
,[GOTOFF
(ebx,PW_EIGHT
)]
418 paddw mm0
,[GOTOFF
(ebx,PW_SEVEN
)]
419 paddw mm2
,[GOTOFF
(ebx,PW_SEVEN
)]
423 psrlw mm1
,4 ; mm1=Out0LE=( 0 2 4 6)
424 psrlw mm5
,4 ; mm5=Out0HE=( 8 10 12 14)
427 psrlw mm0
,4 ; mm0=Out0LO=( 1 3 5 7)
428 psrlw mm2
,4 ; mm2=Out0HO=( 9 11 13 15)
432 por mm1
,mm0
; mm1=Out0L=( 0 1 2 3 4 5 6 7)
433 por mm5
,mm2
; mm5=Out0H=( 8 9 10 11 12 13 14 15)
; The final samples overwrite the intermediate words held in output row 0.
435 movq MMWORD
[edx+0*SIZEOF_MMWORD
], mm1
436 movq MMWORD
[edx+1*SIZEOF_MMWORD
], mm5
438 ; -- process the lower row
440 movq mm6
, MMWORD
[edi+0*SIZEOF_MMWORD
] ; mm6=Int1L=( 0 1 2 3)
441 movq mm4
, MMWORD
[edi+1*SIZEOF_MMWORD
] ; mm4=Int1H=( 4 5 6 7)
445 psrlq mm7
,2*BYTE_BIT
; mm7=( 1 2 3 -)
446 psllq mm3
,(SIZEOF_MMWORD
-2)*BYTE_BIT
; mm3=( - - - 4)
449 psrlq mm0
,(SIZEOF_MMWORD
-2)*BYTE_BIT
; mm0=( 3 - - -)
450 psllq mm2
,2*BYTE_BIT
; mm2=( - 4 5 6)
452 por mm7
,mm3
; mm7=( 1 2 3 4)
453 por mm0
,mm2
; mm0=( 3 4 5 6)
457 psllq mm1
,2*BYTE_BIT
; mm1=( - 0 1 2)
458 psrlq mm5
,2*BYTE_BIT
; mm5=( 5 6 7 -)
460 psrlq mm3
,(SIZEOF_MMWORD
-2)*BYTE_BIT
; mm3=( 7 - - -)
462 por mm1
, MMWORD
[wk
(1)] ; mm1=(-1 0 1 2)
463 por mm5
, MMWORD
[wk
(3)] ; mm5=( 5 6 7 8)
; Word 7 of this block is saved in wk(1) for later use as a left neighbour.
465 movq MMWORD
[wk
(1)], mm3
; Same horizontal pass as for the upper row.
467 pmullw mm6
,[GOTOFF
(ebx,PW_THREE
)]
468 pmullw mm4
,[GOTOFF
(ebx,PW_THREE
)]
469 paddw mm1
,[GOTOFF
(ebx,PW_EIGHT
)]
470 paddw mm0
,[GOTOFF
(ebx,PW_EIGHT
)]
471 paddw mm7
,[GOTOFF
(ebx,PW_SEVEN
)]
472 paddw mm5
,[GOTOFF
(ebx,PW_SEVEN
)]
476 psrlw mm1
,4 ; mm1=Out1LE=( 0 2 4 6)
477 psrlw mm0
,4 ; mm0=Out1HE=( 8 10 12 14)
480 psrlw mm7
,4 ; mm7=Out1LO=( 1 3 5 7)
481 psrlw mm5
,4 ; mm5=Out1HO=( 9 11 13 15)
485 por mm1
,mm7
; mm1=Out1L=( 0 1 2 3 4 5 6 7)
486 por mm0
,mm5
; mm0=Out1H=( 8 9 10 11 12 13 14 15)
488 movq MMWORD
[edi+0*SIZEOF_MMWORD
], mm1
489 movq MMWORD
[edi+1*SIZEOF_MMWORD
], mm0
; Advance: one MMWORD consumed from each input row, two MMWORDs written to
; each output row.
493 sub eax, byte SIZEOF_MMWORD
494 add ecx, byte 1*SIZEOF_MMWORD
; inptr1(above)
495 add ebx, byte 1*SIZEOF_MMWORD
; inptr0
496 add esi, byte 1*SIZEOF_MMWORD
; inptr1(below)
497 add edx, byte 2*SIZEOF_MMWORD
; outptr0
498 add edi, byte 2*SIZEOF_MMWORD
; outptr1
499 cmp eax, byte SIZEOF_MMWORD
502 jnz near .columnloop_last
; Next iteration: one input row pointer forward, two output row pointers
; forward, and the row counter drops by two.
509 add esi, byte 1*SIZEOF_JSAMPROW
; input_data
510 add edi, byte 2*SIZEOF_JSAMPROW
; output_data
511 sub ecx, byte 2 ; rowctr
514 emms
; empty MMX state
519 ; pop edx ; need not be preserved
520 ; pop ecx ; need not be preserved
; Undo the stack alignment: esp is reloaded from the word at [ebp]
; (original_ebp). The push that stored that word is not visible here.
522 mov esp,ebp ; esp <- aligned ebp
523 pop esp ; esp <- original ebp
527 ; --------------------------------------------------------------------------
529 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
530 ; It's still a box filter.
533 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
534 ; JDIMENSION output_width,
535 ; JSAMPARRAY input_data,
536 ; JSAMPARRAY * output_data_ptr);
; Argument accessors: each macro expands to the address expression of one
; stack argument relative to a frame base register b (+8, +12, +16, +20).
; NOTE(review): this excerpt does not show the prologue, the loop labels,
; the conditional branches, the load of eax, the instructions that produce
; mm1 and mm3, or the epilogue. The comments added in this block describe
; only the instructions that are visible.
539 %define max_v_samp
(b
) (b
)+8 ; int max_v_samp_factor
540 %define output_width
(b
) (b
)+12 ; JDIMENSION output_width
541 %define input_data
(b
) (b
)+16 ; JSAMPARRAY input_data
542 %define output_data_ptr
(b
) (b
)+20 ; JSAMPARRAY * output_data_ptr
; Register roles established by the visible loads:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   ecx = max_v_samp_factor, used as the row counter
;   esi = input_data, then the current input row (inptr)
;   edi = *output_data_ptr, then the current output row (outptr)
;   eax = remaining output bytes, decremented by 2*SIZEOF_MMWORD per step
545 global EXTN
(jsimd_h2v1_upsample_mmx
)
547 EXTN
(jsimd_h2v1_upsample_mmx
):
551 ; push ecx ; need not be preserved
552 ; push edx ; need not be preserved
556 mov edx, JDIMENSION
[output_width
(ebp)]
557 add edx, byte (2*SIZEOF_MMWORD
)-1
558 and edx, byte -(2*SIZEOF_MMWORD
)
561 mov ecx, INT [max_v_samp
(ebp)] ; rowctr
565 mov esi, JSAMPARRAY
[input_data
(ebp)] ; input_data
566 mov edi, POINTER
[output_data_ptr
(ebp)]
567 mov edi, JSAMPARRAY
[edi] ; output_data
573 mov esi, JSAMPROW
[esi] ; inptr
574 mov edi, JSAMPROW
[edi] ; outptr
; First half of the unrolled step: input MMWORD 0 -> output MMWORDs 0 and 1.
579 movq mm0
, MMWORD
[esi+0*SIZEOF_MMWORD
]
585 movq MMWORD
[edi+0*SIZEOF_MMWORD
], mm0
586 movq MMWORD
[edi+1*SIZEOF_MMWORD
], mm1
588 sub eax, byte 2*SIZEOF_MMWORD
; Second half: input MMWORD 1 -> output MMWORDs 2 and 3.
591 movq mm2
, MMWORD
[esi+1*SIZEOF_MMWORD
]
597 movq MMWORD
[edi+2*SIZEOF_MMWORD
], mm2
598 movq MMWORD
[edi+3*SIZEOF_MMWORD
], mm3
600 sub eax, byte 2*SIZEOF_MMWORD
; Advance: two input MMWORDs consumed, four output MMWORDs written.
603 add esi, byte 2*SIZEOF_MMWORD
; inptr
604 add edi, byte 4*SIZEOF_MMWORD
; outptr
605 jmp short .columnloop
; Step both row-pointer arrays to their next entry.
612 add esi, byte SIZEOF_JSAMPROW
; input_data
613 add edi, byte SIZEOF_JSAMPROW
; output_data
617 emms
; empty MMX state
622 ; pop edx ; need not be preserved
623 ; pop ecx ; need not be preserved
628 ; --------------------------------------------------------------------------
630 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
631 ; It's still a box filter.
634 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
635 ; JDIMENSION output_width,
636 ; JSAMPARRAY input_data,
637 ; JSAMPARRAY * output_data_ptr);
; Argument accessors: each macro expands to the address expression of one
; stack argument relative to a frame base register b (+8, +12, +16, +20).
; NOTE(review): this excerpt does not show the prologue, the loop labels,
; the conditional branches, the load of eax, the instructions that produce
; mm1 and mm3, or the epilogue. The comments added in this block describe
; only the instructions that are visible.
640 %define max_v_samp
(b
) (b
)+8 ; int max_v_samp_factor
641 %define output_width
(b
) (b
)+12 ; JDIMENSION output_width
642 %define input_data
(b
) (b
)+16 ; JSAMPARRAY input_data
643 %define output_data_ptr
(b
) (b
)+20 ; JSAMPARRAY * output_data_ptr
; Register roles established by the visible loads:
;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
;   ecx = max_v_samp_factor, used as the row counter
;   esi = input_data, then the current input row (inptr)
;   edi = *output_data_ptr, then output row 1 (outptr1)
;   ebx = output row 0 (outptr0)
;   eax = remaining output bytes, decremented by 2*SIZEOF_MMWORD per step
646 global EXTN
(jsimd_h2v2_upsample_mmx
)
648 EXTN
(jsimd_h2v2_upsample_mmx
):
652 ; push ecx ; need not be preserved
653 ; push edx ; need not be preserved
657 mov edx, JDIMENSION
[output_width
(ebp)]
658 add edx, byte (2*SIZEOF_MMWORD
)-1
659 and edx, byte -(2*SIZEOF_MMWORD
)
662 mov ecx, INT [max_v_samp
(ebp)] ; rowctr
666 mov esi, JSAMPARRAY
[input_data
(ebp)] ; input_data
667 mov edi, POINTER
[output_data_ptr
(ebp)]
668 mov edi, JSAMPARRAY
[edi] ; output_data
674 mov esi, JSAMPROW
[esi] ; inptr
675 mov ebx, JSAMPROW
[edi+0*SIZEOF_JSAMPROW
] ; outptr0
676 mov edi, JSAMPROW
[edi+1*SIZEOF_JSAMPROW
] ; outptr1
; First half of the unrolled step: input MMWORD 0 -> output MMWORDs 0 and 1,
; with the same register pair stored to both output rows.
681 movq mm0
, MMWORD
[esi+0*SIZEOF_MMWORD
]
687 movq MMWORD
[ebx+0*SIZEOF_MMWORD
], mm0
688 movq MMWORD
[ebx+1*SIZEOF_MMWORD
], mm1
689 movq MMWORD
[edi+0*SIZEOF_MMWORD
], mm0
690 movq MMWORD
[edi+1*SIZEOF_MMWORD
], mm1
692 sub eax, byte 2*SIZEOF_MMWORD
; Second half: input MMWORD 1 -> output MMWORDs 2 and 3 of both rows.
695 movq mm2
, MMWORD
[esi+1*SIZEOF_MMWORD
]
701 movq MMWORD
[ebx+2*SIZEOF_MMWORD
], mm2
702 movq MMWORD
[ebx+3*SIZEOF_MMWORD
], mm3
703 movq MMWORD
[edi+2*SIZEOF_MMWORD
], mm2
704 movq MMWORD
[edi+3*SIZEOF_MMWORD
], mm3
706 sub eax, byte 2*SIZEOF_MMWORD
; Advance: two input MMWORDs consumed, four MMWORDs written to each row.
709 add esi, byte 2*SIZEOF_MMWORD
; inptr
710 add ebx, byte 4*SIZEOF_MMWORD
; outptr0
711 add edi, byte 4*SIZEOF_MMWORD
; outptr1
712 jmp short .columnloop
; Next iteration: one input row pointer forward, two output row pointers
; forward, and the row counter drops by two.
719 add esi, byte 1*SIZEOF_JSAMPROW
; input_data
720 add edi, byte 2*SIZEOF_JSAMPROW
; output_data
721 sub ecx, byte 2 ; rowctr
724 emms
; empty MMX state
729 ; pop edx ; need not be preserved
730 ; pop ecx ; need not be preserved
735 ; For some reason, the OS X linker does not honor the request to align the
736 ; segment unless we do this.