Increase ParseScript cache from 30 to 90 seconds
[xy_vsfilter.git] / src / dsutil / a_yuv2rgb.asm
bloba0b1d7366717d393c53a52263435704c466b8dfb
1 ; VirtualDub - Video processing and capture application
2 ; Copyright (C) 1998-2001 Avery Lee
4 ; This program is free software; you can redistribute it and/or modify
5 ; it under the terms of the GNU General Public License as published by
6 ; the Free Software Foundation; either version 2 of the License, or
7 ; (at your option) any later version.
9 ; This program is distributed in the hope that it will be useful,
10 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
11 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 ; GNU General Public License for more details.
14 ; You should have received a copy of the GNU General Public License
15 ; along with this program; if not, write to the Free Software
16 ; Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
; Assembler setup (MASM): enable the P6 (.686) instruction set plus the MMX
; and SSE mnemonics used by the routines below, in the flat 32-bit model.
18 .686
19 .mmx
20 .xmm
21 .model flat
; Lookup tables defined in another module.
;  _YUV_Y_table / _YUV_U_table / _YUV_V_table: dword entries, indexed by an
;    8-bit sample (the routines address them as table + sample*4).  The
;    routines treat each entry as two packed 16-bit halves.
;  _YUV_clip_table / _YUV_clip_table16: byte entries, always addressed with
;    a -3f00h bias on the index.  They turn a 16-bit channel sum into the
;    stored channel byte (full byte for the 24/32-bit routines; the 16-bit
;    routine shifts the result into a packed pixel).
;    NOTE(review): table contents are not visible here - confirm ranges
;    against the module that builds them.
23 extern _YUV_Y_table: dword
24 extern _YUV_U_table: dword
25 extern _YUV_V_table: dword
26 extern _YUV_clip_table: byte
27 extern _YUV_clip_table16: byte
; Read-only 64-bit constants for the MMX/ISSE routines.  Each quadword is
; four 16-bit lanes; lanes are listed below from high to low as written.
29 .const
31 align 16
; Sample biases and byte-lane masks.
33 MMX_10w dq 00010001000100010h ;4 x 0010h (16): subtracted from luma words
34 MMX_80w dq 00080008000800080h ;4 x 0080h (128): subtracted from U/V words
35 MMX_00FFw dq 000FF00FF00FF00FFh ;keeps the low byte of each word (even luma bytes)
36 MMX_FF00w dq 0FF00FF00FF00FF00h ;high byte of each word; not referenced in the visible part of this file
; Fixed-point conversion coefficients.  The products are shifted right by
; 6 (or by 4 for 16-bit green) after summing, i.e. they carry 6 fractional
; bits.
37 MMX_Ublucoeff dq 00081008100810081h ;4 x 0081h (129): U contribution to blue
38 MMX_Vredcoeff dq 00066006600660066h ;4 x 0066h (102): V contribution to red
39 MMX_Ugrncoeff dq 0FFE7FFE7FFE7FFE7h ;4 x FFE7h (-25): U contribution to green
40 MMX_Vgrncoeff dq 0FFCCFFCCFFCCFFCCh ;4 x FFCCh (-52): V contribution to green
41 MMX_Ycoeff dq 0004A004A004A004Ah ;4 x 004Ah (74): luma scale
; Packing helpers for the 16-bit MMX routine.
42 MMX_rbmask dq 07c1f7c1f7c1f7c1fh ;bits 14-10 and 4-0 of each word (red and blue fields)
43 MMX_grnmask dq 003e003e003e003e0h ;bits 9-5 of each word (green field)
44 MMX_grnmask2 dq 000f800f800f800f8h ;bits 7-3 of each word; not referenced in the visible part of this file
45 MMX_clip dq 07c007c007c007c00h ;added with signed saturation then subtracted with unsigned saturation to clamp green to 0..03ffh
; Per-lane chroma coefficients for the 24-bit MMX routine.  Each quadword
; covers one 4-byte group of packed output; reading lanes from low to high
; the groups are  0: B G R B   1: G R B G   2: R B G R.
; U uses 129 for blue, -25 for green, 0 for red; V uses 0 for blue,
; -52 for green, 102 for red.
47 MMX_Ucoeff0 dq 000810000FFE70081h
48 MMX_Ucoeff1 dq 0FFE700810000FFE7h
49 MMX_Ucoeff2 dq 00000FFE700810000h
50 MMX_Vcoeff0 dq 000000066FFCC0000h
51 MMX_Vcoeff1 dq 0FFCC00000066FFCCh
52 MMX_Vcoeff2 dq 00066FFCC00000066h
; Code segment.  Nine entry points are exported: a table-driven scalar
; version, an MMX version and an ISSE version for each of the 32-, 24- and
; 16-bit destination formats.  Only the 32-bit ISSE entry point begins in
; the visible part of this file.
54 .code
56 public _asm_YUVtoRGB32_row
57 public _asm_YUVtoRGB24_row
58 public _asm_YUVtoRGB16_row
59 public _asm_YUVtoRGB32_row_MMX
60 public _asm_YUVtoRGB24_row_MMX
61 public _asm_YUVtoRGB16_row_MMX
62 public _asm_YUVtoRGB32_row_ISSE
63 public _asm_YUVtoRGB24_row_ISSE
64 public _asm_YUVtoRGB16_row_ISSE
; Shared prototype of the row converters.  Every routine converts two luma
; rows (Y1, Y2) that share one U row and one V row into two destination rows
; (ARGB1, ARGB2).  The routines add count to the U/V pointers and 2*count to
; the Y pointers, so the last argument counts chroma samples (horizontal
; pixel pairs).
66 ; asm_YUVtoRGB_row(
67 ; Pixel *ARGB1_pointer,
68 ; Pixel *ARGB2_pointer,
69 ; YUVPixel *Y1_pointer,
70 ; YUVPixel *Y2_pointer,
71 ; YUVPixel *U_pointer,
72 ; YUVPixel *V_pointer,
73 ; long width
74 ; );
; Stack-relative names for the seven arguments.  The first term is the
; argument's offset above the return address; the +16 accounts for the four
; registers each routine pushes on entry.  The names are therefore only
; valid after exactly four dword pushes and while esp is otherwise unchanged.
76 ARGB1_pointer equ [esp+ 4+16]
77 ARGB2_pointer equ [esp+ 8+16]
78 Y1_pointer equ [esp+12+16]
79 Y2_pointer equ [esp+16+16]
80 U_pointer equ [esp+20+16]
81 V_pointer equ [esp+24+16]
82 count equ [esp+28+16]
;--------------------------------------------------------------------------
; _asm_YUVtoRGB32_row - table-driven conversion of two rows to 32-bit pixels
;
; Arguments (on the stack, read through the ARGB1_pointer..count equates):
;   ARGB1_pointer, ARGB2_pointer  destination rows, one dword per pixel
;   Y1_pointer, Y2_pointer        luma rows, one byte per pixel
;   U_pointer, V_pointer          chroma rows, one byte per horizontal pixel
;                                 pair, shared by both luma rows
;   count                         number of chroma samples to process; must
;                                 be at least 1 because the counter is only
;                                 tested at the bottom of the loop
;
; Each pass handles one U/V pair and four pixels: [1],[2] are the two pixels
; of row 1, [3],[4] the two pixels of row 2, [C] is chroma work shared by
; all four.  Every pixel is stored as a dword with blue in bits 0-7, green
; in bits 8-15, red in bits 16-23 and zero in bits 24-31.
;
; The four pixel pipelines are interleaved; pixel [4] of one pass is
; finished at the top of the next pass (or after the loop for the last one).
;
; Side effects: the pointer arguments are advanced in place in the caller's
; argument slots.  ebx, esi, edi and ebp are preserved; eax, ecx, edx and
; the flags are not.
;--------------------------------------------------------------------------
84 _asm_YUVtoRGB32_row:
85 push ebx
86 push esi
87 push edi
88 push ebp
; Move every pointer to the end of its row and index backwards with a
; negative counter that runs up to zero.
90 mov eax,count
91 mov ebp,eax
92 mov ebx,eax
93 shl ebx,3 ;ebx = count*8 = destination bytes per row
94 add eax,eax ;eax = count*2 = luma bytes per row
95 add ARGB1_pointer,ebx
96 add ARGB2_pointer,ebx
97 add Y1_pointer,eax
98 add Y2_pointer,eax
99 add U_pointer,ebp
100 add V_pointer,ebp
101 neg ebp ;ebp = -count, counts up to 0
103 mov esi,U_pointer ;[C]
104 mov edi,V_pointer ;[C]
105 xor edx,edx ;[C]
106 xor ecx,ecx ;[C]
107 jmp short col_loop_start ;first pass has no pending pixel [4] to finish
; Loop top: finish pixel [4] of the previous pass while reloading the chroma
; pointers.  eax = ARGB2_pointer, ebx = red sum, edx = blue sum, cl = green.
109 col_loop:
110 mov ch,[_YUV_clip_table+ebx-3f00h] ;[4] ecx = [0][0][red][green]
111 mov esi,U_pointer ;[C]
112 shl ecx,8 ;[4] ecx = [0][red][green][0]
113 mov edi,V_pointer ;[C]
114 mov cl,[_YUV_clip_table+edx-3f00h] ;[4] ecx = [0][r][g][b] !!
115 xor edx,edx ;[C]
116 mov [eax+ebp*8-4],ecx ;[4] ebp is already incremented, so -4 is the previous pair's second pixel
117 xor ecx,ecx ;[C]
118 col_loop_start:
119 mov cl,[esi + ebp] ;[C] ecx = U
120 mov dl,[edi + ebp] ;[C] edx = V
121 mov eax,Y1_pointer ;[1]
122 xor ebx,ebx ;[1]
123 mov esi,[_YUV_U_table + ecx*4] ;[C] esi = [b impact][u-g impact]
124 mov ecx,[_YUV_V_table + edx*4] ;[C] ecx = [r impact][v-g impact]
125 mov edi,esi ;[C]
126 mov bl,[eax + ebp*2] ;[1] ebx = Y1 value
127 shr esi,16 ;[C] esi = blue impact
128 add edi,ecx ;[C] edi = [junk][g impact]
129 mov ebx,[_YUV_Y_table + ebx*4] ;[1] ebx = Y impact
130 and ecx,0ffff0000h ;[C] ecx = [r impact][0]
131 mov edx,ebx ;[1] edx = Y impact
132 add esi,ecx ;[C] esi = [r impact][b impact]
133 and edi,0000ffffh ;[C] edi = g impact
134 add ebx,esi ;[1] ebx = [red][blue]
135 mov ecx,ebx ;[1] ecx = [red][blue]
136 and edx,0000ffffh ;[1] edx = Y impact, low half only
137 shr ebx,16 ;[1] ebx = red
138 and ecx,0000ffffh ;[1] ecx = blue
139 mov dl,[_YUV_clip_table+edx+edi-3f00h] ;[1] edx = [0][0][junk][green]
140 mov eax,Y1_pointer ;[2]
141 mov dh,[_YUV_clip_table+ebx-3f00h] ;[1] edx = [0][0][red][green]
142 xor ebx,ebx ;[2]
143 shl edx,8 ;[1] edx = [0][red][green][0]
144 mov bl,[eax + ebp*2 + 1] ;[2] ebx = Y1 value (second pixel)
145 mov eax,ARGB1_pointer ;[1]
146 mov dl,[_YUV_clip_table+ecx-3f00h] ;[1] edx = [0][r][g][b] !!
147 mov ebx,[_YUV_Y_table + ebx*4] ;[2] ebx = Y impact
148 mov ecx,0000ffffh ;[2]
150 and ecx,ebx ;[2] ecx = Y impact, low half only
151 add ebx,esi ;[2] ebx = [red][blue]
153 mov [eax+ebp*8],edx ;[1]
154 mov edx,ebx ;[2]
156 shr ebx,16 ;[2] ebx = red
157 mov eax,Y2_pointer ;[3]
159 and edx,0000ffffh ;[2] edx = blue
160 mov cl,[_YUV_clip_table+ecx+edi-3f00h] ;[2] ecx = [0][0][junk][green]
162 mov al,[eax + ebp*2] ;[3] al = Y2 value
163 mov ch,[_YUV_clip_table+ebx-3f00h] ;[2] ecx = [0][0][red][green]
165 shl ecx,8 ;[2] ecx = [0][red][green][0]
166 and eax,000000ffh ;[3] eax = Y2 value
168 mov cl,[_YUV_clip_table+edx-3f00h] ;[2] ecx = [0][r][g][b] !!
169 mov edx,ARGB1_pointer ;[2]
171 mov ebx,[_YUV_Y_table + eax*4] ;[3] ebx = Y impact
172 mov eax,0000ffffh
174 and eax,ebx ;[3] eax = Y impact, low half only
175 add ebx,esi ;[3] ebx = [red][blue]
177 mov [edx+ebp*8+4],ecx ;[2]
178 mov edx,ebx ;[3]
180 shr ebx,16 ;[3] ebx = red
181 mov ecx,Y2_pointer ;[4]
183 and edx,0000ffffh ;[3] edx = blue
184 mov al,[_YUV_clip_table+eax+edi-3f00h] ;[3] eax = [0][0][junk][green]
186 mov cl,[ecx + ebp*2+1] ;[4] cl = Y2 value (second pixel)
187 mov ah,[_YUV_clip_table+ebx-3f00h] ;[3] eax = [0][0][red][green]
189 shl eax,8 ;[3] eax = [0][red][green][0]
190 and ecx,000000ffh ;[4] ecx = Y2 value
192 mov al,[_YUV_clip_table+edx-3f00h] ;[3] eax = [0][r][g][b] !!
193 mov edx,ARGB2_pointer ;[3]
195 mov ebx,[_YUV_Y_table + ecx*4] ;[4] ebx = Y impact
196 mov ecx,0000ffffh ;[4]
198 and ecx,ebx ;[4] ecx = [0][Y-impact]
199 add ebx,esi ;[4] ebx = [red][blue]
201 mov [edx+ebp*8],eax ;[3]
202 mov edx,ebx ;[4] edx = [red][blue]
204 shr ebx,16 ;[4] ebx = red
205 mov cl,[_YUV_clip_table+ecx+edi-3f00h] ;[4] ecx = [0][0][junk][green]
207 and edx,0000ffffh ;[4] edx = blue
208 mov eax,ARGB2_pointer ;[4]
210 inc ebp
212 jnz col_loop
; Finish pixel [4] of the final pass (same sequence as the loop top).
214 mov ch,[_YUV_clip_table+ebx-3f00h] ;[4] ecx = [0][0][red][green]
215 shl ecx,8 ;[4] ecx = [0][red][green][0]
216 mov cl,[_YUV_clip_table+edx-3f00h] ;[4] ecx = [0][r][g][b] !!
217 mov [eax+ebp*8-4],ecx ;[4]
219 pop ebp
220 pop edi
221 pop esi
222 pop ebx
; Return to the caller; the arguments stay on the stack for the caller to
; remove.  Without this the code would run on into the next routine.
223 ret
225 ;MMX_test dq 7060504030201000h
;--------------------------------------------------------------------------
; _asm_YUVtoRGB32_row_MMX - MMX conversion of two rows to 32-bit pixels
;
; Same arguments as _asm_YUVtoRGB32_row.  Each pass consumes 4 U bytes,
; 4 V bytes and 8 luma bytes from each row, and writes 8 pixels (32 bytes)
; to each destination row.
;
; count must be a nonzero multiple of 4: the counter steps by 4 and the loop
; only ends when it lands exactly on zero.
;
; Arithmetic: U and V have 128 subtracted, luma has 16 subtracted, the
; values are multiplied by the MMX_*coeff constants, summed per channel,
; shifted right by 6 and packed with unsigned saturation.
;
; Stored pixel bytes, low to high: blue, green, red, green (the top byte is
; a copy of green, not zero as in the scalar routine).
;
; In the lane comments the indices 0..3 name the four lanes of a register.
; For the first half of each row block those lanes are the even pixels, for
; the second half the odd pixels; "P0".."P7" are true pixel positions.
;
; Side effects: pointer arguments are advanced in place on the stack.
; ebx, esi, edi, ebp are preserved.  mm0-mm7 are overwritten and no EMMS is
; executed here.  NOTE(review): presumably the caller issues EMMS - confirm.
;--------------------------------------------------------------------------
227 _asm_YUVtoRGB32_row_MMX:
228 push ebx
229 push esi
230 push edi
231 push ebp
; Advance all pointers to the row ends; ebp runs from -count up to 0.
233 mov eax,count
234 mov ebp,eax
235 mov ebx,eax
236 shl ebx,3
237 add eax,eax
238 add ARGB1_pointer,ebx
239 add ARGB2_pointer,ebx
240 add Y1_pointer,eax
241 add Y2_pointer,eax
242 add U_pointer,ebp
243 add V_pointer,ebp
244 neg ebp
; Register roles inside the loop: esi=U, edi=V, ecx=Y1, edx=Y2,
; eax=ARGB1, ebx=ARGB2 (all end-of-row pointers).
246 mov esi,U_pointer
247 mov edi,V_pointer
248 mov ecx,Y1_pointer
249 mov edx,Y2_pointer
250 mov eax,ARGB1_pointer
251 mov ebx,ARGB2_pointer
253 col_loop_MMX:
; Chroma contributions, shared by both rows.
254 movd mm0, dword ptr [esi+ebp] ;U (byte)
255 pxor mm7,mm7
257 movd mm1, dword ptr [edi+ebp] ;V (byte)
258 punpcklbw mm0,mm7 ;U (word)
260 psubw mm0,MMX_80w
261 punpcklbw mm1,mm7 ;V (word)
263 psubw mm1,MMX_80w
264 movq mm2,mm0
266 pmullw mm2,MMX_Ugrncoeff
267 movq mm3,mm1
269 pmullw mm3,MMX_Vgrncoeff
270 pmullw mm0,MMX_Ublucoeff
271 pmullw mm1,MMX_Vredcoeff
272 paddw mm2,mm3
274 ;mm0: blue
275 ;mm1: red
276 ;mm2: green
; Row 1, even pixels (low byte of each luma word).
278 movq mm6,[ecx+ebp*2] ;Y
279 pand mm6,MMX_00FFw
280 psubw mm6,MMX_10w
281 pmullw mm6,MMX_Ycoeff
282 movq mm4,mm6
283 paddw mm6,mm0 ;mm6: <B3><B2><B1><B0>
284 movq mm5,mm4
285 paddw mm4,mm1 ;mm4: <R3><R2><R1><R0>
286 paddw mm5,mm2 ;mm5: <G3><G2><G1><G0>
287 psraw mm6,6
288 psraw mm4,6
289 packuswb mm6,mm6 ;mm6: B3B2B1B0B3B2B1B0
290 psraw mm5,6
291 packuswb mm4,mm4 ;mm4: R3R2R1R0R3R2R1R0
292 punpcklbw mm6,mm4 ;mm6: R3B3R2B2R1B1R0B0
293 packuswb mm5,mm5 ;mm5: G3G2G1G0G3G2G1G0
294 punpcklbw mm5,mm5 ;mm5: G3G3G2G2G1G1G0G0
295 movq mm4,mm6
296 punpcklbw mm6,mm5 ;mm6: G1R1G1B1G0R0G0B0
297 punpckhbw mm4,mm5 ;mm4: G3R3G3B3G2R2G2B2
; Row 1, odd pixels (high byte of each luma word).
299 movq mm7,[ecx+ebp*2] ;Y
300 psrlw mm7,8
301 psubw mm7,MMX_10w
302 pmullw mm7,MMX_Ycoeff
303 movq mm3,mm7
304 paddw mm7,mm0 ;mm7: final blue
305 movq mm5,mm3
306 paddw mm3,mm1 ;mm3: final red
307 paddw mm5,mm2 ;mm5: final green
308 psraw mm7,6
309 psraw mm3,6
310 packuswb mm7,mm7 ;mm7: B3B2B1B0B3B2B1B0
311 psraw mm5,6
312 packuswb mm3,mm3 ;mm3: R3R2R1R0R3R2R1R0
313 punpcklbw mm7,mm3 ;mm7: R3B3R2B2R1B1R0B0
314 packuswb mm5,mm5 ;mm5: G3G2G1G0G3G2G1G0
315 punpcklbw mm5,mm5 ;mm5: G3G3G2G2G1G1G0G0
316 movq mm3,mm7
317 punpcklbw mm7,mm5 ;mm7: G1R1G1B1G0R0G0B0
318 punpckhbw mm3,mm5 ;mm3: G3R3G3B3G2R2G2B2
320 ;mm3 P7:P5
321 ;mm4 P6:P4
322 ;mm6 P2:P0
323 ;mm7 P3:P1
; Interleave even and odd pixels back into display order and store row 1.
325 movq mm5,mm6
326 punpckldq mm5,mm7 ;P1:P0
327 punpckhdq mm6,mm7 ;P3:P2
328 movq mm7,mm4
329 punpckldq mm4,mm3 ;P5:P4
330 punpckhdq mm7,mm3 ;P7:P6
332 movq [eax+ebp*8],mm5
333 movq [eax+ebp*8+8],mm6
334 movq [eax+ebp*8+16],mm4
335 movq [eax+ebp*8+24],mm7
; Row 2, even pixels; mm0-mm2 still hold the chroma contributions.
337 movq mm6,[edx+ebp*2] ;Y
338 pand mm6,MMX_00FFw
339 psubw mm6,MMX_10w
340 pmullw mm6,MMX_Ycoeff
341 movq mm4,mm6
342 paddw mm6,mm0 ;mm6: <B3><B2><B1><B0>
343 movq mm5,mm4
344 paddw mm4,mm1 ;mm4: <R3><R2><R1><R0>
345 paddw mm5,mm2 ;mm5: <G3><G2><G1><G0>
346 psraw mm6,6
347 psraw mm4,6
348 packuswb mm6,mm6 ;mm6: B3B2B1B0B3B2B1B0
349 psraw mm5,6
350 packuswb mm4,mm4 ;mm4: R3R2R1R0R3R2R1R0
351 punpcklbw mm6,mm4 ;mm6: R3B3R2B2R1B1R0B0
352 packuswb mm5,mm5 ;mm5: G3G2G1G0G3G2G1G0
353 punpcklbw mm5,mm5 ;mm5: G3G3G2G2G1G1G0G0
354 movq mm4,mm6
355 punpcklbw mm6,mm5 ;mm6: G1R1G1B1G0R0G0B0
356 punpckhbw mm4,mm5 ;mm4: G3R3G3B3G2R2G2B2
; Row 2, odd pixels.
358 movq mm7,[edx+ebp*2] ;Y
359 psrlw mm7,8
360 psubw mm7,MMX_10w
361 pmullw mm7,MMX_Ycoeff
362 movq mm3,mm7
363 paddw mm7,mm0 ;mm7: final blue
364 movq mm5,mm3
365 paddw mm3,mm1 ;mm3: final red
366 paddw mm5,mm2 ;mm5: final green
367 psraw mm7,6
368 psraw mm3,6
369 packuswb mm7,mm7 ;mm7: B3B2B1B0B3B2B1B0
370 psraw mm5,6
371 packuswb mm3,mm3 ;mm3: R3R2R1R0R3R2R1R0
372 punpcklbw mm7,mm3 ;mm7: R3B3R2B2R1B1R0B0
373 packuswb mm5,mm5 ;mm5: G3G2G1G0G3G2G1G0
374 punpcklbw mm5,mm5 ;mm5: G3G3G2G2G1G1G0G0
375 movq mm3,mm7
376 punpcklbw mm7,mm5 ;mm7: G1R1G1B1G0R0G0B0
377 punpckhbw mm3,mm5 ;mm3: G3R3G3B3G2R2G2B2
379 ;mm3 P7:P5
380 ;mm4 P6:P4
381 ;mm6 P2:P0
382 ;mm7 P3:P1
384 movq mm5,mm6
385 punpckldq mm5,mm7 ;P1:P0
386 punpckhdq mm6,mm7 ;P3:P2
387 movq mm7,mm4
388 punpckldq mm4,mm3 ;P5:P4
389 punpckhdq mm7,mm3 ;P7:P6
391 movq [ebx+ebp*8 ],mm5
392 movq [ebx+ebp*8+ 8],mm6
394 movq [ebx+ebp*8+16],mm4
395 movq [ebx+ebp*8+24],mm7
397 add ebp,4 ;four chroma samples (eight pixels per row) done
399 jnz col_loop_MMX
401 pop ebp
402 pop edi
403 pop esi
404 pop ebx
; Return to the caller; the arguments stay on the stack for the caller to
; remove.  Without this the code would run on into the next routine.
405 ret
407 ;**************************************************************************
; Prototype of the 24-bit row converters; the argument list is the same as
; for the 32-bit routines, only the destination pixel size differs (the
; routines write 3 bytes per pixel).
409 ; asm_YUVtoRGB24_row(
410 ; Pixel *ARGB1_pointer,
411 ; Pixel *ARGB2_pointer,
412 ; YUVPixel *Y1_pointer,
413 ; YUVPixel *Y2_pointer,
414 ; YUVPixel *U_pointer,
415 ; YUVPixel *V_pointer,
416 ; long width
417 ; );
; The argument equates are stated again with the same text as earlier in
; the file: offset above the return address plus 16 bytes for the four
; registers pushed on entry.
419 ARGB1_pointer equ [esp+ 4+16]
420 ARGB2_pointer equ [esp+ 8+16]
421 Y1_pointer equ [esp+12+16]
422 Y2_pointer equ [esp+16+16]
423 U_pointer equ [esp+20+16]
424 V_pointer equ [esp+24+16]
425 count equ [esp+28+16]
;--------------------------------------------------------------------------
; _asm_YUVtoRGB24_row - table-driven conversion of two rows to 24-bit pixels
;
; Arguments as for _asm_YUVtoRGB32_row, except that each destination pixel
; is 3 bytes, stored in the order blue, green, red.
;
; count is the number of chroma samples and must be at least 1 (the counter
; is only tested at the bottom of the loop).  Each pass converts one U/V
; pair into two pixels in each destination row (6 bytes per row).
;
; Unlike the 32-bit routine the destination pointers are not pre-advanced:
; they are read from the argument slots, bumped by 6 and written back on
; every pass.  The Y/U/V pointers are advanced to the row ends and indexed
; with the negative counter in ebp.
;
; Tags: [C] shared chroma work, [1],[2] pixels of row 1, [3],[4] pixels of
; row 2.  During a pass eax = [r impact][b impact] and ecx = green impact.
;
; ebx, esi, edi, ebp are preserved; eax, ecx, edx and flags are not.
;--------------------------------------------------------------------------
427 _asm_YUVtoRGB24_row:
428 push ebx
429 push esi
430 push edi
431 push ebp
433 mov eax,count
434 mov ebp,eax
435 add eax,eax ;eax = count*2 = luma bytes per row
436 add Y1_pointer,eax
437 add Y2_pointer,eax
438 add U_pointer,ebp
439 add V_pointer,ebp
440 neg ebp ;ebp = -count, counts up to 0
; These four initialisations are overwritten at the top of the loop before
; being read.
442 mov esi,U_pointer ;[C]
443 mov edi,V_pointer ;[C]
444 xor edx,edx ;[C]
445 xor ecx,ecx ;[C]
447 col_loop24:
448 mov esi,U_pointer
449 mov edi,V_pointer
450 xor eax,eax
451 xor ebx,ebx
452 mov al,[esi + ebp] ;eax = U
453 mov bl,[edi + ebp] ;ebx = V
454 mov eax,[_YUV_U_table + eax*4] ;eax = [b impact][u-g impact]
455 mov edi,[_YUV_V_table + ebx*4] ;edi = [r impact][v-g impact]
457 mov ecx,eax ;[C]
458 mov esi,Y1_pointer ;[1]
460 mov edx,edi ;[C]
461 xor ebx,ebx ;[1]
463 shr eax,16 ;[C] eax = blue impact
464 mov bl,[esi + ebp*2] ;[1] ebx = Y1 value
466 and edi,0ffff0000h ;[C] edi = [r impact][0]
467 add ecx,edx ;[C] ecx = [junk][g impact]
469 add eax,edi ;[C] eax = [r impact][b impact]
470 mov ebx,[_YUV_Y_table + ebx*4] ;[1] ebx = Y impact
472 ;eax = [r][b]
473 ;ecx = [g]
475 mov esi,ebx ;[1]
476 add ebx,eax ;[1] ebx = [red][blue]
478 add esi,ecx ;[1] esi = [junk][green]
479 mov edi,ebx ;[1] edi = [red][blue]
481 shr ebx,16 ;[1] ebx = red
482 and esi,0000ffffh ;[1] esi = green
484 and edi,0000ffffh ;[1] edi = blue
485 xor edx,edx
487 mov bh,[_YUV_clip_table+ebx-3f00h] ;bh = red
488 mov dl,[_YUV_clip_table+esi-3f00h] ;dl = green
490 mov esi,Y1_pointer ;[2]
491 mov bl,[_YUV_clip_table+edi-3f00h] ;bl = blue
493 mov edi,ARGB1_pointer ;[1]
494 mov [edi+2],bh ;[1] red
496 mov [edi+0],bl ;[1] blue
497 xor ebx,ebx ;[2]
499 mov [edi+1],dl ;[1] green
501 mov bl,[esi + ebp*2 + 1] ;[2] ebx = Y1 value (second pixel)
502 mov esi,ecx ;[2] esi = green impact
504 mov ebx,[_YUV_Y_table + ebx*4] ;[2] ebx = Y impact
505 mov edi,0000ffffh ;[2]
507 add esi,ebx ;[2] esi = [junk][green]
508 add ebx,eax ;[2] ebx = [red][blue]
510 and edi,ebx ;[2] edi = blue
511 and esi,0000ffffh ;[2] esi = green
513 shr ebx,16 ;[2] ebx = red
514 xor edx,edx
516 mov bh,[_YUV_clip_table+ebx-3f00h] ;bh = red
517 mov dl,[_YUV_clip_table+esi-3f00h] ;dl = green
519 mov esi,Y2_pointer ;[3]
520 mov bl,[_YUV_clip_table+edi-3f00h] ;bl = blue
522 mov edi,ARGB1_pointer ;[2]
523 mov [edi+5],bh ;[2] red
525 mov [edi+4],dl ;[2] green
526 mov [edi+3],bl ;[2] blue
528 xor ebx,ebx ;[3]
530 mov bl,[esi + ebp*2] ;[3] ebx = Y2 value
531 mov edi,ecx ;[3] edi = green impact
533 mov ebx,[_YUV_Y_table + ebx*4] ;[3] ebx = Y impact
534 mov esi,0000ffffh ;[3]
536 add edi,ebx ;[3] edi = [junk][green]
537 add ebx,eax ;[3] ebx = [red][blue]
539 and esi,ebx ;[3] esi = blue
540 and edi,0000ffffh ;[3] edi = green
542 shr ebx,16 ;[3] ebx = red
543 xor edx,edx
545 mov dl,[_YUV_clip_table+edi-3f00h] ;dl = green
546 mov edi,ARGB2_pointer ;[3]
548 mov bh,[_YUV_clip_table+ebx-3f00h] ;bh = red
549 mov bl,[_YUV_clip_table+esi-3f00h] ;bl = blue
551 mov esi,Y2_pointer ;[4]
552 mov [edi+2],bh ;[3] red
554 mov [edi+0],bl ;[3] blue
555 xor ebx,ebx ;[4]
557 mov [edi+1],dl ;[3] green
558 mov bl,[esi + ebp*2 + 1] ;[4] ebx = Y2 value (second pixel)
560 mov edi,0000ffffh ;[4]
562 mov ebx,[_YUV_Y_table + ebx*4] ;[4] ebx = Y impact
563 xor edx,edx
565 add ecx,ebx ;[4] ecx = [junk][green]
566 add ebx,eax ;[4] ebx = [red][blue]
568 and edi,ebx ;[4] edi = blue
569 and ecx,0000ffffh ;[4] ecx = green
571 shr ebx,16 ;[4] ebx = red
572 mov esi,ARGB2_pointer
; The last pixel uses bl/al as byte scratch; eax's chroma impacts are no
; longer needed because they are recomputed at the top of the loop.
574 mov bl,[_YUV_clip_table+ebx-3f00h] ;bl = red
575 mov dl,[_YUV_clip_table+ecx-3f00h] ;dl = green
577 mov al,[_YUV_clip_table+edi-3f00h] ;al = blue
578 mov [esi+5],bl ;[4] red
580 mov [esi+4],dl ;[4] green
581 mov ecx,ARGB1_pointer
583 mov [esi+3],al ;[4] blue
584 add esi,6 ;two 3-byte pixels written to row 2
586 mov ARGB2_pointer,esi
587 add ecx,6 ;two 3-byte pixels written to row 1
589 mov ARGB1_pointer,ecx
591 inc ebp
592 jnz col_loop24
594 pop ebp
595 pop edi
596 pop esi
597 pop ebx
; Return to the caller; the arguments stay on the stack for the caller to
; remove.  Without this the code would run on into the next routine.
598 ret
;--------------------------------------------------------------------------
; _asm_YUVtoRGB24_row_MMX - MMX conversion of two rows to 24-bit pixels
;
; Same arguments as _asm_YUVtoRGB24_row.  Each pass consumes 4 U bytes,
; 4 V bytes and 8 luma bytes per row and writes 8 pixels (24 bytes) to each
; destination row as six 4-byte stores.
;
; count must be a nonzero multiple of 4: the counter steps by 4 and the loop
; only ends when it lands exactly on zero.
;
; Method: the 24 output bytes of a row are produced in 4-byte groups whose
; channel order repeats every three groups (B G R B / G R B G / R B G R).
; For each group the luma words and the chroma words are replicated into
; the matching lanes, the chroma is multiplied by the per-lane
; MMX_Ucoeff0..2 / MMX_Vcoeff0..2 patterns, everything is summed, shifted
; right by 6 and packed with unsigned saturation.  mm2 always tracks row 1
; (Y1) and mm3 row 2 (Y2); the "[high]"/"[low]" labels in the lane comments
; are inherited and only distinguish the two rows.
;
; The Y/U/V pointers are advanced to the row ends and indexed with ebp; the
; destination pointers in eax/ebx are walked forward by 24 per pass.
;
; ebx, esi, edi, ebp are preserved.  mm0-mm7 are overwritten and no EMMS is
; executed here.  NOTE(review): presumably the caller issues EMMS - confirm.
;--------------------------------------------------------------------------
600 _asm_YUVtoRGB24_row_MMX:
601 push ebx
602 push esi
603 push edi
604 push ebp
606 mov eax,count
607 mov ebp,eax
608 add eax,eax
609 add Y1_pointer,eax
610 add Y2_pointer,eax
611 add U_pointer,ebp
612 add V_pointer,ebp
613 neg ebp
; esi=U end, edi=V end, ecx=Y1 end, edx=Y2 end, eax/ebx=current output.
615 mov esi,U_pointer
616 mov edi,V_pointer
617 mov ecx,Y1_pointer
618 mov edx,Y2_pointer
619 mov eax,ARGB1_pointer
620 mov ebx,ARGB2_pointer
622 col_loop_MMX24:
623 movd mm0, dword ptr [esi+ebp] ;U (byte)
624 pxor mm7,mm7
626 movd mm1, dword ptr [edi+ebp] ;V (byte)
627 punpcklbw mm0,mm7 ;U (word)
629 movd mm2, dword ptr [ecx+ebp*2] ;Y low
630 punpcklbw mm1,mm7 ;V (word)
632 movd mm3, dword ptr [edx+ebp*2] ;Y high
633 punpcklbw mm2,mm7 ;Y1 (word)
635 psubw mm2,MMX_10w
636 punpcklbw mm3,mm7 ;Y2 (word)
638 psubw mm3,MMX_10w
640 psubw mm0,MMX_80w
641 psubw mm1,MMX_80w
; First four pixels of each row (chroma samples 0 and 1).
643 ;group 1
645 pmullw mm2,MMX_Ycoeff ;[lazy]
646 movq mm6,mm0
647 pmullw mm3,MMX_Ycoeff ;[lazy]
648 movq mm7,mm1
649 punpcklwd mm6,mm6 ;mm6 = U1U1U0U0
650 movq mm4,mm2 ;mm4 = Y3Y2Y1Y0 [high]
651 punpckldq mm6,mm6 ;mm6 = U0U0U0U0
652 movq mm5,mm3 ;mm3 = Y3Y2Y1Y0 [low]
653 punpcklwd mm7,mm7 ;mm7 = V1V1V0V0
654 punpckldq mm7,mm7 ;mm7 = V0V0V0V0
656 pmullw mm6,MMX_Ucoeff0
657 punpcklwd mm4,mm4 ;mm4 = Y1Y1Y0Y0 [high]
658 pmullw mm7,MMX_Vcoeff0
659 punpcklwd mm5,mm5 ;mm5 = Y1Y1Y0Y0 [low]
661 punpcklwd mm4,mm2 ;mm4 = Y1Y0Y0Y0
662 punpcklwd mm5,mm3 ;mm5 = Y1Y0Y0Y0
664 paddw mm4,mm6
665 paddw mm5,mm6
666 paddw mm4,mm7
667 paddw mm5,mm7
669 psraw mm4,6
670 psraw mm5,6
672 packuswb mm4,mm4
673 packuswb mm5,mm5
675 ;group 2
677 movd dword ptr [eax+0],mm4 ;[lazy write]
678 movq mm4,mm0
679 movd dword ptr [ebx+0],mm5 ;[lazy write]
680 movq mm5,mm1
682 punpcklwd mm4,mm4 ;mm6 = U1U1U0U0
683 movq mm6,mm2 ;mm4 = Y3Y2Y1Y0 [high]
684 punpcklwd mm5,mm5 ;mm6 = V1V1V0V0
685 movq mm7,mm3 ;mm3 = Y3Y2Y1Y0 [low]
687 pmullw mm4,MMX_Ucoeff1
688 psrlq mm6,16 ;mm4 = 00Y3Y2Y1 [high]
689 pmullw mm5,MMX_Vcoeff1
690 psrlq mm7,16 ;mm4 = 00Y3Y2Y1 [low]
692 punpcklwd mm6,mm6 ;mm4 = Y2Y2Y1Y1 [high]
693 punpcklwd mm7,mm7 ;mm5 = Y2Y2Y1Y1 [high]
695 paddw mm6,mm4
696 paddw mm7,mm4
697 paddw mm6,mm5
698 paddw mm7,mm5
700 psraw mm6,6
701 psraw mm7,6
703 packuswb mm6,mm6
704 packuswb mm7,mm7
706 ;group 3
708 movd dword ptr [eax+4],mm6 ;[lazy write]
709 movq mm6,mm0
710 movd dword ptr [ebx+4],mm7 ;[lazy write]
711 movq mm7,mm1
713 movq mm4,mm2 ;mm4 = Y3Y2Y1Y0 [high]
714 punpcklwd mm6,mm6 ;mm6 = U1U1U0U0
715 movq mm5,mm3 ;mm3 = Y3Y2Y1Y0 [low]
716 punpckhdq mm6,mm6 ;mm6 = U1U1U1U1
717 punpcklwd mm7,mm7 ;mm7 = V1V1V0V0
718 punpckhdq mm7,mm7 ;mm7 = V1V1V1V1
720 pmullw mm6,MMX_Ucoeff2
721 punpckhwd mm2,mm2 ;mm2 = Y3Y3Y2Y2 [high]
722 pmullw mm7,MMX_Vcoeff2
723 punpckhwd mm3,mm3 ;mm3 = Y3Y3Y2Y2 [low]
725 punpckhdq mm4,mm2 ;mm4 = Y3Y3Y3Y2 [high]
726 punpckhdq mm5,mm3 ;mm5 = Y3Y3Y3Y2 [low]
728 paddw mm4,mm6
729 paddw mm5,mm6
730 paddw mm4,mm7
731 paddw mm5,mm7
733 psraw mm4,6
734 psraw mm5,6
; Second four pixels of each row (chroma samples 2 and 3, the high words of
; mm0/mm1).  The luma registers are reloaded from offset +4.
736 ;next 3 groups
738 movd mm2, dword ptr [ecx+ebp*2+4] ;Y low
739 packuswb mm4,mm4 ;[lazy]
741 movd mm3, dword ptr [edx+ebp*2+4] ;Y high
742 packuswb mm5,mm5 ;[lazy]
744 movd dword ptr [eax+8],mm4 ;[lazy write]
745 pxor mm7,mm7
747 movd dword ptr [ebx+8],mm5 ;[lazy write]
748 punpcklbw mm2,mm7 ;Y1 (word)
751 psubw mm2,MMX_10w
752 punpcklbw mm3,mm7 ;Y2 (word)
754 psubw mm3,MMX_10w
757 ;group 1
759 pmullw mm2,MMX_Ycoeff ;[init]
760 movq mm6,mm0
762 pmullw mm3,MMX_Ycoeff ;[init]
763 punpckhwd mm6,mm6 ;mm6 = U3U3U2U2
765 movq mm7,mm1
766 punpckldq mm6,mm6 ;mm6 = U2U2U2U2
767 movq mm4,mm2 ;mm4 = Y3Y2Y1Y0 [high]
768 punpckhwd mm7,mm7 ;mm7 = V3V3V2V2
769 movq mm5,mm3 ;mm3 = Y3Y2Y1Y0 [low]
770 punpckldq mm7,mm7 ;mm7 = V2V2V2V2
772 pmullw mm6,MMX_Ucoeff0
773 punpcklwd mm4,mm4 ;mm4 = Y1Y1Y0Y0 [high]
774 pmullw mm7,MMX_Vcoeff0
775 punpcklwd mm5,mm5 ;mm5 = Y1Y1Y0Y0 [low]
777 punpcklwd mm4,mm2 ;mm4 = Y1Y0Y0Y0
778 punpcklwd mm5,mm3 ;mm5 = Y1Y0Y0Y0
780 paddw mm4,mm6
781 paddw mm5,mm6
782 paddw mm4,mm7
783 paddw mm5,mm7
785 psraw mm4,6
786 psraw mm5,6
788 packuswb mm4,mm4
789 packuswb mm5,mm5
791 ;group 2
793 movd dword ptr [eax+12],mm4
794 movq mm6,mm0
795 movd dword ptr [ebx+12],mm5
796 movq mm7,mm1
798 punpckhwd mm6,mm6 ;mm6 = U3U3U2U2
799 movq mm4,mm2 ;mm4 = Y3Y2Y1Y0 [high]
800 punpckhwd mm7,mm7 ;mm6 = V3V3V2V2
801 movq mm5,mm3 ;mm3 = Y3Y2Y1Y0 [low]
803 pmullw mm6,MMX_Ucoeff1
804 psrlq mm4,16 ;mm4 = 00Y3Y2Y1 [high]
805 pmullw mm7,MMX_Vcoeff1
806 psrlq mm5,16 ;mm4 = 00Y3Y2Y1 [low]
808 punpcklwd mm4,mm4 ;mm4 = Y2Y2Y1Y1 [high]
809 punpcklwd mm5,mm5 ;mm5 = Y2Y2Y1Y1 [high]
811 paddw mm4,mm6
812 paddw mm5,mm6
813 paddw mm4,mm7
814 paddw mm5,mm7
816 psraw mm4,6
817 psraw mm5,6
819 packuswb mm4,mm4
820 packuswb mm5,mm5
; The last group consumes mm0/mm1 in place; they are reloaded next pass.
822 ;group 3
824 movq mm6,mm2 ;mm4 = Y3Y2Y1Y0 [high]
825 punpckhwd mm0,mm0 ;mm6 = U3U3U2U2
827 movq mm7,mm3 ;mm3 = Y3Y2Y1Y0 [low]
828 punpckhdq mm0,mm0 ;mm6 = U3U3U3U3
830 movd dword ptr [eax+16],mm4 ;[lazy write]
831 punpckhwd mm1,mm1 ;mm7 = V3V3V2V2
833 movd dword ptr [ebx+16],mm5 ;[lazy write]
834 punpckhdq mm1,mm1 ;mm7 = V3V3V3V3
836 pmullw mm0,MMX_Ucoeff2
837 punpckhwd mm2,mm2 ;mm2 = Y3Y3Y2Y2 [high]
838 pmullw mm1,MMX_Vcoeff2
839 punpckhwd mm3,mm3 ;mm3 = Y3Y3Y2Y2 [low]
841 punpckhdq mm6,mm2 ;mm4 = Y3Y3Y3Y2 [high]
842 punpckhdq mm7,mm3 ;mm5 = Y3Y3Y3Y2 [low]
844 paddw mm6,mm0
845 paddw mm7,mm0
846 paddw mm6,mm1
847 paddw mm7,mm1
849 psraw mm6,6
850 psraw mm7,6
852 packuswb mm6,mm6
853 packuswb mm7,mm7
855 movd dword ptr [eax+20],mm6
856 add eax,24 ;eight 3-byte pixels written to row 1
857 movd dword ptr [ebx+20],mm7
858 add ebx,24 ;eight 3-byte pixels written to row 2
860 ;done
862 add ebp,4
863 jnz col_loop_MMX24
865 pop ebp
866 pop edi
867 pop esi
868 pop ebx
; Return to the caller; the arguments stay on the stack for the caller to
; remove.  Without this the code would run on into the next routine.
869 ret
871 ;**************************************************************************
;--------------------------------------------------------------------------
; _asm_YUVtoRGB16_row - table-driven conversion of two rows to 16-bit pixels
;
; Arguments as for _asm_YUVtoRGB32_row, except that each destination pixel
; is 2 bytes.  count is the number of chroma samples and must be at least 1
; (the counter is only tested at the bottom of the loop).  Each pass
; converts one U/V pair into two pixels in each row (4 bytes per row).
;
; Channel bytes come from _YUV_clip_table16 and are packed as
;   (red << 11) + (green << 6) + blue
; (shl bh,3 on the high byte, shl edx,6 on green).  The commented-out
; ";565fix" lines show the alternative shifts of 2 and 5.
; NOTE(review): this assumes the table yields 5-bit channel values - confirm
; against the module that builds it.  Also note that the MMX 16-bit routine
; in this file masks with 7c1fh/03e0h, which is a different field layout.
;
; All pointers are advanced to the row ends and indexed with the negative
; counter in ebp.  Tags: [C] shared chroma work, [1],[2] pixels of row 1,
; [3],[4] pixels of row 2.  During a pass eax = [r impact][b impact] and
; ecx = green impact.
;
; ebx, esi, edi, ebp are preserved; eax, ecx, edx and flags are not.
;--------------------------------------------------------------------------
873 _asm_YUVtoRGB16_row:
874 push ebx
875 push esi
876 push edi
877 push ebp
879 mov eax,count
880 mov ebp,eax
881 mov ebx,eax
882 shl ebx,2 ;ebx = count*4 = destination bytes per row
883 add ARGB1_pointer,ebx
884 add ARGB2_pointer,ebx
885 add eax,eax ;eax = count*2 = luma bytes per row
886 add Y1_pointer,eax
887 add Y2_pointer,eax
888 add U_pointer,ebp
889 add V_pointer,ebp
890 neg ebp ;ebp = -count, counts up to 0
; These four initialisations are overwritten at the top of the loop before
; being read.
892 mov esi,U_pointer ;[C]
893 mov edi,V_pointer ;[C]
894 xor edx,edx ;[C]
895 xor ecx,ecx ;[C]
897 col_loop16:
898 mov esi,U_pointer
899 mov edi,V_pointer
900 xor eax,eax
901 xor ebx,ebx
902 mov al,[esi + ebp] ;eax = U
903 mov bl,[edi + ebp] ;ebx = V
904 mov eax,[_YUV_U_table + eax*4] ;eax = [b impact][u-g impact]
905 mov edi,[_YUV_V_table + ebx*4] ;edi = [r impact][v-g impact]
907 mov ecx,eax ;[C]
908 mov esi,Y1_pointer ;[1]
910 mov edx,edi ;[C]
911 xor ebx,ebx ;[1]
913 shr eax,16 ;[C] eax = blue impact
914 mov bl,[esi + ebp*2] ;[1] ebx = Y1 value
916 and edi,0ffff0000h ;[C] edi = [r impact][0]
917 add ecx,edx ;[C] ecx = [junk][g impact]
919 add eax,edi ;[C] eax = [r impact][b impact]
920 mov ebx,[_YUV_Y_table + ebx*4] ;[1] ebx = Y impact
922 ;eax = [r][b]
923 ;ecx = [g]
925 mov esi,ebx ;[1]
926 add ebx,eax ;[1] ebx = [red][blue]
928 add esi,ecx ;[1] esi = [junk][green]
929 mov edi,ebx ;[1] edi = [red][blue]
931 shr ebx,16 ;[1] ebx = red
932 and esi,0000ffffh ;[1] esi = green
934 and edi,0000ffffh ;[1] edi = blue
935 xor edx,edx
937 mov bh,[_YUV_clip_table16+ebx-3f00h] ;bh = red
938 mov dl,[_YUV_clip_table16+esi-3f00h] ;dl = green
940 mov bl,[_YUV_clip_table16+edi-3f00h] ;bl = blue
941 xor dh,dh ;[1]
943 ;565fix shl bh,2 ;[1]
944 shl bh,3 ;[1] red to the top of the high byte
945 mov edi,ARGB1_pointer ;[1]
947 ;565fix shl edx,5 ;[1]
948 shl edx,6 ;[1] green into the middle field
949 mov esi,Y1_pointer ;[2]
951 add edx,ebx ;[1] dx = packed pixel
952 xor ebx,ebx ;[2]
954 mov [edi+ebp*4+0],dl ;[1]
955 mov bl,[esi + ebp*2 + 1] ;[2] ebx = Y1 value (second pixel)
957 mov [edi+ebp*4+1],dh ;[1]
958 mov esi,ecx ;[2] esi = green impact
960 mov ebx,[_YUV_Y_table + ebx*4] ;[2] ebx = Y impact
961 mov edi,0000ffffh ;[2]
963 add esi,ebx ;[2] esi = [junk][green]
964 add ebx,eax ;[2] ebx = [red][blue]
966 and edi,ebx ;[2] edi = blue
967 and esi,0000ffffh ;[2] esi = green
969 shr ebx,16 ;[2] ebx = red
970 xor edx,edx
972 mov bh,[_YUV_clip_table16+ebx-3f00h] ;bh = red
974 mov dl,[_YUV_clip_table16+esi-3f00h] ;dl = green
975 mov bl,[_YUV_clip_table16+edi-3f00h] ;bl = blue
977 ;565fix shl edx,5 ;[2]
978 shl edx,6 ;[2]
979 mov edi,ARGB1_pointer ;[2]
981 ;565fix shl bh,2 ;[2]
982 shl bh,3 ;[2]
983 mov esi,Y2_pointer ;[3]
985 add edx,ebx ;[2] dx = packed pixel
986 xor ebx,ebx ;[3]
988 mov [edi+ebp*4+2],dl ;[2]
989 mov bl,[esi + ebp*2] ;[3] ebx = Y2 value
991 mov [edi+ebp*4+3],dh ;[2]
992 mov edi,ecx ;[3] edi = green impact
994 mov ebx,[_YUV_Y_table + ebx*4] ;[3] ebx = Y impact
995 mov esi,0000ffffh ;[3]
997 add edi,ebx ;[3] edi = [junk][green]
998 add ebx,eax ;[3] ebx = [red][blue]
1000 and esi,ebx ;[3] esi = blue
1001 and edi,0000ffffh ;[3] edi = green
1003 shr ebx,16 ;[3] ebx = red
1004 xor edx,edx
1006 mov dl,[_YUV_clip_table16+edi-3f00h] ;dl = green
1007 mov edi,ARGB2_pointer ;[3]
1009 ;565fix shl edx,5
1010 shl edx,6
1011 mov bh,[_YUV_clip_table16+ebx-3f00h] ;bh = red
1013 mov bl,[_YUV_clip_table16+esi-3f00h] ;bl = blue
1014 mov esi,Y2_pointer ;[4]
1016 ;565fix shl bh,2 ;[3]
1017 shl bh,3 ;[3]
1020 add edx,ebx ;[3] dx = packed pixel
1021 xor ebx,ebx ;[4]
1023 mov [edi+ebp*4+0],dl ;[3]
1024 mov bl,[esi + ebp*2 + 1] ;[4] ebx = Y2 value (second pixel)
1026 mov [edi+ebp*4+1],dh ;[3]
1027 mov edi,0000ffffh ;[4]
1029 mov ebx,[_YUV_Y_table + ebx*4] ;[4] ebx = Y impact
1030 xor edx,edx
1032 add ecx,ebx ;[4] ecx = [junk][green]
1033 add ebx,eax ;[4] ebx = [red][blue]
1035 and edi,ebx ;[4] edi = blue
1036 and ecx,0000ffffh ;[4] ecx = green
1038 shr ebx,16 ;[4] ebx = red
1039 mov esi,ARGB2_pointer
; The last pixel is assembled in ax; eax's chroma impacts are no longer
; needed because they are recomputed at the top of the loop.
1041 mov dl,[_YUV_clip_table16+ecx-3f00h] ;dl = green
1042 mov al,[_YUV_clip_table16+edi-3f00h] ;al = blue
1044 ;565fix shl edx,5
1045 shl edx,6
1046 mov ah,[_YUV_clip_table16+ebx-3f00h] ;ah = red
1048 ;565fix shl ah,2
1049 shl ah,3
1051 add eax,edx ;[4] ax = packed pixel (upper half of eax is not stored)
1053 mov [esi+ebp*4+2],al
1054 mov [esi+ebp*4+3],ah
1056 inc ebp
1057 jnz col_loop16
1059 pop ebp
1060 pop edi
1061 pop esi
1062 pop ebx
; Return to the caller; the arguments stay on the stack for the caller to
; remove.  Without this the code would run on into the next routine.
1063 ret
;--------------------------------------------------------------------------
; _asm_YUVtoRGB16_row_MMX - MMX conversion of two rows to 16-bit pixels
;
; Same arguments as _asm_YUVtoRGB16_row.  Each pass consumes 4 U bytes,
; 4 V bytes and 8 luma bytes per row and writes 8 pixels (16 bytes) to each
; destination row.
;
; count must be a nonzero multiple of 4: the counter steps by 4 and the loop
; only ends when it lands exactly on zero.
;
; Packing: red and blue are shifted right by 6, packed to bytes with
; unsigned saturation, interleaved, shifted and masked with MMX_rbmask
; (7c1fh).  Green is shifted right by 4 only, clamped to 0..03ffh by the
; paddsw/psubusw pair with MMX_clip, and masked with MMX_grnmask (03e0h).
; The stored word therefore has red in bits 14-10, green in bits 9-5 and
; blue in bits 4-0.
; NOTE(review): the scalar 16-bit routine shifts red by 11 and green by 6,
; i.e. a different layout - confirm which format callers expect.
;
; Tags: [1],[2] are the even and odd pixels of row 1, [3],[4] those of
; row 2, [C] is shared chroma work and [A] is the row-1 store.  The
; bracketed digit lists are the original author's register-liveness notes
; and are kept as found.
;
; The counter is bumped before the final two stores, which therefore use
; -16/-8 displacements; the MMX instructions in between leave the flags
; from that add intact for the closing jnz.
;
; ebx, esi, edi, ebp are preserved.  mm0-mm7 are overwritten and no EMMS is
; executed here.  NOTE(review): presumably the caller issues EMMS - confirm.
;--------------------------------------------------------------------------
1067 _asm_YUVtoRGB16_row_MMX:
1068 push ebx
1069 push esi
1070 push edi
1071 push ebp
1073 mov eax,count
1074 mov ebp,eax
1075 mov ebx,eax
1076 shl ebx,2
1077 add eax,eax
1078 add ARGB1_pointer,ebx
1079 add ARGB2_pointer,ebx
1080 add Y1_pointer,eax
1081 add Y2_pointer,eax
1082 add U_pointer,ebp
1083 add V_pointer,ebp
1084 neg ebp
; esi=U, edi=V, ecx=Y1, edx=Y2, eax=ARGB1, ebx=ARGB2 (end-of-row pointers).
1086 mov esi,U_pointer
1087 mov edi,V_pointer
1088 mov ecx,Y1_pointer
1089 mov edx,Y2_pointer
1090 mov eax,ARGB1_pointer
1091 mov ebx,ARGB2_pointer
1093 col_loop_MMX16:
1094 movd mm0, dword ptr [esi+ebp] ;[0 ] U (byte)
1095 pxor mm7,mm7 ;[0 7]
1097 movd mm1, dword ptr [edi+ebp] ;[01 7] V (byte)
1098 punpcklbw mm0,mm7 ;[01 7] U (word)
1100 psubw mm0,MMX_80w ;[01 7]
1101 punpcklbw mm1,mm7 ;[01 7] V (word)
1103 psubw mm1,MMX_80w ;[01 ]
1104 movq mm2,mm0 ;[012 ]
1106 pmullw mm2,MMX_Ugrncoeff ;[012 ]
1107 movq mm3,mm1 ;[0123 ]
1109 ;mm0: blue
1110 ;mm1: red
1111 ;mm2: green
1113 movq mm6,[ecx+ebp*2] ;[0123 6 ] [1] Y
1114 ;<-->
1116 pmullw mm3,MMX_Vgrncoeff ;[0123 ]
1117 movq mm7,mm6 ;[012 67] [2] Y
1119 pmullw mm0,MMX_Ublucoeff ;[0123 ]
1120 psrlw mm7,8 ;[012 67] [2]
1122 pmullw mm1,MMX_Vredcoeff ;[0123 ]
1123 ;<-->
1125 pand mm6,MMX_00FFw ;[012 67] [1]
1126 paddw mm2,mm3 ;[012 6 ] [C]
1128 psubw mm6,MMX_10w ;[012 67] [1]
1130 pmullw mm6,MMX_Ycoeff ;[012 67] [1]
1132 psubw mm7,MMX_10w ;[012 67] [2]
1133 movq mm4,mm6 ;[012 4 67] [1]
1135 pmullw mm7,MMX_Ycoeff ;[012 67] [2]
1136 movq mm5,mm6 ;[012 4567] [1]
1138 paddw mm6,mm0 ;[012 4 67] [1] mm6: <B3><B2><B1><B0>
1139 paddw mm4,mm1 ;[012 4567] [1] mm4: <R3><R2><R1><R0>
1141 paddw mm5,mm2 ;[012 4567] [1] mm5: <G3><G2><G1><G0>
1142 psraw mm4,6 ;[012 4567] [1]
1144 movq mm3,mm7 ;[01234567] [2]
1145 psraw mm5,4 ;[01234567] [1]
1147 paddw mm7,mm0 ;[01234567] [2] mm6: <B3><B2><B1><B0>
1148 psraw mm6,6 ;[01234567] [1]
1150 paddsw mm5,MMX_clip
1151 packuswb mm6,mm6 ;[01234567] [1] mm6: B3B2B1B0B3B2B1B0
1153 psubusw mm5,MMX_clip
1154 packuswb mm4,mm4 ;[01234567] [1] mm4: R3R2R1R0R3R2R1R0
1156 pand mm5,MMX_grnmask ;[01234567] [1] mm7: <G3><G2><G1><G0>
1157 psrlq mm6,2 ;[01234567] [1]
1159 punpcklbw mm6,mm4 ;[0123 567] [1] mm4: R3B3R2B2R1B1R0B0
1161 movq mm4,[edx+ebp*2] ;[01234567] [3] Y
1162 psrlw mm6,1 ;[01234567] [1]
1164 pand mm6,MMX_rbmask ;[01234567] [1] mm6: <RB3><RB2><RB1><RB0>
1166 por mm6,mm5 ;[01234 67] [1] mm6: P6P4P2P0
1167 movq mm5,mm3 ;[01234567] [2]
1169 paddw mm3,mm1 ;[01234567] [2] mm4: <R3><R2><R1><R0>
1170 paddw mm5,mm2 ;[01234567] [2] mm5: <G3><G2><G1><G0>
1172 pand mm4,MMX_00FFw ;[01234567] [3]
1173 psraw mm3,6 ;[01234567] [2]
1175 psubw mm4,MMX_10w ;[01234567] [3]
1176 psraw mm5,4 ;[01234567] [2]
1178 pmullw mm4,MMX_Ycoeff ;[01234567] [3]
1179 psraw mm7,6 ;[01234567] [2]
1181 paddsw mm5,MMX_clip
1182 packuswb mm3,mm3 ;[01234567] [2] mm4: R3R2R1R0R3R2R1R0
1184 psubusw mm5,MMX_clip
1185 packuswb mm7,mm7 ;[01234567] [2] mm6: B3B2B1B0B3B2B1B0
1187 pand mm5,MMX_grnmask ;[012 4567] [2] mm7: <G3><G2><G1><G0>
1188 psrlq mm7,2 ;[01234567] [2]
1190 punpcklbw mm7,mm3 ;[012 4567] [2] mm6: R3B3R2B2R1B1R0B0
1192 movq mm3,[edx+ebp*2] ;[01234567] [4] Y
1193 psrlw mm7,1 ;[01234567] [2]
1195 pand mm7,MMX_rbmask ;[01234567] [2] mm6: <RB3><RB2><RB1><RB0>
1196 psrlw mm3,8 ;[01234567] [4]
1198 por mm7,mm5 ;[01234567] [2] mm7: P7P5P3P1
1199 movq mm5,mm6 ;[01234567] [A]
1201 psubw mm3,MMX_10w ;[01234567] [4]
1202 punpcklwd mm6,mm7 ;[01234567] [A] mm4: P3P2P1P0
1204 pmullw mm3,MMX_Ycoeff ;[0123456 ] [4]
1205 punpckhwd mm5,mm7 ;[0123456 ] [A} mm5: P7P6P5P4
1207 movq [eax+ebp*4 ],mm6 ;[012345 ] [A]
1208 movq mm6,mm4 ;[0123456 ] [3]
1210 movq [eax+ebp*4+ 8],mm5 ;[0123456 ] [A]
1211 paddw mm6,mm0 ;[01234 6 ] [3] mm6: <B3><B2><B1><B0>
1213 movq mm5,mm4 ;[0123456 ] [3]
1214 paddw mm4,mm1 ;[0123456 ] [3] mm4: <R3><R2><R1><R0>
1216 paddw mm5,mm2 ;[0123456 ] [3] mm5: <G3><G2><G1><G0>
1217 psraw mm4,6 ;[0123456 ] [3]
1219 movq mm7,mm3 ;[01234567] [4]
1220 psraw mm5,4 ;[01234567] [3]
1222 paddw mm7,mm0 ;[01234567] [4] mm6: <B3><B2><B1><B0>
1223 psraw mm6,6 ;[01234567] [3]
1225 movq mm0,mm3 ;[01234567] [4]
1226 packuswb mm4,mm4 ;[01234567] [3] mm4: R3R2R1R0R3R2R1R0
1229 packuswb mm6,mm6 ;[01 34567] [3] mm6: B3B2B1B0B3B2B1B0
1230 paddw mm3,mm1 ;[01234567] [4] mm4: <R3><R2><R1><R0>
1232 psrlq mm6,2
1233 paddw mm0,mm2 ;[01 34567] [4] mm5: <G3><G2><G1><G0>
1235 paddsw mm5,MMX_clip
1236 punpcklbw mm6,mm4 ;[01 3 567] [3] mm6: B3B3B2B2B1B1B0B0
1238 psubusw mm5,MMX_clip
1239 psrlw mm6,1 ;[01 3 567] [3]
1241 pand mm6,MMX_rbmask ;[01 3 567] [3] mm6: <B3><B2><B1><B0>
1242 psraw mm3,6 ;[01 3 567] [4]
1244 pand mm5,MMX_grnmask ;[01 3 567] [3] mm7: <G3><G2><G1><G0>
1245 psraw mm0,4 ;[01 3 567] [4]
1247 por mm6,mm5 ;[01 3 67] [3] mm4: P6P4P2P0
1248 psraw mm7,6 ;[01 3 67] [4]
1250 paddsw mm0,MMX_clip
1251 packuswb mm3,mm3 ;[01 3 67] [4] mm4: R3R2R1R0R3R2R1R0
1253 psubusw mm0,MMX_clip
1254 packuswb mm7,mm7 ;[01 3 67] mm6: B3B2B1B0B3B2B1B0
1256 pand mm0,MMX_grnmask ;[01 67] mm7: <G3><G2><G1><G0>
1257 psrlq mm7,2
1259 punpcklbw mm7,mm3 ;[01 67] mm6: R3B3R2B2R1B1R0B0
1260 movq mm1,mm6
1262 psrlw mm7,1
1263 add ebp,4 ;sets the flags tested by the jnz below
1265 pand mm7,MMX_rbmask ;[01 67] mm6: <B3><B2><B1><B0>
1267 por mm0,mm7 ;[01 67] mm0: P7P5P3P1
1269 punpcklwd mm6,mm0 ;[01 6 ] mm4: P3P2P1P0
1271 punpckhwd mm1,mm0 ;[ 1 6 ] mm5: P7P6P5P4
1272 movq [ebx+ebp*4-16],mm6
1274 movq [ebx+ebp*4- 8],mm1
1275 jnz col_loop_MMX16
1277 pop ebp
1278 pop edi
1279 pop esi
1280 pop ebx
; Return to the caller; the arguments stay on the stack for the caller to
; remove.  Without this the code would run on into the next routine.
1281 ret
1283 ;--------------------------------------------------------------------------
1285 _asm_YUVtoRGB32_row_ISSE:
1286 push ebx
1287 push esi
1288 push edi
1289 push ebp
1291 mov eax,count
1292 mov ebp,eax
1293 mov ebx,eax
1294 shl ebx,3
1295 add eax,eax
1296 add ARGB1_pointer,ebx
1297 add ARGB2_pointer,ebx
1298 add Y1_pointer,eax
1299 add Y2_pointer,eax
1300 add U_pointer,ebp
1301 add V_pointer,ebp
1302 neg ebp
1304 mov esi,U_pointer
1305 mov edi,V_pointer
1306 mov ecx,Y1_pointer
1307 mov edx,Y2_pointer
1308 mov eax,ARGB1_pointer
1309 mov ebx,ARGB2_pointer
1311 col_loop_SSE:
1312 prefetchnta [esi+ebp+32]
1313 prefetchnta [edi+ebp+32]
1314 prefetchnta [ecx+ebp*2+32]
1315 prefetchnta [edx+ebp*2+32]
1317 movd mm0, dword ptr [esi+ebp] ;U (byte)
1318 pxor mm7,mm7
1320 movd mm1, dword ptr [edi+ebp] ;V (byte)
1321 punpcklbw mm0,mm7 ;U (word)
1323 psubw mm0,MMX_80w
1324 punpcklbw mm1,mm7 ;V (word)
1326 psubw mm1,MMX_80w
1327 movq mm2,mm0
1329 pmullw mm2,MMX_Ugrncoeff
1330 movq mm3,mm1
1332 pmullw mm3,MMX_Vgrncoeff
1333 pmullw mm0,MMX_Ublucoeff
1334 pmullw mm1,MMX_Vredcoeff
1335 paddw mm2,mm3
1337 ;mm0: blue
1338 ;mm1: red
1339 ;mm2: green
1341 movq mm6,[ecx+ebp*2] ;Y
1342 pand mm6,MMX_00FFw
1343 psubw mm6,MMX_10w
1344 pmullw mm6,MMX_Ycoeff
1345 movq mm4,mm6
1346 paddw mm6,mm0 ;mm6: <B3><B2><B1><B0>
1347 movq mm5,mm4
1348 paddw mm4,mm1 ;mm4: <R3><R2><R1><R0>
1349 paddw mm5,mm2 ;mm5: <G3><G2><G1><G0>
1350 psraw mm6,6
1351 psraw mm4,6
1352 packuswb mm6,mm6 ;mm6: B3B2B1B0B3B2B1B0
1353 psraw mm5,6
1354 packuswb mm4,mm4 ;mm4: R3R2R1R0R3R2R1R0
1355 punpcklbw mm6,mm4 ;mm6: R3B3R2B2R1B1R0B0
1356 packuswb mm5,mm5 ;mm5: G3G2G1G0G3G2G1G0
1357 punpcklbw mm5,mm5 ;mm5: G3G3G2G2G1G1G0G0
1358 movq mm4,mm6
1359 punpcklbw mm6,mm5 ;mm6: G1R1G1B2G0R0G0B0
1360 punpckhbw mm4,mm5 ;mm4: G3R3G3B3G2R2G2B2
1362 movq mm7,[ecx+ebp*2] ;Y
1363 psrlw mm7,8
1364 psubw mm7,MMX_10w
1365 pmullw mm7,MMX_Ycoeff
1366 movq mm3,mm7
1367 paddw mm7,mm0 ;mm7: final blue
1368 movq mm5,mm3
1369 paddw mm3,mm1 ;mm3: final red
1370 paddw mm5,mm2 ;mm5: final green
1371 psraw mm7,6
1372 psraw mm3,6
1373 packuswb mm7,mm7 ;mm7: B3B2B1B0B3B2B1B0
1374 psraw mm5,6
1375 packuswb mm3,mm3 ;mm3: R3R2R1R0R3R2R1R0
1376 punpcklbw mm7,mm3 ;mm7: R3B3R2B2R1B1R0B0
1377 packuswb mm5,mm5 ;mm5: G3G2G1G0G3G2G1G0
1378 punpcklbw mm5,mm5 ;mm5: G3G3G2G2G1G1G0G0
1379 movq mm3,mm7
1380 punpcklbw mm7,mm5 ;mm7: G1R1G1B2G0R0G0B0
1381 punpckhbw mm3,mm5 ;mm3: G3R3G3B3G2R2G2B2
1383 ;mm3 P7:P5
1384 ;mm4 P6:P4
1385 ;mm6 P2:P0
1386 ;mm7 P3:P1
1388 movq mm5,mm6
1389 punpckldq mm5,mm7 ;P1:P0
1390 punpckhdq mm6,mm7 ;P3:P2
1391 movq mm7,mm4
1392 punpckldq mm4,mm3 ;P5:P4
1393 punpckhdq mm7,mm3 ;P7:P6
1395 movntq [eax+ebp*8],mm5
1396 movntq [eax+ebp*8+8],mm6
1397 movntq [eax+ebp*8+16],mm4
1398 movntq [eax+ebp*8+24],mm7
1400 movq mm6,[edx+ebp*2] ;Y
1401 pand mm6,MMX_00FFw
1402 psubw mm6,MMX_10w
1403 pmullw mm6,MMX_Ycoeff
1404 movq mm4,mm6
1405 paddw mm6,mm0 ;mm6: <B3><B2><B1><B0>
1406 movq mm5,mm4
1407 paddw mm4,mm1 ;mm4: <R3><R2><R1><R0>
1408 paddw mm5,mm2 ;mm5: <G3><G2><G1><G0>
1409 psraw mm6,6
1410 psraw mm4,6
1411 packuswb mm6,mm6 ;mm6: B3B2B1B0B3B2B1B0
1412 psraw mm5,6
1413 packuswb mm4,mm4 ;mm4: R3R2R1R0R3R2R1R0
1414 punpcklbw mm6,mm4 ;mm6: R3B3R2B2R1B1R0B0
1415 packuswb mm5,mm5 ;mm5: G3G2G1G0G3G2G1G0
1416 punpcklbw mm5,mm5 ;mm5: G3G3G2G2G1G1G0G0
1417 movq mm4,mm6
1418 punpcklbw mm6,mm5 ;mm6: G1R1G1B2G0R0G0B0
1419 punpckhbw mm4,mm5 ;mm4: G3R3G3B3G2R2G2B2
1421 movq mm7,[edx+ebp*2] ;Y
1422 psrlw mm7,8
1423 psubw mm7,MMX_10w
1424 pmullw mm7,MMX_Ycoeff
1425 movq mm3,mm7
1426 paddw mm7,mm0 ;mm7: final blue
1427 movq mm5,mm3
1428 paddw mm3,mm1 ;mm3: final red
1429 paddw mm5,mm2 ;mm5: final green
1430 psraw mm7,6
1431 psraw mm3,6
1432 packuswb mm7,mm7 ;mm7: B3B2B1B0B3B2B1B0
1433 psraw mm5,6
1434 packuswb mm3,mm3 ;mm3: R3R2R1R0R3R2R1R0
1435 punpcklbw mm7,mm3 ;mm7: R3B3R2B2R1B1R0B0
1436 packuswb mm5,mm5 ;mm5: G3G2G1G0G3G2G1G0
1437 punpcklbw mm5,mm5 ;mm5: G3G3G2G2G1G1G0G0
1438 movq mm3,mm7
1439 punpcklbw mm7,mm5 ;mm7: G1R1G1B2G0R0G0B0
1440 punpckhbw mm3,mm5 ;mm3: G3R3G3B3G2R2G2B2
1442 ;mm3 P7:P5
1443 ;mm4 P6:P4
1444 ;mm6 P2:P0
1445 ;mm7 P3:P1
1447 movq mm5,mm6
1448 punpckldq mm5,mm7 ;P1:P0
1449 punpckhdq mm6,mm7 ;P3:P2
1450 movq mm7,mm4
1451 punpckldq mm4,mm3 ;P5:P4
1452 punpckhdq mm7,mm3 ;P7:P6
1454 movntq [ebx+ebp*8 ],mm5
1455 movntq [ebx+ebp*8+ 8],mm6
1457 movntq [ebx+ebp*8+16],mm4
1458 movntq [ebx+ebp*8+24],mm7
1460 add ebp,4
1462 jnz col_loop_SSE
1464 pop ebp
1465 pop edi
1466 pop esi
1467 pop ebx
; ---------------------------------------------------------------------------
; _asm_YUVtoRGB24_row_ISSE
;
; Converts two scanlines of planar YUV into packed 24-bit pixels using MMX
; plus the integer-SSE additions (pshufw, movntq, prefetchnta).
;
; Operands (symbols defined outside this routine; they are modified in place
; by the prologue, so presumably esp-relative argument slots - confirm):
;   ARGB1_pointer / ARGB2_pointer  output for row 1 / row 2
;   Y1_pointer    / Y2_pointer     luma for row 1 / row 2
;   U_pointer     / V_pointer      chroma, applied to both rows
;   count                          number of chroma samples per row
;
; Each loop iteration consumes 4 U bytes, 4 V bytes and 8 Y bytes per row and
; emits 24 bytes per row.  The loop only stops when the negative index in ebp
; reaches exactly zero in steps of 4, so count must be a non-zero multiple
; of 4.
;
; Per channel: ((Y-16)*Ycoeff + (U-128)*Cu + (V-128)*Cv) >> 6, then clamped
; to 0..255 by packuswb.  paddw/pmullw are wrapping 16-bit operations.
; Memory byte order of the output is B,G,R (see the word layout of
; MMX_Ucoeff0..2 / MMX_Vcoeff0..2):
;   [+0]  B0 G0 R0 B1 G1 R1 B2 G2
;   [+8]  R2 B3 G3 R3 B4 G4 R4 B5
;   [+16] G5 R5 B6 G6 R6 B7 G7 R7
;
; Register roles inside the loop:
;   esi = U end, edi = V end, ecx = Y1 end, edx = Y2 end
;   ebp = negative chroma index (runs from -count up to 0)
;   eax = row 1 output cursor, ebx = row 2 output cursor (both += 24)
;   [esp+0] = biased U words, [esp+8] = biased V words, [esp+16] = saved esp
;
; ebx/esi/edi/ebp are saved and restored.  Stores are non-temporal (movntq)
; and the MMX state is left live: no sfence/emms is issued here.
; ---------------------------------------------------------------------------
1470 _asm_YUVtoRGB24_row_ISSE:
1471 push ebx
1472 push esi
1473 push edi
1474 push ebp
; Advance every input pointer to the end of its row, then index backwards
; with a negative counter so a single add/jnz both steps and terminates.
1476 mov eax,count
1477 mov ebp,eax
1478 add eax,eax ;two luma samples per chroma sample
1479 add Y1_pointer,eax
1480 add Y2_pointer,eax
1481 add U_pointer,ebp
1482 add V_pointer,ebp
1483 neg ebp ;ebp = -count
1485 mov esi,U_pointer
1486 mov edi,V_pointer
1487 mov ecx,Y1_pointer
1488 mov edx,Y2_pointer
1489 mov eax,ARGB1_pointer
1490 mov ebx,ARGB2_pointer
; All named operands have been read; esp can now be moved.  Reserve an
; 8-byte-aligned scratch area (16 bytes for U/V words + 4 for the old esp).
1492 movd mm0,esp ;stash original esp
1493 sub esp,20
1494 and esp,-8
1495 movd dword ptr [esp+16],mm0 ;saved esp lives above the U/V scratch
1497 col_loop_ISSE24:
1498 prefetchnta [esi+ebp+32]
1499 prefetchnta [edi+ebp+32]
1500 prefetchnta [ecx+ebp*2+32]
1501 prefetchnta [edx+ebp*2+32]
1503 movd mm0, dword ptr [esi+ebp] ;U (byte) x4
1504 pxor mm7,mm7 ;zero for byte->word unpacking
1506 movd mm1, dword ptr [edi+ebp] ;V (byte) x4
1507 punpcklbw mm0,mm7 ;U (word)
1509 movd mm2, dword ptr [ecx+ebp*2] ;row 1 Y0..Y3 (byte)
1510 punpcklbw mm1,mm7 ;V (word)
1512 movd mm3, dword ptr [edx+ebp*2] ;row 2 Y0..Y3 (byte)
1513 punpcklbw mm2,mm7 ;row 1 Y (word)
1515 psubw mm2,MMX_10w ;Y - 16
1516 punpcklbw mm3,mm7 ;row 2 Y (word)
1518 psubw mm3,MMX_10w ;Y - 16
1520 psubw mm0,MMX_80w ;U - 128
1521 psubw mm1,MMX_80w ;V - 128
1523 movq [esp+0],mm0 ;spill U words: mm0/mm1 are reused below
1524 movq [esp+8],mm1 ;spill V words
1526 ;group 1: B0 G0 R0 B1 (chroma sample 0)
1528 pmullw mm2,MMX_Ycoeff ;[lazy] scaled row 1 luma
1529 pmullw mm3,MMX_Ycoeff ;[lazy] scaled row 2 luma
1531 pshufw mm6,mm0,00000000b ;mm6 = U0U0U0U0
1532 pshufw mm7,mm1,00000000b ;mm7 = V0V0V0V0
1534 pmullw mm6,MMX_Ucoeff0
1535 pshufw mm4,mm2,01000000b ;mm4 = Y1Y0Y0Y0 [row 1]
1536 pmullw mm7,MMX_Vcoeff0
1537 pshufw mm5,mm3,01000000b ;mm5 = Y1Y0Y0Y0 [row 2]
1539 paddw mm4,mm6
1540 paddw mm5,mm6
1541 paddw mm4,mm7
1542 paddw mm5,mm7
1544 psraw mm4,6
1545 psraw mm5,6
1547 ;group 2: G1 R1 B2 G2 (chroma samples 0 and 1)
1549 pshufw mm6,[esp+0],01010000b ;mm6 = U1U1U0U0
1550 pshufw mm7,[esp+8],01010000b ;mm7 = V1V1V0V0
1552 pmullw mm6,MMX_Ucoeff1
1553 pshufw mm0,mm2,10100101b ;mm0 = Y2Y2Y1Y1 [row 1]
1554 pmullw mm7,MMX_Vcoeff1
1555 pshufw mm1,mm3,10100101b ;mm1 = Y2Y2Y1Y1 [row 2]
1557 paddw mm0,mm6
1558 paddw mm1,mm6
1559 paddw mm0,mm7
1560 paddw mm1,mm7
1562 psraw mm0,6
1563 psraw mm1,6
1565 packuswb mm4,mm0 ;row 1 bytes 0..7, clamped to 0..255
1566 packuswb mm5,mm1 ;row 2 bytes 0..7, clamped to 0..255
1568 ;group 3: R2 B3 G3 R3 (chroma sample 1)
1570 pshufw mm6,[esp+0],01010101b ;mm6 = U1U1U1U1
1571 pshufw mm7,[esp+8],01010101b ;mm7 = V1V1V1V1
1573 movntq [eax],mm4 ;[lazy write] row 1 bytes 0..7
1574 movntq [ebx],mm5 ;[lazy write] row 2 bytes 0..7
1576 pmullw mm6,MMX_Ucoeff2
1577 pshufw mm4,mm2,11111110b ;mm4 = Y3Y3Y3Y2 [row 1]
1578 pmullw mm7,MMX_Vcoeff2
1579 pshufw mm5,mm3,11111110b ;mm5 = Y3Y3Y3Y2 [row 2]
1581 paddw mm4,mm6
1582 paddw mm5,mm6
1583 paddw mm4,mm7
1584 paddw mm5,mm7
1586 psraw mm4,6
1587 psraw mm5,6
1589 ;next 3 groups: pixels 4..7 of each row (chroma samples 2 and 3)
1591 movd mm2, dword ptr [ecx+ebp*2+4] ;row 1 Y4..Y7 (byte)
1592 pxor mm7,mm7
1594 movd mm3, dword ptr [edx+ebp*2+4] ;row 2 Y4..Y7 (byte)
1595 punpcklbw mm2,mm7 ;row 1 Y (word)
1597 psubw mm2,MMX_10w
1598 punpcklbw mm3,mm7 ;row 2 Y (word)
1600 psubw mm3,MMX_10w
1603 ;group 1: B4 G4 R4 B5 (chroma sample 2)
1605 pmullw mm2,MMX_Ycoeff ;[init]
1606 pmullw mm3,MMX_Ycoeff ;[init]
1608 pshufw mm6,[esp+0],10101010b ;mm6 = U2U2U2U2
1609 pshufw mm7,[esp+8],10101010b ;mm7 = V2V2V2V2
1611 pmullw mm6,MMX_Ucoeff0
1612 pshufw mm0,mm2,01000000b ;mm0 = Y5Y4Y4Y4 [row 1]
1613 pmullw mm7,MMX_Vcoeff0
1614 pshufw mm1,mm3,01000000b ;mm1 = Y5Y4Y4Y4 [row 2]
1616 paddw mm0,mm6
1617 paddw mm1,mm6
1618 paddw mm0,mm7
1619 paddw mm1,mm7
1621 psraw mm0,6
1622 psraw mm1,6
1624 packuswb mm4,mm0 ;row 1 bytes 8..15
1625 packuswb mm5,mm1 ;row 2 bytes 8..15
1627 ;group 2: G5 R5 B6 G6 (chroma samples 2 and 3)
1629 pshufw mm6,[esp+0],11111010b ;mm6 = U3U3U2U2
1630 pshufw mm7,[esp+8],11111010b ;mm7 = V3V3V2V2
1632 movntq [eax+8],mm4
1633 movntq [ebx+8],mm5
1635 pmullw mm6,MMX_Ucoeff1
1636 pshufw mm4,mm2,10100101b ;mm4 = Y6Y6Y5Y5 [row 1]
1637 pmullw mm7,MMX_Vcoeff1
1638 pshufw mm5,mm3,10100101b ;mm5 = Y6Y6Y5Y5 [row 2]
1640 paddw mm4,mm6
1641 paddw mm5,mm6
1642 paddw mm4,mm7
1643 paddw mm5,mm7
1645 psraw mm4,6
1646 psraw mm5,6
1648 ;group 3: R6 B7 G7 R7 (chroma sample 3)
1650 pshufw mm0,[esp+0],11111111b ;mm0 = U3U3U3U3
1651 pshufw mm1,[esp+8],11111111b ;mm1 = V3V3V3V3
1653 pmullw mm0,MMX_Ucoeff2
1654 pshufw mm2,mm2,11111110b ;mm2 = Y7Y7Y7Y6 [row 1]
1655 pmullw mm1,MMX_Vcoeff2
1656 pshufw mm3,mm3,11111110b ;mm3 = Y7Y7Y7Y6 [row 2]
1658 paddw mm2,mm0
1659 paddw mm3,mm0
1660 paddw mm2,mm1
1661 paddw mm3,mm1
1663 psraw mm2,6
1664 psraw mm3,6
1666 packuswb mm4,mm2 ;row 1 bytes 16..23
1667 packuswb mm5,mm3 ;row 2 bytes 16..23
1669 movntq [eax+16],mm4
1670 add eax,24 ;8 pixels * 3 bytes
1671 movntq [ebx+16],mm5
1672 add ebx,24
1674 ;done with this group of 8 pixels per row
1676 add ebp,4 ;4 chroma samples consumed; ZF set when the row is finished
1677 jnz col_loop_ISSE24
1679 mov esp,[esp+16] ;drop the aligned scratch area, restore original esp
1681 pop ebp
1682 pop edi
1683 pop esi
1684 pop ebx
; Return to the caller; execution must not run on into the next routine's
; prologue.  NOTE(review): a plain ret assumes the caller removes the
; arguments - confirm against the C prototype.
1685 ret
; ---------------------------------------------------------------------------
; _asm_YUVtoRGB16_row_ISSE
;
; Converts two scanlines of planar YUV into 16-bit pixels using MMX plus the
; integer-SSE additions (movntq, prefetchnta).
;
; Operands (symbols defined outside this routine; the prologue modifies them
; in place, so presumably esp-relative argument slots - confirm):
;   ARGB1_pointer / ARGB2_pointer  output for row 1 / row 2
;   Y1_pointer    / Y2_pointer     luma for row 1 / row 2
;   U_pointer     / V_pointer      chroma, applied to both rows
;   count                          number of chroma samples per row
;
; Each loop iteration consumes 4 U bytes, 4 V bytes and 8 Y bytes per row and
; emits 16 bytes (8 pixels) per row.  The loop only stops when the negative
; index in ebp reaches exactly zero in steps of 4, so count must be a
; non-zero multiple of 4.
;
; Pixel layout follows MMX_rbmask (7C1Fh) and MMX_grnmask (03E0h):
;   bits 0-4 blue, bits 5-9 green, bits 10-14 red, bit 15 clear.
;
; Stream tags used in the trailing comments:
;   [C] chroma terms           [A] assemble/store row 1
;   [1] row 1 even pixels      [2] row 1 odd pixels
;   [3] row 2 even pixels      [4] row 2 odd pixels
; The bracketed digit lists are the original author's register-usage notes.
;
; Register roles inside the loop:
;   esi = U end, edi = V end, ecx = Y1 end, edx = Y2 end
;   eax = row 1 output end, ebx = row 2 output end
;   ebp = negative chroma index (runs from -count up to 0)
;
; ebx/esi/edi/ebp are saved and restored.  Stores are non-temporal (movntq)
; and the MMX state is left live: no sfence/emms is issued here.
; ---------------------------------------------------------------------------
1687 _asm_YUVtoRGB16_row_ISSE:
1688 push ebx
1689 push esi
1690 push edi
1691 push ebp
; Advance every pointer to the end of its row so the loop can index backwards
; with a single negative counter.
1693 mov eax,count
1694 mov ebp,eax
1695 mov ebx,eax
1696 shl ebx,2 ;4 output bytes per chroma sample (2 pixels * 2 bytes)
1697 add eax,eax ;2 luma bytes per chroma sample
1698 add ARGB1_pointer,ebx
1699 add ARGB2_pointer,ebx
1700 add Y1_pointer,eax
1701 add Y2_pointer,eax
1702 add U_pointer,ebp
1703 add V_pointer,ebp
1704 neg ebp ;ebp = -count
1706 mov esi,U_pointer
1707 mov edi,V_pointer
1708 mov ecx,Y1_pointer
1709 mov edx,Y2_pointer
1710 mov eax,ARGB1_pointer
1711 mov ebx,ARGB2_pointer
1713 col_loop_ISSE16:
1714 prefetchnta [esi+ebp+32]
1715 prefetchnta [edi+ebp+32]
1717 movd mm0, dword ptr [esi+ebp] ;[0 ] U (byte)
1718 pxor mm7,mm7 ;[0 7]
1720 movd mm1, dword ptr [edi+ebp] ;[01 7] V (byte)
1721 punpcklbw mm0,mm7 ;[01 7] U (word)
1723 psubw mm0,MMX_80w ;[01 7] U - 128
1724 punpcklbw mm1,mm7 ;[01 7] V (word)
1726 psubw mm1,MMX_80w ;[01 ] V - 128
1727 movq mm2,mm0 ;[012 ]
1729 pmullw mm2,MMX_Ugrncoeff ;[012 ]
1730 movq mm3,mm1 ;[0123 ]
1732 ;once the pmullw/paddw [C] instructions below have executed:
1733 ;mm0: blue chroma term, mm1: red chroma term
1734 ;mm2: green chroma term (U and V contributions summed)
1736 prefetchnta [ecx+ebp*2+32]
1737 prefetchnta [edx+ebp*2+32]
1739 movq mm6,[ecx+ebp*2] ;[0123 6 ] [1] Y (8 bytes of row 1)
1740 ;<-->
1742 pmullw mm3,MMX_Vgrncoeff ;[0123 ]
1743 movq mm7,mm6 ;[012 67] [2] Y
1745 pmullw mm0,MMX_Ublucoeff ;[0123 ]
1746 psrlw mm7,8 ;[012 67] [2] odd-numbered Y bytes -> words
1748 pmullw mm1,MMX_Vredcoeff ;[0123 ]
1749 ;<-->
1751 pand mm6,MMX_00FFw ;[012 67] [1] even-numbered Y bytes -> words
1752 paddw mm2,mm3 ;[012 6 ] [C] green = U term + V term
1754 psubw mm6,MMX_10w ;[012 67] [1]
1756 pmullw mm6,MMX_Ycoeff ;[012 67] [1]
1758 psubw mm7,MMX_10w ;[012 67] [2]
1759 movq mm4,mm6 ;[012 4 67] [1]
1761 pmullw mm7,MMX_Ycoeff ;[012 67] [2]
1762 movq mm5,mm6 ;[012 4567] [1]
1764 paddw mm6,mm0 ;[012 4 67] [1] mm6: <B3><B2><B1><B0>
1765 paddw mm4,mm1 ;[012 4567] [1] mm4: <R3><R2><R1><R0>
1767 paddw mm5,mm2 ;[012 4567] [1] mm5: <G3><G2><G1><G0>
1768 psraw mm4,6 ;[012 4567] [1]
1770 movq mm3,mm7 ;[01234567] [2]
1771 psraw mm5,4 ;[01234567] [1] green keeps 2 extra bits (8-bit value * 4)
1773 paddw mm7,mm0 ;[01234567] [2] mm7: <B3><B2><B1><B0>
1774 psraw mm6,6 ;[01234567] [1]
1776 paddsw mm5,MMX_clip ;signed-saturating add of 7C00h, then ...
1777 packuswb mm6,mm6 ;[01234567] [1] mm6: B3B2B1B0B3B2B1B0
1779 psubusw mm5,MMX_clip ;... unsigned-saturating subtract: green clamped to 0..03FFh
1780 packuswb mm4,mm4 ;[01234567] [1] mm4: R3R2R1R0R3R2R1R0
1782 pand mm5,MMX_grnmask ;[01234567] [1] mm5: <G3><G2><G1><G0> in bits 5-9
1783 psrlq mm6,2 ;[01234567] [1] blue >> 2 (stray bits are masked off below)
1785 punpcklbw mm6,mm4 ;[0123 567] [1] mm6: R3B3R2B2R1B1R0B0
1787 movq mm4,[edx+ebp*2] ;[01234567] [3] Y (8 bytes of row 2)
1788 psrlw mm6,1 ;[01234567] [1] red top bits -> 10-14, blue top bits -> 0-4
1790 pand mm6,MMX_rbmask ;[01234567] [1] mm6: <RB3><RB2><RB1><RB0>
1792 por mm6,mm5 ;[01234 67] [1] mm6: P6P4P2P0
1793 movq mm5,mm3 ;[01234567] [2]
1795 paddw mm3,mm1 ;[01234567] [2] mm3: <R3><R2><R1><R0>
1796 paddw mm5,mm2 ;[01234567] [2] mm5: <G3><G2><G1><G0>
1798 pand mm4,MMX_00FFw ;[01234567] [3]
1799 psraw mm3,6 ;[01234567] [2]
1801 psubw mm4,MMX_10w ;[01234567] [3]
1802 psraw mm5,4 ;[01234567] [2]
1804 pmullw mm4,MMX_Ycoeff ;[01234567] [3]
1805 psraw mm7,6 ;[01234567] [2]
1807 paddsw mm5,MMX_clip ;green clamp, as for stream [1]
1808 packuswb mm3,mm3 ;[01234567] [2] mm3: R3R2R1R0R3R2R1R0
1810 psubusw mm5,MMX_clip
1811 packuswb mm7,mm7 ;[01234567] [2] mm7: B3B2B1B0B3B2B1B0
1813 pand mm5,MMX_grnmask ;[012 4567] [2] mm5: <G3><G2><G1><G0>
1814 psrlq mm7,2 ;[01234567] [2]
1816 punpcklbw mm7,mm3 ;[012 4567] [2] mm7: R3B3R2B2R1B1R0B0
1818 movq mm3,[edx+ebp*2] ;[01234567] [4] Y (row 2 again, for the odd pixels)
1819 psrlw mm7,1 ;[01234567] [2]
1821 pand mm7,MMX_rbmask ;[01234567] [2] mm7: <RB3><RB2><RB1><RB0>
1822 psrlw mm3,8 ;[01234567] [4]
1824 por mm7,mm5 ;[01234567] [2] mm7: P7P5P3P1
1825 movq mm5,mm6 ;[01234567] [A]
1827 psubw mm3,MMX_10w ;[01234567] [4]
1828 punpcklwd mm6,mm7 ;[01234567] [A] mm6: P3P2P1P0
1830 pmullw mm3,MMX_Ycoeff ;[0123456 ] [4]
1831 punpckhwd mm5,mm7 ;[0123456 ] [A] mm5: P7P6P5P4
1833 movntq [eax+ebp*4 ],mm6 ;[012345 ] [A] row 1, pixels 0-3
1834 movq mm6,mm4 ;[0123456 ] [3]
1836 movntq [eax+ebp*4+ 8],mm5 ;[0123456 ] [A] row 1, pixels 4-7
1837 paddw mm6,mm0 ;[01234 6 ] [3] mm6: <B3><B2><B1><B0>
1839 movq mm5,mm4 ;[0123456 ] [3]
1840 paddw mm4,mm1 ;[0123456 ] [3] mm4: <R3><R2><R1><R0>
1842 paddw mm5,mm2 ;[0123456 ] [3] mm5: <G3><G2><G1><G0>
1843 psraw mm4,6 ;[0123456 ] [3]
1845 movq mm7,mm3 ;[01234567] [4]
1846 psraw mm5,4 ;[01234567] [3]
1848 paddw mm7,mm0 ;[01234567] [4] mm7: <B3><B2><B1><B0>
1849 psraw mm6,6 ;[01234567] [3]
1851 movq mm0,mm3 ;[01234567] [4] last use of the blue term is above; mm0 is reused for green
1852 packuswb mm4,mm4 ;[01234567] [3] mm4: R3R2R1R0R3R2R1R0
1855 packuswb mm6,mm6 ;[01 34567] [3] mm6: B3B2B1B0B3B2B1B0
1856 paddw mm3,mm1 ;[01234567] [4] mm3: <R3><R2><R1><R0>
1858 psrlq mm6,2
1859 paddw mm0,mm2 ;[01 34567] [4] mm0: <G3><G2><G1><G0>
1861 paddsw mm5,MMX_clip ;green clamp, stream [3]
1862 punpcklbw mm6,mm4 ;[01 3 567] [3] mm6: R3B3R2B2R1B1R0B0
1864 psubusw mm5,MMX_clip
1865 psrlw mm6,1 ;[01 3 567] [3]
1867 pand mm6,MMX_rbmask ;[01 3 567] [3] mm6: <RB3><RB2><RB1><RB0>
1868 psraw mm3,6 ;[01 3 567] [4]
1870 pand mm5,MMX_grnmask ;[01 3 567] [3] mm5: <G3><G2><G1><G0>
1871 psraw mm0,4 ;[01 3 567] [4]
1873 por mm6,mm5 ;[01 3 67] [3] mm6: P6P4P2P0
1874 psraw mm7,6 ;[01 3 67] [4]
1876 paddsw mm0,MMX_clip ;green clamp, stream [4]
1877 packuswb mm3,mm3 ;[01 3 67] [4] mm3: R3R2R1R0R3R2R1R0
1879 psubusw mm0,MMX_clip
1880 packuswb mm7,mm7 ;[01 3 67] mm7: B3B2B1B0B3B2B1B0
1882 pand mm0,MMX_grnmask ;[01 67] mm0: <G3><G2><G1><G0>
1883 psrlq mm7,2
1885 punpcklbw mm7,mm3 ;[01 67] mm7: R3B3R2B2R1B1R0B0
1886 movq mm1,mm6 ;copy of the even pixels for the high-half interleave
1888 psrlw mm7,1
1889 add ebp,4 ;sets ZF for the jnz below; the MMX ops and stores in between leave EFLAGS alone
1891 pand mm7,MMX_rbmask ;[01 67] mm7: <RB3><RB2><RB1><RB0>
1893 por mm0,mm7 ;[01 67] mm0: P7P5P3P1
1895 punpcklwd mm6,mm0 ;[01 6 ] mm6: P3P2P1P0
1897 punpckhwd mm1,mm0 ;[ 1 6 ] mm1: P7P6P5P4
1898 movntq [ebx+ebp*4-16],mm6 ;row 2, pixels 0-3 (-16 because ebp has already advanced)
1900 movntq [ebx+ebp*4- 8],mm1 ;row 2, pixels 4-7
1901 jnz col_loop_ISSE16
1903 pop ebp
1904 pop edi
1905 pop esi
1906 pop ebx
; NOTE(review): whatever follows this epilogue is not visible in this listing -
; confirm that a ret comes next.