d3d9: fix compilation after opengl rework
[vlc.git] / modules / video_chroma / i420_rgb16_x86.c
blob35d3272baf1e97473af87a1ffd41892d5ed17bc9
1 /*****************************************************************************
2 * i420_rgb16_x86.c : YUV to bitmap RGB conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000 VLC authors and VideoLAN
6 * Authors: Samuel Hocevar <sam@zoy.org>
7 * Damien Fouilleul <damienf@videolan.org>
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU Lesser General Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this program; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
24 #ifdef HAVE_CONFIG_H
25 # include "config.h"
26 #endif
28 #include <vlc_common.h>
29 #include <vlc_filter.h>
30 #include <vlc_picture.h>
31 #include <vlc_cpu.h>
33 #include "i420_rgb.h"
34 #ifdef SSE2
35 # include "i420_rgb_sse2.h"
36 # define VLC_TARGET VLC_SSE
37 #else
38 # include "i420_rgb_mmx.h"
39 # define VLC_TARGET VLC_MMX
40 #endif
42 /*****************************************************************************
43 * SetOffset: build offset array for conversion functions
44 *****************************************************************************
45 * This function will build an offset array used in later conversion functions.
46 * It will also set horizontal and vertical scaling indicators.
47 *****************************************************************************/
/* Build the horizontal offset table consumed by the SCALE_WIDTH macro and
 * set the scaling-mode indicators.
 *   i_width, i_height         : source dimensions
 *   i_pic_width, i_pic_height : destination dimensions
 *   pb_hscale : set to 1 when horizontal scaling is needed, 0 otherwise
 *   pi_vscale : 0 = no vertical scaling, 1 = expansion, (unsigned)-1 = reduction
 *   p_offset  : caller-provided array filled with per-output-pixel source steps
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
48 static void SetOffset( int i_width, int i_height, int i_pic_width,
49 int i_pic_height, bool *pb_hscale,
50 unsigned int *pi_vscale, int *p_offset )
53 * Prepare horizontal offset array
55 if( i_pic_width - i_width == 0 )
56 { /* No horizontal scaling: YUV conversion is done directly to picture */
57 *pb_hscale = 0;
59 else if( i_pic_width - i_width > 0 )
60 { /* Prepare scaling array for horizontal extension */
61 int i_scale_count = i_pic_width;
63 *pb_hscale = 1;
64 for( int i_x = i_width; i_x--; )
/* Bresenham-style error accumulation: emit 0-steps (duplicate the current
 * source pixel) while the accumulator stays positive, then one 1-step to
 * advance to the next source pixel. */
66 while( (i_scale_count -= i_width) > 0 )
68 *p_offset++ = 0;
70 *p_offset++ = 1;
71 i_scale_count += i_pic_width;
74 else /* if( i_pic_width - i_width < 0 ) */
75 { /* Prepare scaling array for horizontal reduction */
76 int i_scale_count = i_pic_width;
78 *pb_hscale = 1;
79 for( int i_x = i_pic_width; i_x--; )
/* Reduction: accumulate in *p_offset how many source pixels each output
 * pixel consumes before moving to the next table entry. */
81 *p_offset = 1;
82 while( (i_scale_count -= i_pic_width) > 0 )
84 *p_offset += 1;
86 p_offset++;
87 i_scale_count += i_width;
92 * Set vertical scaling indicator
94 if( i_pic_height - i_height == 0 )
95 *pi_vscale = 0;
96 else if( i_pic_height - i_height > 0 )
97 *pi_vscale = 1;
98 else /* if( i_pic_height - i_height < 0 ) */
99 *pi_vscale = -1;
/* Convert an I420 (planar YUV 4:2:0) picture to 15-bit RGB (R5G5B5),
 * with optional scaling to the output geometry. Compiled against SSE2
 * (16 pixels per iteration, aligned fast path) or MMX (8 pixels per
 * iteration) depending on the SSE2 build flag.
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
102 VLC_TARGET
103 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
105 filter_sys_t *p_sys = p_filter->p_sys;
107 /* We got this one from the old arguments */
108 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
109 uint8_t *p_y = p_src->Y_PIXELS;
110 uint8_t *p_u = p_src->U_PIXELS;
111 uint8_t *p_v = p_src->V_PIXELS;
113 bool b_hscale; /* horizontal scaling type */
114 unsigned int i_vscale; /* vertical scaling type */
115 unsigned int i_x, i_y; /* horizontal and vertical indexes */
117 int i_right_margin;
118 int i_rewind;
119 int i_scale_count; /* scale modulo counter */
120 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
121 uint16_t * p_pic_start; /* beginning of the current line for copy */
123 /* Conversion buffer pointer */
124 uint16_t * p_buffer_start;
125 uint16_t * p_buffer;
127 /* Offset array pointer */
128 int * p_offset_start = p_sys->p_offset;
129 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
131 const int i_source_margin = p_src->p[0].i_pitch
132 - p_src->p[0].i_visible_pitch
133 - p_filter->fmt_in.video.i_x_offset;
134 const int i_source_margin_c = p_src->p[1].i_pitch
135 - p_src->p[1].i_visible_pitch
136 - ( p_filter->fmt_in.video.i_x_offset / 2 );
138 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
140 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
141 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
142 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
143 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
144 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
145 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
146 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
147 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
149 if(b_hscale &&
150 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
151 p_filter->fmt_in.video.i_x_offset +
152 p_filter->fmt_in.video.i_visible_width,
153 p_sys->i_bytespp))
154 return;
155 else p_buffer_start = (uint16_t*)p_sys->p_buffer;
158 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
160 i_scale_count = ( i_vscale == 1 ) ?
161 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
162 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
164 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
166 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
169 ** SSE2 128 bits fetch/store instructions are faster
170 ** if memory access is 16 bytes aligned
173 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
175 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
176 p_dest->p->i_pitch|
177 ((intptr_t)p_y)|
178 ((intptr_t)p_buffer))) )
180 /* use faster SSE2 aligned fetch and store */
181 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
183 p_pic_start = p_pic;
185 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
187 SSE2_CALL (
188 SSE2_INIT_16_ALIGNED
189 SSE2_YUV_MUL
190 SSE2_YUV_ADD
191 SSE2_UNPACK_15_ALIGNED
193 p_y += 16;
194 p_u += 8;
195 p_v += 8;
196 p_buffer += 16;
198 /* Here we do some unaligned reads and duplicate conversions, but
199 * at least we have all the pixels */
200 if( i_rewind )
202 p_y -= i_rewind;
203 p_u -= i_rewind >> 1;
204 p_v -= i_rewind >> 1;
205 p_buffer -= i_rewind;
207 SSE2_CALL (
208 SSE2_INIT_16_UNALIGNED
209 SSE2_YUV_MUL
210 SSE2_YUV_ADD
211 SSE2_UNPACK_15_UNALIGNED
213 p_y += 16;
214 p_u += 8;
215 p_v += 8;
217 SCALE_WIDTH;
218 SCALE_HEIGHT( 420, 2 );
220 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
221 if( i_y % 2 )
223 p_u += i_source_margin_c;
224 p_v += i_source_margin_c;
226 p_buffer = b_hscale ? p_buffer_start : p_pic;
229 else
231 /* use slower SSE2 unaligned fetch and store */
232 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
234 p_pic_start = p_pic;
236 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
238 SSE2_CALL (
239 SSE2_INIT_16_UNALIGNED
240 SSE2_YUV_MUL
241 SSE2_YUV_ADD
242 SSE2_UNPACK_15_UNALIGNED
244 p_y += 16;
245 p_u += 8;
246 p_v += 8;
247 p_buffer += 16;
249 /* Here we do some unaligned reads and duplicate conversions, but
250 * at least we have all the pixels */
251 if( i_rewind )
253 p_y -= i_rewind;
254 p_u -= i_rewind >> 1;
255 p_v -= i_rewind >> 1;
256 p_buffer -= i_rewind;
258 SSE2_CALL (
259 SSE2_INIT_16_UNALIGNED
260 SSE2_YUV_MUL
261 SSE2_YUV_ADD
262 SSE2_UNPACK_15_UNALIGNED
264 p_y += 16;
265 p_u += 8;
266 p_v += 8;
268 SCALE_WIDTH;
269 SCALE_HEIGHT( 420, 2 );
271 p_y += i_source_margin;
272 if( i_y % 2 )
274 p_u += i_source_margin_c;
275 p_v += i_source_margin_c;
277 p_buffer = b_hscale ? p_buffer_start : p_pic;
281 /* make sure all SSE2 stores are visible thereafter */
282 SSE2_END;
284 #else /* SSE2 */
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
286 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
288 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
290 p_pic_start = p_pic;
291 p_buffer = b_hscale ? p_buffer_start : p_pic;
293 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
295 MMX_CALL (
296 MMX_INIT_16
297 MMX_YUV_MUL
298 MMX_YUV_ADD
299 MMX_UNPACK_15
301 p_y += 8;
302 p_u += 4;
303 p_v += 4;
304 p_buffer += 8;
307 /* Here we do some unaligned reads and duplicate conversions, but
308 * at least we have all the pixels */
309 if( i_rewind )
311 p_y -= i_rewind;
312 p_u -= i_rewind >> 1;
313 p_v -= i_rewind >> 1;
314 p_buffer -= i_rewind;
316 MMX_CALL (
317 MMX_INIT_16
318 MMX_YUV_MUL
319 MMX_YUV_ADD
320 MMX_UNPACK_15
322 p_y += 8;
323 p_u += 4;
324 p_v += 4;
326 SCALE_WIDTH;
327 SCALE_HEIGHT( 420, 2 );
329 p_y += i_source_margin;
330 if( i_y % 2 )
332 p_u += i_source_margin_c;
333 p_v += i_source_margin_c;
336 /* re-enable FPU registers */
337 MMX_END;
339 #endif /* SSE2 */
/* Convert an I420 (planar YUV 4:2:0) picture to 16-bit RGB (R5G6B5),
 * with optional scaling to the output geometry. Same structure as
 * I420_R5G5B5, differing only in the UNPACK_16 packing macros.
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
342 VLC_TARGET
343 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
345 filter_sys_t *p_sys = p_filter->p_sys;
347 /* We got this one from the old arguments */
348 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
349 uint8_t *p_y = p_src->Y_PIXELS;
350 uint8_t *p_u = p_src->U_PIXELS;
351 uint8_t *p_v = p_src->V_PIXELS;
353 bool b_hscale; /* horizontal scaling type */
354 unsigned int i_vscale; /* vertical scaling type */
355 unsigned int i_x, i_y; /* horizontal and vertical indexes */
357 int i_right_margin;
358 int i_rewind;
359 int i_scale_count; /* scale modulo counter */
360 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
361 uint16_t * p_pic_start; /* beginning of the current line for copy */
363 /* Conversion buffer pointer */
364 uint16_t * p_buffer_start;
365 uint16_t * p_buffer;
367 /* Offset array pointer */
368 int * p_offset_start = p_sys->p_offset;
369 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
371 const int i_source_margin = p_src->p[0].i_pitch
372 - p_src->p[0].i_visible_pitch
373 - p_filter->fmt_in.video.i_x_offset;
374 const int i_source_margin_c = p_src->p[1].i_pitch
375 - p_src->p[1].i_visible_pitch
376 - ( p_filter->fmt_in.video.i_x_offset / 2 );
378 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
380 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
381 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
382 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
383 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
384 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
385 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
386 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
387 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
389 if(b_hscale &&
390 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
391 p_filter->fmt_in.video.i_x_offset +
392 p_filter->fmt_in.video.i_visible_width,
393 p_sys->i_bytespp))
394 return;
395 else p_buffer_start = (uint16_t*)p_sys->p_buffer;
398 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
400 i_scale_count = ( i_vscale == 1 ) ?
401 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
402 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
404 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
406 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
409 ** SSE2 128 bits fetch/store instructions are faster
410 ** if memory access is 16 bytes aligned
413 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
415 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
416 p_dest->p->i_pitch|
417 ((intptr_t)p_y)|
418 ((intptr_t)p_buffer))) )
420 /* use faster SSE2 aligned fetch and store */
421 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
423 p_pic_start = p_pic;
425 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
427 SSE2_CALL (
428 SSE2_INIT_16_ALIGNED
429 SSE2_YUV_MUL
430 SSE2_YUV_ADD
431 SSE2_UNPACK_16_ALIGNED
433 p_y += 16;
434 p_u += 8;
435 p_v += 8;
436 p_buffer += 16;
438 /* Here we do some unaligned reads and duplicate conversions, but
439 * at least we have all the pixels */
440 if( i_rewind )
442 p_y -= i_rewind;
443 p_u -= i_rewind >> 1;
444 p_v -= i_rewind >> 1;
445 p_buffer -= i_rewind;
447 SSE2_CALL (
448 SSE2_INIT_16_UNALIGNED
449 SSE2_YUV_MUL
450 SSE2_YUV_ADD
451 SSE2_UNPACK_16_UNALIGNED
453 p_y += 16;
454 p_u += 8;
455 p_v += 8;
457 SCALE_WIDTH;
458 SCALE_HEIGHT( 420, 2 );
460 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
461 if( i_y % 2 )
463 p_u += i_source_margin_c;
464 p_v += i_source_margin_c;
466 p_buffer = b_hscale ? p_buffer_start : p_pic;
469 else
471 /* use slower SSE2 unaligned fetch and store */
472 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
474 p_pic_start = p_pic;
476 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
478 SSE2_CALL(
479 SSE2_INIT_16_UNALIGNED
480 SSE2_YUV_MUL
481 SSE2_YUV_ADD
482 SSE2_UNPACK_16_UNALIGNED
484 p_y += 16;
485 p_u += 8;
486 p_v += 8;
487 p_buffer += 16;
489 /* Here we do some unaligned reads and duplicate conversions, but
490 * at least we have all the pixels */
491 if( i_rewind )
493 p_y -= i_rewind;
494 p_u -= i_rewind >> 1;
495 p_v -= i_rewind >> 1;
496 p_buffer -= i_rewind;
498 SSE2_CALL(
499 SSE2_INIT_16_UNALIGNED
500 SSE2_YUV_MUL
501 SSE2_YUV_ADD
502 SSE2_UNPACK_16_UNALIGNED
504 p_y += 16;
505 p_u += 8;
506 p_v += 8;
508 SCALE_WIDTH;
509 SCALE_HEIGHT( 420, 2 );
511 p_y += i_source_margin;
512 if( i_y % 2 )
514 p_u += i_source_margin_c;
515 p_v += i_source_margin_c;
517 p_buffer = b_hscale ? p_buffer_start : p_pic;
521 /* make sure all SSE2 stores are visible thereafter */
522 SSE2_END;
524 #else /* SSE2 */
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
526 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
528 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
530 p_pic_start = p_pic;
531 p_buffer = b_hscale ? p_buffer_start : p_pic;
533 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
535 MMX_CALL (
536 MMX_INIT_16
537 MMX_YUV_MUL
538 MMX_YUV_ADD
539 MMX_UNPACK_16
541 p_y += 8;
542 p_u += 4;
543 p_v += 4;
544 p_buffer += 8;
547 /* Here we do some unaligned reads and duplicate conversions, but
548 * at least we have all the pixels */
549 if( i_rewind )
551 p_y -= i_rewind;
552 p_u -= i_rewind >> 1;
553 p_v -= i_rewind >> 1;
554 p_buffer -= i_rewind;
556 MMX_CALL (
557 MMX_INIT_16
558 MMX_YUV_MUL
559 MMX_YUV_ADD
560 MMX_UNPACK_16
562 p_y += 8;
563 p_u += 4;
564 p_v += 4;
566 SCALE_WIDTH;
567 SCALE_HEIGHT( 420, 2 );
569 p_y += i_source_margin;
570 if( i_y % 2 )
572 p_u += i_source_margin_c;
573 p_v += i_source_margin_c;
576 /* re-enable FPU registers */
577 MMX_END;
579 #endif /* SSE2 */
/* Convert an I420 (planar YUV 4:2:0) picture to 32-bit ARGB (A8R8G8B8),
 * with optional scaling to the output geometry. Same structure as the
 * 16-bit converters but with 32-bit pixels (SCALE_HEIGHT step of 4).
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
582 VLC_TARGET
583 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
584 picture_t *p_dest )
586 filter_sys_t *p_sys = p_filter->p_sys;
588 /* We got this one from the old arguments */
589 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
590 uint8_t *p_y = p_src->Y_PIXELS;
591 uint8_t *p_u = p_src->U_PIXELS;
592 uint8_t *p_v = p_src->V_PIXELS;
594 bool b_hscale; /* horizontal scaling type */
595 unsigned int i_vscale; /* vertical scaling type */
596 unsigned int i_x, i_y; /* horizontal and vertical indexes */
598 int i_right_margin;
599 int i_rewind;
600 int i_scale_count; /* scale modulo counter */
601 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
602 uint32_t * p_pic_start; /* beginning of the current line for copy */
603 /* Conversion buffer pointer */
604 uint32_t * p_buffer_start;
605 uint32_t * p_buffer;
607 /* Offset array pointer */
608 int * p_offset_start = p_sys->p_offset;
609 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
611 const int i_source_margin = p_src->p[0].i_pitch
612 - p_src->p[0].i_visible_pitch
613 - p_filter->fmt_in.video.i_x_offset;
614 const int i_source_margin_c = p_src->p[1].i_pitch
615 - p_src->p[1].i_visible_pitch
616 - ( p_filter->fmt_in.video.i_x_offset / 2 );
618 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
620 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
621 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
622 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
623 SetOffset( p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width,
624 p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height,
625 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
626 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
627 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
629 if(b_hscale &&
630 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
631 p_filter->fmt_in.video.i_x_offset +
632 p_filter->fmt_in.video.i_visible_width,
633 p_sys->i_bytespp))
634 return;
635 else p_buffer_start = (uint32_t*)p_sys->p_buffer;
638 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
640 i_scale_count = ( i_vscale == 1 ) ?
641 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
642 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
644 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
646 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
649 ** SSE2 128 bits fetch/store instructions are faster
650 ** if memory access is 16 bytes aligned
653 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
655 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
656 p_dest->p->i_pitch|
657 ((intptr_t)p_y)|
658 ((intptr_t)p_buffer))) )
660 /* use faster SSE2 aligned fetch and store */
661 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
663 p_pic_start = p_pic;
665 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
667 SSE2_CALL (
668 SSE2_INIT_32_ALIGNED
669 SSE2_YUV_MUL
670 SSE2_YUV_ADD
671 SSE2_UNPACK_32_ARGB_ALIGNED
673 p_y += 16;
674 p_u += 8;
675 p_v += 8;
676 p_buffer += 16;
679 /* Here we do some unaligned reads and duplicate conversions, but
680 * at least we have all the pixels */
681 if( i_rewind )
683 p_y -= i_rewind;
684 p_u -= i_rewind >> 1;
685 p_v -= i_rewind >> 1;
686 p_buffer -= i_rewind;
687 SSE2_CALL (
688 SSE2_INIT_32_UNALIGNED
689 SSE2_YUV_MUL
690 SSE2_YUV_ADD
691 SSE2_UNPACK_32_ARGB_UNALIGNED
693 p_y += 16;
694 p_u += 8;
695 p_v += 8;
697 SCALE_WIDTH;
698 SCALE_HEIGHT( 420, 4 );
700 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
701 if( i_y % 2 )
703 p_u += i_source_margin_c;
704 p_v += i_source_margin_c;
706 p_buffer = b_hscale ? p_buffer_start : p_pic;
709 else
711 /* use slower SSE2 unaligned fetch and store */
712 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
714 p_pic_start = p_pic;
716 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
718 SSE2_CALL (
719 SSE2_INIT_32_UNALIGNED
720 SSE2_YUV_MUL
721 SSE2_YUV_ADD
722 SSE2_UNPACK_32_ARGB_UNALIGNED
724 p_y += 16;
725 p_u += 8;
726 p_v += 8;
727 p_buffer += 16;
730 /* Here we do some unaligned reads and duplicate conversions, but
731 * at least we have all the pixels */
732 if( i_rewind )
734 p_y -= i_rewind;
735 p_u -= i_rewind >> 1;
736 p_v -= i_rewind >> 1;
737 p_buffer -= i_rewind;
738 SSE2_CALL (
739 SSE2_INIT_32_UNALIGNED
740 SSE2_YUV_MUL
741 SSE2_YUV_ADD
742 SSE2_UNPACK_32_ARGB_UNALIGNED
744 p_y += 16;
745 p_u += 8;
746 p_v += 8;
748 SCALE_WIDTH;
749 SCALE_HEIGHT( 420, 4 );
751 p_y += i_source_margin;
752 if( i_y % 2 )
754 p_u += i_source_margin_c;
755 p_v += i_source_margin_c;
757 p_buffer = b_hscale ? p_buffer_start : p_pic;
761 /* make sure all SSE2 stores are visible thereafter */
762 SSE2_END;
764 #else
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
766 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
768 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
770 p_pic_start = p_pic;
771 p_buffer = b_hscale ? p_buffer_start : p_pic;
773 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
775 MMX_CALL (
776 MMX_INIT_32
777 MMX_YUV_MUL
778 MMX_YUV_ADD
779 MMX_UNPACK_32_ARGB
781 p_y += 8;
782 p_u += 4;
783 p_v += 4;
784 p_buffer += 8;
787 /* Here we do some unaligned reads and duplicate conversions, but
788 * at least we have all the pixels */
789 if( i_rewind )
791 p_y -= i_rewind;
792 p_u -= i_rewind >> 1;
793 p_v -= i_rewind >> 1;
794 p_buffer -= i_rewind;
795 MMX_CALL (
796 MMX_INIT_32
797 MMX_YUV_MUL
798 MMX_YUV_ADD
799 MMX_UNPACK_32_ARGB
801 p_y += 8;
802 p_u += 4;
803 p_v += 4;
805 SCALE_WIDTH;
806 SCALE_HEIGHT( 420, 4 );
808 p_y += i_source_margin;
809 if( i_y % 2 )
811 p_u += i_source_margin_c;
812 p_v += i_source_margin_c;
816 /* re-enable FPU registers */
817 MMX_END;
819 #endif
/* Convert an I420 (planar YUV 4:2:0) picture to 32-bit RGBA (R8G8B8A8),
 * with optional scaling to the output geometry. Identical structure to
 * I420_A8R8G8B8, differing only in the RGBA packing macros.
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
822 VLC_TARGET
823 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
825 filter_sys_t *p_sys = p_filter->p_sys;
827 /* We got this one from the old arguments */
828 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
829 uint8_t *p_y = p_src->Y_PIXELS;
830 uint8_t *p_u = p_src->U_PIXELS;
831 uint8_t *p_v = p_src->V_PIXELS;
833 bool b_hscale; /* horizontal scaling type */
834 unsigned int i_vscale; /* vertical scaling type */
835 unsigned int i_x, i_y; /* horizontal and vertical indexes */
837 int i_right_margin;
838 int i_rewind;
839 int i_scale_count; /* scale modulo counter */
840 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
841 uint32_t * p_pic_start; /* beginning of the current line for copy */
842 /* Conversion buffer pointer */
843 uint32_t * p_buffer_start;
844 uint32_t * p_buffer;
846 /* Offset array pointer */
847 int * p_offset_start = p_sys->p_offset;
848 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
850 const int i_source_margin = p_src->p[0].i_pitch
851 - p_src->p[0].i_visible_pitch
852 - p_filter->fmt_in.video.i_x_offset;
853 const int i_source_margin_c = p_src->p[1].i_pitch
854 - p_src->p[1].i_visible_pitch
855 - ( p_filter->fmt_in.video.i_x_offset / 2 );
857 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
859 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
860 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
861 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
862 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
863 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
864 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
865 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
866 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
868 if(b_hscale &&
869 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
870 p_filter->fmt_in.video.i_x_offset +
871 p_filter->fmt_in.video.i_visible_width,
872 p_sys->i_bytespp))
873 return;
874 else p_buffer_start = (uint32_t*)p_sys->p_buffer;
877 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
879 i_scale_count = ( i_vscale == 1 ) ?
880 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
881 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
883 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
885 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
888 ** SSE2 128 bits fetch/store instructions are faster
889 ** if memory access is 16 bytes aligned
892 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
894 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
895 p_dest->p->i_pitch|
896 ((intptr_t)p_y)|
897 ((intptr_t)p_buffer))) )
899 /* use faster SSE2 aligned fetch and store */
900 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
902 p_pic_start = p_pic;
904 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
906 SSE2_CALL (
907 SSE2_INIT_32_ALIGNED
908 SSE2_YUV_MUL
909 SSE2_YUV_ADD
910 SSE2_UNPACK_32_RGBA_ALIGNED
912 p_y += 16;
913 p_u += 8;
914 p_v += 8;
915 p_buffer += 16;
918 /* Here we do some unaligned reads and duplicate conversions, but
919 * at least we have all the pixels */
920 if( i_rewind )
922 p_y -= i_rewind;
923 p_u -= i_rewind >> 1;
924 p_v -= i_rewind >> 1;
925 p_buffer -= i_rewind;
926 SSE2_CALL (
927 SSE2_INIT_32_UNALIGNED
928 SSE2_YUV_MUL
929 SSE2_YUV_ADD
930 SSE2_UNPACK_32_RGBA_UNALIGNED
932 p_y += 16;
933 p_u += 8;
934 p_v += 8;
936 SCALE_WIDTH;
937 SCALE_HEIGHT( 420, 4 );
939 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
940 if( i_y % 2 )
942 p_u += i_source_margin_c;
943 p_v += i_source_margin_c;
945 p_buffer = b_hscale ? p_buffer_start : p_pic;
948 else
950 /* use slower SSE2 unaligned fetch and store */
951 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
953 p_pic_start = p_pic;
955 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
957 SSE2_CALL (
958 SSE2_INIT_32_UNALIGNED
959 SSE2_YUV_MUL
960 SSE2_YUV_ADD
961 SSE2_UNPACK_32_RGBA_UNALIGNED
963 p_y += 16;
964 p_u += 8;
965 p_v += 8;
966 p_buffer += 16;
969 /* Here we do some unaligned reads and duplicate conversions, but
970 * at least we have all the pixels */
971 if( i_rewind )
973 p_y -= i_rewind;
974 p_u -= i_rewind >> 1;
975 p_v -= i_rewind >> 1;
976 p_buffer -= i_rewind;
977 SSE2_CALL (
978 SSE2_INIT_32_UNALIGNED
979 SSE2_YUV_MUL
980 SSE2_YUV_ADD
981 SSE2_UNPACK_32_RGBA_UNALIGNED
983 p_y += 16;
984 p_u += 8;
985 p_v += 8;
987 SCALE_WIDTH;
988 SCALE_HEIGHT( 420, 4 );
990 p_y += i_source_margin;
991 if( i_y % 2 )
993 p_u += i_source_margin_c;
994 p_v += i_source_margin_c;
996 p_buffer = b_hscale ? p_buffer_start : p_pic;
1000 /* make sure all SSE2 stores are visible thereafter */
1001 SSE2_END;
1003 #else
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
1005 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1007 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1009 p_pic_start = p_pic;
1010 p_buffer = b_hscale ? p_buffer_start : p_pic;
1012 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1014 MMX_CALL (
1015 MMX_INIT_32
1016 MMX_YUV_MUL
1017 MMX_YUV_ADD
1018 MMX_UNPACK_32_RGBA
1020 p_y += 8;
1021 p_u += 4;
1022 p_v += 4;
1023 p_buffer += 8;
1026 /* Here we do some unaligned reads and duplicate conversions, but
1027 * at least we have all the pixels */
1028 if( i_rewind )
1030 p_y -= i_rewind;
1031 p_u -= i_rewind >> 1;
1032 p_v -= i_rewind >> 1;
1033 p_buffer -= i_rewind;
1034 MMX_CALL (
1035 MMX_INIT_32
1036 MMX_YUV_MUL
1037 MMX_YUV_ADD
1038 MMX_UNPACK_32_RGBA
1040 p_y += 8;
1041 p_u += 4;
1042 p_v += 4;
1044 SCALE_WIDTH;
1045 SCALE_HEIGHT( 420, 4 );
1047 p_y += i_source_margin;
1048 if( i_y % 2 )
1050 p_u += i_source_margin_c;
1051 p_v += i_source_margin_c;
1055 /* re-enable FPU registers */
1056 MMX_END;
1058 #endif
/* Convert an I420 (planar YUV 4:2:0) picture to 32-bit BGRA (B8G8R8A8),
 * with optional scaling to the output geometry. Identical structure to
 * I420_A8R8G8B8, differing only in the BGRA packing macros.
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
1061 VLC_TARGET
1062 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1064 filter_sys_t *p_sys = p_filter->p_sys;
1066 /* We got this one from the old arguments */
1067 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1068 uint8_t *p_y = p_src->Y_PIXELS;
1069 uint8_t *p_u = p_src->U_PIXELS;
1070 uint8_t *p_v = p_src->V_PIXELS;
1072 bool b_hscale; /* horizontal scaling type */
1073 unsigned int i_vscale; /* vertical scaling type */
1074 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1076 int i_right_margin;
1077 int i_rewind;
1078 int i_scale_count; /* scale modulo counter */
1079 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
1080 uint32_t * p_pic_start; /* beginning of the current line for copy */
1081 /* Conversion buffer pointer */
1082 uint32_t * p_buffer_start;
1083 uint32_t * p_buffer;
1085 /* Offset array pointer */
1086 int * p_offset_start = p_sys->p_offset;
1087 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
1089 const int i_source_margin = p_src->p[0].i_pitch
1090 - p_src->p[0].i_visible_pitch
1091 - p_filter->fmt_in.video.i_x_offset;
1092 const int i_source_margin_c = p_src->p[1].i_pitch
1093 - p_src->p[1].i_visible_pitch
1094 - ( p_filter->fmt_in.video.i_x_offset / 2 );
1096 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1098 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1099 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1100 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1101 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
1102 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
1103 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
1104 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
1105 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
1107 if(b_hscale &&
1108 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
1109 p_filter->fmt_in.video.i_x_offset +
1110 p_filter->fmt_in.video.i_visible_width,
1111 p_sys->i_bytespp))
1112 return;
1113 else p_buffer_start = (uint32_t*)p_sys->p_buffer;
1116 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
1118 i_scale_count = ( i_vscale == 1 ) ?
1119 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
1120 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
1122 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
1124 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
1127 ** SSE2 128 bits fetch/store instructions are faster
1128 ** if memory access is 16 bytes aligned
1131 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
1133 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1134 p_dest->p->i_pitch|
1135 ((intptr_t)p_y)|
1136 ((intptr_t)p_buffer))) )
1138 /* use faster SSE2 aligned fetch and store */
1139 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1141 p_pic_start = p_pic;
1143 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1145 SSE2_CALL (
1146 SSE2_INIT_32_ALIGNED
1147 SSE2_YUV_MUL
1148 SSE2_YUV_ADD
1149 SSE2_UNPACK_32_BGRA_ALIGNED
1151 p_y += 16;
1152 p_u += 8;
1153 p_v += 8;
1154 p_buffer += 16;
1157 /* Here we do some unaligned reads and duplicate conversions, but
1158 * at least we have all the pixels */
1159 if( i_rewind )
1161 p_y -= i_rewind;
1162 p_u -= i_rewind >> 1;
1163 p_v -= i_rewind >> 1;
1164 p_buffer -= i_rewind;
1165 SSE2_CALL (
1166 SSE2_INIT_32_UNALIGNED
1167 SSE2_YUV_MUL
1168 SSE2_YUV_ADD
1169 SSE2_UNPACK_32_BGRA_UNALIGNED
1171 p_y += 16;
1172 p_u += 8;
1173 p_v += 8;
1175 SCALE_WIDTH;
1176 SCALE_HEIGHT( 420, 4 );
1178 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
1179 if( i_y % 2 )
1181 p_u += i_source_margin_c;
1182 p_v += i_source_margin_c;
1184 p_buffer = b_hscale ? p_buffer_start : p_pic;
1187 else
1189 /* use slower SSE2 unaligned fetch and store */
1190 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1192 p_pic_start = p_pic;
1194 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1196 SSE2_CALL (
1197 SSE2_INIT_32_UNALIGNED
1198 SSE2_YUV_MUL
1199 SSE2_YUV_ADD
1200 SSE2_UNPACK_32_BGRA_UNALIGNED
1202 p_y += 16;
1203 p_u += 8;
1204 p_v += 8;
1205 p_buffer += 16;
1208 /* Here we do some unaligned reads and duplicate conversions, but
1209 * at least we have all the pixels */
1210 if( i_rewind )
1212 p_y -= i_rewind;
1213 p_u -= i_rewind >> 1;
1214 p_v -= i_rewind >> 1;
1215 p_buffer -= i_rewind;
1216 SSE2_CALL (
1217 SSE2_INIT_32_UNALIGNED
1218 SSE2_YUV_MUL
1219 SSE2_YUV_ADD
1220 SSE2_UNPACK_32_BGRA_UNALIGNED
1222 p_y += 16;
1223 p_u += 8;
1224 p_v += 8;
1226 SCALE_WIDTH;
1227 SCALE_HEIGHT( 420, 4 );
1229 p_y += i_source_margin;
1230 if( i_y % 2 )
1232 p_u += i_source_margin_c;
1233 p_v += i_source_margin_c;
1235 p_buffer = b_hscale ? p_buffer_start : p_pic;
1239 /* make sure all SSE2 stores are visible thereafter */
1240 SSE2_END;
1242 #else
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
1244 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1246 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1248 p_pic_start = p_pic;
1249 p_buffer = b_hscale ? p_buffer_start : p_pic;
1251 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1253 MMX_CALL (
1254 MMX_INIT_32
1255 MMX_YUV_MUL
1256 MMX_YUV_ADD
1257 MMX_UNPACK_32_BGRA
1259 p_y += 8;
1260 p_u += 4;
1261 p_v += 4;
1262 p_buffer += 8;
1265 /* Here we do some unaligned reads and duplicate conversions, but
1266 * at least we have all the pixels */
1267 if( i_rewind )
1269 p_y -= i_rewind;
1270 p_u -= i_rewind >> 1;
1271 p_v -= i_rewind >> 1;
1272 p_buffer -= i_rewind;
1273 MMX_CALL (
1274 MMX_INIT_32
1275 MMX_YUV_MUL
1276 MMX_YUV_ADD
1277 MMX_UNPACK_32_BGRA
1279 p_y += 8;
1280 p_u += 4;
1281 p_v += 4;
1283 SCALE_WIDTH;
1284 SCALE_HEIGHT( 420, 4 );
1286 p_y += i_source_margin;
1287 if( i_y % 2 )
1289 p_u += i_source_margin_c;
1290 p_v += i_source_margin_c;
1294 /* re-enable FPU registers */
1295 MMX_END;
1297 #endif
1300 VLC_TARGET
1301 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1303 filter_sys_t *p_sys = p_filter->p_sys;
1305 /* We got this one from the old arguments */
1306 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1307 uint8_t *p_y = p_src->Y_PIXELS;
1308 uint8_t *p_u = p_src->U_PIXELS;
1309 uint8_t *p_v = p_src->V_PIXELS;
1311 bool b_hscale; /* horizontal scaling type */
1312 unsigned int i_vscale; /* vertical scaling type */
1313 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1315 int i_right_margin;
1316 int i_rewind;
1317 int i_scale_count; /* scale modulo counter */
1318 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
1319 uint32_t * p_pic_start; /* beginning of the current line for copy */
1320 /* Conversion buffer pointer */
1321 uint32_t * p_buffer_start;
1322 uint32_t * p_buffer;
1324 /* Offset array pointer */
1325 int * p_offset_start = p_sys->p_offset;
1326 int * p_offset;
1328 const int i_source_margin = p_src->p[0].i_pitch
1329 - p_src->p[0].i_visible_pitch
1330 - p_filter->fmt_in.video.i_x_offset;
1331 const int i_source_margin_c = p_src->p[1].i_pitch
1332 - p_src->p[1].i_visible_pitch
1333 - ( p_filter->fmt_in.video.i_x_offset / 2 );
1335 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1337 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1338 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1339 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1340 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
1341 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
1342 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
1343 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
1344 &b_hscale, &i_vscale, p_offset_start );
1346 if(b_hscale &&
1347 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
1348 p_filter->fmt_in.video.i_x_offset +
1349 p_filter->fmt_in.video.i_visible_width,
1350 p_sys->i_bytespp))
1351 return;
1352 else p_buffer_start = (uint32_t*)p_sys->p_buffer;
1355 * Perform conversion
1357 i_scale_count = ( i_vscale == 1 ) ?
1358 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
1359 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
1361 #ifdef SSE2
1363 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
1366 ** SSE2 128 bits fetch/store instructions are faster
1367 ** if memory access is 16 bytes aligned
1370 p_buffer = b_hscale ? p_buffer_start : p_pic;
1372 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1373 p_dest->p->i_pitch|
1374 ((intptr_t)p_y)|
1375 ((intptr_t)p_buffer))) )
1377 /* use faster SSE2 aligned fetch and store */
1378 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1380 p_pic_start = p_pic;
1382 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1384 SSE2_CALL (
1385 SSE2_INIT_32_ALIGNED
1386 SSE2_YUV_MUL
1387 SSE2_YUV_ADD
1388 SSE2_UNPACK_32_ABGR_ALIGNED
1390 p_y += 16;
1391 p_u += 8;
1392 p_v += 8;
1393 p_buffer += 16;
1396 /* Here we do some unaligned reads and duplicate conversions, but
1397 * at least we have all the pixels */
1398 if( i_rewind )
1400 p_y -= i_rewind;
1401 p_u -= i_rewind >> 1;
1402 p_v -= i_rewind >> 1;
1403 p_buffer -= i_rewind;
1404 SSE2_CALL (
1405 SSE2_INIT_32_UNALIGNED
1406 SSE2_YUV_MUL
1407 SSE2_YUV_ADD
1408 SSE2_UNPACK_32_ABGR_UNALIGNED
1410 p_y += 16;
1411 p_u += 8;
1412 p_v += 8;
1414 SCALE_WIDTH;
1415 SCALE_HEIGHT( 420, 4 );
1417 p_y += i_source_margin;
1418 if( i_y % 2 )
1420 p_u += i_source_margin_c;
1421 p_v += i_source_margin_c;
1423 p_buffer = b_hscale ? p_buffer_start : p_pic;
1426 else
1428 /* use slower SSE2 unaligned fetch and store */
1429 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1431 p_pic_start = p_pic;
1433 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1435 SSE2_CALL (
1436 SSE2_INIT_32_UNALIGNED
1437 SSE2_YUV_MUL
1438 SSE2_YUV_ADD
1439 SSE2_UNPACK_32_ABGR_UNALIGNED
1441 p_y += 16;
1442 p_u += 8;
1443 p_v += 8;
1444 p_buffer += 16;
1447 /* Here we do some unaligned reads and duplicate conversions, but
1448 * at least we have all the pixels */
1449 if( i_rewind )
1451 p_y -= i_rewind;
1452 p_u -= i_rewind >> 1;
1453 p_v -= i_rewind >> 1;
1454 p_buffer -= i_rewind;
1455 SSE2_CALL (
1456 SSE2_INIT_32_UNALIGNED
1457 SSE2_YUV_MUL
1458 SSE2_YUV_ADD
1459 SSE2_UNPACK_32_ABGR_UNALIGNED
1461 p_y += 16;
1462 p_u += 8;
1463 p_v += 8;
1465 SCALE_WIDTH;
1466 SCALE_HEIGHT( 420, 4 );
1468 p_y += i_source_margin;
1469 if( i_y % 2 )
1471 p_u += i_source_margin_c;
1472 p_v += i_source_margin_c;
1474 p_buffer = b_hscale ? p_buffer_start : p_pic;
1478 /* make sure all SSE2 stores are visible thereafter */
1479 SSE2_END;
1481 #else
1483 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1485 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1487 p_pic_start = p_pic;
1488 p_buffer = b_hscale ? p_buffer_start : p_pic;
1490 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1492 MMX_CALL (
1493 MMX_INIT_32
1494 MMX_YUV_MUL
1495 MMX_YUV_ADD
1496 MMX_UNPACK_32_ABGR
1498 p_y += 8;
1499 p_u += 4;
1500 p_v += 4;
1501 p_buffer += 8;
1504 /* Here we do some unaligned reads and duplicate conversions, but
1505 * at least we have all the pixels */
1506 if( i_rewind )
1508 p_y -= i_rewind;
1509 p_u -= i_rewind >> 1;
1510 p_v -= i_rewind >> 1;
1511 p_buffer -= i_rewind;
1512 MMX_CALL (
1513 MMX_INIT_32
1514 MMX_YUV_MUL
1515 MMX_YUV_ADD
1516 MMX_UNPACK_32_ABGR
1518 p_y += 8;
1519 p_u += 4;
1520 p_v += 4;
1522 SCALE_WIDTH;
1523 SCALE_HEIGHT( 420, 4 );
1525 p_y += i_source_margin;
1526 if( i_y % 2 )
1528 p_u += i_source_margin_c;
1529 p_v += i_source_margin_c;
1533 /* re-enable FPU registers */
1534 MMX_END;
1536 #endif