/* modules/video_chroma/i420_rgb16_x86.c
 * (removed unrelated web-viewer residue: a foreign commit subject line
 * "demux: mp4: avoid audio cuts on seek" and blob hash 896694c48a92d8fc3d6a9ec064c1ba889498284d) */
1 /*****************************************************************************
2 * i420_rgb16_x86.c : YUV to bitmap RGB conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000 VLC authors and VideoLAN
5 * $Id$
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damienf@videolan.org>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 #ifdef HAVE_CONFIG_H
26 # include "config.h"
27 #endif
29 #include <vlc_common.h>
30 #include <vlc_filter.h>
31 #include <vlc_picture.h>
32 #include <vlc_cpu.h>
34 #include "i420_rgb.h"
35 #ifdef SSE2
36 # include "i420_rgb_sse2.h"
37 # define VLC_TARGET VLC_SSE
38 #else
39 # include "i420_rgb_mmx.h"
40 # define VLC_TARGET VLC_MMX
41 #endif
/*****************************************************************************
 * SetOffset: build offset array for conversion functions
 *****************************************************************************
 * Builds the horizontal offset table consumed by the SCALE_WIDTH macro and
 * sets the horizontal and vertical scaling indicators.
 *
 * i_width/i_height        source dimensions (including x/y offset)
 * i_pic_width/i_pic_height destination dimensions (including x/y offset)
 * pb_hscale               out: true when horizontal scaling is needed
 * pi_vscale               out: 0 = none, 1 = expand, -1 = reduce
 *                         (stored in an unsigned int, so -1 wraps; callers
 *                         only compare against 0 and 1)
 * p_offset                out: offset table, caller-allocated; for expansion
 *                         one 0/1 "advance" flag per destination pixel, for
 *                         reduction one skip count per destination pixel
 *****************************************************************************/
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, bool *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    /*
     * Prepare horizontal offset array
     */
    if( i_pic_width - i_width == 0 )
    {
        /* No horizontal scaling: YUV conversion is done directly to picture */
        *pb_hscale = 0;
    }
    else if( i_pic_width - i_width > 0 )
    {
        /* Prepare scaling array for horizontal extension: Bresenham-style
         * run of 0 flags (repeat source pixel) terminated by a 1 (advance) */
        int i_scale_count = i_pic_width;

        *pb_hscale = 1;
        for( int i_x = i_width; i_x--; )
        {
            while( ( i_scale_count -= i_width ) > 0 )
            {
                *p_offset++ = 0;
            }
            *p_offset++ = 1;
            i_scale_count += i_pic_width;
        }
    }
    else /* if( i_pic_width - i_width < 0 ) */
    {
        /* Prepare scaling array for horizontal reduction: each entry is the
         * number of source pixels to step over per destination pixel */
        int i_scale_count = i_pic_width;

        *pb_hscale = 1;
        for( int i_x = i_pic_width; i_x--; )
        {
            *p_offset = 1;
            while( ( i_scale_count -= i_pic_width ) > 0 )
            {
                *p_offset += 1;
            }
            p_offset++;
            i_scale_count += i_width;
        }
    }

    /*
     * Set vertical scaling indicator
     */
    if( i_pic_height - i_height == 0 )
        *pi_vscale = 0;
    else if( i_pic_height - i_height > 0 )
        *pi_vscale = 1;
    else /* if( i_pic_height - i_height < 0 ) */
        *pi_vscale = -1;
}
103 VLC_TARGET
104 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
106 /* We got this one from the old arguments */
107 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
108 uint8_t *p_y = p_src->Y_PIXELS;
109 uint8_t *p_u = p_src->U_PIXELS;
110 uint8_t *p_v = p_src->V_PIXELS;
112 bool b_hscale; /* horizontal scaling type */
113 unsigned int i_vscale; /* vertical scaling type */
114 unsigned int i_x, i_y; /* horizontal and vertical indexes */
116 int i_right_margin;
117 int i_rewind;
118 int i_scale_count; /* scale modulo counter */
119 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
120 uint16_t * p_pic_start; /* beginning of the current line for copy */
122 /* Conversion buffer pointer */
123 uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
124 uint16_t * p_buffer;
126 /* Offset array pointer */
127 int * p_offset_start = p_filter->p_sys->p_offset;
128 int * p_offset;
130 const int i_source_margin = p_src->p[0].i_pitch
131 - p_src->p[0].i_visible_pitch
132 - p_filter->fmt_in.video.i_x_offset;
133 const int i_source_margin_c = p_src->p[1].i_pitch
134 - p_src->p[1].i_visible_pitch
135 - ( p_filter->fmt_in.video.i_x_offset / 2 );
137 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
139 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
140 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
141 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
142 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
143 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
144 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
145 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
146 &b_hscale, &i_vscale, p_offset_start );
150 * Perform conversion
152 i_scale_count = ( i_vscale == 1 ) ?
153 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
154 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
156 #ifdef SSE2
158 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
161 ** SSE2 128 bits fetch/store instructions are faster
162 ** if memory access is 16 bytes aligned
165 p_buffer = b_hscale ? p_buffer_start : p_pic;
166 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
167 p_dest->p->i_pitch|
168 ((intptr_t)p_y)|
169 ((intptr_t)p_buffer))) )
171 /* use faster SSE2 aligned fetch and store */
172 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
174 p_pic_start = p_pic;
176 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
178 SSE2_CALL (
179 SSE2_INIT_16_ALIGNED
180 SSE2_YUV_MUL
181 SSE2_YUV_ADD
182 SSE2_UNPACK_15_ALIGNED
184 p_y += 16;
185 p_u += 8;
186 p_v += 8;
187 p_buffer += 16;
189 /* Here we do some unaligned reads and duplicate conversions, but
190 * at least we have all the pixels */
191 if( i_rewind )
193 p_y -= i_rewind;
194 p_u -= i_rewind >> 1;
195 p_v -= i_rewind >> 1;
196 p_buffer -= i_rewind;
198 SSE2_CALL (
199 SSE2_INIT_16_UNALIGNED
200 SSE2_YUV_MUL
201 SSE2_YUV_ADD
202 SSE2_UNPACK_15_UNALIGNED
204 p_y += 16;
205 p_u += 8;
206 p_v += 8;
208 SCALE_WIDTH;
209 SCALE_HEIGHT( 420, 2 );
211 p_y += i_source_margin;
212 if( i_y % 2 )
214 p_u += i_source_margin_c;
215 p_v += i_source_margin_c;
217 p_buffer = b_hscale ? p_buffer_start : p_pic;
220 else
222 /* use slower SSE2 unaligned fetch and store */
223 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
225 p_pic_start = p_pic;
226 p_buffer = b_hscale ? p_buffer_start : p_pic;
228 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
230 SSE2_CALL (
231 SSE2_INIT_16_UNALIGNED
232 SSE2_YUV_MUL
233 SSE2_YUV_ADD
234 SSE2_UNPACK_15_UNALIGNED
236 p_y += 16;
237 p_u += 8;
238 p_v += 8;
239 p_buffer += 16;
241 /* Here we do some unaligned reads and duplicate conversions, but
242 * at least we have all the pixels */
243 if( i_rewind )
245 p_y -= i_rewind;
246 p_u -= i_rewind >> 1;
247 p_v -= i_rewind >> 1;
248 p_buffer -= i_rewind;
250 SSE2_CALL (
251 SSE2_INIT_16_UNALIGNED
252 SSE2_YUV_MUL
253 SSE2_YUV_ADD
254 SSE2_UNPACK_15_UNALIGNED
256 p_y += 16;
257 p_u += 8;
258 p_v += 8;
260 SCALE_WIDTH;
261 SCALE_HEIGHT( 420, 2 );
263 p_y += i_source_margin;
264 if( i_y % 2 )
266 p_u += i_source_margin_c;
267 p_v += i_source_margin_c;
269 p_buffer = b_hscale ? p_buffer_start : p_pic;
273 /* make sure all SSE2 stores are visible thereafter */
274 SSE2_END;
276 #else /* SSE2 */
278 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
280 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
282 p_pic_start = p_pic;
283 p_buffer = b_hscale ? p_buffer_start : p_pic;
285 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
287 MMX_CALL (
288 MMX_INIT_16
289 MMX_YUV_MUL
290 MMX_YUV_ADD
291 MMX_UNPACK_15
293 p_y += 8;
294 p_u += 4;
295 p_v += 4;
296 p_buffer += 8;
299 /* Here we do some unaligned reads and duplicate conversions, but
300 * at least we have all the pixels */
301 if( i_rewind )
303 p_y -= i_rewind;
304 p_u -= i_rewind >> 1;
305 p_v -= i_rewind >> 1;
306 p_buffer -= i_rewind;
308 MMX_CALL (
309 MMX_INIT_16
310 MMX_YUV_MUL
311 MMX_YUV_ADD
312 MMX_UNPACK_15
314 p_y += 8;
315 p_u += 4;
316 p_v += 4;
317 p_buffer += 8;
319 SCALE_WIDTH;
320 SCALE_HEIGHT( 420, 2 );
322 p_y += i_source_margin;
323 if( i_y % 2 )
325 p_u += i_source_margin_c;
326 p_v += i_source_margin_c;
329 /* re-enable FPU registers */
330 MMX_END;
332 #endif /* SSE2 */
335 VLC_TARGET
336 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
338 /* We got this one from the old arguments */
339 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
340 uint8_t *p_y = p_src->Y_PIXELS;
341 uint8_t *p_u = p_src->U_PIXELS;
342 uint8_t *p_v = p_src->V_PIXELS;
344 bool b_hscale; /* horizontal scaling type */
345 unsigned int i_vscale; /* vertical scaling type */
346 unsigned int i_x, i_y; /* horizontal and vertical indexes */
348 int i_right_margin;
349 int i_rewind;
350 int i_scale_count; /* scale modulo counter */
351 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
352 uint16_t * p_pic_start; /* beginning of the current line for copy */
354 /* Conversion buffer pointer */
355 uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
356 uint16_t * p_buffer;
358 /* Offset array pointer */
359 int * p_offset_start = p_filter->p_sys->p_offset;
360 int * p_offset;
362 const int i_source_margin = p_src->p[0].i_pitch
363 - p_src->p[0].i_visible_pitch
364 - p_filter->fmt_in.video.i_x_offset;
365 const int i_source_margin_c = p_src->p[1].i_pitch
366 - p_src->p[1].i_visible_pitch
367 - ( p_filter->fmt_in.video.i_x_offset / 2 );
369 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
371 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
372 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
373 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
374 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
375 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
376 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
377 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
378 &b_hscale, &i_vscale, p_offset_start );
382 * Perform conversion
384 i_scale_count = ( i_vscale == 1 ) ?
385 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
386 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
388 #ifdef SSE2
390 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
393 ** SSE2 128 bits fetch/store instructions are faster
394 ** if memory access is 16 bytes aligned
397 p_buffer = b_hscale ? p_buffer_start : p_pic;
398 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
399 p_dest->p->i_pitch|
400 ((intptr_t)p_y)|
401 ((intptr_t)p_buffer))) )
403 /* use faster SSE2 aligned fetch and store */
404 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
406 p_pic_start = p_pic;
408 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
410 SSE2_CALL (
411 SSE2_INIT_16_ALIGNED
412 SSE2_YUV_MUL
413 SSE2_YUV_ADD
414 SSE2_UNPACK_16_ALIGNED
416 p_y += 16;
417 p_u += 8;
418 p_v += 8;
419 p_buffer += 16;
421 /* Here we do some unaligned reads and duplicate conversions, but
422 * at least we have all the pixels */
423 if( i_rewind )
425 p_y -= i_rewind;
426 p_u -= i_rewind >> 1;
427 p_v -= i_rewind >> 1;
428 p_buffer -= i_rewind;
430 SSE2_CALL (
431 SSE2_INIT_16_UNALIGNED
432 SSE2_YUV_MUL
433 SSE2_YUV_ADD
434 SSE2_UNPACK_16_UNALIGNED
436 p_y += 16;
437 p_u += 8;
438 p_v += 8;
440 SCALE_WIDTH;
441 SCALE_HEIGHT( 420, 2 );
443 p_y += i_source_margin;
444 if( i_y % 2 )
446 p_u += i_source_margin_c;
447 p_v += i_source_margin_c;
449 p_buffer = b_hscale ? p_buffer_start : p_pic;
452 else
454 /* use slower SSE2 unaligned fetch and store */
455 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
457 p_pic_start = p_pic;
458 p_buffer = b_hscale ? p_buffer_start : p_pic;
460 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
462 SSE2_CALL(
463 SSE2_INIT_16_UNALIGNED
464 SSE2_YUV_MUL
465 SSE2_YUV_ADD
466 SSE2_UNPACK_16_UNALIGNED
468 p_y += 16;
469 p_u += 8;
470 p_v += 8;
471 p_buffer += 16;
473 /* Here we do some unaligned reads and duplicate conversions, but
474 * at least we have all the pixels */
475 if( i_rewind )
477 p_y -= i_rewind;
478 p_u -= i_rewind >> 1;
479 p_v -= i_rewind >> 1;
480 p_buffer -= i_rewind;
482 SSE2_CALL(
483 SSE2_INIT_16_UNALIGNED
484 SSE2_YUV_MUL
485 SSE2_YUV_ADD
486 SSE2_UNPACK_16_UNALIGNED
488 p_y += 16;
489 p_u += 8;
490 p_v += 8;
492 SCALE_WIDTH;
493 SCALE_HEIGHT( 420, 2 );
495 p_y += i_source_margin;
496 if( i_y % 2 )
498 p_u += i_source_margin_c;
499 p_v += i_source_margin_c;
501 p_buffer = b_hscale ? p_buffer_start : p_pic;
505 /* make sure all SSE2 stores are visible thereafter */
506 SSE2_END;
508 #else /* SSE2 */
510 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
512 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
514 p_pic_start = p_pic;
515 p_buffer = b_hscale ? p_buffer_start : p_pic;
517 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
519 MMX_CALL (
520 MMX_INIT_16
521 MMX_YUV_MUL
522 MMX_YUV_ADD
523 MMX_UNPACK_16
525 p_y += 8;
526 p_u += 4;
527 p_v += 4;
528 p_buffer += 8;
531 /* Here we do some unaligned reads and duplicate conversions, but
532 * at least we have all the pixels */
533 if( i_rewind )
535 p_y -= i_rewind;
536 p_u -= i_rewind >> 1;
537 p_v -= i_rewind >> 1;
538 p_buffer -= i_rewind;
540 MMX_CALL (
541 MMX_INIT_16
542 MMX_YUV_MUL
543 MMX_YUV_ADD
544 MMX_UNPACK_16
546 p_y += 8;
547 p_u += 4;
548 p_v += 4;
549 p_buffer += 8;
551 SCALE_WIDTH;
552 SCALE_HEIGHT( 420, 2 );
554 p_y += i_source_margin;
555 if( i_y % 2 )
557 p_u += i_source_margin_c;
558 p_v += i_source_margin_c;
561 /* re-enable FPU registers */
562 MMX_END;
564 #endif /* SSE2 */
567 VLC_TARGET
568 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
569 picture_t *p_dest )
571 /* We got this one from the old arguments */
572 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
573 uint8_t *p_y = p_src->Y_PIXELS;
574 uint8_t *p_u = p_src->U_PIXELS;
575 uint8_t *p_v = p_src->V_PIXELS;
577 bool b_hscale; /* horizontal scaling type */
578 unsigned int i_vscale; /* vertical scaling type */
579 unsigned int i_x, i_y; /* horizontal and vertical indexes */
581 int i_right_margin;
582 int i_rewind;
583 int i_scale_count; /* scale modulo counter */
584 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
585 uint32_t * p_pic_start; /* beginning of the current line for copy */
586 /* Conversion buffer pointer */
587 uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
588 uint32_t * p_buffer;
590 /* Offset array pointer */
591 int * p_offset_start = p_filter->p_sys->p_offset;
592 int * p_offset;
594 const int i_source_margin = p_src->p[0].i_pitch
595 - p_src->p[0].i_visible_pitch
596 - p_filter->fmt_in.video.i_x_offset;
597 const int i_source_margin_c = p_src->p[1].i_pitch
598 - p_src->p[1].i_visible_pitch
599 - ( p_filter->fmt_in.video.i_x_offset / 2 );
601 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
603 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
604 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
605 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
606 SetOffset( p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width,
607 p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height,
608 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
609 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
610 &b_hscale, &i_vscale, p_offset_start );
613 * Perform conversion
615 i_scale_count = ( i_vscale == 1 ) ?
616 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
617 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
619 #ifdef SSE2
621 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
624 ** SSE2 128 bits fetch/store instructions are faster
625 ** if memory access is 16 bytes aligned
628 p_buffer = b_hscale ? p_buffer_start : p_pic;
629 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
630 p_dest->p->i_pitch|
631 ((intptr_t)p_y)|
632 ((intptr_t)p_buffer))) )
634 /* use faster SSE2 aligned fetch and store */
635 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
637 p_pic_start = p_pic;
639 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
641 SSE2_CALL (
642 SSE2_INIT_32_ALIGNED
643 SSE2_YUV_MUL
644 SSE2_YUV_ADD
645 SSE2_UNPACK_32_ARGB_ALIGNED
647 p_y += 16;
648 p_u += 8;
649 p_v += 8;
650 p_buffer += 16;
653 /* Here we do some unaligned reads and duplicate conversions, but
654 * at least we have all the pixels */
655 if( i_rewind )
657 p_y -= i_rewind;
658 p_u -= i_rewind >> 1;
659 p_v -= i_rewind >> 1;
660 p_buffer -= i_rewind;
661 SSE2_CALL (
662 SSE2_INIT_32_UNALIGNED
663 SSE2_YUV_MUL
664 SSE2_YUV_ADD
665 SSE2_UNPACK_32_ARGB_UNALIGNED
667 p_y += 16;
668 p_u += 4;
669 p_v += 4;
671 SCALE_WIDTH;
672 SCALE_HEIGHT( 420, 4 );
674 p_y += i_source_margin;
675 if( i_y % 2 )
677 p_u += i_source_margin_c;
678 p_v += i_source_margin_c;
680 p_buffer = b_hscale ? p_buffer_start : p_pic;
683 else
685 /* use slower SSE2 unaligned fetch and store */
686 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
688 p_pic_start = p_pic;
689 p_buffer = b_hscale ? p_buffer_start : p_pic;
691 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
693 SSE2_CALL (
694 SSE2_INIT_32_UNALIGNED
695 SSE2_YUV_MUL
696 SSE2_YUV_ADD
697 SSE2_UNPACK_32_ARGB_UNALIGNED
699 p_y += 16;
700 p_u += 8;
701 p_v += 8;
702 p_buffer += 16;
705 /* Here we do some unaligned reads and duplicate conversions, but
706 * at least we have all the pixels */
707 if( i_rewind )
709 p_y -= i_rewind;
710 p_u -= i_rewind >> 1;
711 p_v -= i_rewind >> 1;
712 p_buffer -= i_rewind;
713 SSE2_CALL (
714 SSE2_INIT_32_UNALIGNED
715 SSE2_YUV_MUL
716 SSE2_YUV_ADD
717 SSE2_UNPACK_32_ARGB_UNALIGNED
719 p_y += 16;
720 p_u += 8;
721 p_v += 8;
723 SCALE_WIDTH;
724 SCALE_HEIGHT( 420, 4 );
726 p_y += i_source_margin;
727 if( i_y % 2 )
729 p_u += i_source_margin_c;
730 p_v += i_source_margin_c;
732 p_buffer = b_hscale ? p_buffer_start : p_pic;
736 /* make sure all SSE2 stores are visible thereafter */
737 SSE2_END;
739 #else
741 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
743 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
745 p_pic_start = p_pic;
746 p_buffer = b_hscale ? p_buffer_start : p_pic;
748 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
750 MMX_CALL (
751 MMX_INIT_32
752 MMX_YUV_MUL
753 MMX_YUV_ADD
754 MMX_UNPACK_32_ARGB
756 p_y += 8;
757 p_u += 4;
758 p_v += 4;
759 p_buffer += 8;
762 /* Here we do some unaligned reads and duplicate conversions, but
763 * at least we have all the pixels */
764 if( i_rewind )
766 p_y -= i_rewind;
767 p_u -= i_rewind >> 1;
768 p_v -= i_rewind >> 1;
769 p_buffer -= i_rewind;
770 MMX_CALL (
771 MMX_INIT_32
772 MMX_YUV_MUL
773 MMX_YUV_ADD
774 MMX_UNPACK_32_ARGB
776 p_y += 8;
777 p_u += 4;
778 p_v += 4;
779 p_buffer += 8;
781 SCALE_WIDTH;
782 SCALE_HEIGHT( 420, 4 );
784 p_y += i_source_margin;
785 if( i_y % 2 )
787 p_u += i_source_margin_c;
788 p_v += i_source_margin_c;
792 /* re-enable FPU registers */
793 MMX_END;
795 #endif
798 VLC_TARGET
799 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
801 /* We got this one from the old arguments */
802 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
803 uint8_t *p_y = p_src->Y_PIXELS;
804 uint8_t *p_u = p_src->U_PIXELS;
805 uint8_t *p_v = p_src->V_PIXELS;
807 bool b_hscale; /* horizontal scaling type */
808 unsigned int i_vscale; /* vertical scaling type */
809 unsigned int i_x, i_y; /* horizontal and vertical indexes */
811 int i_right_margin;
812 int i_rewind;
813 int i_scale_count; /* scale modulo counter */
814 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
815 uint32_t * p_pic_start; /* beginning of the current line for copy */
816 /* Conversion buffer pointer */
817 uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
818 uint32_t * p_buffer;
820 /* Offset array pointer */
821 int * p_offset_start = p_filter->p_sys->p_offset;
822 int * p_offset;
824 const int i_source_margin = p_src->p[0].i_pitch
825 - p_src->p[0].i_visible_pitch
826 - p_filter->fmt_in.video.i_x_offset;
827 const int i_source_margin_c = p_src->p[1].i_pitch
828 - p_src->p[1].i_visible_pitch
829 - ( p_filter->fmt_in.video.i_x_offset / 2 );
831 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
833 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
834 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
835 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
836 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
837 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
838 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
839 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
840 &b_hscale, &i_vscale, p_offset_start );
843 * Perform conversion
845 i_scale_count = ( i_vscale == 1 ) ?
846 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
847 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
849 #ifdef SSE2
851 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
854 ** SSE2 128 bits fetch/store instructions are faster
855 ** if memory access is 16 bytes aligned
858 p_buffer = b_hscale ? p_buffer_start : p_pic;
859 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
860 p_dest->p->i_pitch|
861 ((intptr_t)p_y)|
862 ((intptr_t)p_buffer))) )
864 /* use faster SSE2 aligned fetch and store */
865 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
867 p_pic_start = p_pic;
869 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
871 SSE2_CALL (
872 SSE2_INIT_32_ALIGNED
873 SSE2_YUV_MUL
874 SSE2_YUV_ADD
875 SSE2_UNPACK_32_RGBA_ALIGNED
877 p_y += 16;
878 p_u += 8;
879 p_v += 8;
880 p_buffer += 16;
883 /* Here we do some unaligned reads and duplicate conversions, but
884 * at least we have all the pixels */
885 if( i_rewind )
887 p_y -= i_rewind;
888 p_u -= i_rewind >> 1;
889 p_v -= i_rewind >> 1;
890 p_buffer -= i_rewind;
891 SSE2_CALL (
892 SSE2_INIT_32_UNALIGNED
893 SSE2_YUV_MUL
894 SSE2_YUV_ADD
895 SSE2_UNPACK_32_RGBA_UNALIGNED
897 p_y += 16;
898 p_u += 4;
899 p_v += 4;
901 SCALE_WIDTH;
902 SCALE_HEIGHT( 420, 4 );
904 p_y += i_source_margin;
905 if( i_y % 2 )
907 p_u += i_source_margin_c;
908 p_v += i_source_margin_c;
910 p_buffer = b_hscale ? p_buffer_start : p_pic;
913 else
915 /* use slower SSE2 unaligned fetch and store */
916 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
918 p_pic_start = p_pic;
919 p_buffer = b_hscale ? p_buffer_start : p_pic;
921 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
923 SSE2_CALL (
924 SSE2_INIT_32_UNALIGNED
925 SSE2_YUV_MUL
926 SSE2_YUV_ADD
927 SSE2_UNPACK_32_RGBA_UNALIGNED
929 p_y += 16;
930 p_u += 8;
931 p_v += 8;
932 p_buffer += 16;
935 /* Here we do some unaligned reads and duplicate conversions, but
936 * at least we have all the pixels */
937 if( i_rewind )
939 p_y -= i_rewind;
940 p_u -= i_rewind >> 1;
941 p_v -= i_rewind >> 1;
942 p_buffer -= i_rewind;
943 SSE2_CALL (
944 SSE2_INIT_32_UNALIGNED
945 SSE2_YUV_MUL
946 SSE2_YUV_ADD
947 SSE2_UNPACK_32_RGBA_UNALIGNED
949 p_y += 16;
950 p_u += 8;
951 p_v += 8;
953 SCALE_WIDTH;
954 SCALE_HEIGHT( 420, 4 );
956 p_y += i_source_margin;
957 if( i_y % 2 )
959 p_u += i_source_margin_c;
960 p_v += i_source_margin_c;
962 p_buffer = b_hscale ? p_buffer_start : p_pic;
966 /* make sure all SSE2 stores are visible thereafter */
967 SSE2_END;
969 #else
971 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
973 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
975 p_pic_start = p_pic;
976 p_buffer = b_hscale ? p_buffer_start : p_pic;
978 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
980 MMX_CALL (
981 MMX_INIT_32
982 MMX_YUV_MUL
983 MMX_YUV_ADD
984 MMX_UNPACK_32_RGBA
986 p_y += 8;
987 p_u += 4;
988 p_v += 4;
989 p_buffer += 8;
992 /* Here we do some unaligned reads and duplicate conversions, but
993 * at least we have all the pixels */
994 if( i_rewind )
996 p_y -= i_rewind;
997 p_u -= i_rewind >> 1;
998 p_v -= i_rewind >> 1;
999 p_buffer -= i_rewind;
1000 MMX_CALL (
1001 MMX_INIT_32
1002 MMX_YUV_MUL
1003 MMX_YUV_ADD
1004 MMX_UNPACK_32_RGBA
1006 p_y += 8;
1007 p_u += 4;
1008 p_v += 4;
1009 p_buffer += 8;
1011 SCALE_WIDTH;
1012 SCALE_HEIGHT( 420, 4 );
1014 p_y += i_source_margin;
1015 if( i_y % 2 )
1017 p_u += i_source_margin_c;
1018 p_v += i_source_margin_c;
1022 /* re-enable FPU registers */
1023 MMX_END;
1025 #endif
1028 VLC_TARGET
1029 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1031 /* We got this one from the old arguments */
1032 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1033 uint8_t *p_y = p_src->Y_PIXELS;
1034 uint8_t *p_u = p_src->U_PIXELS;
1035 uint8_t *p_v = p_src->V_PIXELS;
1037 bool b_hscale; /* horizontal scaling type */
1038 unsigned int i_vscale; /* vertical scaling type */
1039 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1041 int i_right_margin;
1042 int i_rewind;
1043 int i_scale_count; /* scale modulo counter */
1044 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
1045 uint32_t * p_pic_start; /* beginning of the current line for copy */
1046 /* Conversion buffer pointer */
1047 uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1048 uint32_t * p_buffer;
1050 /* Offset array pointer */
1051 int * p_offset_start = p_filter->p_sys->p_offset;
1052 int * p_offset;
1054 const int i_source_margin = p_src->p[0].i_pitch
1055 - p_src->p[0].i_visible_pitch
1056 - p_filter->fmt_in.video.i_x_offset;
1057 const int i_source_margin_c = p_src->p[1].i_pitch
1058 - p_src->p[1].i_visible_pitch
1059 - ( p_filter->fmt_in.video.i_x_offset / 2 );
1061 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1063 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1064 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1065 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1066 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
1067 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
1068 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
1069 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
1070 &b_hscale, &i_vscale, p_offset_start );
1073 * Perform conversion
1075 i_scale_count = ( i_vscale == 1 ) ?
1076 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
1077 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
1079 #ifdef SSE2
1081 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
1084 ** SSE2 128 bits fetch/store instructions are faster
1085 ** if memory access is 16 bytes aligned
1088 p_buffer = b_hscale ? p_buffer_start : p_pic;
1089 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1090 p_dest->p->i_pitch|
1091 ((intptr_t)p_y)|
1092 ((intptr_t)p_buffer))) )
1094 /* use faster SSE2 aligned fetch and store */
1095 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1097 p_pic_start = p_pic;
1099 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1101 SSE2_CALL (
1102 SSE2_INIT_32_ALIGNED
1103 SSE2_YUV_MUL
1104 SSE2_YUV_ADD
1105 SSE2_UNPACK_32_BGRA_ALIGNED
1107 p_y += 16;
1108 p_u += 8;
1109 p_v += 8;
1110 p_buffer += 16;
1113 /* Here we do some unaligned reads and duplicate conversions, but
1114 * at least we have all the pixels */
1115 if( i_rewind )
1117 p_y -= i_rewind;
1118 p_u -= i_rewind >> 1;
1119 p_v -= i_rewind >> 1;
1120 p_buffer -= i_rewind;
1121 SSE2_CALL (
1122 SSE2_INIT_32_UNALIGNED
1123 SSE2_YUV_MUL
1124 SSE2_YUV_ADD
1125 SSE2_UNPACK_32_BGRA_UNALIGNED
1127 p_y += 16;
1128 p_u += 4;
1129 p_v += 4;
1131 SCALE_WIDTH;
1132 SCALE_HEIGHT( 420, 4 );
1134 p_y += i_source_margin;
1135 if( i_y % 2 )
1137 p_u += i_source_margin_c;
1138 p_v += i_source_margin_c;
1140 p_buffer = b_hscale ? p_buffer_start : p_pic;
1143 else
1145 /* use slower SSE2 unaligned fetch and store */
1146 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1148 p_pic_start = p_pic;
1149 p_buffer = b_hscale ? p_buffer_start : p_pic;
1151 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1153 SSE2_CALL (
1154 SSE2_INIT_32_UNALIGNED
1155 SSE2_YUV_MUL
1156 SSE2_YUV_ADD
1157 SSE2_UNPACK_32_BGRA_UNALIGNED
1159 p_y += 16;
1160 p_u += 8;
1161 p_v += 8;
1162 p_buffer += 16;
1165 /* Here we do some unaligned reads and duplicate conversions, but
1166 * at least we have all the pixels */
1167 if( i_rewind )
1169 p_y -= i_rewind;
1170 p_u -= i_rewind >> 1;
1171 p_v -= i_rewind >> 1;
1172 p_buffer -= i_rewind;
1173 SSE2_CALL (
1174 SSE2_INIT_32_UNALIGNED
1175 SSE2_YUV_MUL
1176 SSE2_YUV_ADD
1177 SSE2_UNPACK_32_BGRA_UNALIGNED
1179 p_y += 16;
1180 p_u += 8;
1181 p_v += 8;
1183 SCALE_WIDTH;
1184 SCALE_HEIGHT( 420, 4 );
1186 p_y += i_source_margin;
1187 if( i_y % 2 )
1189 p_u += i_source_margin_c;
1190 p_v += i_source_margin_c;
1192 p_buffer = b_hscale ? p_buffer_start : p_pic;
1196 #else
1198 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1200 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1202 p_pic_start = p_pic;
1203 p_buffer = b_hscale ? p_buffer_start : p_pic;
1205 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1207 MMX_CALL (
1208 MMX_INIT_32
1209 MMX_YUV_MUL
1210 MMX_YUV_ADD
1211 MMX_UNPACK_32_BGRA
1213 p_y += 8;
1214 p_u += 4;
1215 p_v += 4;
1216 p_buffer += 8;
1219 /* Here we do some unaligned reads and duplicate conversions, but
1220 * at least we have all the pixels */
1221 if( i_rewind )
1223 p_y -= i_rewind;
1224 p_u -= i_rewind >> 1;
1225 p_v -= i_rewind >> 1;
1226 p_buffer -= i_rewind;
1227 MMX_CALL (
1228 MMX_INIT_32
1229 MMX_YUV_MUL
1230 MMX_YUV_ADD
1231 MMX_UNPACK_32_BGRA
1233 p_y += 8;
1234 p_u += 4;
1235 p_v += 4;
1236 p_buffer += 8;
1238 SCALE_WIDTH;
1239 SCALE_HEIGHT( 420, 4 );
1241 p_y += i_source_margin;
1242 if( i_y % 2 )
1244 p_u += i_source_margin_c;
1245 p_v += i_source_margin_c;
1249 /* re-enable FPU registers */
1250 MMX_END;
1252 #endif
1255 VLC_TARGET
1256 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1258 /* We got this one from the old arguments */
1259 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1260 uint8_t *p_y = p_src->Y_PIXELS;
1261 uint8_t *p_u = p_src->U_PIXELS;
1262 uint8_t *p_v = p_src->V_PIXELS;
1264 bool b_hscale; /* horizontal scaling type */
1265 unsigned int i_vscale; /* vertical scaling type */
1266 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1268 int i_right_margin;
1269 int i_rewind;
1270 int i_scale_count; /* scale modulo counter */
1271 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
1272 uint32_t * p_pic_start; /* beginning of the current line for copy */
1273 /* Conversion buffer pointer */
1274 uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1275 uint32_t * p_buffer;
1277 /* Offset array pointer */
1278 int * p_offset_start = p_filter->p_sys->p_offset;
1279 int * p_offset;
1281 const int i_source_margin = p_src->p[0].i_pitch
1282 - p_src->p[0].i_visible_pitch
1283 - p_filter->fmt_in.video.i_x_offset;
1284 const int i_source_margin_c = p_src->p[1].i_pitch
1285 - p_src->p[1].i_visible_pitch
1286 - ( p_filter->fmt_in.video.i_x_offset / 2 );
1288 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1290 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1291 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1292 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1293 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
1294 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
1295 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
1296 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
1297 &b_hscale, &i_vscale, p_offset_start );
1300 * Perform conversion
1302 i_scale_count = ( i_vscale == 1 ) ?
1303 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
1304 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
1306 #ifdef SSE2
1308 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
1311 ** SSE2 128 bits fetch/store instructions are faster
1312 ** if memory access is 16 bytes aligned
1315 p_buffer = b_hscale ? p_buffer_start : p_pic;
1316 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1317 p_dest->p->i_pitch|
1318 ((intptr_t)p_y)|
1319 ((intptr_t)p_buffer))) )
1321 /* use faster SSE2 aligned fetch and store */
1322 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1324 p_pic_start = p_pic;
1326 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1328 SSE2_CALL (
1329 SSE2_INIT_32_ALIGNED
1330 SSE2_YUV_MUL
1331 SSE2_YUV_ADD
1332 SSE2_UNPACK_32_ABGR_ALIGNED
1334 p_y += 16;
1335 p_u += 8;
1336 p_v += 8;
1337 p_buffer += 16;
1340 /* Here we do some unaligned reads and duplicate conversions, but
1341 * at least we have all the pixels */
1342 if( i_rewind )
1344 p_y -= i_rewind;
1345 p_u -= i_rewind >> 1;
1346 p_v -= i_rewind >> 1;
1347 p_buffer -= i_rewind;
1348 SSE2_CALL (
1349 SSE2_INIT_32_UNALIGNED
1350 SSE2_YUV_MUL
1351 SSE2_YUV_ADD
1352 SSE2_UNPACK_32_ABGR_UNALIGNED
1354 p_y += 16;
1355 p_u += 4;
1356 p_v += 4;
1358 SCALE_WIDTH;
1359 SCALE_HEIGHT( 420, 4 );
1361 p_y += i_source_margin;
1362 if( i_y % 2 )
1364 p_u += i_source_margin_c;
1365 p_v += i_source_margin_c;
1367 p_buffer = b_hscale ? p_buffer_start : p_pic;
1370 else
1372 /* use slower SSE2 unaligned fetch and store */
1373 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1375 p_pic_start = p_pic;
1376 p_buffer = b_hscale ? p_buffer_start : p_pic;
1378 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1380 SSE2_CALL (
1381 SSE2_INIT_32_UNALIGNED
1382 SSE2_YUV_MUL
1383 SSE2_YUV_ADD
1384 SSE2_UNPACK_32_ABGR_UNALIGNED
1386 p_y += 16;
1387 p_u += 8;
1388 p_v += 8;
1389 p_buffer += 16;
1392 /* Here we do some unaligned reads and duplicate conversions, but
1393 * at least we have all the pixels */
1394 if( i_rewind )
1396 p_y -= i_rewind;
1397 p_u -= i_rewind >> 1;
1398 p_v -= i_rewind >> 1;
1399 p_buffer -= i_rewind;
1400 SSE2_CALL (
1401 SSE2_INIT_32_UNALIGNED
1402 SSE2_YUV_MUL
1403 SSE2_YUV_ADD
1404 SSE2_UNPACK_32_ABGR_UNALIGNED
1406 p_y += 16;
1407 p_u += 8;
1408 p_v += 8;
1410 SCALE_WIDTH;
1411 SCALE_HEIGHT( 420, 4 );
1413 p_y += i_source_margin;
1414 if( i_y % 2 )
1416 p_u += i_source_margin_c;
1417 p_v += i_source_margin_c;
1419 p_buffer = b_hscale ? p_buffer_start : p_pic;
1423 #else
1425 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1427 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1429 p_pic_start = p_pic;
1430 p_buffer = b_hscale ? p_buffer_start : p_pic;
1432 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1434 MMX_CALL (
1435 MMX_INIT_32
1436 MMX_YUV_MUL
1437 MMX_YUV_ADD
1438 MMX_UNPACK_32_ABGR
1440 p_y += 8;
1441 p_u += 4;
1442 p_v += 4;
1443 p_buffer += 8;
1446 /* Here we do some unaligned reads and duplicate conversions, but
1447 * at least we have all the pixels */
1448 if( i_rewind )
1450 p_y -= i_rewind;
1451 p_u -= i_rewind >> 1;
1452 p_v -= i_rewind >> 1;
1453 p_buffer -= i_rewind;
1454 MMX_CALL (
1455 MMX_INIT_32
1456 MMX_YUV_MUL
1457 MMX_YUV_ADD
1458 MMX_UNPACK_32_ABGR
1460 p_y += 8;
1461 p_u += 4;
1462 p_v += 4;
1463 p_buffer += 8;
1465 SCALE_WIDTH;
1466 SCALE_HEIGHT( 420, 4 );
1468 p_y += i_source_margin;
1469 if( i_y % 2 )
1471 p_u += i_source_margin_c;
1472 p_v += i_source_margin_c;
1476 /* re-enable FPU registers */
1477 MMX_END;
1479 #endif