1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
9 // x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics.
// Byte offsets of the U and V coefficient tables, expressed relative to the
// start of kCoefficientsRgbY (declared outside this listing). The lookups in
// this file index each table with 8 bytes per entry and an 8-bit sample, so
// 2048 bytes corresponds to 256 entries: U begins after 256 Y-sized entries
// and V begins after a further 256 entries.
11 #define kCoefficientsRgbU (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 2048)
12 #define kCoefficientsRgbV (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 4096)
14 #include <emmintrin.h>
// Converts Y/U/V samples to 32-bit output pixels with SSE2 intrinsics, using
// the kCoefficientsRgbY/U/V lookup tables (8 bytes per table entry).
// NOTE(review): this listing is fragmentary - the remaining parameters, the
// loop/branch headers, the pointer/width updates and the _mm_shuffle_ps
// immediate are not visible here; confirm against the full file.
16 static void FastConvertYUVToRGB32Row_SSE2(const uint8_t* y_buf
,
21 __m128i xmm0
, xmmY1
, xmmY2
;
// First visible section: one U and one V sample are consumed (post-increment)
// and their table entries are summed with signed 16-bit saturation. The sum in
// xmm0 is then shared by the two Y samples read below.
25 xmm0
= _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbU
+ 8 * *u_buf
++)),
26 _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbV
+ 8 * *v_buf
++)));
// First Y sample: load its 64-bit table entry and add the chroma term.
28 xmmY1
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY
) + 8 * *y_buf
++));
29 xmmY1
= _mm_adds_epi16(xmmY1
, xmm0
);
// Second Y sample: same treatment, reusing the same chroma term.
31 xmmY2
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY
) + 8 * *y_buf
++));
32 xmmY2
= _mm_adds_epi16(xmmY2
, xmm0
);
// Merge both results into one register via a float shuffle. The shuffle
// immediate is not visible here; presumably it selects the low 64 bits of
// each operand - TODO confirm.
34 xmmY
= _mm_shuffle_ps(_mm_castsi128_ps(xmmY1
), _mm_castsi128_ps(xmmY2
),
// Arithmetic right shift of each 16-bit lane by 6, then pack to unsigned
// 8-bit values with saturation.
36 xmmY1
= _mm_srai_epi16(_mm_castps_si128(xmmY
), 6);
37 xmmY1
= _mm_packus_epi16(xmmY1
, xmmY1
);
// Store the low 64 bits (8 bytes of output for the two Y samples).
39 _mm_storel_epi64(reinterpret_cast<__m128i
*>(rgb_buf
), xmmY1
);
// Second visible section: the same computation for a single Y sample, with no
// pointer increments, writing 4 bytes. Presumably the tail case for one
// remaining pixel; the guarding condition is not visible - TODO confirm.
45 xmm0
= _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbU
+ 8 * *u_buf
)),
46 _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbV
+ 8 * *v_buf
)));
47 xmmY1
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY
) + 8 * *y_buf
));
48 xmmY1
= _mm_adds_epi16(xmmY1
, xmm0
);
49 xmmY1
= _mm_srai_epi16(xmmY1
, 6);
50 xmmY1
= _mm_packus_epi16(xmmY1
, xmmY1
);
// Write the low 32 bits of the packed result as one output pixel.
51 *reinterpret_cast<uint32_t*>(rgb_buf
) = _mm_cvtsi128_si32(xmmY1
);
// Scaling variant of the YUV to 32-bit conversion: table lookups use the
// local values u, v and y instead of dereferencing the input pointers
// directly.
// NOTE(review): this listing is fragmentary - the remaining parameters, the
// code that derives u, v and y (presumably by stepping a source position by
// source_dx), the loop/branch headers and the _mm_shuffle_ps immediate are
// not visible here; confirm against the full file.
55 static void ScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf
,
61 __m128i xmm0
, xmmY1
, xmmY2
;
// Chroma term: sum of the U and V table entries (signed 16-bit saturating
// add), shared by the two Y lookups that follow.
72 xmm0
= _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbU
+ 8 * u
)),
73 _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbV
+ 8 * v
)));
// First luma lookup plus chroma term.
74 xmmY1
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY
) + 8 * y
));
75 xmmY1
= _mm_adds_epi16(xmmY1
, xmm0
);
// Second luma lookup; y is presumably re-assigned between the two lookups
// (that code is not visible) - TODO confirm.
80 xmmY2
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY
) + 8 * y
));
81 xmmY2
= _mm_adds_epi16(xmmY2
, xmm0
);
// Merge both results into one register; the shuffle immediate is not visible
// here.
83 xmmY
= _mm_shuffle_ps(_mm_castsi128_ps(xmmY1
), _mm_castsi128_ps(xmmY2
),
// Shift each 16-bit lane right by 6 (arithmetic), then pack to unsigned 8-bit
// values with saturation.
85 xmmY1
= _mm_srai_epi16(_mm_castps_si128(xmmY
), 6);
86 xmmY1
= _mm_packus_epi16(xmmY1
, xmmY1
);
// Store the low 64 bits (8 bytes of output).
88 _mm_storel_epi64(reinterpret_cast<__m128i
*>(rgb_buf
), xmmY1
);
// Single-sample section: same computation for one y value, writing 4 bytes.
// Presumably the tail case; the guarding condition is not visible.
98 xmm0
= _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbU
+ 8 * u
)),
99 _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbV
+ 8 * v
)));
100 xmmY1
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY
) + 8 * y
));
101 xmmY1
= _mm_adds_epi16(xmmY1
, xmm0
);
102 xmmY1
= _mm_srai_epi16(xmmY1
, 6);
103 xmmY1
= _mm_packus_epi16(xmmY1
, xmmY1
);
// Write the low 32 bits of the packed result as one output pixel.
104 *reinterpret_cast<uint32_t*>(rgb_buf
) = _mm_cvtsi128_si32(xmmY1
);
// Linearly interpolating variant of the scaled YUV to 32-bit conversion:
// neighbouring source samples are blended by the fractional part of the
// source position x before the coefficient-table lookups.
// NOTE(review): this listing is fragmentary - the remaining parameters, the
// initialisation and stepping of x, the loads of u0/v0/y0, the body of the
// source_dx branch, the loop headers and the _mm_shuffle_ps immediate are not
// visible here; confirm against the full file.
108 static void LinearScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf
,
109 const uint8_t* u_buf
,
110 const uint8_t* v_buf
,
114 __m128i xmm0
, xmmY1
, xmmY2
;
// u0/u1, v0/v1, y0/y1 hold the two neighbouring samples being blended.
116 uint8_t u0
, u1
, v0
, v1
, y0
, y1
;
// uv_frac / y_frac are the blend weights; u, v, y are the blended samples
// used as table indices.
117 uint32_t uv_frac
, y_frac
, u
, v
, y
;
// Special-cases source_dx >= 0x20000; the body of this branch is not visible
// in this listing.
120 if (source_dx
>= 0x20000) {
// Right-hand neighbours. Chroma is indexed with x >> 17 and luma with
// x >> 16, i.e. chroma advances at half the rate of luma.
126 u1
= u_buf
[(x
>> 17) + 1];
128 v1
= v_buf
[(x
>> 17) + 1];
130 y1
= y_buf
[(x
>> 16) + 1];
// Fractional parts of x used as weights: the bits below the chroma index
// (bit 0 masked off) and the low 16 bits for luma.
131 uv_frac
= (x
& 0x1fffe);
132 y_frac
= (x
& 0xffff);
// Weighted blends. (frac ^ mask) is the complement of frac within the mask,
// so the two weights sum to the mask value; the shift scales the result back
// down to sample range.
133 u
= (uv_frac
* u1
+ (uv_frac
^ 0x1fffe) * u0
) >> 17;
134 v
= (uv_frac
* v1
+ (uv_frac
^ 0x1fffe) * v0
) >> 17;
135 y
= (y_frac
* y1
+ (y_frac
^ 0xffff) * y0
) >> 16;
// Chroma term from the blended u and v (signed 16-bit saturating add),
// shared by the two luma lookups below.
138 xmm0
= _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbU
+ 8 * u
)),
139 _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbV
+ 8 * v
)));
140 xmmY1
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY
) + 8 * y
));
141 xmmY1
= _mm_adds_epi16(xmmY1
, xmm0
);
// Second luma sample: re-blend y only. Presumably x has been advanced and y0
// reloaded in lines not visible here - TODO confirm.
144 y1
= y_buf
[(x
>> 16) + 1];
145 y_frac
= (x
& 0xffff);
146 y
= (y_frac
* y1
+ (y_frac
^ 0xffff) * y0
) >> 16;
149 xmmY2
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY
) + 8 * y
));
150 xmmY2
= _mm_adds_epi16(xmmY2
, xmm0
);
// Merge both results into one register; the shuffle immediate is not visible
// here.
152 xmmY
= _mm_shuffle_ps(_mm_castsi128_ps(xmmY1
), _mm_castsi128_ps(xmmY2
),
// Shift each 16-bit lane right by 6 (arithmetic), then pack to unsigned 8-bit
// values with saturation.
154 xmmY1
= _mm_srai_epi16(_mm_castps_si128(xmmY
), 6);
155 xmmY1
= _mm_packus_epi16(xmmY1
, xmmY1
);
// Store the low 64 bits (8 bytes of output).
157 _mm_storel_epi64(reinterpret_cast<__m128i
*>(rgb_buf
), xmmY1
);
// Single-sample section: same lookup/add/shift/pack sequence for one y value,
// writing 4 bytes. Presumably the tail case; the guarding condition and the
// derivation of u, v and y for it are not visible.
167 xmm0
= _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbU
+ 8 * u
)),
168 _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(kCoefficientsRgbV
+ 8 * v
)));
169 xmmY1
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY
) + 8 * y
));
171 xmmY1
= _mm_adds_epi16(xmmY1
, xmm0
);
172 xmmY1
= _mm_srai_epi16(xmmY1
, 6);
173 xmmY1
= _mm_packus_epi16(xmmY1
, xmmY1
);
// Write the low 32 bits of the packed result as one output pixel.
174 *reinterpret_cast<uint32_t*>(rgb_buf
) = _mm_cvtsi128_si32(xmmY1
);
// Non-static entry point that forwards its arguments unchanged to
// FastConvertYUVToRGB32Row_SSE2.
// NOTE(review): the rgb_buf and width parameter declarations and the braces
// are not visible in this fragmentary listing.
178 void FastConvertYUVToRGB32Row(const uint8_t* y_buf
,
179 const uint8_t* u_buf
,
180 const uint8_t* v_buf
,
183 FastConvertYUVToRGB32Row_SSE2(y_buf
, u_buf
, v_buf
, rgb_buf
, width
);
// Non-static entry point that forwards its arguments unchanged to
// ScaleYUVToRGB32Row_SSE2.
// NOTE(review): the rgb_buf, width and source_dx parameter declarations and
// the braces are not visible in this fragmentary listing.
186 void ScaleYUVToRGB32Row(const uint8_t* y_buf
,
187 const uint8_t* u_buf
,
188 const uint8_t* v_buf
,
192 ScaleYUVToRGB32Row_SSE2(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);
195 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf
,
196 const uint8_t* u_buf
,
197 const uint8_t* v_buf
,
201 LinearScaleYUVToRGB32Row_SSE2(y_buf
, u_buf
, v_buf
, rgb_buf
, width
,