Bug 1857841 - pt 3. Add a new page kind named "fresh" r=glandium
[gecko.git] / gfx / ycbcr / yuv_row_win64.cpp
blob17b542449b122608ef0eb520d94708484dddf1b1
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
5 #include "yuv_row.h"
7 extern "C" {
// The x64 compiler doesn't support MMX or inline assembler, so this file uses
// SSE2 intrinsics instead.
// Byte-addressed views of the U and V coefficient rows, located 2048 and 4096
// bytes past the start of kCoefficientsRgbY. Every lookup in this file fetches
// an 8-byte row at (base + 8 * sample) with an 8-bit sample, so a 256-row
// section spans 2048 bytes.
// NOTE(review): the layout of kCoefficientsRgbY is declared outside this file;
// confirm the Y, U and V sections really are contiguous in that order.
#define kCoefficientsRgbU (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 2048)
#define kCoefficientsRgbV (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 4096)
14 #include <emmintrin.h>
16 static void FastConvertYUVToRGB32Row_SSE2(const uint8_t* y_buf,
17 const uint8_t* u_buf,
18 const uint8_t* v_buf,
19 uint8_t* rgb_buf,
20 int width) {
21 __m128i xmm0, xmmY1, xmmY2;
22 __m128 xmmY;
24 while (width >= 2) {
25 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
26 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
28 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++));
29 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
31 xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++));
32 xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
34 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
35 0x44);
36 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
37 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
39 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
40 rgb_buf += 8;
41 width -= 2;
44 if (width) {
45 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
46 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
47 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf));
48 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
49 xmmY1 = _mm_srai_epi16(xmmY1, 6);
50 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
51 *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
55 static void ScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,
56 const uint8_t* u_buf,
57 const uint8_t* v_buf,
58 uint8_t* rgb_buf,
59 int width,
60 int source_dx) {
61 __m128i xmm0, xmmY1, xmmY2;
62 __m128 xmmY;
63 uint8_t u, v, y;
64 int x = 0;
66 while (width >= 2) {
67 u = u_buf[x >> 17];
68 v = v_buf[x >> 17];
69 y = y_buf[x >> 16];
70 x += source_dx;
72 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
73 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
74 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
75 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
77 y = y_buf[x >> 16];
78 x += source_dx;
80 xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
81 xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
83 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
84 0x44);
85 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
86 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
88 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
89 rgb_buf += 8;
90 width -= 2;
93 if (width) {
94 u = u_buf[x >> 17];
95 v = v_buf[x >> 17];
96 y = y_buf[x >> 16];
98 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
99 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
100 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
101 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
102 xmmY1 = _mm_srai_epi16(xmmY1, 6);
103 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
104 *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
108 static void LinearScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,
109 const uint8_t* u_buf,
110 const uint8_t* v_buf,
111 uint8_t* rgb_buf,
112 int width,
113 int source_dx) {
114 __m128i xmm0, xmmY1, xmmY2;
115 __m128 xmmY;
116 uint8_t u0, u1, v0, v1, y0, y1;
117 uint32_t uv_frac, y_frac, u, v, y;
118 int x = 0;
120 if (source_dx >= 0x20000) {
121 x = 32768;
124 while(width >= 2) {
125 u0 = u_buf[x >> 17];
126 u1 = u_buf[(x >> 17) + 1];
127 v0 = v_buf[x >> 17];
128 v1 = v_buf[(x >> 17) + 1];
129 y0 = y_buf[x >> 16];
130 y1 = y_buf[(x >> 16) + 1];
131 uv_frac = (x & 0x1fffe);
132 y_frac = (x & 0xffff);
133 u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
134 v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
135 y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
136 x += source_dx;
138 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
139 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
140 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
141 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
143 y0 = y_buf[x >> 16];
144 y1 = y_buf[(x >> 16) + 1];
145 y_frac = (x & 0xffff);
146 y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
147 x += source_dx;
149 xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
150 xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
152 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
153 0x44);
154 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
155 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
157 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
158 rgb_buf += 8;
159 width -= 2;
162 if (width) {
163 u = u_buf[x >> 17];
164 v = v_buf[x >> 17];
165 y = y_buf[x >> 16];
167 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
168 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
169 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
171 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
172 xmmY1 = _mm_srai_epi16(xmmY1, 6);
173 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
174 *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
178 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
179 const uint8_t* u_buf,
180 const uint8_t* v_buf,
181 uint8_t* rgb_buf,
182 int width) {
183 FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
186 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
187 const uint8_t* u_buf,
188 const uint8_t* v_buf,
189 uint8_t* rgb_buf,
190 int width,
191 int source_dx) {
192 ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
195 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
196 const uint8_t* u_buf,
197 const uint8_t* v_buf,
198 uint8_t* rgb_buf,
199 int width,
200 int source_dx) {
201 LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
202 source_dx);
205 } // extern "C"