1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "cc/raster/texture_compressor_etc1_sse.h"
9 #include "base/compiler_specific.h"
10 #include "base/logging.h"
11 // Using this header for common functions such as Color handling
12 // and codeword table.
13 #include "cc/raster/texture_compressor_etc1.h"
// Converts the error of a sub-block's average colour into the cut-off that
// the ComputeLuminance table search uses to stop early.
//
// The ETC1 codeword table is sorted in ascending order, and the search tries
// to identify the table index that generates the minimum error. The minimum
// error found while iterating converges towards that value, so once the
// error is growing and exceeds this threshold it no longer makes sense to
// iterate further through the table.
//
// Returns avg_error plus half of avg_error (integer division, rounded down)
// plus a fixed slack of 384, evaluated in wrapping uint32_t arithmetic.
inline uint32_t SetETC1MaxError(uint32_t avg_error) {
  const uint32_t half = avg_error / 2;
  return avg_error + half + 384;
}
31 // This is used to store raw data.
33 // This is used to store 8 bit packed values.
35 // This is used to store 32 bit zero extended values into 4x4 arrays.
// Constant registers shared by the helpers in this file.
// Every bit cleared; used as the zero accumulator / lower clamp bound.
static const __m128i __sse_zero = _mm_setzero_si128();
// Each 32-bit lane holds 0x7FFFFFFF; used as the initial value of the
// running per-lane minimum in the table search.
static const __m128i __sse_max_int = _mm_set1_epi32(0x7FFFFFFF);
// Adds |x| and |y| and clamps the result to the 8-bit colour range.
//
// The arithmetic is done on signed 16-bit lanes (SSE2 has no 32-bit min/max).
// The upper bound is 0xFF replicated per 32-bit lane, so within every 32-bit
// lane the low 16-bit half is clamped to [0, 255] and the high half is forced
// to 0. For 32-bit lanes whose low halves sum within the int16 range (e.g. a
// colour value plus a sign-extended codeword) the result is
// clamp(x + y, 0, 255) in each 32-bit lane.
inline __m128i AddAndClamp(const __m128i x, const __m128i y) {
  const __m128i upper_bound = _mm_set1_epi32(0xFF);
  const __m128i lower_bound = _mm_setzero_si128();

  const __m128i sum = _mm_add_epi16(x, y);
  const __m128i capped = _mm_min_epi16(sum, upper_bound);
  return _mm_max_epi16(lower_bound, capped);
}
// Returns the lane-wise squared difference (x - y)^2.
//
// Both the subtraction and the multiplication operate on 16-bit lanes and
// keep only the low 16 bits of each product: _mm_mullo_epi16 (SSE2) is used
// in place of _mm_mullo_epi32 (SSE4). When every 32-bit lane of the inputs
// holds a value in [0, 255], the high halves stay zero and the low halves
// hold a square of at most 255 * 255 = 65025, so each 32-bit lane of the
// result is the exact squared difference.
inline __m128i GetColorErrorSSE(const __m128i x, const __m128i y) {
  const __m128i diff = _mm_sub_epi16(x, y);
  return _mm_mullo_epi16(diff, diff);
}
// Sums three per-channel error vectors lane by lane (32-bit wrapping adds).
// Callers pass the blue, green and red squared errors of four pixels and get
// back the combined error of each pixel in the matching 32-bit lane.
inline __m128i AddChannelError(const __m128i x,
                               const __m128i y,
                               const __m128i z) {
  const __m128i y_plus_z = _mm_add_epi32(y, z);
  return _mm_add_epi32(x, y_plus_z);
}
// Horizontal sum: returns the (wrapping) sum of the four 32-bit lanes of |x|.
inline uint32_t SumSSE(const __m128i x) {
  // 0x4E swaps the two 64-bit halves, so after this add lanes 0 and 1 hold
  // lane0 + lane2 and lane1 + lane3 respectively.
  const __m128i halves_swapped = _mm_shuffle_epi32(x, 0x4E);
  const __m128i pair_sums = _mm_add_epi32(x, halves_swapped);

  // 0xB1 swaps neighbouring lanes, so lane 0 now receives the full total.
  const __m128i neighbours_swapped = _mm_shuffle_epi32(pair_sums, 0xB1);
  const __m128i total = _mm_add_epi32(pair_sums, neighbours_swapped);

  return static_cast<uint32_t>(_mm_cvtsi128_si32(total));
}
70 inline uint32_t GetVerticalError(const __sse_data
* data
,
71 const __m128i
* blue_avg
,
72 const __m128i
* green_avg
,
73 const __m128i
* red_avg
,
75 __m128i error
= __sse_zero
;
77 for (int i
= 0; i
< 4; i
++) {
78 error
= _mm_add_epi32(error
, GetColorErrorSSE(data
->blue
[i
], blue_avg
[0]));
80 _mm_add_epi32(error
, GetColorErrorSSE(data
->green
[i
], green_avg
[0]));
81 error
= _mm_add_epi32(error
, GetColorErrorSSE(data
->red
[i
], red_avg
[0]));
84 error
= _mm_add_epi32(error
, _mm_shuffle_epi32(error
, 0x4E));
86 verror
[0] = _mm_cvtsi128_si32(error
);
87 verror
[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error
, 0xB1));
89 return verror
[0] + verror
[1];
92 inline uint32_t GetHorizontalError(const __sse_data
* data
,
93 const __m128i
* blue_avg
,
94 const __m128i
* green_avg
,
95 const __m128i
* red_avg
,
97 __m128i error
= __sse_zero
;
98 int first_index
, second_index
;
100 for (int i
= 0; i
< 2; i
++) {
102 second_index
= first_index
+ 1;
104 error
= _mm_add_epi32(
105 error
, GetColorErrorSSE(data
->blue
[first_index
], blue_avg
[i
]));
106 error
= _mm_add_epi32(
107 error
, GetColorErrorSSE(data
->blue
[second_index
], blue_avg
[i
]));
108 error
= _mm_add_epi32(
109 error
, GetColorErrorSSE(data
->green
[first_index
], green_avg
[i
]));
110 error
= _mm_add_epi32(
111 error
, GetColorErrorSSE(data
->green
[second_index
], green_avg
[i
]));
112 error
= _mm_add_epi32(error
,
113 GetColorErrorSSE(data
->red
[first_index
], red_avg
[i
]));
114 error
= _mm_add_epi32(
115 error
, GetColorErrorSSE(data
->red
[second_index
], red_avg
[i
]));
118 error
= _mm_add_epi32(error
, _mm_shuffle_epi32(error
, 0x4E));
120 verror
[0] = _mm_cvtsi128_si32(error
);
121 verror
[1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(error
, 0xB1));
123 return verror
[0] + verror
[1];
126 inline void GetAvgColors(const __sse_data
* data
,
128 bool* __sse_use_diff
) {
131 // TODO(radu.velea): _mm_avg_epu8 on packed data maybe.
133 // Compute avg red value.
135 sum
[0] = _mm_add_epi32(data
->red
[0], data
->red
[1]);
136 sum
[0] = _mm_add_epi32(sum
[0], _mm_shuffle_epi32(sum
[0], 0xB1));
139 sum
[1] = _mm_add_epi32(data
->red
[2], data
->red
[3]);
140 sum
[1] = _mm_add_epi32(sum
[1], _mm_shuffle_epi32(sum
[1], 0xB1));
142 float hred
[2], vred
[2];
143 hred
[0] = (_mm_cvtsi128_si32(
144 _mm_add_epi32(sum
[0], _mm_shuffle_epi32(sum
[0], 0x4E)))) /
146 hred
[1] = (_mm_cvtsi128_si32(
147 _mm_add_epi32(sum
[1], _mm_shuffle_epi32(sum
[1], 0x4E)))) /
150 tmp
= _mm_add_epi32(sum
[0], sum
[1]);
151 vred
[0] = (_mm_cvtsi128_si32(tmp
)) / 8.0f
;
152 vred
[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp
, 0x2))) / 8.0f
;
154 // Compute avg green value.
156 sum
[0] = _mm_add_epi32(data
->green
[0], data
->green
[1]);
157 sum
[0] = _mm_add_epi32(sum
[0], _mm_shuffle_epi32(sum
[0], 0xB1));
160 sum
[1] = _mm_add_epi32(data
->green
[2], data
->green
[3]);
161 sum
[1] = _mm_add_epi32(sum
[1], _mm_shuffle_epi32(sum
[1], 0xB1));
163 float hgreen
[2], vgreen
[2];
164 hgreen
[0] = (_mm_cvtsi128_si32(
165 _mm_add_epi32(sum
[0], _mm_shuffle_epi32(sum
[0], 0x4E)))) /
167 hgreen
[1] = (_mm_cvtsi128_si32(
168 _mm_add_epi32(sum
[1], _mm_shuffle_epi32(sum
[1], 0x4E)))) /
171 tmp
= _mm_add_epi32(sum
[0], sum
[1]);
172 vgreen
[0] = (_mm_cvtsi128_si32(tmp
)) / 8.0f
;
173 vgreen
[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp
, 0x2))) / 8.0f
;
175 // Compute avg blue value.
177 sum
[0] = _mm_add_epi32(data
->blue
[0], data
->blue
[1]);
178 sum
[0] = _mm_add_epi32(sum
[0], _mm_shuffle_epi32(sum
[0], 0xB1));
181 sum
[1] = _mm_add_epi32(data
->blue
[2], data
->blue
[3]);
182 sum
[1] = _mm_add_epi32(sum
[1], _mm_shuffle_epi32(sum
[1], 0xB1));
184 float hblue
[2], vblue
[2];
185 hblue
[0] = (_mm_cvtsi128_si32(
186 _mm_add_epi32(sum
[0], _mm_shuffle_epi32(sum
[0], 0x4E)))) /
188 hblue
[1] = (_mm_cvtsi128_si32(
189 _mm_add_epi32(sum
[1], _mm_shuffle_epi32(sum
[1], 0x4E)))) /
192 tmp
= _mm_add_epi32(sum
[0], sum
[1]);
193 vblue
[0] = (_mm_cvtsi128_si32(tmp
)) / 8.0f
;
194 vblue
[1] = (_mm_cvtsi128_si32(_mm_shuffle_epi32(tmp
, 0x2))) / 8.0f
;
196 // TODO(radu.velea): Return int's instead of floats, based on Quality.
197 output
[0] = vblue
[0];
198 output
[1] = vgreen
[0];
201 output
[3] = vblue
[1];
202 output
[4] = vgreen
[1];
205 output
[6] = hblue
[0];
206 output
[7] = hgreen
[0];
209 output
[9] = hblue
[1];
210 output
[10] = hgreen
[1];
211 output
[11] = hred
[1];
213 __m128i threshold_upper
= _mm_set1_epi32(3);
214 __m128i threshold_lower
= _mm_set1_epi32(-4);
216 __m128 factor_v
= _mm_set1_ps(31.0f
/ 255.0f
);
217 __m128 rounding_v
= _mm_set1_ps(0.5f
);
218 __m128 h_avg_0
= _mm_set_ps(hblue
[0], hgreen
[0], hred
[0], 0);
219 __m128 h_avg_1
= _mm_set_ps(hblue
[1], hgreen
[1], hred
[1], 0);
221 __m128 v_avg_0
= _mm_set_ps(vblue
[0], vgreen
[0], vred
[0], 0);
222 __m128 v_avg_1
= _mm_set_ps(vblue
[1], vgreen
[1], vred
[1], 0);
224 h_avg_0
= _mm_mul_ps(h_avg_0
, factor_v
);
225 h_avg_1
= _mm_mul_ps(h_avg_1
, factor_v
);
226 v_avg_0
= _mm_mul_ps(v_avg_0
, factor_v
);
227 v_avg_1
= _mm_mul_ps(v_avg_1
, factor_v
);
229 h_avg_0
= _mm_add_ps(h_avg_0
, rounding_v
);
230 h_avg_1
= _mm_add_ps(h_avg_1
, rounding_v
);
231 v_avg_0
= _mm_add_ps(v_avg_0
, rounding_v
);
232 v_avg_1
= _mm_add_ps(v_avg_1
, rounding_v
);
234 __m128i h_avg_0i
= _mm_cvttps_epi32(h_avg_0
);
235 __m128i h_avg_1i
= _mm_cvttps_epi32(h_avg_1
);
237 __m128i v_avg_0i
= _mm_cvttps_epi32(v_avg_0
);
238 __m128i v_avg_1i
= _mm_cvttps_epi32(v_avg_1
);
240 h_avg_0i
= _mm_sub_epi32(h_avg_1i
, h_avg_0i
);
241 v_avg_0i
= _mm_sub_epi32(v_avg_1i
, v_avg_0i
);
244 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(v_avg_0i
, threshold_lower
)));
246 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(v_avg_0i
, threshold_upper
)));
249 (0 == _mm_movemask_epi8(_mm_cmplt_epi32(h_avg_0i
, threshold_lower
)));
251 (0 == _mm_movemask_epi8(_mm_cmpgt_epi32(h_avg_0i
, threshold_upper
)));
254 void ComputeLuminance(uint8_t* block
,
256 const int sub_block_id
,
257 const uint8_t* idx_to_num_tab
,
258 const __sse_data
* data
,
259 const uint32_t expected_error
) {
260 uint8_t best_tbl_idx
= 0;
261 uint32_t best_error
= 0x7FFFFFFF;
262 uint8_t best_mod_idx
[8][8]; // [table][texel]
264 const __m128i base_blue
= _mm_set1_epi32(base
.channels
.b
);
265 const __m128i base_green
= _mm_set1_epi32(base
.channels
.g
);
266 const __m128i base_red
= _mm_set1_epi32(base
.channels
.r
);
268 __m128i test_red
, test_blue
, test_green
, tmp
, tmp_blue
, tmp_green
, tmp_red
;
269 __m128i block_error
, mask
;
271 // This will have the minimum errors for each 4 pixels.
272 __m128i first_half_min
;
273 __m128i second_half_min
;
275 // This will have the matching table index combo for each 4 pixels.
276 __m128i first_half_pattern
;
277 __m128i second_half_pattern
;
279 const __m128i first_blue_data_block
= data
->blue
[2 * sub_block_id
];
280 const __m128i first_green_data_block
= data
->green
[2 * sub_block_id
];
281 const __m128i first_red_data_block
= data
->red
[2 * sub_block_id
];
283 const __m128i second_blue_data_block
= data
->blue
[2 * sub_block_id
+ 1];
284 const __m128i second_green_data_block
= data
->green
[2 * sub_block_id
+ 1];
285 const __m128i second_red_data_block
= data
->red
[2 * sub_block_id
+ 1];
288 // Fail early to increase speed.
289 long delta
= INT32_MAX
;
290 uint32_t last_min
= INT32_MAX
;
292 const uint8_t shuffle_mask
[] = {
293 0x1B, 0x4E, 0xB1, 0xE4}; // Important they are sorted ascending.
295 for (unsigned int tbl_idx
= 0; tbl_idx
< 8; ++tbl_idx
) {
297 g_codeword_tables
[tbl_idx
][3], g_codeword_tables
[tbl_idx
][2],
298 g_codeword_tables
[tbl_idx
][1], g_codeword_tables
[tbl_idx
][0]);
300 test_blue
= AddAndClamp(tmp
, base_blue
);
301 test_green
= AddAndClamp(tmp
, base_green
);
302 test_red
= AddAndClamp(tmp
, base_red
);
304 first_half_min
= __sse_max_int
;
305 second_half_min
= __sse_max_int
;
307 first_half_pattern
= __sse_zero
;
308 second_half_pattern
= __sse_zero
;
310 for (uint8_t imm8
: shuffle_mask
) {
313 tmp_blue
= _mm_shuffle_epi32(test_blue
, 0x1B);
314 tmp_green
= _mm_shuffle_epi32(test_green
, 0x1B);
315 tmp_red
= _mm_shuffle_epi32(test_red
, 0x1B);
318 tmp_blue
= _mm_shuffle_epi32(test_blue
, 0x4E);
319 tmp_green
= _mm_shuffle_epi32(test_green
, 0x4E);
320 tmp_red
= _mm_shuffle_epi32(test_red
, 0x4E);
323 tmp_blue
= _mm_shuffle_epi32(test_blue
, 0xB1);
324 tmp_green
= _mm_shuffle_epi32(test_green
, 0xB1);
325 tmp_red
= _mm_shuffle_epi32(test_red
, 0xB1);
328 tmp_blue
= _mm_shuffle_epi32(test_blue
, 0xE4);
329 tmp_green
= _mm_shuffle_epi32(test_green
, 0xE4);
330 tmp_red
= _mm_shuffle_epi32(test_red
, 0xE4);
333 tmp_blue
= test_blue
;
334 tmp_green
= test_green
;
338 tmp
= _mm_set1_epi32(imm8
);
341 AddChannelError(GetColorErrorSSE(tmp_blue
, first_blue_data_block
),
342 GetColorErrorSSE(tmp_green
, first_green_data_block
),
343 GetColorErrorSSE(tmp_red
, first_red_data_block
));
345 // Save winning pattern.
346 first_half_pattern
= _mm_max_epi16(
348 _mm_and_si128(tmp
, _mm_cmpgt_epi32(first_half_min
, block_error
)));
349 // Should use _mm_min_epi32(first_half_min, block_error); from SSE4
350 // otherwise we have a small performance penalty.
351 mask
= _mm_cmplt_epi32(block_error
, first_half_min
);
352 first_half_min
= _mm_add_epi32(_mm_and_si128(mask
, block_error
),
353 _mm_andnot_si128(mask
, first_half_min
));
355 // Compute second part of the block.
357 AddChannelError(GetColorErrorSSE(tmp_blue
, second_blue_data_block
),
358 GetColorErrorSSE(tmp_green
, second_green_data_block
),
359 GetColorErrorSSE(tmp_red
, second_red_data_block
));
361 // Save winning pattern.
362 second_half_pattern
= _mm_max_epi16(
364 _mm_and_si128(tmp
, _mm_cmpgt_epi32(second_half_min
, block_error
)));
365 // Should use _mm_min_epi32(second_half_min, block_error); from SSE4
366 // otherwise we have a small performance penalty.
367 mask
= _mm_cmplt_epi32(block_error
, second_half_min
);
368 second_half_min
= _mm_add_epi32(_mm_and_si128(mask
, block_error
),
369 _mm_andnot_si128(mask
, second_half_min
));
372 first_half_min
= _mm_add_epi32(first_half_min
, second_half_min
);
374 _mm_add_epi32(first_half_min
, _mm_shuffle_epi32(first_half_min
, 0x4E));
376 _mm_add_epi32(first_half_min
, _mm_shuffle_epi32(first_half_min
, 0xB1));
378 min
= _mm_cvtsi128_si32(first_half_min
);
380 delta
= min
- last_min
;
383 if (min
< best_error
) {
384 best_tbl_idx
= tbl_idx
;
387 best_mod_idx
[tbl_idx
][0] =
388 (_mm_cvtsi128_si32(first_half_pattern
) >> (0)) & 3;
389 best_mod_idx
[tbl_idx
][4] =
390 (_mm_cvtsi128_si32(second_half_pattern
) >> (0)) & 3;
392 best_mod_idx
[tbl_idx
][1] =
393 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern
, 0x1)) >>
396 best_mod_idx
[tbl_idx
][5] =
397 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern
, 0x1)) >>
401 best_mod_idx
[tbl_idx
][2] =
402 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern
, 0x2)) >>
405 best_mod_idx
[tbl_idx
][6] =
406 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern
, 0x2)) >>
410 best_mod_idx
[tbl_idx
][3] =
411 (_mm_cvtsi128_si32(_mm_shuffle_epi32(first_half_pattern
, 0x3)) >>
414 best_mod_idx
[tbl_idx
][7] =
415 (_mm_cvtsi128_si32(_mm_shuffle_epi32(second_half_pattern
, 0x3)) >>
419 if (best_error
== 0) {
422 } else if (delta
> 0 && expected_error
< min
) {
423 // The error is growing and is well beyond expected threshold.
428 WriteCodewordTable(block
, sub_block_id
, best_tbl_idx
);
430 uint32_t pix_data
= 0;
437 for (unsigned int i
= 0; i
< 8; ++i
) {
438 mod_idx
= best_mod_idx
[best_tbl_idx
][i
];
439 pix_idx
= g_mod_to_pix
[mod_idx
];
444 // Obtain the texel number as specified in the standard.
445 texel_num
= idx_to_num_tab
[i
];
446 pix_data
|= msb
<< (texel_num
+ 16);
447 pix_data
|= lsb
<< (texel_num
);
450 WritePixelData(block
, pix_data
);
453 void CompressBlock(uint8_t* dst
, __sse_data
* data
) {
// First 3 values are for vertical 1, second 3 vertical 2, third 3 horizontal 1, last 3 horizontal 2.
457 float __sse_avg_colors
[12] = {
460 bool use_differential
[2] = {true, true};
461 GetAvgColors(data
, __sse_avg_colors
, use_differential
);
462 Color sub_block_avg
[4];
464 // TODO(radu.velea): Remove floating point operations and use only int's +
465 // normal rounding and shifts for reduced Quality.
466 for (int i
= 0, j
= 1; i
< 4; i
+= 2, j
+= 2) {
467 if (use_differential
[i
/ 2] == false) {
468 sub_block_avg
[i
] = MakeColor444(&__sse_avg_colors
[i
* 3]);
469 sub_block_avg
[j
] = MakeColor444(&__sse_avg_colors
[j
* 3]);
471 sub_block_avg
[i
] = MakeColor555(&__sse_avg_colors
[i
* 3]);
472 sub_block_avg
[j
] = MakeColor555(&__sse_avg_colors
[j
* 3]);
476 __m128i red_avg
[2], green_avg
[2], blue_avg
[2];
478 // TODO(radu.velea): Perfect accuracy, maybe skip floating variables.
479 blue_avg
[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors
[3]),
480 static_cast<int>(__sse_avg_colors
[3]),
481 static_cast<int>(__sse_avg_colors
[0]),
482 static_cast<int>(__sse_avg_colors
[0]));
484 green_avg
[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors
[4]),
485 static_cast<int>(__sse_avg_colors
[4]),
486 static_cast<int>(__sse_avg_colors
[1]),
487 static_cast<int>(__sse_avg_colors
[1]));
489 red_avg
[0] = _mm_set_epi32(static_cast<int>(__sse_avg_colors
[5]),
490 static_cast<int>(__sse_avg_colors
[5]),
491 static_cast<int>(__sse_avg_colors
[2]),
492 static_cast<int>(__sse_avg_colors
[2]));
494 uint32_t vertical_error
[2];
495 GetVerticalError(data
, blue_avg
, green_avg
, red_avg
, vertical_error
);
497 // TODO(radu.velea): Perfect accuracy, maybe skip floating variables.
498 blue_avg
[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors
[6]));
499 blue_avg
[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors
[9]));
501 green_avg
[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors
[7]));
502 green_avg
[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors
[10]));
504 red_avg
[0] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors
[8]));
505 red_avg
[1] = _mm_set1_epi32(static_cast<int>(__sse_avg_colors
[11]));
507 uint32_t horizontal_error
[2];
508 GetHorizontalError(data
, blue_avg
, green_avg
, red_avg
, horizontal_error
);
510 bool flip
= (horizontal_error
[0] + horizontal_error
[1]) <
511 (vertical_error
[0] + vertical_error
[1]);
512 uint32_t* expected_errors
= flip
? horizontal_error
: vertical_error
;
514 // Clear destination buffer so that we can "or" in the results.
517 WriteDiff(dst
, use_differential
[!!flip
]);
518 WriteFlip(dst
, flip
);
520 uint8_t sub_block_off_0
= flip
? 2 : 0;
521 uint8_t sub_block_off_1
= sub_block_off_0
+ 1;
523 if (use_differential
[!!flip
]) {
524 WriteColors555(dst
, sub_block_avg
[sub_block_off_0
],
525 sub_block_avg
[sub_block_off_1
]);
527 WriteColors444(dst
, sub_block_avg
[sub_block_off_0
],
528 sub_block_avg
[sub_block_off_1
]);
532 // Transpose vertical data into horizontal lines.
534 for (int i
= 0; i
< 4; i
+= 2) {
536 data
->blue
[i
] = _mm_add_epi32(
537 _mm_move_epi64(data
->blue
[i
]),
538 _mm_shuffle_epi32(_mm_move_epi64(data
->blue
[i
+ 1]), 0x4E));
539 data
->blue
[i
+ 1] = _mm_add_epi32(
540 _mm_move_epi64(_mm_shuffle_epi32(tmp
, 0x4E)),
542 _mm_move_epi64(_mm_shuffle_epi32(data
->blue
[i
+ 1], 0x4E)),
545 tmp
= data
->green
[i
];
546 data
->green
[i
] = _mm_add_epi32(
547 _mm_move_epi64(data
->green
[i
]),
548 _mm_shuffle_epi32(_mm_move_epi64(data
->green
[i
+ 1]), 0x4E));
549 data
->green
[i
+ 1] = _mm_add_epi32(
550 _mm_move_epi64(_mm_shuffle_epi32(tmp
, 0x4E)),
552 _mm_move_epi64(_mm_shuffle_epi32(data
->green
[i
+ 1], 0x4E)),
556 data
->red
[i
] = _mm_add_epi32(
557 _mm_move_epi64(data
->red
[i
]),
558 _mm_shuffle_epi32(_mm_move_epi64(data
->red
[i
+ 1]), 0x4E));
559 data
->red
[i
+ 1] = _mm_add_epi32(
560 _mm_move_epi64(_mm_shuffle_epi32(tmp
, 0x4E)),
562 _mm_move_epi64(_mm_shuffle_epi32(data
->red
[i
+ 1], 0x4E)), 0x4E));
566 data
->blue
[1] = data
->blue
[2];
569 tmp
= data
->green
[1];
570 data
->green
[1] = data
->green
[2];
571 data
->green
[2] = tmp
;
574 data
->red
[1] = data
->red
[2];
578 // Compute luminance for the first sub block.
579 ComputeLuminance(dst
, sub_block_avg
[sub_block_off_0
], 0,
580 g_idx_to_num
[sub_block_off_0
], data
,
581 SetETC1MaxError(expected_errors
[0]));
582 // Compute luminance for the second sub block.
583 ComputeLuminance(dst
, sub_block_avg
[sub_block_off_1
], 1,
584 g_idx_to_num
[sub_block_off_1
], data
,
585 SetETC1MaxError(expected_errors
[1]));
// Copies a 4x4 block of 4-byte pixels out of an image into a contiguous
// 64-byte buffer.
//
// |dst|    receives 4 rows of 16 bytes each, stored back to back.
// |src|    points at the first byte of the block's top-left pixel.
// |width|  image width in pixels; consecutive source rows are width * 4
//          bytes apart.
static void ExtractBlock(uint8_t* dst, const uint8_t* src, int width) {
  const size_t kRowBytes = 4 * 4;  // 4 pixels, 4 bytes per pixel.
  const size_t src_stride = static_cast<size_t>(width) * 4;

  // Each source row is addressed by index, so no pointer is formed beyond
  // the last row that is actually read.
  for (size_t j = 0; j < 4; ++j) {
    memcpy(dst + j * kRowBytes, src + j * src_stride, kRowBytes);
  }
}
// Transforms an incoming 64-byte block of 16 four-byte pixels (e.g. RGBA or
// BGRA) into 4 registers, each containing the data for a single channel.
// Ex: transposed[0] will have all the R values for an RGBA block,
// transposed[1] will have G, etc.
// The values are packed as 8 bit unsigned values in the SSE registers, in
// pixel order (byte p of transposed[c] is channel c of pixel p).
//
// Returns false if the block is solid (all 16 pixels are byte-identical); in
// that case |transposed| only holds the four raw rows and no transposition
// has been done. Returns true otherwise.
inline bool TransposeBlock(uint8_t* block, __m128i* transposed) {
  const __m128i* rows = reinterpret_cast<const __m128i*>(block);

  // a0,a1,a2,...a7, ...a15
  transposed[0] = _mm_loadu_si128(rows);
  // b0, b1,b2,...b7.... b15
  transposed[1] = _mm_loadu_si128(rows + 1);
  // c0, c1,c2,...c7....c15
  transposed[2] = _mm_loadu_si128(rows + 2);
  // d0,d1,d2,...d7....d15
  transposed[3] = _mm_loadu_si128(rows + 3);

  // Before doing any work we check if the block is solid. The reference
  // pixel is taken from lane 0 of the row that was just loaded, which avoids
  // reading |block| through a uint32_t pointer.
  const __m128i test_solid = _mm_shuffle_epi32(transposed[0], 0x00);
  uint16_t mask = 0xFFFF;
  for (int i = 0; i < 4; i++) {
    mask &= _mm_movemask_epi8(_mm_cmpeq_epi8(transposed[i], test_solid));
  }

  if (mask == 0xFFFF) {
    // Block is solid, no need to do any more work.
    return false;
  }

  // a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
  __m128i tmp0 = _mm_unpacklo_epi8(transposed[0], transposed[1]);
  // c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
  __m128i tmp1 = _mm_unpacklo_epi8(transposed[2], transposed[3]);
  // a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
  __m128i tmp2 = _mm_unpackhi_epi8(transposed[0], transposed[1]);
  // c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
  __m128i tmp3 = _mm_unpackhi_epi8(transposed[2], transposed[3]);

  // a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
  // a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
  // c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
  // c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

  // a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
  tmp0 = _mm_unpacklo_epi32(transposed[0], transposed[2]);
  // a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
  tmp1 = _mm_unpackhi_epi32(transposed[0], transposed[2]);
  // a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13
  tmp2 = _mm_unpacklo_epi32(transposed[1], transposed[3]);
  // a6,a14, b6,b14, c6,c14, d6,d14, a7,a15, b7,b15, c7,c15, d7,d15
  tmp3 = _mm_unpackhi_epi32(transposed[1], transposed[3]);

  // a0,a4, a8,a12, b0,b4, b8,b12, c0,c4, c8,c12, d0,d4, d8,d12
  transposed[0] = _mm_unpacklo_epi8(tmp0, tmp2);
  // a1,a5, a9,a13, b1,b5, b9,b13, c1,c5, c9,c13, d1,d5, d9,d13
  transposed[1] = _mm_unpackhi_epi8(tmp0, tmp2);
  // a2,a6, a10,a14, b2,b6, b10,b14, c2,c6, c10,c14, d2,d6, d10,d14
  transposed[2] = _mm_unpacklo_epi8(tmp1, tmp3);
  // a3,a7, a11,a15, b3,b7, b11,b15, c3,c7, c11,c15, d3,d7, d11,d15
  transposed[3] = _mm_unpackhi_epi8(tmp1, tmp3);

  return true;
}
664 inline void UnpackBlock(__m128i
* packed
,
669 const __m128i zero
= _mm_set1_epi8(0);
670 __m128i tmp_low
, tmp_high
;
673 tmp_low
= _mm_unpacklo_epi8(packed
[0], zero
);
674 tmp_high
= _mm_unpackhi_epi8(packed
[0], zero
);
676 red
[0] = _mm_unpacklo_epi16(tmp_low
, zero
);
677 red
[1] = _mm_unpackhi_epi16(tmp_low
, zero
);
679 red
[2] = _mm_unpacklo_epi16(tmp_high
, zero
);
680 red
[3] = _mm_unpackhi_epi16(tmp_high
, zero
);
683 tmp_low
= _mm_unpacklo_epi8(packed
[1], zero
);
684 tmp_high
= _mm_unpackhi_epi8(packed
[1], zero
);
686 green
[0] = _mm_unpacklo_epi16(tmp_low
, zero
);
687 green
[1] = _mm_unpackhi_epi16(tmp_low
, zero
);
689 green
[2] = _mm_unpacklo_epi16(tmp_high
, zero
);
690 green
[3] = _mm_unpackhi_epi16(tmp_high
, zero
);
693 tmp_low
= _mm_unpacklo_epi8(packed
[2], zero
);
694 tmp_high
= _mm_unpackhi_epi8(packed
[2], zero
);
696 blue
[0] = _mm_unpacklo_epi16(tmp_low
, zero
);
697 blue
[1] = _mm_unpackhi_epi16(tmp_low
, zero
);
699 blue
[2] = _mm_unpacklo_epi16(tmp_high
, zero
);
700 blue
[3] = _mm_unpackhi_epi16(tmp_high
, zero
);
702 // Unpack alpha - unused for ETC1.
703 tmp_low
= _mm_unpacklo_epi8(packed
[3], zero
);
704 tmp_high
= _mm_unpackhi_epi8(packed
[3], zero
);
706 alpha
[0] = _mm_unpacklo_epi16(tmp_low
, zero
);
707 alpha
[1] = _mm_unpackhi_epi16(tmp_low
, zero
);
709 alpha
[2] = _mm_unpacklo_epi16(tmp_high
, zero
);
710 alpha
[3] = _mm_unpackhi_epi16(tmp_high
, zero
);
713 inline void CompressSolid(uint8_t* dst
, uint8_t* block
) {
714 // Clear destination buffer so that we can "or" in the results.
717 const float src_color_float
[3] = {static_cast<float>(block
[0]),
718 static_cast<float>(block
[1]),
719 static_cast<float>(block
[2])};
720 const Color base
= MakeColor555(src_color_float
);
721 const __m128i base_v
=
722 _mm_set_epi32(0, base
.channels
.r
, base
.channels
.g
, base
.channels
.b
);
724 const __m128i constant
= _mm_set_epi32(0, block
[2], block
[1], block
[0]);
727 static const __m128i rgb
=
728 _mm_set_epi32(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
730 WriteDiff(dst
, true);
731 WriteFlip(dst
, false);
733 WriteColors555(dst
, base
, base
);
735 uint8_t best_tbl_idx
= 0;
736 uint8_t best_mod_idx
= 0;
737 uint32_t best_mod_err
= INT32_MAX
;
739 for (unsigned int tbl_idx
= 0; tbl_idx
< 8; ++tbl_idx
) {
741 g_codeword_tables
[tbl_idx
][3], g_codeword_tables
[tbl_idx
][2],
742 g_codeword_tables
[tbl_idx
][1], g_codeword_tables
[tbl_idx
][0]);
743 colors
[0] = AddAndClamp(base_v
, _mm_shuffle_epi32(lum
, 0x0));
744 colors
[1] = AddAndClamp(base_v
, _mm_shuffle_epi32(lum
, 0x55));
745 colors
[2] = AddAndClamp(base_v
, _mm_shuffle_epi32(lum
, 0xAA));
746 colors
[3] = AddAndClamp(base_v
, _mm_shuffle_epi32(lum
, 0xFF));
748 for (int i
= 0; i
< 4; i
++) {
750 SumSSE(GetColorErrorSSE(constant
, _mm_and_si128(colors
[i
], rgb
)));
751 colors
[i
] = _mm_and_si128(colors
[i
], rgb
);
752 if (mod_err
< best_mod_err
) {
753 best_tbl_idx
= tbl_idx
;
755 best_mod_err
= mod_err
;
758 break; // We cannot do any better than this.
764 WriteCodewordTable(dst
, 0, best_tbl_idx
);
765 WriteCodewordTable(dst
, 1, best_tbl_idx
);
767 uint8_t pix_idx
= g_mod_to_pix
[best_mod_idx
];
768 uint32_t lsb
= pix_idx
& 0x1;
769 uint32_t msb
= pix_idx
>> 1;
771 uint32_t pix_data
= 0;
772 for (unsigned int i
= 0; i
< 2; ++i
) {
773 for (unsigned int j
= 0; j
< 8; ++j
) {
774 // Obtain the texel number as specified in the standard.
775 int texel_num
= g_idx_to_num
[i
][j
];
776 pix_data
|= msb
<< (texel_num
+ 16);
777 pix_data
|= lsb
<< (texel_num
);
781 WritePixelData(dst
, pix_data
);
786 void TextureCompressorETC1SSE::Compress(const uint8_t* src
,
792 DCHECK_EQ((width
& 3), 0);
793 DCHECK_GE(height
, 4);
794 DCHECK_EQ((height
& 3), 0);
796 ALIGNAS(16) uint8_t block
[64];
798 __m128i red
[4], green
[4], blue
[4], alpha
[4];
801 for (int y
= 0; y
< height
; y
+= 4, src
+= width
* 4 * 4) {
802 for (int x
= 0; x
< width
; x
+= 4, dst
+= 8) {
803 ExtractBlock(block
, src
+ x
* 4, width
);
804 if (TransposeBlock(block
, packed
) == false) {
805 CompressSolid(dst
, block
);
807 UnpackBlock(packed
, blue
, green
, red
, alpha
);
810 data
.packed
= packed
;
815 CompressBlock(dst
, &data
);