2 * Copyright © 2013 Soren Sandmann Pedersen
3 * Copyright © 2013 Red Hat, Inc.
4 * Copyright © 2016 Mozilla Foundation
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23 * DEALINGS IN THE SOFTWARE.
25 * Author: Soren Sandmann (soren.sandmann@gmail.com)
26 * Jeff Muizelaar (jmuizelaar@mozilla.com)
29 /* This has been adapted from the ssse3 code from pixman. It's currently
30 * a mess as I want to try it out in practice before finalizing the details.
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>

#include "ssse3-scaler.h"
/* Minimal subset of pixman's fixed-point types, reconstructed from
 * pixman.h.  All coordinates below are 16.16 fixed point; the 48.16
 * variant provides overflow-safe intermediates. */
typedef int32_t pixman_fixed_16_16_t;
typedef pixman_fixed_16_16_t pixman_fixed_t;

#define pixman_fixed_1 (pixman_int_to_fixed(1))
#define pixman_fixed_to_int(f) ((int)((f) >> 16))
/* Shift through uint32_t to avoid UB when 'i' is negative; the value is
 * identical on two's-complement targets. */
#define pixman_int_to_fixed(i) ((pixman_fixed_t)((uint32_t)(i) << 16))
#define pixman_double_to_fixed(d) ((pixman_fixed_t)((d) * 65536.0))
#define PIXMAN_FIXED_INT_MAX 32767
#define PIXMAN_FIXED_INT_MIN -32768

typedef struct pixman_vector pixman_vector_t;
typedef int pixman_bool_t;
typedef int64_t pixman_fixed_32_32_t;
typedef pixman_fixed_32_32_t pixman_fixed_48_16_t;

/* Higher-precision (48.16) homogeneous vector for intermediate math. */
typedef struct {
  pixman_fixed_48_16_t v[3];
} pixman_vector_48_16_t;

/* Homogeneous 16.16 vector. */
struct pixman_vector {
  pixman_fixed_t vector[3];
};

/* 3x3 matrix of 16.16 fixed-point entries. */
typedef struct pixman_transform pixman_transform_t;
struct pixman_transform {
  pixman_fixed_t matrix[3][3];
};
/* Compiler-specific "always inline" annotation. */
#ifdef _MSC_VER
#  define force_inline __forceinline
#else
#  define force_inline __inline__ __attribute__((always_inline))
#endif

/* Fractional bits kept in bilinear weights: 6 bits = 64 interpolation
 * steps between adjacent source pixels. */
#define BILINEAR_INTERPOLATION_BITS 6
75 static force_inline
int pixman_fixed_to_bilinear_weight(pixman_fixed_t x
) {
76 return (x
>> (16 - BILINEAR_INTERPOLATION_BITS
)) &
77 ((1 << BILINEAR_INTERPOLATION_BITS
) - 1);
80 static void pixman_transform_point_31_16_3d(const pixman_transform_t
* t
,
81 const pixman_vector_48_16_t
* v
,
82 pixman_vector_48_16_t
* result
) {
86 /* input vector values must have no more than 31 bits (including sign)
87 * in the integer part */
88 assert(v
->v
[0] < ((pixman_fixed_48_16_t
)1 << (30 + 16)));
89 assert(v
->v
[0] >= -((pixman_fixed_48_16_t
)1 << (30 + 16)));
90 assert(v
->v
[1] < ((pixman_fixed_48_16_t
)1 << (30 + 16)));
91 assert(v
->v
[1] >= -((pixman_fixed_48_16_t
)1 << (30 + 16)));
92 assert(v
->v
[2] < ((pixman_fixed_48_16_t
)1 << (30 + 16)));
93 assert(v
->v
[2] >= -((pixman_fixed_48_16_t
)1 << (30 + 16)));
95 for (i
= 0; i
< 3; i
++) {
96 tmp
[i
][0] = (int64_t)t
->matrix
[i
][0] * (v
->v
[0] >> 16);
97 tmp
[i
][1] = (int64_t)t
->matrix
[i
][0] * (v
->v
[0] & 0xFFFF);
98 tmp
[i
][0] += (int64_t)t
->matrix
[i
][1] * (v
->v
[1] >> 16);
99 tmp
[i
][1] += (int64_t)t
->matrix
[i
][1] * (v
->v
[1] & 0xFFFF);
100 tmp
[i
][0] += (int64_t)t
->matrix
[i
][2] * (v
->v
[2] >> 16);
101 tmp
[i
][1] += (int64_t)t
->matrix
[i
][2] * (v
->v
[2] & 0xFFFF);
104 result
->v
[0] = tmp
[0][0] + ((tmp
[0][1] + 0x8000) >> 16);
105 result
->v
[1] = tmp
[1][0] + ((tmp
[1][1] + 0x8000) >> 16);
106 result
->v
[2] = tmp
[2][0] + ((tmp
[2][1] + 0x8000) >> 16);
109 static pixman_bool_t
pixman_transform_point_3d(
110 const struct pixman_transform
* transform
, struct pixman_vector
* vector
) {
111 pixman_vector_48_16_t tmp
;
112 tmp
.v
[0] = vector
->vector
[0];
113 tmp
.v
[1] = vector
->vector
[1];
114 tmp
.v
[2] = vector
->vector
[2];
116 pixman_transform_point_31_16_3d(transform
, &tmp
, &tmp
);
118 vector
->vector
[0] = tmp
.v
[0];
119 vector
->vector
[1] = tmp
.v
[1];
120 vector
->vector
[2] = tmp
.v
[2];
122 return vector
->vector
[0] == tmp
.v
[0] && vector
->vector
[1] == tmp
.v
[1] &&
123 vector
->vector
[2] == tmp
.v
[2];
/* NOTE(review): the following type declarations were garbled during
 * extraction; struct bodies and several typedefs are missing lines.
 * The surviving text is preserved byte-for-byte with review notes. */
126 struct bits_image_t
{
/* NOTE(review): code below also reads image->bits and image->rowstride;
 * those member declarations are missing here -- TODO confirm against
 * the original file. */
129 pixman_transform_t
* transform
;
132 typedef struct bits_image_t bits_image_t
;
/* NOTE(review): the opening 'typedef struct {' and the members of this
 * iterator-info struct are missing from the extraction. */
135 } pixman_iter_info_t
;
137 typedef struct pixman_iter_t pixman_iter_t
;
/* Destructor hook invoked when an iterator is torn down. */
138 typedef void (*pixman_iter_fini_t
)(pixman_iter_t
* iter
);
140 struct pixman_iter_t
{
/* NOTE(review): later code reads iter->image, iter->buffer, iter->x,
 * iter->y, iter->width, iter->height and iter->data -- those member
 * declarations are missing here.  'fini' also serves as the success
 * flag (see the '!iter.fini' check in ssse3_scale_data). */
142 pixman_iter_fini_t fini
;
/* ssse3_fetch_horizontal: fetch n pixels of row y from 'image', apply
 * the horizontal bilinear filter via SSSE3 maddubsw, and write the
 * 16-bit intermediate channel pairs into line->buffer.  'x' is the
 * 16.16 start coordinate and 'ux' the 16.16 step per output pixel.
 * NOTE(review): the extraction dropped several lines of this function
 * (declarations of vrl0/vrl1/vw/vr/s, the preprocessor conditionals
 * around the guarded loads, and most of the odd-n tail); the surviving
 * text is kept byte-for-byte below. */
162 static void ssse3_fetch_horizontal(bits_image_t
* image
, line_t
* line
, int y
,
163 pixman_fixed_t x
, pixman_fixed_t ux
, int n
) {
164 uint32_t* bits
= image
->bits
+ y
* image
->rowstride
;
/* vx holds interleaved (inverse-weight, weight) coordinate pairs for
 * two output pixels; see the big comment further down. */
165 __m128i vx
= _mm_set_epi16(-(x
+ 1), x
, -(x
+ 1), x
, -(x
+ ux
+ 1), x
+ ux
,
166 -(x
+ ux
+ 1), x
+ ux
);
/* Per-iteration increment: advances both pixels by 2*ux. */
167 __m128i vux
= _mm_set_epi16(-2 * ux
, 2 * ux
, -2 * ux
, 2 * ux
, -2 * ux
, 2 * ux
,
169 __m128i vaddc
= _mm_set_epi16(1, 0, 1, 0, 1, 0, 1, 0);
170 __m128i
* b
= (__m128i
*)line
->buffer
;
/* Main loop: produces two destination pixels per iteration. */
173 while ((n
-= 2) >= 0) {
/* NOTE(review): guarded load of the second source pixel pair -- the
 * branch zero-fills / clamps when the index leaves the row; the printf
 * calls look like leftover debug tracing.  The #if/#else structure
 * selecting between guarded and direct loads is missing here. */
176 if (pixman_fixed_to_int(x
+ ux
) >= image
->rowstride
) {
177 vrl1
= _mm_setzero_si128();
178 printf("overread 2loop\n");
180 if (pixman_fixed_to_int(x
+ ux
) < 0) printf("underflow\n");
181 vrl1
= _mm_loadl_epi64(
182 (__m128i
*)(bits
+ (pixman_fixed_to_int(x
+ ux
) < 0
184 : pixman_fixed_to_int(x
+ ux
))));
187 vrl1
= _mm_loadl_epi64((__m128i
*)(bits
+ pixman_fixed_to_int(x
+ ux
)));
/* Load of the first source pixel pair (guarded and direct variants). */
193 vrl0
= _mm_loadl_epi64(
195 (pixman_fixed_to_int(x
) < 0 ? 0 : pixman_fixed_to_int(x
))));
197 vrl0
= _mm_loadl_epi64((__m128i
*)(bits
+ pixman_fixed_to_int(x
)));
201 /* The weights are based on vx which is a vector of
203 * - (x + 1), x, - (x + 1), x,
204 * - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
206 * so the 16 bit weights end up like this:
208 * iw0, w0, iw0, w0, iw1, w1, iw1, w1
210 * and after shifting and packing, we get these bytes:
212 * iw0, w0, iw0, w0, iw1, w1, iw1, w1,
213 * iw0, w0, iw0, w0, iw1, w1, iw1, w1,
215 * which means the first and the second input pixel
216 * have to be interleaved like this:
218 * la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
219 * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
221 * before maddubsw can be used.
224 vw
= _mm_add_epi16(vaddc
,
225 _mm_srli_epi16(vx
, 16 - BILINEAR_INTERPOLATION_BITS
));
226 /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
229 vw
= _mm_packus_epi16(vw
, vw
);
230 /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
231 * iw0, w0, iw0, w0, iw1, w1, iw1, w1
/* Advance the fixed-point coordinates for the next pixel pair. */
233 vx
= _mm_add_epi16(vx
, vux
);
237 vr
= _mm_unpacklo_epi16(vrl1
, vrl0
);
238 /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
240 s
= _mm_shuffle_epi32(vr
, _MM_SHUFFLE(1, 0, 3, 2));
241 /* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
243 vr
= _mm_unpackhi_epi8(vr
, s
);
244 /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
245 * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
248 vr
= _mm_maddubs_epi16(vr
, vw
);
250 /* When the weight is 0, the inverse weight is
251 * 128 which can't be represented in a signed byte.
252 * As a result maddubsw computes the following:
254 * r = l * -128 + r * 0
256 * rather than the desired
258 * r = l * 128 + r * 0
260 * We fix this by taking the absolute value of the
263 // we can drop this if we use lower precision
265 vr
= _mm_shuffle_epi32(vr
, _MM_SHUFFLE(2, 0, 3, 1));
266 /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
267 _mm_store_si128(b
++, vr
);
/* NOTE(review): start of the odd-n tail; the rest of the tail is
 * missing from the extraction. */
271 vrl1
= _mm_setzero_si128();
/* ssse3_fetch_bilinear_cover: produce one scaled destination row by
 * blending the two cached horizontally-filtered source lines with the
 * vertical weight derived from info->y, then advance info->y.  'mask'
 * is not referenced in the visible body.
 * NOTE(review): the extraction dropped lines here (declarations of
 * y0/y1/i/dist_y/vw/uvw/p, several closing braces, the #else/#endif
 * markers, and the trailing return of iter->buffer); the surviving
 * text is kept byte-for-byte. */
278 // scale a line of destination pixels
279 static uint32_t* ssse3_fetch_bilinear_cover(pixman_iter_t
* iter
,
280 const uint32_t* mask
) {
281 pixman_fixed_t fx
, ux
;
282 bilinear_info_t
* info
= iter
->data
;
283 line_t
*line0
, *line1
;
290 ux
= iter
->image
->transform
->matrix
[0][0];
/* y0/y1 bracket the source coordinate info->y; presumably y1 = y0 + 1
 * was computed in a dropped line -- TODO confirm. */
292 y0
= pixman_fixed_to_int(info
->y
);
/* NOTE(review): deliberate null-pointer write used as a hard debug
 * trap when y0 underflows -- should be an assert; confirm before
 * shipping. */
293 if (y0
< 0) *(volatile char*)0 = 9;
296 // clamping in y direction
297 if (y1
>= iter
->height
) {
298 y1
= iter
->height
- 1;
/* Two-entry line cache indexed by row parity. */
301 line0
= &info
->lines
[y0
& 0x01];
302 line1
= &info
->lines
[y1
& 0x01];
/* Refill any cache entry that currently holds a different row. */
304 if (line0
->y
!= y0
) {
305 ssse3_fetch_horizontal(iter
->image
, line0
, y0
, fx
, ux
, iter
->width
);
308 if (line1
->y
!= y1
) {
309 ssse3_fetch_horizontal(iter
->image
, line1
, y1
, fx
, ux
, iter
->width
);
/* Vertical blend weights: two compile-time variants follow. */
312 #ifdef PIXMAN_STYLE_INTERPOLATION
313 dist_y
= pixman_fixed_to_bilinear_weight(info
->y
);
314 dist_y
<<= (16 - BILINEAR_INTERPOLATION_BITS
);
316 vw
= _mm_set_epi16(dist_y
, dist_y
, dist_y
, dist_y
, dist_y
, dist_y
, dist_y
,
320 // setup the weights for the top (vw) and bottom (uvw) lines
321 dist_y
= pixman_fixed_to_bilinear_weight(info
->y
);
322 // we use 15 instead of 16 because we need an extra bit to handle when the
323 // weights are 0 and 1
324 dist_y
<<= (15 - BILINEAR_INTERPOLATION_BITS
);
326 vw
= _mm_set_epi16(dist_y
, dist_y
, dist_y
, dist_y
, dist_y
, dist_y
, dist_y
,
329 dist_y
= (1 << BILINEAR_INTERPOLATION_BITS
) -
330 pixman_fixed_to_bilinear_weight(info
->y
);
331 dist_y
<<= (15 - BILINEAR_INTERPOLATION_BITS
);
332 uvw
= _mm_set_epi16(dist_y
, dist_y
, dist_y
, dist_y
, dist_y
, dist_y
, dist_y
,
/* Blend four output pixels per iteration from the two cached lines. */
336 for (i
= 0; i
+ 3 < iter
->width
; i
+= 4) {
337 __m128i top0
= _mm_load_si128((__m128i
*)(line0
->buffer
+ i
));
338 __m128i bot0
= _mm_load_si128((__m128i
*)(line1
->buffer
+ i
));
339 __m128i top1
= _mm_load_si128((__m128i
*)(line0
->buffer
+ i
+ 2));
340 __m128i bot1
= _mm_load_si128((__m128i
*)(line1
->buffer
+ i
+ 2));
341 #ifdef PIXMAN_STYLE_INTERPOLATION
342 __m128i r0
, r1
, tmp
, p
;
344 r0
= _mm_mulhi_epu16(_mm_sub_epi16(bot0
, top0
), vw
);
345 tmp
= _mm_cmplt_epi16(bot0
, top0
);
346 tmp
= _mm_and_si128(tmp
, vw
);
347 r0
= _mm_sub_epi16(r0
, tmp
);
348 r0
= _mm_add_epi16(r0
, top0
);
349 r0
= _mm_srli_epi16(r0
, BILINEAR_INTERPOLATION_BITS
);
350 /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
351 // r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
352 /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
354 // tmp = bot1 < top1 ? vw : 0;
355 // r1 = (bot1 - top1)*vw + top1 - tmp
356 // r1 = bot1*vw - vw*top1 + top1 - tmp
357 // r1 = bot1*vw + top1 - vw*top1 - tmp
358 // r1 = bot1*vw + top1*(1 - vw) - tmp
359 r1
= _mm_mulhi_epu16(_mm_sub_epi16(bot1
, top1
), vw
);
360 tmp
= _mm_cmplt_epi16(bot1
, top1
);
361 tmp
= _mm_and_si128(tmp
, vw
);
362 r1
= _mm_sub_epi16(r1
, tmp
);
363 r1
= _mm_add_epi16(r1
, top1
);
364 r1
= _mm_srli_epi16(r1
, BILINEAR_INTERPOLATION_BITS
);
365 // r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
366 /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
369 top0
= _mm_mulhi_epu16(top0
, uvw
);
370 bot0
= _mm_mulhi_epu16(bot0
, vw
);
371 r0
= _mm_add_epi16(top0
, bot0
);
372 r0
= _mm_srli_epi16(r0
, BILINEAR_INTERPOLATION_BITS
- 1);
374 top1
= _mm_mulhi_epu16(top1
, uvw
);
375 bot1
= _mm_mulhi_epu16(bot1
, vw
);
376 r1
= _mm_add_epi16(top1
, bot1
);
377 r1
= _mm_srli_epi16(r1
, BILINEAR_INTERPOLATION_BITS
- 1);
380 p
= _mm_packus_epi16(r0
, r1
);
381 _mm_storeu_si128((__m128i
*)(iter
->buffer
+ i
), p
);
/* Tail: handles the final 1-3 pixels. */
384 while (i
< iter
->width
) {
385 __m128i top0
= _mm_load_si128((__m128i
*)(line0
->buffer
+ i
));
386 __m128i bot0
= _mm_load_si128((__m128i
*)(line1
->buffer
+ i
));
388 #ifdef PIXMAN_STYLE_INTERPOLATION
390 r0
= _mm_mulhi_epu16(_mm_sub_epi16(bot0
, top0
), vw
);
391 tmp
= _mm_cmplt_epi16(bot0
, top0
);
392 tmp
= _mm_and_si128(tmp
, vw
);
393 r0
= _mm_sub_epi16(r0
, tmp
);
394 r0
= _mm_add_epi16(r0
, top0
);
395 r0
= _mm_srli_epi16(r0
, BILINEAR_INTERPOLATION_BITS
);
396 /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */
397 r0
= _mm_shuffle_epi32(r0
, _MM_SHUFFLE(2, 0, 3, 1));
398 /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */
401 top0
= _mm_mulhi_epu16(top0
, uvw
);
402 bot0
= _mm_mulhi_epu16(bot0
, vw
);
403 r0
= _mm_add_epi16(top0
, bot0
);
404 r0
= _mm_srli_epi16(r0
, BILINEAR_INTERPOLATION_BITS
- 1);
407 p
= _mm_packus_epi16(r0
, r0
);
409 if (iter
->width
- i
== 1) {
410 *(uint32_t*)(iter
->buffer
+ i
) = _mm_cvtsi128_si32(p
);
413 _mm_storel_epi64((__m128i
*)(iter
->buffer
+ i
), p
);
/* Step the source y coordinate by the vertical scale (matrix[1][1])
 * for the next output row. */
418 info
->y
+= iter
->image
->transform
->matrix
[1][1];
423 static void ssse3_bilinear_cover_iter_fini(pixman_iter_t
* iter
) {
/* ssse3_bilinear_cover_iter_init: allocate and set up the two-line
 * bilinear cache for 'iter'.  On success iter->fini is published
 * (callers use it as the success flag); the failure path's 'fail:'
 * label and cleanup are missing from the extraction, as are the
 * pixman_vector_t v declaration and the iter->data assignment.
 * Surviving text kept byte-for-byte. */
427 static void ssse3_bilinear_cover_iter_init(pixman_iter_t
* iter
) {
428 int width
= iter
->width
;
429 bilinear_info_t
* info
;
/* Reject origins that cannot be represented in 16.16 fixed point. */
432 if (iter
->x
> PIXMAN_FIXED_INT_MAX
|| iter
->x
< PIXMAN_FIXED_INT_MIN
||
433 iter
->y
> PIXMAN_FIXED_INT_MAX
|| iter
->y
< PIXMAN_FIXED_INT_MIN
)
436 /* Reference point is the center of the pixel */
437 v
.vector
[0] = pixman_int_to_fixed(iter
->x
) + pixman_fixed_1
/ 2;
438 v
.vector
[1] = pixman_int_to_fixed(iter
->y
) + pixman_fixed_1
/ 2;
439 v
.vector
[2] = pixman_fixed_1
;
441 if (!pixman_transform_point_3d(iter
->image
->transform
, &v
)) goto fail
;
/* Allocation: struct plus two width-sized uint64_t line buffers plus
 * 16-byte alignment slack; presumably info->data already provides the
 * first element (hence 2*width - 1) -- TODO confirm. */
443 info
= malloc(sizeof(*info
) + (2 * width
- 1) * sizeof(uint64_t) + 64);
444 if (!info
) goto fail
;
446 info
->x
= v
.vector
[0] - pixman_fixed_1
/ 2;
447 info
->y
= v
.vector
[1] - pixman_fixed_1
/ 2;
449 #define ALIGN(addr) ((void*)((((uintptr_t)(addr)) + 15) & (~15)))
451 /* It is safe to set the y coordinates to -1 initially
452 * because COVER_CLIP_BILINEAR ensures that we will only
453 * be asked to fetch lines in the [0, height) interval
455 info
->lines
[0].y
= -1;
456 info
->lines
[0].buffer
= ALIGN(&(info
->data
[0]));
457 info
->lines
[1].y
= -1;
458 info
->lines
[1].buffer
= ALIGN(info
->lines
[0].buffer
+ width
);
/* Success: publish the destructor (ssse3_scale_data tests iter.fini). */
460 iter
->fini
= ssse3_bilinear_cover_iter_fini
;
466 /* Something went wrong, either a bad matrix or OOM; in such cases,
467 * we don't guarantee any particular rendering.
472 /* scale the src from src_width/height to dest_width/height drawn
473 * into the rectangle x,y width,height
474 * src_stride and dst_stride are 4 byte units */
/* NOTE(review): the extraction dropped lines from this function (the
 * end of the parameter list including 'height', the bits_image_t
 * 'image' and pixman_iter_t 'iter' declarations with several of their
 * field assignments, and the final 'return true'); the surviving text
 * is kept byte-for-byte. */
475 bool ssse3_scale_data(uint32_t* src
, int src_width
, int src_height
,
476 int src_stride
, uint32_t* dest
, int dest_width
,
477 int dest_height
, int dest_stride
, int x
, int y
, int width
,
479 // XXX: assert(src_width > 1)
/* Start from the identity transform and patch in the scale factors. */
480 pixman_transform_t transform
= {
481 {{pixman_fixed_1
, 0, 0}, {0, pixman_fixed_1
, 0}, {0, 0, pixman_fixed_1
}}};
482 double width_scale
= ((double)src_width
) / dest_width
;
483 double height_scale
= ((double)src_height
) / dest_height
;
/* NOTE(review): AVOID_PADDING is defined but not used in the visible
 * text -- presumably consulted by code dropped from the extraction. */
484 #define AVOID_PADDING
486 // scale up by enough that we don't read outside of the bounds of the source
487 // surface currently this is required to avoid reading out of bounds.
488 if (width_scale
< 1) {
489 width_scale
= (double)(src_width
- 1) / dest_width
;
490 transform
.matrix
[0][2] = pixman_fixed_1
/ 2;
492 if (height_scale
< 1) {
493 height_scale
= (double)(src_height
- 1) / dest_height
;
494 transform
.matrix
[1][2] = pixman_fixed_1
/ 2;
497 transform
.matrix
[0][0] = pixman_double_to_fixed(width_scale
);
498 transform
.matrix
[1][1] = pixman_double_to_fixed(height_scale
);
499 transform
.matrix
[2][2] = pixman_fixed_1
;
503 image
.transform
= &transform
;
504 image
.rowstride
= src_stride
;
511 iter
.height
= src_height
;
/* Init publishes iter.fini only on success. */
515 ssse3_bilinear_cover_iter_init(&iter
);
517 if (!iter
.fini
) return false;
/* Emit each destination row; each fetch writes one scaled row into
 * iter.buffer. */
520 for (int iy
= 0; iy
< height
; iy
++) {
521 ssse3_fetch_bilinear_cover(&iter
, NULL
);
522 iter
.buffer
+= dest_stride
;
524 ssse3_bilinear_cover_iter_fini(&iter
);