1 #include "framebuffer.hpp"
2 #include "arch-detect.hpp"
// Reports, per its name, whether SSSE3 instructions may be used; the callers
// in this file use the result to choose the SSSE3 path.
// Only the operand list of an inline-asm statement is visible in this excerpt:
// `page` is passed in through EAX, EAX is read back into `page`, ECX is read
// into `res`, and RBX and RDX are declared clobbered.
// NOTE(review): this operand pattern looks like a CPUID query, but the
// instruction itself and the test applied to `res` are not visible here —
// confirm against the full source.
9 inline bool ssse3_available()
18 : "=c"(res
), "=a"(page
) : "a"(page
) : "%rbx", "%rdx");
// PSHUFB control table that keeps bytes 0, 1, 2 of every 4-byte unit and drops
// byte 3. It is the mask the uint8_t copy_drop4 passes to ssse3_drop.
// Layout: six 16-byte rows, in the order ssse3_drop reads them (byte offsets
// 0, 16, 32, 48, 64, 80). Each entry is the index of the source byte to place
// at that output position; -1 has its high bit set, which makes PSHUFB emit a
// zero byte, so two rows aimed at the same destination can be merged with POR.
// The trailing "a -> b" remark on each row names the 16-byte source block (a)
// and the 16-byte destination block (b) that the row contributes to.
// The table is 16-byte aligned because it is used as a PSHUFB memory operand.
23 const char mask_drop4_8
[] __attribute__ ((aligned (16))) = {
24 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, //0 -> 0
25 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4, //1 -> 0
26 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, //1 -> 1
27 -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9, //2 -> 1
28 10, 12, 13, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, //2 -> 2
29 -1, -1, -1, -1, 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, //3 -> 2
// PSHUFB control table that drops byte 3 of every 4-byte unit and reverses the
// three kept bytes (output order 2, 1, 0). It is the mask the uint8_t
// copy_drop4s passes to ssse3_drop.
// Same layout as the other drop tables: six 16-byte rows at byte offsets
// 0..80, each entry a source byte index, with -1 (high bit set) producing a
// zero byte so that rows for the same destination can be ORed together.
// The "a -> b" row remarks name the source and destination 16-byte blocks.
31 const char mask_drop4s_8
[] __attribute__ ((aligned (16))) = {
32 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, //0 -> 0
33 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 1, 0, 6, //1 -> 0
34 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, -1, -1, -1, -1, //1 -> 1
35 -1, -1, -1, -1, -1, -1, -1, -1, 2, 1, 0, 6, 5, 4, 10, 9, //2 -> 1
36 8, 14, 13, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, //2 -> 2
37 -1, -1, -1, -1, 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, //3 -> 2
// Single 16-byte PSHUFB control row that exchanges bytes 0 and 2 of every
// 4-byte unit and leaves bytes 1 and 3 in place. It is the mask the uint8_t
// copy_swap4 passes to ssse3_swap. Input and output are both 16 bytes, so one
// source block maps to exactly one destination block ("0 -> 0").
39 const char mask_swap4_8
[] __attribute__ ((aligned (16))) = {
40 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15, //0 -> 0
// PSHUFB control table that keeps the first six bytes of every 8-byte unit
// (three 16-bit values) and drops bytes 6 and 7.
// Six 16-byte rows; each entry is a source byte index and -1 (high bit set)
// yields a zero byte. The "a -> b" row remarks name the 16-byte source block
// and the 16-byte destination block the row contributes to.
// NOTE(review): presumably the mask used by the uint16_t overload of
// copy_drop4 — that call's mask argument is not visible in this excerpt.
42 const char mask_drop4_16
[] __attribute__ ((aligned (16))) = {
43 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, //0 -> 0
44 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, //1 -> 0
45 4, 5, 8, 9, 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, //1 -> 1
46 -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 8, 9, //2 -> 1
47 10, 11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, //2 -> 2
48 -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, //3 -> 2
// PSHUFB control table that drops bytes 6 and 7 of every 8-byte unit and
// reverses the order of the three kept 16-bit values (byte pairs 4-5, 2-3,
// 0-1). The two bytes inside each 16-bit value keep their relative order.
// Six 16-byte rows; each entry is a source byte index and -1 (high bit set)
// yields a zero byte. The "a -> b" row remarks name the 16-byte source block
// and the 16-byte destination block the row contributes to.
// NOTE(review): presumably the mask used by the uint16_t overload of
// copy_drop4s — that call's mask argument is not visible in this excerpt.
50 const char mask_drop4s_16
[] __attribute__ ((aligned (16))) = {
51 4, 5, 2, 3, 0, 1, 12, 13, 10, 11, 8, 9, -1, -1, -1, -1, //0 -> 0
52 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 2, 3, //1 -> 0
53 0, 1, 12, 13, 10, 11, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, //1 -> 1
54 -1, -1, -1, -1, -1, -1, -1, -1, 4, 5, 2, 3, 0, 1, 12, 13, //2 -> 1
55 10, 11, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, //2 -> 2
56 -1, -1, -1, -1, 4, 5, 2, 3, 0, 1, 12, 13, 10, 11, 8, 9, //3 -> 2
// Single 16-byte PSHUFB control row that exchanges the first and third 16-bit
// values of every 8-byte unit (byte pairs 0-1 and 4-5) and leaves the second
// and fourth values in place.
// NOTE(review): presumably the mask used by the uint16_t overload of
// copy_swap4 — that call's mask argument is not visible in this excerpt.
58 const char mask_swap4_16
[] __attribute__ ((aligned (16))) = {
59 4, 5, 2, 3, 0, 1, 6, 7, 12, 13, 10, 11, 8, 9, 14, 15, //0 -> 0
// Shuffles 64 source bytes into 48 destination bytes per loop iteration using
// six PSHUFB controls taken from `masks`.
//   dest  - output; written with MOVDQA at offsets 0, 16 and 32, so it must be
//           16-byte aligned (MOVDQA faults on unaligned addresses).
//   src   - input; read with MOVDQA at offsets 0, 16, 32 and 48, same
//           alignment requirement.
//   units - used only to derive the iteration count, (units + 11) / 12.
//   masks - six consecutive 16-byte PSHUFB controls (offsets 0..80).
// NOTE(review): one iteration consumes 64 source bytes while the count is
// rounded up in groups of 12 units; how these relate depends on the unit size
// the callers intend — confirm against the full source.
// NOTE(review): no statement advancing dest/src between iterations is visible
// in this excerpt; the lines after the asm statement are missing here.
63 void ssse3_drop(uint8_t* dest
, const uint8_t* src
, size_t units
, const char* masks
)
65 size_t blocks
= (units
+ 11) / 12;
66 for(size_t i
= 0; i
< blocks
; i
++) {
// Load source blocks 0..3 into xmm0, xmm1, xmm3, xmm5.
68 "MOVDQA 0(%1),%%xmm0\n"
69 "\tMOVDQA 16(%1),%%xmm1\n"
70 "\tMOVDQA 32(%1),%%xmm3\n"
71 "\tMOVDQA 48(%1),%%xmm5\n"
// Source blocks 1 and 2 each feed two destination blocks, so duplicate them.
72 "\tMOVDQA %%xmm1,%%xmm2\n"
73 "\tMOVDQA %%xmm3,%%xmm4\n"
// Apply the six controls: 0->0, 1->0, 1->1, 2->1, 2->2, 3->2.
74 "\tPSHUFB 0(%2),%%xmm0\n"
75 "\tPSHUFB 16(%2),%%xmm1\n"
76 "\tPSHUFB 32(%2),%%xmm2\n"
77 "\tPSHUFB 48(%2),%%xmm3\n"
78 "\tPSHUFB 64(%2),%%xmm4\n"
79 "\tPSHUFB 80(%2),%%xmm5\n"
// Merge the two partial results for each destination block; lanes a control
// marked with -1 are zero, so OR combines them without overlap.
80 "\tPOR %%xmm0,%%xmm1\n"
81 "\tPOR %%xmm2,%%xmm3\n"
82 "\tPOR %%xmm4,%%xmm5\n"
// Store the three finished destination blocks.
83 "\tMOVDQA %%xmm1,0(%0)\n"
84 "\tMOVDQA %%xmm3,16(%0)\n"
85 "\tMOVDQA %%xmm5,32(%0)\n"
// Operands: %0 = dest, %1 = src, %2 = masks, all as inputs; xmm0-xmm5 clobbered.
// NOTE(review): the asm stores through dest but lists no output operand and no
// "memory" clobber — verify the compiler cannot reorder or cache around it.
86 : : "r" (dest
), "r" (src
), "r" (masks
) : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
// Shuffles one 16-byte block per loop iteration with a single PSHUFB control.
//   dest  - output; written with MOVDQA, so it must be 16-byte aligned.
//   src   - input; read with MOVDQA, same alignment requirement.
//   units - used only to derive the iteration count, (units + 3) / 4.
//   masks - one 16-byte PSHUFB control, read at offset 0.
// NOTE(review): no statement advancing dest/src between iterations is visible
// in this excerpt; the lines after the asm statement are missing here.
92 void ssse3_swap(uint8_t* dest
, const uint8_t* src
, size_t units
, const char* masks
)
94 size_t blocks
= (units
+ 3) / 4;
95 for(size_t i
= 0; i
< blocks
; i
++) {
// Load 16 bytes, permute them in the register, store 16 bytes.
97 "MOVDQA (%1),%%xmm0\n"
98 "\tPSHUFB (%2),%%xmm0\n"
99 "\tMOVDQA %%xmm0,(%0)\n"
// Operands: %0 = dest, %1 = src, %2 = masks, all as inputs; xmm0 clobbered.
// NOTE(review): the asm stores through dest but lists no output operand and no
// "memory" clobber — verify the compiler cannot reorder or cache around it.
100 : : "r" (dest
), "r" (src
), "r" (masks
) : "xmm0");
// Converts 4-byte units to 3-byte units by discarding the fourth byte of each.
//   dest  - output buffer, 3 bytes written per unit by the scalar loop.
//   src   - input buffer, viewed as raw bytes (4 per unit).
//   units - number of 4-byte units to convert.
// When ssse3_available() is true the work is handed to ssse3_drop with
// mask_drop4_8. The scalar loop below does the same byte selection
// one unit at a time.
// NOTE(review): the lines joining the SSSE3 branch and the scalar loop are not
// visible here — presumably the loop is the fallback branch; confirm.
108 void copy_drop4(uint8_t* dest
, const uint32_t* src
, size_t units
)
111 if(ssse3_available()) {
112 ssse3_drop(dest
, reinterpret_cast<const uint8_t*>(src
), units
, mask_drop4_8
);
// Byte view of the source: selection is by memory position, not by the
// numeric value of the 32-bit word.
116 const uint8_t* _src
= reinterpret_cast<const uint8_t*>(src
);
117 for(size_t i
= 0; i
< units
; i
++) {
// Copy bytes 0, 1, 2 of unit i; byte 3 is skipped.
118 dest
[3 * i
+ 0] = _src
[4 * i
+ 0];
119 dest
[3 * i
+ 1] = _src
[4 * i
+ 1];
120 dest
[3 * i
+ 2] = _src
[4 * i
+ 2];
// Converts 4-byte units to 3-byte units, discarding the fourth byte and
// reversing the order of the three kept bytes.
//   dest  - output buffer, 3 bytes written per unit by the scalar loop.
//   src   - input buffer, viewed as raw bytes (4 per unit).
//   units - number of 4-byte units to convert.
// When ssse3_available() is true the work is handed to ssse3_drop with
// mask_drop4s_8. The scalar loop below does the same byte selection
// one unit at a time.
// NOTE(review): the lines joining the SSSE3 branch and the scalar loop are not
// visible here — presumably the loop is the fallback branch; confirm.
124 void copy_drop4s(uint8_t* dest
, const uint32_t* src
, size_t units
)
127 if(ssse3_available()) {
128 ssse3_drop(dest
, reinterpret_cast<const uint8_t*>(src
), units
, mask_drop4s_8
);
132 const uint8_t* _src
= reinterpret_cast<const uint8_t*>(src
);
133 for(size_t i
= 0; i
< units
; i
++) {
// Output bytes 0, 1, 2 come from source bytes 2, 1, 0; byte 3 is skipped.
134 dest
[3 * i
+ 0] = _src
[4 * i
+ 2];
135 dest
[3 * i
+ 1] = _src
[4 * i
+ 1];
136 dest
[3 * i
+ 2] = _src
[4 * i
+ 0];
// Copies 4-byte units while exchanging bytes 0 and 2 of each unit; bytes 1 and
// 3 keep their positions. Output has the same size as the input.
//   dest  - output buffer, 4 bytes written per unit by the scalar loop.
//   src   - input buffer, viewed as raw bytes (4 per unit).
//   units - number of 4-byte units to convert.
// When ssse3_available() is true the work is handed to ssse3_swap with
// mask_swap4_8. The scalar loop below does the same permutation one unit
// at a time.
// NOTE(review): the lines joining the SSSE3 branch and the scalar loop are not
// visible here — presumably the loop is the fallback branch; confirm.
140 void copy_swap4(uint8_t* dest
, const uint32_t* src
, size_t units
)
143 if(ssse3_available()) {
144 ssse3_swap(dest
, reinterpret_cast<const uint8_t*>(src
), units
, mask_swap4_8
);
148 const uint8_t* _src
= reinterpret_cast<const uint8_t*>(src
);
149 for(size_t i
= 0; i
< units
; i
++) {
// Bytes 0 and 2 trade places.
150 dest
[4 * i
+ 0] = _src
[4 * i
+ 2];
151 dest
[4 * i
+ 1] = _src
[4 * i
+ 1];
152 dest
[4 * i
+ 2] = _src
[4 * i
+ 0];
// Byte 3 is copied unchanged.
153 dest
[4 * i
+ 3] = _src
[4 * i
+ 3];
// 16-bit overload: converts units of four 16-bit values to units of three by
// discarding the fourth value of each.
//   dest  - output buffer, three uint16_t written per unit by the scalar loop.
//   src   - input buffer, viewed as uint16_t (four per unit).
//   units - number of 8-byte units to convert.
// When ssse3_available() is true, ssse3_drop is called on the byte views of
// dest and src with the same `units` value.
// NOTE(review): the mask argument of that call and the lines joining it to the
// scalar loop are not visible in this excerpt; confirm against the full source.
157 void copy_drop4(uint16_t* dest
, const uint64_t* src
, size_t units
)
160 if(ssse3_available()) {
161 ssse3_drop(reinterpret_cast<uint8_t*>(dest
), reinterpret_cast<const uint8_t*>(src
), units
,
// 16-bit view of the source: selection is by memory position within each
// 8-byte unit.
166 const uint16_t* _src
= reinterpret_cast<const uint16_t*>(src
);
167 for(size_t i
= 0; i
< units
; i
++) {
// Copy values 0, 1, 2 of unit i; value 3 is skipped.
168 dest
[3 * i
+ 0] = _src
[4 * i
+ 0];
169 dest
[3 * i
+ 1] = _src
[4 * i
+ 1];
170 dest
[3 * i
+ 2] = _src
[4 * i
+ 2];
// 16-bit overload: converts units of four 16-bit values to units of three,
// discarding the fourth value and reversing the order of the three kept ones.
//   dest  - output buffer, three uint16_t written per unit by the scalar loop.
//   src   - input buffer, viewed as uint16_t (four per unit).
//   units - number of 8-byte units to convert.
// When ssse3_available() is true, ssse3_drop is called on the byte views of
// dest and src with the same `units` value.
// NOTE(review): the mask argument of that call and the lines joining it to the
// scalar loop are not visible in this excerpt; confirm against the full source.
174 void copy_drop4s(uint16_t* dest
, const uint64_t* src
, size_t units
)
177 if(ssse3_available()) {
178 ssse3_drop(reinterpret_cast<uint8_t*>(dest
), reinterpret_cast<const uint8_t*>(src
), units
,
183 const uint16_t* _src
= reinterpret_cast<const uint16_t*>(src
);
184 for(size_t i
= 0; i
< units
; i
++) {
// Output values 0, 1, 2 come from source values 2, 1, 0; value 3 is skipped.
185 dest
[3 * i
+ 0] = _src
[4 * i
+ 2];
186 dest
[3 * i
+ 1] = _src
[4 * i
+ 1];
187 dest
[3 * i
+ 2] = _src
[4 * i
+ 0];
// 16-bit overload: copies units of four 16-bit values while exchanging values
// 0 and 2 of each unit; values 1 and 3 keep their positions.
//   dest  - output buffer, four uint16_t written per unit by the scalar loop.
//   src   - input buffer, viewed as uint16_t (four per unit).
//   units - number of 8-byte units to convert.
// When ssse3_available() is true, ssse3_swap is called on the byte views of
// dest and src with the same `units` value.
// NOTE(review): the mask argument of that call and the lines joining it to the
// scalar loop are not visible in this excerpt; confirm against the full source.
191 void copy_swap4(uint16_t* dest
, const uint64_t* src
, size_t units
)
194 if(ssse3_available()) {
195 ssse3_swap(reinterpret_cast<uint8_t*>(dest
), reinterpret_cast<const uint8_t*>(src
), units
,
200 const uint16_t* _src
= reinterpret_cast<const uint16_t*>(src
);
201 for(size_t i
= 0; i
< units
; i
++) {
// Values 0 and 2 trade places.
202 dest
[4 * i
+ 0] = _src
[4 * i
+ 2];
203 dest
[4 * i
+ 1] = _src
[4 * i
+ 1];
204 dest
[4 * i
+ 2] = _src
[4 * i
+ 0];
// Value 3 is copied unchanged.
205 dest
[4 * i
+ 3] = _src
[4 * i
+ 3];