Actually call on_reset callback
[lsnes.git] / src / library / framebuffer-cdrop.cpp
blobd42c0e6f79a676364f773b2688095f13730800a8
1 #include "framebuffer.hpp"
2 #include "arch-detect.hpp"
3 #include <iostream>
5 namespace framebuffer
7 namespace
#ifdef ARCH_IS_I386
/**
 * Execute CPUID leaf 1 and extract bit 9 of ECX (the SSSE3 feature flag).
 *
 * NOTE(review): the clobber list names 64-bit registers (%rbx, %rdx), so this
 * presumably only assembles when ARCH_IS_I386 is set on an x86-64 target --
 * confirm against arch-detect.hpp.
 *
 * @return true if the CPU reports SSSE3 support.
 */
inline bool ssse3_probe()
{
	size_t res = 0;
	size_t page = 1;
	asm volatile(
		"cpuid\n"
		"\tshr $9,%0\n"
		"\tand $1,%0\n"
		: "=c"(res), "=a"(page) : "a"(page) : "%rbx", "%rdx");
	return res != 0;
}
#endif

/**
 * Report whether the SSSE3 code paths may be used.
 *
 * The CPUID probe is executed only once and its result is cached: CPUID is a
 * serializing instruction, so repeating it on every copy call is needlessly
 * expensive.
 *
 * @return true if ARCH_IS_I386 is defined and the CPU reports SSSE3, false otherwise.
 */
inline bool ssse3_available()
{
#ifdef ARCH_IS_I386
	static const bool available = ssse3_probe();
	return available;
#else
	return false;
#endif
}
// PSHUFB selector table for dropping the 4th byte of every 4-byte unit.
// Six 16-byte rows, consumed in order by ssse3_drop. Each entry is the index of
// the source byte to place at that output position; -1 (high bit set) makes
// PSHUFB emit a zero byte so that two shuffled registers can be OR-ed together.
// Row labels read "source 16-byte chunk -> output 16-byte chunk".
const char mask_drop4_8[96] __attribute__ ((aligned (16))) = {
	// chunk 0 -> out 0
	 0,  1,  2,  4,  5,  6,  8,  9,
	10, 12, 13, 14, -1, -1, -1, -1,
	// chunk 1 -> out 0
	-1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1,  0,  1,  2,  4,
	// chunk 1 -> out 1
	 5,  6,  8,  9, 10, 12, 13, 14,
	-1, -1, -1, -1, -1, -1, -1, -1,
	// chunk 2 -> out 1
	-1, -1, -1, -1, -1, -1, -1, -1,
	 0,  1,  2,  4,  5,  6,  8,  9,
	// chunk 2 -> out 2
	10, 12, 13, 14, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1,
	// chunk 3 -> out 2
	-1, -1, -1, -1,  0,  1,  2,  4,
	 5,  6,  8,  9, 10, 12, 13, 14,
};
// PSHUFB selector table for dropping the 4th byte of every 4-byte unit while
// also reversing the order of the three bytes that are kept (2,1,0 instead of
// 0,1,2). Layout is six 16-byte rows as consumed by ssse3_drop; -1 entries
// (high bit set) make PSHUFB emit a zero byte.
// Row labels read "source 16-byte chunk -> output 16-byte chunk".
const char mask_drop4s_8[96] __attribute__ ((aligned (16))) = {
	// chunk 0 -> out 0
	 2,  1,  0,  6,  5,  4, 10,  9,
	 8, 14, 13, 12, -1, -1, -1, -1,
	// chunk 1 -> out 0
	-1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1,  2,  1,  0,  6,
	// chunk 1 -> out 1
	 5,  4, 10,  9,  8, 14, 13, 12,
	-1, -1, -1, -1, -1, -1, -1, -1,
	// chunk 2 -> out 1
	-1, -1, -1, -1, -1, -1, -1, -1,
	 2,  1,  0,  6,  5,  4, 10,  9,
	// chunk 2 -> out 2
	 8, 14, 13, 12, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1,
	// chunk 3 -> out 2
	-1, -1, -1, -1,  2,  1,  0,  6,
	 5,  4, 10,  9,  8, 14, 13, 12,
};
// PSHUFB selector for one 16-byte register holding four 4-byte units: within
// each unit bytes 0 and 2 trade places, bytes 1 and 3 stay where they are.
const char mask_swap4_8[16] __attribute__ ((aligned (16))) = {
	 2,  1,  0,  3,
	 6,  5,  4,  7,
	10,  9,  8, 11,
	14, 13, 12, 15,
};
// PSHUFB selector table for dropping the 4th 16-bit component of every 8-byte
// unit (bytes 6-7 and 14-15 of each 16-byte chunk are discarded).
// Six 16-byte rows, consumed in order by ssse3_drop; -1 entries (high bit set)
// make PSHUFB emit a zero byte so two shuffled registers can be OR-ed together.
// Row labels read "source 16-byte chunk -> output 16-byte chunk".
const char mask_drop4_16[96] __attribute__ ((aligned (16))) = {
	// chunk 0 -> out 0
	 0,  1,  2,  3,  4,  5,  8,  9,
	10, 11, 12, 13, -1, -1, -1, -1,
	// chunk 1 -> out 0
	-1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1,  0,  1,  2,  3,
	// chunk 1 -> out 1
	 4,  5,  8,  9, 10, 11, 12, 13,
	-1, -1, -1, -1, -1, -1, -1, -1,
	// chunk 2 -> out 1
	-1, -1, -1, -1, -1, -1, -1, -1,
	 0,  1,  2,  3,  4,  5,  8,  9,
	// chunk 2 -> out 2
	10, 11, 12, 13, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1,
	// chunk 3 -> out 2
	-1, -1, -1, -1,  0,  1,  2,  3,
	 4,  5,  8,  9, 10, 11, 12, 13,
};
// PSHUFB selector table for dropping the 4th 16-bit component of every 8-byte
// unit while also reversing the order of the three components that are kept
// (byte pairs 4-5, 2-3, 0-1 instead of 0-1, 2-3, 4-5).
// Six 16-byte rows, consumed in order by ssse3_drop; -1 entries (high bit set)
// make PSHUFB emit a zero byte.
// Row labels read "source 16-byte chunk -> output 16-byte chunk".
const char mask_drop4s_16[96] __attribute__ ((aligned (16))) = {
	// chunk 0 -> out 0
	 4,  5,  2,  3,  0,  1, 12, 13,
	10, 11,  8,  9, -1, -1, -1, -1,
	// chunk 1 -> out 0
	-1, -1, -1, -1, -1, -1, -1, -1,
	-1, -1, -1, -1,  4,  5,  2,  3,
	// chunk 1 -> out 1
	 0,  1, 12, 13, 10, 11,  8,  9,
	-1, -1, -1, -1, -1, -1, -1, -1,
	// chunk 2 -> out 1
	-1, -1, -1, -1, -1, -1, -1, -1,
	 4,  5,  2,  3,  0,  1, 12, 13,
	// chunk 2 -> out 2
	10, 11,  8,  9, -1, -1, -1, -1,
	-1, -1, -1, -1, -1, -1, -1, -1,
	// chunk 3 -> out 2
	-1, -1, -1, -1,  4,  5,  2,  3,
	 0,  1, 12, 13, 10, 11,  8,  9,
};
// PSHUFB selector for one 16-byte register holding two 8-byte units of four
// 16-bit components each: components 0 and 2 trade places, components 1 and 3
// stay where they are.
const char mask_swap4_16[16] __attribute__ ((aligned (16))) = {
	 4,  5,  2,  3,  0,  1,  6,  7,
	12, 13, 10, 11,  8,  9, 14, 15,
};
#ifdef ARCH_IS_I386
/**
 * Drop the 4th component of each 4-component unit using SSSE3 byte shuffles.
 *
 * Works in blocks of 64 source bytes -> 48 destination bytes. The unit count is
 * rounded UP to whole blocks, so up to one block past the requested units may
 * be read and written.
 * NOTE(review): MOVDQA faults on addresses that are not 16-byte aligned, so
 * dest, src and masks must all be 16-byte aligned, and both buffers must be
 * padded to a whole block -- confirm that every caller guarantees this.
 *
 * @param dest      Output buffer (48 bytes written per block).
 * @param src       Input buffer (64 bytes read per block).
 * @param units     Number of source units to convert.
 * @param masks     Six consecutive 16-byte PSHUFB selector rows (96 bytes).
 * @param unit_size Size of one source unit in bytes (4 for 8-bit components,
 *                  8 for 16-bit components). Defaults to 4.
 */
void ssse3_drop(uint8_t* dest, const uint8_t* src, size_t units, const char* masks, size_t unit_size = 4)
{
	// One block is 64 source bytes. The previous "(units + 11) / 12" assumed 12
	// units per block, which overran the buffers for 4-byte units (16 per block)
	// and left the tail unconverted for 8-byte units (8 per block).
	size_t blocks = (units * unit_size + 63) / 64;
	for(size_t i = 0; i < blocks; i++) {
		// The "memory" clobber is required: the asm reads *src/*masks and
		// writes *dest through pointer inputs the compiler cannot see into.
		asm volatile(
			"MOVDQA 0(%1),%%xmm0\n"
			"\tMOVDQA 16(%1),%%xmm1\n"
			"\tMOVDQA 32(%1),%%xmm3\n"
			"\tMOVDQA 48(%1),%%xmm5\n"
			"\tMOVDQA %%xmm1,%%xmm2\n"
			"\tMOVDQA %%xmm3,%%xmm4\n"
			"\tPSHUFB 0(%2),%%xmm0\n"
			"\tPSHUFB 16(%2),%%xmm1\n"
			"\tPSHUFB 32(%2),%%xmm2\n"
			"\tPSHUFB 48(%2),%%xmm3\n"
			"\tPSHUFB 64(%2),%%xmm4\n"
			"\tPSHUFB 80(%2),%%xmm5\n"
			"\tPOR %%xmm0,%%xmm1\n"
			"\tPOR %%xmm2,%%xmm3\n"
			"\tPOR %%xmm4,%%xmm5\n"
			"\tMOVDQA %%xmm1,0(%0)\n"
			"\tMOVDQA %%xmm3,16(%0)\n"
			"\tMOVDQA %%xmm5,32(%0)\n"
			: : "r" (dest), "r" (src), "r" (masks)
			: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory");
		dest += 48;
		src += 64;
	}
}

/**
 * Permute the components of each 4-component unit using one SSSE3 byte shuffle
 * per 16 bytes.
 *
 * The unit count is rounded UP to whole 16-byte blocks, so up to one block past
 * the requested units may be read and written.
 * NOTE(review): MOVDQA faults on addresses that are not 16-byte aligned, so
 * dest, src and masks must all be 16-byte aligned -- confirm that every caller
 * guarantees this.
 *
 * @param dest      Output buffer (16 bytes written per block).
 * @param src       Input buffer (16 bytes read per block).
 * @param units     Number of source units to convert.
 * @param masks     One 16-byte PSHUFB selector row.
 * @param unit_size Size of one unit in bytes (4 for 8-bit components, 8 for
 *                  16-bit components). Defaults to 4.
 */
void ssse3_swap(uint8_t* dest, const uint8_t* src, size_t units, const char* masks, size_t unit_size = 4)
{
	// One block is 16 bytes. The previous "(units + 3) / 4" was only right for
	// 4-byte units and converted just half of a buffer of 8-byte units.
	size_t blocks = (units * unit_size + 15) / 16;
	for(size_t i = 0; i < blocks; i++) {
		// "memory" clobber: the asm reads and writes memory through its inputs.
		asm volatile(
			"MOVDQA (%1),%%xmm0\n"
			"\tPSHUFB (%2),%%xmm0\n"
			"\tMOVDQA %%xmm0,(%0)\n"
			: : "r" (dest), "r" (src), "r" (masks) : "xmm0", "memory");
		dest += 16;
		src += 16;
	}
}
#endif

/**
 * Copy `units` 4-byte units to 3-byte units, keeping bytes 0,1,2 of each unit
 * and discarding byte 3.
 *
 * Uses the SSSE3 path when available (see ssse3_drop for its alignment and
 * padding requirements), otherwise a plain byte loop.
 *
 * @param dest  Output, 3 bytes per unit.
 * @param src   Input, one 32-bit word per unit (accessed bytewise).
 * @param units Number of units to convert.
 */
void copy_drop4(uint8_t* dest, const uint32_t* src, size_t units)
{
#ifdef ARCH_IS_I386
	if(ssse3_available()) {
		ssse3_drop(dest, reinterpret_cast<const uint8_t*>(src), units, mask_drop4_8,
			sizeof(uint32_t));
		return;
	}
#endif
	const uint8_t* _src = reinterpret_cast<const uint8_t*>(src);
	for(size_t i = 0; i < units; i++) {
		dest[3 * i + 0] = _src[4 * i + 0];
		dest[3 * i + 1] = _src[4 * i + 1];
		dest[3 * i + 2] = _src[4 * i + 2];
	}
}

/**
 * Copy `units` 4-byte units to 3-byte units, discarding byte 3 and reversing
 * the order of the remaining bytes (output is source bytes 2,1,0).
 *
 * @param dest  Output, 3 bytes per unit.
 * @param src   Input, one 32-bit word per unit (accessed bytewise).
 * @param units Number of units to convert.
 */
void copy_drop4s(uint8_t* dest, const uint32_t* src, size_t units)
{
#ifdef ARCH_IS_I386
	if(ssse3_available()) {
		ssse3_drop(dest, reinterpret_cast<const uint8_t*>(src), units, mask_drop4s_8,
			sizeof(uint32_t));
		return;
	}
#endif
	const uint8_t* _src = reinterpret_cast<const uint8_t*>(src);
	for(size_t i = 0; i < units; i++) {
		dest[3 * i + 0] = _src[4 * i + 2];
		dest[3 * i + 1] = _src[4 * i + 1];
		dest[3 * i + 2] = _src[4 * i + 0];
	}
}

/**
 * Copy `units` 4-byte units, exchanging bytes 0 and 2 of each unit and leaving
 * bytes 1 and 3 in place.
 *
 * @param dest  Output, 4 bytes per unit.
 * @param src   Input, one 32-bit word per unit (accessed bytewise).
 * @param units Number of units to convert.
 */
void copy_swap4(uint8_t* dest, const uint32_t* src, size_t units)
{
#ifdef ARCH_IS_I386
	if(ssse3_available()) {
		ssse3_swap(dest, reinterpret_cast<const uint8_t*>(src), units, mask_swap4_8,
			sizeof(uint32_t));
		return;
	}
#endif
	const uint8_t* _src = reinterpret_cast<const uint8_t*>(src);
	for(size_t i = 0; i < units; i++) {
		dest[4 * i + 0] = _src[4 * i + 2];
		dest[4 * i + 1] = _src[4 * i + 1];
		dest[4 * i + 2] = _src[4 * i + 0];
		dest[4 * i + 3] = _src[4 * i + 3];
	}
}

/**
 * Copy `units` units of four 16-bit components to units of three 16-bit
 * components, keeping components 0,1,2 and discarding component 3.
 *
 * @param dest  Output, three 16-bit values per unit.
 * @param src   Input, one 64-bit word per unit (accessed as 16-bit values).
 * @param units Number of units to convert.
 */
void copy_drop4(uint16_t* dest, const uint64_t* src, size_t units)
{
#ifdef ARCH_IS_I386
	if(ssse3_available()) {
		// Units are 8 bytes here; pass that on so the block count is right.
		ssse3_drop(reinterpret_cast<uint8_t*>(dest), reinterpret_cast<const uint8_t*>(src), units,
			mask_drop4_16, sizeof(uint64_t));
		return;
	}
#endif
	const uint16_t* _src = reinterpret_cast<const uint16_t*>(src);
	for(size_t i = 0; i < units; i++) {
		dest[3 * i + 0] = _src[4 * i + 0];
		dest[3 * i + 1] = _src[4 * i + 1];
		dest[3 * i + 2] = _src[4 * i + 2];
	}
}

/**
 * Copy `units` units of four 16-bit components to units of three 16-bit
 * components, discarding component 3 and reversing the order of the remaining
 * ones (output is source components 2,1,0).
 *
 * @param dest  Output, three 16-bit values per unit.
 * @param src   Input, one 64-bit word per unit (accessed as 16-bit values).
 * @param units Number of units to convert.
 */
void copy_drop4s(uint16_t* dest, const uint64_t* src, size_t units)
{
#ifdef ARCH_IS_I386
	if(ssse3_available()) {
		// Units are 8 bytes here; pass that on so the block count is right.
		ssse3_drop(reinterpret_cast<uint8_t*>(dest), reinterpret_cast<const uint8_t*>(src), units,
			mask_drop4s_16, sizeof(uint64_t));
		return;
	}
#endif
	const uint16_t* _src = reinterpret_cast<const uint16_t*>(src);
	for(size_t i = 0; i < units; i++) {
		dest[3 * i + 0] = _src[4 * i + 2];
		dest[3 * i + 1] = _src[4 * i + 1];
		dest[3 * i + 2] = _src[4 * i + 0];
	}
}

/**
 * Copy `units` units of four 16-bit components, exchanging components 0 and 2
 * of each unit and leaving components 1 and 3 in place.
 *
 * @param dest  Output, four 16-bit values per unit.
 * @param src   Input, one 64-bit word per unit (accessed as 16-bit values).
 * @param units Number of units to convert.
 */
void copy_swap4(uint16_t* dest, const uint64_t* src, size_t units)
{
#ifdef ARCH_IS_I386
	if(ssse3_available()) {
		// Units are 8 bytes here; pass that on so the block count is right.
		ssse3_swap(reinterpret_cast<uint8_t*>(dest), reinterpret_cast<const uint8_t*>(src), units,
			mask_swap4_16, sizeof(uint64_t));
		return;
	}
#endif
	const uint16_t* _src = reinterpret_cast<const uint16_t*>(src);
	for(size_t i = 0; i < units; i++) {
		dest[4 * i + 0] = _src[4 * i + 2];
		dest[4 * i + 1] = _src[4 * i + 1];
		dest[4 * i + 2] = _src[4 * i + 0];
		dest[4 * i + 3] = _src[4 * i + 3];
	}
}