1 // Copyright (C) 2003 Dolphin Project.
3 // This program is free software: you can redistribute it and/or modify
4 // it under the terms of the GNU General Public License as published by
5 // the Free Software Foundation, version 2.0.
7 // This program is distributed in the hope that it will be useful,
8 // but WITHOUT ANY WARRANTY; without even the implied warranty of
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 // GNU General Public License 2.0 for more details.
12 // A copy of the GPL 2.0 should have been included with the program.
13 // If not, see http://www.gnu.org/licenses/
15 // Official SVN repository and contact information can be found at
16 // http://code.google.com/p/dolphin-emu/
19 #include "VideoCommon.h"
20 #include "VertexLoader.h"
21 #include "VertexLoader_Position.h"
22 #include "VertexManagerBase.h"
23 #include "CPUDetect.h"
25 #if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
26 #include <tmmintrin.h>
29 extern float posScale
;
30 extern TVtxAttr
*pVtxAttr
;
// Thoughts on the implementation of a vertex loader compiler.
// s_pCurBufferPointer should definitely be kept in a register.
// The position scale factor could be loaded into XMM7, for example.
// The pointer inside DataReadU8 should be kept in another register.
// Let's check out Pos_ReadDirect_UByte(). For Byte, replace MOVZX with MOVSX.
/*
MOVZX(32, R(EAX), MOffset(ESI, 0));
MOVZX(32, R(EBX), MOffset(ESI, 1));
MOVZX(32, R(ECX), MOffset(ESI, 2));
<move the three values into XMM0..XMM2, convert to float, scale by XMM7>
MOVSS(MOffset(EDI, 0), XMM0);
MOVSS(MOffset(EDI, 4), XMM1);
MOVSS(MOffset(EDI, 8), XMM2);

Alternatively, lookup table:
MOVZX(32, R(EAX), MOffset(ESI, 0));
MOVZX(32, R(EBX), MOffset(ESI, 1));
MOVZX(32, R(ECX), MOffset(ESI, 2));
MOV(32, R(EAX), MComplex(LUTREG, EAX, 4));
MOV(32, R(EBX), MComplex(LUTREG, EBX, 4));
MOV(32, R(ECX), MComplex(LUTREG, ECX, 4));
MOV(MOffset(EDI, 0), XMM0);
MOV(MOffset(EDI, 4), XMM1);
MOV(MOffset(EDI, 8), XMM2);

Alternatively, with PINSRB:
PINSRB(XMM0, MOffset(ESI, 0), 0);
PINSRB(XMM0, MOffset(ESI, 1), 4);
PINSRB(XMM0, MOffset(ESI, 2), 8);
<two unpacks here to sign extend>
MOVUPS(MOffset(EDI, 0), XMM0);
*/
// ==============================================================================
// Direct
// ==============================================================================
82 template <class T
, bool three
>
85 ((float*)VertexManager::s_pCurBufferPointer
)[0] = (float)(T
)DataRead
<T
>() * posScale
;
86 ((float*)VertexManager::s_pCurBufferPointer
)[1] = (float)(T
)DataRead
<T
>() * posScale
;
88 ((float*)VertexManager::s_pCurBufferPointer
)[2] = (float)(T
)DataRead
<T
>() * posScale
;
90 ((float*)VertexManager::s_pCurBufferPointer
)[2] = 0.0f
;
92 VertexManager::s_pCurBufferPointer
+= 12;
95 void LOADERDECL
Pos_ReadDirect_UByte3() { Pos_ReadDirect
<u8
, true>(); }
96 void LOADERDECL
Pos_ReadDirect_Byte3() { Pos_ReadDirect
<s8
, true>(); }
97 void LOADERDECL
Pos_ReadDirect_UShort3() { Pos_ReadDirect
<u16
, true>(); }
98 void LOADERDECL
Pos_ReadDirect_Short3() { Pos_ReadDirect
<s16
, true>(); }
99 void LOADERDECL
Pos_ReadDirect_UByte2() { Pos_ReadDirect
<u8
, false>(); }
100 void LOADERDECL
Pos_ReadDirect_Byte2() { Pos_ReadDirect
<s8
, false>(); }
101 void LOADERDECL
Pos_ReadDirect_UShort2() { Pos_ReadDirect
<u16
, false>(); }
102 void LOADERDECL
Pos_ReadDirect_Short2() { Pos_ReadDirect
<s16
, false>(); }
104 void LOADERDECL
Pos_ReadDirect_Float3()
106 // No need to use floating point here.
107 ((u32
*)VertexManager::s_pCurBufferPointer
)[0] = DataReadU32();
108 ((u32
*)VertexManager::s_pCurBufferPointer
)[1] = DataReadU32();
109 ((u32
*)VertexManager::s_pCurBufferPointer
)[2] = DataReadU32();
111 VertexManager::s_pCurBufferPointer
+= 12;
114 void LOADERDECL
Pos_ReadDirect_Float2()
116 // No need to use floating point here.
117 ((u32
*)VertexManager::s_pCurBufferPointer
)[0] = DataReadU32();
118 ((u32
*)VertexManager::s_pCurBufferPointer
)[1] = DataReadU32();
119 ((u32
*)VertexManager::s_pCurBufferPointer
)[2] = 0;
121 VertexManager::s_pCurBufferPointer
+= 12;
125 template<class T
, bool three
>
126 inline void Pos_ReadIndex_Byte(int Index
)
128 const u8
* pData
= cached_arraybases
[ARRAY_POSITION
] + ((u32
)Index
* arraystrides
[ARRAY_POSITION
]);
129 ((float*)VertexManager::s_pCurBufferPointer
)[0] = ((float)(T
)(pData
[0])) * posScale
;
130 ((float*)VertexManager::s_pCurBufferPointer
)[1] = ((float)(T
)(pData
[1])) * posScale
;
132 ((float*)VertexManager::s_pCurBufferPointer
)[2] = ((float)(T
)(pData
[2])) * posScale
;
134 ((float*)VertexManager::s_pCurBufferPointer
)[2] = 0.0f
;
136 VertexManager::s_pCurBufferPointer
+= 12;
139 template<class T
, bool three
>
140 inline void Pos_ReadIndex_Short(int Index
)
142 const u16
* pData
= (const u16
*)(cached_arraybases
[ARRAY_POSITION
] + ((u32
)Index
* arraystrides
[ARRAY_POSITION
]));
143 ((float*)VertexManager::s_pCurBufferPointer
)[0] = ((float)(T
)Common::swap16(pData
[0])) * posScale
;
144 ((float*)VertexManager::s_pCurBufferPointer
)[1] = ((float)(T
)Common::swap16(pData
[1])) * posScale
;
146 ((float*)VertexManager::s_pCurBufferPointer
)[2] = ((float)(T
)Common::swap16(pData
[2])) * posScale
;
148 ((float*)VertexManager::s_pCurBufferPointer
)[2] = 0.0f
;
150 VertexManager::s_pCurBufferPointer
+= 12;
154 void Pos_ReadIndex_Float(int Index
)
156 const u32
* pData
= (const u32
*)(cached_arraybases
[ARRAY_POSITION
] + (Index
* arraystrides
[ARRAY_POSITION
]));
157 ((u32
*)VertexManager::s_pCurBufferPointer
)[0] = Common::swap32(pData
[0]);
158 ((u32
*)VertexManager::s_pCurBufferPointer
)[1] = Common::swap32(pData
[1]);
160 ((u32
*)VertexManager::s_pCurBufferPointer
)[2] = Common::swap32(pData
[2]);
162 ((float*)VertexManager::s_pCurBufferPointer
)[2] = 0.0f
;
164 VertexManager::s_pCurBufferPointer
+= 12;
// Byte-shuffle control masks used with _mm_shuffle_epi8 by Pos_ReadIndex_Float_SSSE3.
// _mm_set_epi32 takes the highest lane first. Each lane lists its source byte
// indices in reversed order, which byte-swaps that 32-bit word; a control byte of
// 0xFF has its high bit set, so the shuffle writes zero there.

// Swaps words 0..2 and zeroes word 3.
static const __m128i kMaskSwap32_3 = _mm_set_epi32(
	0xFFFFFFFFL,
	0x08090A0BL,
	0x04050607L,
	0x00010203L);

// Swaps words 0..1 and zeroes words 2 and 3.
static const __m128i kMaskSwap32_2 = _mm_set_epi32(
	0xFFFFFFFFL,
	0xFFFFFFFFL,
	0x04050607L,
	0x00010203L);
172 void Pos_ReadIndex_Float_SSSE3(int Index
)
174 const u32
* pData
= (const u32
*)(cached_arraybases
[ARRAY_POSITION
] + (Index
* arraystrides
[ARRAY_POSITION
]));
175 const __m128i a
= _mm_loadu_si128((__m128i
*)pData
);
176 __m128i b
= _mm_shuffle_epi8(a
, three
? kMaskSwap32_3
: kMaskSwap32_2
);
177 _mm_storeu_si128((__m128i
*)VertexManager::s_pCurBufferPointer
, b
);
179 VertexManager::s_pCurBufferPointer
+= 12;
183 // Explicitly instantiate these functions to decrease the possibility of
184 // symbol binding problems when (only) calling them from JIT compiled code.
185 template void Pos_ReadDirect
<u8
, true>();
186 template void Pos_ReadDirect
<s8
, true>();
187 template void Pos_ReadDirect
<u16
, true>();
188 template void Pos_ReadDirect
<s16
, true>();
189 template void Pos_ReadDirect
<u8
, false>();
190 template void Pos_ReadDirect
<s8
, false>();
191 template void Pos_ReadDirect
<u16
, false>();
192 template void Pos_ReadDirect
<s16
, false>();
193 template void Pos_ReadIndex_Byte
<u8
, true>(int Index
);
194 template void Pos_ReadIndex_Byte
<s8
, true>(int Index
);
195 template void Pos_ReadIndex_Short
<u16
, true>(int Index
);
196 template void Pos_ReadIndex_Short
<s16
, true>(int Index
);
197 template void Pos_ReadIndex_Float
<true>(int Index
);
198 template void Pos_ReadIndex_Byte
<u8
, false>(int Index
);
199 template void Pos_ReadIndex_Byte
<s8
, false>(int Index
);
200 template void Pos_ReadIndex_Short
<u16
, false>(int Index
);
201 template void Pos_ReadIndex_Short
<s16
, false>(int Index
);
202 template void Pos_ReadIndex_Float
<false>(int Index
);
// ==============================================================================
// Index 8
// ==============================================================================
207 void LOADERDECL
Pos_ReadIndex8_UByte3() {Pos_ReadIndex_Byte
<u8
, true> (DataReadU8());}
208 void LOADERDECL
Pos_ReadIndex8_Byte3() {Pos_ReadIndex_Byte
<s8
, true> (DataReadU8());}
209 void LOADERDECL
Pos_ReadIndex8_UShort3() {Pos_ReadIndex_Short
<u16
, true> (DataReadU8());}
210 void LOADERDECL
Pos_ReadIndex8_Short3() {Pos_ReadIndex_Short
<s16
, true> (DataReadU8());}
211 void LOADERDECL
Pos_ReadIndex8_Float3() {Pos_ReadIndex_Float
<true> (DataReadU8());}
212 void LOADERDECL
Pos_ReadIndex8_UByte2() {Pos_ReadIndex_Byte
<u8
, false>(DataReadU8());}
213 void LOADERDECL
Pos_ReadIndex8_Byte2() {Pos_ReadIndex_Byte
<s8
, false>(DataReadU8());}
214 void LOADERDECL
Pos_ReadIndex8_UShort2() {Pos_ReadIndex_Short
<u16
, false>(DataReadU8());}
215 void LOADERDECL
Pos_ReadIndex8_Short2() {Pos_ReadIndex_Short
<s16
, false>(DataReadU8());}
216 void LOADERDECL
Pos_ReadIndex8_Float2() {Pos_ReadIndex_Float
<false> (DataReadU8());}
// ==============================================================================
// Index 16
// ==============================================================================
221 void LOADERDECL
Pos_ReadIndex16_UByte3() {Pos_ReadIndex_Byte
<u8
, true> (DataReadU16());}
222 void LOADERDECL
Pos_ReadIndex16_Byte3() {Pos_ReadIndex_Byte
<s8
, true> (DataReadU16());}
223 void LOADERDECL
Pos_ReadIndex16_UShort3() {Pos_ReadIndex_Short
<u16
, true> (DataReadU16());}
224 void LOADERDECL
Pos_ReadIndex16_Short3() {Pos_ReadIndex_Short
<s16
, true> (DataReadU16());}
225 void LOADERDECL
Pos_ReadIndex16_Float3() {Pos_ReadIndex_Float
<true> (DataReadU16());}
226 void LOADERDECL
Pos_ReadIndex16_UByte2() {Pos_ReadIndex_Byte
<u8
, false>(DataReadU16());}
227 void LOADERDECL
Pos_ReadIndex16_Byte2() {Pos_ReadIndex_Byte
<s8
, false>(DataReadU16());}
228 void LOADERDECL
Pos_ReadIndex16_UShort2() {Pos_ReadIndex_Short
<u16
, false>(DataReadU16());}
229 void LOADERDECL
Pos_ReadIndex16_Short2() {Pos_ReadIndex_Short
<s16
, false>(DataReadU16());}
230 void LOADERDECL
Pos_ReadIndex16_Float2() {Pos_ReadIndex_Float
<false> (DataReadU16());}
233 void LOADERDECL
Pos_ReadIndex8_Float3_SSSE3() {Pos_ReadIndex_Float_SSSE3
<true> (DataReadU8());}
234 void LOADERDECL
Pos_ReadIndex8_Float2_SSSE3() {Pos_ReadIndex_Float_SSSE3
<false> (DataReadU8());}
235 void LOADERDECL
Pos_ReadIndex16_Float3_SSSE3() {Pos_ReadIndex_Float_SSSE3
<true> (DataReadU16());}
236 void LOADERDECL
Pos_ReadIndex16_Float2_SSSE3() {Pos_ReadIndex_Float_SSSE3
<false> (DataReadU16());}
239 static TPipelineFunction tableReadPosition
[4][8][2] = {
248 {Pos_ReadDirect_UByte2
, Pos_ReadDirect_UByte3
,},
249 {Pos_ReadDirect_Byte2
, Pos_ReadDirect_Byte3
,},
250 {Pos_ReadDirect_UShort2
, Pos_ReadDirect_UShort3
,},
251 {Pos_ReadDirect_Short2
, Pos_ReadDirect_Short3
,},
252 {Pos_ReadDirect_Float2
, Pos_ReadDirect_Float3
,},
255 {Pos_ReadIndex8_UByte2
, Pos_ReadIndex8_UByte3
,},
256 {Pos_ReadIndex8_Byte2
, Pos_ReadIndex8_Byte3
,},
257 {Pos_ReadIndex8_UShort2
, Pos_ReadIndex8_UShort3
,},
258 {Pos_ReadIndex8_Short2
, Pos_ReadIndex8_Short3
,},
259 {Pos_ReadIndex8_Float2
, Pos_ReadIndex8_Float3
,},
262 {Pos_ReadIndex16_UByte2
, Pos_ReadIndex16_UByte3
,},
263 {Pos_ReadIndex16_Byte2
, Pos_ReadIndex16_Byte3
,},
264 {Pos_ReadIndex16_UShort2
, Pos_ReadIndex16_UShort3
,},
265 {Pos_ReadIndex16_Short2
, Pos_ReadIndex16_Short3
,},
266 {Pos_ReadIndex16_Float2
, Pos_ReadIndex16_Float3
,},
// Size table with the same [4][8][2] shape as tableReadPosition and indexed the
// same way ([_type][_format][_elements]); VertexLoader_Position::GetSize()
// returns its entries.
// NOTE(review): the initializer rows are not visible in the reviewed text.
270 static int tableReadPositionVertexSize
[4][8][2] = {
302 void VertexLoader_Position::Init(void) {
306 if (cpu_info
.bSSSE3
) {
307 tableReadPosition
[2][4][0] = Pos_ReadIndex8_Float2_SSSE3
;
308 tableReadPosition
[2][4][1] = Pos_ReadIndex8_Float3_SSSE3
;
309 tableReadPosition
[3][4][0] = Pos_ReadIndex16_Float2_SSSE3
;
310 tableReadPosition
[3][4][1] = Pos_ReadIndex16_Float3_SSSE3
;
317 unsigned int VertexLoader_Position::GetSize(unsigned int _type
, unsigned int _format
, unsigned int _elements
) {
318 return tableReadPositionVertexSize
[_type
][_format
][_elements
];
321 TPipelineFunction
VertexLoader_Position::GetFunction(unsigned int _type
, unsigned int _format
, unsigned int _elements
) {
322 return tableReadPosition
[_type
][_format
][_elements
];