// This file is part of the ustl library, an STL implementation.
// Copyright (C) 2005 by Mike Sharov <msharov@users.sourceforge.net>
// This file is free software, distributed under the MIT License.
9 #ifndef ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
10 #define ULAALGO_H_2E403D182E83FB596AFB800E68B255A1
17 /// \brief Creates an identity matrix in \p m
18 /// \ingroup NumericAlgorithms
19 template <size_t NX
, size_t NY
, typename T
>
20 void load_identity (matrix
<NX
,NY
,T
>& m
)
22 fill_n (m
.begin(), NX
* NY
, 0);
23 for (typename matrix
<NX
,NY
,T
>::iterator i
= m
.begin(); i
< m
.end(); i
+= NX
+ 1)
27 /// \brief Multiplies two matrices
28 /// \ingroup NumericAlgorithms
29 template <size_t NX
, size_t NY
, typename T
>
30 matrix
<NY
,NY
,T
> operator* (const matrix
<NX
,NY
,T
>& m1
, const matrix
<NY
,NX
,T
>& m2
)
33 for (uoff_t ry
= 0; ry
< NY
; ++ ry
) {
34 for (uoff_t rx
= 0; rx
< NY
; ++ rx
) {
36 for (uoff_t x
= 0; x
< NX
; ++ x
)
37 dpv
+= m1
[ry
][x
] * m2
[x
][rx
];
44 /// \brief Transforms vector \p t with matrix \p m
45 /// \ingroup NumericAlgorithms
46 template <size_t NX
, size_t NY
, typename T
>
47 tuple
<NX
,T
> operator* (const tuple
<NY
,T
>& t
, const matrix
<NX
,NY
,T
>& m
)
50 for (uoff_t x
= 0; x
< NX
; ++ x
) {
52 for (uoff_t y
= 0; y
< NY
; ++ y
)
53 dpv
+= t
[y
] * m
[y
][x
];
59 /// \brief Transposes (exchanges rows and columns) matrix \p m.
60 /// \ingroup NumericAlgorithms
61 template <size_t N
, typename T
>
62 void transpose (matrix
<N
,N
,T
>& m
)
64 for (uoff_t x
= 0; x
< N
; ++ x
)
65 for (uoff_t y
= x
; y
< N
; ++ y
)
66 swap (m
[x
][y
], m
[y
][x
]);
69 #if WANT_UNROLLED_COPY
73 #if linux // Non-linux gcc versions (BSD, Solaris) can't handle "x" constraint and provide no alternative.
// SSE overload of load_identity for matrix<4,4,float>.
// NOTE(review): this extracted span appears truncated — the 'asm ('
// opener, the input operand feeding %4 (presumably "x"(1.0f), per the
// "x" constraint mentioned in the #if linux comment above), the clobber
// list, and the closing ');' / '}' are not visible here. Verify against
// the complete file before editing.
// Per the per-line comments, the shufps sequence moves the single 1 into
// a different lane for each of the four rows stored with movups.
75 inline void load_identity (matrix
<4,4,float>& m
)
78 "movaps %4, %%xmm1 \n\t" // 1 0 0 0
79 "movups %4, %0 \n\t" // 1 0 0 0
80 "shufps $0xB1,%%xmm1,%%xmm1 \n\t" // 0 1 0 0
81 "movups %%xmm1, %1 \n\t" // 0 1 0 0
82 "shufps $0x4F,%4,%%xmm1 \n\t" // 0 0 1 0
83 "shufps $0x1B,%4,%4 \n\t" // 0 0 0 1
84 "movups %%xmm1, %2 \n\t" // 0 0 1 0
85 "movups %4, %3" // 0 0 0 1
86 : "=m"(m
[0][0]), "=m"(m
[1][0]), "=m"(m
[2][0]), "=m"(m
[3][0])
// Preloads the 16 floats of a 4x4 matrix into xmm4-xmm7, one row per
// register, for consumption by _sse_transform_to_vector below (which
// multiplies against xmm4-xmm7).
// NOTE(review): the 'asm (' opener and the closing ');' / '}' are missing
// from this extracted span — verify against the complete file.
94 inline void _sse_load_matrix (const float* m
)
97 "movups %0, %%xmm4 \n\t" // xmm4 = m[1 2 3 4]
98 "movups %1, %%xmm5 \n\t" // xmm5 = m[1 2 3 4]
99 "movups %2, %%xmm6 \n\t" // xmm6 = m[1 2 3 4]
100 "movups %3, %%xmm7" // xmm7 = m[1 2 3 4]
101 : : "m"(m
[0]), "m"(m
[4]), "m"(m
[8]), "m"(m
[12])
102 : "xmm4", "xmm5", "xmm6", "xmm7", "memory"
// Multiplies the 4-float vector already loaded into xmm0 by the matrix
// rows preloaded into xmm4-xmm7 (see _sse_load_matrix) and writes the
// 4-float product into 'result'. Each t[i] is broadcast with shufps,
// multiplied against matrix row i, and the partial products are summed.
// NOTE(review): the 'asm (' opener, the final store instruction(s) that
// move xmm0 into the %0..%3 output operands, and the closing ');' / '}'
// are missing from this extracted span — verify against the complete file.
106 inline void _sse_transform_to_vector (float* result
)
109 "movaps %%xmm0, %%xmm1 \n\t" // xmm1 = t[0 1 2 3]
110 "movaps %%xmm0, %%xmm2 \n\t" // xmm1 = t[0 1 2 3]
111 "movaps %%xmm0, %%xmm3 \n\t" // xmm1 = t[0 1 2 3]
112 "shufps $0x00, %%xmm0, %%xmm0 \n\t" // xmm0 = t[0 0 0 0]
113 "shufps $0x66, %%xmm1, %%xmm1 \n\t" // xmm1 = t[1 1 1 1]
114 "shufps $0xAA, %%xmm2, %%xmm2 \n\t" // xmm2 = t[2 2 2 2]
115 "shufps $0xFF, %%xmm3, %%xmm3 \n\t" // xmm3 = t[3 3 3 3]
116 "mulps %%xmm4, %%xmm0 \n\t" // xmm0 = t[0 0 0 0] * m[0 1 2 3]
117 "mulps %%xmm5, %%xmm1 \n\t" // xmm1 = t[1 1 1 1] * m[0 1 2 3]
118 "addps %%xmm1, %%xmm0 \n\t" // xmm0 = xmm0 + xmm1
119 "mulps %%xmm6, %%xmm2 \n\t" // xmm2 = t[2 2 2 2] * m[0 1 2 3]
120 "mulps %%xmm7, %%xmm3 \n\t" // xmm3 = t[3 3 3 3] * m[0 1 2 3]
121 "addps %%xmm3, %%xmm2 \n\t" // xmm2 = xmm2 + xmm3
122 "addps %%xmm2, %%xmm0 \n\t" // xmm0 = result
124 : "=m"(result
[0]), "=m"(result
[1]), "=m"(result
[2]), "=m"(result
[3]) :
125 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
130 inline tuple
<4,float> operator* (const tuple
<4,float>& t
, const matrix
<4,4,float>& m
)
132 tuple
<4,float> result
;
133 _sse_load_matrix (m
.begin());
134 asm ("movups %0, %%xmm0" : : "m"(t
[0]), "m"(t
[1]), "m"(t
[2]), "m"(t
[3]) : "xmm0", "memory");
135 _sse_transform_to_vector (result
.begin());
140 inline matrix
<4,4,float> operator* (const matrix
<4,4,float>& m1
, const matrix
<4,4,float>& m2
)
142 matrix
<4,4,float> result
;
143 _sse_load_matrix (m2
.begin());
144 for (uoff_t r
= 0; r
< 4; ++ r
) {
145 asm ("movups %0, %%xmm0" : : "m"(m1
[r
][0]), "m"(m1
[r
][1]), "m"(m1
[r
][2]), "m"(m1
[r
][3]) : "xmm0", "memory");
146 _sse_transform_to_vector (result
[r
]);
// NOTE(review): 3DNow! overload of the 4-float vector * 4x4 matrix
// transform. This extracted span appears truncated — the 'asm (' opener
// (before the movq strings), the closing ');', the trailing MMX-state
// cleanup (the comments below mention a femms at the end — confirm),
// and 'return result;' / '}' are not visible here. Verify against the
// complete file before editing. The instruction stream computes the two
// low components into mm3 (stored to result[0..1]) and the two high
// components into mm5 (stored to result[2..3]).
153 /// Specialization for 4-component vector transform, the slow part of 3D graphics.
155 static tuple
<4,float> operator* (const tuple
<4,float>& t
, const matrix
<4,4,float>& m
)
157 tuple
<4,float> result
;
158 // This is taken from "AMD Athlon Code Optimization Guide" from AMD. 18 cycles!
159 // If you are writing a 3D engine, you may want to copy it instead of calling it
160 // because of the femms instruction at the end, which takes 2 cycles.
162 "movq %2, %%mm0 \n\t" // y | x
163 "movq %3, %%mm1 \n\t" // w | z
164 "movq %%mm0, %%mm2 \n\t" // y | x
165 "movq %4, %%mm3 \n\t" // m[0][1] | m[0][0]
166 "punpckldq %%mm0, %%mm0 \n\t" // x | x
167 "movq %6, %%mm4 \n\t" // m[1][1] | m[1][0]
168 "pfmul %%mm0, %%mm3 \n\t" // x*m[0][1] | x*m[0][0]
169 "punpckhdq %%mm2, %%mm2 \n\t" // y | y
170 "pfmul %%mm2, %%mm4 \n\t" // y*m[1][1] | y*m[1][0]
171 "movq %5, %%mm5 \n\t" // m[0][3] | m[0][2]
172 "movq %7, %%mm7 \n\t" // m[1][3] | m[1][2]
173 "movq %%mm1, %%mm6 \n\t" // w | z
174 "pfmul %%mm0, %%mm5 \n\t" // x*m[0][3] | v0>x*m[0][2]
175 "movq %8, %%mm0 \n\t" // m[2][1] | m[2][0]
176 "punpckldq %%mm1, %%mm1 \n\t" // z | z
177 "pfmul %%mm2, %%mm7 \n\t" // y*m[1][3] | y*m[1][2]
178 "movq %9, %%mm2 \n\t" // m[2][3] | m[2][2]
179 "pfmul %%mm1, %%mm0 \n\t" // z*m[2][1] | z*m[2][0]
180 "pfadd %%mm4, %%mm3 \n\t" // x*m[0][1]+y*m[1][1] | x*m[0][0]+y*m[1][0]
181 "movq %10, %%mm4 \n\t" // m[3][1] | m[3][0]
182 "pfmul %%mm1, %%mm2 \n\t" // z*m[2][3] | z*m[2][2]
183 "pfadd %%mm7, %%mm5 \n\t" // x*m[0][3]+y*m[1][3] | x*m[0][2]+y*m[1][2]
184 "movq %11, %%mm1 \n\t" // m[3][3] | m[3][2]
185 "punpckhdq %%mm6, %%mm6 \n\t" // w | w
186 "pfadd %%mm0, %%mm3 \n\t" // x*m[0][1]+y*m[1][1]+z*m[2][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]
187 "pfmul %%mm6, %%mm4 \n\t" // w*m[3][1] | w*m[3][0]
188 "pfmul %%mm6, %%mm1 \n\t" // w*m[3][3] | w*m[3][2]
189 "pfadd %%mm2, %%mm5 \n\t" // x*m[0][3]+y*m[1][3]+z*m[2][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]
190 "pfadd %%mm4, %%mm3 \n\t" // x*m[0][1]+y*m[1][1]+z*m[2][1]+w*m[3][1] | x*m[0][0]+y*m[1][0]+z*m[2][0]+w*m[3][0]
191 "movq %%mm3, %0 \n\t" // store result->y | result->x
192 "pfadd %%mm1, %%mm5 \n\t" // x*m[0][3]+y*m[1][3]+z*m[2][3]+w*m[3][3] | x*m[0][2]+y*m[1][2]+z*m[2][2]+w*m[3][2]
193 "movq %%mm5, %1" // store result->w | result->z
194 : "=m"(result
[0]), "=m"(result
[2])
195 : "m"(t
[0]), "m"(t
[2]),
196 "m"(m
[0][0]), "m"(m
[0][2]),
197 "m"(m
[1][0]), "m"(m
[1][2]),
198 "m"(m
[2][0]), "m"(m
[2][2]),
199 "m"(m
[3][0]), "m"(m
[3][2])
200 : ALL_MMX_REGS_CHANGELIST
, "memory"
207 #else // If no processor extensions, just unroll the multiplication
209 /// Specialization for 4-component vector transform, the slow part of 3D graphics.
211 static tuple
<4,float> operator* (const tuple
<4,float>& t
, const matrix
<4,4,float>& m
)
214 for (uoff_t i
= 0; i
< 4; ++ i
)
215 tr
[i
] = t
[0] * m
[0][i
] + t
[1] * m
[1][i
] + t
[2] * m
[2][i
] + t
[3] * m
[3][i
];
219 #endif // CPU_HAS_3DNOW
220 #endif // WANT_UNROLLED_COPY