Fix libogc hardware lighting (GX_SetChanCtrl) - patch from https://sourceforge.net...
[libogc.git] / libogc / gu_psasm.S
blob5aea608f0a406ac13ea21d0ca1a6d1940df53d06
1 #include <asm.h>
// Floating-point register aliases.  Each name spells out the pair of
// values the paired-single register carries (ps0_ps1).
// A*: pairs of the first source matrix of ps_guMtxConcat (loaded from r3).
3 #define A00_A01         fr0
4 #define A02_A03         fr1
5 #define A10_A11         fr2
6 #define A12_A13         fr3
7 #define A20_A21         fr4
8 #define A22_A23         fr5
// B*: pairs of the second source matrix of ps_guMtxConcat (loaded from r4).
10 #define B00_B01         fr6
11 #define B02_B03         fr7
12 #define B10_B11         fr8
13 #define B12_B13         fr9
14 #define B20_B21         fr10
15 #define B22_B23         fr11
// D*: result pairs of ps_guMtxConcat.  The third row deliberately reuses
// fr2 and fr0, i.e. the same registers as A10_A11 and A00_A01, so those
// two source aliases must not be read after D20_D21 / D22_D23 are written.
17 #define D00_D01         fr12
18 #define D02_D03         fr13
19 #define D10_D11         fr14
20 #define D12_D13         fr15
21 #define D20_D21         fr2
22 #define D22_D23         fr0
// UNIT01: receives the Unit01 constant pair (0.0, 1.0) in ps_guMtxConcat.
24 #define UNIT01          fr31
// Vector-routine aliases.  V1_*, V2_* and D1_* are used by ps_guVecAdd and
// ps_guVecSub; the remaining names are not referenced by any routine
// visible in this listing.  These share register numbers with the matrix
// aliases above (e.g. V1_XY is fr2, like A10_A11).
26 #define RET_REG         fr1
27 #define V1_XY           fr2
28 #define V1_Z            fr3
29 #define V2_XY           fr4
30 #define V2_Z            fr5
31 #define D1_XY           fr6
32 #define D1_Z            fr7
33 #define D2_XY           fr8
34 #define D2_Z            fr9
35 #define W1_XY           fr10
36 #define W1_Z            fr11
37 #define W2_XY           fr12
38 #define W2_Z            fr13
// ps_guMtxConcat: mtxAB = mtxA * mtxB for 3x4 matrices (12 floats each),
// treating both as 4x4 matrices whose missing last row is (0,0,0,1).
//   In:   r3 = mtxA, r4 = mtxB, r5 = mtxAB (destination)
//   Uses: r6, fr0-fr13 as scratch; fr14, fr15 and fr31 are saved in a
//         64-byte stack frame and restored before returning (fr14-fr31
//         are non-volatile in the PowerPC EABI).
// Every load from mtxA and mtxB is issued before the first store to
// mtxAB, so the destination may be the same matrix as either source.
// Loads and arithmetic are interleaved on purpose; keep the order.
40         .globl  ps_guMtxConcat
41         //r3 = mtxA, r4 = mtxB, r5 = mtxAB
42 ps_guMtxConcat:
43         stwu            r1,-64(r1)  // open a 64-byte frame
44         psq_l           A00_A01,0(r3),0,0
45         stfd            fr14,8(r1)  // save D10_D11's register
46         psq_l           B00_B01,0(r4),0,0
47         lis                     r6,Unit01@ha
48         psq_l           B02_B03,8(r4),0,0
49         stfd            fr15,16(r1)  // save D12_D13's register
50         addi            r6,r6,Unit01@l  // r6 = &Unit01
51         stfd            fr31,40(r1)  // save UNIT01's register
52         psq_l           B10_B11,16(r4),0,0
53         ps_muls0        D00_D01,B00_B01,A00_A01  // row0 = B row0 * a00
54         psq_l           A10_A11,16(r3),0,0
55         ps_muls0        D02_D03,B02_B03,A00_A01
56         psq_l           UNIT01,0(r6),0,0  // UNIT01 = (0.0, 1.0)
57         ps_muls0        D10_D11,B00_B01,A10_A11  // row1 = B row0 * a10
58         psq_l           B12_B13,24(r4),0,0
59         ps_muls0        D12_D13,B02_B03,A10_A11
60         psq_l           A02_A03,8(r3),0,0
61         ps_madds1       D00_D01,B10_B11,A00_A01,D00_D01  // += B row1 * a01
62         psq_l           A12_A13,24(r3),0,0
63         ps_madds1       D10_D11,B10_B11,A10_A11,D10_D11  // += B row1 * a11
64         psq_l           B20_B21,32(r4),0,0
65         ps_madds1       D02_D03,B12_B13,A00_A01,D02_D03  // last read of A00_A01 (fr0)
66         psq_l           B22_B23,40(r4),0,0
67         ps_madds1       D12_D13,B12_B13,A10_A11,D12_D13  // last read of A10_A11 (fr2)
68         psq_l           A20_A21,32(r3),0,0
69         psq_l           A22_A23,40(r3),0,0  // final load; stores start below
70         ps_madds0       D00_D01,B20_B21,A02_A03,D00_D01  // += B row2 * a02
71         ps_madds0       D02_D03,B22_B23,A02_A03,D02_D03
72         ps_madds0       D10_D11,B20_B21,A12_A13,D10_D11  // += B row2 * a12
73         ps_madds0       D12_D13,B22_B23,A12_A13,D12_D13
74         psq_st          D00_D01,0(r5),0,0
75         ps_muls0        D20_D21,B00_B01,A20_A21  // row2 starts; fr2 is reused as D20_D21
76         ps_madds1       D02_D03,UNIT01,A02_A03,D02_D03  // += (0, a03): implicit (0,0,0,1) row of B
77         ps_muls0        D22_D23,B02_B03,A20_A21  // fr0 is reused as D22_D23
78         psq_st          D10_D11,16(r5),0,0
79         ps_madds1       D12_D13,UNIT01,A12_A13,D12_D13  // += (0, a13)
80         psq_st          D02_D03,8(r5),0,0
81         ps_madds1       D20_D21,B10_B11,A20_A21,D20_D21  // += B row1 * a21
82         ps_madds1       D22_D23,B12_B13,A20_A21,D22_D23
83         ps_madds0       D20_D21,B20_B21,A22_A23,D20_D21  // += B row2 * a22
84         lfd                     fr14,8(r1)  // restore fr14 (D10_D11 already stored)
85         psq_st          D12_D13,24(r5),0,0
86         ps_madds0       D22_D23,B22_B23,A22_A23,D22_D23
87         psq_st          D20_D21,32(r5),0,0
88         ps_madds1       D22_D23,UNIT01,A22_A23,D22_D23  // += (0, a23)
89         lfd                     fr15,16(r1)  // restore fr15 (D12_D13 already stored)
90         psq_st          D22_D23,40(r5),0,0
91         lfd                     fr31,40(r1)  // restore fr31
92         addi            r1,r1,64  // close the frame
93         blr
// ps_guMtxIdentity: write a 3x4 identity matrix to the 48 bytes at r3.
//   In:   r3 = destination matrix
//   Uses: r9, fr0-fr3; no stack frame.
// NOTE(review): the merges below read ps1 of fr0/fr1 after a scalar lfs,
// so this relies on lfs filling both halves of the register -- confirm
// against the Gekko manual.
95         .globl ps_guMtxIdentity
96         //r3 == mtx
97 ps_guMtxIdentity:
98         lis                     r9,Unit01@ha
99         addi            r9,r9,Unit01@l  // r9 = &Unit01
100         lfs                     fr0,0(r9)  // fr0 = 0.0
101         lfs                     fr1,4(r9)  // fr1 = 1.0
102         psq_st          fr0,8(r3),0,0  // m[0][2], m[0][3] = 0
103         ps_merge01      fr2,fr0,fr1  // fr2 = (fr0.ps0, fr1.ps1): intended (0, 1)
104         psq_st          fr0,24(r3),0,0  // m[1][2], m[1][3] = 0
105         ps_merge10      fr3,fr1,fr0  // fr3 = (fr1.ps1, fr0.ps0): intended (1, 0)
106         psq_st          fr0,32(r3),0,0  // m[2][0], m[2][1] = 0
107         psq_st          fr2,16(r3),0,0  // m[1][0], m[1][1] = fr2
108         psq_st          fr3,0(r3),0,0  // m[0][0], m[0][1] = fr3
109         psq_st          fr3,40(r3),0,0  // m[2][2], m[2][3] = fr3
110         blr
// ps_guMtxCopy: copy 48 bytes (a 3x4 float matrix) from r3 to r4 as six
// paired loads/stores, in ascending address order.
//   In:   r3 = src, r4 = dst
//   Uses: fr0-fr5; no stack frame.
// Each pair is stored right after it is loaded, so a partially
// overlapping src/dst is copied pair by pair, not as a snapshot.
112         .globl ps_guMtxCopy
113         //r3 = src, r4 = dst
114 ps_guMtxCopy:
115         psq_l           fr0,0(r3),0,0
116         psq_st          fr0,0(r4),0,0
117         psq_l           fr1,8(r3),0,0
118         psq_st          fr1,8(r4),0,0
119         psq_l           fr2,16(r3),0,0
120         psq_st          fr2,16(r4),0,0
121         psq_l           fr3,24(r3),0,0
122         psq_st          fr3,24(r4),0,0
123         psq_l           fr4,32(r3),0,0
124         psq_st          fr4,32(r4),0,0
125         psq_l           fr5,40(r3),0,0
126         psq_st          fr5,40(r4),0,0
127         blr
// ps_guMtxTranspose: transpose the 3x3 part of the matrix at r3 into r4
// and write 0.0 into the fourth column of the result.  The fourth
// column of the source (offsets 12, 28, 44) is never read.
//   In:   r3 = src, r4 = xpose (destination)
//   Uses: r9, fr0-fr7; no stack frame.
// Every source offset is loaded before the same offset is stored, so
// calling with r3 == r4 transposes in place.
129         .globl ps_guMtxTranspose
130         //r3 = src, r4 = xpose
131 ps_guMtxTranspose:
132         lis                     r9,Unit01@ha
133         addi            r9,r9,Unit01@l
134         lfs                     fr0,0(r9)  // fr0 = 0.0
135         psq_l           fr1,0(r3),0,0  // (m00, m01)
136         stfs            fr0,44(r4)  // xpose[2][3] = 0
137         psq_l           fr2,16(r3),0,0  // (m10, m11)
138         ps_merge00      fr5,fr1,fr2  // (m00, m10)
139         psq_l           fr3,8(r3),1,0  // (m02, 1.0): single-element load
140         ps_merge11      fr6,fr1,fr2  // (m01, m11)
141         psq_l           fr4,24(r3),1,0  // (m12, 1.0)
142         psq_st          fr5,0(r4),0,0  // xpose row 0, columns 0-1
143         psq_l           fr1,32(r3),0,0  // (m20, m21)
144         ps_merge00      fr7,fr3,fr4  // (m02, m12)
145         psq_st          fr6,16(r4),0,0  // xpose row 1, columns 0-1
146         ps_merge00      fr5,fr1,fr0  // (m20, 0)
147         psq_st          fr7,32(r4),0,0  // xpose row 2, columns 0-1
148         ps_merge10      fr6,fr1,fr0  // (m21, 0)
149         psq_st          fr5,8(r4),0,0  // xpose[0][2] = m20, xpose[0][3] = 0
150         lfs                     fr3,40(r3)  // m22
151         psq_st          fr6,24(r4),0,0  // xpose[1][2] = m21, xpose[1][3] = 0
152         stfs            fr3,40(r4)  // xpose[2][2] = m22
153         blr
// ps_guMtxInverse: invert the 3x4 matrix at r3 (3x3 part plus a
// translation column) and store the result at r4.
//   In:   r3 = src, r4 = inv (destination)
//   Out:  r3 = 0 if the determinant compares equal to zero (nothing is
//              written to r4 in that case), 1 otherwise.
//   Uses: fr0-fr13, cr0; no stack frame.
// All loads from src precede the first store to inv, so r3 == r4 is safe.
// The reciprocal of the determinant comes from the fres estimate plus
// one Newton-Raphson step, so it is an approximation, not a true divide.
155         .globl ps_guMtxInverse
156         //r3 = src, r4 = inv
157 ps_guMtxInverse:
158         psq_l           fr0,0(r3),1,0  // (m00, 1.0)
159         psq_l           fr1,4(r3),0,0  // (m01, m02)
160         psq_l           fr2,16(r3),1,0  // (m10, 1.0)
161         ps_merge10      fr6,fr1,fr0  // (m02, m00)
162         psq_l           fr3,20(r3),0,0  // (m11, m12)
163         psq_l           fr4,32(r3),1,0  // (m20, 1.0)
164         ps_merge10      fr7,fr3,fr2  // (m12, m10)
165         psq_l           fr5,36(r3),0,0  // (m21, m22)
        // fr8..fr13 accumulate the 2x2 cofactor terms (product, then msub).
166         ps_mul          fr11,fr3,fr6
167         ps_mul          fr13,fr5,fr7
168         ps_merge10      fr8,fr5,fr4  // (m22, m20)
169         ps_msub         fr11,fr1,fr7,fr11
170         ps_mul          fr12,fr1,fr8
171         ps_msub         fr13,fr3,fr8,fr13
172         ps_mul          fr10,fr3,fr4
173         ps_msub         fr12,fr5,fr6,fr12
174         ps_mul          fr9,fr0,fr5
175         ps_mul          fr8,fr1,fr2
176         ps_sub          fr6,fr6,fr6  // fr6 = 0 for the determinant test
177         ps_msub         fr10,fr2,fr5,fr10
        // determinant (ps0 of fr7) = m00*fr13 + m10*fr12 + m20*fr11
178         ps_mul          fr7,fr0,fr13
179         ps_msub         fr9,fr1,fr4,fr9
180         ps_madd         fr7,fr2,fr12,fr7
181         ps_msub         fr8,fr0,fr3,fr8
182         ps_madd         fr7,fr4,fr11,fr7
183         ps_cmpo0        cr0,fr7,fr6  // compare ps0 (determinant) with 0
184         bne                     0f
185         li                      r3,0  // singular: report failure
186         blr
188 0:      fres            fr0,fr7  // e = estimate of 1/det
189         ps_add          fr6,fr0,fr0  // 2e
190         ps_mul          fr5,fr0,fr0  // e*e
191         ps_nmsub        fr0,fr7,fr5,fr6  // refined 1/det = 2e - det*e*e
192         lfs                     fr1,12(r3)  // m03
193         ps_muls0        fr13,fr13,fr0  // scale cofactors by 1/det
194         lfs                     fr2,28(r3)  // m13
195         ps_muls0        fr12,fr12,fr0
196         lfs                     fr3,44(r3)  // m23 (last load from src)
197         ps_muls0        fr11,fr11,fr0
198         ps_merge00      fr5,fr13,fr12
199         ps_muls0        fr10,fr10,fr0
200         ps_merge11      fr4,fr13,fr12
201         ps_muls0        fr9,fr9,fr0
202         psq_st          fr5,0(r4),0,0  // inv[0][0..1]
203         ps_mul          fr6,fr13,fr1
204         psq_st          fr4,16(r4),0,0  // inv[1][0..1]
205         ps_muls0        fr8,fr8,fr0
206         ps_madd         fr6,fr12,fr2,fr6
207         psq_st          fr10,32(r4),1,0  // inv[2][0]
208         ps_nmadd        fr6,fr11,fr3,fr6  // -(fr13*m03 + fr12*m13 + fr11*m23)
209         psq_st          fr9,36(r4),1,0  // inv[2][1]
210         ps_mul          fr7,fr10,fr1
211         ps_merge00      fr5,fr11,fr6
212         psq_st          fr8,40(r4),1,0  // inv[2][2]
213         ps_merge11      fr4,fr11,fr6
214         psq_st          fr5,8(r4),0,0  // inv[0][2..3]
215         ps_madd         fr7,fr9,fr2,fr7
216         psq_st          fr4,24(r4),0,0  // inv[1][2..3]
217         ps_nmadd        fr7,fr8,fr3,fr7  // -(fr10*m03 + fr9*m13 + fr8*m23)
218         li                      r3,1  // success
219         psq_st          fr7,44(r4),1,0  // inv[2][3]
220         blr
// ps_guMtxInvXpose: compute the inverse-transpose of the 3x3 part of
// the matrix at r3 and store it at r4 with the fourth column set to 0.
// The fourth column of src is never read.
//   In:   r3 = src, r4 = invx (destination)
//   Out:  r3 = 0 if the determinant compares equal to zero (nothing is
//              written to r4 in that case), 1 otherwise.
//   Uses: fr0-fr13, cr0; no stack frame.
// All loads from src precede the first store, so r3 == r4 is safe.
// 1/det is the fres estimate refined by one Newton-Raphson step.
222         .globl ps_guMtxInvXpose
223         //r3 = src, r4 = invx
224 ps_guMtxInvXpose:
225         psq_l       fr0, 0(r3), 1, 0  // (m00, 1.0)
226         psq_l       fr1, 4(r3), 0, 0  // (m01, m02)
227         psq_l       fr2, 16(r3), 1, 0  // (m10, 1.0)
228         ps_merge10  fr6, fr1, fr0  // (m02, m00)
229         psq_l       fr3, 20(r3), 0, 0  // (m11, m12)
230         psq_l       fr4, 32(r3), 1, 0  // (m20, 1.0)
231         ps_merge10  fr7, fr3, fr2  // (m12, m10)
232         psq_l       fr5, 36(r3), 0, 0  // (m21, m22); last load from src
        // fr8..fr13 accumulate the 2x2 cofactor terms (product, then msub).
233         ps_mul      fr11, fr3, fr6
234         ps_merge10  fr8, fr5, fr4  // (m22, m20)
235         ps_mul      fr13, fr5, fr7
236         ps_msub     fr11, fr1, fr7, fr11
237         ps_mul      fr12, fr1, fr8
238         ps_msub     fr13, fr3, fr8, fr13
239         ps_msub     fr12, fr5, fr6, fr12
240         ps_mul      fr10, fr3, fr4
241         ps_mul      fr9,  fr0, fr5
242         ps_mul      fr8,  fr1, fr2
243         ps_msub     fr10, fr2, fr5, fr10
244         ps_msub     fr9,  fr1, fr4, fr9
245         ps_msub     fr8,  fr0, fr3, fr8
        // determinant (ps0 of fr7) = m00*fr13 + m10*fr12 + m20*fr11
246         ps_mul      fr7, fr0, fr13
247         ps_sub      fr1, fr1, fr1  // fr1 = 0: test value and column-3 fill
248         ps_madd     fr7, fr2, fr12, fr7
249         ps_madd     fr7, fr4, fr11, fr7
250         ps_cmpo0    cr0, fr7, fr1  // compare ps0 (determinant) with 0
251         bne         0f
252         addi        r3, 0, 0  // singular: return 0
253         blr
255 0:      fres        fr0, fr7  // e = estimate of 1/det
256         psq_st      fr1,  12(r4), 1, 0  // invx[0][3] = 0
257         ps_add      fr6, fr0, fr0  // 2e
258         ps_mul      fr5, fr0, fr0  // e*e
259         psq_st      fr1,  28(r4), 1, 0  // invx[1][3] = 0
260         ps_nmsub    fr0, fr7, fr5, fr6  // refined 1/det = 2e - det*e*e
261         psq_st      fr1,  44(r4), 1, 0  // invx[2][3] = 0
262         ps_muls0    fr13, fr13, fr0  // scale cofactors by 1/det
263         ps_muls0    fr12, fr12, fr0
264         ps_muls0    fr11, fr11, fr0
265         psq_st      fr13,  0(r4), 0, 0  // invx[0][0..1]
266         psq_st      fr12,  16(r4), 0, 0  // invx[1][0..1]
267         ps_muls0    fr10, fr10, fr0
268         ps_muls0    fr9,  fr9,  fr0
269         psq_st      fr11,  32(r4), 0, 0  // invx[2][0..1]
270         psq_st      fr10,  8(r4), 1, 0  // invx[0][2]
271         ps_muls0    fr8,  fr8,  fr0
272         addi        r3, 0, 1  // success: return 1
273         psq_st      fr9,   24(r4), 1, 0  // invx[1][2]
274         psq_st      fr8,   40(r4), 1, 0  // invx[2][2]
275         blr
// ps_guMtxScale: build a 3x4 scale matrix at r3: xS, yS, zS on the
// diagonal (offsets 0, 20, 40) and 0.0 in every other element.
//   In:   r3 = mtx (destination), fr1 = xS, fr2 = yS, fr3 = zS
//   Uses: r9, fr0; no stack frame.
277         .globl ps_guMtxScale
278         //r3 = mtx,fr1 = xS,fr2 = yS,fr3 = zS
279 ps_guMtxScale:
280         lis                     r9,Unit01@ha
281         addi            r9,r9,Unit01@l
282         lfs                     fr0,0(r9)  // fr0 = 0.0
283         stfs            fr1,0(r3)  // m[0][0] = xS
284         psq_st          fr0,4(r3),0,0  // m[0][1], m[0][2] = 0
285         psq_st          fr0,12(r3),0,0  // m[0][3], m[1][0] = 0
286         stfs            fr2,20(r3)  // m[1][1] = yS
287         psq_st          fr0,24(r3),0,0  // m[1][2], m[1][3] = 0
288         psq_st          fr0,32(r3),0,0  // m[2][0], m[2][1] = 0
289         stfs            fr3,40(r3)  // m[2][2] = zS
290         stfs            fr0,44(r3)  // m[2][3] = 0
291         blr
// ps_guMtxScaleApply: scale the matrix at r3 row by row and store it at
// r4: all four elements of row 0 are multiplied by xS, row 1 by yS and
// row 2 by zS (the fourth column is scaled too).
//   In:   r3 = src, r4 = dst, fr1 = xS, fr2 = yS, fr3 = zS
//   Uses: fr1-fr8; no stack frame.
// Stores to dst start before the last src pair is loaded, but each
// offset is loaded before it is stored, so r3 == r4 works in place.
293         .globl ps_guMtxScaleApply
294         //r3 = src,r4 = dst,fr1 = xS,fr2 = yS,fr3 = zS
295 ps_guMtxScaleApply:
296         frsp            fr1,fr1  // round the scale factors to single precision
297         psq_l           fr4,0(r3),0,0
298         frsp            fr2,fr2
299         psq_l           fr5,8(r3),0,0
300         frsp            fr3,fr3
301         ps_muls0        fr4,fr4,fr1  // row 0 *= xS
302         psq_l           fr6,16(r3),0,0
303         ps_muls0        fr5,fr5,fr1
304         psq_l           fr7,24(r3),0,0
305         ps_muls0        fr6,fr6,fr2  // row 1 *= yS
306         psq_l           fr8,32(r3),0,0
307         psq_st          fr4,0(r4),0,0
308         ps_muls0        fr7,fr7,fr2  // last use of yS in fr2
309         psq_l           fr2,40(r3),0,0  // fr2 is reused for the final src pair
310         psq_st          fr5,8(r4),0,0
311         ps_muls0        fr8,fr8,fr3  // row 2 *= zS
312         psq_st          fr6,16(r4),0,0
313         ps_muls0        fr2,fr2,fr3
314         psq_st          fr7,24(r4),0,0
315         psq_st          fr8,32(r4),0,0
316         psq_st          fr2,40(r4),0,0
317         blr
// ps_guMtxApplyScale: scale the matrix at r3 column by column and store
// it at r4: every row is multiplied element-wise by (xS, yS, zS, 1.0),
// so the fourth column is copied unchanged.
//   In:   r3 = src, r4 = dst, fr1 = xS, fr2 = yS, fr3 = zS
//   Uses: r9, fr1-fr8, fr10, fr11; no stack frame.
// Each offset is loaded before it is stored, so r3 == r4 works in place.
319         .globl ps_guMtxApplyScale
320         //r3 = src,r4 = dst,fr1 = xS,fr2 = yS,fr3 = zS
321 ps_guMtxApplyScale:
322         lis                     r9,Unit01@ha
323         addi            r9,r9,Unit01@l
324         lfs                     fr6,4(r9)  // fr6 = 1.0
325         frsp            fr1,fr1  // round the scale factors to single precision
326         psq_l           fr4,0(r3),0,0
327         frsp            fr2,fr2
328         psq_l           fr5,8(r3),0,0
329         frsp            fr3,fr3
330         ps_merge00      fr10,fr1,fr2  // (xS, yS)
331         ps_merge00      fr11,fr3,fr6  // (zS, 1.0)
332         ps_mul          fr4,fr4,fr10
333         psq_l           fr6,16(r3),0,0  // fr6 is reused once 1.0 is merged into fr11
334         ps_mul          fr5,fr5,fr11
335         psq_l           fr7,24(r3),0,0
336         ps_mul          fr6,fr6,fr10
337         psq_l           fr8,32(r3),0,0
338         psq_st          fr4,0(r4),0,0
339         ps_mul          fr7,fr7,fr11
340         psq_l           fr2,40(r3),0,0  // fr2 is reused for the final src pair
341         psq_st          fr5,8(r4),0,0
342         ps_mul          fr8,fr8,fr10
343         psq_st          fr6,16(r4),0,0
344         ps_mul          fr2,fr2,fr11
345         psq_st          fr7,24(r4),0,0
346         psq_st          fr8,32(r4),0,0
347         psq_st          fr2,40(r4),0,0
348         blr
// ps_guMtxTrans: build a 3x4 translation matrix at r3: 1.0 on the
// diagonal (offsets 0, 20, 40), xT/yT/zT in the fourth column
// (offsets 12, 28, 44) and 0.0 elsewhere.
//   In:   r3 = mtx (destination), fr1 = xT, fr2 = yT, fr3 = zT
//   Uses: r9, fr4, fr5; no stack frame.
350         .globl ps_guMtxTrans
351         //r3 = mtx,fr1 = xT,fr2 = yT,fr3 = zT
352 ps_guMtxTrans:
353         lis                     r9,Unit01@ha
354         addi            r9,r9,Unit01@l
355         lfs                     fr4,0(r9)  // fr4 = 0.0
356         lfs                     fr5,4(r9)  // fr5 = 1.0
357         stfs            fr4,16(r3)  // m[1][0] = 0
358         stfs            fr1,12(r3)  // m[0][3] = xT
359         stfs            fr2,28(r3)  // m[1][3] = yT
360         psq_st          fr4,4(r3),0,0  // m[0][1], m[0][2] = 0
361         psq_st          fr4,32(r3),0,0  // m[2][0], m[2][1] = 0
362         stfs            fr5,20(r3)  // m[1][1] = 1
363         stfs            fr4,24(r3)  // m[1][2] = 0
364         stfs            fr5,40(r3)  // m[2][2] = 1
365         stfs            fr3,44(r3)  // m[2][3] = zT
366         stfs            fr5,0(r3)  // m[0][0] = 1
367         blr
// ps_guMtxTransApply: copy the matrix at r3 to r4, adding xT, yT and zT
// to the fourth-column elements of rows 0, 1 and 2 respectively.
//   In:   r3 = src, r4 = dst, fr1 = xT, fr2 = yT, fr3 = zT
//   Uses: fr1-fr9; no stack frame.
// All six loads precede the first store, so r3 == r4 is safe.
369         .globl ps_guMtxTransApply
370         //r3 = src,r4 = dst,fr1 = xT,fr2 = yT,fr3 = zT
371 ps_guMtxTransApply:
372         psq_l           fr4,0(r3),0,0
373         frsp            fr1,fr1  // round the offsets to single precision
374         psq_l           fr5,8(r3),0,0  // (m02, m03)
375         frsp            fr2,fr2
376         psq_l           fr7,24(r3),0,0  // (m12, m13)
377         frsp            fr3,fr3
378         psq_l           fr8,40(r3),0,0  // (m22, m23)
379         ps_sum1         fr5,fr1,fr5,fr5  // (m02, xT + m03)
380         psq_l           fr6,16(r3),0,0
381         ps_sum1         fr7,fr2,fr7,fr7  // (m12, yT + m13)
382         psq_l           fr9,32(r3),0,0
383         ps_sum1         fr8,fr3,fr8,fr8  // (m22, zT + m23)
384         psq_st          fr4,0(r4),0,0
385         psq_st          fr5,8(r4),0,0
386         psq_st          fr6,16(r4),0,0
387         psq_st          fr7,24(r4),0,0
388         psq_st          fr9,32(r4),0,0
389         psq_st          fr8,40(r4),0,0
390         blr
// ps_guMtxApplyTrans: copy the 3x3 part of the matrix at r3 to r4 and
// replace each row's fourth element with
//     m[i][0]*xT + m[i][1]*yT + m[i][2]*zT + m[i][3]
// i.e. the row dotted with (xT, yT, zT, 1.0).
//   In:   r3 = src, r4 = dst, fr1 = xT, fr2 = yT, fr3 = zT
//   Uses: r9, fr1-fr12; no stack frame.
// All six loads precede the first store, so r3 == r4 is safe.
392         .globl ps_guMtxApplyTrans
393         //r3 = src,r4 = dst,fr1 = xT,fr2 = yT,fr3 = zT
394 ps_guMtxApplyTrans:
395         lis                     r9,Unit01@ha
396         addi            r9,r9,Unit01@l
397         lfs                     fr6,4(r9)  // fr6 = 1.0
398         psq_l           fr4,0(r3),0,0  // (m00, m01)
399         frsp            fr1,fr1
400         psq_l           fr5,8(r3),0,0  // (m02, m03)
401         frsp            fr2,fr2
402         ps_merge00      fr10,fr1,fr2  // (xT, yT)
403         psq_l           fr7,24(r3),0,0  // (m12, m13)
404         frsp            fr3,fr3
405         ps_mul          fr1,fr4,fr10  // (m00*xT, m01*yT)
406         ps_merge00      fr11,fr3,fr6  // (zT, 1.0)
407         psq_l           fr8,40(r3),0,0  // (m22, m23)
408         ps_madd         fr2,fr5,fr11,fr1  // + (m02*zT, m03)
409         psq_l           fr6,16(r3),0,0  // (m10, m11); fr6 reused after the merge
410         ps_sum0         fr3,fr2,fr3,fr2  // ps0 = row 0 dot (xT,yT,zT,1)
411         psq_l           fr9,32(r3),0,0  // (m20, m21); last load from src
412         ps_mul          fr12,fr6,fr10
413         psq_st          fr4,0(r4),0,0
414         ps_madd         fr4,fr7,fr11,fr12
415         psq_st          fr5,8(r4),1,0  // m02 only; the new m03 is stored next
416         ps_sum0         fr12,fr4,fr12,fr4  // ps0 = row 1 dot (xT,yT,zT,1)
417         psq_st          fr3,12(r4),1,0  // dst[0][3]
418         ps_mul          fr3,fr9,fr10
419         psq_st          fr6,16(r4),0,0
420         ps_madd         fr2,fr8,fr11,fr3
421         psq_st          fr7,24(r4),1,0  // m12 only
422         ps_sum0         fr3,fr2,fr3,fr2  // ps0 = row 2 dot (xT,yT,zT,1)
423         psq_st          fr12,28(r4),1,0  // dst[1][3]
424         psq_st          fr9,32(r4),0,0
425         psq_st          fr8,40(r4),1,0  // m22 only
426         psq_st          fr3,44(r4),1,0  // dst[2][3]
427         blr
// ps_guMtxRotTrig: build a 3x4 rotation matrix about a principal axis
// from a precomputed sine and cosine.
//   In:   r3 = mt (destination), r4 = axis character ('x', 'y' or 'z';
//         bit 0x20 is forced on first, so upper case is accepted too),
//         fr1 = sinA, fr2 = cosA
//   Uses: r4 (modified), r9, fr1-fr9, cr0; no stack frame.
// Any other axis value falls through to the final blr and leaves the
// matrix untouched.
// The local labels 0:, 1:, 2: and 3: are the targets of the branches
// below; without them the 0f..3f references cannot be resolved.
429         .globl ps_guMtxRotTrig
430         //r3 = mt,r4 = axis,fr1 = sinA,fr2 = cosA
431 ps_guMtxRotTrig:
432         lis                     r9,Unit01@ha
433         addi            r9,r9,Unit01@l
434         frsp            fr1,fr1  // sin, single precision
435         lfs                     fr3,0(r9)  // fr3 = 0.0
436         frsp            fr2,fr2  // cos, single precision
437         lfs                     fr4,4(r9)  // fr4 = 1.0
438         ori                     r4,r4,0x20  // fold 'X'/'Y'/'Z' onto 'x'/'y'/'z'
439         ps_neg          fr5,fr1  // fr5 = -sin
440         cmplwi          r4,'x'
441         beq                     0f
442         cmplwi          r4,'y'
443         beq                     1f
444         cmplwi          r4,'z'
445         beq                     2f
446         b                       3f  // unknown axis: write nothing
        // Rotation about X: rows (1,0,0,0) (0,cos,-sin,0) (0,sin,cos,0)
447 0:
448         psq_st          fr4,0(r3),1,0  // m[0][0] = 1
449         psq_st          fr3,4(r3),0,0  // m[0][1], m[0][2] = 0
450         ps_merge00      fr6,fr1,fr2  // (sin, cos)
451         psq_st          fr3,12(r3),0,0  // m[0][3], m[1][0] = 0
452         ps_merge00      fr7,fr2,fr5  // (cos, -sin)
453         psq_st          fr3,28(r3),0,0  // m[1][3], m[2][0] = 0
454         psq_st          fr3,44(r3),1,0  // m[2][3] = 0
455         psq_st          fr6,36(r3),0,0  // m[2][1], m[2][2] = sin, cos
456         psq_st          fr7,20(r3),0,0  // m[1][1], m[1][2] = cos, -sin
457         b                       3f
        // Rotation about Y: rows (cos,0,sin,0) (0,1,0,0) (-sin,0,cos,0)
458 1:
459         ps_merge00      fr6,fr2,fr3  // (cos, 0)
460         ps_merge00      fr7,fr3,fr4  // (0, 1)
461         psq_st          fr3,24(r3),0,0  // m[1][2], m[1][3] = 0
462         psq_st          fr6,0(r3),0,0  // m[0][0], m[0][1] = cos, 0
463         ps_merge00      fr8,fr5,fr3  // (-sin, 0)
464         ps_merge00      fr9,fr1,fr3  // (sin, 0)
465         psq_st          fr6,40(r3),0,0  // m[2][2], m[2][3] = cos, 0
466         psq_st          fr7,16(r3),0,0  // m[1][0], m[1][1] = 0, 1
467         psq_st          fr9,8(r3),0,0  // m[0][2], m[0][3] = sin, 0
468         psq_st          fr8,32(r3),0,0  // m[2][0], m[2][1] = -sin, 0
469         b                       3f
        // Rotation about Z: rows (cos,-sin,0,0) (sin,cos,0,0) (0,0,1,0)
470 2:
471         psq_st          fr3,8(r3),0,0  // m[0][2], m[0][3] = 0
472         ps_merge00      fr6,fr1,fr2  // (sin, cos)
473         ps_merge00      fr8,fr2,fr5  // (cos, -sin)
474         psq_st          fr3,24(r3),0,0  // m[1][2], m[1][3] = 0
475         psq_st          fr3,32(r3),0,0  // m[2][0], m[2][1] = 0
476         ps_merge00      fr7,fr4,fr3  // (1, 0)
477         psq_st          fr6,16(r3),0,0  // m[1][0], m[1][1] = sin, cos
478         psq_st          fr8,0(r3),0,0  // m[0][0], m[0][1] = cos, -sin
479         psq_st          fr7,40(r3),0,0  // m[2][2], m[2][3] = 1, 0
480 3:
481         blr
// __ps_guMtxRotAxisRadInternal: write a 3x4 matrix to r3 built from the
// axis vector at r4 and a precomputed sine/cosine pair.  The axis is
// normalised first; the fourth column is filled from fr14 (0.0).
//   In:   r3 = mtx (destination), r4 = vec (three floats),
//         fr1 = sT, fr2 = cT
//   Uses: r6, fr1-fr13 as scratch; fr14 is saved in a 64-byte stack
//         frame and restored before returning.
// NOTE(review): the name and inputs suggest the standard axis-angle
// rotation matrix -- confirm against the C caller.
// The 1/length of the axis comes from frsqrte plus one Newton-Raphson
// step using the NrmData constants (0.5, 3.0); there is no guard for a
// zero-length axis.
483         .globl __ps_guMtxRotAxisRadInternal
484         //r3 = mtx, r4 = vec, fr1 = sT, fr2 = cT
485 __ps_guMtxRotAxisRadInternal:
486         stwu            r1,-64(r1)  // open a 64-byte frame
487         frsp            fr2,fr2
488         psq_l           fr3,0(r4),0,0  // (x, y)
489         lis                     r6,NrmData@ha
490         frsp            fr1,fr1
491         stfd            fr14,8(r1)  // save fr14
492         addi            r6,r6,NrmData@l
493         lfs                     fr11,4(r6);  // fr11 = 3.0
494         lfs                     fr12,0(r6)  // fr12 = 0.5
495         ps_mul          fr5,fr3,fr3  // (x*x, y*y)
496         lfs                     fr4,8(r4)  // z
497         fadds           fr10,fr12,fr12  // 1.0 = 0.5 + 0.5
498         ps_madd         fr6,fr4,fr4,fr5  // (z*z + x*x, ...)
499         fsubs           fr14,fr12,fr12  // 0.0 = 0.5 - 0.5
500         ps_sum0         fr7,fr6,fr4,fr5  // ps0 = x*x + y*y + z*z
501         fsubs           fr13,fr10,fr2  // 1 - cT
502         frsqrte         fr8,fr7  // e = estimate of 1/sqrt(len2)
503         fmuls           fr5,fr8,fr8  // e*e
504         fmuls           fr6,fr8,fr12  // 0.5*e
505         fnmsubs         fr5,fr5,fr7,fr11  // 3 - e*e*len2
506         fmuls           fr8,fr5,fr6  // refined 1/length
507         ps_merge00      fr2,fr2,fr2  // (cT, cT)
508         ps_muls0        fr3,fr3,fr8  // normalised (x, y)
509         ps_muls0        fr4,fr4,fr8  // normalised z
510         ps_muls0        fr7,fr3,fr13  // (x, y) * (1 - cT)
511         ps_muls0        fr12,fr3,fr1  // (x, y) * sT
512         ps_muls0        fr8,fr4,fr13  // z * (1 - cT)
513         ps_muls1        fr6,fr7,fr3
514         ps_muls0        fr5,fr7,fr3
515         ps_muls0        fr7,fr7,fr4
516         fnmsubs         fr9,fr4,fr1,fr6
517         fmadds          fr10,fr4,fr1,fr6
518         ps_neg          fr3,fr12
519         ps_sum0         fr11,fr7,fr14,fr12
520         ps_sum0         fr5,fr5,fr9,fr2
521         ps_sum1         fr6,fr2,fr10,fr6
522         ps_sum0         fr9,fr3,fr14,fr7
523         psq_st          fr11,8(r3),0,0  // mtx[0][2..3]
524         ps_sum0         fr3,fr7,fr7,fr3
525         psq_st          fr5,0(r3),0,0  // mtx[0][0..1]
526         ps_muls0        fr8,fr8,fr4
527         psq_st          fr6,16(r3),0,0  // mtx[1][0..1]
528         ps_sum1         fr7,fr12,fr3,fr7
529         psq_st          fr9,24(r3),0,0  // mtx[1][2..3]
530         ps_sum0         fr8,fr8,fr14,fr2
531         psq_st          fr7,32(r3),0,0  // mtx[2][0..1]
532         psq_st          fr8,40(r3),0,0  // mtx[2][2..3]
533         lfd                     fr14,8(r1)  // restore fr14
534         addi            r1,r1,64  // close the frame
535         blr
// ps_guMtxReflect: build a 3x4 matrix at r3 from the two vectors at r4
// and r5.
//   In:   r3 = mtx (destination), r4 = vec1, r5 = vec2 (three floats each)
//   Uses: r9, fr0-fr13; no stack frame.
// NOTE(review): presumably vec1 is a point on the reflection plane and
// vec2 the plane normal -- confirm against the C prototype.  The code
// forms -2*vec2 and combines it with vec2 and vec1 as annotated below.
537         .globl ps_guMtxReflect
538         //r3 = mtx,r4 = vec1,r5 = vec2
539 ps_guMtxReflect:
540         lis                     r9,Unit01@ha
541         addi            r9,r9,Unit01@l
542         lfs                     fr0,4(r9)  // fr0 = 1.0
543         psq_l           fr1,8(r5),1,0  // (vec2.z, 1.0)
544         psq_l           fr2,0(r5),0,0  // (vec2.x, vec2.y)
545         psq_l           fr3,0(r4),0,0  // (vec1.x, vec1.y)
546         ps_nmadd        fr5,fr1,fr0,fr1  // -(fr1*1 + fr1) = -2 * (vec2.z, 1.0)
547         psq_l           fr4,8(r4),1,0  // (vec1.z, 1.0)
548         ps_nmadd        fr6,fr2,fr0,fr2  // -2 * (vec2.x, vec2.y)
549         ps_muls0        fr7,fr2,fr5
550         ps_mul          fr8,fr6,fr3
551         ps_muls0        fr9,fr2,fr6
552         ps_sum0         fr8,fr8,fr8,fr8
553         ps_muls1        fr10,fr2,fr6
554         psq_st          fr7,32(r3),0,0  // mtx[2][0..1]
555         ps_sum0         fr2,fr2,fr2,fr0
556         ps_nmadd        fr8,fr5,fr4,fr8
557         ps_sum1         fr10,fr0,fr10,fr10
558         psq_st          fr9,0(r3),0,0  // mtx[0][0..1]
559         ps_muls0        fr11,fr2,fr8
560         ps_merge00      fr12,fr5,fr8
561         psq_st          fr10,16(r3),0,0  // mtx[1][0..1]
562         ps_merge00      fr13,fr7,fr11
563         ps_muls0        fr12,fr12,fr1
564         ps_merge11      fr11,fr7,fr11
565         psq_st          fr13,8(r3),0,0  // mtx[0][2..3]
566         ps_sum0         fr12,fr12,fr12,fr0
567         psq_st          fr11,24(r3),0,0  // mtx[1][2..3]
568         psq_st          fr12,40(r3),0,0  // mtx[2][2..3]
569         blr
// ps_guVecAdd: dst = v1 + v2 for three-float vectors.
//   In:   r3 = v1, r4 = v2, r5 = dst
//   Uses: fr2-fr7 (V1_*, V2_*, D1_*); no stack frame.
// x/y are stored before z is loaded; since each offset is read before
// the same offset is written, dst may be v1 or v2 itself.
571         .globl ps_guVecAdd
572         //r3 = v1,r4 = v2,r5 = dst
573 ps_guVecAdd:
574         psq_l           V1_XY,0(r3),0,0
575         psq_l           V2_XY,0(r4),0,0
576         ps_add          D1_XY,V1_XY,V2_XY  // (x1+x2, y1+y2)
577         psq_st          D1_XY,0(r5),0,0
578         psq_l           V1_Z,8(r3),1,0  // single-element loads for z
579         psq_l           V2_Z,8(r4),1,0
580         ps_add          D1_Z,V1_Z,V2_Z  // z1+z2 in ps0
581         psq_st          D1_Z,8(r5),1,0  // store ps0 only
582         blr
// ps_guVecSub: dst = v1 - v2 for three-float vectors.
//   In:   r3 = v1, r4 = v2, r5 = dst
//   Uses: fr2-fr7 (V1_*, V2_*, D1_*); no stack frame.
// x/y are stored before z is loaded; since each offset is read before
// the same offset is written, dst may be v1 or v2 itself.
584         .globl ps_guVecSub
585         //r3 = v1,r4 = v2,r5 = dst
586 ps_guVecSub:
587         psq_l           V1_XY,0(r3),0,0
588         psq_l           V2_XY,0(r4),0,0
589         ps_sub          D1_XY,V1_XY,V2_XY  // (x1-x2, y1-y2)
590         psq_st          D1_XY,0(r5),0,0
591         psq_l           V1_Z,8(r3),1,0  // single-element loads for z
592         psq_l           V2_Z,8(r4),1,0
593         ps_sub          D1_Z,V1_Z,V2_Z  // z1-z2 in ps0
594         psq_st          D1_Z,8(r5),1,0  // store ps0 only
595         blr
// ps_guVecScale: dst = src * S for a three-float vector.
//   In:   r3 = src, r4 = dst, fr1 = S (ps0 is used as the multiplier)
//   Uses: fr2-fr4; no stack frame.
// Both loads precede the first store, so r3 == r4 is safe.
// NOTE(review): unlike the matrix routines in this file, fr1 is not
// passed through frsp here -- confirm the caller supplies a value that
// is already single precision.
597         .globl ps_guVecScale
598         //r3 = src,r4 = dst,fr1 = S
599 ps_guVecScale:
600         psq_l           fr2,0(r3),0,0  // (x, y)
601         psq_l           fr3,8(r3),1,0  // (z, 1.0)
602         ps_muls0        fr4,fr2,fr1  // (x*S, y*S)
603         psq_st          fr4,0(r4),0,0
604         ps_muls0        fr4,fr3,fr1  // z*S in ps0
605         psq_st          fr4,8(r4),1,0   // store ps0 only
606         blr
// ps_guVecNormalize: scale the three-float vector at r3 to unit length,
// in place.
//   In:   r3 = v (read and overwritten)
//   Uses: r9, fr0-fr9; no stack frame.
// 1/length is the frsqrte estimate refined by one Newton-Raphson step,
//     r = 0.5 * e * (3 - e*e*len2),
// using the NrmData constants (0.5, 3.0), so the result is approximate.
// There is no guard for a zero-length input.
608         .globl  ps_guVecNormalize
609         //r3 = v
610 ps_guVecNormalize:
611         lis                     r9,NrmData@ha
612         addi            r9,r9,NrmData@l
613         lfs                     fr0,0(r9)  // fr0 = 0.5
614         lfs                     fr1,4(r9)  // fr1 = 3.0
615         psq_l           fr2,0(r3),0,0  // (x, y)
616         ps_mul          fr4,fr2,fr2  // (x*x, y*y)
617         psq_l           fr3,8(r3),1,0  // (z, 1.0)
618         ps_madd         fr5,fr3,fr3,fr4  // ps0 = z*z + x*x
619         ps_sum0         fr6,fr5,fr3,fr4  // ps0 = x*x + y*y + z*z
620         frsqrte         fr7,fr6  // e = estimate of 1/sqrt(len2)
621         fmuls           fr8,fr7,fr7  // e*e
622         fmuls           fr9,fr7,fr0  // 0.5*e
623         fnmsubs         fr8,fr8,fr6,fr1  // 3 - e*e*len2
624         fmuls           fr7,fr8,fr9  // refined 1/length
625         ps_muls0        fr2,fr2,fr7
626         psq_st          fr2,0(r3),0,0  // write back x, y
627         ps_muls0        fr3,fr3,fr7
628         psq_st          fr3,8(r3),1,0  // write back z
629         blr
// ps_guVecCross: v12 = v1 x v2 (cross product of three-float vectors).
//   In:   r3 = v1 = (ax, ay, az), r4 = v2 = (bx, by, bz), r5 = v12
//   Uses: fr0-fr10; no stack frame.
// All loads precede the first store, so v12 may be v1 or v2 itself.
631         .globl ps_guVecCross
632         //r3 = v1,r4 = v2,r5 = v12
633 ps_guVecCross:
634         psq_l           fr1,0(r4),0,0  // (bx, by)
635         lfs                     fr2,8(r3)  // az
636         psq_l           fr0,0(r3),0,0  // (ax, ay)
637         ps_merge10      fr6,fr1,fr1  // (by, bx)
638         lfs                     fr3,8(r4)  // bz
639         ps_mul          fr4,fr1,fr2  // (bx*az, by*az)
640         ps_muls0        fr7,fr1,fr0  // (bx*ax, by*ax)
641         ps_msub         fr5,fr0,fr3,fr4  // (ax*bz - bx*az, ay*bz - by*az)
642         ps_msub         fr8,fr0,fr6,fr7  // ps1 = ay*bx - by*ax
643         ps_merge11      fr9,fr5,fr5  // ps0 = ay*bz - az*by  (result x)
644         ps_merge01      fr10,fr5,fr8  // (ax*bz - az*bx, ay*bx - ax*by)
645         psq_st          fr9,0(r5),1,0  // store x
646         ps_neg          fr10,fr10  // (az*bx - ax*bz, ax*by - ay*bx) = (y, z)
647         psq_st          fr10,4(r5),0,0  // store y, z
648         blr
// ps_guVecDotProduct: dot product of two three-float vectors.
//   In:   r3 = vec1 = (x1, y1, z1), r4 = vec2 = (x2, y2, z2)
//   Out:  fr1 (ps0) = x1*x2 + y1*y2 + z1*z2
//   Uses: fr1-fr5; no stack frame, no memory writes.
650         .globl ps_guVecDotProduct
651         //r3 = vec1,r4 = vec2
652 ps_guVecDotProduct:
653         psq_l           fr2,4(r3),0,0  // (y1, z1)
654         psq_l           fr3,4(r4),0,0  // (y2, z2)
655         ps_mul          fr2,fr2,fr3  // (y1*y2, z1*z2)
656         psq_l           fr5,0(r3),0,0  // (x1, y1)
657         psq_l           fr4,0(r4),0,0  // (x2, y2)
658         ps_madd         fr3,fr5,fr4,fr2  // ps0 = x1*x2 + y1*y2
659         ps_sum0         fr1,fr3,fr2,fr2  // ps0 = fr3.ps0 + z1*z2
660         blr
// ps_guVecMultiply: transform a three-float vector by a 3x4 matrix.
// Each output component is one matrix row dotted with (x, y, z, 1.0),
// so the fourth column is added as a translation.
//   In:   r3 = matrix (12 floats), r4 = source vector, r5 = destination
//         (roles read off the loads and stores below; the source has
//         no parameter comment)
//   Uses: fr0-fr6, fr8-fr12; no stack frame.
// The source vector is fully loaded before the first store, so r5 may
// equal r4.  Matrix rows 1 and 2 are still being loaded after the
// first store, so r5 must not overlap the matrix.
// fr6/fr12 are read by ps_sum0 before being written, but only ps1 of
// that operand is used and only ps0 of the result is stored.
662         .globl ps_guVecMultiply
663 ps_guVecMultiply:
664         psq_l           fr0,0(r4),0,0  // (x, y)
665         psq_l           fr2,0(r3),0,0  // (m00, m01)
666         psq_l           fr1,8(r4),1,0  // (z, 1.0)
667         ps_mul          fr4,fr2,fr0  // (m00*x, m01*y)
668         psq_l           fr3,8(r3),0,0  // (m02, m03)
669         ps_madd         fr5,fr3,fr1,fr4  // (m02*z + m00*x, m03 + m01*y)
670         psq_l           fr8,16(r3),0,0  // (m10, m11)
671         ps_sum0         fr6,fr5,fr6,fr5  // ps0 = row 0 dot (x,y,z,1)
672         psq_l           fr9,24(r3),0,0  // (m12, m13)
673         ps_mul          fr10,fr8,fr0
674         psq_st          fr6,0(r5),1,0  // dst x
675         ps_madd         fr11,fr9,fr1,fr10
676         psq_l           fr2,32(r3),0,0  // (m20, m21)
677         ps_sum0         fr12,fr11,fr12,fr11  // ps0 = row 1 dot (x,y,z,1)
678         psq_l           fr3,40(r3),0,0  // (m22, m23)
679         ps_mul          fr4,fr2,fr0
680         psq_st          fr12,4(r5),1,0  // dst y
681         ps_madd         fr5,fr3,fr1,fr4
682         ps_sum0         fr6,fr5,fr6,fr5  // ps0 = row 2 dot (x,y,z,1)
683         psq_st          fr6,8(r5),1,0  // dst z
684         blr
// ps_guVecMultiplySR: transform a three-float vector by the 3x3 part of
// a 3x4 matrix.  Only ps0 of each result is stored, and ps0 never
// includes the fourth-column element, so no translation is applied.
//   In:   r3 = mt, r4 = src, r5 = dst
//   Uses: fr0-fr13; no stack frame.
// All loads precede the first store, so dst may equal src.
686         .globl ps_guVecMultiplySR
687         // r3 = mt, r4 = src, r5 = dst
688 ps_guVecMultiplySR:
689     psq_l               fr0,0(r3),0,0    // m[0][0], m[0][1] GQR0 = 0
690     // fp6 - x y
691     psq_l               fr6,0(r4),0,0
692     psq_l               fr2,16(r3),0,0   // m[1][0], m[1][1]
693     // fp8 = m00x m01y // next X
694     ps_mul              fr8,fr0,fr6
695     psq_l               fr4,32(r3),0,0   // m[2][0], m[2][1]
696     // fp10 = m10x m11y // next Y
697     ps_mul              fr10,fr2,fr6
698     psq_l               fr7,8(r4),1,0   // fp7 - z,1.0
699     // fp12 = m20x m21y // next Z
700     ps_mul              fr12,fr4,fr6  // YYY last FP6 usage
701     psq_l               fr3,24(r3),0,0   // m[1][2], m[1][3]
702     ps_sum0             fr8,fr8,fr8,fr8  // ps0 = m00x + m01y
703     psq_l               fr5,40(r3),0,0   // m[2][2], m[2][3]
704     ps_sum0             fr10,fr10,fr10,fr10  // ps0 = m10x + m11y
705     psq_l               fr1,8(r3),0,0    // m[0][2], m[0][3]
706     ps_sum0             fr12,fr12,fr12,fr12  // ps0 = m20x + m21y
707     ps_madd             fr9,fr1,fr7,fr8  // ps0 += m02z
708     psq_st              fr9,0(r5),1,0      // store X
709     ps_madd             fr11,fr3,fr7,fr10  // ps0 += m12z
710     psq_st              fr11,4(r5),1,0      // store Y
711     ps_madd             fr13,fr5,fr7,fr12  // ps0 += m22z
712     psq_st              fr13,8(r5),1,0      // store Z
713         blr
// ps_guQuatScale: r = q * scale for a four-float quaternion.
//   In:   r3 = q (source), r4 = r (destination), fr1 = scale
//         (ps0 is used as the multiplier)
//   Uses: fr4, fr5; no stack frame.
// Both loads precede the first store, so r3 == r4 is safe.
// The .globl directive must name the label exactly; it previously
// read "ps_quQuatScale", which left this routine unexported.
715         .globl ps_guQuatScale
716         //r3 = q,r4 = r, fr1 = scale
717 ps_guQuatScale:
718         psq_l           fr4,0(r3),0,0  // first two components
719         psq_l           fr5,8(r3),0,0  // last two components
720         ps_muls0        fr4,fr4,fr1
721         psq_st          fr4,0(r4),0,0
722         ps_muls0        fr5,fr5,fr1
723         psq_st          fr5,8(r4),0,0
724         blr
// ps_guQuatDotProduct: four-component dot product of two quaternions.
//   In:   r3 = p, r4 = q (four floats each)
//   Out:  fr1 (ps0) = p[0]*q[0] + p[1]*q[1] + p[2]*q[2] + p[3]*q[3]
//   Uses: fr1-fr5; no stack frame, no memory writes.
726         .globl ps_guQuatDotProduct
727         //r3 = p, r4 = q ; fr1 = res
728 ps_guQuatDotProduct:
729         psq_l           fr2,0(r3),0,0  // (p0, p1)
730         psq_l           fr4,0(r4),0,0  // (q0, q1)
731         ps_mul          fr1,fr2,fr4  // (p0*q0, p1*q1)
732         psq_l           fr3,8(r3),0,0  // (p2, p3)
733         psq_l           fr5,8(r4),0,0  // (q2, q3)
734         ps_madd         fr1,fr3,fr5,fr1  // (p2*q2 + p0*q0, p3*q3 + p1*q1)
735         ps_sum0         fr1,fr1,fr1,fr1  // ps0 = ps0 + ps1
736         blr
// Constant tables read by the routines above (none of them writes here).
738         .section .data
739         .balign 4
        // QuatEpsilon: not referenced by any routine visible in this listing.
740 QuatEpsilon:
741         .float  0.00001
        // Unit01: the pair (0.0, 1.0); loaded as a pair by ps_guMtxConcat and
        // element-wise (offset 0 = 0.0, offset 4 = 1.0) by the builders.
742 Unit01:
743         .float  0.0, 1.0
        // NrmData: (0.5, 3.0), the Newton-Raphson constants for refining
        // frsqrte in ps_guVecNormalize and __ps_guMtxRotAxisRadInternal.
744 NrmData:
745         .float  0.5, 3.0