// patch to make enabling 64B L2 cache optional (tueidj)
// [libogc.git] / libogc / gu_psasm.S
// blob 5770d788a2a77122a83ed5437d11d841851cc1aa
1 #include <asm.h>
// ---------------------------------------------------------------------
// Register aliases (expanded by the C preprocessor).
//
// Matrix aliases, used by ps_guMtxConcat: each name is the pair of
// adjacent elements (row, column) held in one paired-single register.
// A = first source, B = second source, D = destination.
// ---------------------------------------------------------------------
#define A00_A01     fr0
#define A02_A03     fr1
#define A10_A11     fr2
#define A12_A13     fr3
#define A20_A21     fr4
#define A22_A23     fr5

#define B00_B01     fr6
#define B02_B03     fr7
#define B10_B11     fr8
#define B12_B13     fr9
#define B20_B21     fr10
#define B22_B23     fr11

#define D00_D01     fr12
#define D02_D03     fr13
#define D10_D11     fr14
#define D12_D13     fr15
// NOTE: the last destination row shares registers with A10_A11 (fr2)
// and A00_A01 (fr0); it may only be written once those are dead.
#define D20_D21     fr2
#define D22_D23     fr0

// Holds the Unit01 constant pair in ps_guMtxConcat.
#define UNIT01      fr31

// Vector aliases: XY names a register holding the x/y pair, Z one
// holding the z component.
#define RET_REG     fr1
#define V1_XY       fr2
#define V1_Z        fr3
#define V2_XY       fr4
#define V2_Z        fr5
#define D1_XY       fr6
#define D1_Z        fr7
#define D2_XY       fr8
#define D2_Z        fr9
#define W1_XY       fr10
#define W1_Z        fr11
#define W2_XY       fr12
#define W2_Z        fr13
// ---------------------------------------------------------------------
// ps_guMtxConcat
//   In:  r3 = mtxA, r4 = mtxB, r5 = mtxAB
//   Out: mtxAB receives the concatenation of mtxA and mtxB.
// Twelve floats (48 bytes) are read through r3 and r4 and written
// through r5. Every load from r3/r4 is issued before the first store
// to r5. fr14, fr15 and fr31 are spilled to a 64-byte stack frame with
// stfd and reloaded with lfd before returning.
// ---------------------------------------------------------------------
        .globl      ps_guMtxConcat
ps_guMtxConcat:
        stwu        r1, -64(r1)                     // open a 64-byte frame
        psq_l       A00_A01, 0(r3), 0, 0
        stfd        fr14, 8(r1)                     // spill D10_D11
        psq_l       B00_B01, 0(r4), 0, 0
        psq_l       B02_B03, 8(r4), 0, 0
        stfd        fr15, 16(r1)                    // spill D12_D13
        stfd        fr31, 40(r1)                    // spill UNIT01
        psq_l       B10_B11, 16(r4), 0, 0

        // Rows 0 and 1: start with a?0 * (row 0 of B).
        ps_muls0    D00_D01, B00_B01, A00_A01
        psq_l       A10_A11, 16(r3), 0, 0
        ps_muls0    D02_D03, B02_B03, A00_A01
        psq_l       UNIT01, Unit01@sdarel(r13), 0, 0 // (0.0, 1.0)
        ps_muls0    D10_D11, B00_B01, A10_A11
        psq_l       B12_B13, 24(r4), 0, 0
        ps_muls0    D12_D13, B02_B03, A10_A11
        psq_l       A02_A03, 8(r3), 0, 0

        // ... add a?1 * (row 1 of B).
        ps_madds1   D00_D01, B10_B11, A00_A01, D00_D01
        psq_l       A12_A13, 24(r3), 0, 0
        ps_madds1   D10_D11, B10_B11, A10_A11, D10_D11
        psq_l       B20_B21, 32(r4), 0, 0
        ps_madds1   D02_D03, B12_B13, A00_A01, D02_D03
        psq_l       B22_B23, 40(r4), 0, 0
        ps_madds1   D12_D13, B12_B13, A10_A11, D12_D13
        psq_l       A20_A21, 32(r3), 0, 0
        psq_l       A22_A23, 40(r3), 0, 0           // last load from the inputs

        // ... add a?2 * (row 2 of B).
        ps_madds0   D00_D01, B20_B21, A02_A03, D00_D01
        ps_madds0   D02_D03, B22_B23, A02_A03, D02_D03
        ps_madds0   D10_D11, B20_B21, A12_A13, D10_D11
        ps_madds0   D12_D13, B22_B23, A12_A13, D12_D13
        psq_st      D00_D01, 0(r5), 0, 0

        // Row 2 reuses fr2/fr0 (A10_A11/A00_A01 are dead by now).
        // The UNIT01 multiply-adds add a?3 to the second lane only,
        // i.e. to the last column of the result.
        ps_muls0    D20_D21, B00_B01, A20_A21
        ps_madds1   D02_D03, UNIT01, A02_A03, D02_D03
        ps_muls0    D22_D23, B02_B03, A20_A21
        psq_st      D10_D11, 16(r5), 0, 0
        ps_madds1   D12_D13, UNIT01, A12_A13, D12_D13
        psq_st      D02_D03, 8(r5), 0, 0
        ps_madds1   D20_D21, B10_B11, A20_A21, D20_D21
        ps_madds1   D22_D23, B12_B13, A20_A21, D22_D23
        ps_madds0   D20_D21, B20_B21, A22_A23, D20_D21
        lfd         fr14, 8(r1)                     // D10_D11 already stored
        psq_st      D12_D13, 24(r5), 0, 0
        ps_madds0   D22_D23, B22_B23, A22_A23, D22_D23
        psq_st      D20_D21, 32(r5), 0, 0
        ps_madds1   D22_D23, UNIT01, A22_A23, D22_D23
        lfd         fr15, 16(r1)                    // D12_D13 already stored
        psq_st      D22_D23, 40(r5), 0, 0
        lfd         fr31, 40(r1)
        addi        r1, r1, 64                      // close the frame
        blr
// ---------------------------------------------------------------------
// ps_guMtxIdentity
//   In:  r3 = mtx
//   Out: the twelve floats at r3 are set to 1.0 at offsets 0, 20 and
//        40 and to 0.0 everywhere else.
// Relies on lfs filling both lanes of the target register (paired
// singles enabled).
// ---------------------------------------------------------------------
        .globl      ps_guMtxIdentity
ps_guMtxIdentity:
        lfs         fr0, Unit01@sdarel(r13)         // fr0 = (0, 0)
        lfs         fr1, Unit01+4@sdarel(r13)       // fr1 = (1, 1)
        psq_st      fr0, 8(r3), 0, 0                // offsets 8, 12 = 0
        ps_merge01  fr2, fr0, fr1                   // fr2 = (0, 1)
        psq_st      fr0, 24(r3), 0, 0               // offsets 24, 28 = 0
        ps_merge10  fr3, fr1, fr0                   // fr3 = (1, 0)
        psq_st      fr0, 32(r3), 0, 0               // offsets 32, 36 = 0
        psq_st      fr2, 16(r3), 0, 0               // offsets 16, 20 = 0, 1
        psq_st      fr3, 0(r3), 0, 0                // offsets 0, 4   = 1, 0
        psq_st      fr3, 40(r3), 0, 0               // offsets 40, 44 = 1, 0
        blr
// ---------------------------------------------------------------------
// ps_guMtxCopy
//   In:  r3 = src, r4 = dst
//   Out: 48 bytes are copied from src to dst, eight bytes (one float
//        pair) at a time, each pair stored right after it is loaded.
// ---------------------------------------------------------------------
        .globl      ps_guMtxCopy
ps_guMtxCopy:
        psq_l       fr0, 0(r3), 0, 0
        psq_st      fr0, 0(r4), 0, 0                // bytes  0..7
        psq_l       fr1, 8(r3), 0, 0
        psq_st      fr1, 8(r4), 0, 0                // bytes  8..15
        psq_l       fr2, 16(r3), 0, 0
        psq_st      fr2, 16(r4), 0, 0               // bytes 16..23
        psq_l       fr3, 24(r3), 0, 0
        psq_st      fr3, 24(r4), 0, 0               // bytes 24..31
        psq_l       fr4, 32(r3), 0, 0
        psq_st      fr4, 32(r4), 0, 0               // bytes 32..39
        psq_l       fr5, 40(r3), 0, 0
        psq_st      fr5, 40(r4), 0, 0               // bytes 40..47
        blr
// ---------------------------------------------------------------------
// ps_guMtxTranspose
//   In:  r3 = src, r4 = xpose
//   Out: xpose receives the transpose of the 3x3 part of src (elements
//        at columns 0..2 of each four-float row). The fourth float of
//        every destination row (offsets 12, 28, 44) is written as 0.0;
//        the fourth column of src is never read.
// ---------------------------------------------------------------------
        .globl      ps_guMtxTranspose
ps_guMtxTranspose:
        lfs         fr0, Unit01@sdarel(r13)         // fr0 = 0.0
        psq_l       fr1, 0(r3), 0, 0                // (m00, m01)
        stfs        fr0, 44(r4)                     // xpose[2][3] = 0
        psq_l       fr2, 16(r3), 0, 0               // (m10, m11)
        ps_merge00  fr5, fr1, fr2                   // (m00, m10)
        psq_l       fr3, 8(r3), 1, 0                // (m02, 1.0)
        ps_merge11  fr6, fr1, fr2                   // (m01, m11)
        psq_l       fr4, 24(r3), 1, 0               // (m12, 1.0)
        psq_st      fr5, 0(r4), 0, 0
        psq_l       fr1, 32(r3), 0, 0               // (m20, m21)
        ps_merge00  fr7, fr3, fr4                   // (m02, m12)
        psq_st      fr6, 16(r4), 0, 0
        ps_merge00  fr5, fr1, fr0                   // (m20, 0)
        psq_st      fr7, 32(r4), 0, 0
        ps_merge10  fr6, fr1, fr0                   // (m21, 0)
        psq_st      fr5, 8(r4), 0, 0
        lfs         fr3, 40(r3)                     // m22 stays on the diagonal
        psq_st      fr6, 24(r4), 0, 0
        stfs        fr3, 40(r4)
        blr
// ---------------------------------------------------------------------
// ps_guMtxInverse
//   In:  r3 = src, r4 = inv
//   Out: r3 = 0 when the determinant of the 3x3 part compares equal to
//        zero (inv is then left untouched), otherwise r3 = 1 and inv
//        holds the inverse. All loads from src precede the first store
//        to inv.
// The reciprocal of the determinant is an fres estimate refined by one
// Newton-Raphson step.
// ---------------------------------------------------------------------
        .globl      ps_guMtxInverse
ps_guMtxInverse:
        psq_l       fr0, 0(r3), 1, 0                // (m00, 1.0)
        psq_l       fr1, 4(r3), 0, 0                // (m01, m02)
        psq_l       fr2, 16(r3), 1, 0               // (m10, 1.0)
        ps_merge10  fr6, fr1, fr0                   // (m02, m00)
        psq_l       fr3, 20(r3), 0, 0               // (m11, m12)
        psq_l       fr4, 32(r3), 1, 0               // (m20, 1.0)
        ps_merge10  fr7, fr3, fr2                   // (m12, m10)
        psq_l       fr5, 36(r3), 0, 0               // (m21, m22)

        // Cofactor pairs of the 3x3 part (fr11, fr12, fr13 and the
        // scalars fr8, fr9, fr10).
        ps_mul      fr11, fr3, fr6
        ps_mul      fr13, fr5, fr7
        ps_merge10  fr8, fr5, fr4                   // (m22, m20)
        ps_msub     fr11, fr1, fr7, fr11
        ps_mul      fr12, fr1, fr8
        ps_msub     fr13, fr3, fr8, fr13
        ps_mul      fr10, fr3, fr4
        ps_msub     fr12, fr5, fr6, fr12
        ps_mul      fr9, fr0, fr5
        ps_mul      fr8, fr1, fr2
        ps_sub      fr6, fr6, fr6                   // zero for the compare
        ps_msub     fr10, fr2, fr5, fr10
        ps_mul      fr7, fr0, fr13
        ps_msub     fr9, fr1, fr4, fr9
        ps_madd     fr7, fr2, fr12, fr7
        ps_msub     fr8, fr0, fr3, fr8
        ps_madd     fr7, fr4, fr11, fr7             // fr7.ps0 = determinant
        ps_cmpo0    cr0, fr7, fr6
        bne         .Lps_guMtxInverse_invertible
        li          r3, 0                           // singular: report failure
        blr

.Lps_guMtxInverse_invertible:
        fres        fr0, fr7                        // r ~ 1/det
        ps_add      fr6, fr0, fr0                   // 2r
        ps_mul      fr5, fr0, fr0                   // r*r
        ps_nmsub    fr0, fr7, fr5, fr6              // 2r - det*r*r
        lfs         fr1, 12(r3)                     // m03
        ps_muls0    fr13, fr13, fr0
        lfs         fr2, 28(r3)                     // m13
        ps_muls0    fr12, fr12, fr0
        lfs         fr3, 44(r3)                     // m23
        ps_muls0    fr11, fr11, fr0
        ps_merge00  fr5, fr13, fr12
        ps_muls0    fr10, fr10, fr0
        ps_merge11  fr4, fr13, fr12
        ps_muls0    fr9, fr9, fr0
        psq_st      fr5, 0(r4), 0, 0
        ps_mul      fr6, fr13, fr1
        psq_st      fr4, 16(r4), 0, 0
        ps_muls0    fr8, fr8, fr0
        ps_madd     fr6, fr12, fr2, fr6
        psq_st      fr10, 32(r4), 1, 0
        ps_nmadd    fr6, fr11, fr3, fr6             // negated translation terms
        psq_st      fr9, 36(r4), 1, 0
        ps_mul      fr7, fr10, fr1
        ps_merge00  fr5, fr11, fr6
        psq_st      fr8, 40(r4), 1, 0
        ps_merge11  fr4, fr11, fr6
        psq_st      fr5, 8(r4), 0, 0
        ps_madd     fr7, fr9, fr2, fr7
        psq_st      fr4, 24(r4), 0, 0
        ps_nmadd    fr7, fr8, fr3, fr7
        li          r3, 1                           // success
        psq_st      fr7, 44(r4), 1, 0
        blr
// ---------------------------------------------------------------------
// ps_guMtxInvXpose
//   In:  r3 = src, r4 = invx
//   Out: r3 = 0 when the determinant of the 3x3 part compares equal to
//        zero (invx is then left untouched), otherwise r3 = 1. On
//        success the scaled cofactor pairs are stored without the lane
//        shuffle used by ps_guMtxInverse, and offsets 12, 28 and 44 of
//        invx are written as 0.0.
// The fourth column of src is never read.
// ---------------------------------------------------------------------
        .globl      ps_guMtxInvXpose
ps_guMtxInvXpose:
        psq_l       fr0, 0(r3), 1, 0                // (m00, 1.0)
        psq_l       fr1, 4(r3), 0, 0                // (m01, m02)
        psq_l       fr2, 16(r3), 1, 0               // (m10, 1.0)
        ps_merge10  fr6, fr1, fr0                   // (m02, m00)
        psq_l       fr3, 20(r3), 0, 0               // (m11, m12)
        psq_l       fr4, 32(r3), 1, 0               // (m20, 1.0)
        ps_merge10  fr7, fr3, fr2                   // (m12, m10)
        psq_l       fr5, 36(r3), 0, 0               // (m21, m22)
        ps_mul      fr11, fr3, fr6
        ps_merge10  fr8, fr5, fr4                   // (m22, m20)
        ps_mul      fr13, fr5, fr7
        ps_msub     fr11, fr1, fr7, fr11
        ps_mul      fr12, fr1, fr8
        ps_msub     fr13, fr3, fr8, fr13
        ps_msub     fr12, fr5, fr6, fr12
        ps_mul      fr10, fr3, fr4
        ps_mul      fr9, fr0, fr5
        ps_mul      fr8, fr1, fr2
        ps_msub     fr10, fr2, fr5, fr10
        ps_msub     fr9, fr1, fr4, fr9
        ps_msub     fr8, fr0, fr3, fr8
        ps_mul      fr7, fr0, fr13
        ps_sub      fr1, fr1, fr1                   // zero: compare value and fill
        ps_madd     fr7, fr2, fr12, fr7
        ps_madd     fr7, fr4, fr11, fr7             // fr7.ps0 = determinant
        ps_cmpo0    cr0, fr7, fr1
        bne         .Lps_guMtxInvXpose_invertible
        li          r3, 0                           // singular: report failure
        blr

.Lps_guMtxInvXpose_invertible:
        fres        fr0, fr7                        // r ~ 1/det
        psq_st      fr1, 12(r4), 1, 0               // invx[0][3] = 0
        ps_add      fr6, fr0, fr0                   // 2r
        ps_mul      fr5, fr0, fr0                   // r*r
        psq_st      fr1, 28(r4), 1, 0               // invx[1][3] = 0
        ps_nmsub    fr0, fr7, fr5, fr6              // 2r - det*r*r
        psq_st      fr1, 44(r4), 1, 0               // invx[2][3] = 0
        ps_muls0    fr13, fr13, fr0
        ps_muls0    fr12, fr12, fr0
        ps_muls0    fr11, fr11, fr0
        psq_st      fr13, 0(r4), 0, 0
        psq_st      fr12, 16(r4), 0, 0
        ps_muls0    fr10, fr10, fr0
        ps_muls0    fr9, fr9, fr0
        psq_st      fr11, 32(r4), 0, 0
        psq_st      fr10, 8(r4), 1, 0
        ps_muls0    fr8, fr8, fr0
        li          r3, 1                           // success
        psq_st      fr9, 24(r4), 1, 0
        psq_st      fr8, 40(r4), 1, 0
        blr
// ---------------------------------------------------------------------
// ps_guMtxScale
//   In:  r3 = mtx, fr1 = xS, fr2 = yS, fr3 = zS
//   Out: mtx is overwritten with xS, yS, zS at offsets 0, 20 and 40
//        and 0.0 in the other nine floats.
// ---------------------------------------------------------------------
        .globl      ps_guMtxScale
ps_guMtxScale:
        lfs         fr0, Unit01@sdarel(r13)         // fr0 = (0, 0)
        stfs        fr1, 0(r3)                      // m00 = xS
        psq_st      fr0, 4(r3), 0, 0                // m01, m02 = 0
        psq_st      fr0, 12(r3), 0, 0               // m03, m10 = 0
        stfs        fr2, 20(r3)                     // m11 = yS
        psq_st      fr0, 24(r3), 0, 0               // m12, m13 = 0
        psq_st      fr0, 32(r3), 0, 0               // m20, m21 = 0
        stfs        fr3, 40(r3)                     // m22 = zS
        stfs        fr0, 44(r3)                     // m23 = 0
        blr
// ---------------------------------------------------------------------
// ps_guMtxScaleApply
//   In:  r3 = src, r4 = dst, fr1 = xS, fr2 = yS, fr3 = zS
//   Out: dst row 0 = src row 0 * xS, row 1 * yS, row 2 * zS
//        (all four floats of each row are scaled).
// The scale factors are rounded to single precision first.
// ---------------------------------------------------------------------
        .globl      ps_guMtxScaleApply
ps_guMtxScaleApply:
        frsp        fr1, fr1
        psq_l       fr4, 0(r3), 0, 0                // row 0, first pair
        frsp        fr2, fr2
        psq_l       fr5, 8(r3), 0, 0                // row 0, second pair
        frsp        fr3, fr3
        ps_muls0    fr4, fr4, fr1
        psq_l       fr6, 16(r3), 0, 0               // row 1, first pair
        ps_muls0    fr5, fr5, fr1
        psq_l       fr7, 24(r3), 0, 0               // row 1, second pair
        ps_muls0    fr6, fr6, fr2
        psq_l       fr8, 32(r3), 0, 0               // row 2, first pair
        psq_st      fr4, 0(r4), 0, 0
        ps_muls0    fr7, fr7, fr2                   // last use of yS
        psq_l       fr2, 40(r3), 0, 0               // row 2, second pair (reuses fr2)
        psq_st      fr5, 8(r4), 0, 0
        ps_muls0    fr8, fr8, fr3
        psq_st      fr6, 16(r4), 0, 0
        ps_muls0    fr2, fr2, fr3
        psq_st      fr7, 24(r4), 0, 0
        psq_st      fr8, 32(r4), 0, 0
        psq_st      fr2, 40(r4), 0, 0
        blr
// ---------------------------------------------------------------------
// ps_guMtxApplyScale
//   In:  r3 = src, r4 = dst, fr1 = xS, fr2 = yS, fr3 = zS
//   Out: every row of dst is the matching row of src multiplied
//        element-wise by (xS, yS, zS, 1.0).
// The scale factors are rounded to single precision first.
// ---------------------------------------------------------------------
        .globl      ps_guMtxApplyScale
ps_guMtxApplyScale:
        lfs         fr6, Unit01+4@sdarel(r13)       // 1.0
        frsp        fr1, fr1
        psq_l       fr4, 0(r3), 0, 0
        frsp        fr2, fr2
        psq_l       fr5, 8(r3), 0, 0
        frsp        fr3, fr3
        ps_merge00  fr10, fr1, fr2                  // (xS, yS)
        ps_merge00  fr11, fr3, fr6                  // (zS, 1.0)
        ps_mul      fr4, fr4, fr10
        psq_l       fr6, 16(r3), 0, 0               // fr6 reused for row 1
        ps_mul      fr5, fr5, fr11
        psq_l       fr7, 24(r3), 0, 0
        ps_mul      fr6, fr6, fr10
        psq_l       fr8, 32(r3), 0, 0
        psq_st      fr4, 0(r4), 0, 0
        ps_mul      fr7, fr7, fr11
        psq_l       fr2, 40(r3), 0, 0               // fr2 reused for row 2
        psq_st      fr5, 8(r4), 0, 0
        ps_mul      fr8, fr8, fr10
        psq_st      fr6, 16(r4), 0, 0
        ps_mul      fr2, fr2, fr11
        psq_st      fr7, 24(r4), 0, 0
        psq_st      fr8, 32(r4), 0, 0
        psq_st      fr2, 40(r4), 0, 0
        blr
// ---------------------------------------------------------------------
// ps_guMtxTrans
//   In:  r3 = mtx, fr1 = xT, fr2 = yT, fr3 = zT
//   Out: mtx is overwritten with 1.0 at offsets 0, 20, 40, the
//        translation xT, yT, zT at offsets 12, 28, 44 and 0.0 in the
//        remaining six floats.
// ---------------------------------------------------------------------
        .globl      ps_guMtxTrans
ps_guMtxTrans:
        lfs         fr4, Unit01@sdarel(r13)         // 0.0
        lfs         fr5, Unit01+4@sdarel(r13)       // 1.0
        stfs        fr4, 16(r3)                     // m10 = 0
        stfs        fr1, 12(r3)                     // m03 = xT
        stfs        fr2, 28(r3)                     // m13 = yT
        psq_st      fr4, 4(r3), 0, 0                // m01, m02 = 0
        psq_st      fr4, 32(r3), 0, 0               // m20, m21 = 0
        stfs        fr5, 20(r3)                     // m11 = 1
        stfs        fr4, 24(r3)                     // m12 = 0
        stfs        fr5, 40(r3)                     // m22 = 1
        stfs        fr3, 44(r3)                     // m23 = zT
        stfs        fr5, 0(r3)                      // m00 = 1
        blr
// ---------------------------------------------------------------------
// ps_guMtxTransApply
//   In:  r3 = src, r4 = dst, fr1 = xT, fr2 = yT, fr3 = zT
//   Out: dst = src with xT, yT, zT added to the fourth float of rows
//        0, 1 and 2 (offsets 12, 28, 44); everything else is copied.
// Every load from src is issued before the first store to dst.
// ---------------------------------------------------------------------
        .globl      ps_guMtxTransApply
ps_guMtxTransApply:
        psq_l       fr4, 0(r3), 0, 0
        frsp        fr1, fr1
        psq_l       fr5, 8(r3), 0, 0                // (m02, m03)
        frsp        fr2, fr2
        psq_l       fr7, 24(r3), 0, 0               // (m12, m13)
        frsp        fr3, fr3
        psq_l       fr8, 40(r3), 0, 0               // (m22, m23)
        ps_sum1     fr5, fr1, fr5, fr5              // (m02, xT + m03)
        psq_l       fr6, 16(r3), 0, 0
        ps_sum1     fr7, fr2, fr7, fr7              // (m12, yT + m13)
        psq_l       fr9, 32(r3), 0, 0
        ps_sum1     fr8, fr3, fr8, fr8              // (m22, zT + m23)
        psq_st      fr4, 0(r4), 0, 0
        psq_st      fr5, 8(r4), 0, 0
        psq_st      fr6, 16(r4), 0, 0
        psq_st      fr7, 24(r4), 0, 0
        psq_st      fr9, 32(r4), 0, 0
        psq_st      fr8, 40(r4), 0, 0
        blr
// ---------------------------------------------------------------------
// ps_guMtxApplyTrans
//   In:  r3 = src, r4 = dst, fr1 = xT, fr2 = yT, fr3 = zT
//   Out: the first three floats of every row are copied to dst; the
//        fourth float of row i becomes
//            m[i][0]*xT + m[i][1]*yT + m[i][2]*zT + m[i][3].
// Every load from src is issued before the first store to dst.
// ---------------------------------------------------------------------
        .globl      ps_guMtxApplyTrans
ps_guMtxApplyTrans:
        lfs         fr6, Unit01+4@sdarel(r13)       // 1.0
        psq_l       fr4, 0(r3), 0, 0                // (m00, m01)
        frsp        fr1, fr1
        psq_l       fr5, 8(r3), 0, 0                // (m02, m03)
        frsp        fr2, fr2
        ps_merge00  fr10, fr1, fr2                  // (xT, yT)
        psq_l       fr7, 24(r3), 0, 0               // (m12, m13)
        frsp        fr3, fr3
        ps_mul      fr1, fr4, fr10
        ps_merge00  fr11, fr3, fr6                  // (zT, 1.0)
        psq_l       fr8, 40(r3), 0, 0               // (m22, m23)
        ps_madd     fr2, fr5, fr11, fr1
        psq_l       fr6, 16(r3), 0, 0               // (m10, m11)
        ps_sum0     fr3, fr2, fr3, fr2              // ps0 = new m03
        psq_l       fr9, 32(r3), 0, 0               // (m20, m21)
        ps_mul      fr12, fr6, fr10
        psq_st      fr4, 0(r4), 0, 0
        ps_madd     fr4, fr7, fr11, fr12
        psq_st      fr5, 8(r4), 1, 0                // m02 only
        ps_sum0     fr12, fr4, fr12, fr4            // ps0 = new m13
        psq_st      fr3, 12(r4), 1, 0
        ps_mul      fr3, fr9, fr10
        psq_st      fr6, 16(r4), 0, 0
        ps_madd     fr2, fr8, fr11, fr3
        psq_st      fr7, 24(r4), 1, 0               // m12 only
        ps_sum0     fr3, fr2, fr3, fr2              // ps0 = new m23
        psq_st      fr12, 28(r4), 1, 0
        psq_st      fr9, 32(r4), 0, 0
        psq_st      fr8, 40(r4), 1, 0               // m22 only
        psq_st      fr3, 44(r4), 1, 0
        blr
// ---------------------------------------------------------------------
// ps_guMtxRotTrig
//   In:  r3 = mt, r4 = axis character, fr1 = sinA, fr2 = cosA
//   Out: mt is overwritten with a rotation about the selected axis.
// The axis is matched after OR-ing in 0x20, so upper and lower case
// letters select the same axis. Any other value branches straight to
// the return and leaves mt unmodified.
//
// The local labels 0/1/2/3 below are the targets of the 0f/1f/2f/3f
// branches; without them the dispatch has nowhere to go.
// ---------------------------------------------------------------------
        .globl      ps_guMtxRotTrig
ps_guMtxRotTrig:
        frsp        fr1, fr1                        // sinA as single
        lfs         fr3, Unit01@sdarel(r13)         // 0.0
        frsp        fr2, fr2                        // cosA as single
        lfs         fr4, Unit01+4@sdarel(r13)       // 1.0
        ori         r4, r4, 0x20                    // fold to lower case
        ps_neg      fr5, fr1                        // -sinA
        cmplwi      r4, 'x'
        beq         0f
        cmplwi      r4, 'y'
        beq         1f
        cmplwi      r4, 'z'
        beq         2f
        b           3f                              // unknown axis: do nothing

0:      // rotation about x
        psq_st      fr4, 0(r3), 1, 0                // m00 = 1
        psq_st      fr3, 4(r3), 0, 0                // m01, m02 = 0
        ps_merge00  fr6, fr1, fr2                   // (sin, cos)
        psq_st      fr3, 12(r3), 0, 0               // m03, m10 = 0
        ps_merge00  fr7, fr2, fr5                   // (cos, -sin)
        psq_st      fr3, 28(r3), 0, 0               // m13, m20 = 0
        psq_st      fr3, 44(r3), 1, 0               // m23 = 0
        psq_st      fr6, 36(r3), 0, 0               // m21, m22 = sin, cos
        psq_st      fr7, 20(r3), 0, 0               // m11, m12 = cos, -sin
        b           3f

1:      // rotation about y
        ps_merge00  fr6, fr2, fr3                   // (cos, 0)
        ps_merge00  fr7, fr3, fr4                   // (0, 1)
        psq_st      fr3, 24(r3), 0, 0               // m12, m13 = 0
        psq_st      fr6, 0(r3), 0, 0                // m00, m01 = cos, 0
        ps_merge00  fr8, fr5, fr3                   // (-sin, 0)
        ps_merge00  fr9, fr1, fr3                   // (sin, 0)
        psq_st      fr6, 40(r3), 0, 0               // m22, m23 = cos, 0
        psq_st      fr7, 16(r3), 0, 0               // m10, m11 = 0, 1
        psq_st      fr9, 8(r3), 0, 0                // m02, m03 = sin, 0
        psq_st      fr8, 32(r3), 0, 0               // m20, m21 = -sin, 0
        b           3f

2:      // rotation about z
        psq_st      fr3, 8(r3), 0, 0                // m02, m03 = 0
        ps_merge00  fr6, fr1, fr2                   // (sin, cos)
        ps_merge00  fr8, fr2, fr5                   // (cos, -sin)
        psq_st      fr3, 24(r3), 0, 0               // m12, m13 = 0
        psq_st      fr3, 32(r3), 0, 0               // m20, m21 = 0
        ps_merge00  fr7, fr4, fr3                   // (1, 0)
        psq_st      fr6, 16(r3), 0, 0               // m10, m11 = sin, cos
        psq_st      fr8, 0(r3), 0, 0                // m00, m01 = cos, -sin
        psq_st      fr7, 40(r3), 0, 0               // m22, m23 = 1, 0

3:
        blr
// ---------------------------------------------------------------------
// __ps_guMtxRotAxisRadInternal
//   In:  r3 = mtx, r4 = vec (three floats), fr1 = sT, fr2 = cT
//   Out: the twelve floats at r3 are overwritten.
// The axis at r4 is first scaled by a reciprocal square root of its
// squared length (frsqrte estimate plus one Newton-Raphson step using
// the NrmData constants 0.5 and 3.0), then combined with sT, cT and
// (1 - cT) into the matrix. Presumably sT/cT are the sine and cosine
// of the rotation angle - confirm against the C caller.
// fr14 is spilled to a 64-byte stack frame and reloaded before return.
// ---------------------------------------------------------------------
        .globl      __ps_guMtxRotAxisRadInternal
__ps_guMtxRotAxisRadInternal:
        stwu        r1, -64(r1)
        frsp        fr2, fr2
        psq_l       fr3, 0(r4), 0, 0                // (x, y)
        frsp        fr1, fr1
        stfd        fr14, 8(r1)                     // spill fr14
        lfs         fr11, NrmData+4@sdarel(r13)     // 3.0
        lfs         fr12, NrmData@sdarel(r13)       // 0.5
        ps_mul      fr5, fr3, fr3                   // (x*x, y*y)
        lfs         fr4, 8(r4)                      // z
        fadds       fr10, fr12, fr12                // 1.0
        ps_madd     fr6, fr4, fr4, fr5
        fsubs       fr14, fr12, fr12                // 0.0
        ps_sum0     fr7, fr6, fr4, fr5              // ps0 = x*x + y*y + z*z
        fsubs       fr13, fr10, fr2                 // 1 - cT

        // Reciprocal square root: e * 0.5 * (3 - e*e*len2).
        frsqrte     fr8, fr7
        fmuls       fr5, fr8, fr8
        fmuls       fr6, fr8, fr12
        fnmsubs     fr5, fr5, fr7, fr11
        fmuls       fr8, fr5, fr6

        ps_merge00  fr2, fr2, fr2                   // (cT, cT)
        ps_muls0    fr3, fr3, fr8                   // normalised (x, y)
        ps_muls0    fr4, fr4, fr8                   // normalised z
        ps_muls0    fr7, fr3, fr13
        ps_muls0    fr12, fr3, fr1
        ps_muls0    fr8, fr4, fr13
        ps_muls1    fr6, fr7, fr3
        ps_muls0    fr5, fr7, fr3
        ps_muls0    fr7, fr7, fr4
        fnmsubs     fr9, fr4, fr1, fr6
        fmadds      fr10, fr4, fr1, fr6
        ps_neg      fr3, fr12
        ps_sum0     fr11, fr7, fr14, fr12
        ps_sum0     fr5, fr5, fr9, fr2
        ps_sum1     fr6, fr2, fr10, fr6
        ps_sum0     fr9, fr3, fr14, fr7
        psq_st      fr11, 8(r3), 0, 0
        ps_sum0     fr3, fr7, fr7, fr3
        psq_st      fr5, 0(r3), 0, 0
        ps_muls0    fr8, fr8, fr4
        psq_st      fr6, 16(r3), 0, 0
        ps_sum1     fr7, fr12, fr3, fr7
        psq_st      fr9, 24(r3), 0, 0
        ps_sum0     fr8, fr8, fr14, fr2
        psq_st      fr7, 32(r3), 0, 0
        psq_st      fr8, 40(r3), 0, 0
        lfd         fr14, 8(r1)                     // restore fr14
        addi        r1, r1, 64
        blr
// ---------------------------------------------------------------------
// ps_guMtxReflect
//   In:  r3 = mtx, r4 = vec1, r5 = vec2 (three floats each)
//   Out: the twelve floats at r3 are overwritten.
// Presumably vec1 is a point on the mirror plane and vec2 the plane
// normal - confirm against the C prototype. The code forms -2 * vec2,
// its products with vec2, and dot products with vec1 for the fourth
// column.
// ---------------------------------------------------------------------
        .globl      ps_guMtxReflect
ps_guMtxReflect:
        lfs         fr0, Unit01+4@sdarel(r13)       // 1.0
        psq_l       fr1, 8(r5), 1, 0                // (vec2.z, 1.0)
        psq_l       fr2, 0(r5), 0, 0                // (vec2.x, vec2.y)
        psq_l       fr3, 0(r4), 0, 0                // (vec1.x, vec1.y)
        ps_nmadd    fr5, fr1, fr0, fr1              // -2 * (vec2.z, 1.0)
        psq_l       fr4, 8(r4), 1, 0                // (vec1.z, 1.0)
        ps_nmadd    fr6, fr2, fr0, fr2              // -2 * (vec2.x, vec2.y)
        ps_muls0    fr7, fr2, fr5
        ps_mul      fr8, fr6, fr3
        ps_muls0    fr9, fr2, fr6
        ps_sum0     fr8, fr8, fr8, fr8
        ps_muls1    fr10, fr2, fr6
        psq_st      fr7, 32(r3), 0, 0
        ps_sum0     fr2, fr2, fr2, fr0
        ps_nmadd    fr8, fr5, fr4, fr8
        ps_sum1     fr10, fr0, fr10, fr10
        psq_st      fr9, 0(r3), 0, 0
        ps_muls0    fr11, fr2, fr8
        ps_merge00  fr12, fr5, fr8
        psq_st      fr10, 16(r3), 0, 0
        ps_merge00  fr13, fr7, fr11
        ps_muls0    fr12, fr12, fr1
        ps_merge11  fr11, fr7, fr11
        psq_st      fr13, 8(r3), 0, 0
        ps_sum0     fr12, fr12, fr12, fr0
        psq_st      fr11, 24(r3), 0, 0
        psq_st      fr12, 40(r3), 0, 0
        blr
// ---------------------------------------------------------------------
// ps_guVecAdd
//   In:  r3 = v1, r4 = v2, r5 = dst (three floats each)
//   Out: dst = v1 + v2, component-wise. The x/y pair is stored before
//        the z components are loaded.
// ---------------------------------------------------------------------
        .globl      ps_guVecAdd
ps_guVecAdd:
        psq_l       V1_XY, 0(r3), 0, 0
        psq_l       V2_XY, 0(r4), 0, 0
        ps_add      D1_XY, V1_XY, V2_XY             // x and y sums
        psq_st      D1_XY, 0(r5), 0, 0
        psq_l       V1_Z, 8(r3), 1, 0               // single float
        psq_l       V2_Z, 8(r4), 1, 0
        ps_add      D1_Z, V1_Z, V2_Z                // z sum in ps0
        psq_st      D1_Z, 8(r5), 1, 0
        blr
// ---------------------------------------------------------------------
// ps_guVecSub
//   In:  r3 = v1, r4 = v2, r5 = dst (three floats each)
//   Out: dst = v1 - v2, component-wise. The x/y pair is stored before
//        the z components are loaded.
// ---------------------------------------------------------------------
        .globl      ps_guVecSub
ps_guVecSub:
        psq_l       V1_XY, 0(r3), 0, 0
        psq_l       V2_XY, 0(r4), 0, 0
        ps_sub      D1_XY, V1_XY, V2_XY             // x and y differences
        psq_st      D1_XY, 0(r5), 0, 0
        psq_l       V1_Z, 8(r3), 1, 0               // single float
        psq_l       V2_Z, 8(r4), 1, 0
        ps_sub      D1_Z, V1_Z, V2_Z                // z difference in ps0
        psq_st      D1_Z, 8(r5), 1, 0
        blr
// ---------------------------------------------------------------------
// ps_guVecScale
//   In:  r3 = src, r4 = dst (three floats each), fr1 = S
//   Out: dst = src * S. Both loads precede the first store.
// ---------------------------------------------------------------------
        .globl      ps_guVecScale
ps_guVecScale:
        psq_l       fr2, 0(r3), 0, 0                // (x, y)
        psq_l       fr3, 8(r3), 1, 0                // (z, 1.0)
        ps_muls0    fr4, fr2, fr1
        psq_st      fr4, 0(r4), 0, 0                // x*S, y*S
        ps_muls0    fr4, fr3, fr1
        psq_st      fr4, 8(r4), 1, 0                // z*S
        blr
// ---------------------------------------------------------------------
// ps_guVecNormalize
//   In:  r3 = v (three floats)
//   Out: v is scaled in place by an approximate 1/sqrt(x*x+y*y+z*z):
//        an frsqrte estimate e refined once as e*0.5*(3 - e*e*len2).
// There is no special handling for a zero-length vector.
// ---------------------------------------------------------------------
        .globl      ps_guVecNormalize
ps_guVecNormalize:
        lfs         fr0, NrmData@sdarel(r13)        // 0.5
        lfs         fr1, NrmData+4@sdarel(r13)      // 3.0
        psq_l       fr2, 0(r3), 0, 0                // (x, y)
        ps_mul      fr4, fr2, fr2                   // (x*x, y*y)
        psq_l       fr3, 8(r3), 1, 0                // (z, 1.0)
        ps_madd     fr5, fr3, fr3, fr4              // ps0 = z*z + x*x
        ps_sum0     fr6, fr5, fr3, fr4              // ps0 = squared length
        frsqrte     fr7, fr6                        // e
        fmuls       fr8, fr7, fr7                   // e*e
        fmuls       fr9, fr7, fr0                   // e*0.5
        fnmsubs     fr8, fr8, fr6, fr1              // 3 - e*e*len2
        fmuls       fr7, fr8, fr9                   // refined estimate
        ps_muls0    fr2, fr2, fr7
        psq_st      fr2, 0(r3), 0, 0
        ps_muls0    fr3, fr3, fr7
        psq_st      fr3, 8(r3), 1, 0
        blr
// ---------------------------------------------------------------------
// ps_guVecCross
//   In:  r3 = v1, r4 = v2, r5 = v12 (three floats each)
//   Out: v12 = v1 x v2. All loads precede the first store.
// Writing a = v1 and b = v2 in the lane comments below.
// ---------------------------------------------------------------------
        .globl      ps_guVecCross
ps_guVecCross:
        psq_l       fr1, 0(r4), 0, 0                // (bx, by)
        lfs         fr2, 8(r3)                      // az
        psq_l       fr0, 0(r3), 0, 0                // (ax, ay)
        ps_merge10  fr6, fr1, fr1                   // (by, bx)
        lfs         fr3, 8(r4)                      // bz
        ps_mul      fr4, fr1, fr2                   // (bx*az, by*az)
        ps_muls0    fr7, fr1, fr0                   // (bx*ax, by*ax)
        ps_msub     fr5, fr0, fr3, fr4              // (ax*bz - bx*az, ay*bz - by*az)
        ps_msub     fr8, fr0, fr6, fr7              // ps1 = ay*bx - by*ax
        ps_merge11  fr9, fr5, fr5                   // x result in ps0
        ps_merge01  fr10, fr5, fr8                  // (-y result, -z result)
        psq_st      fr9, 0(r5), 1, 0
        ps_neg      fr10, fr10
        psq_st      fr10, 4(r5), 0, 0
        blr
// ---------------------------------------------------------------------
// ps_guVecDotProduct
//   In:  r3 = vec1, r4 = vec2 (three floats each)
//   Out: fr1 (ps0) = x1*x2 + y1*y2 + z1*z2
// ---------------------------------------------------------------------
        .globl      ps_guVecDotProduct
ps_guVecDotProduct:
        psq_l       fr2, 4(r3), 0, 0                // (y1, z1)
        psq_l       fr3, 4(r4), 0, 0                // (y2, z2)
        ps_mul      fr2, fr2, fr3                   // (y1*y2, z1*z2)
        psq_l       fr5, 0(r3), 0, 0                // (x1, y1)
        psq_l       fr4, 0(r4), 0, 0                // (x2, y2)
        ps_madd     fr3, fr5, fr4, fr2              // ps0 = x1*x2 + y1*y2
        ps_sum0     fr1, fr3, fr2, fr2              // ps0 += z1*z2
        blr
// ---------------------------------------------------------------------
// ps_guVecMultiply
//   In:  r3 = matrix (twelve floats), r4 = source vector,
//        r5 = destination vector
//   Out: dst[i] = m[i][0]*x + m[i][1]*y + m[i][2]*z + m[i][3]
// The whole source vector is loaded before the first store to r5.
// The third operand of each ps_sum0 only feeds ps1 of the result,
// which is never stored.
// ---------------------------------------------------------------------
        .globl      ps_guVecMultiply
ps_guVecMultiply:
        psq_l       fr0, 0(r4), 0, 0                // (x, y)
        psq_l       fr2, 0(r3), 0, 0                // (m00, m01)
        psq_l       fr1, 8(r4), 1, 0                // (z, 1.0)
        ps_mul      fr4, fr2, fr0
        psq_l       fr3, 8(r3), 0, 0                // (m02, m03)
        ps_madd     fr5, fr3, fr1, fr4
        psq_l       fr8, 16(r3), 0, 0               // (m10, m11)
        ps_sum0     fr6, fr5, fr6, fr5              // ps0 = row 0 result
        psq_l       fr9, 24(r3), 0, 0               // (m12, m13)
        ps_mul      fr10, fr8, fr0
        psq_st      fr6, 0(r5), 1, 0
        ps_madd     fr11, fr9, fr1, fr10
        psq_l       fr2, 32(r3), 0, 0               // (m20, m21)
        ps_sum0     fr12, fr11, fr12, fr11          // ps0 = row 1 result
        psq_l       fr3, 40(r3), 0, 0               // (m22, m23)
        ps_mul      fr4, fr2, fr0
        psq_st      fr12, 4(r5), 1, 0
        ps_madd     fr5, fr3, fr1, fr4
        ps_sum0     fr6, fr5, fr6, fr5              // ps0 = row 2 result
        psq_st      fr6, 8(r5), 1, 0
        blr
// ---------------------------------------------------------------------
// ps_guVecMultiplySR
//   In:  r3 = mt (twelve floats), r4 = src, r5 = dst
//   Out: dst[i] = m[i][0]*x + m[i][1]*y + m[i][2]*z
// The fourth float of each matrix row only reaches ps1 of the
// multiply-add results, and only ps0 is stored, so the translation
// column does not contribute. All loads precede the first store.
// ---------------------------------------------------------------------
        .globl      ps_guVecMultiplySR
ps_guVecMultiplySR:
        psq_l       fr0, 0(r3), 0, 0                // (m00, m01)
        psq_l       fr6, 0(r4), 0, 0                // (x, y)
        psq_l       fr2, 16(r3), 0, 0               // (m10, m11)
        ps_mul      fr8, fr0, fr6                   // (m00*x, m01*y)
        psq_l       fr4, 32(r3), 0, 0               // (m20, m21)
        ps_mul      fr10, fr2, fr6                  // (m10*x, m11*y)
        psq_l       fr7, 8(r4), 1, 0                // (z, 1.0)
        ps_mul      fr12, fr4, fr6                  // (m20*x, m21*y); fr6 dead after this
        psq_l       fr3, 24(r3), 0, 0               // (m12, m13)
        ps_sum0     fr8, fr8, fr8, fr8              // ps0 = m00*x + m01*y
        psq_l       fr5, 40(r3), 0, 0               // (m22, m23)
        ps_sum0     fr10, fr10, fr10, fr10          // ps0 = m10*x + m11*y
        psq_l       fr1, 8(r3), 0, 0                // (m02, m03)
        ps_sum0     fr12, fr12, fr12, fr12          // ps0 = m20*x + m21*y
        ps_madd     fr9, fr1, fr7, fr8
        psq_st      fr9, 0(r5), 1, 0                // store X
        ps_madd     fr11, fr3, fr7, fr10
        psq_st      fr11, 4(r5), 1, 0               // store Y
        ps_madd     fr13, fr5, fr7, fr12
        psq_st      fr13, 8(r5), 1, 0               // store Z
        blr
// ---------------------------------------------------------------------
// ps_guQuatScale
//   In:  r3 = q, r4 = r (four floats each), fr1 = scale
//   Out: r = q * scale, component-wise.
// The .globl directive must name the label defined below; exporting
// the misspelt ps_quQuatScale left this routine invisible to the
// linker.
// ---------------------------------------------------------------------
        .globl      ps_guQuatScale
ps_guQuatScale:
        psq_l       fr4, 0(r3), 0, 0                // first float pair
        psq_l       fr5, 8(r3), 0, 0                // second float pair
        ps_muls0    fr4, fr4, fr1
        psq_st      fr4, 0(r4), 0, 0
        ps_muls0    fr5, fr5, fr1
        psq_st      fr5, 8(r4), 0, 0
        blr
// ---------------------------------------------------------------------
// ps_guQuatDotProduct
//   In:  r3 = p, r4 = q (four floats each)
//   Out: fr1 (ps0) = sum of the four component-wise products.
// ---------------------------------------------------------------------
        .globl      ps_guQuatDotProduct
ps_guQuatDotProduct:
        psq_l       fr2, 0(r3), 0, 0                // p[0], p[1]
        psq_l       fr4, 0(r4), 0, 0                // q[0], q[1]
        ps_mul      fr1, fr2, fr4
        psq_l       fr3, 8(r3), 0, 0                // p[2], p[3]
        psq_l       fr5, 8(r4), 0, 0                // q[2], q[3]
        ps_madd     fr1, fr3, fr5, fr1              // pairwise partial sums
        ps_sum0     fr1, fr1, fr1, fr1              // ps0 = ps0 + ps1
        blr
// ---------------------------------------------------------------------
// Small-data constants, addressed as sym@sdarel(r13) by the routines
// in this file.
// ---------------------------------------------------------------------
        .section    .sdata
        .balign     4
QuatEpsilon:                                        // not referenced in the code shown here
        .float      0.00001
Unit01:                                             // Unit01 = 0.0, Unit01+4 = 1.0
        .float      0.0
        .float      1.0
NrmData:                                            // Newton-Raphson rsqrt constants
        .float      0.5
        .float      3.0