41 //r3 = mtxA, r4 = mtxB, r5 = mtxAB
// Matrix product fragment: reads pairs of elements from mtxA (r3) and mtxB (r4),
// accumulates products into the Dxx_Dxx pairs and stores them to mtxAB (r5).
// NOTE(review): the entry label, the setup of r6 and the return instruction are not
// visible in this listing. A00_A01, B00_B01, D00_D01, UNIT01, ... are symbolic
// register names defined elsewhere; from the load offsets (0/8, 16/24, 32/40) each
// name appears to denote two adjacent elements of a 16-byte row - TODO confirm.
// Paired-single mnemonics used below (per the Gekko ISA):
//   ps_muls0  d,a,c   : d = a * c.ps0            (both slots of a scaled by slot 0 of c)
//   ps_madds0 d,a,c,b : d = a * c.ps0 + b
//   ps_madds1 d,a,c,b : d = a * c.ps1 + b
44 psq_l A00_A01,0(r3),0,0
46 psq_l B00_B01,0(r4),0,0
48 psq_l B02_B03,8(r4),0,0
52 psq_l B10_B11,16(r4),0,0
// Loads are interleaved with arithmetic; rows 0 and 1 of the result are built first.
// First term of each result row: (row 0 of B) scaled by slot 0 of the A pair.
53 ps_muls0 D00_D01,B00_B01,A00_A01
54 psq_l A10_A11,16(r3),0,0
55 ps_muls0 D02_D03,B02_B03,A00_A01
// r6 is not initialised in the visible lines; presumably it points at a constant
// pair (0.0, 1.0) given the name UNIT01 - verify against the missing setup code.
56 psq_l UNIT01,0(r6),0,0
57 ps_muls0 D10_D11,B00_B01,A10_A11
58 psq_l B12_B13,24(r4),0,0
59 ps_muls0 D12_D13,B02_B03,A10_A11
60 psq_l A02_A03,8(r3),0,0
// Second term: (row 1 of B) scaled by slot 1 of the A pair, added to the accumulator.
61 ps_madds1 D00_D01,B10_B11,A00_A01,D00_D01
62 psq_l A12_A13,24(r3),0,0
63 ps_madds1 D10_D11,B10_B11,A10_A11,D10_D11
64 psq_l B20_B21,32(r4),0,0
65 ps_madds1 D02_D03,B12_B13,A00_A01,D02_D03
66 psq_l B22_B23,40(r4),0,0
67 ps_madds1 D12_D13,B12_B13,A10_A11,D12_D13
68 psq_l A20_A21,32(r3),0,0
69 psq_l A22_A23,40(r3),0,0
// Third term: (row 2 of B) scaled by slot 0 of the second A pair of each row.
70 ps_madds0 D00_D01,B20_B21,A02_A03,D00_D01
71 ps_madds0 D02_D03,B22_B23,A02_A03,D02_D03
72 ps_madds0 D10_D11,B20_B21,A12_A13,D10_D11
73 ps_madds0 D12_D13,B22_B23,A12_A13,D12_D13
// Stores of finished pairs are interleaved with the start of result row 2.
74 psq_st D00_D01,0(r5),0,0
75 ps_muls0 D20_D21,B00_B01,A20_A21
// Last term for the second pair of a row: UNIT01 scaled by slot 1 of the A pair.
// If UNIT01 is (0.0, 1.0) this adds A's 4th-column element to slot 1 only - TODO confirm.
76 ps_madds1 D02_D03,UNIT01,A02_A03,D02_D03
77 ps_muls0 D22_D23,B02_B03,A20_A21
78 psq_st D10_D11,16(r5),0,0
79 ps_madds1 D12_D13,UNIT01,A12_A13,D12_D13
80 psq_st D02_D03,8(r5),0,0
81 ps_madds1 D20_D21,B10_B11,A20_A21,D20_D21
82 ps_madds1 D22_D23,B12_B13,A20_A21,D22_D23
83 ps_madds0 D20_D21,B20_B21,A22_A23,D20_D21
85 psq_st D12_D13,24(r5),0,0
86 ps_madds0 D22_D23,B22_B23,A22_A23,D22_D23
87 psq_st D20_D21,32(r5),0,0
88 ps_madds1 D22_D23,UNIT01,A22_A23,D22_D23
90 psq_st D22_D23,40(r5),0,0
95 .globl ps_guMtxIdentity
103 ps_merge01 fr2,fr0,fr1
104 psq_st fr0,24(r3),0,0
105 ps_merge10 fr3,fr1,fr0
106 psq_st fr0,32(r3),0,0
107 psq_st fr2,16(r3),0,0
109 psq_st fr3,40(r3),0,0
120 psq_st fr2,16(r4),0,0
122 psq_st fr3,24(r4),0,0
124 psq_st fr4,32(r4),0,0
126 psq_st fr5,40(r4),0,0
129 .globl ps_guMtxTranspose
130 //r3 = src, r4 = xpose
138 ps_merge00 fr5,fr1,fr2
140 ps_merge11 fr6,fr1,fr2
144 ps_merge00 fr7,fr3,fr4
145 psq_st fr6,16(r4),0,0
146 ps_merge00 fr5,fr1,fr0
147 psq_st fr7,32(r4),0,0
148 ps_merge10 fr6,fr1,fr0
151 psq_st fr6,24(r4),0,0
155 .globl ps_guMtxInverse
161 ps_merge10 fr6,fr1,fr0
164 ps_merge10 fr7,fr3,fr2
168 ps_merge10 fr8,fr5,fr4
169 ps_msub fr11,fr1,fr7,fr11
171 ps_msub fr13,fr3,fr8,fr13
173 ps_msub fr12,fr5,fr6,fr12
177 ps_msub fr10,fr2,fr5,fr10
179 ps_msub fr9,fr1,fr4,fr9
180 ps_madd fr7,fr2,fr12,fr7
181 ps_msub fr8,fr0,fr3,fr8
182 ps_madd fr7,fr4,fr11,fr7
191 ps_nmsub fr0,fr7,fr5,fr6
193 ps_muls0 fr13,fr13,fr0
195 ps_muls0 fr12,fr12,fr0
197 ps_muls0 fr11,fr11,fr0
198 ps_merge00 fr5,fr13,fr12
199 ps_muls0 fr10,fr10,fr0
200 ps_merge11 fr4,fr13,fr12
204 psq_st fr4,16(r4),0,0
206 ps_madd fr6,fr12,fr2,fr6
207 psq_st fr10,32(r4),1,0
208 ps_nmadd fr6,fr11,fr3,fr6
209 psq_st fr9,36(r4),1,0
211 ps_merge00 fr5,fr11,fr6
212 psq_st fr8,40(r4),1,0
213 ps_merge11 fr4,fr11,fr6
215 ps_madd fr7,fr9,fr2,fr7
216 psq_st fr4,24(r4),0,0
217 ps_nmadd fr7,fr8,fr3,fr7
219 psq_st fr7,44(r4),1,0
222 .globl ps_guMtxInvXpose
223 //r3 = src, r4 = invx
225 psq_l fr0, 0(r3), 1, 0
226 psq_l fr1, 4(r3), 0, 0
227 psq_l fr2, 16(r3), 1, 0
228 ps_merge10 fr6, fr1, fr0
229 psq_l fr3, 20(r3), 0, 0
230 psq_l fr4, 32(r3), 1, 0
231 ps_merge10 fr7, fr3, fr2
232 psq_l fr5, 36(r3), 0, 0
233 ps_mul fr11, fr3, fr6
234 ps_merge10 fr8, fr5, fr4
235 ps_mul fr13, fr5, fr7
236 ps_msub fr11, fr1, fr7, fr11
237 ps_mul fr12, fr1, fr8
238 ps_msub fr13, fr3, fr8, fr13
239 ps_msub fr12, fr5, fr6, fr12
240 ps_mul fr10, fr3, fr4
243 ps_msub fr10, fr2, fr5, fr10
244 ps_msub fr9, fr1, fr4, fr9
245 ps_msub fr8, fr0, fr3, fr8
246 ps_mul fr7, fr0, fr13
248 ps_madd fr7, fr2, fr12, fr7
249 ps_madd fr7, fr4, fr11, fr7
250 ps_cmpo0 cr0, fr7, fr1
256 psq_st fr1, 12(r4), 1, 0
259 psq_st fr1, 28(r4), 1, 0
260 ps_nmsub fr0, fr7, fr5, fr6
261 psq_st fr1, 44(r4), 1, 0
262 ps_muls0 fr13, fr13, fr0
263 ps_muls0 fr12, fr12, fr0
264 ps_muls0 fr11, fr11, fr0
265 psq_st fr13, 0(r4), 0, 0
266 psq_st fr12, 16(r4), 0, 0
267 ps_muls0 fr10, fr10, fr0
268 ps_muls0 fr9, fr9, fr0
269 psq_st fr11, 32(r4), 0, 0
270 psq_st fr10, 8(r4), 1, 0
271 ps_muls0 fr8, fr8, fr0
273 psq_st fr9, 24(r4), 1, 0
274 psq_st fr8, 40(r4), 1, 0
278 //r3 = mtx,fr1 = xS,fr2 = yS,fr3 = zS
285 psq_st fr0,12(r3),0,0
287 psq_st fr0,24(r3),0,0
288 psq_st fr0,32(r3),0,0
293 .globl ps_guMtxScaleApply
294 //r3 = src,r4 = dst,fr1 = xS,fr2 = yS,fr3 = zS
312 psq_st fr6,16(r4),0,0
314 psq_st fr7,24(r4),0,0
315 psq_st fr8,32(r4),0,0
316 psq_st fr2,40(r4),0,0
319 .globl ps_guMtxApplyScale
320 //r3 = src,r4 = dst,fr1 = xS,fr2 = yS,fr3 = zS
330 ps_merge00 fr10,fr1,fr2
331 ps_merge00 fr11,fr3,fr6
343 psq_st fr6,16(r4),0,0
345 psq_st fr7,24(r4),0,0
346 psq_st fr8,32(r4),0,0
347 psq_st fr2,40(r4),0,0
351 //r3 = mtx,fr1 = xT,fr2 = yT,fr3 = zT
361 psq_st fr4,32(r3),0,0
369 .globl ps_guMtxTransApply
370 //r3 = src,r4 = dst,fr1 = xT,fr2 = yT,fr3 = zT
379 ps_sum1 fr5,fr1,fr5,fr5
381 ps_sum1 fr7,fr2,fr7,fr7
383 ps_sum1 fr8,fr3,fr8,fr8
386 psq_st fr6,16(r4),0,0
387 psq_st fr7,24(r4),0,0
388 psq_st fr9,32(r4),0,0
389 psq_st fr8,40(r4),0,0
392 .globl ps_guMtxApplyTrans
393 //r3 = src,r4 = dst,fr1 = xT,fr2 = yT,fr3 = zT
402 ps_merge00 fr10,fr1,fr2
406 ps_merge00 fr11,fr3,fr6
408 ps_madd fr2,fr5,fr11,fr1
410 ps_sum0 fr3,fr2,fr3,fr2
414 ps_madd fr4,fr7,fr11,fr12
416 ps_sum0 fr12,fr4,fr12,fr4
417 psq_st fr3,12(r4),1,0
419 psq_st fr6,16(r4),0,0
420 ps_madd fr2,fr8,fr11,fr3
421 psq_st fr7,24(r4),1,0
422 ps_sum0 fr3,fr2,fr3,fr2
423 psq_st fr12,28(r4),1,0
424 psq_st fr9,32(r4),0,0
425 psq_st fr8,40(r4),1,0
426 psq_st fr3,44(r4),1,0
429 .globl ps_guMtxRotTrig
430 //r3 = mt,r4 = axis,fr1 = sinA,fr2 = cosA
450 ps_merge00 fr6,fr1,fr2
451 psq_st fr3,12(r3),0,0
452 ps_merge00 fr7,fr2,fr5
453 psq_st fr3,28(r3),0,0
454 psq_st fr3,44(r3),1,0
455 psq_st fr6,36(r3),0,0
456 psq_st fr7,20(r3),0,0
459 ps_merge00 fr6,fr2,fr3
460 ps_merge00 fr7,fr3,fr4
461 psq_st fr3,24(r3),0,0
463 ps_merge00 fr8,fr5,fr3
464 ps_merge00 fr9,fr1,fr3
465 psq_st fr6,40(r3),0,0
466 psq_st fr7,16(r3),0,0
468 psq_st fr8,32(r3),0,0
472 ps_merge00 fr6,fr1,fr2
473 ps_merge00 fr8,fr2,fr5
474 psq_st fr3,24(r3),0,0
475 psq_st fr3,32(r3),0,0
476 ps_merge00 fr7,fr4,fr3
477 psq_st fr6,16(r3),0,0
479 psq_st fr7,40(r3),0,0
483 .globl __ps_guMtxRotAxisRadInternal
484 //r3 = mtx, r4 = vec, fr1 = sT, fr2 = cT
485 __ps_guMtxRotAxisRadInternal:
498 ps_madd fr6,fr4,fr4,fr5
500 ps_sum0 fr7,fr6,fr4,fr5
505 fnmsubs fr5,fr5,fr7,fr11
507 ps_merge00 fr2,fr2,fr2
510 ps_muls0 fr7,fr3,fr13
511 ps_muls0 fr12,fr3,fr1
512 ps_muls0 fr8,fr4,fr13
516 fnmsubs fr9,fr4,fr1,fr6
517 fmadds fr10,fr4,fr1,fr6
519 ps_sum0 fr11,fr7,fr14,fr12
520 ps_sum0 fr5,fr5,fr9,fr2
521 ps_sum1 fr6,fr2,fr10,fr6
522 ps_sum0 fr9,fr3,fr14,fr7
523 psq_st fr11,8(r3),0,0
524 ps_sum0 fr3,fr7,fr7,fr3
527 psq_st fr6,16(r3),0,0
528 ps_sum1 fr7,fr12,fr3,fr7
529 psq_st fr9,24(r3),0,0
530 ps_sum0 fr8,fr8,fr14,fr2
531 psq_st fr7,32(r3),0,0
532 psq_st fr8,40(r3),0,0
537 .globl ps_guMtxReflect
538 //r3 = mtx,r4 = vec1,r5 = vec2
546 ps_nmadd fr5,fr1,fr0,fr1
548 ps_nmadd fr6,fr2,fr0,fr2
552 ps_sum0 fr8,fr8,fr8,fr8
553 ps_muls1 fr10,fr2,fr6
554 psq_st fr7,32(r3),0,0
555 ps_sum0 fr2,fr2,fr2,fr0
556 ps_nmadd fr8,fr5,fr4,fr8
557 ps_sum1 fr10,fr0,fr10,fr10
559 ps_muls0 fr11,fr2,fr8
560 ps_merge00 fr12,fr5,fr8
561 psq_st fr10,16(r3),0,0
562 ps_merge00 fr13,fr7,fr11
563 ps_muls0 fr12,fr12,fr1
564 ps_merge11 fr11,fr7,fr11
565 psq_st fr13,8(r3),0,0
566 ps_sum0 fr12,fr12,fr12,fr0
567 psq_st fr11,24(r3),0,0
568 psq_st fr12,40(r3),0,0
572 //r3 = v1,r4 = v2,r5 = dst
// Element-wise vector add: dst = v1 + v2, done as an XY pair plus a separate Z.
// NOTE(review): the entry label, the loads of V1_Z / V2_Z and the return are not
// visible in this listing; V1_XY, V2_XY, D1_XY, V1_Z, V2_Z, D1_Z are symbolic
// register names defined elsewhere.
574 psq_l V1_XY,0(r3),0,0
575 psq_l V2_XY,0(r4),0,0
// Both slots are added at once, then written back as a pair at offset 0 of dst.
576 ps_add D1_XY,V1_XY,V2_XY
577 psq_st D1_XY,0(r5),0,0
580 ps_add D1_Z,V1_Z,V2_Z
// W = 1: only slot 0 (the Z sum) is stored, at offset 8 of dst.
581 psq_st D1_Z,8(r5),1,0
585 //r3 = v1,r4 = v2,r5 = dst
// Element-wise vector subtract: dst = v1 - v2, done as an XY pair plus a separate Z.
// NOTE(review): the entry label, the loads of V1_Z / V2_Z and the return are not
// visible in this listing; V1_XY, V2_XY, D1_XY, V1_Z, V2_Z, D1_Z are symbolic
// register names defined elsewhere.
587 psq_l V1_XY,0(r3),0,0
588 psq_l V2_XY,0(r4),0,0
// Both slots are subtracted at once, then written back as a pair at offset 0 of dst.
589 ps_sub D1_XY,V1_XY,V2_XY
590 psq_st D1_XY,0(r5),0,0
593 ps_sub D1_Z,V1_Z,V2_Z
// W = 1: only slot 0 (the Z difference) is stored, at offset 8 of dst.
594 psq_st D1_Z,8(r5),1,0
598 //r3 = src,r4 = dst,fr1 = S
608 .globl ps_guVecNormalize
618 ps_madd fr5,fr3,fr3,fr4
619 ps_sum0 fr6,fr5,fr3,fr4
623 fnmsubs fr8,fr8,fr6,fr1
632 //r3 = v1,r4 = v2,r5 = v12
637 ps_merge10 fr6,fr1,fr1
641 ps_msub fr5,fr0,fr3,fr4
642 ps_msub fr8,fr0,fr6,fr7
643 ps_merge11 fr9,fr5,fr5
644 ps_merge01 fr10,fr5,fr8
647 psq_st fr10,4(r5),0,0
650 .globl ps_guVecDotProduct
651 //r3 = vec1,r4 = vec2
658 ps_madd fr3,fr5,fr4,fr2
659 ps_sum0 fr1,fr3,fr2,fr2
662 .globl ps_guVecMultiply
669 ps_madd fr5,fr3,fr1,fr4
671 ps_sum0 fr6,fr5,fr6,fr5
675 ps_madd fr11,fr9,fr1,fr10
677 ps_sum0 fr12,fr11,fr12,fr11
680 psq_st fr12,4(r5),1,0
681 ps_madd fr5,fr3,fr1,fr4
682 ps_sum0 fr6,fr5,fr6,fr5
686 .globl ps_guVecMultiplySR
687 // r3 = mt, r4 = src, r5 = dst
// Multiplies the vector at src by the matrix at mt and writes three scalars (X, Y, Z)
// to dst. Each result is a row's XY partial products summed horizontally, plus the
// row's third element times src z.
// NOTE(review): the entry label, the load of fr6 (presumably src x,y), the products
// that initialise fr8 and fr10, and the return are not visible in this listing.
// Although the second element of each "m[r][2], m[r][3]" pair is loaded, only slot 0
// of each result is stored, so it does not reach dst in the visible code.
689 psq_l fr0,0(r3),0,0 // m[0][0], m[0][1] GQR0 = 0
692 psq_l fr2,16(r3),0,0 // m[1][0], m[1][1]
693 // fp8 = m00x m01y // next X
695 psq_l fr4,32(r3),0,0 // m[2][0], m[2][1]
696 // fp10 = m10x m11y // next Y
698 psq_l fr7,8(r4),1,0 // fp7 - z,1.0
699 // fp12 = m20x m21y // next Z
700 ps_mul fr12,fr4,fr6 // YYY last FP6 usage
701 psq_l fr3,24(r3),0,0 // m[1][2], m[1][3]
// ps_sum0 d,a,c,b gives d.ps0 = a.ps0 + b.ps1; with all operands equal this is a
// horizontal add of the two partial products into slot 0.
702 ps_sum0 fr8,fr8,fr8,fr8
703 psq_l fr5,40(r3),0,0 // m[2][2], m[2][3]
704 ps_sum0 fr10,fr10,fr10,fr10
705 psq_l fr1,8(r3),0,0 // m[0][2], m[0][3]
706 ps_sum0 fr12,fr12,fr12,fr12
// ps_madd d,a,c,b gives d = a*c + b; slot 0 becomes m[r][2]*z + (summed XY terms).
// Only slot 0 is written out because the stores below use W = 1.
707 ps_madd fr9,fr1,fr7,fr8
708 psq_st fr9,0(r5),1,0 // store X
709 ps_madd fr11,fr3,fr7,fr10
710 psq_st fr11,4(r5),1,0 // store Y
711 ps_madd fr13,fr5,fr7,fr12
712 psq_st fr13,8(r5),1,0 // store Z
715 .globl ps_quQuatScale
716 //r3 = q,r4 = r, fr1 = scale
726 .globl ps_guQuatDotProduct
727 //r3 = p, r4 = q ; fr1 = res
734 ps_madd fr1,fr3,fr5,fr1
735 ps_sum0 fr1,fr1,fr1,fr1