41 //r3 = mtxA, r4 = mtxB, r5 = mtxAB
// Multiplies the matrix at r3 by the matrix at r4 and stores the product at r5.
// Each matrix is accessed as three rows of two register pairs, at byte offsets
// 0/8, 16/24 and 32/40. For every output row i the visible arithmetic is:
//   Di0_Di1 = B00_B01*Ai0 + B10_B11*Ai1 + B20_B21*Ai2
//   Di2_Di3 = B02_B03*Ai0 + B12_B13*Ai1 + B22_B23*Ai2 + UNIT01*Ai3
// Loads, multiplies and stores are interleaved rather than grouped.
// NOTE(review): assumes GQR0 selects unquantized 32-bit floats and that Unit01
// holds {0.0, 1.0}; both are defined outside this excerpt - confirm.
// NOTE(review): the entry label, any register save/restore and the return are
// not visible in this excerpt.
44 psq_l A00_A01,0(r3),0,0
46 psq_l B00_B01,0(r4),0,0
47 psq_l B02_B03,8(r4),0,0
50 psq_l B10_B11,16(r4),0,0
51 ps_muls0 D00_D01,B00_B01,A00_A01 // D00_D01 = B00_B01 * A00
52 psq_l A10_A11,16(r3),0,0
53 ps_muls0 D02_D03,B02_B03,A00_A01 // D02_D03 = B02_B03 * A00
54 psq_l UNIT01,Unit01@sdarel(r13),0,0 // constant pair from the small data area
55 ps_muls0 D10_D11,B00_B01,A10_A11 // D10_D11 = B00_B01 * A10
56 psq_l B12_B13,24(r4),0,0
57 ps_muls0 D12_D13,B02_B03,A10_A11 // D12_D13 = B02_B03 * A10
58 psq_l A02_A03,8(r3),0,0
59 ps_madds1 D00_D01,B10_B11,A00_A01,D00_D01 // += B10_B11 * A01
60 psq_l A12_A13,24(r3),0,0
61 ps_madds1 D10_D11,B10_B11,A10_A11,D10_D11 // += B10_B11 * A11
62 psq_l B20_B21,32(r4),0,0
63 ps_madds1 D02_D03,B12_B13,A00_A01,D02_D03 // += B12_B13 * A01
64 psq_l B22_B23,40(r4),0,0
65 ps_madds1 D12_D13,B12_B13,A10_A11,D12_D13 // += B12_B13 * A11
66 psq_l A20_A21,32(r3),0,0
67 psq_l A22_A23,40(r3),0,0
68 ps_madds0 D00_D01,B20_B21,A02_A03,D00_D01 // += B20_B21 * A02; D00_D01 complete
69 ps_madds0 D02_D03,B22_B23,A02_A03,D02_D03 // += B22_B23 * A02
70 ps_madds0 D10_D11,B20_B21,A12_A13,D10_D11 // += B20_B21 * A12; D10_D11 complete
71 ps_madds0 D12_D13,B22_B23,A12_A13,D12_D13 // += B22_B23 * A12
72 psq_st D00_D01,0(r5),0,0
73 ps_muls0 D20_D21,B00_B01,A20_A21 // D20_D21 = B00_B01 * A20
74 ps_madds1 D02_D03,UNIT01,A02_A03,D02_D03 // += UNIT01 * A03; D02_D03 complete
75 ps_muls0 D22_D23,B02_B03,A20_A21 // D22_D23 = B02_B03 * A20
76 psq_st D10_D11,16(r5),0,0
77 ps_madds1 D12_D13,UNIT01,A12_A13,D12_D13 // += UNIT01 * A13; D12_D13 complete
78 psq_st D02_D03,8(r5),0,0
79 ps_madds1 D20_D21,B10_B11,A20_A21,D20_D21 // += B10_B11 * A21
80 ps_madds1 D22_D23,B12_B13,A20_A21,D22_D23 // += B12_B13 * A21
81 ps_madds0 D20_D21,B20_B21,A22_A23,D20_D21 // += B20_B21 * A22; D20_D21 complete
83 psq_st D12_D13,24(r5),0,0
84 ps_madds0 D22_D23,B22_B23,A22_A23,D22_D23 // += B22_B23 * A22
85 psq_st D20_D21,32(r5),0,0
86 ps_madds1 D22_D23,UNIT01,A22_A23,D22_D23 // += UNIT01 * A23; D22_D23 complete
88 psq_st D22_D23,40(r5),0,0
93 .globl ps_guMtxIdentity
// Fills the matrix at r3 with pairs built from the constants at Unit01 and
// Unit01+4.
// NOTE(review): presumably Unit01 = {0.0, 1.0} and lfs fills both halves of the
// target register (paired-single mode); under that assumption fr0 = (0,0),
// fr2 = (0,1) and fr3 = (1,0). Confirm against the data definition.
// NOTE(review): the entry label, the return, and any stores to offsets 0 and 8
// are not visible in this excerpt (original lines 98 and 104 are missing).
96 lfs fr0,Unit01@sdarel(r13)
97 lfs fr1,Unit01+4@sdarel(r13)
99 ps_merge01 fr2,fr0,fr1 // fr2 = (fr0.ps0, fr1.ps1)
100 psq_st fr0,24(r3),0,0
101 ps_merge10 fr3,fr1,fr0 // fr3 = (fr1.ps1, fr0.ps0)
102 psq_st fr0,32(r3),0,0
103 psq_st fr2,16(r3),0,0
105 psq_st fr3,40(r3),0,0
116 psq_st fr2,16(r4),0,0
118 psq_st fr3,24(r4),0,0
120 psq_st fr4,32(r4),0,0
122 psq_st fr5,40(r4),0,0
125 .globl ps_guMtxTranspose
126 //r3 = src, r4 = xpose
// Recombines halves of register pairs with ps_merge* and stores them to xpose.
// NOTE(review): the loads from src, the entry label, several stores and the
// return are not visible in this excerpt, so the contents of fr1-fr4 at each
// merge cannot be stated from here.
128 lfs fr0,Unit01@sdarel(r13)
132 ps_merge00 fr5,fr1,fr2 // fr5 = (fr1.ps0, fr2.ps0)
134 ps_merge11 fr6,fr1,fr2 // fr6 = (fr1.ps1, fr2.ps1)
138 ps_merge00 fr7,fr3,fr4 // fr7 = (fr3.ps0, fr4.ps0)
139 psq_st fr6,16(r4),0,0
140 ps_merge00 fr5,fr1,fr0 // second half of the pair comes from the Unit01 constant
141 psq_st fr7,32(r4),0,0
142 ps_merge10 fr6,fr1,fr0 // fr6 = (fr1.ps1, fr0.ps0)
145 psq_st fr6,24(r4),0,0
149 .globl ps_guMtxInverse
// The visible merge/msub/madd sequence on fr0-fr13 matches the one in
// ps_guMtxInvXpose (cofactor pairs in fr11-fr13, accumulated sum in fr7),
// followed by scaling with fr0.ps0 and stores to the matrix at r4.
// Unlike ps_guMtxInvXpose, the scaled pairs are recombined with ps_merge00 /
// ps_merge11 before being stored, and offsets 12/28/44 are computed with
// madd/nmadd chains rather than stored from a single register.
// NOTE(review): the register-argument comment, entry label, source loads, the
// initial ps_mul set-up, the determinant test, some stores and the return are
// not visible in this excerpt.
155 ps_merge10 fr6,fr1,fr0 // fr6 = (fr1.ps1, fr0.ps0)
158 ps_merge10 fr7,fr3,fr2 // fr7 = (fr3.ps1, fr2.ps0)
162 ps_merge10 fr8,fr5,fr4 // fr8 = (fr5.ps1, fr4.ps0)
163 ps_msub fr11,fr1,fr7,fr11 // fr11 = fr1*fr7 - fr11
165 ps_msub fr13,fr3,fr8,fr13 // fr13 = fr3*fr8 - fr13
167 ps_msub fr12,fr5,fr6,fr12 // fr12 = fr5*fr6 - fr12
171 ps_msub fr10,fr2,fr5,fr10
173 ps_msub fr9,fr1,fr4,fr9
174 ps_madd fr7,fr2,fr12,fr7
175 ps_msub fr8,fr0,fr3,fr8
176 ps_madd fr7,fr4,fr11,fr7
185 ps_nmsub fr0,fr7,fr5,fr6 // fr0 = fr6 - fr7*fr5
187 ps_muls0 fr13,fr13,fr0 // scale both halves by fr0.ps0
189 ps_muls0 fr12,fr12,fr0
191 ps_muls0 fr11,fr11,fr0
192 ps_merge00 fr5,fr13,fr12 // fr5 = (fr13.ps0, fr12.ps0)
193 ps_muls0 fr10,fr10,fr0
194 ps_merge11 fr4,fr13,fr12 // fr4 = (fr13.ps1, fr12.ps1)
198 psq_st fr4,16(r4),0,0
200 ps_madd fr6,fr12,fr2,fr6
201 psq_st fr10,32(r4),1,0 // W=1: only ps0 is written
202 ps_nmadd fr6,fr11,fr3,fr6 // fr6 = -(fr11*fr3 + fr6)
203 psq_st fr9,36(r4),1,0
205 ps_merge00 fr5,fr11,fr6
206 psq_st fr8,40(r4),1,0
207 ps_merge11 fr4,fr11,fr6
209 ps_madd fr7,fr9,fr2,fr7
210 psq_st fr4,24(r4),0,0
211 ps_nmadd fr7,fr8,fr3,fr7 // fr7 = -(fr8*fr3 + fr7)
213 psq_st fr7,44(r4),1,0
216 .globl ps_guMtxInvXpose
217 //r3 = src, r4 = invx
// Below, m[r][c] denotes the value at byte offset 16*r + 4*c of src (assumes
// GQR0 selects unquantized 32-bit floats - TODO confirm).
// Visible steps:
//   1. load the 3x3 part of src as (m[r][0], 1.0) and (m[r][1], m[r][2]) pairs;
//   2. form cofactor pairs in fr11-fr13 and single cofactors in ps0 of fr8-fr10;
//   3. accumulate m00*fr13 + m10*fr12 + m20*fr11 in fr7 (ps0 = determinant);
//   4. scale every cofactor by fr0.ps0 and store them row by row to invx,
//      writing fr1.ps0 to the fourth slot of each row (offsets 12, 28, 44).
// NOTE(review): the entry label, the initial ps_mul for fr9/fr8, the handling of
// the compare result, the set-up of fr5/fr6 for ps_nmsub and the return are not
// visible in this excerpt.
219 psq_l fr0, 0(r3), 1, 0 // fr0 = (m00, 1.0)
220 psq_l fr1, 4(r3), 0, 0 // fr1 = (m01, m02)
221 psq_l fr2, 16(r3), 1, 0 // fr2 = (m10, 1.0)
222 ps_merge10 fr6, fr1, fr0 // fr6 = (m02, m00)
223 psq_l fr3, 20(r3), 0, 0 // fr3 = (m11, m12)
224 psq_l fr4, 32(r3), 1, 0 // fr4 = (m20, 1.0)
225 ps_merge10 fr7, fr3, fr2 // fr7 = (m12, m10)
226 psq_l fr5, 36(r3), 0, 0 // fr5 = (m21, m22)
227 ps_mul fr11, fr3, fr6
228 ps_merge10 fr8, fr5, fr4 // fr8 = (m22, m20)
229 ps_mul fr13, fr5, fr7
230 ps_msub fr11, fr1, fr7, fr11 // fr11 = (m01*m12 - m11*m02, m02*m10 - m12*m00)
231 ps_mul fr12, fr1, fr8
232 ps_msub fr13, fr3, fr8, fr13 // fr13 = (m11*m22 - m21*m12, m12*m20 - m22*m10)
233 ps_msub fr12, fr5, fr6, fr12 // fr12 = (m21*m02 - m01*m22, m22*m00 - m02*m20)
234 ps_mul fr10, fr3, fr4
237 ps_msub fr10, fr2, fr5, fr10 // fr10.ps0 = m10*m21 - m11*m20
238 ps_msub fr9, fr1, fr4, fr9 // fr9.ps0 = m01*m20 - (prior fr9.ps0, set on a line not shown)
239 ps_msub fr8, fr0, fr3, fr8 // fr8.ps0 = m00*m11 - (prior fr8.ps0)
240 ps_mul fr7, fr0, fr13
242 ps_madd fr7, fr2, fr12, fr7
243 ps_madd fr7, fr4, fr11, fr7 // fr7.ps0 = m00*fr13.ps0 + m10*fr12.ps0 + m20*fr11.ps0
// NOTE(review): in the visible lines fr1 still holds (m01, m02); original line
// 241 (not shown) presumably rewrites fr1 before this compare - confirm.
244 ps_cmpo0 cr0, fr7, fr1
250 psq_st fr1, 12(r4), 1, 0
253 psq_st fr1, 28(r4), 1, 0
254 ps_nmsub fr0, fr7, fr5, fr6 // fr0 = fr6 - fr7*fr5
255 psq_st fr1, 44(r4), 1, 0
256 ps_muls0 fr13, fr13, fr0 // scale both halves by fr0.ps0
257 ps_muls0 fr12, fr12, fr0
258 ps_muls0 fr11, fr11, fr0
259 psq_st fr13, 0(r4), 0, 0 // output row 0, slots 0-1
260 psq_st fr12, 16(r4), 0, 0 // output row 1, slots 0-1
261 ps_muls0 fr10, fr10, fr0
262 ps_muls0 fr9, fr9, fr0
263 psq_st fr11, 32(r4), 0, 0 // output row 2, slots 0-1
264 psq_st fr10, 8(r4), 1, 0 // output row 0, slot 2 (ps0 only)
265 ps_muls0 fr8, fr8, fr0
267 psq_st fr9, 24(r4), 1, 0 // output row 1, slot 2
268 psq_st fr8, 40(r4), 1, 0 // output row 2, slot 2
272 //r3 = mtx,fr1 = xS,fr2 = yS,fr3 = zS
// Visible part: writes the pair in fr0 (loaded from Unit01) to byte offsets
// 12, 24 and 32 of mtx.
// NOTE(review): the entry label, the stores of the scale factors and the return
// are not visible in this excerpt. Presumably Unit01 holds 0.0 - confirm.
274 lfs fr0,Unit01@sdarel(r13)
277 psq_st fr0,12(r3),0,0
279 psq_st fr0,24(r3),0,0
280 psq_st fr0,32(r3),0,0
285 .globl ps_guMtxScaleApply
286 //r3 = src,r4 = dst,fr1 = xS,fr2 = yS,fr3 = zS
// Visible part: stores four register pairs to dst at byte offsets 16-40.
// NOTE(review): the entry label, the loads from src, the multiplies that produce
// fr6/fr7/fr8/fr2, the stores to offsets 0 and 8, and the return are not visible
// in this excerpt.
304 psq_st fr6,16(r4),0,0
306 psq_st fr7,24(r4),0,0
307 psq_st fr8,32(r4),0,0
308 psq_st fr2,40(r4),0,0
311 .globl ps_guMtxApplyScale
312 //r3 = src,r4 = dst,fr1 = xS,fr2 = yS,fr3 = zS
// Visible part: packs the scale factors into two pairs and stores four result
// pairs to dst at byte offsets 16-40.
// NOTE(review): the entry label, the loads from src, the multiplies by
// fr10/fr11, the stores to offsets 0 and 8, and the return are not visible in
// this excerpt. Presumably Unit01+4 holds 1.0 - confirm.
314 lfs fr6,Unit01+4@sdarel(r13)
320 ps_merge00 fr10,fr1,fr2 // fr10 = (xS, yS)
321 ps_merge00 fr11,fr3,fr6 // fr11 = (zS, value loaded from Unit01+4)
333 psq_st fr6,16(r4),0,0
335 psq_st fr7,24(r4),0,0
336 psq_st fr8,32(r4),0,0
337 psq_st fr2,40(r4),0,0
341 //r3 = mtx,fr1 = xT,fr2 = yT,fr3 = zT
// Visible part: loads the constants at Unit01 and Unit01+4 and writes the
// Unit01 pair to byte offset 32 of mtx.
// NOTE(review): the entry label, every other store (including those of
// xT/yT/zT) and the return are not visible in this excerpt.
343 lfs fr4,Unit01@sdarel(r13)
344 lfs fr5,Unit01+4@sdarel(r13)
349 psq_st fr4,32(r3),0,0
357 .globl ps_guMtxTransApply
358 //r3 = src,r4 = dst,fr1 = xT,fr2 = yT,fr3 = zT
// Visible part: adds xT, yT and zT to the second half (ps1) of fr5, fr7 and fr8
// respectively, leaving their first halves unchanged, then stores pairs to dst.
// fr7 and fr8 are stored at offsets 24 and 40.
// NOTE(review): the entry label, the loads from src, the stores to offsets 0 and
// 8, and the return are not visible in this excerpt.
367 ps_sum1 fr5,fr1,fr5,fr5 // fr5 = (fr5.ps0, xT + fr5.ps1)
369 ps_sum1 fr7,fr2,fr7,fr7 // fr7 = (fr7.ps0, yT + fr7.ps1)
371 ps_sum1 fr8,fr3,fr8,fr8 // fr8 = (fr8.ps0, zT + fr8.ps1)
374 psq_st fr6,16(r4),0,0
375 psq_st fr7,24(r4),0,0
376 psq_st fr9,32(r4),0,0
377 psq_st fr8,40(r4),0,0
380 .globl ps_guMtxApplyTrans
381 //r3 = src,r4 = dst,fr1 = xT,fr2 = yT,fr3 = zT
// Visible part: packs the translation as (xT, yT) and (zT, constant), then for
// each row runs a multiply-add by fr11 followed by ps_sum0, which adds the two
// halves of the result. The three sums are stored as single values at byte
// offsets 12, 28 and 44 of dst; other slots are stored from other registers.
// NOTE(review): the entry label, the loads from src, the ps_mul by fr10 that
// presumably seeds each madd addend, some stores and the return are not visible
// in this excerpt. Presumably Unit01+4 holds 1.0 - confirm.
383 lfs fr6,Unit01+4@sdarel(r13)
388 ps_merge00 fr10,fr1,fr2 // fr10 = (xT, yT)
392 ps_merge00 fr11,fr3,fr6 // fr11 = (zT, value loaded from Unit01+4)
394 ps_madd fr2,fr5,fr11,fr1 // fr2 = fr5*fr11 + fr1
396 ps_sum0 fr3,fr2,fr3,fr2 // fr3.ps0 = fr2.ps0 + fr2.ps1
400 ps_madd fr4,fr7,fr11,fr12 // fr4 = fr7*fr11 + fr12
402 ps_sum0 fr12,fr4,fr12,fr4 // fr12.ps0 = fr4.ps0 + fr4.ps1
403 psq_st fr3,12(r4),1,0 // W=1: only ps0 is written
405 psq_st fr6,16(r4),0,0
406 ps_madd fr2,fr8,fr11,fr3 // fr2 = fr8*fr11 + fr3
407 psq_st fr7,24(r4),1,0
408 ps_sum0 fr3,fr2,fr3,fr2 // fr3.ps0 = fr2.ps0 + fr2.ps1
409 psq_st fr12,28(r4),1,0
410 psq_st fr9,32(r4),0,0
411 psq_st fr8,40(r4),1,0
412 psq_st fr3,44(r4),1,0
415 .globl ps_guMtxRotTrig
416 //r3 = mt,r4 = axis,fr1 = sinA,fr2 = cosA
// Visible part: loads the Unit01 / Unit01+4 constants into fr3 / fr4, then three
// separate merge-and-store sequences write pairs made of sinA, cosA, fr5 and
// those constants into mt.
// NOTE(review): the entry label, the dispatch on axis, the branch labels, the
// instruction that sets fr5, several stores and the returns are not visible in
// this excerpt; the three sequences presumably correspond to the three axes -
// confirm. Presumably Unit01 = {0.0, 1.0}.
419 lfs fr3,Unit01@sdarel(r13)
421 lfs fr4,Unit01+4@sdarel(r13)
// --- first visible store sequence ---
434 ps_merge00 fr6,fr1,fr2 // fr6 = (sinA, cosA)
435 psq_st fr3,12(r3),0,0
436 ps_merge00 fr7,fr2,fr5 // fr7 = (cosA, fr5.ps0)
437 psq_st fr3,28(r3),0,0
438 psq_st fr3,44(r3),1,0 // W=1: only ps0 is written
439 psq_st fr6,36(r3),0,0
440 psq_st fr7,20(r3),0,0
// --- second visible store sequence ---
443 ps_merge00 fr6,fr2,fr3 // fr6 = (cosA, fr3.ps0)
444 ps_merge00 fr7,fr3,fr4 // fr7 = (fr3.ps0, fr4.ps0)
445 psq_st fr3,24(r3),0,0
447 ps_merge00 fr8,fr5,fr3 // fr8 = (fr5.ps0, fr3.ps0)
448 ps_merge00 fr9,fr1,fr3 // fr9 = (sinA, fr3.ps0)
449 psq_st fr6,40(r3),0,0
450 psq_st fr7,16(r3),0,0
452 psq_st fr8,32(r3),0,0
// --- third visible store sequence ---
456 ps_merge00 fr6,fr1,fr2 // fr6 = (sinA, cosA)
457 ps_merge00 fr8,fr2,fr5 // fr8 = (cosA, fr5.ps0)
458 psq_st fr3,24(r3),0,0
459 psq_st fr3,32(r3),0,0
460 ps_merge00 fr7,fr4,fr3 // fr7 = (fr4.ps0, fr3.ps0)
461 psq_st fr6,16(r3),0,0
463 psq_st fr7,40(r3),0,0
467 .globl __ps_guMtxRotAxisRadInternal
468 //r3 = mtx, r4 = vec, fr1 = sT, fr2 = cT
// Visible part: a sum-of-squares step followed by fnmsubs against a constant
// from NrmData+4, then products of the vector components with fr13 and sT are
// combined with ps_sum0/ps_sum1 and stored to mtx at byte offsets 8-40.
// The ps_madd / ps_sum0 / fnmsubs shape is the same as in ps_guVecNormalize.
// NOTE(review): the loads from vec, the set-up of fr13 and fr14, the reciprocal
// square-root estimate that presumably precedes fnmsubs, any register
// save/restore, the store to offset 0 and the return are not visible in this
// excerpt. The contents of NrmData are defined elsewhere.
469 __ps_guMtxRotAxisRadInternal:
475 lfs fr11,NrmData+4@sdarel(r13)
476 lfs fr12,NrmData@sdarel(r13)
480 ps_madd fr6,fr4,fr4,fr5 // fr6 = fr4*fr4 + fr5
482 ps_sum0 fr7,fr6,fr4,fr5 // fr7.ps0 = fr6.ps0 + fr5.ps1
487 fnmsubs fr5,fr5,fr7,fr11 // fr5 = fr11 - fr5*fr7
489 ps_merge00 fr2,fr2,fr2 // duplicate cT into both halves
492 ps_muls0 fr7,fr3,fr13 // fr7 = fr3 * fr13.ps0
493 ps_muls0 fr12,fr3,fr1 // fr12 = fr3 * sT
494 ps_muls0 fr8,fr4,fr13 // fr8 = fr4 * fr13.ps0
498 fnmsubs fr9,fr4,fr1,fr6 // fr9 = fr6 - fr4*sT
499 fmadds fr10,fr4,fr1,fr6 // fr10 = fr4*sT + fr6
501 ps_sum0 fr11,fr7,fr14,fr12 // fr11 = (fr7.ps0 + fr12.ps1, fr14.ps1)
502 ps_sum0 fr5,fr5,fr9,fr2
503 ps_sum1 fr6,fr2,fr10,fr6 // fr6 = (fr10.ps0, fr2.ps0 + fr6.ps1)
504 ps_sum0 fr9,fr3,fr14,fr7
505 psq_st fr11,8(r3),0,0
506 ps_sum0 fr3,fr7,fr7,fr3
509 psq_st fr6,16(r3),0,0
510 ps_sum1 fr7,fr12,fr3,fr7 // fr7 = (fr3.ps0, fr12.ps0 + fr7.ps1)
511 psq_st fr9,24(r3),0,0
512 ps_sum0 fr8,fr8,fr14,fr2
513 psq_st fr7,32(r3),0,0
514 psq_st fr8,40(r3),0,0
519 .globl ps_guMtxReflect
520 //r3 = mtx,r4 = vec1,r5 = vec2
// Visible part: combines values in fr1-fr8 with the constant from Unit01+4
// using nmadd / sum / scalar-multiply steps and stores pairs to mtx at byte
// offsets 8-40.
// NOTE(review): the entry label, the loads from vec1 and vec2, several
// intermediate instructions, the store to offset 0 and the return are not
// visible in this excerpt, so what fr1-fr8 hold cannot be stated from here.
// Presumably Unit01+4 holds 1.0 - confirm.
522 lfs fr0,Unit01+4@sdarel(r13)
526 ps_nmadd fr5,fr1,fr0,fr1 // fr5 = -(fr1*fr0 + fr1)
528 ps_nmadd fr6,fr2,fr0,fr2 // fr6 = -(fr2*fr0 + fr2)
532 ps_sum0 fr8,fr8,fr8,fr8 // fr8.ps0 = fr8.ps0 + fr8.ps1
533 ps_muls1 fr10,fr2,fr6 // fr10 = fr2 * fr6.ps1
534 psq_st fr7,32(r3),0,0
535 ps_sum0 fr2,fr2,fr2,fr0 // fr2.ps0 = fr2.ps0 + fr0.ps1
536 ps_nmadd fr8,fr5,fr4,fr8 // fr8 = -(fr5*fr4 + fr8)
537 ps_sum1 fr10,fr0,fr10,fr10 // fr10.ps1 = fr0.ps0 + fr10.ps1
539 ps_muls0 fr11,fr2,fr8 // fr11 = fr2 * fr8.ps0
540 ps_merge00 fr12,fr5,fr8 // fr12 = (fr5.ps0, fr8.ps0)
541 psq_st fr10,16(r3),0,0
542 ps_merge00 fr13,fr7,fr11 // fr13 = (fr7.ps0, fr11.ps0)
543 ps_muls0 fr12,fr12,fr1 // fr12 = fr12 * fr1.ps0
544 ps_merge11 fr11,fr7,fr11 // fr11 = (fr7.ps1, fr11.ps1)
545 psq_st fr13,8(r3),0,0
546 ps_sum0 fr12,fr12,fr12,fr0 // fr12.ps0 = fr12.ps0 + fr0.ps1
547 psq_st fr11,24(r3),0,0
548 psq_st fr12,40(r3),0,0
552 //r3 = v1,r4 = v2,r5 = dst
// Component-wise sum dst = v1 + v2: the first two components are handled as one
// pair, the third as a single value.
// NOTE(review): the entry label, the loads of V1_Z / V2_Z (original lines
// 558-559) and the return are not visible in this excerpt.
554 psq_l V1_XY,0(r3),0,0
555 psq_l V2_XY,0(r4),0,0
556 ps_add D1_XY,V1_XY,V2_XY
557 psq_st D1_XY,0(r5),0,0
560 ps_add D1_Z,V1_Z,V2_Z
561 psq_st D1_Z,8(r5),1,0 // W=1: only ps0 is written
565 //r3 = v1,r4 = v2,r5 = dst
// Component-wise difference dst = v1 - v2: the first two components are handled
// as one pair, the third as a single value.
// NOTE(review): the entry label, the loads of V1_Z / V2_Z (original lines
// 571-572) and the return are not visible in this excerpt.
567 psq_l V1_XY,0(r3),0,0
568 psq_l V2_XY,0(r4),0,0
569 ps_sub D1_XY,V1_XY,V2_XY
570 psq_st D1_XY,0(r5),0,0
573 ps_sub D1_Z,V1_Z,V2_Z
574 psq_st D1_Z,8(r5),1,0 // W=1: only ps0 is written
578 //r3 = src,r4 = dst,fr1 = S
588 .globl ps_guVecNormalize
// Visible part: loads two constants from NrmData, forms a sum of products in
// fr6.ps0, and applies fnmsubs against the NrmData+4 constant.
// NOTE(review): the register-argument comment, entry label, vector loads, the
// reciprocal square-root estimate that presumably feeds fr8, the final scaling,
// the stores and the return are not visible in this excerpt. The contents of
// NrmData are defined elsewhere.
591 lfs fr0,NrmData@sdarel(r13)
592 lfs fr1,NrmData+4@sdarel(r13)
596 ps_madd fr5,fr3,fr3,fr4 // fr5 = fr3*fr3 + fr4
597 ps_sum0 fr6,fr5,fr3,fr4 // fr6.ps0 = fr5.ps0 + fr4.ps1
601 fnmsubs fr8,fr8,fr6,fr1 // fr8 = fr1 - fr8*fr6
610 //r3 = v1,r4 = v2,r5 = v12
// Visible part: swaps the halves of fr1, forms two product differences with
// ps_msub, and stores a pair assembled from them at byte offset 4 of v12.
// NOTE(review): the entry label, the loads from v1 and v2, the ps_mul set-up of
// fr4 and fr7, the store to offset 0 and the return are not visible in this
// excerpt.
615 ps_merge10 fr6,fr1,fr1 // fr6 = (fr1.ps1, fr1.ps0)
619 ps_msub fr5,fr0,fr3,fr4 // fr5 = fr0*fr3 - fr4
620 ps_msub fr8,fr0,fr6,fr7 // fr8 = fr0*fr6 - fr7
621 ps_merge11 fr9,fr5,fr5 // fr9 = (fr5.ps1, fr5.ps1)
622 ps_merge01 fr10,fr5,fr8 // fr10 = (fr5.ps0, fr8.ps1)
625 psq_st fr10,4(r5),0,0
628 .globl ps_guVecDotProduct
629 //r3 = vec1,r4 = vec2
// Visible part: a multiply-add followed by a cross-half add that leaves the
// result in fr1.ps0.
// NOTE(review): the entry label, the loads from vec1 and vec2, the ps_mul that
// presumably seeds fr2, and the return are not visible in this excerpt.
636 ps_madd fr3,fr5,fr4,fr2 // fr3 = fr5*fr4 + fr2
637 ps_sum0 fr1,fr3,fr2,fr2 // fr1.ps0 = fr3.ps0 + fr2.ps1
640 .globl ps_guVecMultiply
// Visible part: repeated pattern of ps_madd followed by ps_sum0, which adds the
// two halves of the madd result; one such sum (fr12) is stored as a single value
// at byte offset 4 of the buffer at r5.
// NOTE(review): the register-argument comment, entry label, all loads, the
// ps_mul set-up of the madd addends, the other stores and the return are not
// visible in this excerpt.
647 ps_madd fr5,fr3,fr1,fr4 // fr5 = fr3*fr1 + fr4
649 ps_sum0 fr6,fr5,fr6,fr5 // fr6.ps0 = fr5.ps0 + fr5.ps1
653 ps_madd fr11,fr9,fr1,fr10 // fr11 = fr9*fr1 + fr10
655 ps_sum0 fr12,fr11,fr12,fr11 // fr12.ps0 = fr11.ps0 + fr11.ps1
658 psq_st fr12,4(r5),1,0 // W=1: only ps0 is written
659 ps_madd fr5,fr3,fr1,fr4 // fr5 = fr3*fr1 + fr4
660 ps_sum0 fr6,fr5,fr6,fr5 // fr6.ps0 = fr5.ps0 + fr5.ps1
664 .globl ps_guVecMultiplySR
665 // r3 = mt, r4 = src, r5 = dst
// Multiplies src by the first three columns of mt and stores x, y, z to dst.
// Each m[r][3] is loaded next to m[r][2] but only reaches the second half (ps1)
// of the madd result, which the single-value stores (W=1) never write out.
// NOTE(review): the entry label, the load of src x/y into fr6, the ps_mul
// instructions for fr8 and fr10, and the return are not visible in this excerpt.
667 psq_l fr0,0(r3),0,0 // fr0 = m[0][0], m[0][1] (GQR0 is used for every access)
670 psq_l fr2,16(r3),0,0 // fr2 = m[1][0], m[1][1]
671 // fr8 = (m00*x, m01*y): partial products for X
673 psq_l fr4,32(r3),0,0 // fr4 = m[2][0], m[2][1]
674 // fr10 = (m10*x, m11*y): partial products for Y
676 psq_l fr7,8(r4),1,0 // fr7 = (z, 1.0): W=1 loads one value and sets ps1 to 1.0
677 // fr12 = (m20*x, m21*y): partial products for Z
678 ps_mul fr12,fr4,fr6 // last use of fr6 (src x, y)
679 psq_l fr3,24(r3),0,0 // fr3 = m[1][2], m[1][3]
680 ps_sum0 fr8,fr8,fr8,fr8 // fr8.ps0 = m00*x + m01*y
681 psq_l fr5,40(r3),0,0 // fr5 = m[2][2], m[2][3]
682 ps_sum0 fr10,fr10,fr10,fr10 // fr10.ps0 = m10*x + m11*y
683 psq_l fr1,8(r3),0,0 // fr1 = m[0][2], m[0][3]
684 ps_sum0 fr12,fr12,fr12,fr12 // fr12.ps0 = m20*x + m21*y
685 ps_madd fr9,fr1,fr7,fr8 // fr9.ps0 = m02*z + fr8.ps0
686 psq_st fr9,0(r5),1,0 // store X
687 ps_madd fr11,fr3,fr7,fr10 // fr11.ps0 = m12*z + fr10.ps0
688 psq_st fr11,4(r5),1,0 // store Y
689 ps_madd fr13,fr5,fr7,fr12 // fr13.ps0 = m22*z + fr12.ps0
690 psq_st fr13,8(r5),1,0 // store Z
693 .globl ps_quQuatScale
694 //r3 = q,r4 = r, fr1 = scale
// NOTE(review): this exported symbol is spelled "ps_qu..." while every other
// export visible in this file uses the "ps_gu..." prefix. It may be a typo, but
// renaming would change the linker-visible name - check the C declaration and
// all callers before touching it.
// NOTE(review): the entry label and the whole body are not visible in this
// excerpt.
704 .globl ps_guQuatDotProduct
705 //r3 = p, r4 = q ; fr1 = res
// Visible part: accumulates a pairwise product into fr1 and then adds the two
// halves of fr1 so the result ends up in fr1.ps0.
// NOTE(review): the entry label, the loads from p and q, the ps_mul that
// presumably seeds fr1, and the return are not visible in this excerpt.
712 ps_madd fr1,fr3,fr5,fr1 // fr1 = fr3*fr5 + fr1
713 ps_sum0 fr1,fr1,fr1,fr1 // fr1.ps0 = fr1.ps0 + fr1.ps1