beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / powerpc32 / submul_1.asm
blob8ef37b04202402debd3463a4681ead5ec27714ae
1 dnl PowerPC-32 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
2 dnl the result from a second limb vector.
4 dnl Copyright 1995, 1997, 1998, 2000, 2002, 2005 Free Software Foundation,
5 dnl Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb
36 C 603e: ?
37 C 604e: 7.5
38 C 75x (G3): 9.3-15
39 C 7400,7410 (G4): 9.3-15
40 C 744x,745x (G4+): 10.5
41 C power4/ppc970: 6.75
42 C power5: 6.5
44 C INPUT PARAMETERS
45 C rp r3
46 C up r4
47 C n r5
48 C vl r6
50 C This is optimized for the PPC604. See addmul_1.asm for additional comments.
52 ASM_START()
53 PROLOGUE(mpn_submul_1)
54 cmpwi cr0,r5,9 C more than 9 limbs?
55 bgt cr0,L(big) C branch if more than 9 limbs
56 C Small case (n <= 9): one limb per iteration, CA carries the borrow between limbs.
57 mtctr r5 C ctr = n
58 lwz r0,0(r4) C r0 = up[0]
59 mullw r7,r0,r6 C r7 = low word of up[0]*vl
60 mulhwu r10,r0,r6 C r10 = high word of up[0]*vl
61 lwz r9,0(r3) C r9 = rp[0]
62 subfc r8,r7,r9 C r8 = rp[0] - low word, CA = 1 when there is no borrow
63 addc r7,r7,r8 C invert cy (r7 is junk)
64 addi r3,r3,-4 C bias rp so that 4(r3) addresses the current limb
65 bdz L(end) C n == 1: only the final store remains
66 L(loop):
67 lwzu r0,4(r4) C r0 = next up limb, up advances
68 stwu r8,4(r3) C store previous result limb, rp advances
69 mullw r8,r0,r6 C r8 = low word of product
70 adde r7,r8,r10 C r7 = low word + previous high word + borrow
71 mulhwu r10,r0,r6 C r10 = high word of product
72 lwz r9,4(r3) C r9 = next rp limb
73 addze r10,r10 C fold the carry of the adde into the high word
74 subfc r8,r7,r9 C r8 = rp limb - r7
75 addc r7,r7,r8 C invert cy (r7 is junk)
76 bdnz L(loop)
77 L(end): stw r8,4(r3) C store last result limb
78 addze r3,r10 C r3 = last high word + borrow (function result)
79 blr
80 C Big case (n > 9): first limb alone, then 4 limbs per iteration, then leftovers.
81 L(big): stwu r1,-16(r1) C allocate a 16-byte stack frame
82 addi r5,r5,-1 C r5 = n-1, the first limb is handled before the loops
83 stw r30,8(r1) C save r30, used as scratch in loopU
84 srwi r0,r5,2 C r0 = (n-1)/4 unrolled iterations
85 stw r31,12(r1) C save r31, used as scratch in loopU
86 mtctr r0
87 C First limb, processed outside the unrolled loop.
88 lwz r7,0(r4) C r7 = up[0]
89 mullw r8,r7,r6 C r8 = low word of up[0]*vl
90 mulhwu r0,r7,r6 C r0 = high word, the running cy_limb
91 lwz r7,0(r3) C r7 = rp[0]
92 subfc r7,r8,r7 C r7 = rp[0] - low word
93 addc r8,r8,r7 C invert cy (r8 is junk)
94 stw r7,0(r3)
95 C Unrolled loop: on entry to each iteration r0 = cy_limb and CA = borrow.
96 L(loopU):
97 lwz r7,4(r4) C load four up limbs ...
98 lwz r12,8(r4)
99 lwz r30,12(r4)
100 lwzu r31,16(r4) C ... and advance up by four limbs
101 mullw r8,r7,r6 C low words of the four products
102 mullw r9,r12,r6
103 mullw r10,r30,r6
104 mullw r11,r31,r6
105 adde r8,r8,r0 C add cy_limb
106 mulhwu r0,r7,r6 C high word of product 1
107 lwz r7,4(r3) C rp limb 1
108 adde r9,r9,r0 C low word 2 + high word 1 + carry
109 mulhwu r0,r12,r6 C high word of product 2
110 lwz r12,8(r3) C rp limb 2
111 adde r10,r10,r0 C low word 3 + high word 2 + carry
112 mulhwu r0,r30,r6 C high word of product 3
113 lwz r30,12(r3) C rp limb 3
114 adde r11,r11,r0 C low word 4 + high word 3 + carry
115 mulhwu r0,r31,r6 C high word of product 4
116 lwz r31,16(r3) C rp limb 4
117 addze r0,r0 C new cy_limb
118 subfc r7,r8,r7 C rp limb 1 minus product word, starts the borrow chain
119 stw r7,4(r3)
120 subfe r12,r9,r12 C rp limb 2 minus product word, with borrow
121 stw r12,8(r3)
122 subfe r30,r10,r30 C rp limb 3 minus product word, with borrow
123 stw r30,12(r3)
124 subfe r31,r11,r31 C rp limb 4 minus product word, with borrow
125 stwu r31,16(r3) C store and advance rp by four limbs
126 subfe r11,r11,r11 C invert ...
127 addic r11,r11,1 C ... carry
128 bdnz L(loopU)
129 C Leftover limbs: andi. and mtctr leave CA (the pending borrow) untouched.
130 andi. r31,r5,3 C r31 = (n-1) mod 4 leftover limbs
131 mtctr r31
132 beq cr0,L(endx) C no leftover limbs
133 C One limb per iteration, same scheme as the small case.
134 L(loopE):
135 lwzu r7,4(r4) C r7 = next up limb, up advances
136 mullw r8,r7,r6 C r8 = low word of product
137 adde r8,r8,r0 C add cy_limb
138 mulhwu r0,r7,r6 C r0 = high word of product
139 lwz r7,4(r3) C r7 = next rp limb
140 addze r0,r0 C new cy_limb
141 subfc r7,r8,r7 C r7 = rp limb - r8
142 addc r8,r8,r7 C invert cy (r8 is junk)
143 stwu r7,4(r3) C store result limb, rp advances
144 bdnz L(loopE)
145 L(endx):
146 addze r3,r0 C r3 = cy_limb + borrow (function result)
147 lwz r30,8(r1) C restore r30
148 lwz r31,12(r1) C restore r31
149 addi r1,r1,16 C release the stack frame
150 blr C return to caller from the big path
151 EPILOGUE(mpn_submul_1)