beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / k7 / invert_limb.asm
blob31a867e68a48406fe3903921b898594a747b5d4a
1 dnl x86 mpn_invert_limb
3 dnl Contributed to the GNU project by Niels Möller
5 dnl Copyright 2009, 2011, 2015 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
8 dnl
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
11 dnl
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
15 dnl
16 dnl or
17 dnl
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl later version.
21 dnl
22 dnl or both in parallel, as here.
23 dnl
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl for more details.
28 dnl
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles (approx) div
36 C P5 ?
37 C P6 model 0-8,10-12 ?
38 C P6 model 9 (Banias) ?
39 C P6 model 13 (Dothan) ?
40 C P4 model 0 (Willamette) ?
41 C P4 model 1 (?) ?
42 C P4 model 2 (Northwood) ?
43 C P4 model 3 (Prescott) ?
44 C P4 model 4 (Nocona) ?
45 C AMD K6 ?
46 C AMD K7 41 53
47 C AMD K8 ?
49 C TODO
50 C * These c/l numbers are for a non-PIC build. Consider falling back to using
51 C the 'div' instruction for PIC builds.
52 C * Perhaps use this file--or at least the algorithm--for more machines than k7.
54 C Register usage:
55 C Input D in %edi
56 C Current approximation is in %eax and/or %ecx
57 C %ebx and %edx are temporaries
58 C %esi and %ebp are unused
C The only argument, the divisor limb D, is read from stack offset 4.
defframe(PARAM_DIVISOR,4)

ASM_START()

C Make approx_tab global to work around Apple relocation bug.
C On DARWIN the table name is remapped to the MPN-prefixed symbol
C invert_limb_tab and exported with GLOBL.
ifdef(`DARWIN',`
	deflit(`approx_tab', MPN(invert_limb_tab))
	GLOBL	approx_tab')

	TEXT
	ALIGN(16)
C mpn_invert_limb
C
C Refines a table-driven first guess v0 into successive approximations
C v1, v2 and v3 derived from the divisor limb D, and returns v3.
C
C In:     PARAM_DIVISOR = D, read from the stack.
C         The table lookup uses index D >> 22 with a bias of -1024 bytes,
C         that is 512 two-byte entries, into a 512-entry table. The access
C         stays inside the table only when D >> 22 >= 512, which means
C         bit 31 of D must be set.
C Out:    %eax = v3
C Saved:  %ebx and %edi are stored in an 8-byte stack area and reloaded
C         before returning.
C Scratch: %ecx, %edx and the flags.
C Stack:  8 bytes below the return address, released before ret.
PROLOGUE(mpn_invert_limb)
deflit(`FRAME', 0)
	mov	PARAM_DIVISOR, %eax
	C Avoid push/pop on k7.
	C NOTE(review): FRAME_subl_esp presumably keeps the m4 FRAME offset in
	C step with the manual %esp adjustment -- confirm in the x86 m4 defs.
	sub	$8, %esp	FRAME_subl_esp(8)
	mov	%ebx, (%esp)
	mov	%edi, 4(%esp)

	mov	%eax, %edi		C %edi = D for the rest of the function
	shr	$22, %eax		C %eax = top ten bits of D
ifdef(`PIC',`
	LEAL(	approx_tab, %ebx)
	movzwl	-1024(%ebx, %eax, 2), %eax	C %eax = v0
',`
	movzwl	-1024+approx_tab(%eax, %eax), %eax	C %eax = v0
')

	C v1 = (v0 << 4) - ((v0*v0*d_21) >> 32) - 1
	mov	%eax, %ecx		C %ecx = v0
	imul	%eax, %eax		C %eax = v0*v0
	mov	%edi, %ebx
	shr	$11, %ebx
	inc	%ebx			C %ebx = (D >> 11) + 1 = d_21
	mul	%ebx			C %edx = high word of v0*v0*d_21
	mov	%edi, %ebx		C Prepare
	shr	%ebx			C %ebx = D >> 1 and carry = low bit of D
	sbb	%eax, %eax		C %eax = 0 or all ones from that carry
	sub	%eax, %ebx		C %ebx = d_31, %eax = mask
	shl	$4, %ecx
	dec	%ecx
	sub	%edx, %ecx		C %ecx = v1

	C v_2 = (v1 << 15) + ((v1 *(2^48 - v1 * d31 + (v1 >> 1) & mask)) >> 33)
	imul	%ecx, %ebx		C %ebx = low word of v1*d_31
	and	%ecx, %eax		C %eax = v1 & mask
	shr	%eax			C %eax = (v1 & mask) >> 1
	sub	%ebx, %eax		C subtract v1*d_31 modulo 2^32
	mul	%ecx			C %edx = high word of the product with v1
	mov	%edi, %eax		C Prepare for next mul
	shl	$15, %ecx
	shr	%edx			C one more bit so the total shift is 33
	add	%edx, %ecx		C %ecx = v2

	mul	%ecx			C %edx:%eax = D*v2
	add	%edi, %eax		C only the carry out of low + D is needed
	mov	%ecx, %eax		C mov leaves the carry intact
	adc	%edi, %edx		C %edx = high + D + carry
	sub	%edx, %eax		C %eax = v3

	mov	(%esp), %ebx
	mov	4(%esp), %edi
	add	$8, %esp

	C Return to the caller. Without this the code would run past the end
	C of the function after the registers are restored.
	ret

EPILOGUE()
C approx_tab
C
C 512 16-bit entries holding the initial approximation v0. The lookup in
C mpn_invert_limb uses index D >> 22 and biases the table base by -1024
C bytes, so index 512 selects the first entry and index 1023 the last.
C Entries decrease monotonically from 0x7fe1 to 0x4000.
C NOTE(review): spot-checked entries agree with
C floor((2^24 - 2^14 + 2^9) / (512 + i)) for entry i -- confirm against
C the algorithm description before regenerating the table from it.
DEF_OBJECT(approx_tab,2)
	.value	0x7fe1,0x7fa1,0x7f61,0x7f22,0x7ee3,0x7ea4,0x7e65,0x7e27
	.value	0x7de9,0x7dab,0x7d6d,0x7d30,0x7cf3,0x7cb6,0x7c79,0x7c3d
	.value	0x7c00,0x7bc4,0x7b89,0x7b4d,0x7b12,0x7ad7,0x7a9c,0x7a61
	.value	0x7a27,0x79ec,0x79b2,0x7979,0x793f,0x7906,0x78cc,0x7894
	.value	0x785b,0x7822,0x77ea,0x77b2,0x777a,0x7742,0x770b,0x76d3
	.value	0x769c,0x7665,0x762f,0x75f8,0x75c2,0x758c,0x7556,0x7520
	.value	0x74ea,0x74b5,0x7480,0x744b,0x7416,0x73e2,0x73ad,0x7379
	.value	0x7345,0x7311,0x72dd,0x72aa,0x7277,0x7243,0x7210,0x71de
	.value	0x71ab,0x7179,0x7146,0x7114,0x70e2,0x70b1,0x707f,0x704e
	.value	0x701c,0x6feb,0x6fba,0x6f8a,0x6f59,0x6f29,0x6ef9,0x6ec8
	.value	0x6e99,0x6e69,0x6e39,0x6e0a,0x6ddb,0x6dab,0x6d7d,0x6d4e
	.value	0x6d1f,0x6cf1,0x6cc2,0x6c94,0x6c66,0x6c38,0x6c0a,0x6bdd
	.value	0x6bb0,0x6b82,0x6b55,0x6b28,0x6afb,0x6acf,0x6aa2,0x6a76
	.value	0x6a49,0x6a1d,0x69f1,0x69c6,0x699a,0x696e,0x6943,0x6918
	.value	0x68ed,0x68c2,0x6897,0x686c,0x6842,0x6817,0x67ed,0x67c3
	.value	0x6799,0x676f,0x6745,0x671b,0x66f2,0x66c8,0x669f,0x6676
	.value	0x664d,0x6624,0x65fc,0x65d3,0x65aa,0x6582,0x655a,0x6532
	.value	0x650a,0x64e2,0x64ba,0x6493,0x646b,0x6444,0x641c,0x63f5
	.value	0x63ce,0x63a7,0x6381,0x635a,0x6333,0x630d,0x62e7,0x62c1
	.value	0x629a,0x6275,0x624f,0x6229,0x6203,0x61de,0x61b8,0x6193
	.value	0x616e,0x6149,0x6124,0x60ff,0x60da,0x60b6,0x6091,0x606d
	.value	0x6049,0x6024,0x6000,0x5fdc,0x5fb8,0x5f95,0x5f71,0x5f4d
	.value	0x5f2a,0x5f07,0x5ee3,0x5ec0,0x5e9d,0x5e7a,0x5e57,0x5e35
	.value	0x5e12,0x5def,0x5dcd,0x5dab,0x5d88,0x5d66,0x5d44,0x5d22
	.value	0x5d00,0x5cde,0x5cbd,0x5c9b,0x5c7a,0x5c58,0x5c37,0x5c16
	.value	0x5bf5,0x5bd4,0x5bb3,0x5b92,0x5b71,0x5b51,0x5b30,0x5b10
	.value	0x5aef,0x5acf,0x5aaf,0x5a8f,0x5a6f,0x5a4f,0x5a2f,0x5a0f
	.value	0x59ef,0x59d0,0x59b0,0x5991,0x5972,0x5952,0x5933,0x5914
	.value	0x58f5,0x58d6,0x58b7,0x5899,0x587a,0x585b,0x583d,0x581f
	.value	0x5800,0x57e2,0x57c4,0x57a6,0x5788,0x576a,0x574c,0x572e
	.value	0x5711,0x56f3,0x56d5,0x56b8,0x569b,0x567d,0x5660,0x5643
	.value	0x5626,0x5609,0x55ec,0x55cf,0x55b2,0x5596,0x5579,0x555d
	.value	0x5540,0x5524,0x5507,0x54eb,0x54cf,0x54b3,0x5497,0x547b
	.value	0x545f,0x5443,0x5428,0x540c,0x53f0,0x53d5,0x53b9,0x539e
	.value	0x5383,0x5368,0x534c,0x5331,0x5316,0x52fb,0x52e0,0x52c6
	.value	0x52ab,0x5290,0x5276,0x525b,0x5240,0x5226,0x520c,0x51f1
	.value	0x51d7,0x51bd,0x51a3,0x5189,0x516f,0x5155,0x513b,0x5121
	.value	0x5108,0x50ee,0x50d5,0x50bb,0x50a2,0x5088,0x506f,0x5056
	.value	0x503c,0x5023,0x500a,0x4ff1,0x4fd8,0x4fbf,0x4fa6,0x4f8e
	.value	0x4f75,0x4f5c,0x4f44,0x4f2b,0x4f13,0x4efa,0x4ee2,0x4eca
	.value	0x4eb1,0x4e99,0x4e81,0x4e69,0x4e51,0x4e39,0x4e21,0x4e09
	.value	0x4df1,0x4dda,0x4dc2,0x4daa,0x4d93,0x4d7b,0x4d64,0x4d4d
	.value	0x4d35,0x4d1e,0x4d07,0x4cf0,0x4cd8,0x4cc1,0x4caa,0x4c93
	.value	0x4c7d,0x4c66,0x4c4f,0x4c38,0x4c21,0x4c0b,0x4bf4,0x4bde
	.value	0x4bc7,0x4bb1,0x4b9a,0x4b84,0x4b6e,0x4b58,0x4b41,0x4b2b
	.value	0x4b15,0x4aff,0x4ae9,0x4ad3,0x4abd,0x4aa8,0x4a92,0x4a7c
	.value	0x4a66,0x4a51,0x4a3b,0x4a26,0x4a10,0x49fb,0x49e5,0x49d0
	.value	0x49bb,0x49a6,0x4990,0x497b,0x4966,0x4951,0x493c,0x4927
	.value	0x4912,0x48fe,0x48e9,0x48d4,0x48bf,0x48ab,0x4896,0x4881
	.value	0x486d,0x4858,0x4844,0x482f,0x481b,0x4807,0x47f3,0x47de
	.value	0x47ca,0x47b6,0x47a2,0x478e,0x477a,0x4766,0x4752,0x473e
	.value	0x472a,0x4717,0x4703,0x46ef,0x46db,0x46c8,0x46b4,0x46a1
	.value	0x468d,0x467a,0x4666,0x4653,0x4640,0x462c,0x4619,0x4606
	.value	0x45f3,0x45e0,0x45cd,0x45ba,0x45a7,0x4594,0x4581,0x456e
	.value	0x455b,0x4548,0x4536,0x4523,0x4510,0x44fe,0x44eb,0x44d8
	.value	0x44c6,0x44b3,0x44a1,0x448f,0x447c,0x446a,0x4458,0x4445
	.value	0x4433,0x4421,0x440f,0x43fd,0x43eb,0x43d9,0x43c7,0x43b5
	.value	0x43a3,0x4391,0x437f,0x436d,0x435c,0x434a,0x4338,0x4327
	.value	0x4315,0x4303,0x42f2,0x42e0,0x42cf,0x42bd,0x42ac,0x429b
	.value	0x4289,0x4278,0x4267,0x4256,0x4244,0x4233,0x4222,0x4211
	.value	0x4200,0x41ef,0x41de,0x41cd,0x41bc,0x41ab,0x419a,0x418a
	.value	0x4179,0x4168,0x4157,0x4147,0x4136,0x4125,0x4115,0x4104
	.value	0x40f4,0x40e3,0x40d3,0x40c2,0x40b2,0x40a2,0x4091,0x4081
	.value	0x4071,0x4061,0x4050,0x4040,0x4030,0x4020,0x4010,0x4000
END_OBJECT(approx_tab)
ASM_END()
194 ASM_END()