/* Copyright (C) 2007  Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

In addition to the permissions in the GNU General Public License, the
Free Software Foundation gives you unlimited permission to link the
compiled version of this file into combinations with other programs,
and to distribute those combinations without any restriction coming
from the use of this file.  (The General Public License restrictions
do apply in other respects; for example, they cover modification of
the file, and distribution when not linked into a combine
executable.)

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */

#ifndef _DIV_MACROS_H_
#define _DIV_MACROS_H_

#include "bid_internal.h"

#define FENCE __fence
//#define FENCE

//#define DOUBLE_EXTENDED_ON
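
/* Note on structure: this header provides two implementations of the
   division primitives __div_128_by_128 and __div_256_by_128.  With
   DOUBLE_EXTENDED_ON defined, quotient estimates come from 80-bit
   (BINARY80) divides; otherwise they are assembled from binary64
   (double) estimates in chunks of at most 60 bits, each followed by an
   explicit correction step.  */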

#if DOUBLE_EXTENDED_ON


__BID_INLINE__ void
__div_128_by_128 (UINT128 * pCQ, UINT128 * pCR, UINT128 CX, UINT128 CY) {
  UINT128 CB, CB2, CB4, CB8, CQB, CA;
  int_double d64, dm64, ds;
  int_float t64;
  double dx, dq, dqh;
  BINARY80 lq, lx, ly;
  UINT64 Rh, R, B2, B4, Ph, Ql, Ql2, carry, Qh;

  if (!CY.w[1]) {
    pCR->w[1] = 0;

    if (!CX.w[1]) {
      pCQ->w[0] = CX.w[0] / CY.w[0];
      pCQ->w[1] = 0;
      pCR->w[1] = 0;
      pCR->w[0] = CX.w[0] - pCQ->w[0] * CY.w[0];
    } else {

      // This path works for CX<2^116 only

      // 2^64
      d64.i = 0x43f0000000000000;
      // 2^(-64)
      dm64.i = 0x3bf0000000000000;
      // 1.5*2^(-52)
      ds.i = 0x3cb8000000000000;
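      /* Aside on the hex patterns (IEEE-754 binary64 encodings): for
         0x43f0000000000000 the biased exponent field is 0x43f = 1087,
         so the value is 2^(1087-1023) = 2^64 with an all-zero
         significand; 0x3bf0000000000000 encodes 2^(959-1023) = 2^(-64);
         and 0x3cb8000000000000 has exponent 971 and significand
         fraction 0.5, i.e. 1.5 * 2^(971-1023) = 1.5*2^(-52).  */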
      dx = (BINARY80) CX.w[1] * d64.d + (BINARY80) CX.w[0];
      dq = dx / (BINARY80) CY.w[0];
      dq -= dq * (ds.d);
      dqh = dq * dm64.d;
      Qh = (UINT64) dqh;
      Ql = (UINT64) (dq - ((double) Qh) * d64.d);

      Rh = CX.w[0] - Ql * CY.w[0];
      Ql2 = Rh / CY.w[0];
      pCR->w[0] = Rh - Ql2 * CY.w[0];
      __add_carry_out ((pCQ->w[0]), carry, Ql, Ql2);
      pCQ->w[1] = Qh + carry;
    }

    return;
  }
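  /* Reading of the path above: the 1.5*2^(-52) back-off biases the
     BINARY80 quotient estimate dq low, so the integer estimate (Qh,Ql)
     can fall short by a small amount; the provisional remainder
     Rh = CX.w[0] - Ql*CY.w[0] then holds that deficit times CY.w[0]
     plus the true remainder, and one cheap 64-bit division (Ql2)
     recovers the missing part of the quotient.  */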
  // now CY.w[1] > 0

  // 2^64
  t64.i = 0x5f800000;
  lx = (BINARY80) CX.w[1] * (BINARY80) t64.d + (BINARY80) CX.w[0];
  ly = (BINARY80) CY.w[1] * (BINARY80) t64.d + (BINARY80) CY.w[0];
  lq = lx / ly;
  pCQ->w[0] = (UINT64) lq;

  pCQ->w[1] = 0;

  if (!pCQ->w[0]) {
    /*if(__unsigned_compare_ge_128(CX,CY))
       {
       pCQ->w[0] = 1;
       __sub_128_128((*pCR), CX, CY);
       }
       else */
    {
      pCR->w[1] = CX.w[1];
      pCR->w[0] = CX.w[0];
    }
    return;
  }

  if (CY.w[1] >= 16 || pCQ->w[0] <= 0x1000000000000000ull) {
    pCQ->w[0] = (UINT64) lq - 1;
    __mul_64x128_full (Ph, CQB, (pCQ->w[0]), CY);
    __sub_128_128 (CA, CX, CQB);
    if (__unsigned_compare_ge_128 (CA, CY)) {
      __sub_128_128 (CA, CA, CY);
      pCQ->w[0]++;
      if (__unsigned_compare_ge_128 (CA, CY)) {
        __sub_128_128 (CA, CA, CY);
        pCQ->w[0]++;
      }
    }

    pCR->w[1] = CA.w[1];
    pCR->w[0] = CA.w[0];
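    /* At most two restoring subtractions are performed above, which
       suffices only if the starting guess (UINT64) lq - 1 is a lower
       bound within 2 of the true quotient on this branch.  */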
  } else {
    pCQ->w[0] = (UINT64) lq - 6;

    __mul_64x128_full (Ph, CQB, (pCQ->w[0]), CY);
    __sub_128_128 (CA, CX, CQB);

    CB8.w[1] = (CY.w[1] << 3) | (CY.w[0] >> 61);
    CB8.w[0] = CY.w[0] << 3;
    CB4.w[1] = (CY.w[1] << 2) | (CY.w[0] >> 62);
    CB4.w[0] = CY.w[0] << 2;
    CB2.w[1] = (CY.w[1] << 1) | (CY.w[0] >> 63);
    CB2.w[0] = CY.w[0] << 1;

    if (__unsigned_compare_ge_128 (CA, CB8)) {
      pCQ->w[0] += 8;
      __sub_128_128 (CA, CA, CB8);
    }
    if (__unsigned_compare_ge_128 (CA, CB4)) {
      pCQ->w[0] += 4;
      __sub_128_128 (CA, CA, CB4);
    }
    if (__unsigned_compare_ge_128 (CA, CB2)) {
      pCQ->w[0] += 2;
      __sub_128_128 (CA, CA, CB2);
    }
    if (__unsigned_compare_ge_128 (CA, CY)) {
      pCQ->w[0] += 1;
      __sub_128_128 (CA, CA, CY);
    }

    pCR->w[1] = CA.w[1];
    pCR->w[0] = CA.w[0];
  }
}
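
/* A minimal sketch (illustration only, not part of the build) of the
   8/4/2/1 restoring correction used in the else-branch above, on plain
   integers.  It assumes, as the code above arranges, that the initial
   guess q is a lower bound within 15 of the true quotient.  */
#if 0
static unsigned
correct_quotient (unsigned x, unsigned y, unsigned q)
{
  unsigned r = x - q * y;       /* deficit remainder, r < 16*y */
  if (r >= 8 * y) { q += 8; r -= 8 * y; }
  if (r >= 4 * y) { q += 4; r -= 4 * y; }
  if (r >= 2 * y) { q += 2; r -= 2 * y; }
  if (r >= y)     { q += 1; r -= y; }
  return q;                     /* now x - q*y < y */
}
#endif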


__BID_INLINE__ void
__div_256_by_128 (UINT128 * pCQ, UINT256 * pCA4, UINT128 CY) {
  UINT256 CQ2Y;
  UINT128 CQ2, CQ3Y;
  UINT64 Q3, carry64;
  int_double d64;
  BINARY80 lx, ly, lq, l64, l128;

  // 2^64
  d64.i = 0x43f0000000000000ull;
  l64 = (BINARY80) d64.d;
  // 2^128
  l128 = l64 * l64;

  lx =
    ((BINARY80) (*pCA4).w[3] * l64 +
     (BINARY80) (*pCA4).w[2]) * l128 +
    (BINARY80) (*pCA4).w[1] * l64 + (BINARY80) (*pCA4).w[0];
  ly = (BINARY80) CY.w[1] * l128 + (BINARY80) CY.w[0] * l64;

  lq = lx / ly;
  CQ2.w[1] = (UINT64) lq;
  lq = (lq - CQ2.w[1]) * l64;
  CQ2.w[0] = (UINT64) lq;
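
  /* One BINARY80 divide carries only about 64 significand bits, so CQ2
     is a roughly 64-accurate-bit estimate of a quotient that may need
     128: the integer part of lq gives the high word, and the fractional
     part, rescaled by 2^64, gives the low word.  The Q3 passes below
     repair the remaining error.  */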

  // CQ2*CY
  __mul_128x128_to_256 (CQ2Y, CY, CQ2);

  // CQ2Y <= (*pCA4) ?
  if (CQ2Y.w[3] < (*pCA4).w[3]
      || (CQ2Y.w[3] == (*pCA4).w[3]
          && (CQ2Y.w[2] < (*pCA4).w[2]
              || (CQ2Y.w[2] == (*pCA4).w[2]
                  && (CQ2Y.w[1] < (*pCA4).w[1]
                      || (CQ2Y.w[1] == (*pCA4).w[1]
                          && (CQ2Y.w[0] <= (*pCA4).w[0]))))))) {

    // (*pCA4) - CQ2Y, guaranteed below 5*2^49*CY < 5*2^(49+128)
    __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CQ2Y.w[0]);
    __sub_borrow_in_out ((*pCA4).w[1], carry64, (*pCA4).w[1], CQ2Y.w[1],
                         carry64);
    (*pCA4).w[2] = (*pCA4).w[2] - CQ2Y.w[2] - carry64;

    lx = ((BINARY80) (*pCA4).w[2] * l128 +
          ((BINARY80) (*pCA4).w[1] * l64 +
           (BINARY80) (*pCA4).w[0])) * l64;
    lq = lx / ly;
    Q3 = (UINT64) lq;

    if (Q3) {
      Q3--;
      __mul_64x128_short (CQ3Y, Q3, CY);
      __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CQ3Y.w[0]);
      (*pCA4).w[1] = (*pCA4).w[1] - CQ3Y.w[1] - carry64;

      if ((*pCA4).w[1] > CY.w[1]
          || ((*pCA4).w[1] == CY.w[1] && (*pCA4).w[0] >= CY.w[0])) {
        Q3++;
        __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CY.w[0]);
        (*pCA4).w[1] = (*pCA4).w[1] - CY.w[1] - carry64;
        if ((*pCA4).w[1] > CY.w[1]
            || ((*pCA4).w[1] == CY.w[1] && (*pCA4).w[0] >= CY.w[0])) {
          Q3++;
          __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0],
                            CY.w[0]);
          (*pCA4).w[1] = (*pCA4).w[1] - CY.w[1] - carry64;
        }
      }

      // add Q3 to Q2
      __add_carry_out (CQ2.w[0], carry64, Q3, CQ2.w[0]);
      CQ2.w[1] += carry64;
    }
  } else {
    // CQ2Y - (*pCA4), guaranteed below 5*2^(49+128)
    __sub_borrow_out ((*pCA4).w[0], carry64, CQ2Y.w[0], (*pCA4).w[0]);
    __sub_borrow_in_out ((*pCA4).w[1], carry64, CQ2Y.w[1], (*pCA4).w[1],
                         carry64);
    (*pCA4).w[2] = CQ2Y.w[2] - (*pCA4).w[2] - carry64;

    lx =
      ((BINARY80) (*pCA4).w[2] * l128 +
       (BINARY80) (*pCA4).w[1] * l64 + (BINARY80) (*pCA4).w[0]) * l64;
    lq = lx / ly;
    Q3 = 1 + (UINT64) lq;

    __mul_64x128_short (CQ3Y, Q3, CY);
    __sub_borrow_out ((*pCA4).w[0], carry64, CQ3Y.w[0], (*pCA4).w[0]);
    (*pCA4).w[1] = CQ3Y.w[1] - (*pCA4).w[1] - carry64;

    if ((SINT64) (*pCA4).w[1] > (SINT64) CY.w[1]
        || ((*pCA4).w[1] == CY.w[1] && (*pCA4).w[0] >= CY.w[0])) {
      Q3--;
      __sub_borrow_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CY.w[0]);
      (*pCA4).w[1] = (*pCA4).w[1] - CY.w[1] - carry64;
    } else if ((SINT64) (*pCA4).w[1] < 0) {
      Q3++;
      __add_carry_out ((*pCA4).w[0], carry64, (*pCA4).w[0], CY.w[0]);
      (*pCA4).w[1] = (*pCA4).w[1] + CY.w[1] + carry64;
    }

    // subtract Q3 from Q2
    __sub_borrow_out (CQ2.w[0], carry64, CQ2.w[0], Q3);
    CQ2.w[1] -= carry64;
  }

  // (*pCQ) + CQ2 + carry
  __add_carry_out ((*pCQ).w[0], carry64, CQ2.w[0], (*pCQ).w[0]);
  (*pCQ).w[1] = (*pCQ).w[1] + CQ2.w[1] + carry64;
}
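
/* Summary of the routine above: CQ2 estimates (*pCA4)/CY from one
   BINARY80 divide; depending on whether CQ2*CY under- or overshoots
   the dividend, a second divide produces a correction Q3 that is added
   to or subtracted from CQ2 before it is accumulated into *pCQ.  */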

#else

__BID_INLINE__ void
__div_128_by_128 (UINT128 * pCQ, UINT128 * pCR, UINT128 CX0, UINT128 CY) {
  UINT128 CY36, CY51, CQ, A2, CX, CQT;
  UINT64 Q;
  int_double t64, d49, d60;
  double lx, ly, lq;

  if (!CX0.w[1] && !CY.w[1]) {
    pCQ->w[0] = CX0.w[0] / CY.w[0];
    pCQ->w[1] = 0;
    pCR->w[1] = pCR->w[0] = 0;
    pCR->w[0] = CX0.w[0] - pCQ->w[0] * CY.w[0];
    return;
  }

  CX.w[1] = CX0.w[1];
  CX.w[0] = CX0.w[0];

  // 2^64
  t64.i = 0x43f0000000000000ull;
  lx = (double) CX.w[1] * t64.d + (double) CX.w[0];
  ly = (double) CY.w[1] * t64.d + (double) CY.w[0];
  lq = lx / ly;

  CY36.w[1] = CY.w[0] >> (64 - 36);
  CY36.w[0] = CY.w[0] << 36;

  CQ.w[1] = CQ.w[0] = 0;
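
  /* In this binary64 path the quotient is assembled in chunks: a
     double carries only 53 significand bits, so up to three scaled
     estimates are used (shifted left by 60, then 49, then 0 bits),
     each biased low and corrected after its product with CY is
     subtracted from the running dividend.  */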

  // Q >= 2^100 ?
  if (!CY.w[1] && !CY36.w[1] && (CX.w[1] >= CY36.w[0])) {
    // then Q >= 2^100

    // 2^(-60)*CX/CY
    d60.i = 0x3c30000000000000ull;
    lq *= d60.d;
    Q = (UINT64) lq - 4ull;

    // Q*CY
    __mul_64x64_to_128 (A2, Q, CY.w[0]);

    // A2 <<= 60
    A2.w[1] = (A2.w[1] << 60) | (A2.w[0] >> (64 - 60));
    A2.w[0] <<= 60;

    __sub_128_128 (CX, CX, A2);

    lx = (double) CX.w[1] * t64.d + (double) CX.w[0];
    lq = lx / ly;

    CQ.w[1] = Q >> (64 - 60);
    CQ.w[0] = Q << 60;
  }

  CY51.w[1] = (CY.w[1] << 51) | (CY.w[0] >> (64 - 51));
  CY51.w[0] = CY.w[0] << 51;

  if (CY.w[1] < (UINT64) (1 << (64 - 51))
      && (__unsigned_compare_gt_128 (CX, CY51))) {
    // Q > 2^51

    // 2^(-49)*CX/CY
    d49.i = 0x3ce0000000000000ull;
    lq *= d49.d;

    Q = (UINT64) lq - 1ull;

    // Q*CY
    __mul_64x64_to_128 (A2, Q, CY.w[0]);
    A2.w[1] += Q * CY.w[1];

    // A2 <<= 49
    A2.w[1] = (A2.w[1] << 49) | (A2.w[0] >> (64 - 49));
    A2.w[0] <<= 49;

    __sub_128_128 (CX, CX, A2);

    CQT.w[1] = Q >> (64 - 49);
    CQT.w[0] = Q << 49;
    __add_128_128 (CQ, CQ, CQT);

    lx = (double) CX.w[1] * t64.d + (double) CX.w[0];
    lq = lx / ly;
  }

  Q = (UINT64) lq;

  __mul_64x64_to_128 (A2, Q, CY.w[0]);
  A2.w[1] += Q * CY.w[1];

  __sub_128_128 (CX, CX, A2);
  if ((SINT64) CX.w[1] < 0) {
    Q--;
    CX.w[0] += CY.w[0];
    if (CX.w[0] < CY.w[0])
      CX.w[1]++;
    CX.w[1] += CY.w[1];
    if ((SINT64) CX.w[1] < 0) {
      Q--;
      CX.w[0] += CY.w[0];
      if (CX.w[0] < CY.w[0])
        CX.w[1]++;
      CX.w[1] += CY.w[1];
    }
  } else if (__unsigned_compare_ge_128 (CX, CY)) {
    Q++;
    __sub_128_128 (CX, CX, CY);
  }

  __add_128_64 (CQ, CQ, Q);

  pCQ->w[1] = CQ.w[1];
  pCQ->w[0] = CQ.w[0];
  pCR->w[1] = CX.w[1];
  pCR->w[0] = CX.w[0];
  return;
}
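
/* Hypothetical usage sketch (illustration only, not part of libbid):
   q/r receive the 128-bit quotient and remainder of x divided by y.  */
#if 0
static void
div_example (void)
{
  UINT128 q, r, x, y;
  x.w[1] = 1;                   /* x = 2^64 + 5 */
  x.w[0] = 5;
  y.w[1] = 0;                   /* y = 7 */
  y.w[0] = 7;
  __div_128_by_128 (&q, &r, x, y);
  /* expect q == (2^64 + 5) / 7 and r == (2^64 + 5) % 7 */
}
#endif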


__BID_INLINE__ void
__div_256_by_128 (UINT128 * pCQ, UINT256 * pCA4, UINT128 CY) {
  UINT256 CA4, CA2, CY51, CY36;
  UINT128 CQ, A2, A2h, CQT;
  UINT64 Q, carry64;
  int_double t64, d49, d60;
  double lx, ly, lq, d128, d192;

  // the quotient is assumed to be at most 113 bits,
  // as needed by BID128 divide routines

  // initial dividend
  CA4.w[3] = (*pCA4).w[3];
  CA4.w[2] = (*pCA4).w[2];
  CA4.w[1] = (*pCA4).w[1];
  CA4.w[0] = (*pCA4).w[0];
  CQ.w[1] = (*pCQ).w[1];
  CQ.w[0] = (*pCQ).w[0];

  // 2^64
  t64.i = 0x43f0000000000000ull;
  d128 = t64.d * t64.d;
  d192 = d128 * t64.d;
  lx = (double) CA4.w[3] * d192 + ((double) CA4.w[2] * d128 +
                                   ((double) CA4.w[1] * t64.d +
                                    (double) CA4.w[0]));
  ly = (double) CY.w[1] * t64.d + (double) CY.w[0];
  lq = lx / ly;

  CY36.w[2] = CY.w[1] >> (64 - 36);
  CY36.w[1] = (CY.w[1] << 36) | (CY.w[0] >> (64 - 36));
  CY36.w[0] = CY.w[0] << 36;
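
  /* CY36 is CY << 36; comparing it one word up against CA4 below is in
     effect a test of CA4 >= CY * 2^100, i.e. whether the quotient needs
     the 2^60-scaled first chunk.  */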

  CQ.w[1] = (*pCQ).w[1];
  CQ.w[0] = (*pCQ).w[0];

  // Q >= 2^100 ?
  if (CA4.w[3] > CY36.w[2]
      || (CA4.w[3] == CY36.w[2]
          && (CA4.w[2] > CY36.w[1]
              || (CA4.w[2] == CY36.w[1] && CA4.w[1] >= CY36.w[0])))) {
    // 2^(-60)*CA4/CY
    d60.i = 0x3c30000000000000ull;
    lq *= d60.d;
    Q = (UINT64) lq - 4ull;

    // Q*CY
    __mul_64x128_to_192 (CA2, Q, CY);

    // CA2 <<= 60
    // CA2.w[3] = CA2.w[2] >> (64-60);
    CA2.w[2] = (CA2.w[2] << 60) | (CA2.w[1] >> (64 - 60));
    CA2.w[1] = (CA2.w[1] << 60) | (CA2.w[0] >> (64 - 60));
    CA2.w[0] <<= 60;

    // CA4 -= CA2
    __sub_borrow_out (CA4.w[0], carry64, CA4.w[0], CA2.w[0]);
    __sub_borrow_in_out (CA4.w[1], carry64, CA4.w[1], CA2.w[1],
                         carry64);
    CA4.w[2] = CA4.w[2] - CA2.w[2] - carry64;

    lx = ((double) CA4.w[2] * d128 +
          ((double) CA4.w[1] * t64.d + (double) CA4.w[0]));
    lq = lx / ly;

    CQT.w[1] = Q >> (64 - 60);
    CQT.w[0] = Q << 60;
    __add_128_128 (CQ, CQ, CQT);
  }

  CY51.w[2] = CY.w[1] >> (64 - 51);
  CY51.w[1] = (CY.w[1] << 51) | (CY.w[0] >> (64 - 51));
  CY51.w[0] = CY.w[0] << 51;

  if (CA4.w[2] > CY51.w[2] || ((CA4.w[2] == CY51.w[2])
                               && (__unsigned_compare_gt_128 (CA4, CY51)))) {
    // Q > 2^51

    // 2^(-49)*CA4/CY
    d49.i = 0x3ce0000000000000ull;
    lq *= d49.d;

    Q = (UINT64) lq - 1ull;

    // Q*CY
    __mul_64x64_to_128 (A2, Q, CY.w[0]);
    __mul_64x64_to_128 (A2h, Q, CY.w[1]);
    A2.w[1] += A2h.w[0];
    if (A2.w[1] < A2h.w[0])
      A2h.w[1]++;

    // A2 <<= 49
    CA2.w[2] = (A2h.w[1] << 49) | (A2.w[1] >> (64 - 49));
    CA2.w[1] = (A2.w[1] << 49) | (A2.w[0] >> (64 - 49));
    CA2.w[0] = A2.w[0] << 49;

    __sub_borrow_out (CA4.w[0], carry64, CA4.w[0], CA2.w[0]);
    __sub_borrow_in_out (CA4.w[1], carry64, CA4.w[1], CA2.w[1],
                         carry64);
    CA4.w[2] = CA4.w[2] - CA2.w[2] - carry64;

    CQT.w[1] = Q >> (64 - 49);
    CQT.w[0] = Q << 49;
    __add_128_128 (CQ, CQ, CQT);

    lx = ((double) CA4.w[2] * d128 +
          ((double) CA4.w[1] * t64.d + (double) CA4.w[0]));
    lq = lx / ly;
  }

  Q = (UINT64) lq;
  __mul_64x64_to_128 (A2, Q, CY.w[0]);
  A2.w[1] += Q * CY.w[1];

  __sub_128_128 (CA4, CA4, A2);
  if ((SINT64) CA4.w[1] < 0) {
    Q--;
    CA4.w[0] += CY.w[0];
    if (CA4.w[0] < CY.w[0])
      CA4.w[1]++;
    CA4.w[1] += CY.w[1];
    if ((SINT64) CA4.w[1] < 0) {
      Q--;
      CA4.w[0] += CY.w[0];
      if (CA4.w[0] < CY.w[0])
        CA4.w[1]++;
      CA4.w[1] += CY.w[1];
    }
  } else if (__unsigned_compare_ge_128 (CA4, CY)) {
    Q++;
    __sub_128_128 (CA4, CA4, CY);
  }

  __add_128_64 (CQ, CQ, Q);

  pCQ->w[1] = CQ.w[1];
  pCQ->w[0] = CQ.w[0];
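  /* Only the low two words of the remainder are stored back: the
     remainder is below CY < 2^128, so w[3] and w[2] carry nothing.  */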
  pCA4->w[1] = CA4.w[1];
  pCA4->w[0] = CA4.w[0];
  return;
}

#endif
#endif