1 /* multi_arith.h: multi-precision integer arithmetic functions, needed
2 to do extended-precision floating point.
4 (c) 1998 David Huggins-Daines.
6 Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
9 You may copy, modify, and redistribute this file under the terms of
10 the GNU General Public License, version 2, or any later version, at
15 These are not general multi-precision math routines. Rather, they
16 implement the subset of integer arithmetic that we need in order to
17 multiply, divide, and normalize 128-bit unsigned mantissae. */
22 #if 0 /* old code... */
24 /* Unsigned only, because we don't need signs to multiply and divide. */
25 typedef unsigned int int128
[4];
36 #define LO_WORD(ll) (((unsigned int *) &ll)[1])
37 #define HI_WORD(ll) (((unsigned int *) &ll)[0])
39 /* Convenience functions to stuff various integer values into int128s */
41 static inline void zero128(int128 a
)
43 a
[LSW128
] = a
[NLSW128
] = a
[NMSW128
] = a
[MSW128
] = 0;
46 /* Human-readable word order in the arguments */
47 static inline void set128(unsigned int i3
, unsigned int i2
, unsigned int i1
,
48 unsigned int i0
, int128 a
)
56 /* Convenience functions (for testing as well) */
57 static inline void int64_to_128(unsigned long long src
, int128 dest
)
59 dest
[LSW128
] = (unsigned int) src
;
60 dest
[NLSW128
] = src
>> 32;
61 dest
[NMSW128
] = dest
[MSW128
] = 0;
64 static inline void int128_to_64(const int128 src
, unsigned long long *dest
)
66 *dest
= src
[LSW128
] | (long long) src
[NLSW128
] << 32;
69 static inline void put_i128(const int128 a
)
71 printk("%08x %08x %08x %08x\n", a
[MSW128
], a
[NMSW128
],
72 a
[NLSW128
], a
[LSW128
]);
77 Note that these are only good for 0 < count < 32.
80 static inline void _lsl128(unsigned int count
, int128 a
)
82 a
[MSW128
] = (a
[MSW128
] << count
) | (a
[NMSW128
] >> (32 - count
));
83 a
[NMSW128
] = (a
[NMSW128
] << count
) | (a
[NLSW128
] >> (32 - count
));
84 a
[NLSW128
] = (a
[NLSW128
] << count
) | (a
[LSW128
] >> (32 - count
));
88 static inline void _lsr128(unsigned int count
, int128 a
)
90 a
[LSW128
] = (a
[LSW128
] >> count
) | (a
[NLSW128
] << (32 - count
));
91 a
[NLSW128
] = (a
[NLSW128
] >> count
) | (a
[NMSW128
] << (32 - count
));
92 a
[NMSW128
] = (a
[NMSW128
] >> count
) | (a
[MSW128
] << (32 - count
));
96 /* Should be faster, one would hope */
98 static inline void lslone128(int128 a
)
100 asm volatile ("lsl.l #1,%0\n"
116 static inline void lsrone128(int128 a
)
118 asm volatile ("lsr.l #1,%0\n"
134 /* Generalized 128-bit shifters:
136 These bit-shift to a multiple of 32, then move whole longwords. */
138 static inline void lsl128(unsigned int count
, int128 a
)
143 _lsl128(count
% 32, a
);
145 if (0 == (wordcount
= count
/ 32))
148 /* argh, gak, endian-sensitive */
149 for (i
= 0; i
< 4 - wordcount
; i
++) {
150 a
[i
] = a
[i
+ wordcount
];
152 for (i
= 3; i
>= 4 - wordcount
; --i
) {
157 static inline void lsr128(unsigned int count
, int128 a
)
162 _lsr128(count
% 32, a
);
164 if (0 == (wordcount
= count
/ 32))
167 for (i
= 3; i
>= wordcount
; --i
) {
168 a
[i
] = a
[i
- wordcount
];
170 for (i
= 0; i
< wordcount
; i
++) {
175 static inline int orl128(int a
, int128 b
)
180 static inline int btsthi128(const int128 a
)
182 return a
[MSW128
] & 0x80000000;
185 /* test bits (numbered from 0 = LSB) up to and including "top" */
186 static inline int bftestlo128(int top
, const int128 a
)
197 r
|= a
[3 - (top
/ 32)] & ((1 << (top
% 32 + 1)) - 1);
202 /* Aargh. We need these because GCC is broken */
203 /* FIXME: do them in assembly, for goodness' sake! */
204 static inline void mask64(int pos
, unsigned long long *mask
)
209 LO_WORD(*mask
) = (1 << pos
) - 1;
213 HI_WORD(*mask
) = (1 << (pos
- 32)) - 1;
216 static inline void bset64(int pos
, unsigned long long *dest
)
218 /* This conditional will be optimized away. Thanks, GCC! */
220 asm volatile ("bset %1,%0":"=m"
221 (LO_WORD(*dest
)):"id"(pos
));
223 asm volatile ("bset %1,%0":"=m"
224 (HI_WORD(*dest
)):"id"(pos
- 32));
227 static inline int btst64(int pos
, unsigned long long dest
)
230 return (0 != (LO_WORD(dest
) & (1 << pos
)));
232 return (0 != (HI_WORD(dest
) & (1 << (pos
- 32))));
235 static inline void lsl64(int count
, unsigned long long *dest
)
238 HI_WORD(*dest
) = (HI_WORD(*dest
) << count
)
239 | (LO_WORD(*dest
) >> count
);
240 LO_WORD(*dest
) <<= count
;
244 HI_WORD(*dest
) = LO_WORD(*dest
) << count
;
248 static inline void lsr64(int count
, unsigned long long *dest
)
251 LO_WORD(*dest
) = (LO_WORD(*dest
) >> count
)
252 | (HI_WORD(*dest
) << (32 - count
));
253 HI_WORD(*dest
) >>= count
;
257 LO_WORD(*dest
) = HI_WORD(*dest
) >> count
;
262 static inline void fp_denormalize(struct fp_ext
*reg
, unsigned int cnt
)
268 reg
->lowmant
= reg
->mant
.m32
[1] << (8 - cnt
);
269 reg
->mant
.m32
[1] = (reg
->mant
.m32
[1] >> cnt
) |
270 (reg
->mant
.m32
[0] << (32 - cnt
));
271 reg
->mant
.m32
[0] = reg
->mant
.m32
[0] >> cnt
;
274 reg
->lowmant
= reg
->mant
.m32
[1] >> (cnt
- 8);
275 if (reg
->mant
.m32
[1] << (40 - cnt
))
277 reg
->mant
.m32
[1] = (reg
->mant
.m32
[1] >> cnt
) |
278 (reg
->mant
.m32
[0] << (32 - cnt
));
279 reg
->mant
.m32
[0] = reg
->mant
.m32
[0] >> cnt
;
282 asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg
->lowmant
)
283 : "m" (reg
->mant
.m32
[0]), "d" (64 - cnt
));
284 if (reg
->mant
.m32
[1] << (40 - cnt
))
286 reg
->mant
.m32
[1] = reg
->mant
.m32
[0] >> (cnt
- 32);
287 reg
->mant
.m32
[0] = 0;
290 reg
->lowmant
= reg
->mant
.m32
[0] >> (cnt
- 40);
291 if ((reg
->mant
.m32
[0] << (72 - cnt
)) || reg
->mant
.m32
[1])
293 reg
->mant
.m32
[1] = reg
->mant
.m32
[0] >> (cnt
- 32);
294 reg
->mant
.m32
[0] = 0;
297 reg
->lowmant
= reg
->mant
.m32
[0] || reg
->mant
.m32
[1];
298 reg
->mant
.m32
[0] = 0;
299 reg
->mant
.m32
[1] = 0;
304 static inline int fp_overnormalize(struct fp_ext
*reg
)
308 if (reg
->mant
.m32
[0]) {
309 asm ("bfffo %1{#0,#32},%0" : "=d" (shift
) : "dm" (reg
->mant
.m32
[0]));
310 reg
->mant
.m32
[0] = (reg
->mant
.m32
[0] << shift
) | (reg
->mant
.m32
[1] >> (32 - shift
));
311 reg
->mant
.m32
[1] = (reg
->mant
.m32
[1] << shift
);
313 asm ("bfffo %1{#0,#32},%0" : "=d" (shift
) : "dm" (reg
->mant
.m32
[1]));
314 reg
->mant
.m32
[0] = (reg
->mant
.m32
[1] << shift
);
315 reg
->mant
.m32
[1] = 0;
322 static inline int fp_addmant(struct fp_ext
*dest
, struct fp_ext
*src
)
326 /* we assume here, gcc only insert move and a clr instr */
327 asm volatile ("add.b %1,%0" : "=d,g" (dest
->lowmant
)
328 : "g,d" (src
->lowmant
), "0,0" (dest
->lowmant
));
329 asm volatile ("addx.l %1,%0" : "=d" (dest
->mant
.m32
[1])
330 : "d" (src
->mant
.m32
[1]), "0" (dest
->mant
.m32
[1]));
331 asm volatile ("addx.l %1,%0" : "=d" (dest
->mant
.m32
[0])
332 : "d" (src
->mant
.m32
[0]), "0" (dest
->mant
.m32
[0]));
333 asm volatile ("addx.l %0,%0" : "=d" (carry
) : "0" (0));
338 static inline int fp_addcarry(struct fp_ext
*reg
)
340 if (++reg
->exp
== 0x7fff) {
342 fp_set_sr(FPSR_EXC_INEX2
);
344 fp_set_sr(FPSR_EXC_OVFL
);
347 reg
->lowmant
= (reg
->mant
.m32
[1] << 7) | (reg
->lowmant
? 1 : 0);
348 reg
->mant
.m32
[1] = (reg
->mant
.m32
[1] >> 1) |
349 (reg
->mant
.m32
[0] << 31);
350 reg
->mant
.m32
[0] = (reg
->mant
.m32
[0] >> 1) | 0x80000000;
355 static inline void fp_submant(struct fp_ext
*dest
, struct fp_ext
*src1
,
358 /* we assume here, gcc only insert move and a clr instr */
359 asm volatile ("sub.b %1,%0" : "=d,g" (dest
->lowmant
)
360 : "g,d" (src2
->lowmant
), "0,0" (src1
->lowmant
));
361 asm volatile ("subx.l %1,%0" : "=d" (dest
->mant
.m32
[1])
362 : "d" (src2
->mant
.m32
[1]), "0" (src1
->mant
.m32
[1]));
363 asm volatile ("subx.l %1,%0" : "=d" (dest
->mant
.m32
[0])
364 : "d" (src2
->mant
.m32
[0]), "0" (src1
->mant
.m32
[0]));
367 #define fp_mul64(desth, destl, src1, src2) ({ \
368 asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth) \
369 : "dm" (src1), "0" (src2)); \
371 #define fp_div64(quot, rem, srch, srcl, div) \
372 asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem) \
373 : "dm" (div), "1" (srch), "0" (srcl))
374 #define fp_add64(dest1, dest2, src1, src2) ({ \
375 asm ("add.l %1,%0" : "=d,dm" (dest2) \
376 : "dm,d" (src2), "0,0" (dest2)); \
377 asm ("addx.l %1,%0" : "=d" (dest1) \
378 : "d" (src1), "0" (dest1)); \
380 #define fp_addx96(dest, src) ({ \
381 /* we assume here, gcc only insert move and a clr instr */ \
382 asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2]) \
383 : "g,d" (temp.m32[1]), "0,0" (dest->m32[2])); \
384 asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1]) \
385 : "d" (temp.m32[0]), "0" (dest->m32[1])); \
386 asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0]) \
387 : "d" (0), "0" (dest->m32[0])); \
389 #define fp_sub64(dest, src) ({ \
390 asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1]) \
391 : "dm,d" (src.m32[1]), "0,0" (dest.m32[1])); \
392 asm ("subx.l %1,%0" : "=d" (dest.m32[0]) \
393 : "d" (src.m32[0]), "0" (dest.m32[0])); \
395 #define fp_sub96c(dest, srch, srcm, srcl) ({ \
397 asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2]) \
398 : "dm,d" (srcl), "0,0" (dest.m32[2])); \
399 asm ("subx.l %1,%0" : "=d" (dest.m32[1]) \
400 : "d" (srcm), "0" (dest.m32[1])); \
401 asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0]) \
402 : "d" (srch), "1" (dest.m32[0])); \
406 static inline void fp_multiplymant(union fp_mant128
*dest
, struct fp_ext
*src1
,
409 union fp_mant64 temp
;
411 fp_mul64(dest
->m32
[0], dest
->m32
[1], src1
->mant
.m32
[0], src2
->mant
.m32
[0]);
412 fp_mul64(dest
->m32
[2], dest
->m32
[3], src1
->mant
.m32
[1], src2
->mant
.m32
[1]);
414 fp_mul64(temp
.m32
[0], temp
.m32
[1], src1
->mant
.m32
[0], src2
->mant
.m32
[1]);
415 fp_addx96(dest
, temp
);
417 fp_mul64(temp
.m32
[0], temp
.m32
[1], src1
->mant
.m32
[1], src2
->mant
.m32
[0]);
418 fp_addx96(dest
, temp
);
421 static inline void fp_dividemant(union fp_mant128
*dest
, struct fp_ext
*src
,
424 union fp_mant128 tmp
;
425 union fp_mant64 tmp64
;
426 unsigned long *mantp
= dest
->m32
;
427 unsigned long fix
, rem
, first
, dummy
;
430 /* the algorithm below requires dest to be smaller than div,
431 but both have the high bit set */
432 if (src
->mant
.m64
>= div
->mant
.m64
) {
433 fp_sub64(src
->mant
, div
->mant
);
439 /* basic idea behind this algorithm: we can't divide two 64bit numbers
440 (AB/CD) directly, but we can calculate AB/C0, but this means this
441 quotient is off by C0/CD, so we have to multiply the first result
442 to fix the result, after that we have nearly the correct result
443 and only a few corrections are needed. */
445 /* C0/CD can be precalculated, but it's an 64bit division again, but
446 we can make it a bit easier, by dividing first through C so we get
447 10/1D and now only a single shift and the value fits into 32bit. */
449 dummy
= div
->mant
.m32
[1] / div
->mant
.m32
[0] + 1;
450 dummy
= (dummy
>> 1) | fix
;
451 fp_div64(fix
, dummy
, fix
, 0, dummy
);
454 for (i
= 0; i
< 3; i
++, mantp
++) {
455 if (src
->mant
.m32
[0] == div
->mant
.m32
[0]) {
456 fp_div64(first
, rem
, 0, src
->mant
.m32
[1], div
->mant
.m32
[0]);
458 fp_mul64(*mantp
, dummy
, first
, fix
);
461 fp_div64(first
, rem
, src
->mant
.m32
[0], src
->mant
.m32
[1], div
->mant
.m32
[0]);
463 fp_mul64(*mantp
, dummy
, first
, fix
);
466 fp_mul64(tmp
.m32
[0], tmp
.m32
[1], div
->mant
.m32
[0], first
- *mantp
);
467 fp_add64(tmp
.m32
[0], tmp
.m32
[1], 0, rem
);
470 fp_mul64(tmp64
.m32
[0], tmp64
.m32
[1], *mantp
, div
->mant
.m32
[1]);
471 fp_sub96c(tmp
, 0, tmp64
.m32
[0], tmp64
.m32
[1]);
473 src
->mant
.m32
[0] = tmp
.m32
[1];
474 src
->mant
.m32
[1] = tmp
.m32
[2];
476 while (!fp_sub96c(tmp
, 0, div
->mant
.m32
[0], div
->mant
.m32
[1])) {
477 src
->mant
.m32
[0] = tmp
.m32
[1];
478 src
->mant
.m32
[1] = tmp
.m32
[2];
485 static inline unsigned int fp_fls128(union fp_mant128
*src
)
488 unsigned int res
, off
;
490 if ((data
= src
->m32
[0]))
492 else if ((data
= src
->m32
[1]))
494 else if ((data
= src
->m32
[2]))
496 else if ((data
= src
->m32
[3]))
501 asm ("bfffo %1{#0,#32},%0" : "=d" (res
) : "dm" (data
));
505 static inline void fp_shiftmant128(union fp_mant128
*src
, int shift
)
507 unsigned long sticky
;
513 asm volatile ("lsl.l #1,%0"
514 : "=d" (src
->m32
[3]) : "0" (src
->m32
[3]));
515 asm volatile ("roxl.l #1,%0"
516 : "=d" (src
->m32
[2]) : "0" (src
->m32
[2]));
517 asm volatile ("roxl.l #1,%0"
518 : "=d" (src
->m32
[1]) : "0" (src
->m32
[1]));
519 asm volatile ("roxl.l #1,%0"
520 : "=d" (src
->m32
[0]) : "0" (src
->m32
[0]));
523 src
->m32
[0] = (src
->m32
[0] << shift
) | (src
->m32
[1] >> (32 - shift
));
524 src
->m32
[1] = (src
->m32
[1] << shift
) | (src
->m32
[2] >> (32 - shift
));
525 src
->m32
[2] = (src
->m32
[2] << shift
) | (src
->m32
[3] >> (32 - shift
));
526 src
->m32
[3] = (src
->m32
[3] << shift
);
530 src
->m32
[0] = (src
->m32
[1] << shift
) | (src
->m32
[2] >> (32 - shift
));
531 src
->m32
[1] = (src
->m32
[2] << shift
) | (src
->m32
[3] >> (32 - shift
));
532 src
->m32
[2] = (src
->m32
[3] << shift
);
537 src
->m32
[0] = (src
->m32
[2] << shift
) | (src
->m32
[3] >> (32 - shift
));
538 src
->m32
[1] = (src
->m32
[3] << shift
);
539 src
->m32
[2] = src
->m32
[3] = 0;
543 src
->m32
[0] = (src
->m32
[3] << shift
);
544 src
->m32
[1] = src
->m32
[2] = src
->m32
[3] = 0;
549 if (src
->m32
[3] << (32 - shift
))
551 src
->m32
[3] = (src
->m32
[3] >> shift
) | (src
->m32
[2] << (32 - shift
)) | sticky
;
552 src
->m32
[2] = (src
->m32
[2] >> shift
) | (src
->m32
[1] << (32 - shift
));
553 src
->m32
[1] = (src
->m32
[1] >> shift
) | (src
->m32
[0] << (32 - shift
));
554 src
->m32
[0] = (src
->m32
[0] >> shift
);
559 if ((src
->m32
[2] << (32 - shift
)) || src
->m32
[3])
561 src
->m32
[3] = (src
->m32
[2] >> shift
) | (src
->m32
[1] << (32 - shift
)) | sticky
;
562 src
->m32
[2] = (src
->m32
[1] >> shift
) | (src
->m32
[0] << (32 - shift
));
563 src
->m32
[1] = (src
->m32
[0] >> shift
);
569 if ((src
->m32
[1] << (32 - shift
)) || src
->m32
[2] || src
->m32
[3])
571 src
->m32
[3] = (src
->m32
[1] >> shift
) | (src
->m32
[0] << (32 - shift
)) | sticky
;
572 src
->m32
[2] = (src
->m32
[0] >> shift
);
573 src
->m32
[1] = src
->m32
[0] = 0;
578 if ((src
->m32
[0] << (32 - shift
)) || src
->m32
[1] || src
->m32
[2] || src
->m32
[3])
580 src
->m32
[3] = (src
->m32
[0] >> shift
) | sticky
;
581 src
->m32
[2] = src
->m32
[1] = src
->m32
[0] = 0;
585 if (shift
< 0 && (src
->m32
[0] || src
->m32
[1] || src
->m32
[2] || src
->m32
[3]))
595 static inline void fp_putmant128(struct fp_ext
*dest
, union fp_mant128
*src
,
602 dest
->mant
.m64
= src
->m64
[0];
603 dest
->lowmant
= src
->m32
[2] >> 24;
604 if (src
->m32
[3] || (src
->m32
[2] << 8))
608 asm volatile ("lsl.l #1,%0"
609 : "=d" (tmp
) : "0" (src
->m32
[2]));
610 asm volatile ("roxl.l #1,%0"
611 : "=d" (dest
->mant
.m32
[1]) : "0" (src
->m32
[1]));
612 asm volatile ("roxl.l #1,%0"
613 : "=d" (dest
->mant
.m32
[0]) : "0" (src
->m32
[0]));
614 dest
->lowmant
= tmp
>> 24;
615 if (src
->m32
[3] || (tmp
<< 8))
619 asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
620 : "=d" (dest
->mant
.m32
[0])
621 : "d" (src
->m32
[0]), "0" (src
->m32
[1]));
622 asm volatile ("roxr.l #1,%0"
623 : "=d" (dest
->mant
.m32
[1]) : "0" (src
->m32
[2]));
624 asm volatile ("roxr.l #1,%0"
625 : "=d" (tmp
) : "0" (src
->m32
[3]));
626 dest
->lowmant
= tmp
>> 24;
627 if (src
->m32
[3] << 7)
631 dest
->mant
.m32
[0] = src
->m32
[1];
632 dest
->mant
.m32
[1] = src
->m32
[2];
633 dest
->lowmant
= src
->m32
[3] >> 24;
634 if (src
->m32
[3] << 8)
640 #if 0 /* old code... */
641 static inline int fls(unsigned int a
)
645 asm volatile ("bfffo %1{#0,#32},%0"
646 : "=d" (r
) : "md" (a
));
650 /* fls = "find last set" (cf. ffs(3)) */
651 static inline int fls128(const int128 a
)
654 return fls(a
[MSW128
]);
656 return fls(a
[NMSW128
]) + 32;
657 /* XXX: it probably never gets beyond this point in actual
658 use, but that's indicative of a more general problem in the
659 algorithm (i.e. as per the actual 68881 implementation, we
660 really only need at most 67 bits of precision [plus
661 overflow]) so I'm not going to fix it. */
663 return fls(a
[NLSW128
]) + 64;
665 return fls(a
[LSW128
]) + 96;
670 static inline int zerop128(const int128 a
)
672 return !(a
[LSW128
] | a
[NLSW128
] | a
[NMSW128
] | a
[MSW128
]);
675 static inline int nonzerop128(const int128 a
)
677 return (a
[LSW128
] | a
[NLSW128
] | a
[NMSW128
] | a
[MSW128
]);
680 /* Addition and subtraction */
681 /* Do these in "pure" assembly, because "extended" asm is unmanageable
683 static inline void add128(const int128 a
, int128 b
)
685 /* rotating carry flags */
686 unsigned int carry
[2];
688 carry
[0] = a
[LSW128
] > (0xffffffff - b
[LSW128
]);
689 b
[LSW128
] += a
[LSW128
];
691 carry
[1] = a
[NLSW128
] > (0xffffffff - b
[NLSW128
] - carry
[0]);
692 b
[NLSW128
] = a
[NLSW128
] + b
[NLSW128
] + carry
[0];
694 carry
[0] = a
[NMSW128
] > (0xffffffff - b
[NMSW128
] - carry
[1]);
695 b
[NMSW128
] = a
[NMSW128
] + b
[NMSW128
] + carry
[1];
697 b
[MSW128
] = a
[MSW128
] + b
[MSW128
] + carry
[0];
700 /* Note: assembler semantics: "b -= a" */
701 static inline void sub128(const int128 a
, int128 b
)
703 /* rotating borrow flags */
704 unsigned int borrow
[2];
706 borrow
[0] = b
[LSW128
] < a
[LSW128
];
707 b
[LSW128
] -= a
[LSW128
];
709 borrow
[1] = b
[NLSW128
] < a
[NLSW128
] + borrow
[0];
710 b
[NLSW128
] = b
[NLSW128
] - a
[NLSW128
] - borrow
[0];
712 borrow
[0] = b
[NMSW128
] < a
[NMSW128
] + borrow
[1];
713 b
[NMSW128
] = b
[NMSW128
] - a
[NMSW128
] - borrow
[1];
715 b
[MSW128
] = b
[MSW128
] - a
[MSW128
] - borrow
[0];
718 /* Poor man's 64-bit expanding multiply */
719 static inline void mul64(unsigned long long a
, unsigned long long b
, int128 c
)
721 unsigned long long acc
;
727 /* first the low words */
728 if (LO_WORD(a
) && LO_WORD(b
)) {
729 acc
= (long long) LO_WORD(a
) * LO_WORD(b
);
730 c
[NLSW128
] = HI_WORD(acc
);
731 c
[LSW128
] = LO_WORD(acc
);
733 /* Next the high words */
734 if (HI_WORD(a
) && HI_WORD(b
)) {
735 acc
= (long long) HI_WORD(a
) * HI_WORD(b
);
736 c
[MSW128
] = HI_WORD(acc
);
737 c
[NMSW128
] = LO_WORD(acc
);
739 /* The middle words */
740 if (LO_WORD(a
) && HI_WORD(b
)) {
741 acc
= (long long) LO_WORD(a
) * HI_WORD(b
);
742 acc128
[NMSW128
] = HI_WORD(acc
);
743 acc128
[NLSW128
] = LO_WORD(acc
);
746 /* The first and last words */
747 if (HI_WORD(a
) && LO_WORD(b
)) {
748 acc
= (long long) HI_WORD(a
) * LO_WORD(b
);
749 acc128
[NMSW128
] = HI_WORD(acc
);
750 acc128
[NLSW128
] = LO_WORD(acc
);
756 static inline int cmp128(int128 a
, int128 b
)
758 if (a
[MSW128
] < b
[MSW128
])
760 if (a
[MSW128
] > b
[MSW128
])
762 if (a
[NMSW128
] < b
[NMSW128
])
764 if (a
[NMSW128
] > b
[NMSW128
])
766 if (a
[NLSW128
] < b
[NLSW128
])
768 if (a
[NLSW128
] > b
[NLSW128
])
771 return (signed) a
[LSW128
] - b
[LSW128
];
774 inline void div128(int128 a
, int128 b
, int128 c
)
780 Shift the divisor until it's at least as big as the
781 dividend, keeping track of the position to which we've
782 shifted it, i.e. the power of 2 which we've multiplied it
785 Then, for this power of 2 (the mask), and every one smaller
786 than it, subtract the mask from the dividend and add it to
787 the quotient until the dividend is smaller than the raised
788 divisor. At this point, divide the dividend and the mask
789 by 2 (i.e. shift one place to the right). Lather, rinse,
790 and repeat, until there are no more powers of 2 left. */
792 /* FIXME: needless to say, there's room for improvement here too. */
795 /* XXX: since it just has to be "at least as big", we can
796 probably eliminate this horribly wasteful loop. I will
797 have to prove this first, though */
798 set128(0, 0, 0, 1, mask
);
799 while (cmp128(b
, a
) < 0 && !btsthi128(b
)) {
807 if (cmp128(a
, b
) >= 0) {
813 } while (nonzerop128(mask
));
815 /* The remainder is in a... */
819 #endif /* MULTI_ARITH_H */