Committer: Michael Beasley <mike@snafu.setup>
[mikesnafu-overlay.git] / arch / m68k / math-emu / multi_arith.h
blob4ad0ca918e2e3e08426c5dd012adec1c29781b9e
1 /* multi_arith.h: multi-precision integer arithmetic functions, needed
2 to do extended-precision floating point.
4 (c) 1998 David Huggins-Daines.
6 Somewhat based on arch/alpha/math-emu/ieee-math.c, which is (c)
7 David Mosberger-Tang.
9 You may copy, modify, and redistribute this file under the terms of
10 the GNU General Public License, version 2, or any later version, at
11 your convenience. */
13 /* Note:
15 These are not general multi-precision math routines. Rather, they
16 implement the subset of integer arithmetic that we need in order to
17 multiply, divide, and normalize 128-bit unsigned mantissae. */
19 #ifndef MULTI_ARITH_H
20 #define MULTI_ARITH_H
22 #if 0 /* old code... */
24 /* Unsigned only, because we don't need signs to multiply and divide. */
25 typedef unsigned int int128[4];
27 /* Word order */
28 enum {
29 MSW128,
30 NMSW128,
31 NLSW128,
32 LSW128
35 /* big-endian */
36 #define LO_WORD(ll) (((unsigned int *) &ll)[1])
37 #define HI_WORD(ll) (((unsigned int *) &ll)[0])
39 /* Convenience functions to stuff various integer values into int128s */
41 static inline void zero128(int128 a)
43 a[LSW128] = a[NLSW128] = a[NMSW128] = a[MSW128] = 0;
46 /* Human-readable word order in the arguments */
47 static inline void set128(unsigned int i3, unsigned int i2, unsigned int i1,
48 unsigned int i0, int128 a)
50 a[LSW128] = i0;
51 a[NLSW128] = i1;
52 a[NMSW128] = i2;
53 a[MSW128] = i3;
56 /* Convenience functions (for testing as well) */
57 static inline void int64_to_128(unsigned long long src, int128 dest)
59 dest[LSW128] = (unsigned int) src;
60 dest[NLSW128] = src >> 32;
61 dest[NMSW128] = dest[MSW128] = 0;
64 static inline void int128_to_64(const int128 src, unsigned long long *dest)
66 *dest = src[LSW128] | (long long) src[NLSW128] << 32;
69 static inline void put_i128(const int128 a)
71 printk("%08x %08x %08x %08x\n", a[MSW128], a[NMSW128],
72 a[NLSW128], a[LSW128]);
75 /* Internal shifters:
77 Note that these are only good for 0 < count < 32.
80 static inline void _lsl128(unsigned int count, int128 a)
82 a[MSW128] = (a[MSW128] << count) | (a[NMSW128] >> (32 - count));
83 a[NMSW128] = (a[NMSW128] << count) | (a[NLSW128] >> (32 - count));
84 a[NLSW128] = (a[NLSW128] << count) | (a[LSW128] >> (32 - count));
85 a[LSW128] <<= count;
88 static inline void _lsr128(unsigned int count, int128 a)
90 a[LSW128] = (a[LSW128] >> count) | (a[NLSW128] << (32 - count));
91 a[NLSW128] = (a[NLSW128] >> count) | (a[NMSW128] << (32 - count));
92 a[NMSW128] = (a[NMSW128] >> count) | (a[MSW128] << (32 - count));
93 a[MSW128] >>= count;
96 /* Should be faster, one would hope */
98 static inline void lslone128(int128 a)
100 asm volatile ("lsl.l #1,%0\n"
101 "roxl.l #1,%1\n"
102 "roxl.l #1,%2\n"
103 "roxl.l #1,%3\n"
105 "=d" (a[LSW128]),
106 "=d"(a[NLSW128]),
107 "=d"(a[NMSW128]),
108 "=d"(a[MSW128])
110 "0"(a[LSW128]),
111 "1"(a[NLSW128]),
112 "2"(a[NMSW128]),
113 "3"(a[MSW128]));
116 static inline void lsrone128(int128 a)
118 asm volatile ("lsr.l #1,%0\n"
119 "roxr.l #1,%1\n"
120 "roxr.l #1,%2\n"
121 "roxr.l #1,%3\n"
123 "=d" (a[MSW128]),
124 "=d"(a[NMSW128]),
125 "=d"(a[NLSW128]),
126 "=d"(a[LSW128])
128 "0"(a[MSW128]),
129 "1"(a[NMSW128]),
130 "2"(a[NLSW128]),
131 "3"(a[LSW128]));
134 /* Generalized 128-bit shifters:
136 These bit-shift to a multiple of 32, then move whole longwords. */
138 static inline void lsl128(unsigned int count, int128 a)
140 int wordcount, i;
142 if (count % 32)
143 _lsl128(count % 32, a);
145 if (0 == (wordcount = count / 32))
146 return;
148 /* argh, gak, endian-sensitive */
149 for (i = 0; i < 4 - wordcount; i++) {
150 a[i] = a[i + wordcount];
152 for (i = 3; i >= 4 - wordcount; --i) {
153 a[i] = 0;
157 static inline void lsr128(unsigned int count, int128 a)
159 int wordcount, i;
161 if (count % 32)
162 _lsr128(count % 32, a);
164 if (0 == (wordcount = count / 32))
165 return;
167 for (i = 3; i >= wordcount; --i) {
168 a[i] = a[i - wordcount];
170 for (i = 0; i < wordcount; i++) {
171 a[i] = 0;
175 static inline int orl128(int a, int128 b)
177 b[LSW128] |= a;
180 static inline int btsthi128(const int128 a)
182 return a[MSW128] & 0x80000000;
185 /* test bits (numbered from 0 = LSB) up to and including "top" */
186 static inline int bftestlo128(int top, const int128 a)
188 int r = 0;
190 if (top > 31)
191 r |= a[LSW128];
192 if (top > 63)
193 r |= a[NLSW128];
194 if (top > 95)
195 r |= a[NMSW128];
197 r |= a[3 - (top / 32)] & ((1 << (top % 32 + 1)) - 1);
199 return (r != 0);
202 /* Aargh. We need these because GCC is broken */
203 /* FIXME: do them in assembly, for goodness' sake! */
204 static inline void mask64(int pos, unsigned long long *mask)
206 *mask = 0;
208 if (pos < 32) {
209 LO_WORD(*mask) = (1 << pos) - 1;
210 return;
212 LO_WORD(*mask) = -1;
213 HI_WORD(*mask) = (1 << (pos - 32)) - 1;
216 static inline void bset64(int pos, unsigned long long *dest)
218 /* This conditional will be optimized away. Thanks, GCC! */
219 if (pos < 32)
220 asm volatile ("bset %1,%0":"=m"
221 (LO_WORD(*dest)):"id"(pos));
222 else
223 asm volatile ("bset %1,%0":"=m"
224 (HI_WORD(*dest)):"id"(pos - 32));
227 static inline int btst64(int pos, unsigned long long dest)
229 if (pos < 32)
230 return (0 != (LO_WORD(dest) & (1 << pos)));
231 else
232 return (0 != (HI_WORD(dest) & (1 << (pos - 32))));
235 static inline void lsl64(int count, unsigned long long *dest)
237 if (count < 32) {
238 HI_WORD(*dest) = (HI_WORD(*dest) << count)
239 | (LO_WORD(*dest) >> count);
240 LO_WORD(*dest) <<= count;
241 return;
243 count -= 32;
244 HI_WORD(*dest) = LO_WORD(*dest) << count;
245 LO_WORD(*dest) = 0;
248 static inline void lsr64(int count, unsigned long long *dest)
250 if (count < 32) {
251 LO_WORD(*dest) = (LO_WORD(*dest) >> count)
252 | (HI_WORD(*dest) << (32 - count));
253 HI_WORD(*dest) >>= count;
254 return;
256 count -= 32;
257 LO_WORD(*dest) = HI_WORD(*dest) >> count;
258 HI_WORD(*dest) = 0;
260 #endif
262 static inline void fp_denormalize(struct fp_ext *reg, unsigned int cnt)
264 reg->exp += cnt;
266 switch (cnt) {
267 case 0 ... 8:
268 reg->lowmant = reg->mant.m32[1] << (8 - cnt);
269 reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
270 (reg->mant.m32[0] << (32 - cnt));
271 reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
272 break;
273 case 9 ... 32:
274 reg->lowmant = reg->mant.m32[1] >> (cnt - 8);
275 if (reg->mant.m32[1] << (40 - cnt))
276 reg->lowmant |= 1;
277 reg->mant.m32[1] = (reg->mant.m32[1] >> cnt) |
278 (reg->mant.m32[0] << (32 - cnt));
279 reg->mant.m32[0] = reg->mant.m32[0] >> cnt;
280 break;
281 case 33 ... 39:
282 asm volatile ("bfextu %1{%2,#8},%0" : "=d" (reg->lowmant)
283 : "m" (reg->mant.m32[0]), "d" (64 - cnt));
284 if (reg->mant.m32[1] << (40 - cnt))
285 reg->lowmant |= 1;
286 reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
287 reg->mant.m32[0] = 0;
288 break;
289 case 40 ... 71:
290 reg->lowmant = reg->mant.m32[0] >> (cnt - 40);
291 if ((reg->mant.m32[0] << (72 - cnt)) || reg->mant.m32[1])
292 reg->lowmant |= 1;
293 reg->mant.m32[1] = reg->mant.m32[0] >> (cnt - 32);
294 reg->mant.m32[0] = 0;
295 break;
296 default:
297 reg->lowmant = reg->mant.m32[0] || reg->mant.m32[1];
298 reg->mant.m32[0] = 0;
299 reg->mant.m32[1] = 0;
300 break;
304 static inline int fp_overnormalize(struct fp_ext *reg)
306 int shift;
308 if (reg->mant.m32[0]) {
309 asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[0]));
310 reg->mant.m32[0] = (reg->mant.m32[0] << shift) | (reg->mant.m32[1] >> (32 - shift));
311 reg->mant.m32[1] = (reg->mant.m32[1] << shift);
312 } else {
313 asm ("bfffo %1{#0,#32},%0" : "=d" (shift) : "dm" (reg->mant.m32[1]));
314 reg->mant.m32[0] = (reg->mant.m32[1] << shift);
315 reg->mant.m32[1] = 0;
316 shift += 32;
319 return shift;
322 static inline int fp_addmant(struct fp_ext *dest, struct fp_ext *src)
324 int carry;
326 /* we assume here, gcc only insert move and a clr instr */
327 asm volatile ("add.b %1,%0" : "=d,g" (dest->lowmant)
328 : "g,d" (src->lowmant), "0,0" (dest->lowmant));
329 asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[1])
330 : "d" (src->mant.m32[1]), "0" (dest->mant.m32[1]));
331 asm volatile ("addx.l %1,%0" : "=d" (dest->mant.m32[0])
332 : "d" (src->mant.m32[0]), "0" (dest->mant.m32[0]));
333 asm volatile ("addx.l %0,%0" : "=d" (carry) : "0" (0));
335 return carry;
338 static inline int fp_addcarry(struct fp_ext *reg)
340 if (++reg->exp == 0x7fff) {
341 if (reg->mant.m64)
342 fp_set_sr(FPSR_EXC_INEX2);
343 reg->mant.m64 = 0;
344 fp_set_sr(FPSR_EXC_OVFL);
345 return 0;
347 reg->lowmant = (reg->mant.m32[1] << 7) | (reg->lowmant ? 1 : 0);
348 reg->mant.m32[1] = (reg->mant.m32[1] >> 1) |
349 (reg->mant.m32[0] << 31);
350 reg->mant.m32[0] = (reg->mant.m32[0] >> 1) | 0x80000000;
352 return 1;
355 static inline void fp_submant(struct fp_ext *dest, struct fp_ext *src1,
356 struct fp_ext *src2)
358 /* we assume here, gcc only insert move and a clr instr */
359 asm volatile ("sub.b %1,%0" : "=d,g" (dest->lowmant)
360 : "g,d" (src2->lowmant), "0,0" (src1->lowmant));
361 asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[1])
362 : "d" (src2->mant.m32[1]), "0" (src1->mant.m32[1]));
363 asm volatile ("subx.l %1,%0" : "=d" (dest->mant.m32[0])
364 : "d" (src2->mant.m32[0]), "0" (src1->mant.m32[0]));
367 #define fp_mul64(desth, destl, src1, src2) ({ \
368 asm ("mulu.l %2,%1:%0" : "=d" (destl), "=d" (desth) \
369 : "dm" (src1), "0" (src2)); \
371 #define fp_div64(quot, rem, srch, srcl, div) \
372 asm ("divu.l %2,%1:%0" : "=d" (quot), "=d" (rem) \
373 : "dm" (div), "1" (srch), "0" (srcl))
374 #define fp_add64(dest1, dest2, src1, src2) ({ \
375 asm ("add.l %1,%0" : "=d,dm" (dest2) \
376 : "dm,d" (src2), "0,0" (dest2)); \
377 asm ("addx.l %1,%0" : "=d" (dest1) \
378 : "d" (src1), "0" (dest1)); \
380 #define fp_addx96(dest, src) ({ \
381 /* we assume here, gcc only insert move and a clr instr */ \
382 asm volatile ("add.l %1,%0" : "=d,g" (dest->m32[2]) \
383 : "g,d" (temp.m32[1]), "0,0" (dest->m32[2])); \
384 asm volatile ("addx.l %1,%0" : "=d" (dest->m32[1]) \
385 : "d" (temp.m32[0]), "0" (dest->m32[1])); \
386 asm volatile ("addx.l %1,%0" : "=d" (dest->m32[0]) \
387 : "d" (0), "0" (dest->m32[0])); \
389 #define fp_sub64(dest, src) ({ \
390 asm ("sub.l %1,%0" : "=d,dm" (dest.m32[1]) \
391 : "dm,d" (src.m32[1]), "0,0" (dest.m32[1])); \
392 asm ("subx.l %1,%0" : "=d" (dest.m32[0]) \
393 : "d" (src.m32[0]), "0" (dest.m32[0])); \
395 #define fp_sub96c(dest, srch, srcm, srcl) ({ \
396 char carry; \
397 asm ("sub.l %1,%0" : "=d,dm" (dest.m32[2]) \
398 : "dm,d" (srcl), "0,0" (dest.m32[2])); \
399 asm ("subx.l %1,%0" : "=d" (dest.m32[1]) \
400 : "d" (srcm), "0" (dest.m32[1])); \
401 asm ("subx.l %2,%1; scs %0" : "=d" (carry), "=d" (dest.m32[0]) \
402 : "d" (srch), "1" (dest.m32[0])); \
403 carry; \
406 static inline void fp_multiplymant(union fp_mant128 *dest, struct fp_ext *src1,
407 struct fp_ext *src2)
409 union fp_mant64 temp;
411 fp_mul64(dest->m32[0], dest->m32[1], src1->mant.m32[0], src2->mant.m32[0]);
412 fp_mul64(dest->m32[2], dest->m32[3], src1->mant.m32[1], src2->mant.m32[1]);
414 fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[0], src2->mant.m32[1]);
415 fp_addx96(dest, temp);
417 fp_mul64(temp.m32[0], temp.m32[1], src1->mant.m32[1], src2->mant.m32[0]);
418 fp_addx96(dest, temp);
421 static inline void fp_dividemant(union fp_mant128 *dest, struct fp_ext *src,
422 struct fp_ext *div)
424 union fp_mant128 tmp;
425 union fp_mant64 tmp64;
426 unsigned long *mantp = dest->m32;
427 unsigned long fix, rem, first, dummy;
428 int i;
430 /* the algorithm below requires dest to be smaller than div,
431 but both have the high bit set */
432 if (src->mant.m64 >= div->mant.m64) {
433 fp_sub64(src->mant, div->mant);
434 *mantp = 1;
435 } else
436 *mantp = 0;
437 mantp++;
439 /* basic idea behind this algorithm: we can't divide two 64bit numbers
440 (AB/CD) directly, but we can calculate AB/C0, but this means this
441 quotient is off by C0/CD, so we have to multiply the first result
442 to fix the result, after that we have nearly the correct result
443 and only a few corrections are needed. */
445 /* C0/CD can be precalculated, but it's an 64bit division again, but
446 we can make it a bit easier, by dividing first through C so we get
447 10/1D and now only a single shift and the value fits into 32bit. */
448 fix = 0x80000000;
449 dummy = div->mant.m32[1] / div->mant.m32[0] + 1;
450 dummy = (dummy >> 1) | fix;
451 fp_div64(fix, dummy, fix, 0, dummy);
452 fix--;
454 for (i = 0; i < 3; i++, mantp++) {
455 if (src->mant.m32[0] == div->mant.m32[0]) {
456 fp_div64(first, rem, 0, src->mant.m32[1], div->mant.m32[0]);
458 fp_mul64(*mantp, dummy, first, fix);
459 *mantp += fix;
460 } else {
461 fp_div64(first, rem, src->mant.m32[0], src->mant.m32[1], div->mant.m32[0]);
463 fp_mul64(*mantp, dummy, first, fix);
466 fp_mul64(tmp.m32[0], tmp.m32[1], div->mant.m32[0], first - *mantp);
467 fp_add64(tmp.m32[0], tmp.m32[1], 0, rem);
468 tmp.m32[2] = 0;
470 fp_mul64(tmp64.m32[0], tmp64.m32[1], *mantp, div->mant.m32[1]);
471 fp_sub96c(tmp, 0, tmp64.m32[0], tmp64.m32[1]);
473 src->mant.m32[0] = tmp.m32[1];
474 src->mant.m32[1] = tmp.m32[2];
476 while (!fp_sub96c(tmp, 0, div->mant.m32[0], div->mant.m32[1])) {
477 src->mant.m32[0] = tmp.m32[1];
478 src->mant.m32[1] = tmp.m32[2];
479 *mantp += 1;
484 #if 0
485 static inline unsigned int fp_fls128(union fp_mant128 *src)
487 unsigned long data;
488 unsigned int res, off;
490 if ((data = src->m32[0]))
491 off = 0;
492 else if ((data = src->m32[1]))
493 off = 32;
494 else if ((data = src->m32[2]))
495 off = 64;
496 else if ((data = src->m32[3]))
497 off = 96;
498 else
499 return 128;
501 asm ("bfffo %1{#0,#32},%0" : "=d" (res) : "dm" (data));
502 return res + off;
505 static inline void fp_shiftmant128(union fp_mant128 *src, int shift)
507 unsigned long sticky;
509 switch (shift) {
510 case 0:
511 return;
512 case 1:
513 asm volatile ("lsl.l #1,%0"
514 : "=d" (src->m32[3]) : "0" (src->m32[3]));
515 asm volatile ("roxl.l #1,%0"
516 : "=d" (src->m32[2]) : "0" (src->m32[2]));
517 asm volatile ("roxl.l #1,%0"
518 : "=d" (src->m32[1]) : "0" (src->m32[1]));
519 asm volatile ("roxl.l #1,%0"
520 : "=d" (src->m32[0]) : "0" (src->m32[0]));
521 return;
522 case 2 ... 31:
523 src->m32[0] = (src->m32[0] << shift) | (src->m32[1] >> (32 - shift));
524 src->m32[1] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
525 src->m32[2] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
526 src->m32[3] = (src->m32[3] << shift);
527 return;
528 case 32 ... 63:
529 shift -= 32;
530 src->m32[0] = (src->m32[1] << shift) | (src->m32[2] >> (32 - shift));
531 src->m32[1] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
532 src->m32[2] = (src->m32[3] << shift);
533 src->m32[3] = 0;
534 return;
535 case 64 ... 95:
536 shift -= 64;
537 src->m32[0] = (src->m32[2] << shift) | (src->m32[3] >> (32 - shift));
538 src->m32[1] = (src->m32[3] << shift);
539 src->m32[2] = src->m32[3] = 0;
540 return;
541 case 96 ... 127:
542 shift -= 96;
543 src->m32[0] = (src->m32[3] << shift);
544 src->m32[1] = src->m32[2] = src->m32[3] = 0;
545 return;
546 case -31 ... -1:
547 shift = -shift;
548 sticky = 0;
549 if (src->m32[3] << (32 - shift))
550 sticky = 1;
551 src->m32[3] = (src->m32[3] >> shift) | (src->m32[2] << (32 - shift)) | sticky;
552 src->m32[2] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift));
553 src->m32[1] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
554 src->m32[0] = (src->m32[0] >> shift);
555 return;
556 case -63 ... -32:
557 shift = -shift - 32;
558 sticky = 0;
559 if ((src->m32[2] << (32 - shift)) || src->m32[3])
560 sticky = 1;
561 src->m32[3] = (src->m32[2] >> shift) | (src->m32[1] << (32 - shift)) | sticky;
562 src->m32[2] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift));
563 src->m32[1] = (src->m32[0] >> shift);
564 src->m32[0] = 0;
565 return;
566 case -95 ... -64:
567 shift = -shift - 64;
568 sticky = 0;
569 if ((src->m32[1] << (32 - shift)) || src->m32[2] || src->m32[3])
570 sticky = 1;
571 src->m32[3] = (src->m32[1] >> shift) | (src->m32[0] << (32 - shift)) | sticky;
572 src->m32[2] = (src->m32[0] >> shift);
573 src->m32[1] = src->m32[0] = 0;
574 return;
575 case -127 ... -96:
576 shift = -shift - 96;
577 sticky = 0;
578 if ((src->m32[0] << (32 - shift)) || src->m32[1] || src->m32[2] || src->m32[3])
579 sticky = 1;
580 src->m32[3] = (src->m32[0] >> shift) | sticky;
581 src->m32[2] = src->m32[1] = src->m32[0] = 0;
582 return;
585 if (shift < 0 && (src->m32[0] || src->m32[1] || src->m32[2] || src->m32[3]))
586 src->m32[3] = 1;
587 else
588 src->m32[3] = 0;
589 src->m32[2] = 0;
590 src->m32[1] = 0;
591 src->m32[0] = 0;
593 #endif
595 static inline void fp_putmant128(struct fp_ext *dest, union fp_mant128 *src,
596 int shift)
598 unsigned long tmp;
600 switch (shift) {
601 case 0:
602 dest->mant.m64 = src->m64[0];
603 dest->lowmant = src->m32[2] >> 24;
604 if (src->m32[3] || (src->m32[2] << 8))
605 dest->lowmant |= 1;
606 break;
607 case 1:
608 asm volatile ("lsl.l #1,%0"
609 : "=d" (tmp) : "0" (src->m32[2]));
610 asm volatile ("roxl.l #1,%0"
611 : "=d" (dest->mant.m32[1]) : "0" (src->m32[1]));
612 asm volatile ("roxl.l #1,%0"
613 : "=d" (dest->mant.m32[0]) : "0" (src->m32[0]));
614 dest->lowmant = tmp >> 24;
615 if (src->m32[3] || (tmp << 8))
616 dest->lowmant |= 1;
617 break;
618 case 31:
619 asm volatile ("lsr.l #1,%1; roxr.l #1,%0"
620 : "=d" (dest->mant.m32[0])
621 : "d" (src->m32[0]), "0" (src->m32[1]));
622 asm volatile ("roxr.l #1,%0"
623 : "=d" (dest->mant.m32[1]) : "0" (src->m32[2]));
624 asm volatile ("roxr.l #1,%0"
625 : "=d" (tmp) : "0" (src->m32[3]));
626 dest->lowmant = tmp >> 24;
627 if (src->m32[3] << 7)
628 dest->lowmant |= 1;
629 break;
630 case 32:
631 dest->mant.m32[0] = src->m32[1];
632 dest->mant.m32[1] = src->m32[2];
633 dest->lowmant = src->m32[3] >> 24;
634 if (src->m32[3] << 8)
635 dest->lowmant |= 1;
636 break;
640 #if 0 /* old code... */
641 static inline int fls(unsigned int a)
643 int r;
645 asm volatile ("bfffo %1{#0,#32},%0"
646 : "=d" (r) : "md" (a));
647 return r;
650 /* fls = "find last set" (cf. ffs(3)) */
651 static inline int fls128(const int128 a)
653 if (a[MSW128])
654 return fls(a[MSW128]);
655 if (a[NMSW128])
656 return fls(a[NMSW128]) + 32;
657 /* XXX: it probably never gets beyond this point in actual
658 use, but that's indicative of a more general problem in the
659 algorithm (i.e. as per the actual 68881 implementation, we
660 really only need at most 67 bits of precision [plus
661 overflow]) so I'm not going to fix it. */
662 if (a[NLSW128])
663 return fls(a[NLSW128]) + 64;
664 if (a[LSW128])
665 return fls(a[LSW128]) + 96;
666 else
667 return -1;
670 static inline int zerop128(const int128 a)
672 return !(a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
675 static inline int nonzerop128(const int128 a)
677 return (a[LSW128] | a[NLSW128] | a[NMSW128] | a[MSW128]);
680 /* Addition and subtraction */
681 /* Do these in "pure" assembly, because "extended" asm is unmanageable
682 here */
683 static inline void add128(const int128 a, int128 b)
685 /* rotating carry flags */
686 unsigned int carry[2];
688 carry[0] = a[LSW128] > (0xffffffff - b[LSW128]);
689 b[LSW128] += a[LSW128];
691 carry[1] = a[NLSW128] > (0xffffffff - b[NLSW128] - carry[0]);
692 b[NLSW128] = a[NLSW128] + b[NLSW128] + carry[0];
694 carry[0] = a[NMSW128] > (0xffffffff - b[NMSW128] - carry[1]);
695 b[NMSW128] = a[NMSW128] + b[NMSW128] + carry[1];
697 b[MSW128] = a[MSW128] + b[MSW128] + carry[0];
700 /* Note: assembler semantics: "b -= a" */
701 static inline void sub128(const int128 a, int128 b)
703 /* rotating borrow flags */
704 unsigned int borrow[2];
706 borrow[0] = b[LSW128] < a[LSW128];
707 b[LSW128] -= a[LSW128];
709 borrow[1] = b[NLSW128] < a[NLSW128] + borrow[0];
710 b[NLSW128] = b[NLSW128] - a[NLSW128] - borrow[0];
712 borrow[0] = b[NMSW128] < a[NMSW128] + borrow[1];
713 b[NMSW128] = b[NMSW128] - a[NMSW128] - borrow[1];
715 b[MSW128] = b[MSW128] - a[MSW128] - borrow[0];
718 /* Poor man's 64-bit expanding multiply */
719 static inline void mul64(unsigned long long a, unsigned long long b, int128 c)
721 unsigned long long acc;
722 int128 acc128;
724 zero128(acc128);
725 zero128(c);
727 /* first the low words */
728 if (LO_WORD(a) && LO_WORD(b)) {
729 acc = (long long) LO_WORD(a) * LO_WORD(b);
730 c[NLSW128] = HI_WORD(acc);
731 c[LSW128] = LO_WORD(acc);
733 /* Next the high words */
734 if (HI_WORD(a) && HI_WORD(b)) {
735 acc = (long long) HI_WORD(a) * HI_WORD(b);
736 c[MSW128] = HI_WORD(acc);
737 c[NMSW128] = LO_WORD(acc);
739 /* The middle words */
740 if (LO_WORD(a) && HI_WORD(b)) {
741 acc = (long long) LO_WORD(a) * HI_WORD(b);
742 acc128[NMSW128] = HI_WORD(acc);
743 acc128[NLSW128] = LO_WORD(acc);
744 add128(acc128, c);
746 /* The first and last words */
747 if (HI_WORD(a) && LO_WORD(b)) {
748 acc = (long long) HI_WORD(a) * LO_WORD(b);
749 acc128[NMSW128] = HI_WORD(acc);
750 acc128[NLSW128] = LO_WORD(acc);
751 add128(acc128, c);
755 /* Note: unsigned */
756 static inline int cmp128(int128 a, int128 b)
758 if (a[MSW128] < b[MSW128])
759 return -1;
760 if (a[MSW128] > b[MSW128])
761 return 1;
762 if (a[NMSW128] < b[NMSW128])
763 return -1;
764 if (a[NMSW128] > b[NMSW128])
765 return 1;
766 if (a[NLSW128] < b[NLSW128])
767 return -1;
768 if (a[NLSW128] > b[NLSW128])
769 return 1;
771 return (signed) a[LSW128] - b[LSW128];
774 inline void div128(int128 a, int128 b, int128 c)
776 int128 mask;
778 /* Algorithm:
780 Shift the divisor until it's at least as big as the
781 dividend, keeping track of the position to which we've
782 shifted it, i.e. the power of 2 which we've multiplied it
785 Then, for this power of 2 (the mask), and every one smaller
786 than it, subtract the mask from the dividend and add it to
787 the quotient until the dividend is smaller than the raised
788 divisor. At this point, divide the dividend and the mask
789 by 2 (i.e. shift one place to the right). Lather, rinse,
790 and repeat, until there are no more powers of 2 left. */
792 /* FIXME: needless to say, there's room for improvement here too. */
794 /* Shift up */
795 /* XXX: since it just has to be "at least as big", we can
796 probably eliminate this horribly wasteful loop. I will
797 have to prove this first, though */
798 set128(0, 0, 0, 1, mask);
799 while (cmp128(b, a) < 0 && !btsthi128(b)) {
800 lslone128(b);
801 lslone128(mask);
804 /* Shift down */
805 zero128(c);
806 do {
807 if (cmp128(a, b) >= 0) {
808 sub128(b, a);
809 add128(mask, c);
811 lsrone128(mask);
812 lsrone128(b);
813 } while (nonzerop128(mask));
815 /* The remainder is in a... */
817 #endif
819 #endif /* MULTI_ARITH_H */