[luatex.git] / source / libs / gmp / gmp-src / mpn / generic / div_qr_2.c
/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and
   quotient. The divisor is two limbs.

   Contributed to the GNU project by Torbjorn Granlund and Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.


Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library. If not,
see https://www.gnu.org/licenses/.  */
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
#ifndef DIV_QR_2_PI2_THRESHOLD
/* Disabled unless explicitly tuned. */
#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX
#endif
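
/* mpn_div_qr_2 (below) compares nn against this threshold to choose, for a
   normalized divisor, between the 3/2-inverse loop (mpn_div_qr_2n_pi1) and
   the 4/2-inverse loop (mpn_div_qr_2n_pi2).  With the default value of
   MP_LIMB_T_MAX the pi2 path is effectively never taken, matching the
   "disabled unless explicitly tuned" note above.  */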
#ifndef SANITY_CHECK
#define SANITY_CHECK 0
#endif
/* Define some longlong.h-style macros, but for wider operations.
 * add_sssaaaa is like longlong.h's add_ssaaaa, but propagating the
   carry-out into an additional sum operand.
 * add_csaac accepts two addends and a carry in, and generates a sum
   and a carry out. A little like a "full adder".
*/
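
/* In limb arithmetic (writing B = 2^GMP_LIMB_BITS), the intended effect of
   the two macros can be summarized roughly as follows; the asm variants and
   the generic C fallbacks below are the authoritative definitions.

     add_sssaaaa (s2,s1,s0, a1,a0, b1,b0):
       <s1,s0>  = (<a1,a0> + <b1,b0>) mod B^2
       s2      += carry out of that two-limb addition (0 or 1)

     add_csaac (co,s, a,b,ci):
       s   = (a + b + ci) mod B
       co  = carry out, i.e. (a + b + ci) / B  (0 or 1, for ci <= 1)
*/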
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \
           : "=r" (s2), "=&r" (s1), "=&r" (s0) \
           : "0" ((USItype)(s2)), \
             "1" ((USItype)(a1)), "g" ((USItype)(b1)), \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#define add_csaac(co, s, a, b, ci) \
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %k1\n\tadc\t%k0, %k0" \
           : "=r" (co), "=r" (s) \
           : "rm" ((USItype)(ci)), "0" (CNST_LIMB(0)), \
             "%1" ((USItype)(a)), "g" ((USItype)(b)))
#endif
#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \
           : "=r" (s2), "=&r" (s1), "=&r" (s0) \
           : "0" ((UDItype)(s2)), \
             "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#define add_csaac(co, s, a, b, ci) \
  __asm__ ("bt\t$0, %2\n\tadc\t%5, %q1\n\tadc\t%q0, %q0" \
           : "=r" (co), "=r" (s) \
           : "rm" ((UDItype)(ci)), "0" (CNST_LIMB(0)), \
             "%1" ((UDItype)(a)), "g" ((UDItype)(b)))
#endif
#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry. */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0" \
           : "=r" (s2), "=&r" (s1), "=&r" (s0) \
           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */
#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \
  do { \
    UWtype __s0, __s1, __c0, __c1; \
    __s0 = (a0) + (b0); \
    __s1 = (a1) + (b1); \
    __c0 = __s0 < (a0); \
    __c1 = __s1 < (a1); \
    (s0) = __s0; \
    __s1 = __s1 + __c0; \
    (s1) = __s1; \
    (s2) += __c1 + (__s1 < __c0); \
  } while (0)
#endif
#ifndef add_csaac
#define add_csaac(co, s, a, b, ci) \
  do { \
    UWtype __s, __c; \
    __s = (a) + (b); \
    __c = __s < (a); \
    __s = __s + (ci); \
    (s) = __s; \
    (co) = __c + (__s < (ci)); \
  } while (0)
#endif
/* Typically used with r1, r0 same as n3, n2. Other types of overlap
   between inputs and outputs are not supported. */
#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \
  do { \
    mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \
    mp_limb_t _t1, _t0; \
    mp_limb_t _c, _mask; \
 \
    umul_ppmm (_q3,_q2a, n3, di1); \
    umul_ppmm (_q2,_q1, n2, di1); \
    umul_ppmm (_q2c,_q1c, n3, di0); \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1c); \
    umul_ppmm (_q1d,_q0, n2, di0); \
    add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2a,_q1d); \
 \
    add_ssaaaa (r1, r0, n3, n2, CNST_LIMB(0), CNST_LIMB(1)); \
 \
    /* [q3,q2,q1,q0] += [n3,n2,n1,n0], plus 1 at the B^2 position \
       (the extra 1 folded into r0 above) */ \
    add_csaac (_c, _q0, _q0, n0, CNST_LIMB(0)); \
    add_csaac (_c, _q1, _q1, n1, _c); \
    add_csaac (_c, _q2, _q2, r0, _c); \
    _q3 = _q3 + r1 + _c; \
 \
    umul_ppmm (_t1,_t0, _q2, d0); \
    _t1 += _q2 * d1 + _q3 * d0; \
 \
    sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \
 \
    _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \
    add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask); \
    sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask); \
 \
    if (UNLIKELY (r1 >= d1)) \
      { \
        if (r1 > d1 || r0 >= d0) \
          { \
            sub_ddmmss (r1, r0, r1, r0, d1, d0); \
            add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1)); \
          } \
      } \
    (q1) = _q3; \
    (q0) = _q2; \
  } while (0)
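
/* Informal sketch of what udiv_qr_4by2 computes; the macro above is the
   authoritative definition.  With B = 2^GMP_LIMB_BITS, a normalized divisor
   D = <d1,d0>, and the 4/2 inverse <1,di1,di0> = (B^4 - 1) / D (see
   invert_4by2 below), the quotient estimate is formed as

     <_q3,_q2,_q1,_q0> = <n3,n2> * <di1,di0> + <n3,n2,n1,n0> + B^2,

   whose two high limbs <_q3,_q2> approximate floor(<n3,n2,n1,n0> / D).
   The candidate remainder <n3,n2,n1,n0> - <_q3,_q2> * D is then computed
   modulo B^2 and corrected by at most two conditional adjustments (adding
   or subtracting D once each, with the matching quotient decrement or
   increment), after which <_q3,_q2> is the quotient and <r1,r0> the
   remainder.  */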
static void
invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0)
{
  mp_limb_t v1, v0, p1, t1, t0, p0, mask;
  invert_limb (v1, d1);
  p1 = d1 * v1;
  /* <1, v1> * d1 = <B-1, p1> */
  p1 += d0;
  if (p1 < d0)
    {
      v1--;
      mask = -(mp_limb_t) (p1 >= d1);
      p1 -= d1;
      v1 += mask;
      p1 -= mask & d1;
    }
  /* <1, v1> * d1 + d0 = <B-1, p1> */
  umul_ppmm (t1, p0, d0, v1);
  p1 += t1;
  if (p1 < t1)
    {
      if (UNLIKELY (p1 >= d1))
        {
          if (p1 > d1 || p0 >= d0)
            {
              sub_ddmmss (p1, p0, p1, p0, d1, d0);
              v1--;
            }
        }
      sub_ddmmss (p1, p0, p1, p0, d1, d0);
      v1--;
    }
  /* Now v1 is the 3/2 inverse, <1, v1> * <d1, d0> = <B-1, p1, p0>,
   * with <p1, p0> + <d1, d0> >= B^2.
   *
   * The 4/2 inverse is (B^4 - 1) / <d1, d0> = <1, v1, v0>. The
   * partial remainder after <1, v1> is
   *
   * B^4 - 1 - B <1, v1> <d1, d0> = <B-1, B-1, B-1, B-1> - <B-1, p1, p0, 0>
   *                              = <~p1, ~p0, B-1>
   */
  udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1);
  di[0] = v0;
  di[1] = v1;

#if SANITY_CHECK
  {
    mp_limb_t tp[4];
    mp_limb_t dp[2];
    dp[0] = d0;
    dp[1] = d1;
    mpn_mul_n (tp, dp, di, 2);
    ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0);
    ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX);
    ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1);
  }
#endif
}
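
/* Divide {np,nn} by the normalized two-limb divisor <d1,d0> (d1 must have
   its high bit set), using the 4/2 inverse <di1,di0> produced by
   invert_4by2.  The main loop consumes two numerator limbs per iteration
   via udiv_qr_4by2, with a final udiv_qr_3by2 step when nn is odd.  The
   low nn-2 quotient limbs are stored at qp, the two-limb remainder at rp,
   and the most significant quotient limb is returned.  */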
static mp_limb_t
mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
                   mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0)
{
  mp_limb_t qh;
  mp_size_t i;
  mp_limb_t r1, r0;

  ASSERT (nn >= 2);
  ASSERT (d1 & GMP_NUMB_HIGHBIT);

  r1 = np[nn-1];
  r0 = np[nn-2];

  qh = 0;
  if (r1 >= d1 && (r1 > d1 || r0 >= d0))
    {
#if GMP_NAIL_BITS == 0
      sub_ddmmss (r1, r0, r1, r0, d1, d0);
#else
      r0 = r0 - d0;
      r1 = r1 - d1 - (r0 >> (GMP_LIMB_BITS - 1));
      r0 &= GMP_NUMB_MASK;
#endif
      qh = 1;
    }

  for (i = nn - 2; i >= 2; i -= 2)
    {
      mp_limb_t n1, n0, q1, q0;
      n1 = np[i-1];
      n0 = np[i-2];
      udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0);
      qp[i-1] = q1;
      qp[i-2] = q0;
    }

  if (i > 0)
    {
      mp_limb_t q;
      udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1);
      qp[0] = q;
    }
  rp[1] = r1;
  rp[0] = r0;

  return qh;
}
/* Divide num {np,nn} by den {dp,2} and write the nn-2 least
   significant quotient limbs at qp and the 2-limb remainder at rp.
   Return the most significant limb of the quotient.

   Preconditions:
   1. qp must either not overlap with the input operands at all, or
      qp >= np + 2 must hold true. (This means that it's possible to put
      the quotient in the high part of {np,nn}, right above the remainder.)
   2. nn >= 2. */
mp_limb_t
mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn,
              mp_srcptr dp)
{
  mp_limb_t d1;
  mp_limb_t d0;
  gmp_pi1_t dinv;

  ASSERT (nn >= 2);
  ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2);
  ASSERT_MPN (np, nn);
  ASSERT_MPN (dp, 2);

  d1 = dp[1]; d0 = dp[0];

  ASSERT (d1 > 0);

  if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT))
    {
      if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD))
        {
          gmp_pi1_t dinv;
          invert_pi1 (dinv, d1, d0);
          return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32);
        }
      else
        {
          mp_limb_t di[2];
          invert_4by2 (di, d1, d0);
          return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]);
        }
    }
  else
    {
      int shift;
      count_leading_zeros (shift, d1);
      d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift));
      d0 <<= shift;
      invert_pi1 (dinv, d1, d0);
      return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32);
    }
}
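
#if 0
/* Usage sketch for mpn_div_qr_2, kept out of the build.  The function name
   and limb values are illustrative only; any 4-limb numerator and 2-limb
   divisor with dp[1] != 0 would do. */
static void
div_qr_2_example (void)
{
  mp_limb_t np[4] = { 5, 6, 7, 8 };   /* numerator, least significant limb first */
  mp_limb_t dp[2] = { 3, 4 };         /* divisor; dp[1] must be nonzero */
  mp_limb_t qp[2], rp[2], qh;

  qh = mpn_div_qr_2 (qp, rp, np, 4, dp);
  /* Now {np,4} = <qh, qp[1], qp[0]> * {dp,2} + {rp,2}, with {rp,2} < {dp,2}. */
}
#endif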