/* mpn_div_qr_1n_pi1

   Contributed to the GNU project by Niels Möller

   THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2013 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.  */

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

#if GMP_NAIL_BITS > 0
#error Nail bits not supported
#endif

#ifndef DIV_QR_1N_METHOD
#define DIV_QR_1N_METHOD 2
#endif
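
/* Two implementations are selected by DIV_QR_1N_METHOD: method 1 is a plain
   udiv_qrnnd_preinv loop, method 2 (the default) restructures the per-limb
   work, presumably to shorten the dependency chain between iterations.  */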

/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */
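
/* All variants below compute the double-limb sum (s1,s0) = (a1,a0) + (b1,b0)
   and set m to the negated carry out: 0 if the addition did not wrap, -1
   (all ones) if it did.  The C fallback at the end of this block spells out
   the same operation.  */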

#if defined (__GNUC__) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add  %6, %k2\n\t"                                         \
             "adc  %4, %k1\n\t"                                         \
             "sbb  %k0, %k0"                                            \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add  %6, %q2\n\t"                                         \
             "adc  %4, %q1\n\t"                                         \
             "sbb  %q0, %q0"                                            \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc  %r5, %6, %2\n\t"                                   \
             "addxcc %r3, %4, %1\n\t"                                   \
             "subx   %%g0, %%g0, %0"                                    \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)                \
             __CLOBBER_CC)
#endif

#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc  %r5, %6, %2\n\t"                                   \
             "addccc %r7, %8, %%g0\n\t"                                 \
             "addccc %r3, %4, %1\n\t"                                   \
             "clr    %0\n\t"                                            \
             "movcs  %%xcc, -1, %0"                                     \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),               \
             "rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
             __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_mssaaaa
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "addcc   %r5, %6, %2\n\t"                                  \
             "addxccc %r3, %4, %1\n\t"                                  \
             "clr     %0\n\t"                                           \
             "movcs   %%xcc, -1, %0"                                    \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl)                \
             __CLOBBER_CC)
#endif
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "add%I6c %2, %5, %6\n\t"                                   \
             "adde    %1, %3, %4\n\t"                                   \
             "subfe   %0, %0, %0\n\t"                                   \
             "nor     %0, %0, %0"                                       \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#if defined (__s390x__) && W_TYPE_SIZE == 64
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  __asm__ (  "algr  %2, %6\n\t"                                         \
             "alcgr %1, %4\n\t"                                         \
             "lghi  %0, 0\n\t"                                          \
             "alcgr %0, %0\n\t"                                         \
             "lcgr  %0, %0"                                             \
           : "=r" (m), "=r" (s1), "=&r" (s0)                            \
           : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)),                  \
             "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC)
#endif

#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
#define add_mssaaaa(m, sh, sl, ah, al, bh, bl)                          \
  __asm__ (  "adds  %2, %5, %6\n\t"                                     \
             "adcs  %1, %3, %4\n\t"                                     \
             "movcc %0, #0\n\t"                                         \
             "movcs %0, #-1"                                            \
           : "=r" (m), "=r" (sh), "=&r" (sl)                            \
           : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#endif
#endif /* defined (__GNUC__) */

#ifndef add_mssaaaa
#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0)                          \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (m) = - (__c1 + (__s1 < __c0));                                     \
  } while (0)
#endif

#if DIV_QR_1N_METHOD == 1

/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}.
   Requires that uh < d.  */
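/* Here dinv is the single-limb reciprocal expected by udiv_qrnnd_preinv,
   i.e. floor((B^2-1)/d) - B for a normalized d, typically computed with
   invert_limb.  */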
mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh,
                   mp_limb_t d, mp_limb_t dinv)
{
  ASSERT (n > 0);
  ASSERT (uh < d);
  ASSERT (d & GMP_NUMB_HIGHBIT);
  ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n));

  do
    {
      mp_limb_t q, ul;

      ul = up[--n];
      udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv);
      qp[n] = q;
    }
  while (n > 0);

  return uh;
}

#elif DIV_QR_1N_METHOD == 2

mp_limb_t
mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
                   mp_limb_t d, mp_limb_t dinv)
{
  mp_limb_t B2;
  mp_limb_t u0, u2;
  mp_limb_t q0, q1;
  mp_limb_t p0, p1;
  mp_limb_t t;
  mp_size_t j;

  ASSERT (d & GMP_LIMB_HIGHBIT);
  ASSERT (n > 0);
  ASSERT (u1 < d);

  if (n == 1)
    {
      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
      return u1;
    }

  /* FIXME: Could be precomputed */
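  /* Note: B2 is congruent to B^2 modulo d (with 1 <= B2 <= d); it lets a
     carry out of the two-limb window, worth B^2, be folded back in as a
     single limb.  */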
  B2 = -d*dinv;

  umul_ppmm (q1, q0, dinv, u1);
  umul_ppmm (p1, p0, B2, u1);
  q1 += u1;
  ASSERT (q1 >= u1);
  u0 = up[n-1];  /* Early read, to allow qp == up. */
  qp[n-1] = q1;

  add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0);
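
  /* From here on, u2 is a carry mask produced by add_mssaaaa: zero, or all
     ones when the two-limb sum wrapped.  A set mask stands for an extra B^2
     in the partial remainder, folded back in on the next iteration via
     u2 & B2 and matched by the "1 | v" term in the quotient update.  */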

  /* FIXME: Keep q1 in a variable between iterations, to reduce number
     of memory accesses.  */
  for (j = n-2; j-- > 0; )
    {
      mp_limb_t q2, cy;

      /* Additions for the q update:
       *           +-------+
       *           |u1 * v |
       *           +---+---+
       *           | u1|
       *       +---+---+
       *       | 1 | v |   (conditional on u2)
       *       +---+---+
       *           | 1 |   (conditional on u0 + u2 B2 carry)
       *           +---+
       * +         | q0|
       *      -+---+---+---+
       *       | q2| q1| q0|
       *       +---+---+---+
       */
      umul_ppmm (p1, t, u1, dinv);
      add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1);
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0);
      q0 = t;

      umul_ppmm (p1, p0, u1, B2);
      ADDC_LIMB (cy, u0, u0, u2 & B2);
      u0 -= (-cy) & d;

      /* Final q update */
      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), cy);
      qp[j+1] = q1;
      MPN_INCR_U (qp+j+2, n-j-2, q2);

      add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0);
    }
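
  /* Final reduction: a set u2 mask, and then u1 >= d, each cost one more
     subtraction of d from u1, with a matching increment of the quotient
     word q1.  */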

  q1 = (u2 > 0);
  u1 -= (-q1) & d;

  t = (u1 >= d);
  q1 += t;
  u1 -= (-t) & d;

  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);

  MPN_INCR_U (qp+1, n-1, q1);

  qp[0] = q0;
  return u0;
}

#else
#error Unknown DIV_QR_1N_METHOD
#endif
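
#if 0
/* Illustrative usage sketch, not part of GMP: divide the n-limb number
   {up, n} by a single normalized limb d, writing the quotient to {qp, n}
   and returning the remainder.  The reciprocal is obtained with invert_limb,
   which is the form udiv_qrnnd_preinv (and hence this file) expects.  The
   function name is hypothetical.  */
static mp_limb_t
example_div_qr_1n (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t d)
{
  mp_limb_t dinv;
  ASSERT (d & GMP_NUMB_HIGHBIT);  /* d must be normalized */
  invert_limb (dinv, d);
  /* uh = 0 trivially satisfies the uh < d requirement. */
  return mpn_div_qr_1n_pi1 (qp, up, n, CNST_LIMB(0), d, dinv);
}
#endif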