/* mpn_div_qr_1u_pi2.

   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2013 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of either:

  * the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your
    option) any later version.

or

  * the GNU General Public License as published by the Free Software
    Foundation; either version 2 of the License, or (at your option) any
    later version.

or both in parallel, as here.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received copies of the GNU General Public License and the
GNU Lesser General Public License along with the GNU MP Library.  If not,
see https://www.gnu.org/licenses/.  */
/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?  It
     surely simplifies things.

   * Not yet adequately tested.
*/
#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"
/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
     carry-out into an additional sum operand.
*/
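/* I.e., add_sssaaaa forms the two-limb sum {a1,a0} + {b1,b0} in {s1,s0} and
   adds the carry-out into s2: {s2,s1,s0} <- {s2,a1,a0} + {0,b1,b0}.  */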
#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((USItype)(s2)),                                      \
             "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif
#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((UDItype)(s2)),                                      \
             "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif
#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"        \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */
#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) += __c1 + (__s1 < __c0);                                       \
  } while (0)
#endif
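/* Precomputed constants for dividing by the single limb d, which must be
   unnormalized (high bit clear): norm_cnt is the shift that normalizes d,
   and dip[] holds the two pre-inverse limbs (di1, di0) used below against
   the shifted divisor d << norm_cnt.  */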
struct precomp_div_1_pi2
{
  mp_limb_t dip[2];
  mp_limb_t d;
  int norm_cnt;
};
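/* Divide the un-limb number {up, un} by the single limb pd->d, using the
   precomputed data in *pd.  Writes un quotient limbs at {qp, un} and
   returns the remainder.  */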
mp_limb_t
mpn_div_qr_1u_pi2 (mp_ptr qp,
                   mp_srcptr up, mp_size_t un,
                   struct precomp_div_1_pi2 *pd)
{
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;
  int cnt;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);
#define q3 q3a
#define q2 q2b
#define q1 q1b
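  /* Point up at the third-highest dividend limb and qp at the slot for the
     two highest quotient limbs; shift the divisor left by norm_cnt and start
     the partial remainder r from the top bits of the dividend.  */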
  up += un - 3;
  cnt = pd->norm_cnt;
  r = up[2] >> (GMP_NUMB_BITS - cnt);
  d0 = pd->d << cnt;

  qp += un - 2;

  di1 = pd->dip[1];
  di0 = pd->dip[0];
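  /* Main loop: each pass consumes two dividend limbs and produces two
     quotient limbs, carrying the partial remainder in r.  */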
  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt));

      /* Dividend in {r,u1,u0} */
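      /* Multiply the two high dividend limbs by the two inverse limbs
         (di1, di0) and accumulate: q3 and q2 become the candidate quotient
         limbs, q1 a fraction limb used for the correction below.  */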
      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;                        /* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2, di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;
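      /* Compute the candidate remainder from the low dividend limb, then
         apply at most two conditional corrections so that 0 <= r < d0,
         adjusting the quotient {q3,q2} to match.  */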
      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2, q3,q2, 0,cnd);

      if (UNLIKELY (r >= d0))
        {
          r -= d0;
          add_ssaaaa (q3,q2, q3,q2, 0,1);
        }

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }
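  /* At most two dividend limbs remain.  For odd un only one shifted limb is
     left, and a single 2/1 division with the high inverse limb finishes the
     job; for even un the last two limbs get one more round of the loop body.  */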
  if ((un & 1) != 0)
    {
      u2 = r;
      u1 = (up[2] << cnt);

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }
  else
    {
      u2 = r;
      u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt));
      u0 = (up[1] << cnt);

      /* Dividend in {r,u1,u0} */

      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;                        /* cannot spill */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2, di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      r = u0 - q2 * d0;

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2, q3,q2, 0,cnd);

      if (UNLIKELY (r >= d0))
        {
          r -= d0;
          add_ssaaaa (q3,q2, q3,q2, 0,1);
        }

      qp[0] = q2;
      qp[1] = q3;
    }
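  /* Undo the normalization shift to recover the remainder of the original
     (unshifted) dividend.  */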
  return r >> cnt;

#undef q3
#undef q2
#undef q1
}