luatex.git: source/libs/gmp/gmp-src/mpn/generic/div_qr_1n_pi2.c

/* mpn_div_qr_1n_pi2.

   THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2013 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of either:

     * the GNU Lesser General Public License as published by the Free
       Software Foundation; either version 3 of the License, or (at your
       option) any later version.

   or

     * the GNU General Public License as published by the Free Software
       Foundation; either version 2 of the License, or (at your option) any
       later version.

   or both in parallel, as here.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received copies of the GNU General Public License and the
   GNU Lesser General Public License along with the GNU MP Library.  If not,
   see https://www.gnu.org/licenses/.  */

/* ISSUES:

   * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv?

   * Are there any problems with generating n quotient limbs in the q area?  It
     surely simplifies things.

   * Not yet adequately tested.
*/

#include "gmp.h"
#include "gmp-impl.h"
#include "longlong.h"

/* Define some longlong.h-style macros, but for wider operations.
   * add_sssaaaa is like longlong.h's add_ssaaaa but propagating
     carry-out into an additional sum operand.
*/
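
/* Concretely (cf. the generic fallback definition below):
   add_sssaaaa (s2,s1,s0, a1,a0, b1,b0) sets {s1,s0} to the two-limb sum
   {a1,a0} + {b1,b0} and adds that sum's carry-out into s2, i.e.
   {s2,s1,s0} <- {s2,0,0} + {0,a1,a0} + {0,b1,b0}.  */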

#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM)

#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((USItype)(s2)),                                      \
             "1"  ((USItype)(a1)), "g" ((USItype)(b1)),                 \
             "%2" ((USItype)(a0)), "g" ((USItype)(b0)))
#endif

#if defined (__amd64__) && W_TYPE_SIZE == 64
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0"               \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "0"  ((UDItype)(s2)),                                      \
             "1"  ((UDItype)(a1)), "rme" ((UDItype)(b1)),               \
             "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0)))
#endif

#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB)
/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a
   processor running in 32-bit mode, since the carry flag then gets the 32-bit
   carry.  */
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%0"        \
           : "=r" (s2), "=&r" (s1), "=&r" (s0)                          \
           : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0))
#endif

#endif /* __GNUC__ */

#ifndef add_sssaaaa
#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0)                         \
  do {                                                                  \
    UWtype __s0, __s1, __c0, __c1;                                      \
    __s0 = (a0) + (b0);                                                 \
    __s1 = (a1) + (b1);                                                 \
    __c0 = __s0 < (a0);                                                 \
    __c1 = __s1 < (a1);                                                 \
    (s0) = __s0;                                                        \
    __s1 = __s1 + __c0;                                                 \
    (s1) = __s1;                                                        \
    (s2) += __c1 + (__s1 < __c0);                                       \
  } while (0)
#endif
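
/* Precomputed data for dividing by the single limb d, which must be
   normalized (most significant bit set).  dip[] is a two-limb ("pi2")
   approximate inverse of d; its high limb dip[1] is used as the
   single-limb inverse for udiv_qrnnd_preinv (see ISSUES above).
   norm_cnt is not used in this file; it presumably records the
   normalization shift for the unnormalized-divisor entry points.  */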

struct precomp_div_1_pi2
{
  mp_limb_t dip[2];
  mp_limb_t d;
  int norm_cnt;
};
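
/* Divide the un-limb number {up,un} by pd->d, writing un quotient limbs
   to {qp,un} and returning the remainder.  Requires un >= 2 and a
   normalized pd->d, per the ASSERTs below.  */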

mp_limb_t
mpn_div_qr_1n_pi2 (mp_ptr qp,
                   mp_srcptr up, mp_size_t un,
                   struct precomp_div_1_pi2 *pd)
{
  mp_limb_t most_significant_q_limb;
  mp_size_t i;
  mp_limb_t r, u2, u1, u0;
  mp_limb_t d0, di1, di0;
  mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d;
  mp_limb_t cnd;

  ASSERT (un >= 2);
  ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0);
  ASSERT (! MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up);
  ASSERT_MPN (up, un);

#define q3 q3a
#define q2 q2b
#define q1 q1b

  up += un - 3;
  r = up[2];
  d0 = pd->d;

  most_significant_q_limb = (r >= d0);
  r -= d0 & -most_significant_q_limb;

  qp += un - 3;
  qp[2] = most_significant_q_limb;

  di1 = pd->dip[1];
  di0 = pd->dip[0];
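
  /* Main loop: each iteration divides the three-limb number {r,u1,u0}
     (with r < d0) by d0, consuming two dividend limbs and producing two
     quotient limbs {q3,q2} and a new remainder r.  */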

  for (i = un - 3; i >= 0; i -= 2)
    {
      u2 = r;
      u1 = up[1];
      u0 = up[0];

      /* Dividend in {r,u1,u0} */
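
      /* Accumulate {u2,u1} times the two-limb inverse {di1,di0}, plus the
         dividend limbs themselves, into the quotient estimate: the middle
         limbs collect in {q2b,q1b} and the carries in r, which still holds
         u2 and is finally folded into q3.  Only the high half of u1*di0
         is used.  */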
      umul_ppmm (q1d,q0d, u1, di0);
      umul_ppmm (q2b,q1b, u1, di1);
      q2b++;				/* cannot spill: high (u1 * di1) <= B-2 */
      add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0);

      umul_ppmm (q2c,q1c, u2, di0);
      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c);
      umul_ppmm (q3a,q2a, u2, di1);

      add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d);

      q3 += r;

      r = u0 - q2 * d0;
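
      /* Adjust the estimate: if r >= q1, add d0 back and decrement {q3,q2}
         (the estimate was one too large); if r is then still >= d0,
         subtract d0 and increment {q3,q2}.  */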

      cnd = (r >= q1);
      r += d0 & -cnd;
      sub_ddmmss (q3,q2, q3,q2, 0,cnd);

      if (UNLIKELY (r >= d0))
        {
          r -= d0;
          add_ssaaaa (q3,q2, q3,q2, 0,1);
        }

      qp[0] = q2;
      qp[1] = q3;

      up -= 2;
      qp -= 2;
    }
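
  /* For even un one low dividend limb remains; a single 2/1 division step
     with the high inverse limb produces the final quotient limb.  */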

  if ((un & 1) == 0)
    {
      u2 = r;
      u1 = up[1];

      udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1);
      qp[1] = q3;
    }

  return r;

#undef q3
#undef q2
#undef q1
}