source/libs/gmp/gmp-src/mpn/generic/perfsqr.c

   1 /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
   2    zero otherwise.
   3
   4 Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software
   5 Foundation, Inc.
   6
   7 This file is part of the GNU MP Library.
   8
   9 The GNU MP Library is free software; you can redistribute it and/or modify
  10 it under the terms of either:
  11
  12   * the GNU Lesser General Public License as published by the Free
  13     Software Foundation; either version 3 of the License, or (at your
  14     option) any later version.
  15
  16 or
  17
  18   * the GNU General Public License as published by the Free Software
  19     Foundation; either version 2 of the License, or (at your option) any
  20     later version.
  21
  22 or both in parallel, as here.
  23
  24 The GNU MP Library is distributed in the hope that it will be useful, but
  25 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 for more details.
  28
  29 You should have received copies of the GNU General Public License and the
  30 GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 see https://www.gnu.org/licenses/.  */
  32
  33 #include <stdio.h> /* for NULL */
  34 #include "gmp.h"
  35 #include "gmp-impl.h"
  36 #include "longlong.h"
  37
  38 #include "perfsqr.h"
  39
  40
  41 /* change this to "#define TRACE(x) x" for diagnostics */
  42 #define TRACE(x)
  43
  44
  45
  46 /* PERFSQR_MOD_* detects non-squares using residue tests.
  47
  48    A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h.  It takes
  49    {up,usize} modulo a selected modulus to get a remainder r.  For 32-bit or
  50    64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
  51    or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
  52    used.  PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
  53    case too.
  54
  55    PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
  56    PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
  57    data indicating residues and non-residues modulo those divisors.  The
  58    table data is in 1 or 2 limbs worth of bits respectively, per the size of
  59    each d.
  60
  61    A "modexact" style remainder is taken to reduce r modulo d.
  62    PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
  63    the table data.  Notice there's just one multiplication by a constant
  64    "inv", for each d.
  65
  66    The modexact doesn't produce a true r%d remainder, instead idx satisfies
  67    "-(idx<<PERFSQR_MOD_BITS) == r mod d".  Because d is odd, this factor
  68    -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
  69    accounted for by having the table data suitably permuted.
  70
  71    The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
  72    In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
  73    each divisor d meaning the modexact multiply can take place entirely
  74    within one limb, giving the compiler the chance to optimize it, in a way
  75    that say umul_ppmm would not give.
  76
  77    There's no need for the divisors d to be prime, in fact gen-psqr.c makes
  78    a deliberate effort to combine factors so as to reduce the number of
  79    separate tests done on r.  But such combining is limited to d <=
  80    2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
  81
  82    Alternatives:
  83
  84    It'd be possible to use bigger divisors d, and more than 2 limbs of table
  85    data, but this doesn't look like it would be of much help to the prime
  86    factors in the usual moduli 2^24-1 or 2^48-1.
  87
  88    The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
  89    just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
  90    factors.  2^32-1 and 2^64-1 would be equally easy to calculate, but have
  91    fewer prime factors.
  92
  93    The nails case usually ends up using mpn_mod_1, which is a lot slower
  94    than mpn_mod_34lsub1.  Perhaps other such special moduli could be found
  95    for the nails case.  Two-term things like 2^30-2^15-1 might be
  96    candidates.  Or at worst some on-the-fly de-nailing would allow the plain
  97    2^24-1 to be used.  Currently nails are too preliminary to be worried
  98    about.
  99
 100 */
 101
 102 #define PERFSQR_MOD_MASK       ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)
 103
 104 #define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
 105 #define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)
 106
 107 #define PERFSQR_MOD_34(r, up, usize)                            \
 108   do {                                                          \
 109     (r) = mpn_mod_34lsub1 (up, usize);                          \
 110     (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS);             \
 111   } while (0)
 112
 113 /* FIXME: The %= here isn't good, and might destroy any savings from keeping
 114    the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
 115    Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
 116    and a shift count, like mpn_preinv_divrem_1.  But mod_34lsub1 is our
 117    normal case, so lets not worry too much about mod_1.  */
 118 #define PERFSQR_MOD_PP(r, up, usize)                                    \
 119   do {                                                                  \
 120     if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD))       \
 121       {                                                                 \
 122         (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM,             \
 123                                 PERFSQR_PP_INVERTED);                   \
 124         (r) %= PERFSQR_PP;                                              \
 125       }                                                                 \
 126     else                                                                \
 127       {                                                                 \
 128         (r) = mpn_mod_1 (up, usize, PERFSQR_PP);                        \
 129       }                                                                 \
 130   } while (0)
 131
 132 #define PERFSQR_MOD_IDX(idx, r, d, inv)                         \
 133   do {                                                          \
 134     mp_limb_t  q;                                               \
 135     ASSERT ((r) <= PERFSQR_MOD_MASK);                           \
 136     ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);           \
 137     ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);           \
 138                                                                 \
 139     q = ((r) * (inv)) & PERFSQR_MOD_MASK;                       \
 140     ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK));               \
 141     (idx) = (q * (d)) >> PERFSQR_MOD_BITS;                      \
 142   } while (0)
 143
 144 #define PERFSQR_MOD_1(r, d, inv, mask)                          \
 145   do {                                                          \
 146     unsigned   idx;                                             \
 147     ASSERT ((d) <= GMP_LIMB_BITS);                              \
 148     PERFSQR_MOD_IDX(idx, r, d, inv);                            \
 149     TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n",       \
 150                    d, r%d, idx));                               \
 151     if ((((mask) >> idx) & 1) == 0)                             \
 152       {                                                         \
 153         TRACE (printf ("  non-square\n"));                      \
 154         return 0;                                               \
 155       }                                                         \
 156   } while (0)
 157
 158 /* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
 159    sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch.  */
 160 #define PERFSQR_MOD_2(r, d, inv, mhi, mlo)                      \
 161   do {                                                          \
 162     mp_limb_t  m;                                               \
 163     unsigned   idx;                                             \
 164     ASSERT ((d) <= 2*GMP_LIMB_BITS);                            \
 165                                                                 \
 166     PERFSQR_MOD_IDX (idx, r, d, inv);                           \
 167     TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n",       \
 168                    d, r%d, idx));                               \
 169     m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));        \
 170     idx %= GMP_LIMB_BITS;                                       \
 171     if (((m >> idx) & 1) == 0)                                  \
 172       {                                                         \
 173         TRACE (printf ("  non-square\n"));                      \
 174         return 0;                                               \
 175       }                                                         \
 176   } while (0)
 177
 178
 179 int
 180 mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
 181 {
 182   ASSERT (usize >= 1);
 183
 184   TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
 185
 186   /* The first test excludes 212/256 (82.8%) of the perfect square candidates
 187      in O(1) time.  */
 188   {
 189     unsigned  idx = up[0] % 0x100;
 190     if (((sq_res_0x100[idx / GMP_LIMB_BITS]
 191           >> (idx % GMP_LIMB_BITS)) & 1) == 0)
 192       return 0;
 193   }
 194
 195 #if 0
 196   /* Check that we have even multiplicity of 2, and then check that the rest is
 197      a possible perfect square.  Leave disabled until we can determine this
 198      really is an improvement.  It it is, it could completely replace the
 199      simple probe above, since this should throw out more non-squares, but at
 200      the expense of somewhat more cycles.  */
 201   {
 202     mp_limb_t lo;
 203     int cnt;
 204     lo = up[0];
 205     while (lo == 0)
 206       up++, lo = up[0], usize--;
 207     count_trailing_zeros (cnt, lo);
 208     if ((cnt & 1) != 0)
 209       return 0;                 /* return of not even multiplicity of 2 */
 210     lo >>= cnt;                 /* shift down to align lowest non-zero bit */
 211     lo >>= 1;                   /* shift away lowest non-zero bit */
 212     if ((lo & 3) != 0)
 213       return 0;
 214   }
 215 #endif
 216
 217
 218   /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
 219      according to their residues modulo small primes (or powers of
 220      primes).  See perfsqr.h.  */
 221   PERFSQR_MOD_TEST (up, usize);
 222
 223
 224   /* For the third and last test, we finally compute the square root,
 225      to make sure we've really got a perfect square.  */
 226   {
 227     mp_ptr root_ptr;
 228     int res;
 229     TMP_DECL;
 230
 231     TMP_MARK;
 232     root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
 233
 234     /* Iff mpn_sqrtrem returns zero, the square is perfect.  */
 235     res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
 236     TMP_FREE;
 237
 238     return res;
 239   }
 240 }