soft-fp/op-2.h

   1 /* Software floating-point emulation.
   2    Basic two-word fraction declaration and manipulation.
   3    Copyright (C) 1997-2023 Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    In addition to the permissions in the GNU Lesser General Public
  12    License, the Free Software Foundation gives you unlimited
  13    permission to link the compiled version of this file into
  14    combinations with other programs, and to distribute those
  15    combinations without any restriction coming from the use of this
  16    file.  (The Lesser General Public License restrictions do apply in
  17    other respects; for example, they cover modification of the file,
  18    and distribution when not linked into a combine executable.)
  19
  20    The GNU C Library is distributed in the hope that it will be useful,
  21    but WITHOUT ANY WARRANTY; without even the implied warranty of
  22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  23    Lesser General Public License for more details.
  24
  25    You should have received a copy of the GNU Lesser General Public
  26    License along with the GNU C Library; if not, see
  27    <https://www.gnu.org/licenses/>.  */
  28
  29 #ifndef SOFT_FP_OP_2_H
  30 #define SOFT_FP_OP_2_H  1
  31
  /* Two-word fraction accessors.  A fraction named X lives in two
     _FP_W_TYPE variables: X_f0 (the word returned by _FP_FRAC_LOW_2)
     and X_f1 (the word returned by _FP_FRAC_HIGH_2).  */

  /* Declare both words of X.  Each declarator is followed by
     _FP_ZERO_INIT, which is defined elsewhere (presumably an optional
     "= 0" initializer -- confirm).  */
  32 #define _FP_FRAC_DECL_2(X)                              \
  33   _FP_W_TYPE X##_f0 _FP_ZERO_INIT, X##_f1 _FP_ZERO_INIT
  /* Copy both words of S into D.  The expression's value is the new
     D_f1.  */
  34 #define _FP_FRAC_COPY_2(D, S)   (D##_f0 = S##_f0, D##_f1 = S##_f1)
  /* Set X from I.  __FP_FRAC_SET_2 takes three parameters, so I has to
     expand to two comma-separated words (high word first), such as one
     of the _FP_*FRAC_2 constants.  */
  35 #define _FP_FRAC_SET_2(X, I)    __FP_FRAC_SET_2 (X, I)
  /* The high word of X (an lvalue).  */
  36 #define _FP_FRAC_HIGH_2(X)      (X##_f1)
  /* The low word of X (an lvalue).  */
  37 #define _FP_FRAC_LOW_2(X)       (X##_f0)
  /* Word number w of X; w is pasted onto "X_f", so it must be the
     literal token 0 or 1, not an expression.  */
  38 #define _FP_FRAC_WORD_2(X, w)   (X##_f##w)
  39
  /* Shift the two-word fraction X left by N bits; bits shifted out of
     X_f1 are discarded.  The whole macro is a void expression.

     For N below _FP_W_TYPE_SIZE the top N bits of X_f0 move into the
     bottom of X_f1.  When N is a compile-time constant equal to 1 this
     is done by doubling both words and adding the top bit of X_f0
     (tested as the sign of its _FP_WS_TYPE view) as the carry.
     For N of _FP_W_TYPE_SIZE or more, X_f1 receives X_f0 shifted by the
     excess and X_f0 becomes zero.

     N is evaluated several times.  The general path shifts X_f0 right
     by _FP_W_TYPE_SIZE - N, so N == 0 is presumably not a supported
     argument -- confirm against callers.
     NOTE(review): the two cases are arms of a conditional expression
     rather than an if/else; presumably this keeps compilers from
     diagnosing the out-of-range shift in the arm that is not selected
     for a constant N -- confirm before restructuring.  */
  40 #define _FP_FRAC_SLL_2(X, N)                                            \
  41   (void) (((N) < _FP_W_TYPE_SIZE)                                       \
  42           ? ({                                                          \
  43               if (__builtin_constant_p (N) && (N) == 1)                 \
  44                 {                                                       \
  45                   X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE) (X##_f0)) < 0); \
  46                   X##_f0 += X##_f0;                                     \
  47                 }                                                       \
  48               else                                                      \
  49                 {                                                       \
  50                   X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
  51                   X##_f0 <<= (N);                                       \
  52                 }                                                       \
  53               0;                                                        \
  54             })                                                          \
  55           : ({                                                          \
  56               X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);               \
  57               X##_f0 = 0;                                               \
  58             }))
  59
  60
  /* Shift the two-word fraction X right by N bits, filling with zeros;
     bits shifted out of X_f0 are discarded.  The whole macro is a void
     expression.

     For N below _FP_W_TYPE_SIZE the bottom N bits of X_f1 move into the
     top of X_f0.  For N of _FP_W_TYPE_SIZE or more, X_f0 receives X_f1
     shifted by the excess and X_f1 becomes zero.

     N is evaluated several times.  The first arm shifts X_f1 left by
     _FP_W_TYPE_SIZE - N, so N == 0 is presumably not a supported
     argument -- confirm against callers.
     NOTE(review): kept as a conditional expression rather than an
     if/else; presumably so that the arm not selected for a constant N
     is not diagnosed for its out-of-range shift -- confirm before
     restructuring.  */
  61 #define _FP_FRAC_SRL_2(X, N)                                            \
  62   (void) (((N) < _FP_W_TYPE_SIZE)                                       \
  63           ? ({                                                          \
  64               X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N)); \
  65               X##_f1 >>= (N);                                           \
  66             })                                                          \
  67           : ({                                                          \
  68               X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);               \
  69               X##_f1 = 0;                                               \
  70             }))
  71
  72 /* Right shift with sticky-lsb.  */
  /* Shift X right by N bits and report in S whether any nonzero bit was
     shifted out.  X itself does not receive the sticky bit here.

     N below _FP_W_TYPE_SIZE: the lost bits are the low N bits of X_f0.
     S becomes X_f0 & 1 when N is the compile-time constant 1, otherwise
     the 0/1 result of testing those bits.
     N of _FP_W_TYPE_SIZE or more: the lost bits are all of X_f0 plus the
     low N - _FP_W_TYPE_SIZE bits of X_f1.  N == _FP_W_TYPE_SIZE is
     singled out so that X_f1 is never shifted by a full word.

     The sz parameter is not referenced by this macro.  N is evaluated
     several times; N == 0 would shift by a full word in the first arm
     and is presumably not a supported argument -- confirm.  */
  73 #define _FP_FRAC_SRST_2(X, S, N, sz)                                    \
  74   (void) (((N) < _FP_W_TYPE_SIZE)                                       \
  75           ? ({                                                          \
  76               S = (__builtin_constant_p (N) && (N) == 1                 \
  77                    ? X##_f0 & 1                                         \
  78                    : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0);         \
  79               X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N)); \
  80               X##_f1 >>= (N);                                           \
  81             })                                                          \
  82           : ({                                                          \
  83               S = ((((N) == _FP_W_TYPE_SIZE                             \
  84                      ? 0                                                \
  85                      : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))           \
  86                     | X##_f0) != 0);                                    \
  87               X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE));             \
  88               X##_f1 = 0;                                               \
  89             }))
  90
  /* Shift X right by N bits and OR a sticky flag into the least
     significant bit of the result: the flag is 1 when any bit shifted
     out of X was nonzero.

     N below _FP_W_TYPE_SIZE: the lost bits are the low N bits of X_f0
     (just X_f0 & 1 when N is the compile-time constant 1).
     N of _FP_W_TYPE_SIZE or more: the lost bits are all of X_f0 plus the
     low N - _FP_W_TYPE_SIZE bits of X_f1; N == _FP_W_TYPE_SIZE is
     singled out so that X_f1 is never shifted by a full word.  X_f1
     ends up zero in this case.

     The sz parameter is not referenced by this macro.  N is evaluated
     several times; N == 0 would shift by a full word in the first arm
     and is presumably not a supported argument -- confirm.  */
  91 #define _FP_FRAC_SRS_2(X, N, sz)                                        \
  92   (void) (((N) < _FP_W_TYPE_SIZE)                                       \
  93           ? ({                                                          \
  94               X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) \
  95                         | (__builtin_constant_p (N) && (N) == 1         \
  96                            ? X##_f0 & 1                                 \
  97                            : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0)); \
  98               X##_f1 >>= (N);                                           \
  99             })                                                          \
 100           : ({                                                          \
 101               X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE)               \
 102                         | ((((N) == _FP_W_TYPE_SIZE                     \
 103                              ? 0                                        \
 104                              : (X##_f1 << (2*_FP_W_TYPE_SIZE - (N))))   \
 105                             | X##_f0) != 0));                           \
 106               X##_f1 = 0;                                               \
 107             }))
 108
 /* Two-word arithmetic on named fractions.  Each macro only splits its
    fraction arguments into (high, low) word pairs and forwards them to
    the __FP_FRAC_*_2 primitive of the same name defined further down in
    this file.  */

 /* X += I, where I is a single word.  */
 109 #define _FP_FRAC_ADDI_2(X, I)   \
 110   __FP_FRAC_ADDI_2 (X##_f1, X##_f0, I)
 111
 /* R = X + Y.  */
 112 #define _FP_FRAC_ADD_2(R, X, Y) \
 113   __FP_FRAC_ADD_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
 114
 /* R = X - Y.  */
 115 #define _FP_FRAC_SUB_2(R, X, Y) \
 116   __FP_FRAC_SUB_2 (R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
 117
 /* X -= Y.  */
 118 #define _FP_FRAC_DEC_2(X, Y)    \
 119   __FP_FRAC_DEC_2 (X##_f1, X##_f0, Y##_f1, Y##_f0)
 120
 /* Store in R the result of __FP_CLZ applied to the two-word fraction X
    as a whole: when the high word is zero the count is taken on the low
    word and _FP_W_TYPE_SIZE is added for the empty high word; otherwise
    the count is taken on the high word alone.  __FP_CLZ is defined
    elsewhere (presumably a single-word count-leading-zeros).  */
 #define _FP_FRAC_CLZ_2(R, X)                    \
   do                                            \
     {                                           \
       if (X##_f1 == 0)                          \
         {                                       \
           __FP_CLZ ((R), X##_f0);               \
           (R) += _FP_W_TYPE_SIZE;               \
         }                                       \
       else                                      \
         __FP_CLZ ((R), X##_f1);                 \
     }                                           \
   while (0)
 133
 /* Predicates.  */

 /* True when the high word of X, viewed as _FP_WS_TYPE, is negative.  */
 #define _FP_FRAC_NEGP_2(X)      (0 > (_FP_WS_TYPE) X##_f1)

 /* True when both words of X are zero.  */
 #define _FP_FRAC_ZEROP_2(X)     (!(X##_f0 | X##_f1))

 /* The bits of format FS's overflow mask that are set in the high word
    of X; a masked value, not a 0/1 flag.  */
 #define _FP_FRAC_OVERP_2(fs, X) \
   (_FP_OVERFLOW_##fs & _FP_FRAC_HIGH_##fs (X))

 /* Clear format FS's overflow mask bits in the high word of X.  */
 #define _FP_FRAC_CLEAR_OVERP_2(fs, X)   \
   (_FP_FRAC_HIGH_##fs (X) &= ~_FP_OVERFLOW_##fs)

 /* The bits of format FS's double-width high-bit mask that are set in
    the double-width high word of X; a masked value, not a 0/1 flag.  */
 #define _FP_FRAC_HIGHBIT_DW_2(fs, X)    \
   (_FP_HIGHBIT_DW_##fs & _FP_FRAC_HIGH_DW_##fs (X))

 /* Two-word comparisons of X against Y: the high words decide unless
    they are equal, in which case the low words decide.  */
 #define _FP_FRAC_EQ_2(X, Y)     (X##_f0 == Y##_f0 && X##_f1 == Y##_f1)
 #define _FP_FRAC_GT_2(X, Y)     \
   (X##_f1 != Y##_f1 ? X##_f1 > Y##_f1 : X##_f0 > Y##_f0)
 #define _FP_FRAC_GE_2(X, Y)     \
   (X##_f1 != Y##_f1 ? X##_f1 > Y##_f1 : X##_f0 >= Y##_f0)
 146
 /* Constant two-word fractions, written as a comma-separated pair with
    the high word first.  They are token lists, not expressions, and fit
    the last two parameters (I1, I0) of __FP_FRAC_SET_2 when passed as
    the I argument of _FP_FRAC_SET_2.  */
 /* All bits clear.  */
 147 #define _FP_ZEROFRAC_2          0, 0
 /* Only the lowest bit of the low word set.  */
 148 #define _FP_MINFRAC_2           0, 1
 /* All bits set in both words (bitwise complement of zero).  */
 149 #define _FP_MAXFRAC_2           (~(_FP_WS_TYPE) 0), (~(_FP_WS_TYPE) 0)
 150
 151 /* Internals.  */
 152
 /* Store I0 in the low word and I1 in the high word of X.  The low word
    is assigned first and the expression's value is the new X_f1.  */
 153 #define __FP_FRAC_SET_2(X, I1, I0)      (X##_f0 = I0, X##_f1 = I1)
 154
 /* Store in R the result of __FP_CLZ for the double-word value with
    high word XH and low word XL: when XH is zero the count is taken on
    XL and _FP_W_TYPE_SIZE is added; otherwise the count is taken on XH
    alone.  XH is evaluated twice on the nonzero path.  __FP_CLZ is
    defined elsewhere (presumably a single-word count-leading-zeros).  */
 #define __FP_CLZ_2(R, xh, xl)                   \
   do                                            \
     {                                           \
       if (!(xh))                                \
         {                                       \
           __FP_CLZ ((R), xl);                   \
           (R) += _FP_W_TYPE_SIZE;               \
         }                                       \
       else                                      \
         __FP_CLZ ((R), xh);                     \
     }                                           \
   while (0)
 167
 /* Double-word add/subtract primitives.  Arguments are words given as
    (high, low) pairs.

    The plain C versions in the first branch are compiled out by the
    "#if 0", so the "#else" branch is the one in effect.
    NOTE(review): in the disabled __FP_FRAC_ADD_2 and __FP_FRAC_SUB_2
    the carry/borrow test reads xl after rl has been assigned inside the
    same expression, so they are not usable when rl and xl name the same
    object; the disabled __FP_FRAC_DEC_2 avoids this by saving xl in a
    temporary first.  The disabled code also uses UWtype where the rest
    of this file uses _FP_W_TYPE.  */
 168 #if 0
 169
 170 # ifndef __FP_FRAC_ADDI_2
 171 #  define __FP_FRAC_ADDI_2(xh, xl, i)   \
 172   (xh += ((xl += i) < i))
 173 # endif
 174 # ifndef __FP_FRAC_ADD_2
 175 #  define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl)       \
 176   (rh = xh + yh + ((rl = xl + yl) < xl))
 177 # endif
 178 # ifndef __FP_FRAC_SUB_2
 179 #  define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl)       \
 180   (rh = xh - yh - ((rl = xl - yl) > xl))
 181 # endif
 182 # ifndef __FP_FRAC_DEC_2
 183 #  define __FP_FRAC_DEC_2(xh, xl, yh, yl)               \
 184   do                                                    \
 185     {                                                   \
 186       UWtype __FP_FRAC_DEC_2_t = xl;                    \
 187       xh -= yh + ((xl -= yl) > __FP_FRAC_DEC_2_t);      \
 188     }                                                   \
 189   while (0)
 190 # endif
 191
 192 #else
 193
 /* Active definitions: any earlier definition is removed and the
    operations are mapped onto add_ssaaaa and sub_ddmmss, which are not
    defined in this file (presumably the longlong.h double-word
    primitives taking result pair, first operand pair, second operand
    pair -- confirm).  ADDI adds the single word I by passing 0 as the
    high word of the second operand; DEC subtracts in place by passing
    the x pair as both result and first operand.  */
 194 # undef __FP_FRAC_ADDI_2
 195 # define __FP_FRAC_ADDI_2(xh, xl, i)    add_ssaaaa (xh, xl, xh, xl, 0, i)
 196 # undef __FP_FRAC_ADD_2
 197 # define __FP_FRAC_ADD_2                add_ssaaaa
 198 # undef __FP_FRAC_SUB_2
 199 # define __FP_FRAC_SUB_2                sub_ddmmss
 200 # undef __FP_FRAC_DEC_2
 201 # define __FP_FRAC_DEC_2(xh, xl, yh, yl)        \
 202   sub_ddmmss (xh, xl, xh, xl, yh, yl)
 203
 204 #endif
 205
 /* Take apart the native floating-point value VAL of format FS: copy it
    into a union _FP_UNION_<fs> and read the sign, exponent and both
    fraction words from the union's bit-fields into X_s, X_e, X_f1 and
    X_f0.  The bits are transferred as they are; nothing is classified
    or normalized.  */
 #define _FP_UNPACK_RAW_2(fs, X, val)                    \
   do                                                    \
     {                                                   \
       union _FP_UNION_##fs _FP_UNPACK_RAW_2_flo;        \
                                                         \
       _FP_UNPACK_RAW_2_flo.flt = (val);                 \
       X##_s  = _FP_UNPACK_RAW_2_flo.bits.sign;          \
       X##_e  = _FP_UNPACK_RAW_2_flo.bits.exp;           \
       X##_f1 = _FP_UNPACK_RAW_2_flo.bits.frac1;         \
       X##_f0 = _FP_UNPACK_RAW_2_flo.bits.frac0;         \
     }                                                   \
   while (0)
 221
 /* Pointer variant of _FP_UNPACK_RAW_2: VAL is a pointer, which is cast
    to union _FP_UNION_<fs> * and read in place.  The sign, exponent and
    both fraction words are taken from the union's bit-fields into X_s,
    X_e, X_f1 and X_f0 without classification or normalization.  */
 #define _FP_UNPACK_RAW_2_P(fs, X, val)                  \
   do                                                    \
     {                                                   \
       union _FP_UNION_##fs *_FP_UNPACK_RAW_2_P_flo      \
         = (union _FP_UNION_##fs *) (val);               \
                                                         \
       X##_s  = _FP_UNPACK_RAW_2_P_flo->bits.sign;       \
       X##_e  = _FP_UNPACK_RAW_2_P_flo->bits.exp;        \
       X##_f1 = _FP_UNPACK_RAW_2_P_flo->bits.frac1;      \
       X##_f0 = _FP_UNPACK_RAW_2_P_flo->bits.frac0;      \
     }                                                   \
   while (0)
 234
 235
 /* Reassemble a native floating-point value of format FS: store X_s,
    X_e, X_f1 and X_f0 into the sign, exponent and fraction bit-fields
    of a union _FP_UNION_<fs>, then assign the union's floating-point
    member to VAL.  The fields are stored as they are; no rounding or
    range checking is done here.  */
 #define _FP_PACK_RAW_2(fs, val, X)              \
   do                                            \
     {                                           \
       union _FP_UNION_##fs _FP_PACK_RAW_2_flo;  \
                                                 \
       _FP_PACK_RAW_2_flo.bits.sign  = X##_s;    \
       _FP_PACK_RAW_2_flo.bits.exp   = X##_e;    \
       _FP_PACK_RAW_2_flo.bits.frac1 = X##_f1;   \
       _FP_PACK_RAW_2_flo.bits.frac0 = X##_f0;   \
                                                 \
       (val) = _FP_PACK_RAW_2_flo.flt;           \
     }                                           \
   while (0)
 251
 /* Pointer variant of _FP_PACK_RAW_2: VAL is a pointer, which is cast
    to union _FP_UNION_<fs> * and written in place.  X_s, X_e, X_f1 and
    X_f0 are stored into the sign, exponent and fraction bit-fields of
    the pointed-to object as they are.  */
 #define _FP_PACK_RAW_2_P(fs, val, X)                    \
   do                                                    \
     {                                                   \
       union _FP_UNION_##fs *_FP_PACK_RAW_2_P_flo        \
         = (union _FP_UNION_##fs *) (val);               \
                                                         \
       _FP_PACK_RAW_2_P_flo->bits.sign  = X##_s;         \
       _FP_PACK_RAW_2_P_flo->bits.exp   = X##_e;         \
       _FP_PACK_RAW_2_P_flo->bits.frac1 = X##_f1;        \
       _FP_PACK_RAW_2_P_flo->bits.frac0 = X##_f0;        \
     }                                                   \
   while (0)
 264
 265
 266 /* Multiplication algorithms: */
 267
 268 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
 /* Compute the full four-word product of the two-word fractions X and Y
    into R, whose words are reached through _FP_FRAC_WORD_4.

    DOIT is invoked as doit (high, low, a, b) and must leave the
    two-word product of the single words a and b in (high, low).  Four
    such products are formed: X_f0*Y_f0 goes straight into words 1:0 of
    R and X_f1*Y_f1 into words 3:2, while the cross products X_f0*Y_f1
    and X_f1*Y_f0 are kept in temporaries and then each added into
    words 3..1 of R with __FP_FRAC_ADD_3 (defined elsewhere).

    The wfracbits parameter is not referenced by this macro.  */
 269
 270 #define _FP_MUL_MEAT_DW_2_wide(wfracbits, R, X, Y, doit)                \
 271   do                                                                    \
 272     {                                                                   \
 273       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_b);                       \
 274       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_c);                       \
 275                                                                         \
 276       doit (_FP_FRAC_WORD_4 (R, 1), _FP_FRAC_WORD_4 (R, 0),             \
 277             X##_f0, Y##_f0);                                            \
 278       doit (_FP_MUL_MEAT_DW_2_wide_b_f1, _FP_MUL_MEAT_DW_2_wide_b_f0,   \
 279             X##_f0, Y##_f1);                                            \
 280       doit (_FP_MUL_MEAT_DW_2_wide_c_f1, _FP_MUL_MEAT_DW_2_wide_c_f0,   \
 281             X##_f1, Y##_f0);                                            \
 282       doit (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),             \
 283             X##_f1, Y##_f1);                                            \
 284                                                                         \
 285       __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),  \
 286                        _FP_FRAC_WORD_4 (R, 1), 0,                       \
 287                        _FP_MUL_MEAT_DW_2_wide_b_f1,                     \
 288                        _FP_MUL_MEAT_DW_2_wide_b_f0,                     \
 289                        _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),  \
 290                        _FP_FRAC_WORD_4 (R, 1));                         \
 291       __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),  \
 292                        _FP_FRAC_WORD_4 (R, 1), 0,                       \
 293                        _FP_MUL_MEAT_DW_2_wide_c_f1,                     \
 294                        _FP_MUL_MEAT_DW_2_wide_c_f0,                     \
 295                        _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),  \
 296                        _FP_FRAC_WORD_4 (R, 1));                         \
 297     }                                                                   \
 298   while (0)
 299
 /* Multiply the two-word fractions X and Y into the two-word fraction R.
    The full product is formed in a four-word temporary by
    _FP_MUL_MEAT_DW_2_wide, shifted right by wfracbits - 1 with
    _FP_FRAC_SRS_4 (defined elsewhere; presumably the four-word sticky
    right shift corresponding to _FP_FRAC_SRS_2), and its two low words
    are then copied to R.  DOIT is passed through unchanged.  */
 300 #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)                   \
 301   do                                                                    \
 302     {                                                                   \
 303       _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_z);                          \
 304                                                                         \
 305       _FP_MUL_MEAT_DW_2_wide ((wfracbits), _FP_MUL_MEAT_2_wide_z,       \
 306                               X, Y, doit);                              \
 307                                                                         \
 308       /* Normalize since we know where the msb of the multiplicands     \
 309          were (bit B), we know that the msb of the product is           \
 310          at either 2B or 2B-1.  */                                      \
 311       _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_z, (wfracbits)-1,             \
 312                       2*(wfracbits));                                   \
 313       R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 0);              \
 314       R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_z, 1);              \
 315     }                                                                   \
 316   while (0)
 317
 318 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.
 319    Do only 3 multiplications instead of four. This one is for machines
 320    where multiplication is much more expensive than subtraction.  */
 /* Compute the four-word product of the two-word fractions X and Y into
    R (words reached through _FP_FRAC_WORD_4) with three invocations of
    DOIT, which is called as doit (high, low, a, b).

    The word sums X_f0 + X_f1 and Y_f0 + Y_f1 are formed first, with
    their carries-out recorded in ..._c1 and ..._c2.  DOIT then gives
    X_f0*Y_f0 (high part in ..._d, low part in word 0 of R), the product
    of the two sums (into words 2:1 of R) and X_f1*Y_f1 (in ..._c).
    After the multiplications each sum is masked so that it survives
    only when the other sum carried, and the remaining steps correct
    for the carries, subtract the low and high products from the middle
    term and add the high product at words 3:2.  The order of these
    carry-propagating steps matters.

    The wfracbits parameter is not referenced by this macro.
    __FP_FRAC_ADD_3 and __FP_FRAC_DEC_3 are defined elsewhere.  */
 321
 322 #define _FP_MUL_MEAT_DW_2_wide_3mul(wfracbits, R, X, Y, doit)           \
 323   do                                                                    \
 324     {                                                                   \
 325       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_b);                  \
 326       _FP_FRAC_DECL_2 (_FP_MUL_MEAT_DW_2_wide_3mul_c);                  \
 327       _FP_W_TYPE _FP_MUL_MEAT_DW_2_wide_3mul_d;                         \
 328       int _FP_MUL_MEAT_DW_2_wide_3mul_c1;                               \
 329       int _FP_MUL_MEAT_DW_2_wide_3mul_c2;                               \
 330                                                                         \
 331       _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 = X##_f0 + X##_f1;               \
 332       _FP_MUL_MEAT_DW_2_wide_3mul_c1                                    \
 333         = _FP_MUL_MEAT_DW_2_wide_3mul_b_f0 < X##_f0;                    \
 334       _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 = Y##_f0 + Y##_f1;               \
 335       _FP_MUL_MEAT_DW_2_wide_3mul_c2                                    \
 336         = _FP_MUL_MEAT_DW_2_wide_3mul_b_f1 < Y##_f0;                    \
 337       doit (_FP_MUL_MEAT_DW_2_wide_3mul_d, _FP_FRAC_WORD_4 (R, 0),      \
 338             X##_f0, Y##_f0);                                            \
 339       doit (_FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1),             \
 340             _FP_MUL_MEAT_DW_2_wide_3mul_b_f0,                           \
 341             _FP_MUL_MEAT_DW_2_wide_3mul_b_f1);                          \
 342       doit (_FP_MUL_MEAT_DW_2_wide_3mul_c_f1,                           \
 343             _FP_MUL_MEAT_DW_2_wide_3mul_c_f0, X##_f1, Y##_f1);          \
 344                                                                         \
 345       _FP_MUL_MEAT_DW_2_wide_3mul_b_f0                                  \
 346         &= -_FP_MUL_MEAT_DW_2_wide_3mul_c2;                             \
 347       _FP_MUL_MEAT_DW_2_wide_3mul_b_f1                                  \
 348         &= -_FP_MUL_MEAT_DW_2_wide_3mul_c1;                             \
 349       __FP_FRAC_ADD_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),  \
 350                        _FP_FRAC_WORD_4 (R, 1),                          \
 351                        (_FP_MUL_MEAT_DW_2_wide_3mul_c1                  \
 352                         & _FP_MUL_MEAT_DW_2_wide_3mul_c2), 0,           \
 353                        _FP_MUL_MEAT_DW_2_wide_3mul_d,                   \
 354                        0, _FP_FRAC_WORD_4 (R, 2), _FP_FRAC_WORD_4 (R, 1)); \
 355       __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
 356                         _FP_MUL_MEAT_DW_2_wide_3mul_b_f0);              \
 357       __FP_FRAC_ADDI_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2), \
 358                         _FP_MUL_MEAT_DW_2_wide_3mul_b_f1);              \
 359       __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),  \
 360                        _FP_FRAC_WORD_4 (R, 1),                          \
 361                        0, _FP_MUL_MEAT_DW_2_wide_3mul_d,                \
 362                        _FP_FRAC_WORD_4 (R, 0));                         \
 363       __FP_FRAC_DEC_3 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),  \
 364                        _FP_FRAC_WORD_4 (R, 1), 0,                       \
 365                        _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,                \
 366                        _FP_MUL_MEAT_DW_2_wide_3mul_c_f0);               \
 367       __FP_FRAC_ADD_2 (_FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2),  \
 368                        _FP_MUL_MEAT_DW_2_wide_3mul_c_f1,                \
 369                        _FP_MUL_MEAT_DW_2_wide_3mul_c_f0,                \
 370                        _FP_FRAC_WORD_4 (R, 3), _FP_FRAC_WORD_4 (R, 2)); \
 371     }                                                                   \
 372   while (0)
 373
 /* Multiply the two-word fractions X and Y into the two-word fraction R
    using the three-multiplication scheme.  The full product is formed
    in a four-word temporary by _FP_MUL_MEAT_DW_2_wide_3mul, shifted
    right by wfracbits - 1 with _FP_FRAC_SRS_4 (defined elsewhere;
    presumably the four-word sticky right shift corresponding to
    _FP_FRAC_SRS_2), and its two low words are then copied to R.  DOIT
    is passed through unchanged.  */
 374 #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)              \
 375   do                                                                    \
 376     {                                                                   \
 377       _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_wide_3mul_z);                     \
 378                                                                         \
 379       _FP_MUL_MEAT_DW_2_wide_3mul ((wfracbits),                         \
 380                                    _FP_MUL_MEAT_2_wide_3mul_z,          \
 381                                    X, Y, doit);                         \
 382                                                                         \
 383       /* Normalize since we know where the msb of the multiplicands     \
 384          were (bit B), we know that the msb of the product is           \
 385          at either 2B or 2B-1.  */                                      \
 386       _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_wide_3mul_z,                       \
 387                       (wfracbits)-1, 2*(wfracbits));                    \
 388       R##_f0 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 0);         \
 389       R##_f1 = _FP_FRAC_WORD_4 (_FP_MUL_MEAT_2_wide_3mul_z, 1);         \
 390     }                                                                   \
 391   while (0)
 392
 /* Compute the full product of the two-word fractions X and Y by
    calling mpn_mul_n on two-element word arrays.  X and Y are first
    copied into local arrays, low word at index 0, and the result is
    written through R_f, so R must be a fraction whose words are stored
    in an array named R_f (presumably four words, as declared by
    _FP_FRAC_DECL_4 -- confirm).  mpn_mul_n is not declared in this
    file.  The wfracbits parameter is not referenced by this macro.  */
 393 #define _FP_MUL_MEAT_DW_2_gmp(wfracbits, R, X, Y)       \
 394   do                                                    \
 395     {                                                   \
 396       _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_x[2];            \
 397       _FP_W_TYPE _FP_MUL_MEAT_DW_2_gmp_y[2];            \
 398       _FP_MUL_MEAT_DW_2_gmp_x[0] = X##_f0;              \
 399       _FP_MUL_MEAT_DW_2_gmp_x[1] = X##_f1;              \
 400       _FP_MUL_MEAT_DW_2_gmp_y[0] = Y##_f0;              \
 401       _FP_MUL_MEAT_DW_2_gmp_y[1] = Y##_f1;              \
 402                                                         \
 403       mpn_mul_n (R##_f, _FP_MUL_MEAT_DW_2_gmp_x,        \
 404                  _FP_MUL_MEAT_DW_2_gmp_y, 2);           \
 405     }                                                   \
 406   while (0)
 407
 /* Multiply the two-word fractions X and Y into the two-word fraction R
    via mpn_mul_n.  The full product is formed in a four-word temporary
    by _FP_MUL_MEAT_DW_2_gmp, shifted right by wfracbits - 1 with
    _FP_FRAC_SRS_4 (defined elsewhere; presumably the four-word sticky
    right shift corresponding to _FP_FRAC_SRS_2), and elements 0 and 1
    of the temporary's word array are then copied to R_f0 and R_f1.
    The temporary is indexed directly as an array here, so this macro
    relies on _FP_FRAC_DECL_4 declaring an array named <name>_f.  */
 408 #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)                          \
 409   do                                                                    \
 410     {                                                                   \
 411       _FP_FRAC_DECL_4 (_FP_MUL_MEAT_2_gmp_z);                           \
 412                                                                         \
 413       _FP_MUL_MEAT_DW_2_gmp ((wfracbits), _FP_MUL_MEAT_2_gmp_z, X, Y);  \
 414                                                                         \
 415       /* Normalize since we know where the msb of the multiplicands     \
 416          were (bit B), we know that the msb of the product is           \
 417          at either 2B or 2B-1.  */                                      \
 418       _FP_FRAC_SRS_4 (_FP_MUL_MEAT_2_gmp_z, (wfracbits)-1,              \
 419                       2*(wfracbits));                                   \
 420       R##_f0 = _FP_MUL_MEAT_2_gmp_z_f[0];                               \
 421       R##_f1 = _FP_MUL_MEAT_2_gmp_z_f[1];                               \
 422     }                                                                   \
 423   while (0)
 424
 /* Do at most 120x120=240 bits multiplication using double floating
    point multiplication.  This is useful if floating point
    multiplication has much bigger throughput than integer multiply.
    It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
    between 106 and 120 only; that range is enforced with
    _FP_STATIC_ASSERT.
    The caller guarantees that X and Y have bit (wfracbits - 1) set.
    SETFETZ is a statement which will disable all FPU exceptions and
    set rounding towards zero; RESETFE should optionally reset it back.

    Each operand is cut into 24-bit pieces (the top piece takes what is
    left of the high word), converted to double and scaled by powers of
    2^-24.  The piece products are summed by column, the column sums
    are accumulated from the least significant upwards, and the result
    words are assembled from the integer value of the top sum and the
    low 24 bits of the bit patterns of the next four sums after a
    per-column constant has been added to each.  The least significant
    bit of R_f0 additionally gets a sticky bit that is set when any of
    the four lowest column sums is nonzero after its rounding step.

    WFRACBITS is parenthesized at every use, so an argument written as
    an unparenthesized expression produces the same shift counts as its
    value would.  */
 #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe) \
   do                                                                    \
     {                                                                   \
       static const double _const[] =                                    \
         {                                                               \
           /* 2^-24 */ 5.9604644775390625e-08,                           \
           /* 2^-48 */ 3.5527136788005009e-15,                           \
           /* 2^-72 */ 2.1175823681357508e-22,                           \
           /* 2^-96 */ 1.2621774483536189e-29,                           \
           /* 2^28 */ 2.68435456e+08,                                    \
           /* 2^4 */ 1.600000e+01,                                       \
           /* 2^-20 */ 9.5367431640625e-07,                              \
           /* 2^-44 */ 5.6843418860808015e-14,                           \
           /* 2^-68 */ 3.3881317890172014e-21,                           \
           /* 2^-92 */ 2.0194839173657902e-28,                           \
           /* 2^-116 */ 1.2037062152420224e-35                           \
         };                                                              \
       double _a240, _b240, _c240, _d240, _e240, _f240,                  \
         _g240, _h240, _i240, _j240, _k240;                              \
       union { double d; UDItype i; } _l240, _m240, _n240, _o240,        \
                                        _p240, _q240, _r240, _s240;      \
       UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;             \
                                                                         \
       _FP_STATIC_ASSERT ((wfracbits) >= 106 && (wfracbits) <= 120,      \
                          "wfracbits out of range");                     \
                                                                         \
       setfetz;                                                          \
                                                                         \
       /* 24-bit pieces of X (a..e) and Y (f..j), most significant       \
          first.  */                                                     \
       _e240 = (double) (long) (X##_f0 & 0xffffff);                      \
       _j240 = (double) (long) (Y##_f0 & 0xffffff);                      \
       _d240 = (double) (long) ((X##_f0 >> 24) & 0xffffff);              \
       _i240 = (double) (long) ((Y##_f0 >> 24) & 0xffffff);              \
       _c240 = (double) (long) (((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48)); \
       _h240 = (double) (long) (((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48)); \
       _b240 = (double) (long) ((X##_f1 >> 8) & 0xffffff);               \
       _g240 = (double) (long) ((Y##_f1 >> 8) & 0xffffff);               \
       _a240 = (double) (long) (X##_f1 >> 32);                           \
       _f240 = (double) (long) (Y##_f1 >> 32);                           \
       /* Scale each piece by its weight relative to the top piece.  */  \
       _e240 *= _const[3];                                               \
       _j240 *= _const[3];                                               \
       _d240 *= _const[2];                                               \
       _i240 *= _const[2];                                               \
       _c240 *= _const[1];                                               \
       _h240 *= _const[1];                                               \
       _b240 *= _const[0];                                               \
       _g240 *= _const[0];                                               \
       /* Column sums of the piece products, least significant (s) to    \
          most significant (k).  */                                      \
       _s240.d =                                                       _e240*_j240; \
       _r240.d =                                         _d240*_j240 + _e240*_i240; \
       _q240.d =                           _c240*_j240 + _d240*_i240 + _e240*_h240; \
       _p240.d =             _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240; \
       _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240; \
       _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;  \
       _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;                \
       _l240.d = _a240*_g240 + _b240*_f240;                              \
       _k240 =   _a240*_f240;                                            \
       /* Accumulate every column into the next more significant one.  */ \
       _r240.d += _s240.d;                                               \
       _q240.d += _r240.d;                                               \
       _p240.d += _q240.d;                                               \
       _o240.d += _p240.d;                                               \
       _n240.d += _o240.d;                                               \
       _m240.d += _n240.d;                                               \
       _l240.d += _m240.d;                                               \
       _k240 += _l240.d;                                                 \
       _s240.d -= ((_const[10]+_s240.d)-_const[10]);                     \
       _r240.d -= ((_const[9]+_r240.d)-_const[9]);                       \
       _q240.d -= ((_const[8]+_q240.d)-_const[8]);                       \
       _p240.d -= ((_const[7]+_p240.d)-_const[7]);                       \
       _o240.d += _const[7];                                             \
       _n240.d += _const[6];                                             \
       _m240.d += _const[5];                                             \
       _l240.d += _const[4];                                             \
       /* Sticky bit: anything left in the four lowest columns.  */      \
       if (_s240.d != 0.0)                                               \
         _y240 = 1;                                                      \
       if (_r240.d != 0.0)                                               \
         _y240 = 1;                                                      \
       if (_q240.d != 0.0)                                               \
         _y240 = 1;                                                      \
       if (_p240.d != 0.0)                                               \
         _y240 = 1;                                                      \
       _t240 = (DItype) _k240;                                           \
       _u240 = _l240.i;                                                  \
       _v240 = _m240.i;                                                  \
       _w240 = _n240.i;                                                  \
       _x240 = _o240.i;                                                  \
       R##_f1 = ((_t240 << (128 - ((wfracbits) - 1)))                    \
                 | ((_u240 & 0xffffff) >> (((wfracbits) - 1) - 104)));   \
       R##_f0 = (((_u240 & 0xffffff) << (168 - ((wfracbits) - 1)))       \
                 | ((_v240 & 0xffffff) << (144 - ((wfracbits) - 1)))     \
                 | ((_w240 & 0xffffff) << (120 - ((wfracbits) - 1)))     \
                 | ((_x240 & 0xffffff) >> (((wfracbits) - 1) - 96))      \
                 | _y240);                                               \
       resetfe;                                                          \
     }                                                                   \
   while (0)
 528
 529 /* Division algorithms: */
 530
/* Divide the two-word fraction X by the two-word fraction Y with the
   udiv_qrnnd and umul_ppmm word primitives, leaving the quotient in
   R##_f1 (high word) and R##_f0 (low word).

   fs  format suffix; only used to select _FP_WFRACXBITS_##fs, the
       shift count applied to Y.
   R   result; R##_f1 and R##_f0 are written, and R##_e is
       decremented when X < Y.
   X   numerator fraction; X##_f1 and X##_f0 are only read.
   Y   denominator fraction; MODIFIED in place by the left shift.

   The quotient is produced one word at a time.  Each word is first
   estimated by dividing by Y##_f1 alone and is then corrected
   downwards, at most twice, for the contribution of Y##_f0.
   udiv_qrnnd, umul_ppmm and the _FP_FRAC_*_2 helpers are defined
   outside this macro; they are presumably the longlong.h primitives
   and the two-word helpers of this file -- confirm there.  */
#define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)                                \
  do                                                                    \
    {                                                                   \
      /* Three-word numerator n, two-word running remainder r and       \
         two-word product m (current quotient word times Y##_f0).  */   \
      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f2;                              \
      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f1;                              \
      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_n_f0;                              \
      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f1;                              \
      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_r_f0;                              \
      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f1;                              \
      _FP_W_TYPE _FP_DIV_MEAT_2_udiv_m_f0;                              \
      if (_FP_FRAC_GE_2 (X, Y))                                         \
        {                                                               \
          /* X >= Y: n = (X##_f1 : X##_f0 : 0) >> 1.  */                \
          _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1 >> 1;                       \
          _FP_DIV_MEAT_2_udiv_n_f1                                      \
            = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;            \
          _FP_DIV_MEAT_2_udiv_n_f0                                      \
            = X##_f0 << (_FP_W_TYPE_SIZE - 1);                          \
        }                                                               \
      else                                                              \
        {                                                               \
          /* X < Y: n = (X##_f1 : X##_f0 : 0) unshifted, and the        \
             result exponent is lowered by one instead.  */             \
          R##_e--;                                                      \
          _FP_DIV_MEAT_2_udiv_n_f2 = X##_f1;                            \
          _FP_DIV_MEAT_2_udiv_n_f1 = X##_f0;                            \
          _FP_DIV_MEAT_2_udiv_n_f0 = 0;                                 \
        }                                                               \
                                                                        \
      /* Normalize, i.e. make the most significant bit of the           \
         denominator set.  */                                           \
      _FP_FRAC_SLL_2 (Y, _FP_WFRACXBITS_##fs);                          \
                                                                        \
      /* High quotient word: estimate from the top two numerator words  \
         divided by Y##_f1; r_f1 receives that division's remainder.  */\
      udiv_qrnnd (R##_f1, _FP_DIV_MEAT_2_udiv_r_f1,                     \
                  _FP_DIV_MEAT_2_udiv_n_f2, _FP_DIV_MEAT_2_udiv_n_f1,   \
                  Y##_f1);                                              \
      /* m = estimated quotient word * Y##_f0, the part of the product  \
         the estimate did not account for.  */                          \
      umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1, _FP_DIV_MEAT_2_udiv_m_f0,    \
                 R##_f1, Y##_f0);                                       \
      _FP_DIV_MEAT_2_udiv_r_f0 = _FP_DIV_MEAT_2_udiv_n_f0;              \
      /* If m exceeds r the estimate was too large: decrement the       \
         quotient word and add Y back to r, at most twice.              \
         NOTE(review): the _FP_FRAC_GE_2 (r, Y) guard presumably        \
         checks that the first add-back did not wrap around --          \
         confirm against _FP_FRAC_ADD_2.  */                            \
      if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m, _FP_DIV_MEAT_2_udiv_r)) \
        {                                                               \
          R##_f1--;                                                     \
          _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,                     \
                          _FP_DIV_MEAT_2_udiv_r);                       \
          if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)                  \
              && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,                  \
                                _FP_DIV_MEAT_2_udiv_r))                 \
            {                                                           \
              R##_f1--;                                                 \
              _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,                 \
                              _FP_DIV_MEAT_2_udiv_r);                   \
            }                                                           \
        }                                                               \
      /* r -= m: the remainder carried into the low quotient word.  */  \
      _FP_FRAC_DEC_2 (_FP_DIV_MEAT_2_udiv_r, _FP_DIV_MEAT_2_udiv_m);    \
                                                                        \
      if (_FP_DIV_MEAT_2_udiv_r_f1 == Y##_f1)                           \
        {                                                               \
          /* This is a special case, not an optimization                \
             (_FP_DIV_MEAT_2_udiv_r/Y##_f1 would not fit into UWtype).  \
             As _FP_DIV_MEAT_2_udiv_r is guaranteed to be < Y,          \
             R##_f0 can be either (UWtype)-1 or (UWtype)-2.  But as we  \
             know what kind of bits it is (sticky, guard, round),       \
             we don't care.  We also don't care what the remainder is,  \
             because the guard bit will be set anyway.  -jj */          \
          R##_f0 = -1;                                                  \
        }                                                               \
      else                                                              \
        {                                                               \
          /* Low quotient word: same estimate-and-correct scheme,       \
             applied to the remainder r with a zero word appended.  */  \
          udiv_qrnnd (R##_f0, _FP_DIV_MEAT_2_udiv_r_f1,                 \
                      _FP_DIV_MEAT_2_udiv_r_f1,                         \
                      _FP_DIV_MEAT_2_udiv_r_f0, Y##_f1);                \
          umul_ppmm (_FP_DIV_MEAT_2_udiv_m_f1,                          \
                     _FP_DIV_MEAT_2_udiv_m_f0, R##_f0, Y##_f0);         \
          _FP_DIV_MEAT_2_udiv_r_f0 = 0;                                 \
          if (_FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,                     \
                             _FP_DIV_MEAT_2_udiv_r))                    \
            {                                                           \
              R##_f0--;                                                 \
              _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,                 \
                              _FP_DIV_MEAT_2_udiv_r);                   \
              if (_FP_FRAC_GE_2 (_FP_DIV_MEAT_2_udiv_r, Y)              \
                  && _FP_FRAC_GT_2 (_FP_DIV_MEAT_2_udiv_m,              \
                                    _FP_DIV_MEAT_2_udiv_r))             \
                {                                                       \
                  R##_f0--;                                             \
                  _FP_FRAC_ADD_2 (_FP_DIV_MEAT_2_udiv_r, Y,             \
                                  _FP_DIV_MEAT_2_udiv_r);               \
                }                                                       \
            }                                                           \
          /* r != m means the division was inexact: record that in the  \
             sticky bit of the low quotient word.  */                   \
          if (!_FP_FRAC_EQ_2 (_FP_DIV_MEAT_2_udiv_r,                    \
                              _FP_DIV_MEAT_2_udiv_m))                   \
            R##_f0 |= _FP_WORK_STICKY;                                  \
        }                                                               \
    }                                                                   \
  while (0)
 623
 624
/* Square root algorithms:
   We have just one right now; maybe a Newton approximation
   should be added for those machines where division is fast.  */
 628
 629 #define _FP_SQRT_MEAT_2(R, S, T, X, q)                          \
 630   do                                                            \
 631     {                                                           \
 632       while (q)                                                 \
 633         {                                                       \
 634           T##_f1 = S##_f1 + (q);                                \
 635           if (T##_f1 <= X##_f1)                                 \
 636             {                                                   \
 637               S##_f1 = T##_f1 + (q);                            \
 638               X##_f1 -= T##_f1;                                 \
 639               R##_f1 += (q);                                    \
 640             }                                                   \
 641           _FP_FRAC_SLL_2 (X, 1);                                \
 642           (q) >>= 1;                                            \
 643         }                                                       \
 644       (q) = (_FP_W_TYPE) 1 << (_FP_W_TYPE_SIZE - 1);            \
 645       while ((q) != _FP_WORK_ROUND)                             \
 646         {                                                       \
 647           T##_f0 = S##_f0 + (q);                                \
 648           T##_f1 = S##_f1;                                      \
 649           if (T##_f1 < X##_f1                                   \
 650               || (T##_f1 == X##_f1 && T##_f0 <= X##_f0))        \
 651             {                                                   \
 652               S##_f0 = T##_f0 + (q);                            \
 653               S##_f1 += (T##_f0 > S##_f0);                      \
 654               _FP_FRAC_DEC_2 (X, T);                            \
 655               R##_f0 += (q);                                    \
 656             }                                                   \
 657           _FP_FRAC_SLL_2 (X, 1);                                \
 658           (q) >>= 1;                                            \
 659         }                                                       \
 660       if (X##_f0 | X##_f1)                                      \
 661         {                                                       \
 662           if (S##_f1 < X##_f1                                   \
 663               || (S##_f1 == X##_f1 && S##_f0 < X##_f0))         \
 664             R##_f0 |= _FP_WORK_ROUND;                           \
 665           R##_f0 |= _FP_WORK_STICKY;                            \
 666         }                                                       \
 667     }                                                           \
 668   while (0)
 669
 670
 671 /* Assembly/disassembly for converting to/from integral types.
 672    No shifting or overflow handled here.  */
 673
 674 #define _FP_FRAC_ASSEMBLE_2(r, X, rsize)        \
 675   (void) (((rsize) <= _FP_W_TYPE_SIZE)          \
 676           ? ({ (r) = X##_f0; })                 \
 677           : ({                                  \
 678               (r) = X##_f1;                     \
 679               (r) <<= _FP_W_TYPE_SIZE;          \
 680               (r) += X##_f0;                    \
 681             }))
 682
 683 #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)     \
 684   do                                            \
 685     {                                           \
 686       X##_f0 = (r);                             \
 687       X##_f1 = ((rsize) <= _FP_W_TYPE_SIZE      \
 688                 ? 0                             \
 689                 : (r) >> _FP_W_TYPE_SIZE);      \
 690     }                                           \
 691   while (0)
 692
 693 /* Convert FP values between word sizes.  */
 694
 695 #define _FP_FRAC_COPY_1_2(D, S)         (D##_f = S##_f0)
 696
 697 #define _FP_FRAC_COPY_2_1(D, S)         ((D##_f0 = S##_f), (D##_f1 = 0))
 698
 699 #define _FP_FRAC_COPY_2_2(D, S)         _FP_FRAC_COPY_2 (D, S)
 700
 701 #endif /* !SOFT_FP_OP_2_H */