3 // Copyright (C) 2000, 2001, Intel Corporation
4 // All rights reserved.
6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at
38 // http://developer.intel.com/opensource.
41 // 2/02/00 Initial Version
42 // 4/04/00 Unwind support added
43 // 8/15/00 Bundle added after call to __libm_error_support to properly
44 // set [the previously overwritten] GR_Parameter_RESULT.
46 // *********************************************************************
48 // Function: Combined exp(x) and expm1(x), where
50 // exp(x) = e , for double precision x values
52 // expm1(x) = e - 1 for double precision x values
54 // *********************************************************************
56 // Accuracy: Within .7 ulps for 80-bit floating point values
57 // Very accurate for double precision values
59 // *********************************************************************
63 // Floating-Point Registers: f8 (Input and Return Value)
64 // f9,f32-f61, f99-f102
66 // General Purpose Registers:
68 // r62-r65 (Used to pass arguments to error handling routine)
70 // Predicate Registers: p6-p15
72 // *********************************************************************
74 // IEEE Special Conditions:
76 // Denormal fault raised on denormal inputs
77 // Overflow exceptions raised when appropriate for exp and expm1
78 // Underflow exceptions raised when appropriate for exp and expm1
79 // (Error Handling Routine called for overflow and Underflow)
80 // Inexact raised when appropriate by algorithm
87 // exp(EM_special Values) = QNaN
93 // expm1(EM_special Values) = QNaN
95 // *********************************************************************
97 // Implementation and Algorithm Notes:
99 // ker_exp_64( in_FR : X,
101 // in_GR : Expo_Range
107 // On input, X is in register format and
109 // Flag = 1 for expm1,
111 // On output, provided X and X_cor are real numbers, then
113 // scale*(Y_hi + Y_lo) approximates exp(X) if Flag is 0
114 // scale*(Y_hi + Y_lo) approximates exp(X)-1 if Flag is 1
116 // The accuracy is sufficient for a highly accurate 64 sig.
117 // bit implementation. Safe is set if there is no danger of
118 // overflow/underflow when the result is composed from scale,
119 // Y_hi and Y_lo. Thus, we can have a fast return if Safe is set.
120 // Otherwise, one must prepare to handle the possible exception
121 // appropriately. Note that SAFE not set (false) does not mean
122 // that overflow/underflow will occur; only the setting of SAFE
123 // guarantees the opposite.
125 // **** High Level Overview ****
127 // The method consists of three cases.
129 // If |X| < Tiny use case exp_tiny;
130 // else if |X| < 2^(-6) use case exp_small;
131 // else use case exp_regular;
135 // 1 + X can be used to approximate exp(X) or exp(X+X_cor);
136 // X + X^2/2 can be used to approximate exp(X) - 1
140 // Here, exp(X), exp(X+X_cor), and exp(X) - 1 can all be
141 // appproximated by a relatively simple polynomial.
143 // This polynomial resembles the truncated Taylor series
145 // exp(w) = 1 + w + w^2/2! + w^3/3! + ... + w^n/n!
149 // Here we use a table lookup method. The basic idea is that in
150 // order to compute exp(X), we accurately decompose X into
152 // X = N * log(2)/(2^12) + r, |r| <= log(2)/2^13.
156 // exp(X) = 2^( N / 2^12 ) * exp(r).
158 // The value 2^( N / 2^12 ) is obtained by simple combinations
159 // of values calculated beforehand and stored in table; exp(r)
160 // is approximated by a short polynomial because |r| is small.
162 // We elaborate this method in 4 steps.
166 // The value 2^12/log(2) is stored as a double-extended number
169 // N := round_to_nearest_integer( X * L_Inv )
171 // The value log(2)/2^12 is stored as two numbers L_hi and L_lo so
172 // that r can be computed accurately via
174 // r := (X - N*L_hi) - N*L_lo
176 // We pick L_hi such that N*L_hi is representable in 64 sig. bits
177 // and thus the FMA X - N*L_hi is error free. So r is the
178 // 1 rounding error from an exact reduction with respect to
182 // In particular, L_hi has 30 significant bit and can be stored
183 // as a double-precision number; L_lo has 64 significant bits and
184 // stored as a double-extended number.
186 // In the case Flag = 2, we further modify r by
190 // Step 2: Approximation
192 // exp(r) - 1 is approximated by a short polynomial of the form
194 // r + A_1 r^2 + A_2 r^3 + A_3 r^4 .
196 // Step 3: Composition from Table Values
198 // The value 2^( N / 2^12 ) can be composed from a couple of tables
199 // of precalculated values. First, express N as three integers
200 // K, M_1, and M_2 as
202 // N = K * 2^12 + M_1 * 2^6 + M_2
204 // Where 0 <= M_1, M_2 < 2^6; and K can be positive or negative.
205 // When N is represented in 2's complement, M_2 is simply the 6
206 // lsb's, M_1 is the next 6, and K is simply N shifted right
207 // arithmetically (sign extended) by 12 bits.
209 // Now, 2^( N / 2^12 ) is simply
211 // 2^K * 2^( M_1 / 2^6 ) * 2^( M_2 / 2^12 )
213 // Clearly, 2^K needs no tabulation. The other two values are less
214 // trivial because if we store each accurately to more than working
215 // precision, than its product is too expensive to calculate. We
216 // use the following method.
218 // Define two mathematical values, delta_1 and delta_2, implicitly
221 // T_1 = exp( [M_1 log(2)/2^6] - delta_1 )
222 // T_2 = exp( [M_2 log(2)/2^12] - delta_2 )
224 // are representable as 24 significant bits. To illustrate the idea,
225 // we show how we define delta_1:
227 // T_1 := round_to_24_bits( exp( M_1 log(2)/2^6 ) )
228 // delta_1 = (M_1 log(2)/2^6) - log( T_1 )
230 // The last equality means mathematical equality. We then tabulate
232 // W_1 := exp(delta_1) - 1
233 // W_2 := exp(delta_2) - 1
235 // Both in double precision.
237 // From the tabulated values T_1, T_2, W_1, W_2, we compose the values
240 // T := T_1 * T_2 ...exactly
241 // W := W_1 + (1 + W_1)*W_2
243 // W approximates exp( delta ) - 1 where delta = delta_1 + delta_2.
244 // The mathematical product of T and (W+1) is an accurate representation
245 // of 2^(M_1/2^6) * 2^(M_2/2^12).
247 // Step 4. Reconstruction
249 // Finally, we can reconstruct exp(X), exp(X) - 1.
252 // X = K * log(2) + (M_1*log(2)/2^6 - delta_1)
253 // + (M_2*log(2)/2^12 - delta_2)
254 // + delta_1 + delta_2 + r ...accurately
257 // exp(X) ~=~ 2^K * ( T + T*[exp(delta_1+delta_2+r) - 1] )
258 // ~=~ 2^K * ( T + T*[exp(delta + r) - 1] )
259 // ~=~ 2^K * ( T + T*[(exp(delta)-1)
260 // + exp(delta)*(exp(r)-1)] )
261 // ~=~ 2^K * ( T + T*( W + (1+W)*poly(r) ) )
262 // ~=~ 2^K * ( Y_hi + Y_lo )
264 // where Y_hi = T and Y_lo = T*(W + (1+W)*poly(r))
266 // For exp(X)-1, we have
268 // exp(X)-1 ~=~ 2^K * ( Y_hi + Y_lo ) - 1
269 // ~=~ 2^K * ( Y_hi + Y_lo - 2^(-K) )
271 // and we combine Y_hi + Y_lo - 2^(-N) into the form of two
272 // numbers Y_hi + Y_lo carefully.
274 // **** Algorithm Details ****
276 // A careful algorithm must be used to realize the mathematical ideas
277 // accurately. We describe each of the three cases. We assume SAFE
278 // is preset to be TRUE.
282 // The important points are to ensure an accurate result under
283 // different rounding directions and a correct setting of the SAFE
286 // If Flag is 1, then
287 // SAFE := False ...possibility of underflow
290 // Y_lo := 2^(-17000)
294 // Y_lo := X ...for different rounding modes
299 // Here we compute a simple polynomial. To exploit parallelism, we split
300 // the polynomial into several portions.
304 // If Flag is not 1 ...i.e. exp( argument )
308 // poly_lo := P_3 + r*(P_4 + r*(P_5 + r*P_6))
309 // poly_hi := r + rsq*(P_1 + r*P_2)
310 // Y_lo := poly_hi + r4 * poly_lo
311 // set lsb(Y_lo) to 1
315 // Else ...i.e. exp( argument ) - 1
320 // poly_lo := r6*(Q_5 + r*(Q_6 + r*Q_7))
321 // poly_hi := Q_1 + r*(Q_2 + r*(Q_3 + r*Q_4))
322 // Y_lo := rsq*poly_hi + poly_lo
323 // set lsb(Y_lo) to 1
331 // The previous description contain enough information except the
332 // computation of poly and the final Y_hi and Y_lo in the case for
335 // The computation of poly for Step 2:
338 // poly := r + rsq*(A_1 + r*(A_2 + r*A_3))
340 // For the case exp(X) - 1, we need to incorporate 2^(-K) into
341 // Y_hi and Y_lo at the end of Step 4.
344 // Y_lo := Y_lo - 2^(-K)
347 // Y_lo := Y_hi + Y_lo
350 // Y_hi := Y_hi - 2^(-K)
355 #include "libm_support.h"
363 GR_Parameter_RESULT = r64
376 Constants_exp_64_Arg:
377 ASM_TYPE_DIRECTIVE(Constants_exp_64_Arg,@object)
378 data4 0x5C17F0BC,0xB8AA3B29,0x0000400B,0x00000000
379 data4 0x00000000,0xB17217F4,0x00003FF2,0x00000000
380 data4 0xF278ECE6,0xF473DE6A,0x00003FD4,0x00000000
381 // /* Inv_L, L_hi, L_lo */
382 ASM_SIZE_DIRECTIVE(Constants_exp_64_Arg)
385 Constants_exp_64_Exponents:
386 ASM_TYPE_DIRECTIVE(Constants_exp_64_Exponents,@object)
387 data4 0x0000007E,0x00000000,0xFFFFFF83,0xFFFFFFFF
388 data4 0x000003FE,0x00000000,0xFFFFFC03,0xFFFFFFFF
389 data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
390 data4 0x00003FFE,0x00000000,0xFFFFC003,0xFFFFFFFF
391 data4 0xFFFFFFE2,0xFFFFFFFF,0xFFFFFFC4,0xFFFFFFFF
392 data4 0xFFFFFFBA,0xFFFFFFFF,0xFFFFFFBA,0xFFFFFFFF
393 ASM_SIZE_DIRECTIVE(Constants_exp_64_Exponents)
397 ASM_TYPE_DIRECTIVE(Constants_exp_64_A,@object)
398 data4 0xB1B736A0,0xAAAAAAAB,0x00003FFA,0x00000000
399 data4 0x90CD6327,0xAAAAAAAB,0x00003FFC,0x00000000
400 data4 0xFFFFFFFF,0xFFFFFFFF,0x00003FFD,0x00000000
402 ASM_SIZE_DIRECTIVE(Constants_exp_64_A)
406 ASM_TYPE_DIRECTIVE(Constants_exp_64_P,@object)
407 data4 0x43914A8A,0xD00D6C81,0x00003FF2,0x00000000
408 data4 0x30304B30,0xB60BC4AC,0x00003FF5,0x00000000
409 data4 0x7474C518,0x88888888,0x00003FF8,0x00000000
410 data4 0x8DAE729D,0xAAAAAAAA,0x00003FFA,0x00000000
411 data4 0xAAAAAF61,0xAAAAAAAA,0x00003FFC,0x00000000
412 data4 0x000004C7,0x80000000,0x00003FFE,0x00000000
414 ASM_SIZE_DIRECTIVE(Constants_exp_64_P)
418 ASM_TYPE_DIRECTIVE(Constants_exp_64_Q,@object)
419 data4 0xA49EF6CA,0xD00D56F7,0x00003FEF,0x00000000
420 data4 0x1C63493D,0xD00D59AB,0x00003FF2,0x00000000
421 data4 0xFB50CDD2,0xB60B60B5,0x00003FF5,0x00000000
422 data4 0x7BA68DC8,0x88888888,0x00003FF8,0x00000000
423 data4 0xAAAAAC8D,0xAAAAAAAA,0x00003FFA,0x00000000
424 data4 0xAAAAACCA,0xAAAAAAAA,0x00003FFC,0x00000000
425 data4 0x00000000,0x80000000,0x00003FFE,0x00000000
427 ASM_SIZE_DIRECTIVE(Constants_exp_64_Q)
431 ASM_TYPE_DIRECTIVE(Constants_exp_64_T1,@object)
432 data4 0x3F800000,0x3F8164D2,0x3F82CD87,0x3F843A29
433 data4 0x3F85AAC3,0x3F871F62,0x3F88980F,0x3F8A14D5
434 data4 0x3F8B95C2,0x3F8D1ADF,0x3F8EA43A,0x3F9031DC
435 data4 0x3F91C3D3,0x3F935A2B,0x3F94F4F0,0x3F96942D
436 data4 0x3F9837F0,0x3F99E046,0x3F9B8D3A,0x3F9D3EDA
437 data4 0x3F9EF532,0x3FA0B051,0x3FA27043,0x3FA43516
438 data4 0x3FA5FED7,0x3FA7CD94,0x3FA9A15B,0x3FAB7A3A
439 data4 0x3FAD583F,0x3FAF3B79,0x3FB123F6,0x3FB311C4
440 data4 0x3FB504F3,0x3FB6FD92,0x3FB8FBAF,0x3FBAFF5B
441 data4 0x3FBD08A4,0x3FBF179A,0x3FC12C4D,0x3FC346CD
442 data4 0x3FC5672A,0x3FC78D75,0x3FC9B9BE,0x3FCBEC15
443 data4 0x3FCE248C,0x3FD06334,0x3FD2A81E,0x3FD4F35B
444 data4 0x3FD744FD,0x3FD99D16,0x3FDBFBB8,0x3FDE60F5
445 data4 0x3FE0CCDF,0x3FE33F89,0x3FE5B907,0x3FE8396A
446 data4 0x3FEAC0C7,0x3FED4F30,0x3FEFE4BA,0x3FF28177
447 data4 0x3FF5257D,0x3FF7D0DF,0x3FFA83B3,0x3FFD3E0C
448 ASM_SIZE_DIRECTIVE(Constants_exp_64_T1)
452 ASM_TYPE_DIRECTIVE(Constants_exp_64_T2,@object)
453 data4 0x3F800000,0x3F80058C,0x3F800B18,0x3F8010A4
454 data4 0x3F801630,0x3F801BBD,0x3F80214A,0x3F8026D7
455 data4 0x3F802C64,0x3F8031F2,0x3F803780,0x3F803D0E
456 data4 0x3F80429C,0x3F80482B,0x3F804DB9,0x3F805349
457 data4 0x3F8058D8,0x3F805E67,0x3F8063F7,0x3F806987
458 data4 0x3F806F17,0x3F8074A8,0x3F807A39,0x3F807FCA
459 data4 0x3F80855B,0x3F808AEC,0x3F80907E,0x3F809610
460 data4 0x3F809BA2,0x3F80A135,0x3F80A6C7,0x3F80AC5A
461 data4 0x3F80B1ED,0x3F80B781,0x3F80BD14,0x3F80C2A8
462 data4 0x3F80C83C,0x3F80CDD1,0x3F80D365,0x3F80D8FA
463 data4 0x3F80DE8F,0x3F80E425,0x3F80E9BA,0x3F80EF50
464 data4 0x3F80F4E6,0x3F80FA7C,0x3F810013,0x3F8105AA
465 data4 0x3F810B41,0x3F8110D8,0x3F81166F,0x3F811C07
466 data4 0x3F81219F,0x3F812737,0x3F812CD0,0x3F813269
467 data4 0x3F813802,0x3F813D9B,0x3F814334,0x3F8148CE
468 data4 0x3F814E68,0x3F815402,0x3F81599C,0x3F815F37
469 ASM_SIZE_DIRECTIVE(Constants_exp_64_T2)
473 ASM_TYPE_DIRECTIVE(Constants_exp_64_W1,@object)
474 data4 0x00000000,0x00000000,0x171EC4B4,0xBE384454
475 data4 0x4AA72766,0xBE694741,0xD42518F8,0xBE5D32B6
476 data4 0x3A319149,0x3E68D96D,0x62415F36,0xBE68F4DA
477 data4 0xC9C86A3B,0xBE6DDA2F,0xF49228FE,0x3E6B2E50
478 data4 0x1188B886,0xBE49C0C2,0x1A4C2F1F,0x3E64BFC2
479 data4 0x2CB98B54,0xBE6A2FBB,0x9A55D329,0x3E5DC5DE
480 data4 0x39A7AACE,0x3E696490,0x5C66DBA5,0x3E54728B
481 data4 0xBA1C7D7D,0xBE62B0DB,0x09F1AF5F,0x3E576E04
482 data4 0x1A0DD6A1,0x3E612500,0x795FBDEF,0xBE66A419
483 data4 0xE1BD41FC,0xBE5CDE8C,0xEA54964F,0xBE621376
484 data4 0x476E76EE,0x3E6370BE,0x3427EB92,0x3E390D1A
485 data4 0x2BF82BF8,0x3E1336DE,0xD0F7BD9E,0xBE5FF1CB
486 data4 0x0CEB09DD,0xBE60A355,0x0980F30D,0xBE5CA37E
487 data4 0x4C082D25,0xBE5C541B,0x3B467D29,0xBE5BBECA
488 data4 0xB9D946C5,0xBE400D8A,0x07ED374A,0xBE5E2A08
489 data4 0x365C8B0A,0xBE66CB28,0xD3403BCA,0x3E3AAD5B
490 data4 0xC7EA21E0,0x3E526055,0xE72880D6,0xBE442C75
491 data4 0x85222A43,0x3E58B2BB,0x522C42BF,0xBE5AAB79
492 data4 0x469DC2BC,0xBE605CB4,0xA48C40DC,0xBE589FA7
493 data4 0x1AA42614,0xBE51C214,0xC37293F4,0xBE48D087
494 data4 0xA2D673E0,0x3E367A1C,0x114F7A38,0xBE51BEBB
495 data4 0x661A4B48,0xBE6348E5,0x1D3B9962,0xBDF52643
496 data4 0x35A78A53,0x3E3A3B5E,0x1CECD788,0xBE46C46C
497 data4 0x7857D689,0xBE60B7EC,0xD14F1AD7,0xBE594D3D
498 data4 0x4C9A8F60,0xBE4F9C30,0x02DFF9D2,0xBE521873
499 data4 0x55E6D68F,0xBE5E4C88,0x667F3DC4,0xBE62140F
500 data4 0x3BF88747,0xBE36961B,0xC96EC6AA,0x3E602861
501 data4 0xD57FD718,0xBE3B5151,0xFC4A627B,0x3E561CD0
502 data4 0xCA913FEA,0xBE3A5217,0x9A5D193A,0x3E40A3CC
503 data4 0x10A9C312,0xBE5AB713,0xC5F57719,0x3E4FDADB
504 data4 0xDBDF59D5,0x3E361428,0x61B4180D,0x3E5DB5DB
505 data4 0x7408D856,0xBE42AD5F,0x31B2B707,0x3E2A3148
506 ASM_SIZE_DIRECTIVE(Constants_exp_64_W1)
510 ASM_TYPE_DIRECTIVE(Constants_exp_64_W2,@object)
511 data4 0x00000000,0x00000000,0x37A3D7A2,0xBE641F25
512 data4 0xAD028C40,0xBE68DD57,0xF212B1B6,0xBE5C77D8
513 data4 0x1BA5B070,0x3E57878F,0x2ECAE6FE,0xBE55A36A
514 data4 0x569DFA3B,0xBE620608,0xA6D300A3,0xBE53B50E
515 data4 0x223F8F2C,0x3E5B5EF2,0xD6DE0DF4,0xBE56A0D9
516 data4 0xEAE28F51,0xBE64EEF3,0x367EA80B,0xBE5E5AE2
517 data4 0x5FCBC02D,0x3E47CB1A,0x9BDAFEB7,0xBE656BA0
518 data4 0x805AFEE7,0x3E6E70C6,0xA3415EBA,0xBE6E0509
519 data4 0x49BFF529,0xBE56856B,0x00508651,0x3E66DD33
520 data4 0xC114BC13,0x3E51165F,0xC453290F,0x3E53333D
521 data4 0x05539FDA,0x3E6A072B,0x7C0A7696,0xBE47CD87
522 data4 0xEB05C6D9,0xBE668BF4,0x6AE86C93,0xBE67C3E3
523 data4 0xD0B3E84B,0xBE533904,0x556B53CE,0x3E63E8D9
524 data4 0x63A98DC8,0x3E212C89,0x032A7A22,0xBE33138F
525 data4 0xBC584008,0x3E530FA9,0xCCB93C97,0xBE6ADF82
526 data4 0x8370EA39,0x3E5F9113,0xFB6A05D8,0x3E5443A4
527 data4 0x181FEE7A,0x3E63DACD,0xF0F67DEC,0xBE62B29D
528 data4 0x3DDE6307,0x3E65C483,0xD40A24C1,0x3E5BF030
529 data4 0x14E437BE,0x3E658B8F,0xED98B6C7,0xBE631C29
530 data4 0x04CF7C71,0x3E6335D2,0xE954A79D,0x3E529EED
531 data4 0xF64A2FB8,0x3E5D9257,0x854ED06C,0xBE6BED1B
532 data4 0xD71405CB,0x3E5096F6,0xACB9FDF5,0xBE3D4893
533 data4 0x01B68349,0xBDFEB158,0xC6A463B9,0x3E628D35
534 data4 0xADE45917,0xBE559725,0x042FC476,0xBE68C29C
535 data4 0x01E511FA,0xBE67593B,0x398801ED,0xBE4A4313
536 data4 0xDA7C3300,0x3E699571,0x08062A9E,0x3E5349BE
537 data4 0x755BB28E,0x3E5229C4,0x77A1F80D,0x3E67E426
538 data4 0x6B69C352,0xBE52B33F,0x084DA57F,0xBE6B3550
539 data4 0xD1D09A20,0xBE6DB03F,0x2161B2C1,0xBE60CBC4
540 data4 0x78A2B771,0x3E56ED9C,0x9D0FA795,0xBE508E31
541 data4 0xFD1A54E9,0xBE59482A,0xB07FD23E,0xBE2A17CE
542 data4 0x17365712,0x3E68BF5C,0xB3785569,0x3E3956F9
543 ASM_SIZE_DIRECTIVE(Constants_exp_64_W2)
558 alloc r32 = ar.pfs,0,30,4,0
560 (p0) cmp.eq.unc p7, p0 = r0, r0
566 // Set p7 true for expm1
567 // Set Flag = r33 = 1 for expm1
568 // These are really no longer necesary, but are a remnant
569 // when this file had multiple entry points.
570 // They should be carefully removed
576 (p0) fnorm.s1 f9 = f8
583 (p0) fclass.m.unc p6, p8 = f8, 0x1E7
589 (p0) fclass.nm.unc p9, p0 = f8, 0x1FF
600 // Identify NatVals, NaNs, Infs, and Zeros.
601 // Identify EM unsupporteds.
602 // Save special input registers
604 // Create FR_X_cor = 0.0
613 (p6) br.cond.spnt EXP_64_SPECIAL ;;
619 (p9) br.cond.spnt EXP_64_UNSUPPORTED ;;
623 // Branch out for special input values
627 (p0) cmp.ne.unc p12, p13 = 0x01, r33
628 (p0) fcmp.lt.unc.s0 p9,p0 = f8, f0
629 (p0) cmp.eq.unc p15, p0 = r0, r0
633 // Raise possible denormal operand exception
636 // This function computes exp( x + x_cor)
638 // Input FR 2: FR_X_cor
639 // Input GR 1: GR_Flag
640 // Input GR 2: GR_Expo_Range
641 // Output FR 3: FR_Y_hi
642 // Output FR 4: FR_Y_lo
643 // Output FR 5: FR_Scale
644 // Output PR 1: PR_Safe
647 // Prepare to load constants
652 (p0) addl r34 = @ltoff(Constants_exp_64_Arg#), gp
653 (p0) addl r40 = @ltoff(Constants_exp_64_W1#), gp
654 (p0) addl r41 = @ltoff(Constants_exp_64_W2#), gp
661 (p0) addl r50 = @ltoff(Constants_exp_64_T1#), gp
668 (p0) ldfe f37 = [r34],16
669 (p0) addl r51 = @ltoff(Constants_exp_64_T2#), gp
674 // N = fcvt.fx(float_N)
675 // Set p14 if -6 > expo_X
681 // expo_X = expo_X and Mask
686 // Set p10 if 14 < expo_X
691 (p0) ldfe f40 = [r34],16
698 (p0) movl r58 = 0x0FFFF
704 // Branch to SMALL is expo_X < -6
708 // float_N = X * L_Inv
709 // expo_X = exponent of X
715 (p0) ldfe f41 = [r34],16
720 (p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp
721 (p0) movl r39 = 0x1FFFF
727 (p0) getf.exp r37 = f9
735 (p0) and r37 = r37, r39 ;;
739 (p0) sub r37 = r37, r58 ;;
740 (p0) cmp.gt.unc p14, p0 = -6, r37
741 (p0) cmp.lt.unc p10, p0 = 14, r37 ;;
748 // Set p12 true for Flag = 0 (exp)
749 // Set p13 true for Flag = 1 (expm1)
751 (p0) fmpy.s1 f38 = f9, f37
759 // expo_X = expo_X - Bias
762 (p0) fcvt.fx.s1 f39 = f38
763 (p14) br.cond.spnt EXP_SMALL ;;
769 (p10) br.cond.spnt EXP_HUGE ;;
773 (p0) shladd r34 = r32,4,r34
774 (p0) addl r35 = @ltoff(Constants_exp_64_A#), gp
791 (p0) ldfe f51 = [r35],16
792 (p0) ld8 r45 = [r34],8
796 // Set Safe = True if k >= big_expo_neg
797 // Set Safe = False if k < big_expo_neg
801 (p0) ldfe f49 = [r35],16
802 (p0) ld8 r48 = [r34],0
809 // Branch to HUGE is expo_X > 14
811 (p0) fcvt.xf f38 = f39
816 (p0) getf.sig r52 = f39
823 (p0) extr.u r43 = r52, 6, 6 ;;
825 // r = r - float_N * L_lo
826 // K = extr(N_fix,12,52)
828 (p0) shladd r40 = r43,3,r40 ;;
832 (p0) shladd r50 = r43,2,r50
833 (p0) fnma.s1 f42 = f40, f38, f9
835 // float_N = float(N)
836 // N_fix = signficand N
838 (p0) extr.u r42 = r52, 0, 6
842 (p0) ldfd f43 = [r40],0 ;;
843 (p0) shladd r41 = r42,3,r41
844 (p0) shladd r51 = r42,2,r51
851 (p0) ldfs f44 = [r50],0 ;;
852 (p0) ldfd f45 = [r41],0
854 // M_2 = extr(N_fix,0,6)
855 // M_1 = extr(N_fix,6,6)
856 // r = X - float_N * L_hi
858 (p0) extr r44 = r52, 12, 52
862 (p0) ldfs f46 = [r51],0 ;;
863 (p0) sub r46 = r58, r44
864 (p0) cmp.gt.unc p8, p15 = r44, r45
867 // W = W_1 + W_1_p1*W_2
869 // Bias_m_K = Bias - K
873 (p0) ldfe f40 = [r35],16
876 // poly = A_2 + r*A_3
878 // neg_2_mK = exponent of Bias_m_k
880 (p0) add r47 = r58, r44 ;;
882 // Set Safe = True if k <= big_expo_pos
883 // Set Safe = False if k > big_expo_pos
886 (p15) cmp.lt p8,p15 = r44,r48 ;;
890 (p0) setf.exp f61 = r46
892 // Bias_p + K = Bias + K
895 (p0) setf.exp f36 = r47
896 (p0) fnma.s1 f42 = f41, f38, f42 ;;
903 // Load big_exp_pos, load big_exp_neg
905 (p0) fadd.s1 f47 = f43, f1
911 (p0) fma.s1 f52 = f42, f51, f49
917 (p0) fmpy.s1 f48 = f42, f42
923 (p0) fmpy.s1 f53 = f44, f46
929 (p0) fma.s1 f54 = f45, f47, f43
941 (p0) fma.s1 f52 = f42, f52, f40
947 (p0) fadd.s1 f55 = f54, f1
964 // Scale = setf_exp(Bias_p_k)
966 (p0) fma.s1 f52 = f48, f52, f42
973 // poly = r + rsq(A_1 + r*poly)
975 // neg_2_mK = -neg_2_mK
977 (p0) fma.s1 f35 = f55, f52, f54
983 (p0) fmpy.s1 f35 = f35, f53
986 // Y_lo = T * (W + Wp1*poly)
988 (p12) br.cond.sptk EXP_MAIN ;;
992 // Continue for exp(x-1)
996 (p0) cmp.lt.unc p12, p13 = 10, r44
999 // Set p12 if 10 < K, Else p13
1001 (p13) cmp.gt.unc p13, p14 = -10, r44 ;;
1004 // K > 10: Y_lo = Y_lo + neg_2_mK
1005 // K <=10: Set p13 if -10 > K, Else set p14
1009 (p13) cmp.eq p15, p0 = r0, r0
1010 (p14) fadd.s1 f34 = f61, f34
1016 (p12) fadd.s1 f35 = f35, f61
1022 (p13) fadd.s1 f35 = f35, f34
1029 // K <= 10 and K < -10, Set Safe = True
1030 // K <= 10 and K < 10, Y_lo = Y_hi + Y_lo
1031 // K <= 10 and K > =-10, Y_hi = Y_hi + neg_2_mk
1034 (p0) br.cond.sptk EXP_MAIN ;;
1039 (p12) addl r35 = @ltoff(Constants_exp_64_P#), gp
1040 (p0) addl r34 = @ltoff(Constants_exp_64_Exponents#), gp
1046 (p12) ld8 r35 = [r35]
1054 (p13) addl r35 = @ltoff(Constants_exp_64_Q#), gp
1063 // K <= 10 and K < 10, Y_hi = neg_2_mk
1065 // /*******************************************************/
1066 // /*********** Branch EXP_SMALL *************************/
1067 // /*******************************************************/
1070 (p13) ld8 r35 = [r35]
1072 (p0) add r34 = 0x48,r34
1082 (p0) ld8 r49 =[r34],0
1093 (p0) cmp.lt.unc p14, p0 = r37, r49 ;;
1101 (p0) fmpy.s1 f48 = f42, f42
1110 (p0) fmpy.s1 f50 = f48, f48
1112 // Is input very small?
1114 (p14) br.cond.spnt EXP_VERY_SMALL ;;
1117 // Flag_not1: Y_hi = 1.0
1118 // Flag is 1: r6 = rsq * r4
1122 (p12) ldfe f52 = [r35],16
1124 (p0) add r53 = 0x1,r0 ;;
1128 (p13) ldfe f51 = [r35],16
1130 // Flag_not_1: Y_lo = poly_hi + r4 * poly_lo
1137 (p12) ldfe f53 = [r35],16
1139 // For Flag_not_1, Y_hi = X
1141 // Create 0x000...01
1143 (p0) setf.sig f37 = r53
1144 (p0) mov f36 = f1 ;;
1148 (p13) ldfe f52 = [r35],16 ;;
1149 (p12) ldfe f54 = [r35],16
1154 (p13) ldfe f53 = [r35],16
1155 (p13) fmpy.s1 f58 = f48, f50
1159 // Flag_not1: poly_lo = P_5 + r*P_6
1160 // Flag_1: poly_lo = Q_6 + r*Q_7
1164 (p13) ldfe f54 = [r35],16 ;;
1165 (p12) ldfe f55 = [r35],16
1170 (p12) ldfe f56 = [r35],16 ;;
1171 (p13) ldfe f55 = [r35],16
1176 (p12) ldfe f57 = [r35],0 ;;
1177 (p13) ldfe f56 = [r35],16
1182 (p13) ldfe f57 = [r35],0
1190 // For Flag_not_1, load p5,p6,p1,p2
1191 // Else load p5,p6,p1,p2
1193 (p12) fma.s1 f60 = f52, f42, f53
1199 (p13) fma.s1 f60 = f51, f42, f52
1205 (p12) fma.s1 f60 = f60, f42, f54
1211 (p12) fma.s1 f59 = f56, f42, f57
1217 (p13) fma.s1 f60 = f42, f60, f53
1223 (p12) fma.s1 f59 = f59, f48, f42
1230 // Flag_1: poly_lo = Q_5 + r*(Q_6 + r*Q_7)
1231 // Flag_not1: poly_lo = P_4 + r*(P_5 + r*P_6)
1232 // Flag_not1: poly_hi = (P_1 + r*P_2)
1234 (p13) fmpy.s1 f60 = f60, f58
1240 (p12) fma.s1 f60 = f60, f42, f55
1247 // Flag_1: poly_lo = r6 *(Q_5 + ....)
1248 // Flag_not1: poly_hi = r + rsq *(P_1 + r*P_2)
1250 (p12) fma.s1 f35 = f60, f50, f59
1256 (p13) fma.s1 f59 = f54, f42, f55
1263 // Flag_not1: Y_lo = rsq* poly_hi + poly_lo
1264 // Flag_1: poly_lo = rsq* poly_hi + poly_lo
1266 (p13) fma.s1 f59 = f59, f42, f56
1273 // Flag_not_1: (P_1 + r*P_2)
1275 (p13) fma.s1 f59 = f59, f42, f57
1282 // Flag_not_1: poly_hi = r + rsq * (P_1 + r*P_2)
1284 (p13) fma.s1 f35 = f59, f48, f60
1291 // Create 0.000...01
1293 (p0) for f37 = f35, f37
1300 // Set lsb of Y_lo to 1
1302 (p0) fmerge.se f35 = f35,f37
1303 (p0) br.cond.sptk EXP_MAIN ;;
1309 (p13) addl r34 = @ltoff(Constants_exp_64_Exponents#),gp
1314 (p13) ld8 r34 = [r34];
1322 (p12) br.cond.sptk EXP_MAIN ;;
1326 (p13) add r34 = 8,r34
1327 (p13) movl r39 = 0x0FFFE ;;
1331 // Create 1/2's exponent
1335 (p13) setf.exp f56 = r39
1336 (p13) shladd r34 = r32,4,r34 ;;
1340 // Negative exponents are stored after positive
1344 (p13) ld8 r45 = [r34],0
1349 (p13) fmpy.s1 f35 = f9, f9
1356 // Reset Safe if necessary
1364 (p13) cmp.lt.unc p0, p15 = r37, r45
1374 (p13) fmpy.s1 f35 = f35, f56
1378 (p13) br.cond.sptk EXP_MAIN ;;
1384 (p0) fcmp.gt.unc.s1 p14, p0 = f9, f0
1390 (p0) movl r39 = 0x15DC0 ;;
1394 (p14) setf.exp f34 = r39
1396 (p14) cmp.eq p0, p15 = r0, r0 ;;
1403 // If x > 0, Set Safe = False
1404 // If x > 0, Y_hi = 2**(24,000)
1405 // If x > 0, Y_lo = 1.0
1406 // If x > 0, Scale = 2**(24,000)
1408 (p14) br.cond.sptk EXP_MAIN ;;
1413 (p12) movl r39 = 0xA240
1418 (p12) movl r38 = 0xA1DC ;;
1422 (p13) cmp.eq p15, p14 = r0, r0
1423 (p12) setf.exp f34 = r39
1428 (p12) setf.exp f35 = r38
1429 (p13) movl r39 = 0xFF9C
1434 (p13) fsub.s1 f34 = f0, f1
1441 (p12) cmp.eq p0, p15 = r0, r0 ;;
1445 (p13) setf.exp f35 = r39
1452 (p0) cmp.ne.unc p12, p0 = 0x01, r33
1453 (p0) fmpy.s1 f101 = f36, f35
1459 (p0) fma.d.s0 f99 = f34, f36, f101
1460 (p15) br.cond.sptk EXP_64_RETURN;;
1465 (p0) fsetc.s3 0x7F,0x01
1471 (p0) movl r50 = 0x000000000103FF ;;
1474 // S0 user supplied status
1475 // S2 user supplied status + WRE + TD (Overflows)
1476 // S3 user supplied status + RZ + TD (Underflows)
1479 // If (Safe) is true, then
1480 // Compute result using user supplied status field.
1481 // No overflow or underflow here, but perhaps inexact.
1484 // Determine if overflow or underflow was raised.
1485 // Fetch +/- overflow threshold for IEEE single, double,
1490 (p0) setf.exp f60 = r50
1491 (p0) fma.d.s3 f102 = f34, f36, f101
1497 (p0) fsetc.s3 0x7F,0x40
1504 // For Safe, no need to check for over/under.
1505 // For expm1, handle errors like exp.
1507 (p0) fsetc.s2 0x7F,0x42
1513 (p0) fma.d.s2 f100 = f34, f36, f101
1519 (p0) fsetc.s2 0x7F,0x40
1525 (p7) fclass.m.unc p12, p0 = f102, 0x00F
1531 (p0) fclass.m.unc p11, p0 = f102, 0x00F
1537 (p7) fcmp.ge.unc.s1 p10, p0 = f100, f60
1544 // Create largest double exponent + 1.
1545 // Create smallest double exponent - 1.
1547 (p0) fcmp.ge.unc.s1 p8, p0 = f100, f60
1551 // fcmp: resultS2 >= + overflow threshold -> set (a) if true
1552 // fcmp: resultS2 <= - overflow threshold -> set (b) if true
1553 // fclass: resultS3 is denorm/unorm/0 -> set (d) if true
1559 (p10) br.cond.sptk __libm_error_region ;;
1565 (p8) br.cond.sptk __libm_error_region ;;
1568 // Report that exp overflowed
1574 (p12) br.cond.sptk __libm_error_region ;;
1580 (p11) br.cond.sptk __libm_error_region ;;
1587 // Report that exp underflowed
1589 (p0) br.cond.sptk EXP_64_RETURN;;
1595 (p0) fclass.m.unc p6, p0 = f8, 0x0c3
1601 (p0) fclass.m.unc p13, p8 = f8, 0x007
1607 (p7) fclass.m.unc p14, p0 = f8, 0x007
1613 (p0) fclass.m.unc p12, p9 = f8, 0x021
1619 (p0) fclass.m.unc p11, p0 = f8, 0x022
1625 (p7) fclass.m.unc p10, p0 = f8, 0x022
1632 // Identify +/- 0, Inf, or -Inf
1633 // Generate the right kind of NaN.
1635 (p13) fadd.d.s0 f99 = f0, f1
1647 (p6) fadd.d.s0 f99 = f8, f1
1650 // expm1(+/-0) = +/-0
1651 // No exceptions raised
1653 (p6) br.cond.sptk EXP_64_RETURN;;
1659 (p14) br.cond.sptk EXP_64_RETURN;;
1670 (p10) fsub.d.s1 f99 = f0, f1
1674 // No exceptions raised.
1676 (p10) br.cond.sptk EXP_64_RETURN;;
1681 (p12) fmpy.d.s1 f99 = f8, f1
1684 // No exceptions raised.
1686 (p0) br.cond.sptk EXP_64_RETURN;;
1694 (p0) fmpy.d.s0 f99 = f8, f0
1705 ASM_SIZE_DIRECTIVE(expm1)
1707 .proc __libm_error_region
1708 __libm_error_region:
1712 add GR_Parameter_Y=-32,sp // Parameter 2 value
1714 .save ar.pfs,GR_SAVE_PFS
1715 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1719 add sp=-64,sp // Create new stack
1721 mov GR_SAVE_GP=gp // Save gp
1726 stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
1727 add GR_Parameter_X = 16,sp // Parameter 1 address
1728 .save b0, GR_SAVE_B0
1729 mov GR_SAVE_B0=b0 // Save b0
1735 stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
1736 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
1740 stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
1741 add GR_Parameter_Y = -16,GR_Parameter_Y
1742 br.call.sptk b0=__libm_error_support# // Call error handling function
1747 add GR_Parameter_RESULT = 48,sp
1752 ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
1754 add sp = 64,sp // Restore stack pointer
1755 mov b0 = GR_SAVE_B0 // Restore return address
1758 mov gp = GR_SAVE_GP // Restore gp
1759 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1760 br.ret.sptk b0 // Return
1763 .endp __libm_error_region
1764 ASM_SIZE_DIRECTIVE(__libm_error_region)
1767 .type __libm_error_support#,@function
1768 .global __libm_error_support#