2.9
[glibc/nacl-glibc.git] / sysdeps / ia64 / fpu / s_tanhl.S
blob3435f4313ed46e308ae4c025d8b11233d001dd0a
1 .file "tanhl.s"
4 // Copyright (c) 2001 - 2003, Intel Corporation
5 // All rights reserved.
6 //
7 // Contributed 2001 by the Intel Numerics Group, Intel Corporation
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
16 // * Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
20 // * The name of Intel Corporation may not be used to endorse or promote
21 // products derived from this software without specific prior written
22 // permission.
24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS 
28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 
30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 
32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
35 // 
36 // Intel Corporation is the author of this code, and requests that all
37 // problem reports or change requests be submitted to it directly at 
38 // http://www.intel.com/software/products/opensource/libraries/num.htm.
40 // History
41 //==============================================================
42 // 11/29/01  Initial version
43 // 05/20/02  Cleaned up namespace and sf0 syntax
44 // 08/14/02  Changed mli templates to mlx
45 // 02/10/03  Reordered header: .section, .global, .proc, .align
47 // API
48 //==============================================================
49 // long double tanhl(long double)
51 // Overview of operation
52 //==============================================================
54 // Algorithm description
55 // ---------------------
57 // There are 4 paths:
59 // 1. Special path: x = 0, Inf, NaNs, denormal
60 //    Return tanhl(x) = +/-0.0 for zeros
61 //    Return tanhl(x) = QNaN for NaNs
62 //    Return tanhl(x) = sign(x)*1.0 for Inf
63 //    Return tanhl(x) = x + x^2   for - denormals
64 //    Return tanhl(x) = x - x^2   for + denormals
66 // 2. [0;1/8] path: 0.0 < |x| < 1/8
67 //    Return tanhl(x) = x + x^3*A3 + ... + x^15*A15
69 // 3. Main path: 1/8 <= |x| < 22.8
70 //    For several ranges of 1/8 <= |x| < 22.8
71 //    Return tanhl(x) = sign(x)*((A0H+A0L) + y*(A1H+A1L) + y^2*(A2H+A2L) + 
72 //                                       + y^3*A3 + y^4*A4 + ... + y^25*A25 )
73 //    where y = (|x|/a) - b
75 //    For each range there is particular set of coefficients.
76 //    Below is the list of ranges:
77 //    1/8  <= |x| < 1/4     a = 0.125, b = 1.5
78 //    1/4  <= |x| < 1/2     a = 0.25,  b = 1.5
79 //    1/2  <= |x| < 1.0     a = 0.5,   b = 1.5
80 //    1.0  <= |x| < 2.0     a = 1.0,   b = 1.5
81 //    2.0  <= |x| < 3.25    a = 2.0,   b = 1.5
82 //    3.25 <= |x| < 4.0     a = 2.0,   b = 2.0
83 //    4.0  <= |x| < 6.5     a = 4.0,   b = 1.5
84 //    6.5  <= |x| < 8.0     a = 4.0,   b = 2.0
85 //    8.0  <= |x| < 13.0    a = 8.0,   b = 1.5
86 //    13.0 <= |x| < 16.0    a = 8.0,   b = 2.0
87 //    16.0 <= |x| < 22.8    a = 16.0,  b = 1.5
88 //    ( [3.25;4.0], [6.5;8.0], [13.0;16.0] subranges are separated
89 //                               to resolve monotonicity issues )
91 // 4. Saturation path: 22.8 <= |x| < +INF 
92 //    Return tanhl(x) = sign(x)*(1.0 - tiny_value)
93 //    (tiny_value ~ 1e-1233)
95 // Implementation notes
96 // --------------------
98 // 1. Special path: x = 0, INF, NaNs, denormals
100 //    This branch is cut off by one fclass operation.
101 //    Then zeros+nans, infinities and denormals processed separately.
102 //    For denormals we use a simple fma operation: x+x*x (x-x*x for +denormals)
104 // 2. [0;1/8] path: 0.0 < |x| < 1/8
106 //    Here we use simple polynomial computations, where the last step
107 //    is performed as x + x^3*A3+...
108 //    The rest of polynomial is factorized using binary tree technique.
110 // 3. Main path: 1/8 <= |x| < 22.8
112 //    Multiprecision has to be performed only for the first few
113 //    polynomial iterations (up to the 3rd degree of y).
114 //    Here we use the same parallelisation approach as above:
115 //    split the whole polynomial into a first, "multiprecision" part, and a
116 //    second, so called "tail", native precision part.
118 //    1) Multiprecision part:  
119 //    [v1=(A0H+A0L)+y*(A1H+A1L)] + [v2=y^2*((A2H+A2L)+y*A3)]
120 //    v1 and v2 terms calculated in parallel
122 //    2) Tail part:
123 //    v3 = y^4 * ( A4 + y*A5 + ... + y^21*A25 )
124 //    v3 is split into 2 even parts (10 coefficients in each one).
125 //    These 2 parts are also factorized using binary tree technique.
126 //    
127 //    So Multiprecision and Tail parts cost is almost the same
128 //    and we have both results ready before final summation.
130 //    Some tricks were applied to maintain symmetry in directed
131 //    rounding modes (to +/-inf). We had to set the result sign
132 //    not at the last operation but much earlier and in
133 //    several places.
135 // 4. Saturation path: 22.8 <= |x| < +INF 
137 //    We use formula sign(x)*(1.0 - tiny_value) instead of simple sign(x)*1.0
138 //    just to meet IEEE requirements for different rounding modes in this case.
140 // Registers used
141 //==============================================================
142 // Floating Point registers used: 
143 // f8 - input & output
144 // f32 -> f92
146 // General registers used:  
147 // r2, r3, r32 -> r52 
149 // Predicate registers used:
150 // p0, p6 -> p11, p14, p15
152 // p6  - arg is zero, denormal or special IEEE
153 // p7  - arg is in [16;32] binary interval
154 // p8  - arg is in one of subranges 
155 //         [3.25;4.0], [6.5;8.0], [13.0;16.0]
156 // p9  - arg < 1/8
157 // p10  - arg is NOT in one of subranges 
158 //         [3.25;4.0], [6.5;8.0], [13.0;16.0]
159 // p11 - arg in saturation domain
160 // p14 - arg is positive
161 // p15 - arg is negative
163 // Assembly macros
164 //==============================================================
// Assembler symbol aliases for the general registers r2, r3 and r33-r52.
// Each line binds a descriptive name to one physical register.
// NOTE(review): the instructions that use these names are outside this view;
// the roles suggested by the names (table pointers, exponent/significand of
// the argument, range offsets, ...) should be confirmed against the code body.
165 rDataPtr           = r2
166 rTailDataPtr       = r3
168 rBias              = r33
169 rSignBit           = r34
170 rInterval          = r35
172 rArgExp            = r36
173 rArgSig            = r37
174 r3p25Offset        = r38
175 r2to4              = r39
176 r1p25              = r40
177 rOffset            = r41
178 r1p5               = r42
179 rSaturation        = r43
180 r1625Sign          = r44
181 rTiny              = r45
182 rAddr1             = r46
183 rAddr2             = r47
184 rTailAddr1         = r48
185 rTailAddr2         = r49
186 rTailOffset        = r50
187 rTailAddOffset     = r51
188 rShiftedDataPtr    = r52
190 //==============================================================
// Assembler symbol aliases for the floating-point registers f32-f92
// (f8, the input/output register, is not aliased here).
// fA0H..fA25 carry the same names as the polynomial coefficients stored in
// the data tables (A0H/A0L, A1H/A1L, A2H/A2L, A3..A25).
// NOTE(review): the remaining names (fArg*, fRes*, fT*, f1p5, f2p0, fTiny,
// fSignumX) look like powers of the reduced argument, partial results and
// constants — confirm against the instruction stream, which is not visible here.
191 fA0H               = f32
192 fA0L               = f33
193 fA1H               = f34
194 fA1L               = f35
195 fA2H               = f36
196 fA2L               = f37
197 fA3                = f38
198 fA4                = f39
199 fA5                = f40
200 fA6                = f41
201 fA7                = f42
202 fA8                = f43
203 fA9                = f44
204 fA10               = f45
205 fA11               = f46
206 fA12               = f47
207 fA13               = f48
208 fA14               = f49
209 fA15               = f50
210 fA16               = f51
211 fA17               = f52
212 fA18               = f53
213 fA19               = f54
214 fA20               = f55 
215 fA21               = f56 
216 fA22               = f57 
217 fA23               = f58
218 fA24               = f59
219 fA25               = f60
221 fArgSqr            = f61
222 fArgCube           = f62
223 fArgFour           = f63
224 fArgEight          = f64
226 fArgAbsNorm        = f65
227 fArgAbsNorm2       = f66
228 fArgAbsNorm2L      = f67
229 fArgAbsNorm3       = f68
230 fArgAbsNorm4       = f69
231 fArgAbsNorm11      = f70
233 fRes               = f71
234 fResH              = f72
235 fResL              = f73
236 fRes1H             = f74
237 fRes1L             = f75
238 fRes1Hd            = f76
239 fRes2H             = f77
240 fRes2L             = f78
241 fRes3H             = f79
242 fRes3L             = f80
243 fRes4              = f81
245 fTT                = f82 
246 fTH                = f83
247 fTL                = f84
248 fTT2               = f85 
249 fTH2               = f86
250 fTL2               = f87
252 f1p5               = f88
253 f2p0               = f89
254 fTiny              = f90
255 fSignumX           = f91
256 fArgAbsNorm4X      = f92
258 // Data tables
259 //==============================================================
// RODATA is a macro defined outside this view; presumably it selects the
// read-only data section — confirm. The tables are 16-byte aligned.
260 RODATA
262 .align 16
// The object tanhl_data spans only this first table, which additionally
// carries the inner label _0p125_to_0p25_data.
// NOTE(review): tanhl_data is presumably the base address from which the
// other tables are reached by offset — confirm against the code body.
263 LOCAL_OBJECT_START(tanhl_data)
265 ////////// Main tables ///////////
266 _0p125_to_0p25_data: // exp = 2^-3
267 // Polynomial coefficients for the tanh(x), 1/8 <= |x| < 1/4 
// Reduced argument for this range (see the range list in the header):
//   y = |x|/0.125 - 1.5
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts (the algorithm sums
// AnH+AnL), then A25 down to A14 as 16-byte entries.
// A13..A4 for this range are stored in _0p125_to_0p25_data_tail.
268 data8 0x93D27D6AE7E835F8, 0x0000BFF4 //A3 = -5.6389704216278164626050408239e-04
269 data8 0xBF66E8668A78A8BC //A2H = -2.7963640930198357253955165902e-03
270 data8 0xBBD5384EFD0E7A54 //A2L = -1.7974001252014762983581666453e-20
271 data8 0x3FBEE69E31DB6156 //A1H = 1.2070645062647619716322822114e-01
272 data8 0x3C43A0B4E24A3DCA //A1L = 2.1280460108882061756490131241e-18
273 data8 0x3FC7B8FF903BF776 //A0H = 1.8533319990813951205765874874e-01
274 data8 0x3C593F1A61986FD4 //A0L = 5.4744612262799573374268254539e-18
275 data8 0xDB9E6735560AAE5A, 0x0000BFA3 //A25 = -3.4649731131719154051239475238e-28
276 data8 0xF0DDE953E4327704, 0x00003FA4 //A24 = 7.6004173864565644629900702857e-28
277 data8 0x8532AED11DEC5612, 0x00003FAB //A23 = 5.3798235684551098715428515761e-26
278 data8 0xAEF72A34D88B0038, 0x0000BFAD //A22 = -2.8267199091484508912273222600e-25
279 data8 0x9645EF1DCB759DDD, 0x0000BFB2 //A21 = -7.7689413112830095709522203109e-24
280 data8 0xA5D12364E121F70F, 0x00003FB5 //A20 = 6.8580281614531622113161030550e-23
281 data8 0x9CF166EA815AC705, 0x00003FB9 //A19 = 1.0385615003184753213024737634e-21
282 data8 0x852B1D0252498752, 0x0000BFBD //A18 = -1.4099753997949827217635356478e-20
283 data8 0x9270F5716D25EC9F, 0x0000BFC0 //A17 = -1.2404055949090177751123473821e-19
284 data8 0xC216A9C4EEBDDDCA, 0x00003FC4 //A16 = 2.6303900460415782677749729120e-18
285 data8 0xDCE944D89FF592F2, 0x00003FC6 //A15 = 1.1975620514752377092265425941e-17
286 data8 0x83C8DDF213711381, 0x0000BFCC //A14 = -4.5721980583985311263109531319e-16
287 LOCAL_OBJECT_END(tanhl_data)
289 LOCAL_OBJECT_START(_0p25_to_0p5_data)
290 // Polynomial coefficients for the tanh(x), 1/4 <= |x| < 1/2 
// Reduced argument for this range (see the range list in the header):
//   y = |x|/0.25 - 1.5
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries. A13..A4 are stored in _0p25_to_0p5_data_tail.
291 data8 0xB6E27B747C47C8AD, 0x0000BFF6 //A3 = -2.7905990032063258105302045572e-03
292 data8 0xBF93FD54E226F8F7 //A2H = -1.9521070769536099515084615064e-02
293 data8 0xBC491BC884F6F18A //A2L = -2.7222721075104525371410300625e-18
294 data8 0x3FCBE3FBB015A591 //A1H = 2.1789499376181400980279079249e-01
295 data8 0x3C76AFC2D1AE35F7 //A1L = 1.9677459707672596091076696742e-17
296 data8 0x3FD6EF53DE8C8FAF //A0H = 3.5835739835078589399230963863e-01
297 data8 0x3C8E2A1C14355F9D //A0L = 5.2327050592919416045278607775e-17
298 data8 0xF56D363AAE3BAD53, 0x00003FBB //A25 = 6.4963882412697389947564301120e-21
299 data8 0xAD6348526CEEB897, 0x0000BFBD //A24 = -1.8358149767147407353343152624e-20
300 data8 0x85D96A988565FD65, 0x0000BFC1 //A23 = -2.2674950494950919052759556703e-19
301 data8 0xD52CAF6B1E4D9717, 0x00003FC3 //A22 = 1.4445269502644677106995571101e-18
302 data8 0xBD7E1BE5CBEF7A01, 0x00003FC5 //A21 = 5.1362075721080004718090799595e-18
303 data8 0xAE84A9B12ADD6948, 0x0000BFC9 //A20 = -7.5685210830925426342786733068e-17
304 data8 0xEAC2D5FCF80E250C, 0x00003FC6 //A19 = 1.2726423522879522181100392135e-17
305 data8 0xE0D2A8AC8C2EDB95, 0x00003FCE //A18 = 3.1200443098733419749016380203e-15
306 data8 0xB22F0AB7B417F78E, 0x0000BFD0 //A17 = -9.8911854977385933809488291835e-15
307 data8 0xE25A627BAEFFA7A4, 0x0000BFD3 //A16 = -1.0052095388666003876301743498e-13
308 data8 0xC90F32EC4A17F908, 0x00003FD6 //A15 = 7.1430637679768183097897337145e-13
309 data8 0x905F6F124AF956B1, 0x00003FD8 //A14 = 2.0516607231389483452611375485e-12
310 LOCAL_OBJECT_END(_0p25_to_0p5_data)
312 LOCAL_OBJECT_START(_0p5_to_1_data)
313 // Polynomial coefficients for the tanh(x), 1/2 <= |x| < 1 
// Reduced argument for this range (see the range list in the header):
//   y = |x|/0.5 - 1.5
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries. A13..A4 are stored in _0p5_to_1_data_tail.
314 data8 0xAB402BE491EE72A7, 0x00003FF7 //A3 = 5.2261556931080934657023772945e-03
315 data8 0xBFB8403D3DDA87BE //A2H = -9.4730212784752659826992271519e-02
316 data8 0xBC6FF7BC2AB71A8B //A2L = -1.3863786398568460929625760740e-17
317 data8 0x3FD3173B1EFA6EF4 //A1H = 2.9829290414066567116435635398e-01
318 data8 0x3C881E4DCABDE840 //A1L = 4.1838710466827119847963316219e-17
319 data8 0x3FE45323E552F228 //A0H = 6.3514895238728730220145735075e-01
320 data8 0x3C739D5832BF7BCF //A0L = 1.7012977006567066423682445459e-17
321 data8 0xF153980BECD8AE12, 0x00003FD0 //A25 = 1.3396313991261493342597057700e-14
322 data8 0xEC9ACCD245368129, 0x0000BFD3 //A24 = -1.0507358886349528807350792383e-13
323 data8 0x8AE6498CA36D2D1A, 0x00003FD4 //A23 = 1.2336759149738309660361813001e-13
324 data8 0x8DF02FBF5AC70E64, 0x00003FD7 //A22 = 1.0085317723615282268326194551e-12
325 data8 0x9E15C7125DA204EE, 0x0000BFD9 //A21 = -4.4930478919612724261941857560e-12
326 data8 0xA62C6F39BDDCEC1C, 0x00003FD7 //A20 = 1.1807342457875095150035780314e-12
327 data8 0xDFD8D65D30F80F52, 0x00003FDC //A19 = 5.0896919887121116317817665996e-11
328 data8 0xB795AFFD458F743E, 0x0000BFDE //A18 = -1.6696932710534097241291327756e-10
329 data8 0xFEF30234CB01EC89, 0x0000BFDD //A17 = -1.1593749714588103589483091370e-10
330 data8 0xA2F638356E13761E, 0x00003FE2 //A16 = 2.3714062288761887457674853605e-09
331 data8 0xC429CC0D031E4FD5, 0x0000BFE3 //A15 = -5.7091025466377379046489586383e-09
332 data8 0xC78363FF929EFF62, 0x0000BFE4 //A14 = -1.1613199289622686725595739572e-08
333 LOCAL_OBJECT_END(_0p5_to_1_data)
335 LOCAL_OBJECT_START(_1_to_2_data)
336 // Polynomial coefficients for the tanh(x), 1 <= |x| < 2.0 
// Reduced argument for this range (see the range list in the header):
//   y = |x|/1.0 - 1.5
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries. A13..A4 are stored in _1_to_2_data_tail.
337 data8 0xB3D8FB48A548D99A, 0x00003FFB //A3 = 8.7816203264683800892441646129e-02
338 data8 0xBFC4EFBD8FB38E3B //A2H = -1.6356629864377389416141284073e-01
339 data8 0xBC77687FD8087B23 //A2L = -2.0303377679446772162287121190e-17
340 data8 0x3FC72165282C6F72 //A1H = 1.8070663892364852154415189034e-01
341 data8 0x3C64E01F7A76D777 //A1L = 9.0532964466719018524360408402e-18
342 data8 0x3FECF6F9786DF577 //A0H = 9.0514825364486639625027919465e-01
343 data8 0x3C8834EDCE71A65B //A0L = 4.1992023813070331863928976191e-17
344 data8 0xC3EEEB3EFA688094, 0x00003FE2 //A25 = 2.8512044383274095705865793485e-09
345 data8 0x88461973672AEB12, 0x0000BFE1 //A24 = -9.9152258079470849685057375343e-10
346 data8 0xFC2AF9950DC5027E, 0x0000BFE4 //A23 = -1.4678101918123116001692289670e-08
347 data8 0x9C80CA742F89B7B5, 0x00003FE6 //A22 = 3.6438714992394138274843759814e-08
348 data8 0xA0B3D7FAA606260A, 0x0000BFE6 //A21 = -3.7416469848124568887944709492e-08
349 data8 0xDA5858432FBD9D9D, 0x0000BFE6 //A20 = -5.0837429421503142141842414978e-08
350 data8 0xB0244D1E1AE9C1B0, 0x00003FE9 //A19 = 3.2808967255272595749004827841e-07
351 data8 0xC8D3109ACF740738, 0x0000BFEA //A18 = -7.4812945767507614821609020680e-07
352 data8 0xBB0F3440EEA55BBF, 0x00003FEA //A17 = 6.9685053481643125932497676583e-07
353 data8 0xC13A8B08D8576C19, 0x00003FEB //A16 = 1.4396658837712390333960587173e-06
354 data8 0xFF3A1163CC5522A1, 0x0000BFED //A15 = -7.6063522055104010298762276148e-06
355 data8 0x8672AF27EB0823B7, 0x00003FEF //A14 = 1.6027448793338500004496520337e-05
356 LOCAL_OBJECT_END(_1_to_2_data)
358 LOCAL_OBJECT_START(_2_to_3p25_data)
359 // Polynomial coefficients for the tanh(x), 2 <= |x| < 3.25 
// Reduced argument for this range (see the range list in the header):
//   y = |x|/2.0 - 1.5
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries. A13..A4 are stored in _2_to_3p25_data_tail.
360 data8 0xD45657BEC559E366, 0x00003FFA //A3 = 5.1840155367548909799883161889e-02
361 data8 0xBFA41B109CA6AB81 //A2H = -3.9268988726084870510835145296e-02
362 data8 0xBC2C3D708A4E56C5 //A2L = -7.6544669252238280132415018518e-19
363 data8 0x3F9434A517BBC5F4 //A1H = 1.9732074330880380874653212686e-02
364 data8 0x3C3ED62DD9585229 //A1L = 1.6716574468135097509707871438e-18
365 data8 0x3FEFD77D111A0AFF //A0H = 9.9505475368673035330147058630e-01
366 data8 0x3C9C415E151C6CA5 //A0L = 9.8030409604070051319822874013e-17
367 data8 0xB1596391D4534D52, 0x00003FEC //A25 = 2.6427086526487251988631279067e-06
368 data8 0xC4DC44E243D1AF5F, 0x00003FEF //A24 = 2.3467591534149209236830008333e-05
369 data8 0xAED5786023982BB8, 0x00003FF0 //A23 = 4.1683642395739762658623742687e-05
370 data8 0xCF39926C9FBC6A10, 0x00003FF0 //A22 = 4.9406263949321793291856681624e-05
371 data8 0xA255A72359928142, 0x00003FF0 //A21 = 3.8703580278108400672236161973e-05
372 data8 0xA2E573B9FC332C0D, 0x00003FED //A20 = 4.8546879618263642155709302480e-06
373 data8 0x82C7BD01830ACA93, 0x00003FF0 //A19 = 3.1180436075031301077175550468e-05
374 data8 0xB38AF4C76E96444B, 0x0000BFF0 //A18 = -4.2806338675404452784440167120e-05
375 data8 0xEC08FF0FB194464C, 0x00003FF0 //A17 = 5.6275163156181928637744511210e-05
376 data8 0xB850825D9E235135, 0x0000BFF0 //A16 = -4.3943998628289568813056822585e-05
377 data8 0xF98436E838763687, 0x0000BFEF //A15 = -2.9744680263523220185672219686e-05
378 data8 0xE1851A2D00737A5D, 0x00003FF2 //A14 = 2.1507256570895163202182573369e-04
379 LOCAL_OBJECT_END(_2_to_3p25_data)
381 LOCAL_OBJECT_START(_4_to_6p5_data)
382 // Polynomial coefficients for the tanh(x), 4 <= |x| < 6.5 
// Reduced argument for this range (see the range list in the header):
//   y = |x|/4.0 - 1.5
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries. A13..A4 are stored in _4_to_6p5_data_tail.
383 data8 0x896FDBD321A0BE58, 0x00003FF5 //A3 = 1.0485606995331904734870550114e-03
384 data8 0xBF39C522B95A37D6 //A2H = -3.9321992640217512306882730044e-04
385 data8 0xBBA9B3EC39A45338 //A2L = -2.7213922673282819034134988241e-21
386 data8 0x3F19C5377A48B5AD //A1H = 9.8306189621330793766869338146e-05
387 data8 0x3BCAFCB1D08A891C //A1L = 1.1429476443042275163117526657e-20
388 data8 0x3FEFFFE63ABE253B //A0H = 9.9998771165079547440512897083e-01
389 data8 0x3C9BB74C4EE0D16F //A0L = 9.6159219890436197391279544561e-17
390 data8 0x8D86121D469AFA7E, 0x0000BFEF //A25 = -1.6870941388985743600323604423e-05
391 data8 0x9D3656A36593C5C4, 0x00003FEF //A24 = 1.8741161763079973068909254398e-05
392 data8 0xDCD772D5BF9ADB96, 0x00003FF0 //A23 = 5.2652739523018349983563695656e-05
393 data8 0xFF79ADCF0DCBCC2D, 0x00003FF1 //A22 = 1.2182012003034659966028035977e-04
394 data8 0x84D24E394DEFD0D2, 0x00003FF1 //A21 = 6.3334229517535065590380468696e-05
395 data8 0xA66B56BFD2782544, 0x00003FF1 //A20 = 7.9354902476954571736114945842e-05
396 data8 0xFB15771FBF3155FE, 0x0000BFEE //A19 = -1.4965763624796745134798717707e-05
397 data8 0xC774790126BE54C3, 0x00003FEF //A18 = 2.3776885435831770523136610539e-05
398 data8 0x825A13DACB8C68CD, 0x00003FEF //A17 = 1.5539153272890695426189818556e-05
399 data8 0xCFF96E6810AACE27, 0x0000BFF1 //A16 = -9.9169893703251156059893890295e-05
400 data8 0x8A85D2061B865024, 0x00003FF3 //A15 = 2.6421115104625621420758344535e-04
401 data8 0x922EC6F3CFE0496E, 0x0000BFF4 //A14 = -5.5764283474946207558456581668e-04
402 LOCAL_OBJECT_END(_4_to_6p5_data)
404 LOCAL_OBJECT_START(_8_to_13_data)
405 // Polynomial coefficients for the tanh(x), 8 <= |x| < 13 
// Reduced argument for this range (see the range list in the header):
//   y = |x|/8.0 - 1.5
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries. A13..A4 are stored in _8_to_13_data_tail.
406 data8 0xDD6050A898303460, 0x00003FE6 //A3 = 5.1543170295688189081352133793e-08
407 data8 0xBE44C1078FDBADC0 //A2H = -9.6643444318955652627581125180e-09
408 data8 0xBAF95FCAA6DBBA6F //A2L = -1.3118146684038113473094275420e-24
409 data8 0x3E14C1078FE26748 //A1H = 1.2080430540780827633746315479e-09
410 data8 0x3A88168082F37D95 //A1L = 9.7290246966246404028418245094e-27
411 data8 0x3FEFFFFFFFF59F7C //A0H = 9.9999999992449728480892190419e-01
412 data8 0x3C7C068EBC5C2EEB //A0L = 2.4308346546749583521003998922e-17
413 data8 0x9DC155C77A6C46E5, 0x00003FF2 //A25 = 1.5044709695520252096006763473e-04
414 data8 0xF2F9E09CA47F46E9, 0x00003FF3 //A24 = 4.6344010077547944693833282056e-04
415 data8 0xCBFD67E704734BC8, 0x00003FF4 //A23 = 7.7815958662026429864083620142e-04
416 data8 0xC18DC821CD67E621, 0x00003FF4 //A22 = 7.3834928521190855055818897104e-04
417 data8 0x8AF72BCAB05A296E, 0x00003FF4 //A21 = 5.3011135848666430331904214879e-04
418 data8 0xC2E73BE9B9AB4007, 0x00003FF2 //A20 = 1.8587423129049905806822275188e-04
419 data8 0xE7E8C2058E2FF9F7, 0x00003FF1 //A19 = 1.1058292891321512917337425414e-04
420 data8 0xC46309F52E429F97, 0x0000BFF0 //A18 = -4.6822278664829811025251866877e-05
421 data8 0x81966C1E007E9BEB, 0x00003FF1 //A17 = 6.1792176836716291200611553354e-05
422 data8 0x8CEDC4BEFCAB9A7E, 0x0000BFF1 //A16 = -6.7200080564674449915571760779e-05
423 data8 0x8B64E9FA53210018, 0x00003FF1 //A15 = 6.6468331917938095774361868182e-05
424 data8 0x82DEDAA539A3A3F1, 0x0000BFF1 //A14 = -6.2403928644276709411156885292e-05
425 LOCAL_OBJECT_END(_8_to_13_data)
427 LOCAL_OBJECT_START(_16_to_22p8_data)
428 // Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88 
// Reduced argument for this range (see the range list in the header):
//   y = |x|/16.0 - 1.5
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries.
// NOTE(review): A13..A4 are presumably in a matching "_tail" table that is
// not visible in this part of the file — confirm.
// NOTE(review): the bound is written 22.88 here but 22.8 in the header.
429 data8 0x992C00F33DDE804D, 0x00003FCE //A3 = 2.1256869805798788337547274131e-15
430 data8 0x3C8D42EA28102760 //A2H = 5.0760412270332007485198379096e-17
431 data8 0x391A747B43B072DD //A2L = 1.2737621993898125881520341053e-33
432 data8 0x3C309BC5C3CB4D5F //A1H = 9.0034785192019775952205276560e-19
433 data8 0x38A8EF3B5C9DCE71 //A1L = 9.3793162715476168397242934494e-36
434 data8 0x3FF0000000000000 //A0H = 1.0000000000000000000000000000e+00
435 data8 0x3BACC66AFD5CA22A //A0L = 3.0466790472070565954180861749e-21
436 data8 0xF020FB351C2F37CB, 0x00003FF1 //A25 = 1.1450235038836625246604146870e-04
437 data8 0xBE80596C51302A7B, 0x00003FF4 //A24 = 7.2670503421185030764546828414e-04
438 data8 0x91343CF8577E0131, 0x00003FF6 //A23 = 2.2156380512949603402001207105e-03
439 data8 0x8D029A8679641286, 0x00003FF7 //A22 = 4.3032888906494613055765544559e-03
440 data8 0xC3713F64D8DC4BAB, 0x00003FF7 //A21 = 5.9644279041951657632420721490e-03
441 data8 0xCD678C455A5D06C2, 0x00003FF7 //A20 = 6.2684473911812928601693994403e-03
442 data8 0xA9E1C825BDCEEBCC, 0x00003FF7 //A19 = 5.1843859941826642445235686826e-03
443 data8 0xE29C919AD93F6EB9, 0x00003FF6 //A18 = 3.4578185539872939928152204329e-03
444 data8 0xF7E615A75994A607, 0x00003FF5 //A17 = 1.8913175041916131006881986311e-03
445 data8 0xE102EFE0F7F2B2AD, 0x00003FF4 //A16 = 8.5835064987089641065525269712e-04
446 data8 0xAAD62946DEE96996, 0x00003FF3 //A15 = 3.2584489313998677644253007210e-04
447 data8 0xDA2470DE110B293E, 0x00003FF1 //A14 = 1.0401837693241806604296821650e-04
448 LOCAL_OBJECT_END(_16_to_22p8_data)
450 LOCAL_OBJECT_START(_3p25_to_4_data)
451 // Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4 
// Reduced argument for this subrange (see the range list in the header):
//   y = |x|/2.0 - 2.0
// This is one of the subranges the header says are separated for
// monotonicity reasons.
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries.
// NOTE(review): A13..A4 are presumably in a matching "_tail" table that is
// not visible in this part of the file — confirm.
452 data8 0xE9E07240432926E6, 0x00003FF7 //A3 = 7.1373517862636557382403555215e-03
453 data8 0xBF75F495227AF306 //A2H = -5.3602052282115727338540622782e-03
454 data8 0xBBBE92D355A6B716 //A2L = -6.4741983326810209847018826624e-21
455 data8 0x3F65F85AD510B690 //A1H = 2.6819013660517934671823070403e-03
456 data8 0x3C159A0B73E6EC01 //A1L = 2.9275813076637328121849573333e-19
457 data8 0x3FEFFA81708A0B42 //A0H = 9.9932929973906703402519724477e-01
458 data8 0x3C66857246C19DC6 //A0L = 9.7670460995685717424398031188e-18
459 data8 0xE6B6B8365B1E4D6C, 0x00003FE3 //A25 = 6.7146538162212081470554423396e-09
460 data8 0xE0453CEEF483A510, 0x00003FE2 //A24 = 3.2635647369924061614015292015e-09
461 data8 0x9C7D83B56E92CF1A, 0x00003FE5 //A23 = 1.8217867585545497089756353348e-08
462 data8 0xA94635C48ABA9EB4, 0x0000BFE4 //A22 = -9.8530586070049930796756799547e-09
463 data8 0xB1B0C14443067646, 0x00003FE5 //A21 = 2.0685890807654992387562340307e-08
464 data8 0x9C6E549781E293C3, 0x00003FDE //A20 = 1.4227314592865135171341122138e-10
465 data8 0xB0CBFCE7C80F57A7, 0x0000BFE7 //A19 = -8.2327438416004542109809245219e-08
466 data8 0xB151AB3876E896E1, 0x00003FE9 //A18 = 3.3028241036175815328309577940e-07
467 data8 0xFCF3A5C1A5CB7EEE, 0x0000BFEA //A17 = -9.4231869277542043001280640966e-07
468 data8 0x96A9016C7C95BEDA, 0x00003FEC //A16 = 2.2450115975007100522962781833e-06
469 data8 0x9B9B0A3901DEC05B, 0x0000BFED //A15 = -4.6374089937147736266514566049e-06
470 data8 0x8987DF26A6789CCF, 0x00003FEE //A14 = 8.1974714257536543772040700977e-06
471 LOCAL_OBJECT_END(_3p25_to_4_data)
473 LOCAL_OBJECT_START(_6p5_to_8_data)
474 // Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0 
// Reduced argument for this subrange (see the range list in the header):
//   y = |x|/4.0 - 2.0
// This is one of the subranges the header says are separated for
// monotonicity reasons.
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries.
// NOTE(review): A13..A4 are presumably in a matching "_tail" table that is
// not visible in this part of the file — confirm.
475 data8 0xA11C8A63815E5657, 0x00003FEF //A3 = 1.9205985861286093001394561449e-05
476 data8 0xBEDE355AD6CB61D8 //A2H = -7.2022479400070228499307345427e-06
477 data8 0xBB8E6B50B8468A63 //A2L = -8.0518953122203408718779840543e-22
478 data8 0x3EBE355B48DCF330 //A1H = 1.8005623902549165889479948488e-06
479 data8 0x3B5837550FFA98DA //A1L = 8.0124491698609178046195694087e-23
480 data8 0x3FEFFFFF872A91F8 //A0H = 9.9999977492967584424832239165e-01
481 data8 0x3C8A43B839B4EB63 //A0L = 4.5561696441306660142461355317e-17
482 data8 0xB5BC1948966B8826, 0x0000BFE6 //A25 = -4.2313421330480692560677276010e-08
483 data8 0x91D0BE367389BDFC, 0x0000BFE8 //A24 = -1.3580117599617083801153887619e-07
484 data8 0xFFD950AF282AB36C, 0x0000BFE8 //A23 = -2.3827784451962439125197203287e-07
485 data8 0x959B1770EBB8903A, 0x0000BFE9 //A22 = -2.7866256690165347051403663794e-07
486 data8 0xCC78060D1C0CFF3C, 0x0000BFE8 //A21 = -1.9042644867126442102188429523e-07
487 data8 0xF8919BAF2E87F31D, 0x0000BFE8 //A20 = -2.3149771783868910586746973299e-07
488 data8 0xC5B6AC942A3F2440, 0x00003FE8 //A19 = 1.8413511183396213757149263639e-07
489 data8 0xABF1A4703056450A, 0x0000BFEA //A18 = -6.4054099983863829656292958643e-07
490 data8 0xBB543D8BDB670453, 0x00003FEB //A17 = 1.3957102903892251890348444989e-06
491 data8 0xC9D6F37700C1D092, 0x0000BFEC //A16 = -3.0076451968978522605262647414e-06
492 data8 0xCA6EF4BB64E49EC8, 0x00003FED //A15 = 6.0329860989478473738709576062e-06
493 data8 0xBE25D0FD069D0A93, 0x0000BFEE //A14 = -1.1333687314965721384777951065e-05
494 LOCAL_OBJECT_END(_6p5_to_8_data)
496 LOCAL_OBJECT_START(_13_to_16_data)
497 // Polynomial coefficients for the tanh(x), 13 <= |x| < 16 
// Reduced argument for this subrange (see the range list in the header):
//   y = |x|/8.0 - 2.0
// This is one of the subranges the header says are separated for
// monotonicity reasons.
// Layout (256 bytes): A3 as one 16-byte entry (two data8 words), then
// A2H/A2L, A1H/A1L, A0H/A0L as 8-byte high/low parts, then A25 down to A14
// as 16-byte entries.
// NOTE(review): A13..A4 are presumably in a matching "_tail" table that is
// not visible in this part of the file — confirm.
498 data8 0x98176FD2075BDBD5, 0x00003FDB //A3 = 1.7290807363028159200235264756e-11
499 data8 0xBD8C8464F76162D1 //A2H = -3.2420263805679445515400340441e-12
500 data8 0xBA2D56B508E0F1FD //A2L = -1.8515322669984580704502445180e-28
501 data8 0x3D5C8464F761639C //A1H = 4.0525329757100331782338488690e-13
502 data8 0x3A0A09D9E328E620 //A1L = 4.1081479300866418212862258651e-29
503 data8 0x3FEFFFFFFFFFFF1B //A0H = 9.9999999999997457589273608392e-01
504 data8 0x3C9B9B089E9BFD89 //A0L = 9.5776165728054091471814161399e-17
505 data8 0xC5395B9EC765BDB7, 0x00003FE6 //A25 = 4.5919803498257974411526879804e-08
506 data8 0x9A0F1FCB1DC24C3A, 0x00003FE8 //A24 = 1.4347869798460288751020493795e-07
507 data8 0x8AA5C3459FAD0B28, 0x00003FE9 //A23 = 2.5825111356333853968900510087e-07
508 data8 0x9578B747988CFF9D, 0x00003FE9 //A22 = 2.7841245127068220034870119246e-07
509 data8 0x810DF1A589D9CAF1, 0x00003FE9 //A21 = 2.4038267971021370956311255310e-07
510 data8 0x8A00D77B9416EB75, 0x00003FE8 //A20 = 1.2852557749068320312899366352e-07
511 data8 0xB2436C4A1849C498, 0x00003FE7 //A19 = 8.3010350873515703893886683374e-08
512 data8 0xEA6405B18356600B, 0x00003FE3 //A18 = 6.8216675390299296071261114202e-09
513 data8 0xF7606C022194B7E8, 0x00003FE5 //A17 = 2.8798432098264655723769995993e-08
514 data8 0xAF4B0C453FCAF34E, 0x0000BFE5 //A16 = -2.0406809167824936143455638336e-08
515 data8 0xC324C1F10D5FA7CC, 0x00003FE5 //A15 = 2.2717703170390130238356558599e-08
516 data8 0xB34A2E3A4D3B9C31, 0x0000BFE5 //A14 = -2.0872076027950789618606920471e-08
517 LOCAL_OBJECT_END(_13_to_16_data)
520 //////// "Tail" tables //////////
521 LOCAL_OBJECT_START(_0p125_to_0p25_data_tail)
522 // Polynomial coefficients for the tanh(x), 1/8 <= |x| < 1/4 
// Tail coefficients A13 down to A4 for the same range as
// _0p125_to_0p25_data: ten 16-byte entries (two data8 words each), 160 bytes.
523 data8 0x9D7D206E97ADC83A, 0x0000BFCC //A13 = -5.4639895428711257047470806445e-16
524 data8 0xA8972B666A845810, 0x00003FD3 //A12 = 7.4869224589947988668562043110e-14
525 data8 0x9A5B31511C9F4698, 0x0000BFD4 //A11 = -1.3709586467430093373657009487e-13
526 data8 0xCBB8047BCB274982, 0x0000BFDA //A10 = -1.1580074124926108509393610532e-11
527 data8 0xF95EB849E5F9247C, 0x00003FDC //A9 = 5.6700173336564916962945623180e-11
528 data8 0xE7893404C6A53386, 0x00003FE1 //A8 = 1.6846457582993065168777704528e-09
529 data8 0xF2E5C7E2B5F55ECC, 0x0000BFE4 //A7 = -1.4138500046802141367543484859e-08
530 data8 0xF43906FF53A002C0, 0x0000BFE8 //A6 = -2.2745017243678613107034288816e-07
531 data8 0xC6175D5E47D1D259, 0x00003FEC //A5 = 2.9517899220726077077586632607e-06
532 data8 0xE7C2AE92CB36769B, 0x00003FEF //A4 = 2.7628001723157068127646694830e-05
533 LOCAL_OBJECT_END(_0p125_to_0p25_data_tail)
535 LOCAL_OBJECT_START(_0p25_to_0p5_data_tail)
536 // Polynomial coefficients for the tanh(x), 1/4 <= |x| < 1/2 
// Tail coefficients A13 down to A4 for the same range as
// _0p25_to_0p5_data: ten 16-byte entries (two data8 words each), 160 bytes.
537 data8 0x9E2972C008B9965E, 0x0000BFDC //A13 = -3.5961854154738002253192260213e-11
538 data8 0xC3EABA3D219BEA8A, 0x00003FDB //A12 = 2.2273173303628274478819473067e-11
539 data8 0xC50FB68D960D5CD9, 0x00003FE1 //A11 = 1.4338102430978399800743148719e-09
540 data8 0xB3BB92499EF2D583, 0x0000BFE3 //A10 = -5.2309100551458044083112632491e-09
541 data8 0xBD915BE632F1D04E, 0x0000BFE6 //A9 = -4.4137194873936112573773943707e-08
542 data8 0xBC48C813FA819141, 0x00003FE9 //A8 = 3.5070684356359066908197915734e-07
543 data8 0xD3E34EA031AC611B, 0x00003FEA //A7 = 7.8934400708919584259192272835e-07
544 data8 0x8EAC489D859541CD, 0x0000BFEF //A6 = -1.7007944944124693133572815137e-05
545 data8 0x98D4D7E5D1508B8A, 0x00003FEF //A5 = 1.8218924920302265989878708948e-05
546 data8 0xAC262F3F8CF49C02, 0x00003FF4 //A4 = 6.5669692402266433496312492412e-04
547 LOCAL_OBJECT_END(_0p25_to_0p5_data_tail)
549 LOCAL_OBJECT_START(_0p5_to_1_data_tail)
550 // Polynomial coefficients for the tanh(x), 1/2 <= |x| < 1 
// Tail coefficients A13 down to A4 for the same range as
// _0p5_to_1_data: ten 16-byte entries (two data8 words each), 160 bytes.
551 data8 0xDF67FB36FFA2A538, 0x00003FE7 //A13 = 1.0403160796697495720021114635e-07
552 data8 0xB7FB80FB5AFA63A4, 0x0000BFE8 //A12 = -1.7134699677764282023124981753e-07
553 data8 0xC87625A0BA7D6C5F, 0x0000BFEA //A11 = -7.4677732458471897291461679095e-07
554 data8 0x90DA375DD9AF6D79, 0x00003FED //A10 = 4.3169381418023765618186668159e-06
555 data8 0x82DFB03317B17316, 0x0000BFED //A9 = -3.9003426534601562552753368105e-06
556 data8 0xAA582FD4F3438BB4, 0x0000BFF0 //A8 = -4.0613288845040776435400454867e-05
557 data8 0xB1532D8CF763B21C, 0x00003FF2 //A7 = 1.6911021594787399557528570601e-04
558 data8 0x82E12AEF7CAB76C6, 0x0000BFEF //A6 = -1.5602059530458172761585925044e-05
559 data8 0x83256E3D0FBA5C93, 0x0000BFF6 //A5 = -2.0011324059500451791903108104e-03
560 data8 0xCC4AB2EC0965499B, 0x00003FF7 //A4 = 6.2344907419841579664122448353e-03
561 LOCAL_OBJECT_END(_0p5_to_1_data_tail)
563 LOCAL_OBJECT_START(_1_to_2_data_tail)
564 // Polynomial coefficients for the tanh(x), 1 <= |x| < 2.0 
// Tail coefficients A13 down to A4 for the same range as
// _1_to_2_data: ten 16-byte entries (two data8 words each), 160 bytes.
565 data8 0xCCAEE174EAC17F78, 0x0000BFEE //A13 = -1.2200065117856038355953618829e-05
566 data8 0xA39DD0981D1A2776, 0x0000BFF0 //A12 = -3.9009204899026604074167603200e-05
567 data8 0xB7104FA27FAF80D0, 0x00003FF2 //A11 = 1.7458316338540792661905876072e-04
568 data8 0xB219A7274436A734, 0x0000BFF3 //A10 = -3.3969918595931391572998415468e-04
569 data8 0xCCD9D03C0C73CECF, 0x00003FF2 //A9 = 1.9536097875337884986025498958e-04
570 data8 0x85321EA40CFEEBEE, 0x00003FF5 //A8 = 1.0162031558369402750607778300e-03
571 data8 0x81F272C08C308220, 0x0000BFF7 //A7 = -3.9656696618251138315464862909e-03
572 data8 0xE8761C6BDEA9ED87, 0x00003FF7 //A6 = 7.0941580558970243020090656343e-03
573 data8 0xAE4E9F3691F66877, 0x0000BFF6 //A5 = -2.6597155288710984120834711909e-03
574 data8 0xCC8286B331BD8AAA, 0x0000BFF9 //A4 = -2.4964583478826523250880337777e-02
575 LOCAL_OBJECT_END(_1_to_2_data_tail)
577 LOCAL_OBJECT_START(_2_to_3p25_data_tail)
578 // Polynomial coefficients for the tanh(x), 2 <= |x| < 3.25 
// Tail coefficients A13 down to A4 for the same range as
// _2_to_3p25_data: ten 16-byte entries (two data8 words each), 160 bytes.
579 data8 0x92E1711A3BD6408B, 0x0000BFF4 //A13 = -5.6030514548041036913731470443e-04
580 data8 0x8B9BD885FF3E98C5, 0x00003FF5 //A12 = 1.0651304064581604055612602669e-03
581 data8 0xD041356C7FA26A22, 0x0000BFF5 //A11 = -1.5888574328066952147023520244e-03
582 data8 0xDFA210BE9BE6B7FD, 0x00003FF5 //A10 = 1.7061849060196387827639060629e-03
583 data8 0x8ECC3606808028E9, 0x0000BFF4 //A9 = -5.4472999329435778312080340471e-04
584 data8 0xD5C053B8EEBD10C8, 0x0000BFF6 //A8 = -3.2615856552479930645151033322e-03
585 data8 0xB7BFD63AC5051539, 0x00003FF8 //A7 = 1.1215171059191957498023766643e-02
586 data8 0xC367C59D7FA3ADA2, 0x0000BFF9 //A6 = -2.3853193251842394834616848995e-02
587 data8 0x9FC9FB890BB053CF, 0x00003FFA //A5 = 3.9010984954739386625695104667e-02
588 data8 0xD01D077B42E7ED76, 0x0000BFFA //A4 = -5.0808934425896607486919526567e-02
589 LOCAL_OBJECT_END(_2_to_3p25_data_tail)
591 LOCAL_OBJECT_START(_4_to_6p5_data_tail)
592 // Polynomial coefficients for the tanh(x), 4 <= |x| < 6.5
// Ten "tail" coefficients, stored from A13 down to A4.
// Each entry is a pair of data8 words: the 64-bit significand followed by
// the sign/exponent word (0x0000Bxxx entries are the negative coefficients).
// This is the lower sub-range of the [4;8) binade; the upper part
// (6.5 <= |x| < 8) has its own table, _6p5_to_8_data_tail.
593 data8 0x870CCE8C76C52C7E, 0x00003FF5 //A13 = 1.0303499350193060915603525934e-03
594 data8 0xE1431E54AD2A738B, 0x0000BFF5 //A12 = -1.7186140560972621669872002486e-03
595 data8 0xAB20056533E28734, 0x00003FF6 //A11 = 2.6111615345168277554841545330e-03
596 data8 0xECCB91D64718B9BD, 0x0000BFF6 //A10 = -3.6132079169671860943878776041e-03
597 data8 0x94771DA3B8C2EB4F, 0x00003FF7 //A9 = 4.5308012699419563988381317896e-03
598 data8 0xA7497377E4946F2C, 0x0000BFF7 //A8 = -5.1051915941441437592654444804e-03
599 data8 0xA76B2D6FCA088AE9, 0x00003FF7 //A7 = 5.1092120989582196669504468168e-03
600 data8 0x928C8961F33C9560, 0x0000BFF7 //A6 = -4.4723196805537430568162704711e-03
601 data8 0xDBDDDF6CDE9AB9BE, 0x00003FF6 //A5 = 3.3548994514326736175581084349e-03
602 data8 0x896E211733AD9D40, 0x0000BFF6 //A4 = -2.0970183170010094667442967500e-03
603 LOCAL_OBJECT_END(_4_to_6p5_data_tail)
605 LOCAL_OBJECT_START(_8_to_13_data_tail)
606 // Polynomial coefficients for the tanh(x), 8 <= |x| < 13
// Ten "tail" coefficients, stored from A13 down to A4.
// Each entry is a pair of data8 words: the 64-bit significand followed by
// the sign/exponent word (0x0000Bxxx entries are the negative coefficients).
// This is the lower sub-range of the [8;16) binade; the upper part
// (13 <= |x| < 16) has its own table, _13_to_16_data_tail.
607 data8 0xE50C3476BED020AA, 0x00003FF0 //A13 = 5.4609221347524272615754239857e-05
608 data8 0xBA16F5F4EDC0EABC, 0x0000BFF0 //A12 = -4.4367239594986428539386662937e-05
609 data8 0x8B916C2F002C3D91, 0x00003FF0 //A11 = 3.3275617838067362533536610680e-05
610 data8 0xBFE8031097CB4442, 0x0000BFEF //A10 = -2.2877013297722792747267224605e-05
611 data8 0xEFE1FFD106B2DA41, 0x00003FEE //A9 = 1.4298129659899553350478452989e-05
612 data8 0x86EF1FF403A6622E, 0x0000BFEE //A8 = -8.0426979849841642112688693288e-06
613 data8 0x86EF200FD047306B, 0x00003FED //A7 = 4.0213490418736097707257704218e-06
614 data8 0xEC22782377882553, 0x0000BFEB //A6 = -1.7593402092805559754997565942e-06
615 data8 0xB119DA1DB7C47773, 0x00003FEA //A5 = 6.5975257917246601211360847253e-07
616 data8 0xDD6050A7761D67BB, 0x0000BFE8 //A4 = -2.0617268111985310661707082242e-07
617 LOCAL_OBJECT_END(_8_to_13_data_tail)
619 LOCAL_OBJECT_START(_16_to_22p8_data_tail)
620 // Polynomial coefficients for the tanh(x), 16 <= |x| < 22.88
// Ten "tail" coefficients, stored from A13 down to A4.
// Each entry is a pair of data8 words: the 64-bit significand followed by
// the sign/exponent word; every coefficient in this table is positive.
// Arguments in the [16;32) binade whose top 12 significand bits reach
// 0xb70 are diverted by tanhl to the saturation path instead of using
// this table.
621 data8 0xEAF4AF87336E81B1, 0x00003FEF //A13 = 2.8008914392791730186582989654e-05
622 data8 0xD5B309EA768E2711, 0x00003FED //A12 = 6.3687375204024238267961143128e-06
623 data8 0xA4048CA537113538, 0x00003FEB //A11 = 1.2220276227448617951538196845e-06
624 data8 0xD3EC78BB3425377D, 0x00003FE8 //A10 = 1.9736934193679794194181457250e-07
625 data8 0xE5763CD37440266E, 0x00003FE5 //A9 = 2.6712876934440631473215182284e-08
626 data8 0xCECA765EEB4A265F, 0x00003FE2 //A8 = 3.0092031912460315516888139627e-09
627 data8 0x99ABF588DF81A52E, 0x00003FDF //A7 = 2.7952722177649984066847682907e-10
628 data8 0xB9C78918294A4685, 0x00003FDB //A6 = 2.1120676552098603524020495036e-11
629 data8 0xB3A3C42AD539D50F, 0x00003FD7 //A5 = 1.2764169243389521270291967366e-12
630 data8 0x86BC347939478174, 0x00003FD3 //A4 = 5.9834437707863962671883176163e-14
631 LOCAL_OBJECT_END(_16_to_22p8_data_tail)
633 LOCAL_OBJECT_START(_3p25_to_4_data_tail)
634 // Polynomial coefficients for the tanh(x), 3.25 <= |x| < 4
// Ten "tail" coefficients, stored from A13 down to A4.
// Each entry is a pair of data8 words: the 64-bit significand followed by
// the sign/exponent word (0x0000Bxxx entries are the negative coefficients).
// Upper sub-range of the [2;4) binade. tanhl adds 0x280 to the tail offset
// when p8 is set, which is 4 tables of 10*16 bytes; NOTE(review): that
// matches this table sitting four slots after _2_to_3p25_data_tail, so the
// table order must not be changed -- confirm against the full data section.
635 data8 0xBE9A2BE19F21BA1C, 0x0000BFEE //A13 = -1.1360778336288065244475976873e-05
636 data8 0xF84910F515BDB014, 0x00003FED //A12 = 7.3994819819577018481862729782e-06
637 data8 0xC4C84FB788AA4007, 0x00003FEF //A11 = 2.3458298013663976251972482656e-05
638 data8 0x86CC6243C170E5ED, 0x0000BFF2 //A10 = -1.2855374755847770638424932233e-04
639 data8 0xD3065AC539ABABFF, 0x00003FF3 //A9 = 4.0249790677367806832685138089e-04
640 data8 0x82C4413795EC381B, 0x0000BFF5 //A8 = -9.9767013652382759950854031514e-04
641 data8 0x88D588720888899A, 0x00003FF6 //A7 = 2.0879228705174076794011525274e-03
642 data8 0xF4CA066137741469, 0x0000BFF6 //A6 = -3.7351861548964870836350490741e-03
643 data8 0xB998746D56E81737, 0x00003FF7 //A5 = 5.6639259807333999973200378964e-03
644 data8 0xE93FB2F48233275B, 0x0000BFF7 //A4 = -7.1181892208343798194003322900e-03
645 LOCAL_OBJECT_END(_3p25_to_4_data_tail)
647 LOCAL_OBJECT_START(_6p5_to_8_data_tail)
648 // Polynomial coefficients for the tanh(x), 6.5 <= |x| < 8.0
// Ten "tail" coefficients, stored from A13 down to A4.
// Each entry is a pair of data8 words: the 64-bit significand followed by
// the sign/exponent word (0x0000Bxxx entries are the negative coefficients).
// Upper sub-range of the [4;8) binade (selected in tanhl when p8 is set).
649 data8 0xA6881D7D21774BFD, 0x00003FEF //A13 = 1.9852125640303530752913966680e-05
650 data8 0x875E983AA042E605, 0x0000BFF0 //A12 = -3.2274606306629334402383651599e-05
651 data8 0xCB19E01E94FC133C, 0x00003FF0 //A11 = 4.8423069963831314927026982707e-05
652 data8 0x8BA5E8D9E72D56B2, 0x0000BFF1 //A10 = -6.6589395655200734237190902534e-05
653 data8 0xAE91F647ED4E46B2, 0x00003FF1 //A9 = 8.3241541003842930001632190258e-05
654 data8 0xC465A7E0B22F884E, 0x0000BFF1 //A8 = -9.3649431639051891449916386619e-05
655 data8 0xC4666148AA01A4D7, 0x00003FF1 //A7 = 9.3650780646160216748407869111e-05
656 data8 0xABD9E63D181B0C6C, 0x0000BFF1 //A6 = -8.1945023256769295802996591839e-05
657 data8 0x80E38B18E509387A, 0x00003FF1 //A5 = 6.1458988764532931141264026311e-05
658 data8 0xA11C80E20ADA5A64, 0x0000BFF0 //A4 = -3.8411937140983728563216440713e-05
659 LOCAL_OBJECT_END(_6p5_to_8_data_tail)
661 LOCAL_OBJECT_START(_13_to_16_data_tail)
662 // Polynomial coefficients for the tanh(x), 13 <= |x| < 16
// Ten "tail" coefficients, stored from A13 down to A4.
// Each entry is a pair of data8 words: the 64-bit significand followed by
// the sign/exponent word (0x0000Bxxx entries are the negative coefficients).
// Upper sub-range of the [8;16) binade (selected in tanhl when p8 is set).
663 data8 0x9D6CCDA4767CA6D9, 0x00003FE5 //A13 = 1.8326683535066775712253572575e-08
664 data8 0xFFAF154F334BF403, 0x0000BFE4 //A12 = -1.4882762852665077172347508377e-08
665 data8 0xBFC68FA7C61B6C17, 0x00003FE4 //A11 = 1.1162810813806544919835662888e-08
666 data8 0x83D8439A6B19A015, 0x0000BFE4 //A10 = -7.6743763372603959795701788561e-09
667 data8 0xA4CE5BE9DC6A2962, 0x00003FE3 //A9 = 4.7964885012772346158732715382e-09
668 data8 0xB96826C0697253CA, 0x0000BFE2 //A8 = -2.6980246373950994097953903952e-09
669 data8 0xB96826CADDC00E35, 0x00003FE1 //A7 = 1.3490123232313844006540534789e-09
670 data8 0xA23B21F1155DF322, 0x0000BFE0 //A6 = -5.9019289132168830718664922372e-10
671 data8 0xF358B2E9A50C349C, 0x00003FDE //A5 = 2.2132233424669131155945897524e-10
672 data8 0x98176FD2074C1D77, 0x0000BFDD //A4 = -6.9163229452106125388824134881e-11
673 LOCAL_OBJECT_END(_13_to_16_data_tail)
675 LOCAL_OBJECT_START(_0_to_1o8_data)
676 // Polynomial coefficients for the tanh(x), 0.0 <= |x| < 0.125
// Seven odd-power coefficients, stored from A15 down to A3.
// Each entry is a pair of data8 words: the 64-bit significand followed by
// the sign/exponent word (0x0000Bxxx entries are the negative coefficients).
// The _0_to_1o8 path of tanhl reads them with ldfe through two pointers
// (rDataPtr + 0x11e0 and rDataPtr + 0x11f0) that step by 32 bytes, so the
// entry order and the 16-byte entry size are relied upon by the code.
// NOTE(review): 0x11e0 is presumably this object's offset from tanhl_data;
// confirm against the start of the data section.
677 data8 0xBA0EC1879495150B, 0x0000BFF5 // A15 = -1.4195071451378679802688367813e-03
678 data8 0xEB5A82898D1BCBA4, 0x00003FF6 // A13 = 3.5912102408030526706365632879e-03
679 data8 0x91370DAFE0B64438, 0x0000BFF8 // A11 = -8.8632234251336964576640807982e-03
680 data8 0xB327A435358F1200, 0x00003FF9 // A9 = 2.1869488447622383899199238857e-02
681 data8 0xDD0DD0DD07A0775F, 0x0000BFFA // A7 = -5.3968253967902161405327069187e-02
682 data8 0x888888888887C299, 0x00003FFC // A5 = 1.3333333333333264660338062012e-01
683 data8 0xAAAAAAAAAAAAAA98, 0x0000BFFD // A3 = -3.3333333333333333282255458755e-01
684 LOCAL_OBJECT_END(_0_to_1o8_data)
687 .section .text
// tanhl -- entry point, argument classification and main polynomial path.
//
// The argument is read from f8 and the result is written back to f8.
//
// Dispatch performed below:
//   * fclass mask 0xEF matches (zero, unnormal, infinity, NaN) -> tanhl_spec
//   * rInterval < 0  (exponent below 0xfffc)                   -> _0_to_1o8
//   * rInterval > 7, or rInterval == 7 with the top 12
//     significand bits >= 0xb70                                -> _saturation
//   * everything else falls through to the polynomial evaluation.
//
// Predicates set here and used by every path:
//   p15 / p14 : argument is negative / is not negative
//   p8  / p10 : argument is / is not in the upper sub-range of a binade at
//               or above [2;4) (top 12 significand bits >= 0xd01)
//
// Main path: the reduced argument x is the significand of the input rescaled
// to [1;2) (fmerge.se with f1) minus 1.5 (p10) or minus 2.0 (p8).  The code
// then evaluates
//     A0 + A1*x + (A3*x + A2)*x^2 + x^4*(A4 + ... + A14*x^10
//                                        + x^11*(A15 + ... + A25*x^10))
// with A0, A1, A2 supplied as high/low pairs and the leading terms carried
// as separate high (fResH) and low (fResL) parts; the sign of the input is
// applied in the last steps.
688 GLOBAL_LIBM_ENTRY(tanhl)
690 { .mfi
691       alloc          r32         = ar.pfs, 0, 21, 0, 0 
692       fmerge.se      fArgAbsNorm = f1, f8      // normalized x (1.0 <= x < 2.0)
693       addl           rSignBit    = 0x20000, r0 // Set sign bit for exponent
695 { .mlx
696       addl           rDataPtr    = @ltoff(tanhl_data), gp // Get common data ptr
697       movl           r1p5        = 0x3FF8000000000000    // 1.5 in dbl repres.
700 { .mfi
701       getf.exp       rArgExp     = f8              // Get arg exponent
702       fclass.m       p6,p0       = f8, 0xEF // Filter 0, denormals and specials
703                             // 0xEF = @qnan|@snan|@pos|@neg|@zero|@unorm|@inf
704       addl           rBias       = 0xfffc, r0 // Value to subtract from exp
705                                             // to get actual interval number
707 { .mfi
708       ld8            rDataPtr    = [rDataPtr]  // Get real common data pointer
709       fma.s1         fArgSqr     = f8, f8, f0  // x^2 (for [0;1/8] path)
710       addl           r2to4       = 0x10000, r0 // biased exponent
711                                              // for [2;4) binary interval
714 { .mfi
715       getf.sig       rArgSig     = f8              // Get arg significand
716       fcmp.lt.s1     p15, p14    = f8, f0          // Is arg negative/positive?
717       addl           rSaturation = 0xb70, r0       // First 12 bits of
718                                                    // saturation value signif.
720 { .mfi
721       setf.d         f1p5        = r1p5            // 1.5 construction
722       fma.s1         f2p0        = f1,f1,f1        // 2.0 construction
723       addl           r1625Sign   = 0xd01, r0       // First 12 bits of
724                                                    // 1.625 value signif.
725       // 1.625 significand used to filter values greater than 3.25, 6.5, 13.0
728 { .mfi
729       addl           rTailDataPtr = 0xB00, rDataPtr  // Pointer to "tail" data
730       fmerge.s       fSignumX = f8, f1            // signum(x)
731       andcm          rArgExp     = rArgExp, rSignBit // Remove sign of exp
733 { .mfb
734       addl           rTiny       = 0xf000, r0 // Tiny value for saturation path
735       nop.f          0
736 (p6)  br.cond.spnt   tanhl_spec               // Branch to zero, denorm & specs
739 { .mfi
740       sub            rInterval   = rArgExp, rBias // Get actual interval number
741       nop.f          0
742       shr.u          rArgSig     = rArgSig, 52    // Leave only the top 12 bits
                                                   // of the significand
744 { .mfi
745       adds           rShiftedDataPtr = 0x10, rDataPtr // Second ptr to data
746       nop.f          0
747       cmp.ge         p8, p10     = rArgExp, r2to4  // If exp >= 2to4 interval?
750 { .mfi
751 (p8)  cmp.le         p8, p10     = r1625Sign, rArgSig // If signd is greater
752                             //  than 1.625? (arg is at one of binary subranges)
753       nop.f          0
754       shl            rOffset     = rInterval, 8 // Make offset from
755                                               // interval number
757 { .mfi
758       cmp.gt         p9, p0      = 0x0, rInterval // If interval is less than 0
759                                                 // (means arg is in [0; 1/8])
760       nop.f          0
761       cmp.eq         p7, p0      = 0x7, rInterval // If arg is in [16;] interv.?
764 { .mfi
765 (p8)  adds           rOffset     = 0x400, rOffset // Add additional offset
766                             //  (arg is at one of binary subranges)
767       fma.s1         fArgCube    = fArgSqr, f8, f0  // x^3 (for [0;1/8] path)
768       shl            rTailOffset = rInterval, 7  // Make offset to "tail" data
769                                                  // from interval number
771 { .mib
772       setf.exp       fTiny       = rTiny // Construct "tiny" value
773                                        // for saturation path
774       cmp.ltu        p11, p0     = 0x7, rInterval // if interval > 7
                                                   // (|arg| >= 32)
775 (p9)  br.cond.spnt   _0_to_1o8       // Branch to the [0;1/8] path
778 { .mfi
779       add            rAddr1      = rDataPtr, rOffset // Get address for
780                                                    // interval data
781       nop.f          0
782       shl            rTailAddOffset = rInterval, 5 // Offset to interval
783                                                    // "tail" data
785 { .mib
786       add            rAddr2      = rShiftedDataPtr, rOffset // Get second
787                                                  // address for interval data
788 (p7)  cmp.leu        p11, p0     = rSaturation, rArgSig // if arg is
789                                                         // in [22.8;32] interval
790 (p11) br.cond.spnt   _saturation // Branch to Saturation path
793 { .mmi
794       ldfe           fA3         = [rAddr1], 0x90 // Load A3
795       ldfpd          fA2H, fA2L  = [rAddr2], 16 // Load A2High, A2Low
796       add            rTailOffset = rTailOffset, rTailAddOffset // "Tail" offset
                                    // = interval*(128+32) = interval*160 bytes
799 { .mmi
800       ldfe           fA20        = [rAddr1], 16 // Load A20
801       ldfpd          fA1H, fA1L  = [rAddr2], 16 // Load A1High, A1Low
802 (p8)  adds           rTailOffset = 0x280, rTailOffset // Additional offset
803                                     //  (arg is at one of binary subranges)
806 { .mmi
807       ldfe           fA19        = [rAddr1], 16 // Load A19
808       ldfpd          fA0H, fA0L  = [rAddr2], 16 // Load A0High, A0Low
809       add            rTailAddr1  = rTailDataPtr, rTailOffset // First tail
810                                                            // data address
813 .pred.rel "mutex",p8,p10
814 { .mfi
815       ldfe           fA18        = [rAddr1], 16 // Load A18
816 (p8)  fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f2p0 // Subtract 2.0
817                             //  (arg is at one of binary subranges)
818       adds           rTailAddr2  = 0x10, rTailAddr1  // Second tail
819                                                      // data address
821 { .mfi
822       ldfe           fA25        = [rAddr2], 16 // Load A25
823 (p10) fms.s1         fArgAbsNorm = fArgAbsNorm, f1, f1p5  // Subtract 1.5
824                                                 // from normalized arg
825       nop.i          0
828 { .mmi
829       ldfe           fA17        = [rAddr1], 16 // Load A17
830       ldfe           fA24        = [rAddr2], 16 // Load A24
831       nop.i          0
834 { .mmi
835       ldfe           fA16        = [rAddr1], 16 // Load A16
836       ldfe           fA23        = [rAddr2], 16 // Load A23
837       nop.i          0
840 { .mmi
841       ldfe           fA15        = [rAddr1], 16 // Load A15
842       ldfe           fA22        = [rAddr2], 16 // Load A22
843       nop.i          0
846 { .mmi
847       ldfe           fA14        = [rAddr1], 16 // Load A14
848       ldfe           fA21        = [rAddr2], 16 // Load A21
849       nop.i          0
852 { .mfi
853       ldfe           fA13        = [rTailAddr1], 32              // Load A13
854       fms.s1         fArgAbsNorm2 = fArgAbsNorm, fArgAbsNorm, f0 // x^2
855       nop.i          0
857 { .mfi
858       ldfe           fA12        = [rTailAddr2], 32 // Load A12
859       nop.f          0
860       nop.i          0
863 { .mfi
864       ldfe           fA11        = [rTailAddr1], 32       // Load A11
865       fma.s1         fRes3H      = fA3, fArgAbsNorm, fA2H // Res3H = A3*x + A2H
866       nop.i          0
868 { .mfi
869       ldfe           fA10        = [rTailAddr2], 32     // Load A10
870       fma.s1         fTH         = fA3, fArgAbsNorm, f0 // TH = A3*x
871       nop.i          0
874 { .mfi
875       ldfe           fA9         = [rTailAddr1], 32      // Load A9
876       fma.s1         fTT2        = fA1L, fArgAbsNorm, f0 // TT2 = A1L*x
877       nop.i          0
879 { .mfi
880       ldfe           fA8         = [rTailAddr2], 32 // Load A8
881       nop.f          0
882       nop.i          0
885 { .mmi
886       ldfe           fA7         = [rTailAddr1], 32 // Load A7
887       ldfe           fA6         = [rTailAddr2], 32 // Load A6
888       nop.i          0
891 { .mmi
892       ldfe           fA5         = [rTailAddr1], 32 // Load A5
893       ldfe           fA4         = [rTailAddr2], 32 // Load A4
894       nop.i          0
897 { .mfi
898       nop.m          0
899       fms.s1         fArgAbsNorm2L = fArgAbsNorm, fArgAbsNorm, fArgAbsNorm2
900                                                   // Low part of x^2 (delta)
901       nop.i          0
903 { .mfi
904       nop.m          0
905       fms.s1         fArgAbsNorm4  = fArgAbsNorm2, fArgAbsNorm2, f0 // x^4
906       nop.i          0
909 { .mfi
910       nop.m          0
911       fms.s1         fRes3L      = fA2H, f1, fRes3H // Res3L = A2H - Res3H
912       nop.i          0
915 { .mfi
916       nop.m          0
917       fms.s1         fArgAbsNorm3 = fArgAbsNorm2, fArgAbsNorm, f0 // x^3
918       nop.i          0
920 { .mfi
921       nop.m          0
922       fma.s1         fTH2        = fA1H, fArgAbsNorm, fTT2 // TH2 = A1H*x + TT2
923       nop.i          0
926 { .mfi
927       nop.m          0
928       fma.s1         fA23        = fA24,  fArgAbsNorm, fA23 // Polynomial tail
929       nop.i          0
931 { .mfi 
932       nop.m          0
933       fma.s1         fA21        = fA22,  fArgAbsNorm, fA21 // Polynomial tail
934       nop.i          0
937 { .mfi
938       nop.m          0
939       fma.s1         fA12        = fA13,  fArgAbsNorm, fA12 // Polynomial tail
940       nop.i          0
944 { .mfi
945       nop.m          0
946       fma.s1         fRes3L      = fRes3L, f1, fTH // Res3L += A3*x
947       nop.i          0
949 { .mfi 
950       nop.m          0
951       fma.s1         fA19        = fA20,  fArgAbsNorm, fA19 // Polynomial tail
952       nop.i          0
955 { .mfi
956       nop.m          0
957       fma.s1         fRes1H      = fTH2, f1, fA0H // Res1H = TH2 + A0H
958       nop.i          0
960 { .mfi 
961       nop.m          0
962       fms.s1         fTL2        = fA1H, fArgAbsNorm, fTH2 // TL2 = A1H*x - TH2
963       nop.i          0
966 { .mfi
967       nop.m          0
968       fma.s1         fA8         = fA9,  fArgAbsNorm, fA8 // Polynomial tail
969       nop.i          0
971 { .mfi 
972       nop.m          0
973       fma.s1         fA10        = fA11,  fArgAbsNorm, fA10 // Polynomial tail
974       nop.i          0
977 { .mfi
978       nop.m          0
979       fma.s1         fA15        = fA16,  fArgAbsNorm, fA15 // Polynomial tail
980       nop.i          0
982 { .mfi
983       nop.m          0
984       fma.s1         fA17        = fA18,  fArgAbsNorm, fA17 // Polynomial tail
985       nop.i          0
988 { .mfi
989       nop.m          0
990       fms.s1         fArgAbsNorm11 = fArgAbsNorm4, fArgAbsNorm4, f0 // x^8
991       nop.i          0
993 { .mfi 
994       nop.m          0
995       fma.s1         fA4         = fA5,  fArgAbsNorm, fA4 // Polynomial tail
996       nop.i          0
999 { .mfi
1000       nop.m          0
1001       fma.s1         fRes3L      = fRes3L, f1, fA2L // Res3L += A2L
1002       nop.i          0
1004 { .mfi 
1005       nop.m          0
1006       fma.s1         fA6         = fA7,  fArgAbsNorm, fA6 // Polynomial tail
1007       nop.i          0
1010 { .mfi
1011       nop.m          0
1012       fma.s1         fTL2        = fTL2, f1, fTT2 // TL2 += TT2
1013       nop.i          0
1015 { .mfi 
1016       nop.m          0
1017       fms.s1         fRes1L      = fA0H, f1, fRes1H // Res1L = A0H - Res1H
1018       nop.i          0
1021 { .mfi
1022       nop.m          0
1023       fma.s1         fA23        = fA25,  fArgAbsNorm2, fA23 // Polynomial tail
1024       nop.i          0
1026 { .mfi 
1027       nop.m          0
1028       fma.s1         fA12        = fA14,  fArgAbsNorm2, fA12 // Polynomial tail
1029       nop.i          0
1032 { .mfi
1033       nop.m          0
1034       fma.s1         fA19        = fA21,  fArgAbsNorm2, fA19  // Polynomial tail
1035       nop.i          0
1037 { .mfi 
1038       nop.m          0
1039       fma.s1         fA8         = fA10,  fArgAbsNorm2, fA8 // Polynomial tail
1040       nop.i          0
1043 { .mfi
1044       nop.m          0
1045       fma.s1         fA15        = fA17,  fArgAbsNorm2, fA15 // Polynomial tail
1046       nop.i          0
1048 { .mfi 
1049       nop.m          0
1050       fms.s1         fArgAbsNorm11 = fArgAbsNorm11, fArgAbsNorm3, f0 // x^11
1051       nop.i          0
1054 { .mfi
1055       nop.m          0
1056       fma.s1         fTT         = fRes3L, fArgAbsNorm2, f0 // TT = Res3L*x^2
1057       nop.i          0
1059 { .mfi 
1060       nop.m          0
1061       fma.s1         fA4         = fA6,  fArgAbsNorm2, fA4 // Polynomial tail
1062       nop.i          0
1065 { .mfi
1066       nop.m          0
1067       fma.s1         fRes1L      = fRes1L, f1, fTH2 // Res1L += TH2
1068       nop.i          0
1070 { .mfi
1071       nop.m          0
1072       fms.s1         fArgAbsNorm4X  = fArgAbsNorm4, fSignumX, f0 // x^4 * signum
1073       nop.i          0
1076 { .mfi
1077       nop.m          0
1078       fma.s1         fA19        = fA23,  fArgAbsNorm4, fA19 // Polynomial tail
1079       nop.i          0
1081 { .mfi 
1082       nop.m          0
1083       fma.s1         fA8         = fA12,  fArgAbsNorm4, fA8 // Polynomial tail
1084       nop.i          0
1087 { .mfi
1088       nop.m          0
1089       fma.s1         fTT         = fRes3H, fArgAbsNorm2L, fTT // TT += Res3H *
                                                    // (low part of x^2)
1090       nop.i          0
1093 { .mfi
1094       nop.m          0
1095       fma.s1         fRes1L      = fRes1L, f1, fTL2 // Res1L += TL2
1096       nop.i          0
1099 { .mfi
1100       nop.m          0
1101       fma.s1         fA15        = fA19,  fArgAbsNorm4, fA15 // Polynomial tail
1102       nop.i          0
1104 { .mfi
1105       nop.m          0
1106       fma.s1         fA4         = fA8,  fArgAbsNorm4, fA4 // Polynomial tail
1107       nop.i          0
1110 { .mfi
1111       nop.m          0
1112       fma.s1         fRes2H      = fRes3H, fArgAbsNorm2, fTT // Res2H =
                                                    // Res3H*x^2 + TT
1113       nop.i          0
1116 { .mfi
1117       nop.m          0
1118       fma.s1         fRes1L      = fRes1L, f1, fA0L // Res1L += A0L
1119       nop.i          0
1122 { .mfi
1123       nop.m          0
1124       fma.s1         fRes4       = fA15, fArgAbsNorm11, fA4 // Result of
1125                                                     // polynomial tail
1126       nop.i          0
1129 { .mfi
1130       nop.m          0
1131       fms.s1         fRes2L      = fRes3H, fArgAbsNorm2, fRes2H // Res2L =
                                                    // Res3H*x^2 - Res2H
1132       nop.i          0
1134 { .mfi 
1135       nop.m          0
1136       fma.s1         fResH       = fRes2H, f1, fRes1H // High result
1137       nop.i          0
1140 { .mfi
1141       nop.m          0
1142 (p14) fma.s1         fRes1L      = fRes4, fArgAbsNorm4X, fRes1L // Res1L =
                                       // tail*(x^4*signum) + Res1L
1143       nop.i          0
1145 { .mfi
1146       nop.m          0
1147 (p15) fms.s1         fRes1L      = fRes4, fArgAbsNorm4X, fRes1L // Res1L =
                                       // tail*(x^4*signum) - Res1L
1148       nop.i          0
1151 { .mfi 
1152       nop.m          0
1153       fma.s1         fRes2L      = fRes2L, f1, fTT // Res2L += TT
1154       nop.i          0
1156 { .mfi 
1157       nop.m          0
1158       fms.s1         fResL       = fRes1H, f1, fResH // Low result:
                                                    // ResL = Res1H - ResH
1159       nop.i          0
1162 { .mfi
1163       nop.m          0
1164       fma.s0         fRes1L      = fRes2L, fSignumX, fRes1L // Low result
1165                  // .s0 - for symmetry issue resolving at +/-inf rounding mode
1166       nop.i          0
1168 { .mfi 
1169       nop.m          0
1170       fma.s1         fResL       = fResL, f1, fRes2H // Low result:
                                                    // ResL += Res2H
1171       nop.i          0
1174 { .mfi
1175       nop.m          0
1176 (p14) fma.s0         fResL       = fRes1L, f1, fResL // Low result
1177                  // .s0 - for symmetry issue resolving at +/-inf rounding mode
1178       nop.i          0
1180 { .mfi
1181       nop.m          0
1182 (p15) fms.s0         fResL     = fRes1L, f1, fResL // Low result
1183                  // .s0 - for symmetry issue resolving at +/-inf rounding mode
1184       nop.i          0
1187 .pred.rel "mutex",p14,p15
1188 { .mfi 
1189       nop.m          0
1190 (p14) fma.s0         f8          = fResL, f1,  fResH// Add high and low results
1191       nop.i          0
1193 { .mfb 
1194       nop.m          0
1195 (p15) fms.s0         f8          = fResL, f1, fResH // Low minus high: the low
                                       // part already carries the negated sum
1196       br.ret.sptk    b0      // Main path return
1199 //  saturation path ///////////////////////////////////////////////////////////
// Reached from the main path when p11 is set (interval above 7, or interval 7
// with the top 12 significand bits >= 0xb70).
// fTiny was built at entry with setf.exp from rTiny = 0xf000.
// The result is +(1 - tiny) for non-negative arguments (p14) and
// -(1 - tiny) for negative arguments (p15); exactly one of the two
// predicated instructions executes.
1200 _saturation:
1202 .pred.rel "mutex",p14,p15
1203 { .mfi 
1204       nop.m          0
1205 (p14) fms.s0            f8          = f1, f1, fTiny // Saturation result r = 1-tiny
1206       nop.i 0
1208 { .mfb 
1209       nop.m          0
1210 (p15) fnma.s0           f8          = f1, f1, fTiny // Saturation result r = tiny-1
1211       br.ret.sptk    b0     // Saturation path return
1215 //  0, denormals and special IEEE numbers path /////////////////////////////////
// Reached from the entry code when fclass mask 0xEF matched the argument.
// Three cases are handled in order:
//   1. infinities    -> +/-1 (sign of the argument merged onto 1.0)
//   2. NaNs, zeros   -> x*1 + x computed with .s0
//   3. anything else -> the argument is normalized, then the result is
//                       r - r^2 for non-negative r (p14) or r + r^2 for
//                       negative r (p15).
// p14/p15 are still the sign predicates set by the fcmp at function entry.
1216 tanhl_spec:
1218 { .mfi 
1219       nop.m          0
1220       fclass.m       p6,p0       = f8, 0x23 // To filter infinities
1221                                           // 0x23 = @pos|@neg|@inf
1222       nop.i          0
1225 { .mfi 
1226       nop.m          0
1227       fclass.m       p7,p0       = f8, 0xC7 // To filter NaNs & Zeros
1228                                  // 0xC7 = @pos|@neg|@zero|@qnan|@snan
1229       nop.i          0
1232 { .mfb 
1233       nop.m          0
1234 (p6)  fmerge.s       f8          = f8, f1     // +/-1 for INF args
1235 (p6)  br.ret.spnt    b0                       // exit for x = INF
1238 { .mfb 
1239       nop.m          0
1240 (p7)  fma.s0         f8          = f8, f1, f8    // +/-0 for 0 args
1241                                                  // and NaNs for NaNs
1242 (p7)  br.ret.spnt    b0                          // exit for x = NaN or +/-0
1245 { .mfi 
1246       nop.m          0
1247       fnorm.s0       f8          = f8            // Normalize arg
1248       nop.i          0
1251 .pred.rel "mutex",p14,p15
1252 { .mfi 
1253       nop.m          0
1254 (p14) fnma.s0        f8          = f8, f8, f8  // res = r-r^2
1255       nop.i          0
1257 { .mfb 
1258       nop.m          0
1259 (p15) fma.s0         f8          = f8, f8, f8  // res = r+r^2
1260       br.ret.sptk    b0          // 0, denormals, IEEE specials return
1264 //  0 < |x| < 1/8 path /////////////////////////////////////////////////////////
// Reached from the entry code when rInterval is negative (p9).
// fArgSqr (x^2) and fArgCube (x^3) were already computed before the branch.
// The result is
//     x + x^3 * (A3 + A5*x^2 + A7*x^4 + A9*x^6 + A11*x^8 + A13*x^10 + A15*x^12)
// The coefficients are read through two interleaved pointers with stride 32:
// rAddr1 picks up A15, A11, A7, A3 and rAddr2 picks up A13, A9, A5.
// No sign handling is needed here: only odd powers of x are involved.
1265 _0_to_1o8:
1267 { .mmi 
1268       adds           rAddr1      = 0x11e0, rDataPtr // Ptr 1 to coeffs
1269       adds           rAddr2      = 0x11f0, rDataPtr // Ptr 2 to coeffs
1270       nop.i          0
1273 { .mmi 
1274       ldfe           fA15        = [rAddr1], 32 // Load A15
1275       ldfe           fA13        = [rAddr2], 32 // Load A13
1276       nop.i          0
1279 { .mmi 
1280       ldfe           fA11        = [rAddr1], 32 // Load A11
1281       ldfe           fA9         = [rAddr2], 32 // Load A9
1282       nop.i          0
1285 { .mmi 
1286       ldfe           fA7         = [rAddr1], 32 // Load A7
1287       ldfe           fA5         = [rAddr2]  // Load A5
1288       nop.i          0
1291 { .mfi 
1292       ldfe           fA3         = [rAddr1] // Load A3
1293       fma.s1         fA11        = fA13, fArgSqr, fA11 // A13*x^2 + A11
1294       nop.i          0
1296 { .mfi 
1297       nop.m          0
1298       fma.s1         fArgFour    = fArgSqr, fArgSqr, f0 // x^4
1299       nop.i          0
1303 { .mfi 
1304       nop.m          0
1305       fma.s1         fA3         = fA5, fArgSqr, fA3 // A5*x^2 + A3
1306       nop.i          0
1308 { .mfi 
1309       nop.m          0
1310       fma.s1         fA7         = fA9, fArgSqr, fA7 // A9*x^2 + A7
1311       nop.i          0
1315 { .mfi 
1316       nop.m          0
1317       fma.s1         fA11        = fA15, fArgFour, fA11 // A15*x^4 + (A13*x^2 + A11)
1318       nop.i          0
1321 { .mfi 
1322       nop.m          0
1323       fma.s1         fA3         = fA7, fArgFour, fA3 // (A9*x^2 + A7)*x^4 + (A5*x^2 + A3)
1324       nop.i          0
1326 { .mfi 
1327       nop.m          0
1328       fma.s1         fArgEight   = fArgFour, fArgFour, f0 // x^8
1329       nop.i          0
1332 { .mfi 
1333       nop.m          0
1334       fma.s1         fRes        = fA11, fArgEight, fA3 //Polynomial tail result
1335       nop.i          0
1338 { .mfb 
1339       nop.m          0
1340       fma.s0         f8          = fRes, fArgCube, f8 // (Polynomial tail)*x^3 + x
1341       br.ret.sptk    b0          // [0;1/8] interval return
1343   
1344 GLOBAL_LIBM_END(tanhl)