/* Copyright (C) 2000, 2001, 2003, 2005, 2009 Free Software Foundation, Inc.
   Contributed by James E. Wilson <wilson@cygnus.com>.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING.  If not, write to
   the Free Software Foundation, 51 Franklin Street, Fifth Floor,
   Boston, MA 02110-1301, USA.  */

/* As a special exception, if you link this library with other files,
   some of which are compiled with GCC, to produce an executable,
   this library does not by itself cause the resulting executable
   to be covered by the GNU General Public License.
   This exception does not however invalidate any other reasons why
   the executable file might be covered by the GNU General Public License.  */
// Compute a 80-bit IEEE double-extended quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// __divtf3 is an alternate symbol name for backward compatibility.
//
// NOTE(review): the .proc/.global directives and the function label for
// this routine fall in lines elided from this extract.

	// p7 starts true (r0 == r0 always holds); it is cleared below when
	// frcpa requests software refinement, so p7 marks the "frcpa already
	// produced the final result" path.
	cmp.eq p7, p0 = r0, r0
	// f10 = initial approximation of 1/farg1; p6 = 1 when the
	// Newton-Raphson refinement below must run.
	frcpa.s0 f10, p6 = farg0, farg1
	(p6) cmp.ne p7, p0 = r0, r0	// refinement path: clear p7
	.pred.rel.mutex p6, p7		// tell the assembler p6/p7 never both set
	// Newton-Raphson refinement, all predicated on p6.  With y0 = f10:
	(p6) fnma.s1 f11 = farg1, f10, f1	// f11 = 1 - b*y0   (error e)
	(p6) fma.s1 f12 = farg0, f10, f0	// f12 = a*y0       (quotient est. q0)
	(p6) fma.s1 f13 = f11, f11, f0		// f13 = e^2
	(p6) fma.s1 f14 = f11, f11, f11		// f14 = e^2 + e
	// Accumulate higher-order error terms and fold them into the
	// reciprocal approximation.
	(p6) fma.s1 f11 = f13, f13, f11
	(p6) fma.s1 f13 = f14, f10, f10
	(p6) fma.s1 f10 = f13, f11, f10		// refined reciprocal
	// Correct the quotient estimate with the exact remainder.
	(p6) fnma.s1 f11 = farg1, f12, farg0	// r = a - b*q0
	(p6) fma.s1 f11 = f11, f10, f12		// q = q0 + r*y
	(p6) fnma.s1 f12 = farg1, f10, f1	// remaining reciprocal error
	(p6) fma.s1 f10 = f12, f10, f10
	// Final remainder step; .s0 delivers the correctly rounded result in
	// the user-visible status field.
	(p6) fnma.s1 f12 = farg1, f11, farg0
	(p6) fma.s0 fret0 = f12, f10, f11
// Compute a 64-bit IEEE double quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// NOTE(review): the .proc/.global directives and the function label for
// this routine fall in lines elided from this extract.

	// p7 starts true; cleared below iff the refinement path (p6) runs.
	cmp.eq p7, p0 = r0, r0
	// f10 = initial approximation of 1/farg1; p6 => refine below.
	frcpa.s0 f10, p6 = farg0, farg1
	(p6) cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement (y0 = f10):
	(p6) fmpy.s1 f11 = farg0, f10		// q = a*y0
	(p6) fnma.s1 f12 = farg1, f10, f1	// e = 1 - b*y0
	(p6) fma.s1 f11 = f12, f11, f11		// q = q + e*q
	(p6) fmpy.s1 f13 = f12, f12		// e^2
	(p6) fma.s1 f10 = f12, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f11, f11		// q = q + e^2*q
	(p6) fmpy.s1 f12 = f13, f13		// e^4
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fma.d.s1 f11 = f12, f11, f11	// q = q + e^4*q, rounded to double
	(p6) fma.s1 f10 = f12, f10, f10		// y = y + e^4*y
	// Final correction: exact remainder in double precision, then the
	// correctly rounded double result in the user status field.
	(p6) fnma.d.s1 f8 = farg1, f11, farg0	// r = a - b*q
	(p6) fma.d fret0 = f8, f10, f11		// fret0 = q + r*y
// Compute a 32-bit IEEE float quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// farg0 holds the dividend.  farg1 holds the divisor.
//
// NOTE(review): the .proc/.global directives and the function label for
// this routine fall in lines elided from this extract.

	// p7 starts true; cleared below iff the refinement path (p6) runs.
	cmp.eq p7, p0 = r0, r0
	// f8 = initial quotient path; f10 = approx 1/farg1; p6 => refine.
	frcpa.s0 f10, p6 = farg0, farg1
	(p6) cmp.ne p7, p0 = r0, r0
	.pred.rel.mutex p6, p7
	// Newton-Raphson refinement of the quotient only (single precision
	// needs fewer terms than double/extended):
	(p6) fmpy.s1 f8 = farg0, f10		// q = a*y0
	(p6) fnma.s1 f9 = farg1, f10, f1	// e = 1 - b*y0
	(p6) fma.s1 f8 = f9, f8, f8		// q = q + e*q
	(p6) fmpy.s1 f9 = f9, f9		// e^2
	(p6) fma.s1 f8 = f9, f8, f8		// q = q + e^2*q
	(p6) fmpy.s1 f9 = f9, f9		// e^4
	(p6) fma.d.s1 f10 = f9, f8, f8		// q = q + e^4*q (double rounding)
	// Normalize/round to single precision in the user status field.
	(p6) fnorm.s.s0 fret0 = f10
// Compute a 64-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding and several setup/teardown
// instructions are elided from this extract.

	// Transfer inputs to FP registers.
	// (setf.sig instructions elided here.)

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Convert the inputs to FP, so that they won't be treated as unsigned.
	// (fcvt.xf instructions elided here.)

	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// 3 Newton-Raphson iterations.
	(p6) fnma.s1 f11 = f9, f10, f1		// e = 1 - b*y0
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fmpy.s1 f13 = f11, f11		// e^2
	(p6) fma.s1 f12 = f11, f12, f12		// q = q + e*q
	(p6) fma.s1 f10 = f11, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f12, f12		// q = q + e^2*q
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fnma.s1 f12 = f9, f11, f8		// r = a - b*q
	(p6) fma.s1 f10 = f12, f10, f11		// q = q + r*y

	// Round quotient to an integer.
	fcvt.fx.trunc.s1 f10 = f10

	// Transfer result to GP registers.
	// (getf.sig and return elided here.)
// Compute a 64-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// NOTE(review): the .proc/.global scaffolding and several setup/teardown
// instructions are elided from this extract.

	// Transfer inputs to FP registers.
	// (setf.sig instructions elided here.)

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Convert the inputs to FP, so that they won't be treated as unsigned.
	// (fcvt.xf instructions elided here.)

	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// 3 Newton-Raphson iterations.
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fnma.s1 f11 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f12 = f11, f12, f12		// q = q + e*q
	(p6) fmpy.s1 f13 = f11, f11		// e^2
	(p6) fma.s1 f10 = f11, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f12, f12		// q = q + e^2*q
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fnma.s1 f12 = f9, f11, f8		// r = a - b*q
	(p6) fma.s1 f10 = f12, f10, f11		// q = q + r*y

	fcvt.fx.trunc.s1 f10 = f10		// q = trunc(a/b), fixed point
	// Remainder: f10 = q*f9 + f14.  NOTE(review): f9 and f14 hold
	// fixed-point operands set up in elided lines (the full source
	// negates the divisor so this yields a - q*b) — confirm against the
	// complete file.
	xma.l f10 = f10, f9, f14

	// Transfer result to GP registers.
	// (getf.sig and return elided here.)
// Compute a 64-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding and several setup/teardown
// instructions are elided from this extract.

	// Transfer inputs to FP registers.
	// (setf.sig instructions elided here.)

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Convert the inputs to FP, to avoid FP software-assist faults.
	// (fcvt.xuf instructions elided here.)

	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// 3 Newton-Raphson iterations.
	(p6) fnma.s1 f11 = f9, f10, f1		// e = 1 - b*y0
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fmpy.s1 f13 = f11, f11		// e^2
	(p6) fma.s1 f12 = f11, f12, f12		// q = q + e*q
	(p6) fma.s1 f10 = f11, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f12, f12		// q = q + e^2*q
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fnma.s1 f12 = f9, f11, f8		// r = a - b*q
	(p6) fma.s1 f10 = f12, f10, f11		// q = q + r*y

	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10

	// Transfer result to GP registers.
	// (getf.sig and return elided here.)
// Compute a 64-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend (a).  in1 holds the divisor (b).
//
// NOTE(review): the .proc/.global scaffolding and several setup/teardown
// instructions are elided from this extract.

	// Transfer inputs to FP registers.
	// (setf.sig instructions elided here.)

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Convert the inputs to FP, to avoid FP software assist faults.
	// (fcvt.xuf instructions elided here.)

	// Compute the reciprocal approximation.
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// 3 Newton-Raphson iterations.
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fnma.s1 f11 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f12 = f11, f12, f12		// q = q + e*q
	(p6) fmpy.s1 f13 = f11, f11		// e^2
	(p6) fma.s1 f10 = f11, f10, f10		// y = y + e*y
	(p6) fma.s1 f11 = f13, f12, f12		// q = q + e^2*q
	(p6) fma.s1 f10 = f13, f10, f10		// y = y + e^2*y
	(p6) fnma.s1 f12 = f9, f11, f8		// r = a - b*q
	(p6) fma.s1 f10 = f12, f10, f11		// q = q + r*y

	// Round quotient to an unsigned integer.
	fcvt.fxu.trunc.s1 f10 = f10

	// Remainder: f10 = q*f9 + f14.  NOTE(review): f9 and f14 hold
	// fixed-point operands set up in elided lines (the full source
	// negates the divisor so this yields a - q*b) — confirm against the
	// complete file.
	xma.l f10 = f10, f9, f14

	// Transfer result to GP registers.
	// (getf.sig and return elided here.)
// Compute a 32-bit integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding, the sign-extension and
// setf/fcvt setup, and the result transfer are elided from this extract.

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Compute the reciprocal approximation (f8/f9 = FP copies of the
	// inputs, set up in elided lines).
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// Shortened Newton-Raphson: 32-bit operands need less precision.
	(p6) fmpy.s1 f8 = f8, f10		// q = a*y0
	(p6) fnma.s1 f9 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f8 = f9, f8, f8		// q = q + e*q
	// NOTE(review): f11 is loaded in elided lines (a small exponent
	// constant via setf.exp in the full source) — confirm.
	(p6) fma.s1 f9 = f9, f9, f11		// e^2 + rounding correction
	(p6) fma.s1 f10 = f9, f8, f8		// q = q + (e^2+c)*q

	fcvt.fx.trunc.s1 f10 = f10		// round quotient to integer
// Compute a 32-bit integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding, the sign-extension and
// setf/fcvt setup, and the result transfer are elided from this extract.

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Compute the reciprocal approximation (f8/f9 = FP copies of the
	// inputs, set up in elided lines).
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// Shortened Newton-Raphson: 32-bit operands need less precision.
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fnma.s1 f10 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f12 = f10, f12, f12		// q = q + e*q
	// NOTE(review): f11 is loaded in elided lines (a small exponent
	// constant via setf.exp in the full source) — confirm.
	(p6) fma.s1 f10 = f10, f10, f11		// e^2 + rounding correction
	(p6) fma.s1 f10 = f10, f12, f12		// q = q + (e^2+c)*q

	fcvt.fx.trunc.s1 f10 = f10		// q = trunc(a/b), fixed point
	// Remainder: f10 = q*f9 + f13.  NOTE(review): f13 set up in elided
	// lines (holds the dividend with negated divisor in the full source,
	// yielding a - q*b) — confirm.
	xma.l f10 = f10, f9, f13
// Compute a 32-bit unsigned integer quotient.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding, the zero-extension and
// setf/fcvt setup, and the result transfer are elided from this extract.

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Compute the reciprocal approximation (f8/f9 = FP copies of the
	// inputs, set up in elided lines).
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// Shortened Newton-Raphson: 32-bit operands need less precision.
	(p6) fmpy.s1 f8 = f8, f10		// q = a*y0
	(p6) fnma.s1 f9 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f8 = f9, f8, f8		// q = q + e*q
	// NOTE(review): f11 is loaded in elided lines (a small exponent
	// constant via setf.exp in the full source) — confirm.
	(p6) fma.s1 f9 = f9, f9, f11		// e^2 + rounding correction
	(p6) fma.s1 f10 = f9, f8, f8		// q = q + (e^2+c)*q

	fcvt.fxu.trunc.s1 f10 = f10	// round quotient to unsigned integer
// Compute a 32-bit unsigned integer modulus.
//
// From the Intel IA-64 Optimization Guide, choose the minimum latency
// algorithm.
//
// in0 holds the dividend.  in1 holds the divisor.
//
// NOTE(review): the .proc/.global scaffolding, the zero-extension and
// setf/fcvt setup, and the result transfer are elided from this extract.

	// Check divide by zero.
	cmp.ne.unc p0, p7 = 0, in1	// p7 = (in1 == 0)

	// Compute the reciprocal approximation (f8/f9 = FP copies of the
	// inputs, set up in elided lines).
	frcpa.s1 f10, p6 = f8, f9	// f10 ~= 1/f9; p6 => refine below

	// Shortened Newton-Raphson: 32-bit operands need less precision.
	(p6) fmpy.s1 f12 = f8, f10		// q = a*y0
	(p6) fnma.s1 f10 = f9, f10, f1		// e = 1 - b*y0
	(p6) fma.s1 f12 = f10, f12, f12		// q = q + e*q
	// NOTE(review): f11 is loaded in elided lines (a small exponent
	// constant via setf.exp in the full source) — confirm.
	(p6) fma.s1 f10 = f10, f10, f11		// e^2 + rounding correction
	(p6) fma.s1 f10 = f10, f12, f12		// q = q + (e^2+c)*q

	fcvt.fxu.trunc.s1 f10 = f10	// q = trunc(a/b), fixed point
	// Remainder: f10 = q*f9 + f13.  NOTE(review): f13 set up in elided
	// lines (holds the dividend with negated divisor in the full source,
	// yielding a - q*b) — confirm.
	xma.l f10 = f10, f9, f13
#ifdef L__save_stack_nonlocal
// Notes on save/restore stack nonlocal: We read ar.bsp but write
// ar.bspstore.  This is because ar.bsp can be read at all times
// (independent of the RSE mode) but since it's read-only we need to
// restore the value via ar.bspstore.  This is OK because
// ar.bsp==ar.bspstore after executing "flushrs".

// void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)

	.global __ia64_save_stack_nonlocal
	.proc __ia64_save_stack_nonlocal
__ia64_save_stack_nonlocal:
	// Fixed register frame: 2 inputs, no locals/outputs; r18 keeps the
	// previous ar.pfs.
	alloc r18 = ar.pfs, 2, 0, 0, 0
	// NOTE(review): the function body and the matching #endif are elided
	// from this extract.
	.endp __ia64_save_stack_nonlocal
#ifdef L__nonlocal_goto
// void __ia64_nonlocal_goto(void *target_label, void *save_area,
//			     void *static_chain);

	.global __ia64_nonlocal_goto
	.proc __ia64_nonlocal_goto
__ia64_nonlocal_goto:
	alloc r20 = ar.pfs, 3, 0, 0, 0	// 3 inputs; r20 = previous ar.pfs
	// Move the target label (in0) into rp with a return hint; .L0 is the
	// hinted branch site (label defined in elided lines).
	mov.ret.sptk rp = in0, .L0
	// ... (intervening instructions elided from this extract) ...
	mov ar.bspstore = r16		// restore the RSE backing-store pointer
	// ... (remainder of the body and #endif elided) ...
	.endp __ia64_nonlocal_goto
#ifdef L__restore_stack_nonlocal
// This is mostly the same as nonlocal_goto above.
// ??? This has not been tested yet.

// void __ia64_restore_stack_nonlocal(void *save_area)

	.global __ia64_restore_stack_nonlocal
	.proc __ia64_restore_stack_nonlocal
__ia64_restore_stack_nonlocal:
	alloc r20 = ar.pfs, 4, 0, 0, 0	// r20 = previous ar.pfs
	// ... (intervening instructions elided from this extract) ...
	mov ar.bspstore = r16		// restore the RSE backing-store pointer
	// ... (remainder of the body and #endif elided) ...
	.endp __ia64_restore_stack_nonlocal
// Implement the nested function trampoline.  This is out of line
// so that we don't have to bother with flushing the icache, as
// well as making the on-stack trampoline smaller.
//
// The trampoline has the following form (diagram partially elided from
// this extract):
//
//        +-------------------+ >
// TRAMP: | __ia64_trampoline | |
//        +-------------------+ > fake function descriptor
//        +-------------------+ >
//        | target descriptor |
//        +-------------------+
//        +-------------------+

	.global __ia64_trampoline
	.proc __ia64_trampoline
	// NOTE(review): the trampoline label and body are elided from this
	// extract.
	.endp __ia64_trampoline
// Thunks for backward compatibility: each old entry point tail-branches to
// the current libgcc implementation.
// NOTE(review): the .global/.proc/label scaffolding for each thunk is
// elided from this extract; only the branch instructions are visible.
	br.sptk.many __fixxfti
	br.sptk.many __fixunsxfti
	br.sptk.many __floattixf