libgcc/config/ia64/lib1funcs.S

   1 /* Copyright (C) 2000-2024 Free Software Foundation, Inc.
   2    Contributed by James E. Wilson <wilson@cygnus.com>.
   3
   4    This file is part of GCC.
   5
   6    GCC is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3, or (at your option)
   9    any later version.
  10
  11    GCC is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    Under Section 7 of GPL version 3, you are granted additional
  17    permissions described in the GCC Runtime Library Exception, version
  18    3.1, as published by the Free Software Foundation.
  19
  20    You should have received a copy of the GNU General Public License and
  21    a copy of the GCC Runtime Library Exception along with this program;
  22    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23    <http://www.gnu.org/licenses/>.  */
  24
  25 #ifdef L__divxf3
  26 // Compute an 80-bit IEEE double-extended quotient: fret0 = farg0 / farg1.
  27 //
  28 // From the Intel IA-64 Optimization Guide, choose the minimum latency
  29 // alternative.
  30 //
  31 // farg0 holds the dividend (a).  farg1 holds the divisor (b).
  32 // Scratch registers written here: f10-f14, p6, p7.
  33 // __divtf3 is an alternate symbol name for backward compatibility.
  34
  35         .text
  36         .align 16
  37         .global __divxf3
  38         .proc __divxf3
  39 __divxf3:
  40 #ifdef SHARED
  41         .global __divtf3
  42 __divtf3:
  43 #endif
  44         cmp.eq p7, p0 = r0, r0             // p7 = 1 (r0 == r0 always holds)
  45         frcpa.s0 f10, p6 = farg0, farg1    // f10 = y, the starting value for 1/b; p6 = 1 selects the refinement path
  46         ;;
  47 (p6)    cmp.ne p7, p0 = r0, r0             // refinement path: clear p7, so p7 ends up as the inverse of p6
  48         .pred.rel.mutex p6, p7             // assembler annotation: p6 and p7 are never both set
  49 (p6)    fnma.s1 f11 = farg1, f10, f1       // e = 1 - b*y
  50 (p6)    fma.s1 f12 = farg0, f10, f0        // q0 = a*y
  51         ;;
  52 (p6)    fma.s1 f13 = f11, f11, f0          // f13 = e^2
  53 (p6)    fma.s1 f14 = f11, f11, f11         // f14 = e^2 + e
  54         ;;
  55 (p6)    fma.s1 f11 = f13, f13, f11         // f11 = e^4 + e
  56 (p6)    fma.s1 f13 = f14, f10, f10         // f13 = y*(1 + e + e^2)
  57         ;;
  58 (p6)    fma.s1 f10 = f13, f11, f10         // y1 = y + f13*(e^4 + e), the refined reciprocal
  59 (p6)    fnma.s1 f11 = farg1, f12, farg0    // rem0 = a - b*q0
  60         ;;
  61 (p6)    fma.s1 f11 = f11, f10, f12         // q1 = q0 + rem0*y1
  62 (p6)    fnma.s1 f12 = farg1, f10, f1       // e1 = 1 - b*y1
  63         ;;
  64 (p6)    fma.s1 f10 = f12, f10, f10         // y2 = y1 + e1*y1
  65 (p6)    fnma.s1 f12 = farg1, f11, farg0    // rem1 = a - b*q1
  66         ;;
  67 (p6)    fma.s0 fret0 = f12, f10, f11       // result = q1 + rem1*y2, final step done in status field s0
  68 (p7)    mov fret0 = f10                    // p6 clear: return the frcpa result in f10 unchanged
  69         br.ret.sptk rp
  70         .endp __divxf3
  71 #endif
  72
  73 #ifdef L__divdf3
  74 // Compute a 64-bit IEEE double quotient: fret0 = farg0 / farg1.
  75 //
  76 // From the Intel IA-64 Optimization Guide, choose the minimum latency
  77 // alternative.
  78 //
  79 // farg0 holds the dividend (a).  farg1 holds the divisor (b).
  80
  81         .text
  82         .align 16
  83         .global __divdf3
  84         .proc __divdf3
  85 __divdf3:
  86         cmp.eq p7, p0 = r0, r0             // p7 = 1 (r0 == r0 always holds)
  87         frcpa.s0 f10, p6 = farg0, farg1    // f10 = y, the starting value for 1/b; p6 = 1 selects the refinement path
  88         ;;
  89 (p6)    cmp.ne p7, p0 = r0, r0             // refinement path: clear p7, so p7 ends up as the inverse of p6
  90         .pred.rel.mutex p6, p7             // assembler annotation: p6 and p7 are never both set
  91 (p6)    fmpy.s1 f11 = farg0, f10           // q0 = a*y
  92 (p6)    fnma.s1 f12 = farg1, f10, f1       // e = 1 - b*y
  93         ;;
  94 (p6)    fma.s1 f11 = f12, f11, f11         // q1 = q0 + e*q0
  95 (p6)    fmpy.s1 f13 = f12, f12             // f13 = e^2
  96         ;;
  97 (p6)    fma.s1 f10 = f12, f10, f10         // y1 = y + e*y
  98 (p6)    fma.s1 f11 = f13, f11, f11         // q2 = q1 + e^2*q1
  99         ;;
 100 (p6)    fmpy.s1 f12 = f13, f13             // f12 = e^4
 101 (p6)    fma.s1 f10 = f13, f10, f10         // y2 = y1 + e^2*y1
 102         ;;
 103 (p6)    fma.d.s1 f11 = f12, f11, f11       // q3 = q2 + e^4*q2, rounded to double precision (.d)
 104 (p6)    fma.s1 f10 = f12, f10, f10         // y3 = y2 + e^4*y2
 105         ;;
 106 (p6)    fnma.d.s1 f8 = farg1, f11, farg0   // rem = a - b*q3, written to f8 (farg0 is not read after this)
 107         ;;
 108 (p6)    fma.d fret0 = f8, f10, f11         // result = q3 + rem*y3, rounded to double precision
 109 (p7)    mov fret0 = f10                    // p6 clear: return the frcpa result in f10 unchanged
 110         br.ret.sptk rp
 111         ;;
 112         .endp __divdf3
 113 #endif
 114
 115 #ifdef L__divsf3
 116 // Compute a 32-bit IEEE float quotient: fret0 = farg0 / farg1.
 117 //
 118 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 119 // alternative.
 120 //
 121 // farg0 holds the dividend (a).  farg1 holds the divisor (b).
 122
 123         .text
 124         .align 16
 125         .global __divsf3
 126         .proc __divsf3
 127 __divsf3:
 128         cmp.eq p7, p0 = r0, r0             // p7 = 1 (r0 == r0 always holds)
 129         frcpa.s0 f10, p6 = farg0, farg1    // f10 = y, the starting value for 1/b; p6 = 1 selects the refinement path
 130         ;;
 131 (p6)    cmp.ne p7, p0 = r0, r0             // refinement path: clear p7, so p7 ends up as the inverse of p6
 132         .pred.rel.mutex p6, p7             // assembler annotation: p6 and p7 are never both set
 133 (p6)    fmpy.s1 f8 = farg0, f10            // q0 = a*y (farg0 and farg1 are not read after this group)
 134 (p6)    fnma.s1 f9 = farg1, f10, f1        // e = 1 - b*y
 135         ;;
 136 (p6)    fma.s1 f8 = f9, f8, f8             // q1 = q0 + e*q0
 137 (p6)    fmpy.s1 f9 = f9, f9                // f9 = e^2
 138         ;;
 139 (p6)    fma.s1 f8 = f9, f8, f8             // q2 = q1 + e^2*q1
 140 (p6)    fmpy.s1 f9 = f9, f9                // f9 = e^4
 141         ;;
 142 (p6)    fma.d.s1 f10 = f9, f8, f8          // q3 = q2 + e^4*q2, rounded to double precision (.d)
 143         ;;
 144 (p6)    fnorm.s.s0 fret0 = f10             // result = q3 rounded to single precision, in status field s0
 145 (p7)    mov fret0 = f10                    // p6 clear: return the frcpa result in f10 unchanged
 146         br.ret.sptk rp
 147         ;;
 148         .endp __divsf3
 149 #endif
 150
 151 #ifdef L__divdi3
 152 // Compute a 64-bit signed integer quotient: ret0 = in0 / in1.
 153 //
 154 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 155 // alternative.
 156 //
 157 // in0 holds the dividend (a).  in1 holds the divisor (b).
 158
 159         .text
 160         .align 16
 161         .global __divdi3
 162         .proc __divdi3
 163 __divdi3:
 164         .regstk 2,0,0,0
 165         // Transfer inputs to FP registers.
 166         setf.sig f8 = in0
 167         setf.sig f9 = in1
 168         // Check divide by zero.
 169         cmp.ne.unc p0,p7=0,in1             // p7 = (in1 == 0)
 170         ;;
 171         // Convert the inputs to FP, so that they won't be treated as unsigned.
 172         fcvt.xf f8 = f8
 173         fcvt.xf f9 = f9
 174 (p7)    break 1                            // zero divisor: trap via break 1
 175         ;;
 176         // Compute the reciprocal approximation.
 177         frcpa.s1 f10, p6 = f8, f9          // f10 = y, the starting value for 1/b; p6 gates the refinement below
 178         ;;
 179         // 3 Newton-Raphson iterations.
 180 (p6)    fnma.s1 f11 = f9, f10, f1          // e = 1 - b*y
 181 (p6)    fmpy.s1 f12 = f8, f10              // q0 = a*y
 182         ;;
 183 (p6)    fmpy.s1 f13 = f11, f11             // f13 = e^2
 184 (p6)    fma.s1 f12 = f11, f12, f12         // q1 = q0 + e*q0
 185         ;;
 186 (p6)    fma.s1 f10 = f11, f10, f10         // y1 = y + e*y
 187 (p6)    fma.s1 f11 = f13, f12, f12         // q2 = q1 + e^2*q1
 188         ;;
 189 (p6)    fma.s1 f10 = f13, f10, f10         // y2 = y1 + e^2*y1
 190 (p6)    fnma.s1 f12 = f9, f11, f8          // rem = a - b*q2
 191         ;;
 192 (p6)    fma.s1 f10 = f12, f10, f11         // q3 = q2 + rem*y2
 193         ;;
 194         // Truncate the quotient to a signed integer.
 195         fcvt.fx.trunc.s1 f10 = f10
 196         ;;
 197         // Transfer result to GP registers.
 198         getf.sig ret0 = f10
 199         br.ret.sptk rp
 200         ;;
 201         .endp __divdi3
 202 #endif
 203
 204 #ifdef L__moddi3
 205 // Compute a 64-bit signed integer modulus: ret0 = in0 % in1.
 206 //
 207 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 208 // alternative.
 209 //
 210 // in0 holds the dividend (a).  in1 holds the divisor (b).
 211
 212         .text
 213         .align 16
 214         .global __moddi3
 215         .proc __moddi3
 216 __moddi3:
 217         .regstk 2,0,0,0
 218         // Transfer inputs to FP registers.
 219         setf.sig f14 = in0                 // f14 keeps the integer dividend for the final xma
 220         setf.sig f9 = in1
 221         // Check divide by zero.
 222         cmp.ne.unc p0,p7=0,in1             // p7 = (in1 == 0)
 223         ;;
 224         // Convert the inputs to FP, so that they won't be treated as unsigned.
 225         fcvt.xf f8 = f14
 226         fcvt.xf f9 = f9
 227 (p7)    break 1                            // zero divisor: trap via break 1
 228         ;;
 229         // Compute the reciprocal approximation.
 230         frcpa.s1 f10, p6 = f8, f9          // f10 = y, the starting value for 1/b; p6 gates the refinement below
 231         ;;
 232         // 3 Newton-Raphson iterations.
 233 (p6)    fmpy.s1 f12 = f8, f10              // q0 = a*y
 234 (p6)    fnma.s1 f11 = f9, f10, f1          // e = 1 - b*y
 235         ;;
 236 (p6)    fma.s1 f12 = f11, f12, f12         // q1 = q0 + e*q0
 237 (p6)    fmpy.s1 f13 = f11, f11             // f13 = e^2
 238         ;;
 239 (p6)    fma.s1 f10 = f11, f10, f10         // y1 = y + e*y
 240 (p6)    fma.s1 f11 = f13, f12, f12         // q2 = q1 + e^2*q1
 241         ;;
 242         sub in1 = r0, in1                  // in1 = -b, needed as an integer for the final xma
 243 (p6)    fma.s1 f10 = f13, f10, f10         // y2 = y1 + e^2*y1
 244 (p6)    fnma.s1 f12 = f9, f11, f8          // rem = a - b*q2 (last use of b as an FP value in f9)
 245         ;;
 246         setf.sig f9 = in1                  // f9 = integer -b
 247 (p6)    fma.s1 f10 = f12, f10, f11         // q3 = q2 + rem*y2
 248         ;;
 249         fcvt.fx.trunc.s1 f10 = f10         // q = q3 truncated to a signed integer
 250         ;;
 251         // r = q * (-b) + a
 252         xma.l f10 = f10, f9, f14           // integer multiply-add, low 64 bits of the result
 253         ;;
 254         // Transfer result to GP registers.
 255         getf.sig ret0 = f10
 256         br.ret.sptk rp
 257         ;;
 258         .endp __moddi3
 259 #endif
 260
 261 #ifdef L__udivdi3
 262 // Compute a 64-bit unsigned integer quotient: ret0 = in0 / in1.
 263 //
 264 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 265 // alternative.
 266 //
 267 // in0 holds the dividend (a).  in1 holds the divisor (b).
 268
 269         .text
 270         .align 16
 271         .global __udivdi3
 272         .proc __udivdi3
 273 __udivdi3:
 274         .regstk 2,0,0,0
 275         // Transfer inputs to FP registers.
 276         setf.sig f8 = in0
 277         setf.sig f9 = in1
 278         // Check divide by zero.
 279         cmp.ne.unc p0,p7=0,in1             // p7 = (in1 == 0)
 280         ;;
 281         // Convert the inputs to FP, to avoid FP software-assist faults.
 282         fcvt.xuf.s1 f8 = f8                // unsigned integer to FP
 283         fcvt.xuf.s1 f9 = f9
 284 (p7)    break 1                            // zero divisor: trap via break 1
 285         ;;
 286         // Compute the reciprocal approximation.
 287         frcpa.s1 f10, p6 = f8, f9          // f10 = y, the starting value for 1/b; p6 gates the refinement below
 288         ;;
 289         // 3 Newton-Raphson iterations.
 290 (p6)    fnma.s1 f11 = f9, f10, f1          // e = 1 - b*y
 291 (p6)    fmpy.s1 f12 = f8, f10              // q0 = a*y
 292         ;;
 293 (p6)    fmpy.s1 f13 = f11, f11             // f13 = e^2
 294 (p6)    fma.s1 f12 = f11, f12, f12         // q1 = q0 + e*q0
 295         ;;
 296 (p6)    fma.s1 f10 = f11, f10, f10         // y1 = y + e*y
 297 (p6)    fma.s1 f11 = f13, f12, f12         // q2 = q1 + e^2*q1
 298         ;;
 299 (p6)    fma.s1 f10 = f13, f10, f10         // y2 = y1 + e^2*y1
 300 (p6)    fnma.s1 f12 = f9, f11, f8          // rem = a - b*q2
 301         ;;
 302 (p6)    fma.s1 f10 = f12, f10, f11         // q3 = q2 + rem*y2
 303         ;;
 304         // Truncate the quotient to an unsigned integer.
 305         fcvt.fxu.trunc.s1 f10 = f10
 306         ;;
 307         // Transfer result to GP registers.
 308         getf.sig ret0 = f10
 309         br.ret.sptk rp
 310         ;;
 311         .endp __udivdi3
 312 #endif
 313
 314 #ifdef L__umoddi3
 315 // Compute a 64-bit unsigned integer modulus: ret0 = in0 % in1.
 316 //
 317 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 318 // alternative.
 319 //
 320 // in0 holds the dividend (a).  in1 holds the divisor (b).
 321
 322         .text
 323         .align 16
 324         .global __umoddi3
 325         .proc __umoddi3
 326 __umoddi3:
 327         .regstk 2,0,0,0
 328         // Transfer inputs to FP registers.
 329         setf.sig f14 = in0                 // f14 keeps the integer dividend for the final xma
 330         setf.sig f9 = in1
 331         // Check divide by zero.
 332         cmp.ne.unc p0,p7=0,in1             // p7 = (in1 == 0)
 333         ;;
 334         // Convert the inputs to FP, to avoid FP software assist faults.
 335         fcvt.xuf.s1 f8 = f14               // unsigned integer to FP
 336         fcvt.xuf.s1 f9 = f9
 337 (p7)    break 1;                           // zero divisor: trap via break 1
 338         ;;
 339         // Compute the reciprocal approximation.
 340         frcpa.s1 f10, p6 = f8, f9          // f10 = y, the starting value for 1/b; p6 gates the refinement below
 341         ;;
 342         // 3 Newton-Raphson iterations.
 343 (p6)    fmpy.s1 f12 = f8, f10              // q0 = a*y
 344 (p6)    fnma.s1 f11 = f9, f10, f1          // e = 1 - b*y
 345         ;;
 346 (p6)    fma.s1 f12 = f11, f12, f12         // q1 = q0 + e*q0
 347 (p6)    fmpy.s1 f13 = f11, f11             // f13 = e^2
 348         ;;
 349 (p6)    fma.s1 f10 = f11, f10, f10         // y1 = y + e*y
 350 (p6)    fma.s1 f11 = f13, f12, f12         // q2 = q1 + e^2*q1
 351         ;;
 352         sub in1 = r0, in1                  // in1 = -b, needed as an integer for the final xma
 353 (p6)    fma.s1 f10 = f13, f10, f10         // y2 = y1 + e^2*y1
 354 (p6)    fnma.s1 f12 = f9, f11, f8          // rem = a - b*q2 (last use of b as an FP value in f9)
 355         ;;
 356         setf.sig f9 = in1                  // f9 = integer -b
 357 (p6)    fma.s1 f10 = f12, f10, f11         // q3 = q2 + rem*y2
 358         ;;
 359         // Truncate the quotient to an unsigned integer.
 360         fcvt.fxu.trunc.s1 f10 = f10
 361         ;;
 362         // r = q * (-b) + a
 363         xma.l f10 = f10, f9, f14           // integer multiply-add, low 64 bits of the result
 364         ;;
 365         // Transfer result to GP registers.
 366         getf.sig ret0 = f10
 367         br.ret.sptk rp
 368         ;;
 369         .endp __umoddi3
 370 #endif
 371
 372 #ifdef L__divsi3
 373 // Compute a 32-bit signed integer quotient: ret0 = in0 / in1.
 374 //
 375 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 376 // alternative.
 377 //
 378 // in0 holds the dividend (a).  in1 holds the divisor (b).
 379
 380         .text
 381         .align 16
 382         .global __divsi3
 383         .proc __divsi3
 384 __divsi3:
 385         .regstk 2,0,0,0
 386         // Check divide by zero on the low 32 bits only: in1 is not sign-extended until the end of this group.
 387         cmp4.ne.unc p0,p7=0,in1            // p7 = (low 32 bits of in1 == 0)
 388         sxt4 in0 = in0                     // sign-extend both 32-bit inputs to 64 bits
 389         sxt4 in1 = in1
 390         ;;
 391         setf.sig f8 = in0                  // transfer inputs to FP registers
 392         setf.sig f9 = in1
 393 (p7)    break 1                            // zero divisor: trap via break 1
 394         ;;
 395         mov r2 = 0x0ffdd                   // exponent field for 2^-34 (0xffdd is 34 below the register-format bias 0xffff)
 396         fcvt.xf f8 = f8                    // signed integer to FP
 397         fcvt.xf f9 = f9
 398         ;;
 399         setf.exp f11 = r2                  // f11 = 2^-34, small additive term used in the last correction
 400         frcpa.s1 f10, p6 = f8, f9          // f10 = y, the starting value for 1/b; p6 gates the refinement below
 401         ;;
 402 (p6)    fmpy.s1 f8 = f8, f10               // q0 = a*y
 403 (p6)    fnma.s1 f9 = f9, f10, f1           // e = 1 - b*y
 404         ;;
 405 (p6)    fma.s1 f8 = f9, f8, f8             // q1 = q0 + e*q0
 406 (p6)    fma.s1 f9 = f9, f9, f11            // f9 = e^2 + 2^-34
 407         ;;
 408 (p6)    fma.s1 f10 = f9, f8, f8            // q2 = q1 + (e^2 + 2^-34)*q1
 409         ;;
 410         fcvt.fx.trunc.s1 f10 = f10         // truncate the quotient to a signed integer
 411         ;;
 412         getf.sig ret0 = f10                // transfer result to GP registers
 413         br.ret.sptk rp
 414         ;;
 415         .endp __divsi3
 416 #endif
 417
 418 #ifdef L__modsi3
 419 // Compute a 32-bit signed integer modulus: ret0 = in0 % in1.
 420 //
 421 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 422 // alternative.
 423 //
 424 // in0 holds the dividend (a).  in1 holds the divisor (b).
 425
 426         .text
 427         .align 16
 428         .global __modsi3
 429         .proc __modsi3
 430 __modsi3:
 431         .regstk 2,0,0,0
 432         mov r2 = 0x0ffdd                   // exponent field for 2^-34 (0xffdd is 34 below the register-format bias 0xffff)
 433         sxt4 in0 = in0                     // sign-extend both 32-bit inputs to 64 bits
 434         sxt4 in1 = in1
 435         ;;
 436         setf.sig f13 = r32                 // r32 is in0; f13 keeps the integer dividend for the final xma
 437         setf.sig f9 = r33                  // r33 is in1
 438         // Check divide by zero.
 439         cmp.ne.unc p0,p7=0,in1             // p7 = (sign-extended in1 == 0)
 440         ;;
 441         sub in1 = r0, in1                  // in1 = -b, needed as an integer for the final xma
 442         fcvt.xf f8 = f13                   // signed integer to FP
 443         fcvt.xf f9 = f9
 444         ;;
 445         setf.exp f11 = r2                  // f11 = 2^-34, small additive term used in the last correction
 446         frcpa.s1 f10, p6 = f8, f9          // f10 = y, the starting value for 1/b; p6 gates the refinement below
 447 (p7)    break 1                            // zero divisor: trap via break 1
 448         ;;
 449 (p6)    fmpy.s1 f12 = f8, f10              // q0 = a*y
 450 (p6)    fnma.s1 f10 = f9, f10, f1          // e = 1 - b*y (last use of b as an FP value in f9)
 451         ;;
 452         setf.sig f9 = in1                  // f9 = integer -b
 453 (p6)    fma.s1 f12 = f10, f12, f12         // q1 = q0 + e*q0
 454 (p6)    fma.s1 f10 = f10, f10, f11         // f10 = e^2 + 2^-34
 455         ;;
 456 (p6)    fma.s1 f10 = f10, f12, f12         // q2 = q1 + (e^2 + 2^-34)*q1
 457         ;;
 458         fcvt.fx.trunc.s1 f10 = f10         // q = q2 truncated to a signed integer
 459         ;;
 460         xma.l f10 = f10, f9, f13           // r = q * (-b) + a, low 64 bits of the integer multiply-add
 461         ;;
 462         getf.sig ret0 = f10                // transfer result to GP registers
 463         br.ret.sptk rp
 464         ;;
 465         .endp __modsi3
 466 #endif
 467
 468 #ifdef L__udivsi3
 469 // Compute a 32-bit unsigned integer quotient: ret0 = in0 / in1.
 470 //
 471 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 472 // alternative.
 473 //
 474 // in0 holds the dividend (a).  in1 holds the divisor (b).
 475
 476         .text
 477         .align 16
 478         .global __udivsi3
 479         .proc __udivsi3
 480 __udivsi3:
 481         .regstk 2,0,0,0
 482         mov r2 = 0x0ffdd                   // exponent field for 2^-34 (0xffdd is 34 below the register-format bias 0xffff)
 483         zxt4 in0 = in0                     // zero-extend both 32-bit inputs to 64 bits
 484         zxt4 in1 = in1
 485         ;;
 486         setf.sig f8 = in0                  // transfer inputs to FP registers
 487         setf.sig f9 = in1
 488         // Check divide by zero.
 489         cmp.ne.unc p0,p7=0,in1             // p7 = (zero-extended in1 == 0)
 490         ;;
 491         fcvt.xf f8 = f8                    // signed convert; the zero-extended inputs are below 2^32, so non-negative
 492         fcvt.xf f9 = f9
 493 (p7)    break 1                            // zero divisor: trap via break 1
 494         ;;
 495         setf.exp f11 = r2                  // f11 = 2^-34, small additive term used in the last correction
 496         frcpa.s1 f10, p6 = f8, f9          // f10 = y, the starting value for 1/b; p6 gates the refinement below
 497         ;;
 498 (p6)    fmpy.s1 f8 = f8, f10               // q0 = a*y
 499 (p6)    fnma.s1 f9 = f9, f10, f1           // e = 1 - b*y
 500         ;;
 501 (p6)    fma.s1 f8 = f9, f8, f8             // q1 = q0 + e*q0
 502 (p6)    fma.s1 f9 = f9, f9, f11            // f9 = e^2 + 2^-34
 503         ;;
 504 (p6)    fma.s1 f10 = f9, f8, f8            // q2 = q1 + (e^2 + 2^-34)*q1
 505         ;;
 506         fcvt.fxu.trunc.s1 f10 = f10        // truncate the quotient to an unsigned integer
 507         ;;
 508         getf.sig ret0 = f10                // transfer result to GP registers
 509         br.ret.sptk rp
 510         ;;
 511         .endp __udivsi3
 512 #endif
 513
 514 #ifdef L__umodsi3
 515 // Compute a 32-bit unsigned integer modulus: ret0 = in0 % in1.
 516 //
 517 // From the Intel IA-64 Optimization Guide, choose the minimum latency
 518 // alternative.
 519 //
 520 // in0 holds the dividend (a).  in1 holds the divisor (b).
 521
 522         .text
 523         .align 16
 524         .global __umodsi3
 525         .proc __umodsi3
 526 __umodsi3:
 527         .regstk 2,0,0,0
 528         mov r2 = 0x0ffdd                   // exponent field for 2^-34 (0xffdd is 34 below the register-format bias 0xffff)
 529         zxt4 in0 = in0                     // zero-extend both 32-bit inputs to 64 bits
 530         zxt4 in1 = in1
 531         ;;
 532         setf.sig f13 = in0                 // f13 keeps the integer dividend for the final xma
 533         setf.sig f9 = in1
 534         // Check divide by zero.
 535         cmp.ne.unc p0,p7=0,in1             // p7 = (zero-extended in1 == 0)
 536         ;;
 537         sub in1 = r0, in1                  // in1 = -b, needed as an integer for the final xma
 538         fcvt.xf f8 = f13                   // signed convert; the zero-extended inputs are below 2^32, so non-negative
 539         fcvt.xf f9 = f9
 540         ;;
 541         setf.exp f11 = r2                  // f11 = 2^-34, small additive term used in the last correction
 542         frcpa.s1 f10, p6 = f8, f9          // f10 = y, the starting value for 1/b; p6 gates the refinement below
 543 (p7)    break 1;                           // zero divisor: trap via break 1
 544         ;;
 545 (p6)    fmpy.s1 f12 = f8, f10              // q0 = a*y
 546 (p6)    fnma.s1 f10 = f9, f10, f1          // e = 1 - b*y (last use of b as an FP value in f9)
 547         ;;
 548         setf.sig f9 = in1                  // f9 = integer -b
 549 (p6)    fma.s1 f12 = f10, f12, f12         // q1 = q0 + e*q0
 550 (p6)    fma.s1 f10 = f10, f10, f11         // f10 = e^2 + 2^-34
 551         ;;
 552 (p6)    fma.s1 f10 = f10, f12, f12         // q2 = q1 + (e^2 + 2^-34)*q1
 553         ;;
 554         fcvt.fxu.trunc.s1 f10 = f10        // q = q2 truncated to an unsigned integer
 555         ;;
 556         xma.l f10 = f10, f9, f13           // r = q * (-b) + a, low 64 bits of the integer multiply-add
 557         ;;
 558         getf.sig ret0 = f10                // transfer result to GP registers
 559         br.ret.sptk rp
 560         ;;
 561         .endp __umodsi3
 562 #endif
 563
 564 #ifdef L__save_stack_nonlocal
 565 // Notes on save/restore stack nonlocal: We read ar.bsp but write
 566 // ar.bspstore.  This is because ar.bsp can be read at all times
 567 // (independent of the RSE mode) but since it's read-only we need to
 568 // restore the value via ar.bspstore.  This is OK because
 569 // ar.bsp==ar.bspstore after executing "flushrs".
 570
 571 // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
 572 // Save area layout written here: +0 stack_pointer, +8 ar.bsp, +16 ar.rnat, +24 ar.pfs.
 573         .text
 574         .align 16
 575         .global __ia64_save_stack_nonlocal
 576         .proc __ia64_save_stack_nonlocal
 577 __ia64_save_stack_nonlocal:
 578         { .mmf
 579           alloc r18 = ar.pfs, 2, 0, 0, 0   // r18 = ar.pfs on entry; frame has the two inputs only
 580           mov r19 = ar.rsc                 // r19 = current RSE configuration
 581           ;;
 582         }
 583         { .mmi
 584           flushrs                          // see the note above: afterwards ar.bsp == ar.bspstore
 585           st8 [in0] = in1, 24              // save_area+0 = stack_pointer; in0 advances to save_area+24
 586           and r19 = 0x1c, r19              // keep only bits 2..4 of ar.rsc, clearing the two low (mode) bits
 587           ;;
 588         }
 589         { .mmi
 590           st8 [in0] = r18, -16             // save_area+24 = ar.pfs; in0 steps back to save_area+8
 591           mov ar.rsc = r19                 // install the masked value (presumably so ar.rnat can be read; confirm in the ISA manual)
 592           or r19 = 0x3, r19                // value written back at exit: both low bits set
 593           ;;
 594         }
 595         { .mmi
 596           mov r16 = ar.bsp
 597           mov r17 = ar.rnat
 598           adds r2 = 8, in0                 // r2 = save_area+16
 599           ;;
 600         }
 601         { .mmi
 602           st8 [in0] = r16                  // save_area+8 = ar.bsp
 603           st8 [r2] = r17                   // save_area+16 = ar.rnat
 604         }
 605         { .mib
 606           mov ar.rsc = r19                 // NOTE(review): low bits are forced to 0x3 rather than restored from the entry value
 607           br.ret.sptk.few rp
 608           ;;
 609         }
 610         .endp __ia64_save_stack_nonlocal
 611 #endif
 612
 613 #ifdef L__nonlocal_goto
 614 // void __ia64_nonlocal_goto(void *target_label, void *save_area,
 615 //                           void *static_chain);
 616 // Reads save_area as +0 stack pointer, +8 bsp, +16 rnat, +24 pfs, then returns to target_label with r15 = static_chain.
 617         .text
 618         .align 16
 619         .global __ia64_nonlocal_goto
 620         .proc __ia64_nonlocal_goto
 621 __ia64_nonlocal_goto:
 622         { .mmi
 623           alloc r20 = ar.pfs, 3, 0, 0, 0   // frame has the three inputs only; r20 is not read again here
 624           ld8 r12 = [in1], 8               // r12 = word at save_area+0 (saved stack pointer)
 625           mov.ret.sptk rp = in0, .L0       // rp = target_label; .L0 names the br.ret that will use it
 626           ;;
 627         }
 628         { .mmf
 629           ld8 r16 = [in1], 8               // r16 = word at save_area+8, later written to ar.bspstore
 630           mov r19 = ar.rsc                 // r19 = current RSE configuration
 631           ;;
 632         }
 633         { .mmi
 634           flushrs
 635           ld8 r17 = [in1], 8               // r17 = word at save_area+16, later written to ar.rnat
 636           and r19 = 0x1c, r19              // keep only bits 2..4 of ar.rsc, clearing the two low (mode) bits
 637           ;;
 638         }
 639         { .mmi
 640           ld8 r18 = [in1]                  // r18 = word at save_area+24, later written to ar.pfs
 641           mov ar.rsc = r19                 // install the masked value before touching ar.bspstore and ar.rnat
 642           or r19 = 0x3, r19                // value written back at exit: both low bits set
 643           ;;
 644         }
 645         { .mmi
 646           mov ar.bspstore = r16
 647           ;;
 648           mov ar.rnat = r17
 649           ;;
 650         }
 651         { .mmi
 652           loadrs
 653           invala
 654           mov r15 = in2                    // r15 = static_chain for the target
 655           ;;
 656         }
 657 .L0:    { .mib
 658           mov ar.rsc = r19
 659           mov ar.pfs = r18                 // restore the saved ar.pfs so br.ret unwinds to the saved frame state
 660           br.ret.sptk.few rp               // transfers control to target_label
 661           ;;
 662         }
 663         .endp __ia64_nonlocal_goto
 664 #endif
 665
 666 #ifdef L__restore_stack_nonlocal
 667 // This is mostly the same as nonlocal_goto above.
 668 // ??? This has not been tested yet.
 669 // Reads save_area as +0 stack pointer, +8 bsp, +16 rnat, +24 pfs and returns to the caller through rp.
 670 // void __ia64_restore_stack_nonlocal(void *save_area)
 671
 672         .text
 673         .align 16
 674         .global __ia64_restore_stack_nonlocal
 675         .proc __ia64_restore_stack_nonlocal
 676 __ia64_restore_stack_nonlocal:
 677         { .mmf
 678           alloc r20 = ar.pfs, 4, 0, 0, 0   // NOTE(review): declares 4 inputs although only in0 is used; confirm intent
 679           ld8 r12 = [in0], 8               // r12 = word at save_area+0 (saved stack pointer)
 680           ;;
 681         }
 682         { .mmb
 683           ld8 r16=[in0], 8                 // r16 = word at save_area+8, later written to ar.bspstore
 684           mov r19 = ar.rsc                 // r19 = current RSE configuration
 685           ;;
 686         }
 687         { .mmi
 688           flushrs
 689           ld8 r17 = [in0], 8               // r17 = word at save_area+16, later written to ar.rnat
 690           and r19 = 0x1c, r19              // keep only bits 2..4 of ar.rsc, clearing the two low (mode) bits
 691           ;;
 692         }
 693         { .mmf
 694           ld8 r18 = [in0]                  // r18 = word at save_area+24, later written to ar.pfs
 695           mov ar.rsc = r19                 // install the masked value before touching ar.bspstore and ar.rnat
 696           ;;
 697         }
 698         { .mmi
 699           mov ar.bspstore = r16
 700           ;;
 701           mov ar.rnat = r17
 702           or r19 = 0x3, r19                // value written back at exit: both low bits set
 703           ;;
 704         }
 705         { .mmf
 706           loadrs
 707           invala
 708           ;;
 709         }
 710 .L0:    { .mib                           // NOTE(review): .L0 is not referenced anywhere in this routine
 711           mov ar.rsc = r19
 712           mov ar.pfs = r18
 713           br.ret.sptk.few rp
 714           ;;
 715         }
 716         .endp __ia64_restore_stack_nonlocal
 717 #endif
 718
 719 #ifdef L__trampoline
 720 // Implement the nested function trampoline.  This is out of line
 721 // so that we don't have to bother with flushing the icache, as
 722 // well as making the on-stack trampoline smaller.
 723 //
 724 // The trampoline has the following form:
 725 //
 726 //              +-------------------+ >
 727 //      TRAMP:  | __ia64_trampoline | |
 728 //              +-------------------+  > fake function descriptor
 729 //              | TRAMP+16          | |
 730 //              +-------------------+ >
 731 //              | target descriptor |
 732 //              +-------------------+
 733 //              | static link       |
 734 //              +-------------------+
 735 // Assumes the call through the fake descriptor left r1 = TRAMP+16 -- TODO confirm against the caller.
 736         .text
 737         .align 16
 738         .global __ia64_trampoline
 739         .proc __ia64_trampoline
 740 __ia64_trampoline:
 741         { .mmi
 742           ld8 r2 = [r1], 8                 // r2 = target descriptor word; r1 advances to the static link slot
 743           ;;
 744           ld8 r15 = [r1]                   // r15 = static link
 745         }
 746         { .mmi
 747           ld8 r3 = [r2], 8                 // r3 = first word of the target descriptor (the branch target)
 748           ;;
 749           ld8 r1 = [r2]                    // r1 = second word of the target descriptor
 750           mov b6 = r3
 751         }
 752         { .bbb
 753           br.sptk.many b6                  // jump to the target; rp is untouched, so the target returns to the original caller
 754           ;;
 755         }
 756         .endp __ia64_trampoline
 757 #endif
 758
 759 #ifdef SHARED
 760 // Thunks for backward compatibility.  Each old "tf" symbol just branches to its "xf" counterpart.
 761 #ifdef L_fixtfdi
 762         .text
 763         .align 16
 764         .global __fixtfti
 765         .proc __fixtfti
 766 __fixtfti:
 767         { .bbb
 768           br.sptk.many __fixxfti           // tail branch: __fixxfti returns directly to the caller of __fixtfti
 769           ;;
 770         }
 771         .endp __fixtfti
 772 #endif
 773 #ifdef L_fixunstfdi
 774         .align 16                          // no .text directive here, unlike the first thunk
 775         .global __fixunstfti
 776         .proc __fixunstfti
 777 __fixunstfti:
 778         { .bbb
 779           br.sptk.many __fixunsxfti        // tail branch: __fixunsxfti returns directly to the caller
 780           ;;
 781         }
 782         .endp __fixunstfti
 783 #endif
 784 #ifdef L_floatditf
 785         .align 16                          // no .text directive here, unlike the first thunk
 786         .global __floattitf
 787         .proc __floattitf
 788 __floattitf:
 789         { .bbb
 790           br.sptk.many __floattixf         // tail branch: __floattixf returns directly to the caller
 791           ;;
 792         }
 793         .endp __floattitf
 794 #endif
 795 #endif