Remove old autovect-branch by moving to "dead" directory.
[official-gcc.git] / old-autovect-branch / gcc / config / ia64 / lib1funcs.asm
blob 245a8bb1595a9575423cb2f27b6ce93333d49334
1 /* Copyright (C) 2000, 2001, 2003, 2005 Free Software Foundation, Inc.
2 Contributed by James E. Wilson <wilson@cygnus.com>.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to
18 the Free Software Foundation, 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA. */
21 /* As a special exception, if you link this library with other files,
22 some of which are compiled with GCC, to produce an executable,
23 this library does not by itself cause the resulting executable
24 to be covered by the GNU General Public License.
25 This exception does not however invalidate any other reasons why
26 the executable file might be covered by the GNU General Public License. */
28 #ifdef L__divxf3
29 // Compute a 80-bit IEEE double-extended quotient.
31 // From the Intel IA-64 Optimization Guide, choose the minimum latency
32 // alternative.
34 // farg0 holds the dividend. farg1 holds the divisor.
36 // __divtf3 is an alternate symbol name for backward compatibility.
38 .text
39 .align 16
40 .global __divxf3
41 .global __divtf3
42 .proc __divxf3
43 __divxf3:
44 __divtf3:
// p7 starts true and is cleared below exactly when frcpa sets p6, so at
// the end p7 == !p6: when frcpa already produced the final result itself
// (special-case operands), return its output f10 unrefined.
45 cmp.eq p7, p0 = r0, r0
46 frcpa.s0 f10, p6 = farg0, farg1
48 (p6) cmp.ne p7, p0 = r0, r0
49 .pred.rel.mutex p6, p7
// (p6) path: refine the reciprocal approximation f10 and the quotient
// estimate (f12/f11) with fma/fnma chains in status field s1; the final
// fma.s0 rounds the result in the user's rounding mode into fret0.
50 (p6) fnma.s1 f11 = farg1, f10, f1
51 (p6) fma.s1 f12 = farg0, f10, f0
53 (p6) fma.s1 f13 = f11, f11, f0
54 (p6) fma.s1 f14 = f11, f11, f11
56 (p6) fma.s1 f11 = f13, f13, f11
57 (p6) fma.s1 f13 = f14, f10, f10
59 (p6) fma.s1 f10 = f13, f11, f10
60 (p6) fnma.s1 f11 = farg1, f12, farg0
62 (p6) fma.s1 f11 = f11, f10, f12
63 (p6) fnma.s1 f12 = farg1, f10, f1
65 (p6) fma.s1 f10 = f12, f10, f10
66 (p6) fnma.s1 f12 = farg1, f11, farg0
68 (p6) fma.s0 fret0 = f12, f10, f11
69 (p7) mov fret0 = f10
70 br.ret.sptk rp
71 .endp __divxf3
72 #endif
74 #ifdef L__divdf3
75 // Compute a 64-bit IEEE double quotient.
77 // From the Intel IA-64 Optimization Guide, choose the minimum latency
78 // alternative.
80 // farg0 holds the dividend. farg1 holds the divisor.
82 .text
83 .align 16
84 .global __divdf3
85 .proc __divdf3
86 __divdf3:
// p7 == !p6 after the next two instructions: frcpa sets p6 when its
// approximation needs refinement; otherwise f10 already holds the final
// (special-case) result and the (p7) mov at the end returns it as-is.
87 cmp.eq p7, p0 = r0, r0
88 frcpa.s0 f10, p6 = farg0, farg1
90 (p6) cmp.ne p7, p0 = r0, r0
91 .pred.rel.mutex p6, p7
// (p6) path: refine reciprocal (f10) and quotient (f11) estimates in s1;
// the .d-suffixed ops round intermediate results to double, and the final
// fma.d computes the rounded double quotient into fret0.
92 (p6) fmpy.s1 f11 = farg0, f10
93 (p6) fnma.s1 f12 = farg1, f10, f1
95 (p6) fma.s1 f11 = f12, f11, f11
96 (p6) fmpy.s1 f13 = f12, f12
98 (p6) fma.s1 f10 = f12, f10, f10
99 (p6) fma.s1 f11 = f13, f11, f11
101 (p6) fmpy.s1 f12 = f13, f13
102 (p6) fma.s1 f10 = f13, f10, f10
104 (p6) fma.d.s1 f11 = f12, f11, f11
105 (p6) fma.s1 f10 = f12, f10, f10
107 (p6) fnma.d.s1 f8 = farg1, f11, farg0
109 (p6) fma.d fret0 = f8, f10, f11
110 (p7) mov fret0 = f10
111 br.ret.sptk rp
113 .endp __divdf3
114 #endif
116 #ifdef L__divsf3
117 // Compute a 32-bit IEEE float quotient.
119 // From the Intel IA-64 Optimization Guide, choose the minimum latency
120 // alternative.
122 // farg0 holds the dividend. farg1 holds the divisor.
124 .text
125 .align 16
126 .global __divsf3
127 .proc __divsf3
128 __divsf3:
// p7 == !p6: when frcpa handles the operands itself (special cases) the
// refinement below is skipped and f10 is returned directly via (p7) mov.
129 cmp.eq p7, p0 = r0, r0
130 frcpa.s0 f10, p6 = farg0, farg1
132 (p6) cmp.ne p7, p0 = r0, r0
133 .pred.rel.mutex p6, p7
// (p6) path: two refinement steps on the quotient estimate f8 (single
// precision needs fewer iterations); fnorm.s.s0 performs the final
// rounding to float in the user's status field.
134 (p6) fmpy.s1 f8 = farg0, f10
135 (p6) fnma.s1 f9 = farg1, f10, f1
137 (p6) fma.s1 f8 = f9, f8, f8
138 (p6) fmpy.s1 f9 = f9, f9
140 (p6) fma.s1 f8 = f9, f8, f8
141 (p6) fmpy.s1 f9 = f9, f9
143 (p6) fma.d.s1 f10 = f9, f8, f8
145 (p6) fnorm.s.s0 fret0 = f10
146 (p7) mov fret0 = f10
147 br.ret.sptk rp
149 .endp __divsf3
150 #endif
152 #ifdef L__divdi3
153 // Compute a 64-bit integer quotient.
155 // From the Intel IA-64 Optimization Guide, choose the minimum latency
156 // alternative.
158 // in0 holds the dividend. in1 holds the divisor.
160 .text
161 .align 16
162 .global __divdi3
163 .proc __divdi3
164 __divdi3:
165 .regstk 2,0,0,0
166 // Transfer inputs to FP registers.
167 setf.sig f8 = in0
168 setf.sig f9 = in1
169 // Check divide by zero.
// p7 = (in1 == 0): the .unc form clears p7 first, then sets it only when
// the compare 0 != in1 is FALSE.
170 cmp.ne.unc p0,p7=0,in1
172 // Convert the inputs to FP, so that they won't be treated as unsigned.
173 fcvt.xf f8 = f8
174 fcvt.xf f9 = f9
// Divide-by-zero trap; break immediate 1 is the conventional IA-64
// integer-divide fault (mapped to SIGFPE by the OS -- TODO confirm).
175 (p7) break 1
177 // Compute the reciprocal approximation.
178 frcpa.s1 f10, p6 = f8, f9
180 // 3 Newton-Raphson iterations.
181 (p6) fnma.s1 f11 = f9, f10, f1
182 (p6) fmpy.s1 f12 = f8, f10
184 (p6) fmpy.s1 f13 = f11, f11
185 (p6) fma.s1 f12 = f11, f12, f12
187 (p6) fma.s1 f10 = f11, f10, f10
188 (p6) fma.s1 f11 = f13, f12, f12
190 (p6) fma.s1 f10 = f13, f10, f10
191 (p6) fnma.s1 f12 = f9, f11, f8
193 (p6) fma.s1 f10 = f12, f10, f11
195 // Round quotient to an integer.
// Truncation toward zero matches C signed-division semantics.
196 fcvt.fx.trunc.s1 f10 = f10
198 // Transfer result to GP registers.
199 getf.sig ret0 = f10
200 br.ret.sptk rp
202 .endp __divdi3
203 #endif
205 #ifdef L__moddi3
206 // Compute a 64-bit integer modulus.
208 // From the Intel IA-64 Optimization Guide, choose the minimum latency
209 // alternative.
211 // in0 holds the dividend (a). in1 holds the divisor (b).
213 .text
214 .align 16
215 .global __moddi3
216 .proc __moddi3
217 __moddi3:
218 .regstk 2,0,0,0
219 // Transfer inputs to FP registers.
// f14 keeps the original integer dividend a for the final xma below.
220 setf.sig f14 = in0
221 setf.sig f9 = in1
222 // Check divide by zero.
// p7 = (in1 == 0); traps via break below.
223 cmp.ne.unc p0,p7=0,in1
225 // Convert the inputs to FP, so that they won't be treated as unsigned.
226 fcvt.xf f8 = f14
227 fcvt.xf f9 = f9
228 (p7) break 1
230 // Compute the reciprocal approximation.
231 frcpa.s1 f10, p6 = f8, f9
233 // 3 Newton-Raphson iterations.
234 (p6) fmpy.s1 f12 = f8, f10
235 (p6) fnma.s1 f11 = f9, f10, f1
237 (p6) fma.s1 f12 = f11, f12, f12
238 (p6) fmpy.s1 f13 = f11, f11
240 (p6) fma.s1 f10 = f11, f10, f10
241 (p6) fma.s1 f11 = f13, f12, f12
// Negate the divisor in the integer unit in parallel: in1 = -b, reloaded
// into f9 for the fixed-point multiply-add at the end.
243 sub in1 = r0, in1
244 (p6) fma.s1 f10 = f13, f10, f10
245 (p6) fnma.s1 f12 = f9, f11, f8
247 setf.sig f9 = in1
248 (p6) fma.s1 f10 = f12, f10, f11
// Truncate the quotient toward zero (C semantics), then
250 fcvt.fx.trunc.s1 f10 = f10
252 // r = q * (-b) + a
253 xma.l f10 = f10, f9, f14
255 // Transfer result to GP registers.
256 getf.sig ret0 = f10
257 br.ret.sptk rp
259 .endp __moddi3
260 #endif
262 #ifdef L__udivdi3
263 // Compute a 64-bit unsigned integer quotient.
265 // From the Intel IA-64 Optimization Guide, choose the minimum latency
266 // alternative.
268 // in0 holds the dividend. in1 holds the divisor.
270 .text
271 .align 16
272 .global __udivdi3
273 .proc __udivdi3
274 __udivdi3:
275 .regstk 2,0,0,0
276 // Transfer inputs to FP registers.
277 setf.sig f8 = in0
278 setf.sig f9 = in1
279 // Check divide by zero.
// p7 = (in1 == 0): .unc clears p7 first, sets it only when 0 != in1 fails.
280 cmp.ne.unc p0,p7=0,in1
282 // Convert the inputs to FP, to avoid FP software-assist faults.
// fcvt.xuf (unsigned) here, vs. fcvt.xf in the signed __divdi3.
283 fcvt.xuf.s1 f8 = f8
284 fcvt.xuf.s1 f9 = f9
// Divide-by-zero trap.
285 (p7) break 1
287 // Compute the reciprocal approximation.
288 frcpa.s1 f10, p6 = f8, f9
290 // 3 Newton-Raphson iterations.
291 (p6) fnma.s1 f11 = f9, f10, f1
292 (p6) fmpy.s1 f12 = f8, f10
294 (p6) fmpy.s1 f13 = f11, f11
295 (p6) fma.s1 f12 = f11, f12, f12
297 (p6) fma.s1 f10 = f11, f10, f10
298 (p6) fma.s1 f11 = f13, f12, f12
300 (p6) fma.s1 f10 = f13, f10, f10
301 (p6) fnma.s1 f12 = f9, f11, f8
303 (p6) fma.s1 f10 = f12, f10, f11
305 // Round quotient to an unsigned integer.
306 fcvt.fxu.trunc.s1 f10 = f10
308 // Transfer result to GP registers.
309 getf.sig ret0 = f10
310 br.ret.sptk rp
312 .endp __udivdi3
313 #endif
315 #ifdef L__umoddi3
316 // Compute a 64-bit unsigned integer modulus.
318 // From the Intel IA-64 Optimization Guide, choose the minimum latency
319 // alternative.
321 // in0 holds the dividend (a). in1 holds the divisor (b).
323 .text
324 .align 16
325 .global __umoddi3
326 .proc __umoddi3
327 __umoddi3:
328 .regstk 2,0,0,0
329 // Transfer inputs to FP registers.
// f14 keeps the original dividend a for the final xma below.
330 setf.sig f14 = in0
331 setf.sig f9 = in1
332 // Check divide by zero.
333 cmp.ne.unc p0,p7=0,in1
335 // Convert the inputs to FP, to avoid FP software assist faults.
336 fcvt.xuf.s1 f8 = f14
337 fcvt.xuf.s1 f9 = f9
// NOTE(review): stray ';' after the break -- harmless (statement
// separator) but inconsistent with the other sections' "break 1".
338 (p7) break 1;
340 // Compute the reciprocal approximation.
341 frcpa.s1 f10, p6 = f8, f9
343 // 3 Newton-Raphson iterations.
344 (p6) fmpy.s1 f12 = f8, f10
345 (p6) fnma.s1 f11 = f9, f10, f1
347 (p6) fma.s1 f12 = f11, f12, f12
348 (p6) fmpy.s1 f13 = f11, f11
350 (p6) fma.s1 f10 = f11, f10, f10
351 (p6) fma.s1 f11 = f13, f12, f12
// in1 = -b, computed in the integer unit in parallel and reloaded into
// f9 for the fixed-point multiply-add that forms the remainder.
353 sub in1 = r0, in1
354 (p6) fma.s1 f10 = f13, f10, f10
355 (p6) fnma.s1 f12 = f9, f11, f8
357 setf.sig f9 = in1
358 (p6) fma.s1 f10 = f12, f10, f11
360 // Round quotient to an unsigned integer.
361 fcvt.fxu.trunc.s1 f10 = f10
363 // r = q * (-b) + a
364 xma.l f10 = f10, f9, f14
366 // Transfer result to GP registers.
367 getf.sig ret0 = f10
368 br.ret.sptk rp
370 .endp __umoddi3
371 #endif
373 #ifdef L__divsi3
374 // Compute a 32-bit integer quotient.
376 // From the Intel IA-64 Optimization Guide, choose the minimum latency
377 // alternative.
379 // in0 holds the dividend. in1 holds the divisor.
381 .text
382 .align 16
383 .global __divsi3
384 .proc __divsi3
385 __divsi3:
386 .regstk 2,0,0,0
387 // Check divide by zero.
// p7 = (in1 == 0), tested before the sign extensions clobber in1's form.
388 cmp.ne.unc p0,p7=0,in1
389 sxt4 in0 = in0
390 sxt4 in1 = in1
392 setf.sig f8 = in0
393 setf.sig f9 = in1
394 (p7) break 1
// 0x0ffdd is a biased FP exponent (register-format bias 0xffff), so
// setf.exp builds f11 = 2**-34: a fudge term added to the correction so
// a single refinement step yields a correctly truncated 32-bit quotient
// (per Intel's published IA-64 division algorithms).
396 mov r2 = 0x0ffdd
397 fcvt.xf f8 = f8
398 fcvt.xf f9 = f9
400 setf.exp f11 = r2
401 frcpa.s1 f10, p6 = f8, f9
// One Newton-Raphson style refinement (enough for 32-bit operands).
403 (p6) fmpy.s1 f8 = f8, f10
404 (p6) fnma.s1 f9 = f9, f10, f1
406 (p6) fma.s1 f8 = f9, f8, f8
407 (p6) fma.s1 f9 = f9, f9, f11
409 (p6) fma.s1 f10 = f9, f8, f8
// Truncate toward zero (C semantics) and move back to the integer side.
411 fcvt.fx.trunc.s1 f10 = f10
413 getf.sig ret0 = f10
414 br.ret.sptk rp
416 .endp __divsi3
417 #endif
419 #ifdef L__modsi3
420 // Compute a 32-bit integer modulus.
422 // From the Intel IA-64 Optimization Guide, choose the minimum latency
423 // alternative.
425 // in0 holds the dividend. in1 holds the divisor.
427 .text
428 .align 16
429 .global __modsi3
430 .proc __modsi3
431 __modsi3:
432 .regstk 2,0,0,0
// 0x0ffdd: biased exponent for f11 = 2**-34 (see __divsi3) -- fudge term
// that makes one refinement step sufficient for 32-bit operands.
433 mov r2 = 0x0ffdd
434 sxt4 in0 = in0
435 sxt4 in1 = in1
// r32/r33 are the raw stacked-register names for in0/in1; f13 keeps the
// integer dividend a for the final xma.
437 setf.sig f13 = r32
438 setf.sig f9 = r33
439 // Check divide by zero.
440 cmp.ne.unc p0,p7=0,in1
// in1 = -b for the remainder computation r = q * (-b) + a below.
442 sub in1 = r0, in1
443 fcvt.xf f8 = f13
444 fcvt.xf f9 = f9
446 setf.exp f11 = r2
447 frcpa.s1 f10, p6 = f8, f9
448 (p7) break 1
450 (p6) fmpy.s1 f12 = f8, f10
451 (p6) fnma.s1 f10 = f9, f10, f1
453 setf.sig f9 = in1
454 (p6) fma.s1 f12 = f10, f12, f12
455 (p6) fma.s1 f10 = f10, f10, f11
457 (p6) fma.s1 f10 = f10, f12, f12
// Truncate quotient toward zero, then r = q * (-b) + a.
459 fcvt.fx.trunc.s1 f10 = f10
461 xma.l f10 = f10, f9, f13
463 getf.sig ret0 = f10
464 br.ret.sptk rp
466 .endp __modsi3
467 #endif
469 #ifdef L__udivsi3
470 // Compute a 32-bit unsigned integer quotient.
472 // From the Intel IA-64 Optimization Guide, choose the minimum latency
473 // alternative.
475 // in0 holds the dividend. in1 holds the divisor.
477 .text
478 .align 16
479 .global __udivsi3
480 .proc __udivsi3
481 __udivsi3:
482 .regstk 2,0,0,0
// 0x0ffdd: biased exponent for f11 = 2**-34 (see __divsi3) -- fudge term
// so a single refinement step yields an exact 32-bit quotient.
483 mov r2 = 0x0ffdd
// zxt4 (zero-extend) instead of sxt4: unsigned operands.
484 zxt4 in0 = in0
485 zxt4 in1 = in1
487 setf.sig f8 = in0
488 setf.sig f9 = in1
489 // Check divide by zero.
490 cmp.ne.unc p0,p7=0,in1
492 fcvt.xf f8 = f8
493 fcvt.xf f9 = f9
494 (p7) break 1
496 setf.exp f11 = r2
497 frcpa.s1 f10, p6 = f8, f9
// One Newton-Raphson style refinement, as in __divsi3.
499 (p6) fmpy.s1 f8 = f8, f10
500 (p6) fnma.s1 f9 = f9, f10, f1
502 (p6) fma.s1 f8 = f9, f8, f8
503 (p6) fma.s1 f9 = f9, f9, f11
505 (p6) fma.s1 f10 = f9, f8, f8
// Unsigned truncation, then back to the integer side.
507 fcvt.fxu.trunc.s1 f10 = f10
509 getf.sig ret0 = f10
510 br.ret.sptk rp
512 .endp __udivsi3
513 #endif
515 #ifdef L__umodsi3
516 // Compute a 32-bit unsigned integer modulus.
518 // From the Intel IA-64 Optimization Guide, choose the minimum latency
519 // alternative.
521 // in0 holds the dividend. in1 holds the divisor.
523 .text
524 .align 16
525 .global __umodsi3
526 .proc __umodsi3
527 __umodsi3:
528 .regstk 2,0,0,0
// 0x0ffdd: biased exponent for f11 = 2**-34 (see __divsi3).
529 mov r2 = 0x0ffdd
530 zxt4 in0 = in0
531 zxt4 in1 = in1
// f13 keeps the integer dividend a for the final xma.
533 setf.sig f13 = in0
534 setf.sig f9 = in1
535 // Check divide by zero.
536 cmp.ne.unc p0,p7=0,in1
// in1 = -b for the remainder computation r = q * (-b) + a below.
538 sub in1 = r0, in1
539 fcvt.xf f8 = f13
540 fcvt.xf f9 = f9
542 setf.exp f11 = r2
543 frcpa.s1 f10, p6 = f8, f9
// NOTE(review): stray ';' after the break -- harmless but inconsistent
// with the other sections' "break 1".
544 (p7) break 1;
546 (p6) fmpy.s1 f12 = f8, f10
547 (p6) fnma.s1 f10 = f9, f10, f1
549 setf.sig f9 = in1
550 (p6) fma.s1 f12 = f10, f12, f12
551 (p6) fma.s1 f10 = f10, f10, f11
553 (p6) fma.s1 f10 = f10, f12, f12
555 fcvt.fxu.trunc.s1 f10 = f10
557 xma.l f10 = f10, f9, f13
559 getf.sig ret0 = f10
560 br.ret.sptk rp
562 .endp __umodsi3
563 #endif
565 #ifdef L__save_stack_nonlocal
566 // Notes on save/restore stack nonlocal: We read ar.bsp but write
567 // ar.bspstore. This is because ar.bsp can be read at all times
568 // (independent of the RSE mode) but since it's read-only we need to
569 // restore the value via ar.bspstore. This is OK because
570 // ar.bsp==ar.bspstore after executing "flushrs".
572 // void __ia64_save_stack_nonlocal(void *save_area, void *stack_pointer)
574 .text
575 .align 16
576 .global __ia64_save_stack_nonlocal
577 .proc __ia64_save_stack_nonlocal
578 __ia64_save_stack_nonlocal:
// Save-area layout written here (from the post-increments on in0 and
// r2 = in0+8): [0]=stack_pointer, [8]=ar.bsp, [16]=ar.rnat, [24]=ar.pfs.
579 { .mmf
580 alloc r18 = ar.pfs, 2, 0, 0, 0
581 mov r19 = ar.rsc
584 { .mmi
585 flushrs
586 st8 [in0] = in1, 24
// Mask 0x1c clears the rsc.mode bits (1:0) -> enforced-lazy RSE so
// ar.rnat can be read coherently; 0x3 is OR'ed back in before returning.
// NOTE(review): assumes the original mode was eager (0x3) -- the saved
// mode bits are discarded, matching the restore path's assumption.
587 and r19 = 0x1c, r19
590 { .mmi
591 st8 [in0] = r18, -16
592 mov ar.rsc = r19
593 or r19 = 0x3, r19
596 { .mmi
597 mov r16 = ar.bsp
598 mov r17 = ar.rnat
599 adds r2 = 8, in0
602 { .mmi
603 st8 [in0] = r16
604 st8 [r2] = r17
606 { .mib
607 mov ar.rsc = r19
608 br.ret.sptk.few rp
611 .endp __ia64_save_stack_nonlocal
612 #endif
614 #ifdef L__nonlocal_goto
615 // void __ia64_nonlocal_goto(void *target_label, void *save_area,
616 // void *static_chain);
// Restores from the save-area layout produced by
// __ia64_save_stack_nonlocal: [0]=sp, [8]=bsp, [16]=rnat, [24]=ar.pfs
// (in1 is post-incremented by 8 through the loads).
618 .text
619 .align 16
620 .global __ia64_nonlocal_goto
621 .proc __ia64_nonlocal_goto
622 __ia64_nonlocal_goto:
623 { .mmi
624 alloc r20 = ar.pfs, 3, 0, 0, 0
625 ld8 r12 = [in1], 8
// Retarget rp to the destination label so the final br.ret jumps there.
626 mov.ret.sptk rp = in0, .L0
629 { .mmf
630 ld8 r16 = [in1], 8
631 mov r19 = ar.rsc
634 { .mmi
635 flushrs
636 ld8 r17 = [in1], 8
// Enforced-lazy RSE mode (clear rsc.mode bits 1:0) while rewriting
// ar.bspstore/ar.rnat; eager mode (0x3) is restored at .L0.
637 and r19 = 0x1c, r19
640 { .mmi
641 ld8 r18 = [in1]
642 mov ar.rsc = r19
643 or r19 = 0x3, r19
646 { .mmi
647 mov ar.bspstore = r16
649 mov ar.rnat = r17
652 { .mmi
653 loadrs
654 invala
// r15 receives the static chain (third argument) for the target frame.
655 mov r15 = in2
658 .L0: { .mib
659 mov ar.rsc = r19
660 mov ar.pfs = r18
661 br.ret.sptk.few rp
664 .endp __ia64_nonlocal_goto
665 #endif
667 #ifdef L__restore_stack_nonlocal
668 // This is mostly the same as nonlocal_goto above.
669 // ??? This has not been tested yet.
671 // void __ia64_restore_stack_nonlocal(void *save_area)
// Same save-area layout as __ia64_save_stack_nonlocal:
// [0]=sp, [8]=bsp, [16]=rnat, [24]=ar.pfs; unlike nonlocal_goto, rp is
// left alone so control returns to the caller normally.
673 .text
674 .align 16
675 .global __ia64_restore_stack_nonlocal
676 .proc __ia64_restore_stack_nonlocal
677 __ia64_restore_stack_nonlocal:
678 { .mmf
// NOTE(review): alloc declares 4 inputs though the signature has one --
// harmless over-allocation, kept parallel with nonlocal_goto.
679 alloc r20 = ar.pfs, 4, 0, 0, 0
680 ld8 r12 = [in0], 8
683 { .mmb
684 ld8 r16=[in0], 8
685 mov r19 = ar.rsc
688 { .mmi
689 flushrs
690 ld8 r17 = [in0], 8
// Enforced-lazy RSE (clear rsc.mode bits 1:0) while rewriting
// ar.bspstore/ar.rnat; eager mode (0x3) restored at .L0.
691 and r19 = 0x1c, r19
694 { .mmf
695 ld8 r18 = [in0]
696 mov ar.rsc = r19
699 { .mmi
700 mov ar.bspstore = r16
702 mov ar.rnat = r17
703 or r19 = 0x3, r19
706 { .mmf
707 loadrs
708 invala
711 .L0: { .mib
712 mov ar.rsc = r19
713 mov ar.pfs = r18
714 br.ret.sptk.few rp
717 .endp __ia64_restore_stack_nonlocal
718 #endif
720 #ifdef L__trampoline
721 // Implement the nested function trampoline. This is out of line
722 // so that we don't have to bother with flushing the icache, as
723 // well as making the on-stack trampoline smaller.
725 // The trampoline has the following form:
727 // +-------------------+ >
728 // TRAMP: | __ia64_trampoline | |
729 // +-------------------+ > fake function descriptor
730 // | TRAMP+16 | |
731 // +-------------------+ >
732 // | target descriptor |
733 // +-------------------+
734 // | static link |
735 // +-------------------+
737 .text
738 .align 16
739 .global __ia64_trampoline
740 .proc __ia64_trampoline
741 __ia64_trampoline:
// Entered through the fake descriptor above, so gp (r1) = TRAMP+16,
// pointing at the target-descriptor word followed by the static link.
742 { .mmi
// r2 = address of the target's real function descriptor.
743 ld8 r2 = [r1], 8
// r15 = static link (the static-chain register for the nested function).
745 ld8 r15 = [r1]
747 { .mmi
// Unpack the target descriptor: entry point into b6, target gp into r1.
748 ld8 r3 = [r2], 8
750 ld8 r1 = [r2]
751 mov b6 = r3
753 { .bbb
// Tail-branch: the target returns directly to the original caller.
754 br.sptk.many b6
757 .endp __ia64_trampoline
758 #endif
760 // Thunks for backward compatibility.
761 #ifdef L_fixtfdi
// Backward-compatibility thunk: old objects reference __fixtfti from
// when TFmode was the 80-bit extended type; forward (tail-branch, same
// register ABI) to the XFmode implementation __fixxfti.
// NOTE(review): guard macro says "di" while the symbol is "ti" --
// presumably a historical naming quirk kept for the build machinery.
762 .text
763 .align 16
764 .global __fixtfti
765 .proc __fixtfti
766 __fixtfti:
767 { .bbb
768 br.sptk.many __fixxfti
771 .endp __fixtfti
772 #endif
773 #ifdef L_fixunstfdi
// Backward-compatibility thunk: tail-branch __fixunstfti to the XFmode
// implementation __fixunsxfti (same register ABI, no frame needed).
// NOTE(review): no .text directive here, unlike the sibling thunks --
// relies on .text being the default section.
774 .align 16
775 .global __fixunstfti
776 .proc __fixunstfti
777 __fixunstfti:
778 { .bbb
779 br.sptk.many __fixunsxfti
782 .endp __fixunstfti
783 #endif
// Backward-compatibility thunk: tail-branch __floattitf to the XFmode
// implementation __floattixf (same register ABI, no frame needed).
// Fixed: use #ifdef like every other L_* section -- "#if L_floatditf"
// only worked because -DL_floatditf defines the macro to 1, and breaks
// if the macro is defined empty.
784 #ifdef L_floatditf
785 .align 16
786 .global __floattitf
787 .proc __floattitf
788 __floattitf:
789 { .bbb
790 br.sptk.many __floattixf
793 .endp __floattitf
794 #endif