source/libs/gmp/gmp-src/mpn/ia64/sqr_diag_addlsh1.asm

   1 dnl  IA-64 mpn_sqr_diag_addlsh1
   2
   3 dnl  Contributed to the GNU project by Torbjorn Granlund.
   4
   5 dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35 C           cycles/limb
  36 C Itanium:      ?
  37 C Itanium 2:    2       Unrolling could bring it to 1.5 + epsilon
  38
  39 C Exact performance table.  The 2nd line is this code, the 3rd line is ctop-
  40 C less code.  In an assembly sqr_basecase, the ctop-full numbers will become a
  41 C few cycles better since we can mitigate the many I0 instructions.
  42 C
  43 C 1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20
  44 C -  20  22  24  26  28  30  32  34  36  38  40  42  44  46  48  50  52  54  56 Needs updating
  45 C -  13  16  17  18  20  21  23  25  26  30  31  31  33  34  36  38  39  42  43
  46
  47 C We should keep in mind that this code takes linear time in a O(n^2) context
  48 C and that it will only be used under SQR_TOOM2_THRESHOLD, which might become
  49 C around 60.  Keeping overhead down for smallish operands (< 10) is more
  50 C important than optimal cycle counts.
  51
  52 C TODO
  53 C  * Make sure we don't depend on uninitialised r-registers, f-registers, or
  54 C    p-registers.
  55 C  * Optimise by doing first two loop iterations in function header.
  56
  57 C INPUT PARAMETERS
     C Each pointer argument has two names: X_param is the stacked register it
     C arrives in (r32-r34) and X is a static register (r14, r15, r31).  The
     C function body copies X_param to X before its loop, because r32-r34 lie
     C inside the rotating register area that the loop overwrites.  The limb
     C count n is read directly from r35.
  58 define(`rp_param', `r32')  define(`rp', `r14')          C size: 2n
  59 define(`tp_param', `r33')  define(`tp', `r15')          C size: 2n - 2
  60 define(`up_param', `r34')  define(`up', `r31')          C size: n
  61 define(`n',  `r35')
  62
     C ABI64(x) and ABI32(x) select per-ABI instructions: when HAVE_ABI_32 is
     C defined, ABI32 expands to its argument and ABI64 expands to nothing;
     C otherwise it is the other way round.
  63 ifdef(`HAVE_ABI_32',`
  64         define(`ABI64', `')
  65         define(`ABI32', `$1')
  66 ',`
  67         define(`ABI64', `$1')
  68         define(`ABI32', `')
  69 ')
  70
  71 ASM_START()
     C mpn_sqr_diag_addlsh1 (rp, tp, up, n)
     C
     C Software-pipelined loop built on rotating registers and predicates.  With
     C all stages active, one trip through the loop loads one limb from up and
     C two limbs from tp, and stores two limbs to rp.  The up limb is squared
     C with xma.l/xma.hu; the tp limbs are shifted left by one bit across limb
     C boundaries (shrp by 63) and added to the product, with carries kept in
     C predicates.  The code after the loop stores the last three limbs.
     C
     C Registers outside the rotating area:
     C   r2   saved ar.pfs        r3   saved ar.lc
     C   r9   saved ar.ec         r10  saved pr
     C   r14  rp    r15  tp    r31  up    (copies of incoming r32-r34)
     C   r16  shifted tp limb passed to the FP unit with setfsig
     C   r20  n - 2, the initial loop count
     C   p6   carry into the final limb (used after the loop only)
     C
     C NOTE(review): ar.lc is loaded with n - 2 and two tp limbs are read
     C unconditionally before the loop, so n >= 2 is presumably required.
     C Confirm against the callers.
  72 PROLOGUE(mpn_sqr_diag_addlsh1)
  73
  74         .prologue
  75         .save   ar.pfs, r2
  76         .save   ar.lc, r3
  77         .body
  78
     C Allocate 4 input + 24 local registers with 24 of them rotating (r32-r55).
     C Keep the caller ar.pfs and ar.lc for the exit path.  Under the 32-bit ABI
     C the count n is zero-extended from 32 bits.
  79  {.mii;         alloc   r2 = ar.pfs, 4,24,0,24  C                       M
  80                 mov     r3 = ar.lc              C                       I0
  81         ABI64(` nop     4711            ')
  82         ABI32(` zxt4    n = n           ')
     C Move the three pointers to static registers; r32 and r33 are overwritten
     C below.  The 32-bit ABI uses addp4 on the incoming pointers instead of mov.
  83 }{.mmi; ABI64(` mov     tp = tp_param   ')      C                       M I
  84         ABI32(` addp4   tp = 0, tp_param')      C                       M I
  85         ABI64(` mov     up = up_param   ')      C                       M I
  86         ABI32(` addp4   up = 0, up_param')      C                       M I
  87         ABI64(` mov     rp = rp_param   ')      C                       M I
  88         ABI32(` addp4   rp = 0, rp_param')      C                       M I
  89         ;;
     C Preload tp[0] into r36, compute the loop count n - 2, save ar.ec.
  90 }{.mmi;         ld8     r36 = [tp], 8           C                       M
  91                 add     r20 = -2, n             C                       M I
  92                 mov     r9 = ar.ec              C                       I0
  93         ;;
     C Preload tp[1] into r32.  r16 starts at zero, so the first setfsig in the
     C loop feeds a zero addend.  Epilogue count is 7.
  94 }{.mmi;         ld8     r32 = [tp], 8           C                       M
  95                 mov     r16 = 0                 C                       M I
  96                 mov     ar.ec = 7               C                       I0
  97         ;;
  98 }{.mmi;         nop     4711
  99                 mov     r44 = 0                 C                       M I
 100                 mov     ar.lc = r20             C                       I0
 101         ;;
     C Save the caller predicates in r10.  pr.rot = 0x30000 sets p16 and p17 and
     C clears the remaining rotating predicates p18-p63.
 102 }{.mii;         mov     r33 = 0
 103                 mov     r10 = pr                C                       I0
 104                 mov     pr.rot = 0x30000        C                       I0
 105         ;;
     C Counted-loop exit branch placed ahead of the loop.  It updates ar.lc or
     C ar.ec and rotates the registers in the same way as br.ctop, and branches
     C to L(end) only when ar.lc is 0 and ar.ec is at most 1.  With ar.ec = 7
     C set above that condition does not hold here, so control falls through
     C into L(top) after one rotation.
 106 }               br.cexit.spnt.few.clr   L(end)
 107
     C Pipeline stages, by qualifying predicate.  A value written to rN, fN or
     C pN in one trip is seen as rN+1, fN+1 or pN+1 after the next rotation.
     C   p16  load two consecutive tp limbs into r36 and r32
     C   p17  r16 = (r33 << 1) | (r37 >> 63)
     C   p18  load one up limb into f33
     C   p19  r45 = (r38 << 1) | (r35 >> 63)
     C   p20  f36 and f38 = low and high 64 bits of f35*f35 + f42
     C   p21  move the product halves to r42 (low) and r39 (high)
     C   p23  r50 = r41 + r49 with carry out in p40; store low limb r44
     C   p24  store high limb r51
     C   p41  carry in from the previous high limb: r44 += 1, and p50 is set
     C        when r44 was all ones before the increment
     C   p50  carry continues into the high limb: r50 += 1, and p40 is also
     C        set when r50 was all ones before the increment
     C The setfsig of r16 into f40 is unpredicated and runs on every trip.
     C NOTE(review): cmpequc, cmpltu and cmpeqor are not defined in this file;
     C presumably they are m4 names for cmp.eq.unc, cmp.ltu and cmp.eq.or.
 108 dnl *** MAIN LOOP START ***
 109         ALIGN(32)
 110 L(top):
 111  {.mfi; (p18)   ldf8    f33 = [up], 8           C                       M
 112         (p20)   xma.l   f36 = f35, f35, f42     C                       F
 113         (p41)   cmpequc p50, p0 = -1, r44       C                       M I
 114 }{.mfi;         setfsig f40 = r16               C                       M23
 115         (p20)   xma.hu  f38 = f35, f35, f42     C                       F
 116         (p23)   add     r50 = r41, r49          C                       M I
 117         ;;
     C The sum r50 is below the addend r41 exactly when the add above wrapped.
 118 }{.mmi; (p16)   ld8     r36 = [tp], 8           C                       M
 119         (p23)   cmpltu  p40, p0 = r50, r41      C cyout hi              M I
 120         (p19)   shrp    r45 = r38, r35, 63      C non-critical          I0
 121 }{.mmi; (p21)   getfsig r39 = f39               C hi                    M2
 122         (p24)   st8     [rp] = r51, 8           C hi                    M23
 123         (p41)   add     r44 = 1, r44            C                       M I
 124         ;;
     C cmpeqor and the add of 1 to r50 share an instruction group, so the
     C compare reads r50 before it is incremented.
 125 }{.mmi; (p16)   ld8     r32 = [tp], 8           C                       M
 126         (p50)   cmpeqor p40, p0 = -1, r50       C cyout hi              M I
 127         (p17)   shrp    r16 = r33, r37, 63      C critical              I0
 128 }{.mmi; (p21)   getfsig r42 = f37               C lo                    M2
 129         (p23)   st8     [rp] = r44, 8           C lo                    M23
 130         (p50)   add     r50 = 1, r50            C                       M I
 131         ;;
 132 }               br.ctop.sptk.few.clr L(top)     C                       B
 133 dnl *** MAIN LOOP END ***
 134         ;;
     C Wind-down: finish the limbs still held in registers.  Apply the pending
     C carry p41 to the low limb r44 and take bit 63 of r39 as the last shifted
     C addend.  NOTE(review): r39 presumably still holds the last tp limb here;
     C confirm by tracing the rotation.
 135 L(end):
 136  {.mmi;         nop     4711
 137         (p41)   add     r44 = 1, r44            C                       M I
 138                 shr.u   r48 = r39, 63           C                       I0
 139         ;;
     C Store the previous high limb.  If a carry was applied and r44 wrapped to
     C zero, p6 carries on into the top limb; p6 is cleared when p41 is not set,
     C assuming cmpequc is the unc compare form.
 140 }{.mmi;         st8     [rp] = r51, 8           C                       M23
 141         (p41)   cmpequc p6, p0 = 0, r44         C                       M I
 142                 add     r50 = r41, r48          C                       M I
 143         ;;
 144 }{.mmi;         st8     [rp] = r44, 8           C                       M23
 145         (p6)    add     r50 = 1, r50            C                       M I
 146                 mov     ar.lc = r3              C                       I0
 147         ;;
     C Final limb is stored without post-increment.  Restore ar.ec, the
     C predicates and ar.pfs saved on entry, then return.
 148 }{.mii;         st8     [rp] = r50              C                       M23
 149                 mov     ar.ec = r9              C                       I0
 150                 mov     pr = r10                C                       I0
 151         ;;
 152 }{.mib;         nop     4711
 153                 mov     ar.pfs = r2             C                       I0
 154                 br.ret.sptk.many b0             C                       B
 155 }
 156 EPILOGUE()