source/libs/gmp/gmp-src/mpn/ia64/mod_34lsub1.asm

   1 dnl  IA-64 mpn_mod_34lsub1
   2
   3 dnl  Contributed to the GNU project by Torbjorn Granlund.
   4
   5 dnl  Copyright 2003-2005, 2010 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35 C           cycles/limb
  36 C Itanium:      ?
  37 C Itanium 2:    1
  38
  39
  40 C INPUT PARAMETERS
  41 define(`up', `r32')          C address of the next limb to load, post-incremented by 8 per load
  42 define(`n',  `r33')          C limb count; counted down as limbs are loaded
  43
  44 C Some useful aliases for registers we use
     C   u0-u2  limbs loaded from memory, waiting to be subtracted
     C   a0-a2  accumulators; the limb at index i is subtracted from a(i mod 3)
     C   c0-c2  counters for the borrows produced by those subtractions
     C r14 and r15 (u0 and u1) are reused as temporaries by the final reduction.
  45 define(`u0',`r14') define(`u1',`r15') define(`u2',`r16')
  46 define(`a0',`r17') define(`a1',`r18') define(`a2',`r19')
  47 define(`c0',`r20') define(`c1',`r21') define(`c2',`r22')
  48
  49 C This is a fairly simple-minded implementation.  One could approach 0.67 c/l
  50 C with a more sophisticated implementation.  If we're really crazy, we could
  51 C super-unroll, storing carries just in predicate registers, then copy them to
  52 C a general register, and population count them from there.  That'd bring us
  53 C close to 3 insn/limb, for nearly 0.5 c/l.
  54
  55 C Computing n/3 needs 16 cycles, which is a lot of startup overhead.
  56 C We therefore use a plain while-style loop:
  57 C       add             n = -3, n
  58 C       cmp.le          p9, p0 = 3, n
  59 C  (p9) br.cond         .Loop
  60 C Alternatively, we could table n/3 for, say, n < 256, and predicate the
  61 C 16-cycle code.
  62
  63 C The summing-up code at the end was written quickly, and could surely be
  64 C vastly improved.
  65
  66 ASM_START()
     C ------------------------------------------------------------------------
     C mpn_mod_34lsub1 (up, n)
     C
     C Folds the n 64-bit limbs starting at up into one value, returned in r8,
     C that is congruent to the operand modulo 2^48-1.  The result is not fully
     C reduced.
     C
     C In:   up (r32) = address of the first limb
     C       n  (r33) = number of limbs.  The first limb is loaded
     C                  unconditionally and only n == 1 is special-cased, so
     C                  n >= 1 appears to be required -- TODO confirm with callers.
     C Out:  r8
     C Writes: r8, r10, r11, r14-r22, r24-r31, r32, r33, p6-p11.
     C
     C Method: the limb at index i is subtracted from a(i mod 3).  A borrow out
     C of a0 is counted in c1, out of a1 in c2, and out of a2 in c0; the last
     C one wraps around because 2^192 == 1 (mod 2^48-1).  With
     C       A = a0 + a1*2^64 + a2*2^128,  C = c0 + c1*2^64 + c2*2^128
     C the operand is congruent to C - A, which the code at L(com) evaluates
     C on 48-bit pieces.
     C ------------------------------------------------------------------------
  67 PROLOGUE(mpn_mod_34lsub1)
  68         .prologue
     C NOTE(review): no active instruction copies ar.lc to r2 or modifies ar.lc;
     C only the disabled ifelse block below writes ar.lc.  This .save looks like
     C a leftover of that variant -- confirm before changing it.
  69         .save   ar.lc, r2
  70         .body
     C 32-bit ABI only: convert the 32-bit pointer (addp4) and zero-extend the
     C 32-bit count (zxt4).
  71 ifdef(`HAVE_ABI_32',`
  72         addp4           up = 0, up              C                       M I
  73         nop.m           0
  74         zxt4            n = n                   C                       I
  75         ;;
  76 ')
  77
     C Disabled code (the ifelse condition 0 == 1 is never true).  It derives a
     C loop count from n with a multiply by 0xAAAAAAAAAAAAAAAB and a shift, and
     C puts it in ar.lc for the br.cloop form that is commented out in the loop.
  78 ifelse(0,1,`
  79         movl            r14 = 0xAAAAAAAAAAAAAAAB
  80         ;;
  81         setf.sig        f6 = r14
  82         setf.sig        f7 = r33
  83         ;;
  84         xmpy.hu         f6 = f6, f7
  85         ;;
  86         getf.sig        r8 = f6
  87         ;;
  88         shr.u           r8 = r8, 1              C Loop count
  89         ;;
  90         mov.i           ar.lc = r8
  91 ')
  92
     C Load the first limb.  n == 1 is finished right here without the loop.
  93         ld8     u0 = [up], 8
  94         cmp.ne  p9, p0 = 1, n
  95   (p9)  br      L(gt1)
  96         ;;
  97         shr.u   r8 = u0, 48                     C high 16 bits of the limb
  98         dep.z   r27 = u0, 0, 48                 C low 48 bits of the limb
  99         ;;
 100         add     r8 = r8, r27                    C 2^48 == 1, so just add the two parts
 101         br.ret.sptk.many b0
 102
 103
     C n != 1: clear the accumulators and borrow counters and load the second
     C limb.  From here on n is the number of limbs not yet loaded; u0 and u1
     C are loaded but not yet subtracted.
 104 L(gt1):
 105  {.mmi; nop.m   0
 106         mov     a0 = 0
 107         add     n = -2, n
 108 }{.mmi; mov     c0 = 0
 109         mov     c1 = 0
 110         mov     c2 = 0
 111         ;;
 112 }{.mmi; ld8     u1 = [up], 8
 113         mov     a1 = 0
 114         cmp.ltu p6, p0 = r0, r0         C clear p6
 115 }{.mmb; cmp.gt  p9, p0 = 3, n           C fewer than 3 limbs left to load: skip the loop
 116         mov     a2 = 0
 117   (p9)  br.cond.dptk    L(end)
 118         ;;
 119 }
 120         ALIGN(32)
     C Main loop, three limbs per iteration.  Each of the three steps counts the
     C borrow found by the previous step, tests for a new borrow (a < u, compare
     C placed ahead of the subtraction) and subtracts.  The loads refill u2, u0
     C and u1 for the steps that follow.
 121 L(top):
 122  {.mmi; ld8     u2 = [up], 8
 123   (p6)  add     c0 = 1, c0              C borrow out of a2 from the previous iteration
 124         cmp.ltu p7, p0 = a0, u0
 125 }{.mmb; sub     a0 = a0, u0
 126         add     n = -3, n               C this iteration loads three limbs
 127         nop.b   0
 128         ;;
 129 }{.mmi; ld8     u0 = [up], 8
 130   (p7)  add     c1 = 1, c1              C borrow out of a0
 131         cmp.ltu p8, p0 = a1, u1
 132 }{.mmb; sub     a1 = a1, u1
 133         cmp.le  p9, p0 = 3, n           C enough limbs left for another iteration?
 134         nop.b   0
 135         ;;
 136 }{.mmi; ld8     u1 = [up], 8
 137   (p8)  add     c2 = 1, c2              C borrow out of a1
 138         cmp.ltu p6, p0 = a2, u2         C borrow out of a2, counted later
 139 }{.mmb; sub     a2 = a2, u2
 140         nop.m   0
 141 dnl     br.cloop.dptk   L(top)
 142   (p9)  br.cond.dptk    L(top)
 143         ;;
 144 }
     C Loop exit.  u0 and u1 hold limbs not yet subtracted and p6 holds a borrow
     C not yet counted.  For an entry count of at least 2, n is now 0, 1 or 2
     C limbs still to be loaded; dispatch on it.
 145 L(end):
 146         cmp.eq  p10, p0 = 0, n
 147         cmp.eq  p11, p0 = 1, n
 148   (p10) br      L(0)
 149
     C n is 1 or 2: load one more limb and finish u0.  n == 1 continues at L(1).
 150 L(2):
 151  {.mmi; ld8     u2 = [up], 8
 152   (p6)  add     c0 = 1, c0
 153         cmp.ltu p7, p0 = a0, u0
 154 }{.mmb; sub     a0 = a0, u0
 155         nop.m   0
 156   (p11) br      L(1)
 157         ;;
 158 }       ld8     u0 = [up], 8            C n == 2: load the last limb
 159   (p7)  add     c1 = 1, c1
 160         cmp.ltu p8, p0 = a1, u1
 161         sub     a1 = a1, u1
 162         ;;
 163   (p8)  add     c2 = 1, c2
 164         cmp.ltu p6, p0 = a2, u2
 165         sub     a2 = a2, u2
 166         ;;
 167   (p6)  add     c0 = 1, c0
 168         cmp.ltu p7, p0 = a0, u0         C the last limb goes into a0 again
 169         sub     a0 = a0, u0
 170         ;;
 171   (p7)  add     c1 = 1, c1
 172         br      L(com)
 173
 174
     C n == 1: u0 was already subtracted at L(2); finish u1 and u2.
 175 L(1):
 176   (p7)  add     c1 = 1, c1
 177         cmp.ltu p8, p0 = a1, u1
 178         sub     a1 = a1, u1
 179         ;;
 180   (p8)  add     c2 = 1, c2
 181         cmp.ltu p6, p0 = a2, u2
 182         sub     a2 = a2, u2
 183         ;;
 184   (p6)  add     c0 = 1, c0
 185         br      L(com)
 186
 187
     C n == 0: nothing left to load; finish the pending borrow, u0 and u1, then
     C fall through to L(com).
 188 L(0):
 189   (p6)  add     c0 = 1, c0
 190         cmp.ltu p7, p0 = a0, u0
 191         sub     a0 = a0, u0
 192         ;;
 193   (p7)  add     c1 = 1, c1
 194         cmp.ltu p8, p0 = a1, u1
 195         sub     a1 = a1, u1
 196         ;;
 197   (p8)  add     c2 = 1, c2
 198
     C Final reduction.  a0, a1, a2 (and likewise c0, c1, c2) have weights 1,
     C 2^64 == 2^16 and 2^128 == 2^32 modulo 2^48-1.  Each register is cut where
     C it crosses a 48-bit boundary (see the diagram): the part above the cut is
     C shifted down, the part below is deposited at its offset inside a 48-bit
     C field.  r24 collects the six a pieces, r10 the six c pieces; r14 and r15
     C are free for reuse here because u0 and u1 are no longer needed.
 199 L(com):
 200 C |     a2    |     a1    |     a0    |
 201 C |        |        |        |        |
 202         shr.u   r24 = a0, 48            C 16 bits
 203         shr.u   r25 = a1, 32            C 32 bits
 204         shr.u   r26 = a2, 16            C 48 bits
 205         ;;
 206         shr.u   r10 = c0, 48            C 16 bits, always zero
 207         shr.u   r11 = c1, 32            C 32 bits
 208         shr.u   r30 = c2, 16            C 48 bits
 209         ;;
 210         dep.z   r27 = a0,  0, 48        C 48 bits
 211         dep.z   r28 = a1, 16, 32        C 48 bits
 212         dep.z   r29 = a2, 32, 16        C 48 bits
 213         dep.z   r31 = c0,  0, 48        C 48 bits
 214         dep.z   r14 = c1, 16, 32        C 48 bits
 215         dep.z   r15 = c2, 32, 16        C 48 bits
 216         ;;
 217  {.mmi; add     r24 = r24, r25
 218         add     r26 = r26, r27
 219         add     r28 = r28, r29
 220 }{.mmi; add     r10 = r10, r11
 221         add     r30 = r30, r31
 222         add     r14 = r14, r15
 223         ;;
 224 }
 225         movl    r8 = 0xffffffffffff0    C 16 * (2^48-1): a multiple of the modulus, larger than r24 can get
 226         add     r24 = r24, r26
 227         add     r10 = r10, r30
 228         ;;
 229         add     r24 = r24, r28          C r24 = folded A (sum of six pieces, each below 2^48)
 230         add     r10 = r10, r14          C r10 = folded C
 231         ;;
 232         sub     r8 = r8, r24            C stays non-negative thanks to the bias
 233         ;;
 234         add     r8 = r8, r10            C r8 = bias - A + C, congruent to the operand
 235         br.ret.sptk.many b0
 236 EPILOGUE()
 237 ASM_END()