source/libs/gmp/gmp-src/mpn/arm/neon/lorrshift.asm

   1 dnl  ARM Neon mpn_lshift and mpn_rshift.
   2
   3 dnl  Contributed to the GNU project by Torbjörn Granlund.
   4
   5 dnl  Copyright 2013 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35 C            cycles/limb     cycles/limb     cycles/limb      good
  36 C              aligned        unaligned       best seen      for cpu?
  37 C StrongARM      -               -
  38 C XScale         -               -
  39 C Cortex-A7      ?               ?
  40 C Cortex-A8      ?               ?
  41 C Cortex-A9      3               3                              Y
  42 C Cortex-A15     1.5             1.5                            Y
  43
  44
  45 C We read 64 bits at a time at 32-bit aligned addresses, and except for the
  46 C first and last store, we write using 64-bit aligned addresses.  All shifting
  47 C is done on 64-bit words in 'extension' registers.
  48 C
  49 C It should be possible to read also using 64-bit alignment, by manipulating
  50 C the shift count for unaligned operands.  Not done, since it does not seem to
  51 C matter for A9 or A15.
  52 C
  53 C This will not work in big-endian mode.
  54
  55 C TODO
  56 C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
  57 C    which might make it tricky.
  58 C  * Clean up and simplify.
  59 C  * Consider sharing most of the code for lshift and rshift, since the feed-in code,
  60 C    the loop, and most of the wind-down code are identical.
  61 C  * Replace the basecase code with code using 'extension' registers.
  62 C  * Optimise.  It is not clear that this loop insn permutation is optimal for
  63 C    either A9 or A15.
  64
  65 C INPUT PARAMETERS
  66 define(`rp',  `r0')        C pointer the shifted limbs are stored through
  67 define(`ap',  `r1')        C pointer the source limbs are loaded through
  68 define(`n',   `r2')        C limb count; decremented as limbs are handled
  69 define(`cnt', `r3')        C shift count in bits
  70
     C Per-operation configuration.  Exactly one of OPERATION_lshift and
     C OPERATION_rshift is expected to be defined when this file is assembled.
     C   IFLSH(code)  expands to code in the lshift build, to nothing for rshift
     C   IFRSH(code)  expands to code in the rshift build, to nothing for lshift
     C   X            32-bit lane of d18 that is moved to r0 as the return value
     C   Y            32-bit lane used for the single-limb Neon loads and stores
     C   func         name of the function being defined
  71 ifdef(`OPERATION_lshift',`
  72         define(`IFLSH', `$1')
  73         define(`IFRSH', `')
  74         define(`X',`0')
  75         define(`Y',`1')
  76         define(`func',`mpn_lshift')
  77 ')
  78 ifdef(`OPERATION_rshift',`
  79         define(`IFLSH', `')
  80         define(`IFRSH', `$1')
  81         define(`X',`1')
  82         define(`Y',`0')
  83         define(`func',`mpn_rshift')
  84 ')
  85
  86 MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
  87
     C func is mpn_lshift or mpn_rshift, depending on whether OPERATION_lshift or
     C OPERATION_rshift is defined.  It takes rp, ap, n, cnt in r0-r3, stores the
     C shifted limbs through rp, and returns in r0 the bits shifted out of the
     C first limb processed.
     C
     C Operands with n <= 4 are handled by the scalar code at L(base).  Larger
     C operands use the Neon code, which works on 64-bit words in d0-d7 and
     C d16-d19.
     C
     C NOTE(review): presumably n >= 1 and 1 <= cnt <= 31 are required, as for
     C the other mpn shift routines -- confirm against the GMP documentation.
     C
     C Register use in the Neon code:
     C   d6        shift count for the main direction (cnt for lshift, -cnt for
     C             rshift; vshl treats a negative count as a right shift)
     C   d7        shift count for the opposite direction (cnt-64 for lshift,
     C             64-cnt for rshift)
     C   d18       return value, read from lane X on exit
     C   d16, d17, d19   64-bit words loaded from ap
     C   d4, d5    loaded words shifted by d6
     C   d0, d1    loaded words shifted by d7, i.e. the bits that cross over
     C             into the neighbouring 64-bit word
     C   d2, d3    combined result words that get stored
     C   r12       pointer step per 64-bit word, -8 for lshift and 8 for rshift
     C (d16 is also reused for the last shifted word in the wind-down code.)
  88 ASM_START()
  89         TEXT
  90         ALIGN(64)
  91 PROLOGUE(func)
     C lshift works from the most significant end: advance rp and ap by 4*n bytes.
  92 IFLSH(` mov     r12, n, lsl #2  ')
  93 IFLSH(` add     rp, rp, r12     ')
  94 IFLSH(` add     ap, ap, r12     ')
  95
  96         cmp     n, #4                   C SIMD code n limit
  97         ble     L(base)                 C n <= 4: use the scalar code
  98
  99 ifdef(`OPERATION_lshift',`
 100         vdup.32 d6, r3                  C left shift count is positive
 101         sub     r3, r3, #64             C right shift count is negative
 102         vdup.32 d7, r3
 103         mov     r12, #-8')              C lshift pointer update offset
 104 ifdef(`OPERATION_rshift',`
 105         rsb     r3, r3, #0              C right shift count is negative
 106         vdup.32 d6, r3
 107         add     r3, r3, #64             C left shift count is positive
 108         vdup.32 d7, r3
 109         mov     r12, #8')               C rshift pointer update offset
 110
 111 IFLSH(` sub     ap, ap, #8      ')      C point at the two topmost limbs
 112         vld1.32 {d19}, [ap], r12        C load initial 2 limbs
 113         vshl.u64 d18, d19, d7           C retval
 114
     C If rp is not 64-bit aligned, store one limb on its own so that all
     C following 64-bit stores are aligned, then reload d19 one limb further on.
 115         tst     rp, #4                  C is rp 64-bit aligned already?
 116         beq     L(rp_aligned)           C yes, skip
 117 IFLSH(` add     ap, ap, #4      ')      C move back ap pointer
 118 IFRSH(` sub     ap, ap, #4      ')      C move back ap pointer
 119         vshl.u64 d4, d19, d6            C shift the first 64-bit word
 120         sub     n, n, #1                C first limb handled
 121 IFLSH(` sub      rp, rp, #4     ')
 122         vst1.32  {d4[Y]}, [rp]IFRSH(!)  C store first limb, rp gets aligned
 123         vld1.32  {d19}, [ap], r12       C reload 2 limbs, offset by one limb (ap[1] and ap[2] for rshift)
 124
     C Here d19 holds the next two source limbs and n counts the limbs that
     C have not been stored yet (at least 4).
 125 L(rp_aligned):
 126 IFLSH(` sub     rp, rp, #8      ')      C point rp at the next 64-bit slot
 127         subs    n, n, #6
 128         blt     L(two_or_three_more)    C only 4 or 5 limbs left, no loop
 129         tst     n, #2                   C pick the loop entry from bit 1 of n-6
 130         beq     L(2)
 131
     C Feed-in that enters the loop halfway, at L(mid).
 132 L(1):   vld1.32  {d17}, [ap], r12
 133         vshl.u64 d5, d19, d6
 134         vld1.32  {d16}, [ap], r12
 135         vshl.u64 d0, d17, d7
 136         vshl.u64 d4, d17, d6
 137         sub     n, n, #2
 138         b        L(mid)
 139
     C Feed-in that enters the loop at L(top), or skips it if too few limbs remain.
 140 L(2):   vld1.32  {d16}, [ap], r12
 141         vshl.u64 d4, d19, d6
 142         vld1.32  {d17}, [ap], r12
 143         vshl.u64 d1, d16, d7
 144         vshl.u64 d5, d16, d6
 145         subs    n, n, #4
 146         blt     L(end)
 147
     C Main loop, 4 limbs (two aligned 64-bit stores) per iteration.  Each stored
     C word is one loaded word shifted by d6, ORed with the following loaded word
     C shifted by d7.
 148 L(top): vld1.32  {d16}, [ap], r12
 149         vorr     d2, d4, d1
 150         vshl.u64 d0, d17, d7
 151         vshl.u64 d4, d17, d6
 152         vst1.32  {d2}, [rp:64], r12
 153 L(mid): vld1.32  {d17}, [ap], r12
 154         vorr     d3, d5, d0
 155         vshl.u64 d1, d16, d7
 156         vshl.u64 d5, d16, d6
 157         vst1.32  {d3}, [rp:64], r12
 158         subs    n, n, #4
 159         bge     L(top)
 160
     C Wind-down.  Bit 0 of n tells whether one single limb is still to be loaded.
 161 L(end): tst      n, #1
 162         beq      L(evn)                 C even: no more limbs to load
 163
 164         vorr     d2, d4, d1
 165         vst1.32  {d2}, [rp:64], r12
 166         b        L(cj1)                 C share the single-limb tail with L(l3)
 167
 168 L(evn): vorr     d2, d4, d1
 169         vshl.u64 d0, d17, d7
 170         vshl.u64 d16, d17, d6           C last word, nothing to OR in
 171         vst1.32  {d2}, [rp:64], r12
 172         vorr     d2, d5, d0
 173         b        L(cj2)                 C share the two final stores with L(l2)
 174
 175 C Load last 2 - 3 limbs, store last 4 - 5 limbs
 176 L(two_or_three_more):
 177         tst     n, #1
 178         beq     L(l2)                   C 4 limbs left: load 2 more
 179
     C 5 limbs left: load 2 more limbs, then a final single limb.
 180 L(l3):  vshl.u64 d5, d19, d6
 181         vld1.32  {d17}, [ap], r12
 182 L(cj1): veor     d16, d16, d16           C clear d16, only lane Y is loaded
 183 IFLSH(` add      ap, ap, #4     ')      C only one limb remains below ap
 184         vld1.32  {d16[Y]}, [ap], r12     C load the final single limb
 185         vshl.u64 d0, d17, d7
 186         vshl.u64 d4, d17, d6
 187         vorr     d3, d5, d0
 188         vshl.u64 d1, d16, d7
 189         vshl.u64 d5, d16, d6
 190         vst1.32  {d3}, [rp:64], r12
 191         vorr     d2, d4, d1
 192         vst1.32  {d2}, [rp:64], r12
 193 IFLSH(` add      rp, rp, #4     ')      C final store is a single limb
 194         vst1.32  {d5[Y]}, [rp]           C store the final single limb
 195         vmov.32  r0, d18[X]              C return value
 196         bx      lr
 197
 198 L(l2):  vld1.32  {d16}, [ap], r12        C load the last 2 limbs
 199         vshl.u64 d4, d19, d6
 200         vshl.u64 d1, d16, d7
 201         vshl.u64 d16, d16, d6           C last word, nothing to OR in
 202         vorr     d2, d4, d1
 203 L(cj2): vst1.32  {d2}, [rp:64], r12
 204         vst1.32  {d16}, [rp]             C last store, no alignment hint
 205         vmov.32  r0, d18[X]              C return value
 206         bx      lr
 207
 208
     C Scalar code for n <= 4.  r4 holds the first limb loaded and is kept for
     C the return value, r7 is the result limb being assembled, r6 and r8
     C alternate as the next source limb, and tnc is the opposite shift count.
 209 define(`tnc', `r12')
 210 L(base):
 211         push    {r4, r6, r7, r8}        C save the registers used below
 212 ifdef(`OPERATION_lshift',`
 213         ldr     r4, [ap, #-4]!          C most significant limb
 214         rsb     tnc, cnt, #32           C tnc = 32 - cnt
 215
 216         mov     r7, r4, lsl cnt
 217         tst     n, #1
 218         beq     L(ev)                   C n even
 219
 220 L(od):  subs    n, n, #2
 221         bcc     L(ed1)                  C n = 1
 222         ldr     r8, [ap, #-4]!
 223         b       L(md)                   C n = 3
 224
 225 L(ev):  ldr     r6, [ap, #-4]!
 226         subs    n, n, #2
 227         beq     L(ed)                   C n = 2
 228                                         C n = 4
 229 L(tp):  ldr     r8, [ap, #-4]!
 230         orr     r7, r7, r6, lsr tnc
 231         str     r7, [rp, #-4]!
 232         mov     r7, r6, lsl cnt
 233 L(md):  ldr     r6, [ap, #-4]!
 234         orr     r7, r7, r8, lsr tnc
 235         str     r7, [rp, #-4]!
 236         mov     r7, r8, lsl cnt
 237
 238 L(ed):  orr     r7, r7, r6, lsr tnc
 239         str     r7, [rp, #-4]!
 240         mov     r7, r6, lsl cnt
 241 L(ed1): str     r7, [rp, #-4]           C least significant result limb
 242         mov     r0, r4, lsr tnc         C return the bits shifted out at the top
 243 ')
 244 ifdef(`OPERATION_rshift',`
 245         ldr     r4, [ap]                C least significant limb
 246         rsb     tnc, cnt, #32           C tnc = 32 - cnt
 247
 248         mov     r7, r4, lsr cnt
 249         tst     n, #1
 250         beq     L(ev)                   C n even
 251
 252 L(od):  subs    n, n, #2
 253         bcc     L(ed1)                  C n = 1
 254         ldr     r8, [ap, #4]!
 255         b       L(md)                   C n = 3
 256
 257 L(ev):  ldr     r6, [ap, #4]!
 258         subs    n, n, #2
 259         beq     L(ed)                   C n = 2
 260                                         C n = 4
 261
 262 L(tp):  ldr     r8, [ap, #4]!
 263         orr     r7, r7, r6, lsl tnc
 264         str     r7, [rp], #4
 265         mov     r7, r6, lsr cnt
 266 L(md):  ldr     r6, [ap, #4]!
 267         orr     r7, r7, r8, lsl tnc
 268         str     r7, [rp], #4
 269         mov     r7, r8, lsr cnt
 270
 271 L(ed):  orr     r7, r7, r6, lsl tnc
 272         str     r7, [rp], #4
 273         mov     r7, r6, lsr cnt
 274 L(ed1): str     r7, [rp], #4            C most significant result limb
 275         mov     r0, r4, lsl tnc         C return the bits shifted out at the bottom
 276 ')
 277         pop     {r4, r6, r7, r8}
 278         bx      r14
 279 EPILOGUE()