source/libs/gmp/gmp-src/mpn/arm/neon/lshiftc.asm

   1 dnl  ARM Neon mpn_lshiftc.
   2
   3 dnl  Contributed to the GNU project by Torbjörn Granlund.
   4
   5 dnl  Copyright 2013 Free Software Foundation, Inc.
   6
   7 dnl  This file is part of the GNU MP Library.
   8 dnl
   9 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
  10 dnl  it under the terms of either:
  11 dnl
  12 dnl    * the GNU Lesser General Public License as published by the Free
  13 dnl      Software Foundation; either version 3 of the License, or (at your
  14 dnl      option) any later version.
  15 dnl
  16 dnl  or
  17 dnl
  18 dnl    * the GNU General Public License as published by the Free Software
  19 dnl      Foundation; either version 2 of the License, or (at your option) any
  20 dnl      later version.
  21 dnl
  22 dnl  or both in parallel, as here.
  23 dnl
  24 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  25 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  26 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  27 dnl  for more details.
  28 dnl
  29 dnl  You should have received copies of the GNU General Public License and the
  30 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  31 dnl  see https://www.gnu.org/licenses/.
  32
  33 include(`../config.m4')
  34
  35 C            cycles/limb     cycles/limb     cycles/limb      good
  36 C              aligned        unaligned       best seen      for cpu?
  37 C StrongARM      -               -
  38 C XScale         -               -
  39 C Cortex-A7      ?               ?
  40 C Cortex-A8      ?               ?
  41 C Cortex-A9      3.5             3.5                            Y
  42 C Cortex-A15     1.75            1.75                           Y
  43
  44
  45 C We read 64 bits at a time at 32-bit aligned addresses, and except for the
  46 C first and last store, we write using 64-bit aligned addresses.  All shifting
  47 C is done on 64-bit words in 'extension' registers.
  48 C
  49 C It should also be possible to read using 64-bit alignment, by manipulating
  50 C the shift count for unaligned operands.  Not done, since it does not seem to
  51 C matter for A9 or A15.
  52 C
  53 C This will not work in big-endian mode.
  54
  55 C TODO
  56 C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
  57 C    which might make it tricky.
  58 C  * Clean up and simplify.
  59 C  * Consider sharing most of the code for lshift and rshift, since the feed-in
  60 C    code, the loop, and most of the wind-down code are identical.
  61 C  * Replace the basecase code with code using 'extension' registers.
  62 C  * Optimise.  It is not clear that this loop insn permutation is optimal for
  63 C    either A9 or A15.
  64
  65 C INPUT PARAMETERS
     C Symbolic names for the four argument registers:
     C   rp   (r0)  destination pointer; all stores go through it, and r0 is also
     C              loaded with the return value just before returning
     C   ap   (r1)  source pointer; all loads go through it
     C   n    (r2)  limb count, scaled by 4 to form byte offsets
     C   cnt  (r3)  shift count in bits
  66 define(`rp',  `r0')
  67 define(`ap',  `r1')
  68 define(`n',   `r2')
  69 define(`cnt', `r3')
  70
     C Variant selection.  IFLSH expands to its argument and IFRSH expands to
     C nothing, so lines wrapped in IFLSH are assembled and lines wrapped in
     C IFRSH are dropped.  X and Y are the lane indices 0 and 1 used when a
     C single 32-bit lane of a D register is read or stored.  func is not
     C referenced in the visible code.  OPERATION_lshiftc selects the ifdef
     C block that sets up the shift counts.
  71         define(`IFLSH', `$1')
  72         define(`IFRSH', `')
  73         define(`X',`0')
  74         define(`Y',`1')
  75         define(`func',`mpn_lshiftc')
  76 define(`OPERATION_lshiftc',1)
  77
  78 ASM_START()
  79         TEXT
  80         ALIGN(64)
     C mpn_lshiftc -- complemented left shift of an n-limb operand.
     C
     C Inputs:  rp (r0) destination, ap (r1) source, n (r2) limb count,
     C          cnt (r3) shift count in bits.
     C Output:  r0 = the bits shifted out of the most significant source limb,
     C          taken before complementing.
     C
     C Each 32-bit source limb is complemented and the complemented operand is
     C shifted left by cnt bits.  Work proceeds from the most significant limb
     C downwards, and the cnt low bits of the least significant result limb are
     C set to one.  Operands of at most 4 limbs use the scalar code at L(base);
     C larger ones use the Neon code, which uses r12, d0-d7 and d16-d19 as
     C scratch without saving them.
     C
     C NOTE(review): the code appears to rely on n >= 1 and 1 <= cnt <= 31 (the
     C scalar path always loads one limb, and shifts by 32-cnt and cnt-64 are
     C used) -- confirm against the mpn calling contract.
  81 PROLOGUE(mpn_lshiftc)
     C Point rp and ap just past the most significant limb (r12 = n * 4 bytes);
     C both pointers then step downwards.
  82 IFLSH(` mov     r12, n, lsl #2  ')
  83 IFLSH(` add     rp, rp, r12     ')
  84 IFLSH(` add     ap, ap, r12     ')
  85
  86         cmp     n, #4                   C SIMD code n limit
  87         ble     L(base)
  88
     C Shift-count setup.  OPERATION_lshiftc is defined earlier in this file, so
     C the first block below is assembled: d6 holds cnt and d7 holds cnt-64 in
     C each 32-bit lane, and r12 = -8 is the post-index step for 64-bit accesses.
     C The OPERATION_rshift block is expanded only if that symbol is defined; no
     C such definition is visible in this file.
  89 ifdef(`OPERATION_lshiftc',`
  90         vdup.32 d6, r3                  C left shift count is positive
  91         sub     r3, r3, #64             C right shift count is negative
  92         vdup.32 d7, r3
  93         mov     r12, #-8')              C lshift pointer update offset
  94 ifdef(`OPERATION_rshift',`
  95         rsb     r3, r3, #0              C right shift count is negative
  96         vdup.32 d6, r3
  97         add     r3, r3, #64             C left shift count is positive
  98         vdup.32 d7, r3
  99         mov     r12, #8')               C rshift pointer update offset
 100
     C Load the two most significant limbs.  d18 = that pair shifted by d7, i.e.
     C right by 64-cnt (vshl treats a negative count as a right shift), which
     C leaves the shifted-out bits in lane X, the low 32 bits, for the return
     C value.  d18 is computed before d19 is complemented.
 101 IFLSH(` sub     ap, ap, #8      ')
 102         vld1.32 {d19}, [ap], r12        C load initial 2 limbs
 103         vshl.u64 d18, d19, d7           C retval
 104
     C rp currently points just past the result.  If bit 2 of rp is set, produce
     C the most significant result limb separately (upper lane of the
     C complemented, shifted pair) and reload a pair that overlaps the first one
     C by one limb, so that the later 64-bit stores can use [rp:64].
 105         tst     rp, #4                  C is rp 64-bit aligned already?
 106         beq     L(rp_aligned)           C yes, skip
 107         vmvn     d19, d19
 108 IFLSH(` add     ap, ap, #4      ')      C move back ap pointer
 109 IFRSH(` sub     ap, ap, #4      ')      C move back ap pointer
 110         vshl.u64 d4, d19, d6
 111         sub     n, n, #1                C first limb handled
 112 IFLSH(` sub      rp, rp, #4     ')
 113         vst1.32  {d4[Y]}, [rp]IFRSH(!)  C store first limb, rp gets aligned
 114         vld1.32  {d19}, [ap], r12       C reload pair one limb lower
 115
 116 L(rp_aligned):
     C d19 holds the most significant pair not yet stored; it is complemented by
     C the vmvn below.  n is reduced by 6: if that goes negative only the
     C wind-down code is needed, otherwise bit 1 of n picks the loop entry.
     C vmvn does not touch the flags, so blt still tests the subs result.
 117 IFLSH(` sub     rp, rp, #8      ')
 118         subs    n, n, #6
 119         vmvn     d19, d19
 120         blt     L(two_or_three_more)
 121         tst     n, #2
 122         beq     L(2)
 123
     C Loop feed-in.  L(1) enters the loop at L(mid) and L(2) falls into L(top).
     C In both, the complemented pair in d19 is shifted left, the next pair is
     C loaded, complemented and shifted both ways, and the pair after that is
     C loaded (it is complemented inside the loop).
 124 L(1):   vld1.32  {d17}, [ap], r12
 125         vshl.u64 d5, d19, d6
 126         vmvn     d17, d17
 127         vld1.32  {d16}, [ap], r12
 128         vshl.u64 d0, d17, d7
 129         vshl.u64 d4, d17, d6
 130         sub     n, n, #2
 131         b        L(mid)
 132
 133 L(2):   vld1.32  {d16}, [ap], r12
 134         vshl.u64 d4, d19, d6
 135         vmvn     d16, d16
 136         vld1.32  {d17}, [ap], r12
 137         vshl.u64 d1, d16, d7
 138         vshl.u64 d5, d16, d6
 139         subs    n, n, #4
 140         blt     L(end)
 141
     C Main loop, unrolled twice.  Each half complements the pair loaded by the
     C other half, loads a further pair, ORs (higher pair << cnt) with
     C (lower pair >> (64-cnt)) and stores that 64-bit result at rp, which then
     C steps down by 8.  n drops by 4 per round trip; the loop exits when it
     C goes negative.
 142 L(top): vmvn     d17, d17
 143         vld1.32  {d16}, [ap], r12
 144         vorr     d2, d4, d1
 145         vshl.u64 d0, d17, d7
 146         vshl.u64 d4, d17, d6
 147         vst1.32  {d2}, [rp:64], r12
 148 L(mid): vmvn     d16, d16
 149         vld1.32  {d17}, [ap], r12
 150         vorr     d3, d5, d0
 151         vshl.u64 d1, d16, d7
 152         vshl.u64 d5, d16, d6
 153         vst1.32  {d3}, [rp:64], r12
 154         subs    n, n, #4
 155         bge     L(top)
 156
     C Wind-down after the loop.  If bit 0 of n is set, one single limb is still
     C to be loaded: flush the pending result and continue at L(cj1).  Otherwise
     C every source limb has been loaded and L(evn) finishes the job.
 157 L(end): tst      n, #1
 158         beq      L(evn)
 159
 160         vorr     d2, d4, d1
 161         vst1.32  {d2}, [rp:64], r12
 162         b        L(cj1)
 163
     C L(evn): d17 is the last pair.  After it has been shifted, d17 is reused
     C as an all-ones value; shifted right by 64-cnt it supplies the cnt low
     C one bits of the least significant result doubleword (d3).
 164 L(evn): vmvn     d17, d17
 165         vorr     d2, d4, d1
 166         vshl.u64 d0, d17, d7
 167         vshl.u64 d4, d17, d6
 168         vst1.32  {d2}, [rp:64], r12
 169         vmov.u8  d17, #255
 170         vorr     d2, d5, d0
 171         vshl.u64 d0, d17, d7
 172         vorr     d3, d4, d0
 173         b        L(cj2)
 174
 175 C Load last 2 - 3 limbs, store last 4 - 5 limbs
 176 L(two_or_three_more):
 177         tst     n, #1
 178         beq     L(l2)
 179
     C L(l3): a pair and then a single limb remain to be loaded.  At L(cj1) the
     C pair is in d17 (not yet complemented).  d16 is cleared and the last limb
     C is loaded into its upper lane only, so that after vmvn the lower lane is
     C all ones and supplies the one bits shifted in at the bottom.  Two 64-bit
     C results are stored, then the upper lane of d5 as the lowest result limb.
 180 L(l3):  vshl.u64 d5, d19, d6
 181         vld1.32  {d17}, [ap], r12
 182 L(cj1): vmov.u8  d16, #0
 183 IFLSH(` add      ap, ap, #4     ')
 184         vmvn     d17, d17
 185         vld1.32  {d16[Y]}, [ap], r12
 186         vshl.u64 d0, d17, d7
 187         vshl.u64 d4, d17, d6
 188         vmvn     d16, d16
 189         vorr     d3, d5, d0
 190         vshl.u64 d1, d16, d7
 191         vshl.u64 d5, d16, d6
 192         vst1.32  {d3}, [rp:64], r12
 193         vorr     d2, d4, d1
 194         vst1.32  {d2}, [rp:64], r12
 195 IFLSH(` add      rp, rp, #4     ')
 196         vst1.32  {d5[Y]}, [rp]
 197         vmov.32  r0, d18[X]
 198         bx      lr
 199
     C L(l2): exactly one more pair remains.  It is loaded, complemented and
     C combined with d19; the low result doubleword is ORed with all-ones
     C shifted right by 64-cnt to set its cnt low bits.  L(cj2) is the shared
     C tail that stores d2 and d3 and returns lane X of d18.
 200 L(l2):  vld1.32  {d16}, [ap], r12
 201         vshl.u64 d4, d19, d6
 202         vmvn     d16, d16
 203         vshl.u64 d1, d16, d7
 204         vshl.u64 d5, d16, d6
 205         vmov.u8  d17, #255
 206         vorr     d2, d4, d1
 207         vshl.u64 d0, d17, d7
 208         vorr     d3, d5, d0
 209 L(cj2): vst1.32  {d2}, [rp:64], r12
 210         vst1.32  {d3}, [rp]
 211         vmov.32  r0, d18[X]
 212         bx      lr
 213
 214
     C Scalar code for n <= 4, one limb at a time from the most significant end.
     C tnc (r12) = 32 - cnt.  r4 keeps the uncomplemented top limb for the
     C return value, r6 and r8 alternate as the complemented current limb, and
     C r7 holds the upper part of the result limb being assembled.  Each result
     C limb is (higher limb << cnt) | (lower limb >> tnc), formed from
     C complemented limbs and stored with pre-decrement of rp.  L(tp)/L(md) are
     C straight-line code here; there is no backward branch.
 215 define(`tnc', `r12')
 216 L(base):
 217         push    {r4, r6, r7, r8}
 218         ldr     r4, [ap, #-4]!
 219         rsb     tnc, cnt, #32
 220         mvn     r6, r4
 221
 222         mov     r7, r6, lsl cnt
 223         tst     n, #1
 224         beq     L(ev)                   C n even
 225
 226 L(od):  subs    n, n, #2
 227         bcc     L(ed1)                  C n = 1
 228         ldr     r8, [ap, #-4]!
 229         mvn     r8, r8
 230         b       L(md)                   C n = 3
 231
 232 L(ev):  ldr     r6, [ap, #-4]!
 233         mvn     r6, r6
 234         subs    n, n, #2
 235         beq     L(ed)                   C n = 2
 236                                         C n = 4
 237 L(tp):  ldr     r8, [ap, #-4]!
 238         orr     r7, r7, r6, lsr tnc
 239         str     r7, [rp, #-4]!
 240         mvn     r8, r8
 241         mov     r7, r6, lsl cnt
 242 L(md):  ldr     r6, [ap, #-4]!
 243         orr     r7, r7, r8, lsr tnc
 244         str     r7, [rp, #-4]!
 245         mvn     r6, r6
 246         mov     r7, r8, lsl cnt
 247
 248 L(ed):  orr     r7, r7, r6, lsr tnc
 249         str     r7, [rp, #-4]!
 250         mov     r7, r6, lsl cnt
     C L(ed1): r6 = all ones stands in for a limb below the operand, so the cnt
     C low bits of the least significant result limb become ones.  The return
     C value is the original top limb shifted right by tnc.
 251 L(ed1): mvn     r6, #0
 252         orr     r7, r7, r6, lsr tnc
 253         str     r7, [rp, #-4]
 254         mov     r0, r4, lsr tnc
 255         pop     {r4, r6, r7, r8}
 256         bx      r14
 257 EPILOGUE()