dnl  ARM Neon mpn_copyd optimised for A15.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C StrongARM	 -
C XScale	 -
C Cortex-A7	 ?
C Cortex-A8	 ?
C Cortex-A9	 1.75	slower than core register code
C Cortex-A15	 0.52

define(`rp', `r0')
define(`up', `r1')
define(`n',  `r2')
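
C For reference, a minimal C sketch of what this routine computes (an added
C reader's aid, not part of the original source; assumes GMP's usual types,
C with 32-bit limbs on this target).  The copy runs from the high end down,
C which is what makes copyd usable for overlapping operands with rp >= up:
C
C	void mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  while (n-- > 0)
C	    rp[n] = up[n];
C	}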

ASM_START()
PROLOGUE(mpn_copyd)
	add	rp, rp, n, lsl #2
	add	up, up, n, lsl #2

	cmp	n, #7
	ble	L(bc)
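
C On entry both pointers were advanced by n limbs (n*4 bytes, the lsl #2)
C so they point just past the last limb; the copy then walks downwards.
C Sizes of at most 7 limbs skip the block loop and go straight to L(bc).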

C Copy until rp is 128-bit aligned
	tst	rp, #4
	beq	L(al1)
	sub	up, up, #4
	vld1.32	{d22[0]}, [up]
	sub	n, n, #1
	sub	rp, rp, #4
	vst1.32	{d22[0]}, [rp]
L(al1):	tst	rp, #8
	beq	L(al2)
	sub	up, up, #8
	vld1.32	{d22}, [up]
	sub	n, n, #2
	sub	rp, rp, #8
	vst1.32	{d22}, [rp:64]
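
C rp is now 16-byte aligned.  The next block primes the software pipeline
C by pre-loading one 4-limb chunk into d26-d27; n is reduced by 12 to
C account for those 4 limbs plus the 8 consumed by one loop round, so the
C blt catches sizes with no full round left.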
L(al2):	sub	up, up, #16
	vld1.32	{d26-d27}, [up]
	subs	n, n, #12
	sub	rp, rp, #16	C offset rp for loop
	blt	L(end)

	sub	up, up, #16	C offset up for loop
	mov	r12, #-16
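
C The loop moves 8 limbs per round and is software-pipelined: each vst
C flushes the chunk fetched by the vld of the previous round, so loads run
C one chunk ahead of stores.  r12 = -16 is the post-index step, walking
C both pointers down 16 bytes per access.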

	ALIGN(16)
L(top):	vld1.32	{d22-d23}, [up], r12
	vst1.32	{d26-d27}, [rp:128], r12
	vld1.32	{d26-d27}, [up], r12
	vst1.32	{d22-d23}, [rp:128], r12
	subs	n, n, #8
	bge	L(top)

	add	up, up, #16	C undo up offset
				C rp offset undoing folded
L(end):	vst1.32	{d26-d27}, [rp:128]

C Copy last 0-7 limbs.  Note that rp is aligned after loop, but not when we
C arrive here via L(bc)
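C The tail decomposes the remaining count by its low bits, copying a 4-limb,
C then a 2-limb, then a 1-limb chunk as bits 2..0 of n demand.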
L(bc):	tst	n, #4
	beq	L(tl1)
	sub	up, up, #16
	vld1.32	{d22-d23}, [up]
	sub	rp, rp, #16
	vst1.32	{d22-d23}, [rp]
L(tl1):	tst	n, #2
	beq	L(tl2)
	sub	up, up, #8
	vld1.32	{d22}, [up]
	sub	rp, rp, #8
	vst1.32	{d22}, [rp]
L(tl2):	tst	n, #1
	beq	L(tl3)
	sub	up, up, #4
	vld1.32	{d22[0]}, [up]
	sub	rp, rp, #4
	vst1.32	{d22[0]}, [rp]
L(tl3):	bx	lr
EPILOGUE()