source/libs/gmp/gmp-src/mpn/arm/v7a/cora15/submul_1.asm

   1 dnl  ARM mpn_submul_1 optimised for A15.
   2
   3 dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33 C            cycles/limb                best
  34 C StrongARM:     -
  35 C XScale         ?
  36 C Cortex-A7      ?
  37 C Cortex-A8      ?
  38 C Cortex-A9      5.75                   3.75
  39 C Cortex-A15     2.32                   this
  40
  41 C This code uses umlal and umaal for adding in the rp[] data, keeping the
  42 C recurrence path separate from any multiply instructions.  It performs well on
  43 C A15, but does not quite reach the multiply bandwidth that the corresponding
  44 C addmul_1 code achieves.
  45 C
  46 C We don't use r12 due to ldrd and strd limitations.
  47 C
  48 C This loop complements U on the fly,
  49 C   U' = B^n - 1 - U
  50 C and then relies on the identity
  51 C   R - U*v = R + U'*v + v - B^n v
  52
  53 C Architecture requirements:
  54 C v5    -
  55 C v5t   -
  56 C v5te  ldrd strd
  57 C v6    umaal
  58 C v6t2  -
  59 C v7a   -
  60
  61 define(`rp', `r0')                      C R limb pointer: loaded from and stored to
  62 define(`up', `r1')                      C U limb pointer: only loaded from
  63 define(`n',  `r2')                      C limb count, consumed four per loop pass
  64 define(`v0', `r3')                      C the single-limb multiplier v
  65
  66 define(`w0', `r10') define(`w1', `r11') C register pair receiving rp[] limbs via ldrd
  67 define(`u0', `r8')  define(`u1', `r9')  C register pair receiving up[] limbs via ldrd
  68
  69 ASM_START()
  70 PROLOGUE(mpn_submul_1)
     C Computes R - U*v in place at rp[] using the complement identity given in
     C the file header: each up[] limb is inverted with mvn, multiplied by v0 and
     C accumulated onto the matching rp[] limb, and v0 itself is added in once at
     C the lowest limb.  The value returned in r0 is v0 minus the final high word
     C plus carry, as computed at L(end).
     C The four entry paths below handle n mod 4 and bias up/rp so that the fixed
     C offsets used in the shared L(top)/L(mid) loop body address the right limbs.
  71         sub     sp, sp, #32             C reserve eight words for r4-r11
  72         strd    r10, r11, [sp, #24]
  73         strd    r8, r9, [sp, #16]
  74         strd    r6, r7, [sp, #8]
  75         strd    r4, r5, [sp, #0]        C r4 lowest, matching the pop at L(end)
  76 C       push    { r4-r11 }
  77
  78         ands    r6, n, #3               C r6 = n mod 4, Z set when it is 0
  79         sub     n, n, #3                C bias the count by 3, flags untouched
  80         beq     L(b00)                  C n mod 4 == 0
  81         cmp     r6, #2                  C split the remaining cases 1, 2, 3
  82         bcc     L(b01)                  C n mod 4 == 1, carry is clear on this path
  83         beq     L(b10)                  C n mod 4 == 2, otherwise fall into b11
  84
  85 L(b11): mov     r6, #0                  C n mod 4 == 3, r6 = 0 feeds adcs in L(mid)
  86         ldr     u1, [up], #-4           C u1 = up[0], then up -= 4 to suit L(mid)
  87         ldr     w1, [rp], #-16          C w1 = rp[0], then rp -= 16 to suit L(mid)
  88         mvn     u1, u1                  C complement the U limb
  89         adds    r7, v0, #0              C r7 = v0 (the +v term) and carry cleared
  90         b       L(mid)
  91
  92 L(b00): ldrd    u0, u1, [up]            C n mod 4 == 0, first two U limbs
  93         ldrd    w0, w1, [rp], #-12      C first two R limbs, then rp -= 12
  94         mvn     u0, u0                  C complement both U limbs
  95         mvn     u1, u1
  96         mov     r6, v0                  C seed the +v term
  97         umaal   w0, r6, u0, v0          C r6:w0 = u0*v0 + w0 + r6
  98         cmn     r13, #0                 C carry clear
  99         mov     r7, #0                  C zero addend for the umaal at L(mid)
 100         str     w0, [rp, #12]           C store limb 0 (rp was moved back by 12)
 101         b       L(mid)
 102
 103 L(b10): ldrd    u0, u1, [up], #8        C n mod 4 == 2, two U limbs, then up += 8
 104         ldrd    w0, w1, [rp]            C first two R limbs
 105         mvn     u0, u0                  C complement both U limbs
 106         mvn     u1, u1
 107         mov     r4, v0                  C seed the +v term
 108         umaal   w0, r4, u0, v0          C r4:w0 = u0*v0 + w0 + r4
 109         mov     r5, #0                  C zero high word for the umlal below
 110         str     w0, [rp], #-4           C store limb 0, then rp -= 4
 111         umlal   w1, r5, u1, v0          C r5:w1 += u1*v0
 112         adds    n, n, #0                C clear carry and set N from the biased n
 113         bmi     L(end)                  C biased n < 0: only the tail remains
 114         b       L(top)
 115
 116 L(b01): ldr     u1, [up], #4            C n mod 4 == 1, u1 = up[0], then up += 4
 117         ldr     w1, [rp], #-8           C w1 = rp[0], then rp -= 8
 118         mvn     u1, u1                  C complement the U limb
 119         mov     r5, v0                  C seed the +v term
 120         mov     r4, #0                  C r4 = 0 feeds the next adcs
 121         umaal   w1, r5, u1, v0          C r5:w1 = u1*v0 + w1 + r5
 122         tst     n, n                    C set N from the biased n, carry kept
 123         bmi     L(end)                  C biased n < 0: only the tail remains
 124
 125 C       ALIGN(16)
 126 L(top): ldrd    u0, u1, [up, #0]        C next two U limbs
 127         adcs    r4, r4, w1              C result limb = high word + low word + C
 128         mvn     u0, u0
 129         ldrd    w0, w1, [rp, #12]       C next two R limbs
 130         mvn     u1, u1
 131         mov     r6, #0
 132         umlal   w0, r6, u0, v0          C 1 2
 133         adcs    r5, r5, w0              C continue the carry chain
 134         mov     r7, #0
 135         strd    r4, r5, [rp, #8]        C store two finished limbs
 136 L(mid): umaal   w1, r7, u1, v0          C 2 3
 137         ldrd    u0, u1, [up, #8]        C following two U limbs
 138         add     up, up, #16             C up advances 16 bytes per pass
 139         adcs    r6, r6, w1              C continue the carry chain
 140         mvn     u0, u0
 141         ldrd    w0, w1, [rp, #20]       C following two R limbs
 142         mvn     u1, u1
 143         mov     r4, #0
 144         umlal   w0, r4, u0, v0          C 3 4
 145         adcs    r7, r7, w0              C continue the carry chain
 146         mov     r5, #0
 147         strd    r6, r7, [rp, #16]!      C store two limbs, writeback rp += 16
 148         sub     n, n, #4                C count down by four, flags untouched
 149         umlal   w1, r5, u1, v0          C 0 1
 150         tst     n, n                    C set N from n, carry kept
 151         bpl     L(top)                  C loop while the biased n is >= 0
 152
 153 L(end): adcs    r4, r4, w1              C last result limb
 154         str     r4, [rp, #8]
 155         adc     r0, r5, #0              C r0 = final high word plus carry
 156         sub     r0, v0, r0              C return v0 - r0, from the v - B^n v terms
 157         pop     { r4-r11 }              C restore the registers saved on entry
 158         bx      r14                     C return through the link register
 159 EPILOGUE()