dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / arm / neon / hamdist.asm
dnl  blob 232089647df59713c2bf016c5b21dfee51a2894d
dnl  ARM Neon mpn_hamdist -- mpn bit hamming distance.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C StrongARM:	 -
C XScale	 -
C Cortex-A7	 ?
C Cortex-A8	 ?
C Cortex-A9	 1.89
C Cortex-A15	 0.95

C TODO
C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do
C    64-bits-at-a-time, which will mess up in big-endian mode.  Except not for
C    popcount.  Except perhaps also for popcount for the edge loads.)
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

C INPUT PARAMETERS
define(`ap', r0)
define(`bp', r1)
define(`n', r2)

C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
C (8*2^16-1)/32 = 0x3fff limbs.  We use a chunksize close to that, but which
C can be represented as a 8-bit ARM constant.
define(`chunksize',0x3f80)

C mpn_hamdist(ap, bp, n) -- return the hamming distance between the n-limb
C operands {ap,n} and {bp,n}, i.e., popcount({ap,n} xor {bp,n}).
C In:   ap = r0, bp = r1, n = r2 (count of 32-bit limbs)
C Out:  r0 = bit count
C The per-byte counts from vcnt.8 are accumulated pairwise (vpadal) into the
C 16-bit lanes of q8/q9, which limits a single pass to chunksize limbs.

ASM_START()
PROLOGUE(mpn_hamdist)
	cmp	n, #chunksize
	bhi	L(gt16k)		C large n: split into chunks so the
					C 16-bit lane counters cannot overflow

C Entry point for n <= chunksize; also called as a subroutine by L(gt16k).
L(lt16k):
	vmov.i64   q8, #0		C clear summation register
	vmov.i64   q9, #0		C clear summation register

C Peel off the n mod 16 limbs in 1-, 2-, 4-, and 8-limb pieces, so the
C main loop below can process a multiple of 16 limbs.
	tst	n, #1
	beq	L(xxx0)
	vmov.i64   d0, #0
	vmov.i64   d20, #0
	sub	n, n, #1
	vld1.32	{d0[0]}, [ap]!		C load 1 limb
	vld1.32	{d20[0]}, [bp]!		C load 1 limb
	veor	d0, d0, d20
	vcnt.8	d24, d0			C per-byte popcount of the xor
	vpadal.u8  d16, d24		C d16/q8 = 0; could just splat

L(xxx0):tst	n, #2
	beq	L(xx00)
	sub	n, n, #2
	vld1.32	{d0}, [ap]!		C load 2 limbs
	vld1.32	{d20}, [bp]!		C load 2 limbs
	veor	d0, d0, d20
	vcnt.8	d24, d0
	vpadal.u8  d16, d24

L(xx00):tst	n, #4
	beq	L(x000)
	sub	n, n, #4
	vld1.32	{q0}, [ap]!		C load 4 limbs
	vld1.32	{q10}, [bp]!		C load 4 limbs
	veor	q0, q0, q10
	vcnt.8	q12, q0
	vpadal.u8  q8, q12

L(x000):tst	n, #8
	beq	L(0000)

	subs	n, n, #8
	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
	vld1.32	{q10,q11}, [bp]!	C load 8 limbs
	bls	L(sum)			C n was exactly 8: count and finish

C More limbs remain: enter the software-pipelined loop with the 8 peeled
C limbs already loaded in q0,q1/q10,q11.
L(gt8):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
	vld1.32	{q14,q15}, [bp]!	C load 8 limbs
	veor	q0, q0, q10
	veor	q1, q1, q11
	sub	n, n, #8
	vcnt.8	q12, q0
	vcnt.8	q13, q1
	b	L(mid)

C Here n is a multiple of 16; prime the pipeline with 16 limbs, or fall
C through to the final reduction if nothing remains.
L(0000):subs	n, n, #16
	blo	L(e0)

	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
	vld1.32	{q14,q15}, [bp]!	C load 8 limbs
	vld1.32	{q10,q11}, [bp]!	C load 8 limbs
	veor	q2, q2, q14
	veor	q3, q3, q15
	vcnt.8	q12, q2
	vcnt.8	q13, q3
	subs	n, n, #16
	blo	L(end)

C Main loop, 16 limbs/iteration: two 8-limb halves software-pipelined so
C that loads, veor/vcnt, and vpadal accumulation overlap.
L(top):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
	vld1.32	{q14,q15}, [bp]!	C load 8 limbs
	veor	q0, q0, q10
	veor	q1, q1, q11
	vpadal.u8  q8, q12		C accumulate byte counts into 16-bit lanes
	vcnt.8	q12, q0
	vpadal.u8  q9, q13
	vcnt.8	q13, q1
L(mid):	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
	vld1.32	{q10,q11}, [bp]!	C load 8 limbs
	veor	q2, q2, q14
	veor	q3, q3, q15
	subs	n, n, #16
	vpadal.u8  q8, q12
	vcnt.8	q12, q2
	vpadal.u8  q9, q13
	vcnt.8	q13, q3
	bhs	L(top)

C Drain the pipeline: q12,q13 hold counted bytes not yet accumulated, and
C q0,q1/q10,q11 hold the last 8 loaded-but-uncounted limbs.
L(end):	vpadal.u8  q8, q12
	vpadal.u8  q9, q13
L(sum):	veor	q0, q0, q10
	veor	q1, q1, q11
	vcnt.8	q12, q0
	vcnt.8	q13, q1
	vpadal.u8  q8, q12
	vpadal.u8  q9, q13
	vadd.i16   q8, q8, q9
	C we have 8 16-bit counts
L(e0):	vpaddl.u16 q8, q8		C we have 4 32-bit counts
	vpaddl.u32 q8, q8		C we have 2 64-bit counts
	vmov.32	r0, d16[0]		C low halves of the two 64-bit sums;
	vmov.32	r1, d17[0]		C the total fits in 32 bits by the
	add	r0, r0, r1		C chunksize bound above
	bx	lr

C Code for large count.  Splits operand and calls above code.
C r3 = limbs still to do, r4 = running total; r5/r6 track the chunk
C pointers across the L(lt16k) calls (which advance ap/bp).
define(`ap2', r5)
define(`bp2', r6)
L(gt16k):
	push	{r4,r5,r6,r14}
	mov	ap2, ap
	mov	bp2, bp
	mov	r3, n			C full count
	mov	r4, #0			C total sum

1:	mov	n, #chunksize		C count for this invocation
	bl	L(lt16k)		C could jump deep inside code
	add	ap2, ap2, #chunksize*4	C point at next chunk
	add	bp2, bp2, #chunksize*4	C point at next chunk
	add	r4, r4, r0
	mov	ap, ap2			C put chunk pointer in place for call
	mov	bp, bp2			C put chunk pointer in place for call
	sub	r3, r3, #chunksize
	cmp	r3, #chunksize
	bhi	1b

	mov	n, r3			C count for final invocation
	bl	L(lt16k)
	add	r0, r4, r0		C fold in last chunk's count
	pop	{r4,r5,r6,pc}
EPILOGUE()