dnl  (gitweb scrape residue, neutralized as comments:)
dnl  beta-0.89.2
dnl  [luatex.git] / source / libs / gmp / gmp-src / mpn / arm64 / hamdist.asm
dnl  blob d298b951cf6cf73b67c4d22a5aef3a4db1393400
dnl ARM64 Neon mpn_hamdist -- mpn bit hamming distance.

dnl Copyright 2013, 2014 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl   * the GNU Lesser General Public License as published by the Free
dnl     Software Foundation; either version 3 of the License, or (at your
dnl     option) any later version.
dnl
dnl or
dnl
dnl   * the GNU General Public License as published by the Free Software
dnl     Foundation; either version 2 of the License, or (at your option) any
dnl     later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

C Disable m4's default comment handling so literal characters in the code
C (e.g. "#immediate") are not swallowed as comments.
changecom(@&*$)

C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n',  x2)

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
C (8*2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
C allows the huge count code to jump deep into the code (at L(chu)).
define(`maxsize',  0x1fff)
define(`chunksize',0x1ff0)
ASM_START()

C mp_bitcnt_t mpn_hamdist (mp_srcptr ap, mp_srcptr bp, mp_size_t n)
C
C Population count of {ap,n} XOR {bp,n}, i.e. the bit hamming distance
C of the two n-limb operands.  Result is returned in x0.
C Uses only caller-saved registers (x0-x11, v0-v7, v16-v19), so no
C stack frame is needed; x30 is saved in x8 across the internal bl calls.

PROLOGUE(mpn_hamdist)
	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)			C huge count: split into chunks

L(lt8k):
	movi	v4.16b, #0		C clear summation register
	movi	v5.16b, #0		C clear summation register

C Peel n mod 8 limbs (1, then 2, then 4) so the main loop handles 8 at a time.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8	C load 1 limb
	ld1	{v16.1d}, [bp], #8	C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b		C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16	C load 2 limbs
	ld1	{v16.2d}, [bp], #16	C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)			C exactly 4 limbs left: just sum them

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)			C enter software-pipelined loop mid-way

L(000):	subs	n, n, #8
	b.lo	L(e0)			C n was 0: result is 0

C L(chu) is also the deep entry point used by the L(gt8k) chunking code.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop: 8 limbs/iteration, counts accumulated into 16-bit lanes of v4,v5.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b	C count the 4 limbs still in flight
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
	add	v4.8h, v4.8h, v5.8h
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s, v4.8h		C we have 4 32-bit counts
	uaddlp	v4.2d, v4.4s		C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret				C (missing in scraped copy; without it
					C  we would fall through into L(gt8k))

C Code for count > maxsize.  Splits operand and calls above code.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C save return address; bl clobbers x30
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	add	bp2, bp, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0		C accumulate chunk count into total
	mov	ap, ap2			C put chunk pointer in place for calls
	mov	bp, bp2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0		C total = chunk sums + final sum
	mov	x30, x8			C restore return address
	ret				C (missing in scraped copy)
EPILOGUE()