source/libs/gmp/gmp-src/mpn/arm64/popcount.asm

   1 dnl  ARM64 Neon mpn_popcount -- mpn bit population count.
   2
   3 dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33 C            cycles/limb
  34 C Cortex-A53     ?
  35 C Cortex-A57     ?
  36
  37 C TODO
  38 C  * Consider greater unrolling.
  39 C  * Arrange to align the pointer, if that helps performance.  Use the same
  40 C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
  41 C    valgrind!)
  42 C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
  43 C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
  44
  45 changecom(@&*$)                         C keep m4 from treating # as a comment start, so #maxsize etc. expand
  46
  47 C INPUT PARAMETERS
  48 define(`ap', x0)                        C pointer to the limbs still to be counted
  49 define(`n',  x1)                        C number of limbs still to be counted
  50
  51 C We sum into 16 16-bit counters, 8 in v4 and 8 in v5.  One pass handles at
  52 C most maxsize limbs, which adds at most 0x8010 to any single counter, so
  53 C neither register can overflow on its own.  The lane-wise sum v4+v5 can
  54 C however reach exactly 2^16 (n = maxsize with all bits set, since the single
  55 C odd limb is counted entirely in lanes 0-3), so the final summation widens to
  56 C 32 bits before combining the two registers.  chunksize is a multiple of 8
  57 C close to maxsize, which allows the huge count code to jump deep into the
  58 C code (at L(chu)).
  59
  60 define(`maxsize',  0x1fff)
  61 define(`chunksize',0x1ff0)
  62
  63 C mpn_popcount(ap, n) -- count the set bits in the n 64-bit limbs at ap.
  64 C
  65 C In:       ap (x0) = limb pointer, n (x1) = limb count (0 is handled)
  66 C Out:      x0 = number of set bits
  67 C Clobbers: x0, x1, x11, v0-v7, flags; for n > maxsize also x4, x5, x7-x10
  68 C           (x30 is parked in x8 around the internal bl calls and restored)
  69 C No stack is used.
  70
  71 ASM_START()
  72 PROLOGUE(mpn_popcount)
  73
  74         mov     x11, #maxsize                   C kept in x11 for the L(gt8k) loop test
  75         cmp     n, x11
  76         b.hi    L(gt8k)                         C too large for a single pass
  77
  78 C Single pass over n <= maxsize limbs; also entered by bl from L(gt8k).
  79 L(lt8k):
  80         movi    v4.16b, #0                      C clear summation register
  81         movi    v5.16b, #0                      C clear summation register
  82
  83         tbz     n, #0, L(xx0)                   C peel 1 limb if n is odd
  84         sub     n, n, #1
  85         ld1     {v0.1d}, [ap], #8               C load 1 limb
  86         cnt     v6.16b, v0.16b
  87         uadalp  v4.8h,  v6.16b                  C could also splat
  88
  89 L(xx0): tbz     n, #1, L(x00)                   C peel 2 limbs if bit 1 of n is set
  90         sub     n, n, #2
  91         ld1     {v0.2d}, [ap], #16              C load 2 limbs
  92         cnt     v6.16b, v0.16b
  93         uadalp  v4.8h,  v6.16b
  94
  95 L(x00): tbz     n, #2, L(000)                   C peel 4 limbs if bit 2 of n is set
  96         subs    n, n, #4
  97         ld1     {v0.2d,v1.2d}, [ap], #32        C load 4 limbs
  98         b.ls    L(sum)                          C nothing left after these 4 limbs
  99
 100 L(gt4): ld1     {v2.2d,v3.2d}, [ap], #32        C load 4 limbs
 101         sub     n, n, #4
 102         cnt     v6.16b, v0.16b
 103         cnt     v7.16b, v1.16b
 104         b       L(mid)                          C enter the loop half way through
 105
 106 L(000): subs    n, n, #8
 107         b.lo    L(e0)                           C no limbs left; v5 is still zero here
 108
 109 C Entry point for L(gt8k): n = (limbs to process) - 8, v4/v5 cleared by caller.
 110 L(chu): ld1     {v2.2d,v3.2d}, [ap], #32        C load 4 limbs
 111         ld1     {v0.2d,v1.2d}, [ap], #32        C load 4 limbs
 112         cnt     v6.16b, v2.16b
 113         cnt     v7.16b, v3.16b
 114         subs    n, n, #8
 115         b.lo    L(end)
 116
 117 L(top): ld1     {v2.2d,v3.2d}, [ap], #32        C load 4 limbs
 118         uadalp  v4.8h,  v6.16b
 119         cnt     v6.16b, v0.16b
 120         uadalp  v5.8h,  v7.16b
 121         cnt     v7.16b, v1.16b
 122 L(mid): ld1     {v0.2d,v1.2d}, [ap], #32        C load 4 limbs
 123         subs    n, n, #8
 124         uadalp  v4.8h,  v6.16b
 125         cnt     v6.16b, v2.16b
 126         uadalp  v5.8h,  v7.16b
 127         cnt     v7.16b, v3.16b
 128         b.hs    L(top)
 129
 130 L(end): uadalp  v4.8h,  v6.16b                  C drain the counts still in flight
 131         uadalp  v5.8h,  v7.16b
 132 L(sum): cnt     v6.16b, v0.16b                  C count the last 4 limbs loaded
 133         cnt     v7.16b, v1.16b
 134         uadalp  v4.8h,  v6.16b
 135         uadalp  v5.8h,  v7.16b
 136                                         C v4 and v5 hold 8 16-bit counts each; their
 137                                         C sum may need 17 bits, so widen before adding
 138 L(e0):  uaddlp  v4.4s,  v4.8h           C v4 widened to 4 32-bit counts
 139         uadalp  v4.4s,  v5.8h           C add v5 pairwise into the 32-bit counts
 140         uaddlp  v4.2d,  v4.4s           C we have 2 64-bit counts
 141         mov     x0, v4.d[0]
 142         mov     x1, v4.d[1]
 143         add     x0, x0, x1
 144         ret
 145
 146 C Code for count > maxsize.  Splits operand and calls above code.
 147 define(`ap2', x5)                       C caller-saves reg not used above
 148 L(gt8k):
 149         mov     x8, x30                 C save return address, bl below overwrites x30
 150         mov     x7, n                   C full count (caller-saves reg not used above)
 151         mov     x4, #0                  C total sum  (caller-saves reg not used above)
 152         mov     x9, #chunksize*8        C chunk size in bytes (caller-saves reg not used above)
 153         mov     x10, #chunksize         C chunk size in limbs (caller-saves reg not used above)
 154
 155 1:      add     ap2, ap, x9             C point at subsequent block
 156         mov     n, #chunksize-8         C count for this invocation, adjusted for entry pt
 157         movi    v4.16b, #0              C clear chunk summation register
 158         movi    v5.16b, #0              C clear chunk summation register
 159         bl      L(chu)                  C jump deep inside code
 160         add     x4, x4, x0              C accumulate this chunk's count
 161         mov     ap, ap2                 C put chunk pointer in place for calls
 162         sub     x7, x7, x10
 163         cmp     x7, x11                 C x11 = maxsize, set at function entry
 164         b.hi    1b
 165
 166         mov     n, x7                   C count for final invocation
 167         bl      L(lt8k)
 168         add     x0, x4, x0
 169         mov     x30, x8                 C restore return address
 170         ret
 171 EPILOGUE()