source/libs/gmp/gmp-src/mpn/arm/neon/popcount.asm

   1 dnl  ARM Neon mpn_popcount -- mpn bit population count.
   2
   3 dnl  Copyright 2013 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33 C            cycles/limb
  34 C StrongARM:     -
  35 C XScale         -
  36 C Cortex-A7      ?
  37 C Cortex-A8      ?
  38 C Cortex-A9      1.125
  39 C Cortex-A15     0.56
  40
  41 C TODO
  42 C  * Explore using vldr and vldm.  Does it help on A9?  (These loads do
  43 C    64-bits-at-a-time, which will mess up in big-endian mode.  Except not for
  44 C    popcount. Except perhaps also for popcount for the edge loads.)
  45 C  * Arrange to align the pointer, if that helps performance.  Use the same
  46 C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
  47 C    valgrind!)
  48 C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
  49 C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
  50
  51 C INPUT PARAMETERS
  52 define(`ap', r0)                        C pointer to the limbs to be counted
  53 define(`n',  r1)                        C number of limbs at ap
  54
  55 C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
  56 C up with 8 16-bit counters.  Therefore, we can sum to 8*(2^16-1) bits, or
  57 C floor(8*(2^16-1)/32) = 0x3fff limbs.  We use a chunksize close to that, but
  58 C which can be represented as an 8-bit ARM immediate (0x3f80 = 0xfe << 6).
  59 C
  60 define(`chunksize',0x3f80)
  61
  62 ASM_START()
  63 PROLOGUE(mpn_popcount)
     C mpn_popcount (ap, n) -- return the number of set bits in the n limbs at ap.
     C
     C In:    r0 = ap, pointer to the limbs (each limb is loaded as 32 bits)
     C        r1 = n, number of limbs
     C Out:   r0 = number of set bits
     C Uses:  r0-r3, q0-q3, q8, q9, q12, q13; the large-count path also uses r4
     C        and lr, which it saves on the stack.
     C
     C Counts of at most chunksize limbs are handled directly by the code at
     C L(lt16k).  Larger counts go to L(gt16k), which calls L(lt16k) once per
     C chunksize-limb piece and adds up the partial results.
  64
  65         cmp     n, #chunksize
  66         bhi     L(gt16k)                C too many limbs for a single pass
  67
     C Count n <= chunksize limbs at ap.  Reached by falling in from above and
     C by the bl instructions in L(gt16k); returns with bx lr.
  68 L(lt16k):
  69         vmov.i64   q8, #0               C clear summation register
  70         vmov.i64   q9, #0               C clear summation register
  71
     C Handle the low bits of n first (1, 2, 4, then 8 limbs), so that what is
     C left for the main loop is a multiple of 16 limbs.
  72         tst        n, #1
  73         beq        L(xxx0)
  74         vmov.i64   d0, #0               C upper half of d0 must not add bits
  75         sub        n, n, #1
  76         vld1.32   {d0[0]}, [ap]!        C load 1 limb
  77         vcnt.8     d24, d0              C per-byte bit counts
  78         vpadal.u8  d16, d24             C d16/q8 = 0; could just splat
  79
  80 L(xxx0):tst        n, #2
  81         beq        L(xx00)
  82         sub        n, n, #2
  83         vld1.32    {d0}, [ap]!          C load 2 limbs
  84         vcnt.8     d24, d0
  85         vpadal.u8  d16, d24             C add byte pairs into 16-bit counters
  86
  87 L(xx00):tst        n, #4
  88         beq        L(x000)
  89         sub        n, n, #4
  90         vld1.32    {q0}, [ap]!          C load 4 limbs
  91         vcnt.8     q12, q0
  92         vpadal.u8  q8, q12
  93
  94 L(x000):tst        n, #8
  95         beq        L(0000)
  96
  97         subs       n, n, #8
  98         vld1.32    {q0,q1}, [ap]!       C load 8 limbs
  99         bls        L(sum)               C n is now 0: count q0,q1 and finish
  100
      C More limbs remain (a multiple of 16).  Load 8 of them into q2,q3, start
      C counting q0,q1, and enter the loop at L(mid), which loads the other 8.
  101 L(gt8): vld1.32    {q2,q3}, [ap]!       C load 8 limbs
  102         sub        n, n, #8
  103         vcnt.8     q12, q0
  104         vcnt.8     q13, q1
  105         b          L(mid)
  106
      C n is a multiple of 16 here and no loaded limbs are pending.
  107 L(0000):subs       n, n, #16
  108         blo        L(e0)                C no limbs left; q9 is still zero
  109
  110         vld1.32    {q2,q3}, [ap]!       C load 8 limbs
  111         vld1.32    {q0,q1}, [ap]!       C load 8 limbs
  112         vcnt.8     q12, q2
  113         vcnt.8     q13, q3
  114         subs       n, n, #16
  115         blo        L(end)               C those 16 limbs were the last ones
  116
      C Main loop, 16 limbs per iteration.  The loads alternate between q2,q3
      C and q0,q1, and each vpadal accumulates the counts produced by the
      C preceding vcnt into the same q12/q13 register.  q8 collects the counts
      C for q0/q2, q9 those for q1/q3.
  117 L(top): vld1.32    {q2,q3}, [ap]!       C load 8 limbs
  118         vpadal.u8  q8, q12
  119         vcnt.8     q12, q0
  120         vpadal.u8  q9, q13
  121         vcnt.8     q13, q1
  122 L(mid): vld1.32    {q0,q1}, [ap]!       C load 8 limbs
  123         subs       n, n, #16
  124         vpadal.u8  q8, q12
  125         vcnt.8     q12, q2
  126         vpadal.u8  q9, q13
  127         vcnt.8     q13, q3
  128         bhs        L(top)               C flags are from the subs above
  129
  130 L(end): vpadal.u8  q8, q12              C add the pending counts for q2,q3
  131         vpadal.u8  q9, q13
  132 L(sum): vcnt.8     q12, q0              C count the final 8 limbs in q0,q1
  133         vcnt.8     q13, q1
  134         vpadal.u8  q8, q12
  135         vpadal.u8  q9, q13
  136         vadd.i16   q8, q8, q9           C merge the two summation registers
  137                                         C we have 8 16-bit counts
  138 L(e0):  vpaddl.u16 q8, q8               C we have 4 32-bit counts
  139         vpaddl.u32 q8, q8               C we have 2 64-bit counts
  140         vmov.32    r0, d16[0]           C low word of the first 64-bit count
  141         vmov.32    r1, d17[0]           C low word of the second 64-bit count
  142         add        r0, r0, r1           C result for this block of limbs
  143         bx      lr
  144
  145 C Code for large count.  Splits operand and calls above code.
      C r3 = limbs still to be counted, r4 = sum of the chunk results so far,
      C ap2 = start of the current chunk.  r4 and lr are pushed because r4 is
      C overwritten here and every bl overwrites lr.
  146 define(`ap2', r2)                       C caller-saves reg not used above
  147 L(gt16k):
  148         push    {r4,r14}
  149         mov     ap2, ap
  150         mov     r3, n                   C full count
  151         mov     r4, #0                  C total sum
  152
  153 1:      mov     n, #chunksize           C count for this invocation
  154         bl      L(lt16k)                C could jump deep inside code
  155         add     ap2, ap2, #chunksize*4  C point at next chunk
  156         add     r4, r4, r0
  157         mov     ap, ap2                 C put chunk pointer in place for call
  158         sub     r3, r3, #chunksize
  159         cmp     r3, #chunksize
  160         bhi     1b                      C loop while more than one chunk is left
  161
  162         mov     n, r3                   C count for final invocation
  163         bl      L(lt16k)
  164         add     r0, r4, r0              C total = earlier chunks + final piece
  165         pop     {r4,pc}                 C restore r4 and return to the caller
  166 EPILOGUE()