source/libs/gmp/gmp-src/mpn/powerpc64/vmx/popcount.asm

   1 dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
   2
   3 dnl  Copyright 2006, 2010 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33 C                   cycles/limb
  34 C 7400,7410 (G4):       ?
  35 C 744x,745x (G4+):      1.125
  36 C 970 (G5):             2.25
  37
  38 C TODO
  39 C  * Rewrite the awkward huge n outer loop code.
  40 C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
  41 C  * Compress cnsts table in 64-bit mode, only half the values are needed.
  42
  43 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
  44 define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
  45 define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
     C The three sizes above are: bytes per limb, limbs per 16-byte vector
     C register, and limbs per pair of vector registers, i.e. per 32 bytes.
  46
  47 define(`OPERATION_popcount')
  48
     C Symbolic names for r3 and r4, which mpn_popcount reads on entry as the
     C source pointer and the limb count.
  49 define(`ap',    `r3')
  50 define(`n',     `r4')
  51
     C Vector registers that stay constant once loaded: rtab receives the first
     C 16 bytes of cnsts, cnt4 receives a splat of the byte value 4.
  52 define(`rtab',  `v10')
  53 define(`cnt4',  `v11')
  54
     C LIMB32 emits its argument only when GMP_LIMB_BITS is 32 and LIMB64 emits
     C its argument otherwise; the other one of the pair expands to nothing.
  55 ifelse(GMP_LIMB_BITS,32,`
  56         define(`LIMB32',`       $1')
  57         define(`LIMB64',`')
  58 ',`
  59         define(`LIMB32',`')
  60         define(`LIMB64',`       $1')
  61 ')
  62
  63 C The inner loop handles up to 2^34 bits, i.e., 2^28 64-bit limbs, due to overflow
  64 C in vsum4ubs.  For large operands, we work in chunks of LIMBS_PER_CHUNK * 2^16 limbs.
  65 define(`LIMBS_PER_CHUNK', 0x1000)
  66 define(`LIMBS_CHUNK_THRES', 0x1001)
     C In the code below both values appear only as lis and addis immediates in
     C LIMB64 lines, so they are in units of 2^16 limbs: the chunk is 0x1000 * 2^16
     C limbs and the threshold compared against n is 0x1001 * 2^16 limbs.
  67
  68 ASM_START()
     C mpn_popcount -- return the number of set bits in the n limbs at ap.
     C
     C In:    r3 = ap, r4 = n
     C Out:   r3 = bit count, copied from the r8 accumulator at L(ret)
     C Uses:  r0, r6-r12, ctr, cr0, cr7, v0-v13, and the 16 bytes just below the
     C        stack pointer r1.  VRSAVE is changed on entry and restored on exit.
     C        LEAL is defined elsewhere and may use more -- TODO confirm.
     C
     C Method: every byte is split into two nibbles, each nibble is replaced by
     C its bit count through a vperm lookup in rtab, and vsum4ubs accumulates the
     C byte counts into the 32-bit words of v12 and v13.  The first and the last
     C vector are ANDed with masks from cnsts so that bytes outside the operand
     C contribute nothing.
  69 PROLOGUE(mpn_popcount,toc)
     C Keep the incoming VRSAVE in r10 so that L(ret) can put it back.
  70         mfspr   r10, 256
  71         oris    r0, r10, 0xfffc         C Set VRSAVE bit 0-13
  72         mtspr   256, r0
  73
  74 ifdef(`HAVE_ABI_mode32',
  75 `       rldicl  n, n, 0, 32')           C zero extend n
  76
  77 C Load various constants into vector registers
     C r11 is used from here on as the base address of cnsts; r12 = 16 is the
     C byte offset of the second vector of each 32-byte pair.
  78         LEAL(   r11, cnsts)
  79         li      r12, 16
  80         vspltisb cnt4, 4                C 0x0404...04 used as shift count
  81
     C NOTE(review): nothing reads r7 before the addi r7, r11, 80 further down
     C overwrites it, so this li looks redundant.
  82         li      r7, 160
  83         lvx     rtab, 0, r11
  84
     C 64-bit limbs only: r0 = LIMBS_CHUNK_THRES * 2^16, and cr7 records how the
     C incoming n compares with it.  Both stay live until the outer loop that
     C follows L(rt).
  85 LIMB64(`lis     r0, LIMBS_CHUNK_THRES   ')
  86 LIMB64(`cmpd    cr7, n, r0              ')
  87
     C Fetch the 16-byte aligned block containing ap; lvx ignores the low four
     C address bits.  Then AND it with the low-end mask found at
     C cnsts + 80 + 4 * (ap mod 16), which clears the bytes in front of ap.
  88         lvx     v0, 0, ap
  89         addi    r7, r11, 80
  90         rlwinm  r6, ap, 2,26,29
  91         lvx     v8, r7, r6
  92         vand    v0, v0, v8
  93
     C r8 = number of whole limbs between the start of that aligned block and ap.
  94 LIMB32(`rlwinm  r8, ap, 30,30,31        ')
  95 LIMB64(`rlwinm  r8, ap, 29,31,31        ')
  96         add     n, n, r8                C compensate n for rounded down `ap'
  97
     C v1 = 0 stands in for the second vector on the direct path to L(sum).
     C r8 is reused from here on as the scalar running total.
  98         vxor    v1, v1, v1
  99         li      r8, 0                   C grand total count
 100
 101         vxor    v12, v12, v12           C zero total count
 102         vxor    v13, v13, v13           C zero total count
 103
     C Operand ends inside the first vector: finish at L(sum) with v0 as loaded.
 104         addic.  n, n, -LIMBS_PER_VR
 105         ble     L(sum)
 106
     C Operand ends inside the second vector: L(lsum) loads it and finishes.
 107         addic.  n, n, -LIMBS_PER_VR
 108         ble     L(lsum)
 109
 110 C For 64-bit machines, handle huge n that would overflow vsum4ubs
     C When n is above the threshold, r9 keeps what is left after this chunk and
     C n becomes one chunk of LIMBS_PER_CHUNK * 2^16 limbs.
 111 LIMB64(`ble     cr7, L(small)           ')
 112 LIMB64(`addis   r9, n, -LIMBS_PER_CHUNK ') C remaining n
 113 LIMB64(`lis     n, LIMBS_PER_CHUNK      ')
 114
 115         ALIGN(16)
 116 L(small):
     C ctr = n / LIMBS_PER_2VR + 1.  The extra iteration is the first one, which
     C consumes the two vectors already subtracted from n above.
 117 LIMB32(`srwi    r7, n, 3        ')      C loop count corresponding to n
 118 LIMB64(`srdi    r7, n, 2        ')      C loop count corresponding to n
 119         addi    r7, r7, 1
 120         mtctr   r7                      C copy n to count register
 121         b       L(ent)
 122
     C Inner loop, 32 bytes per iteration.  Entering at L(ent) skips the load of
     C v0, which at that point already holds the masked first vector.
 123         ALIGN(16)
 124 L(top):
 125         lvx     v0, 0, ap
 126 L(ent): lvx     v1, r12, ap
 127         addi    ap, ap, 32
     C v8, v9 = each byte of v0, v1 shifted right by 4, i.e. the high nibbles.
 128         vsrb    v8, v0, cnt4
 129         vsrb    v9, v1, cnt4
     C Table lookups.  vperm uses the low five bits of each selector byte; with
     C rtab as both sources that is a lookup on the low four bits.
 130         vperm   v2, rtab, rtab, v0
 131         vperm   v3, rtab, rtab, v8
 132         vperm   v4, rtab, rtab, v1
 133         vperm   v5, rtab, rtab, v9
     C v6, v7 = bit count of every byte, low nibble count plus high nibble count.
 134         vaddubm v6, v2, v3
 135         vaddubm v7, v4, v5
     C Add each group of four byte counts into the matching word of v12 and v13.
 136         vsum4ubs v12, v6, v12
 137         vsum4ubs v13, v7, v13
 138         bdnz    L(top)
 139
     C n mod LIMBS_PER_2VR limbs are left over; zero means nothing more to load.
 140         andi.   n, n, eval(LIMBS_PER_2VR-1)
 141         beq     L(rt)
 142
     C Load the next vector.  If at most LIMBS_PER_VR limbs remain it is the last
     C one and v1 stays zero; otherwise fall into L(lsum) for one more vector.
 143         lvx     v0, 0, ap
 144         vxor    v1, v1, v1
 145         cmpwi   n, LIMBS_PER_VR
 146         ble     L(sum)
 147 L(lsum):
     C Move the vector in hand to v1 and load the final vector into v0.
 148         vor     v1, v0, v0
 149         lvx     v0, r12, ap
 150 L(sum):
     C Tail: v0 is the last vector.  r6 = 16 * (n mod LIMBS_PER_VR) selects a
     C high-end mask starting at cnsts + 16; offset 0 is the all-ones mask.  Only
     C the low bits of n are used, so n may be zero or negative here.
 151 LIMB32(`rlwinm  r6, n, 4,26,27  ')
 152 LIMB64(`rlwinm  r6, n, 5,26,26  ')
 153         addi    r7, r11, 16
 154         lvx     v8, r7, r6
 155         vand    v0, v0, v8
     C Same nibble lookup and accumulation as in the inner loop, done once.
 156         vsrb    v8, v0, cnt4
 157         vsrb    v9, v1, cnt4
 158         vperm   v2, rtab, rtab, v0
 159         vperm   v3, rtab, rtab, v8
 160         vperm   v4, rtab, rtab, v1
 161         vperm   v5, rtab, rtab, v9
 162         vaddubm v6, v2, v3
 163         vaddubm v7, v4, v5
 164         vsum4ubs v12, v6, v12
 165         vsum4ubs v13, v7, v13
 166
     C Reduce: add v12 and v13 word by word, spill the four words to the 16 bytes
     C below r1, and add them into r8 with scalar loads.
     C NOTE(review): stvx ignores the low four address bits, so the lwz offsets
     C below match the stored data only if r1 is 16-byte aligned -- confirm
     C against the target ABIs.
 167         ALIGN(16)
 168 L(rt):  vadduwm v3, v12, v13
 169         li      r7, -16                 C FIXME: does all ppc32 and ppc64 ABIs
 170         stvx    v3, r7, r1              C FIXME: ...support storing below sp?
 171
 172         lwz     r7, -16(r1)
 173         add     r8, r8, r7
 174         lwz     r7, -12(r1)
 175         add     r8, r8, r7
 176         lwz     r7, -8(r1)
 177         add     r8, r8, r7
 178         lwz     r7, -4(r1)
 179         add     r8, r8, r7
 180
 181 C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
     C If the last comparison of n with r0 was not greater-than we are done.
     C Otherwise clear the vector accumulators, take the remaining count from r9,
     C split off another chunk when it is still above the threshold, and go back
     C to L(top).  The loop count has no +1 here because the loop is entered at
     C L(top) rather than at L(ent).
 182 LIMB64(`ble     cr7, L(ret)
 183         vxor    v12, v12, v12           C zero total count
 184         vxor    v13, v13, v13           C zero total count
 185         mr      n, r9
 186         cmpd    cr7, n, r0
 187         ble     cr7, L(2)
 188         addis   r9, n, -LIMBS_PER_CHUNK C remaining n
 189         lis     n, LIMBS_PER_CHUNK
 190 L(2):   srdi    r7, n, 2                C loop count corresponding to n
 191         mtctr   r7                      C copy n to count register
 192         b       L(top)
 193 ')
 194
     C Return the total in r3 and restore the VRSAVE value saved in r10 on entry.
 195         ALIGN(16)
 196 L(ret): mr      r3, r8
 197         mtspr   256, r10
 198         blr
 199 EPILOGUE()
 200
 201 DEF_OBJECT(cnsts,16)
     C Constant table read by mpn_popcount with lvx.  Layout by byte offset:
     C     0  bit counts of the nibble values 0 to 15, loaded into rtab
     C    16  four 16-byte masks keeping the first 16, 4, 8 or 12 bytes
     C    80  four 16-byte masks clearing the first 0, 4, 8 or 12 bytes
     C The second DEF_OBJECT argument is presumably the alignment; lvx ignores
     C the low four address bits, so the table must be 16-byte aligned -- confirm
     C against the DEF_OBJECT definition.
 202 C Counts for vperm
 203         .byte   0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
 204         .byte   0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
 205 C Masks for high end of number
     C Selected at L(sum) from the number of limbs in the last vector.
 206         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 207         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 208
 209         .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
 210         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 211
 212         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 213         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 214
 215         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 216         .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
 217 C Masks for low end of number
     C Selected on entry from ap mod 16, to drop the bytes that precede ap.
 218         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 219         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 220
 221         .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
 222         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 223
 224         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 225         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 226
 227         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 228         .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
 229 END_OBJECT(cnsts)
 230 ASM_END()