source/libs/gmp/gmp-src/mpn/x86/pentium4/sse2/popcount.asm

   1 dnl  X86-32 and X86-64 mpn_popcount using SSE2.
   2
   3 dnl  Copyright 2006, 2007, 2011, 2015 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31
  32 include(`../config.m4')
  33
  34
  35 C 32-bit                     popcount        hamdist
  36 C                           cycles/limb     cycles/limb
  37 C P5                            -
  38 C P6 model 0-8,10-12            -
  39 C P6 model 9  (Banias)          ?
  40 C P6 model 13 (Dothan)          4
  41 C P4 model 0  (Willamette)      ?
  42 C P4 model 1  (?)               ?
  43 C P4 model 2  (Northwood)       3.9
  44 C P4 model 3  (Prescott)        ?
  45 C P4 model 4  (Nocona)          ?
  46 C AMD K6                        -
  47 C AMD K7                        -
  48 C AMD K8                        ?
  49
  50 C 64-bit                     popcount        hamdist
  51 C                           cycles/limb     cycles/limb
  52 C P4 model 4 (Nocona):          8
  53 C AMD K8,K9                     7.5
  54 C AMD K10                       3.5
  55 C Intel core2                   3.68
  56 C Intel corei                   3.15
  57 C Intel atom                   10.8
  58 C VIA nano                      6.5
  59
  60 C TODO
  61 C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
  62 C    using movdqu for one operand and movdqa for the other, or by painfully
   63 C    shifting as we go.  Unfortunately, there seems to be no usable shift
  64 C    instruction, except for one that takes an immediate count.
  65 C  * It would probably be possible to cut a few cycles/limb using software
  66 C    pipelining.
  67 C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
  68 C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
  69 C    integer based popcount.  Such a combined loop would handle 6 limbs in
  70 C    about 30 cycles on K8.
  71 C  * We could save a byte or two by using 32-bit operations on areg.
   72 C  * Check if using movdqa to a temp and then register-based pand is faster.
  73
   74 ifelse(GMP_LIMB_BITS,`32',
   75 `       define(`up',  `%edx')           C pointer to the limbs
   76         define(`n',   `%ecx')           C limb count
   77         define(`areg',`%eax')           C scratch, later the result
   78         define(`breg',`%ebx')           C base address of the constants
   79         define(`zero',`%xmm4')          C zero operand for psadbw
   80         define(`LIMB32',`       $1')    C emit the argument on this build
   81         define(`LIMB64',`dnl')          C drop the line on this build
   82 ',`
   83         define(`up',  `%rdi')           C pointer to the limbs
   84         define(`n',   `%rsi')           C limb count
   85         define(`areg',`%rax')           C scratch, later the result
   86         define(`breg',`%rdx')           C base address of the constants
   87         define(`zero',`%xmm8')          C zero operand for psadbw
   88         define(`LIMB32',`dnl')          C drop the line on this build
   89         define(`LIMB64',`       $1')    C emit the argument on this build
   90 ')
   91 C Names for the xmm registers that hold the three bit masks loaded from the constants.
   92 define(`mm01010101',`%xmm6')
   93 define(`mm00110011',`%xmm7')
   94 define(`mm00001111',`%xmm2')
   95 C Limb size in bytes, and the number of limbs held by one and by two xmm registers.
   96 define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
   97 define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
   98 define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
   99
  100 undefine(`psadbw')                      C override inherited m4 version
  101 C NOTE(review): psadbw is presumably a macro from an included m4 file; confirm.
  102 C This file is shared between 32-bit and 64-bit builds.  Only the former has
  103 C LEAL.  Default LEAL as an alias of LEA.
  104 ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
 105
  106 ASM_START()
  107 C When DARWIN is defined, cnsts is renamed through MPN and declared with GLOBL.
  108 C Make cnsts global to work around Apple relocation bug.
  109 ifdef(`DARWIN',`
  110         define(`cnsts', MPN(popccnsts))
  111         GLOBL   cnsts')
  112 C NOTE(review): presumably selects the text section and 32-byte entry alignment; confirm.
  113         TEXT
  114         ALIGN(32)
  115 PROLOGUE(mpn_popcount)
  116 C Count the set bits in the n limbs starting at up; the total is returned in areg.
  117 LIMB32(`mov     4(%esp), up     ')
  118 LIMB32(`mov     8(%esp), n      ')
  119 LIMB32(`push    %ebx            ')
  120 C 32-bit builds take both arguments from the stack and save ebx, which is breg there.
  121         pxor    %xmm3, %xmm3            C zero grand total count
  122 LIMB64(`pxor    zero, zero      ')
  123 ifdef(`PIC',`
  124         LEAL(   cnsts, breg)
  125 ',`
  126 LIMB32(`mov     $cnsts, breg    ')
  127 LIMB64(`movabs  $cnsts, breg    ')
  128 ')
  129 C breg now addresses cnsts; the three bit masks sit in the 48 bytes just below it.
  130         movdqa  -48(breg), mm01010101
  131         movdqa  -32(breg), mm00110011
  132         movdqa  -16(breg), mm00001111
  133 C Round the pointer down to a 16-byte boundary and blank the bytes that precede the operand.
  134         mov     up, areg
  135         and     $-16, up                C round `up' down to 128-bit boundary
  136         and     $12, areg               C 32:areg = 0, 4, 8, 12
  137                                         C 64:areg = 0, 8
  138         movdqa  (up), %xmm0
  139         pand    64(breg,areg,4), %xmm0  C low-end mask chosen by the misalignment
  140         shr     $m4_log2(GMP_LIMB_BYTES), %eax  C misalignment in limbs; eax is the low part of areg
  141         add     areg, n                 C compensate n for rounded down `up'
  142 C Clear xmm4 so that L(sum) counts nothing for it when one vector covers the operand.
  143         pxor    %xmm4, %xmm4
  144         sub     $LIMBS_PER_XMM, n
  145         jbe     L(sum)                  C at most one vector in total
  146
  147         sub     $LIMBS_PER_XMM, n
  148         ja      L(ent)                  C more than two vectors: enter the loop
  149         jmp     L(lsum)                 C at most two vectors: tail code only
  150 C Main loop, two vectors per pass; L(ent) is used when xmm0 is already loaded and masked.
  151         ALIGN(16)
  152 L(top): movdqa  (up), %xmm0
  153 L(ent): movdqa  16(up), %xmm4
  154 C Step 1: x - ((x >> 1) & 0x55..) leaves a 2-bit count in every bit pair.
  155         movdqa  %xmm0, %xmm1
  156         movdqa  %xmm4, %xmm5
  157         psrld   $1, %xmm0
  158         psrld   $1, %xmm4
  159         pand    mm01010101, %xmm0
  160         pand    mm01010101, %xmm4
  161         psubd   %xmm0, %xmm1
  162         psubd   %xmm4, %xmm5
  163 C Step 2: (x & 0x33..) + ((x >> 2) & 0x33..) gives a 4-bit count per nibble.
  164         movdqa  %xmm1, %xmm0
  165         movdqa  %xmm5, %xmm4
  166         psrlq   $2, %xmm1
  167         psrlq   $2, %xmm5
  168         pand    mm00110011, %xmm0
  169         pand    mm00110011, %xmm4
  170         pand    mm00110011, %xmm1
  171         pand    mm00110011, %xmm5
  172         paddq   %xmm0, %xmm1
  173         paddq   %xmm4, %xmm5
  174 C On 32-bit builds zero is xmm4, which held data until here, so it is cleared again.
  175 LIMB32(`pxor    zero, zero      ')
  176 C Advance now; none of the SSE2 instructions below touch the flags set by this sub.
  177         add     $32, up
  178         sub     $LIMBS_PER_2XMM, n
  179 C Step 3: add both vectors, then fold the nibble counts into one count per byte.
  180         paddq   %xmm5, %xmm1
  181         movdqa  %xmm1, %xmm0
  182         psrlq   $4, %xmm1
  183         pand    mm00001111, %xmm0
  184         pand    mm00001111, %xmm1
  185         paddq   %xmm0, %xmm1
  186 C psadbw against zero sums the eight byte counts of each 64-bit half.
  187         psadbw  zero, %xmm1
  188         paddq   %xmm1, %xmm3            C add to grand total
  189
  190         jnc     L(top)                  C no borrow: at least two more vectors remain
  191 L(end):
  192         add     $LIMBS_PER_2XMM, n      C undo the last sub: n = limbs still to count
  193         jz      L(rt)
  194         movdqa  (up), %xmm0
  195         pxor    %xmm4, %xmm4
  196         sub     $LIMBS_PER_XMM, n
  197         jbe     L(sum)                  C one vector left
  198 L(lsum):                                C two vectors left: xmm4 = first, xmm0 = last
  199         movdqa  %xmm0, %xmm4
  200         movdqa  16(up), %xmm0
  201 L(sum):                                 C xmm0 = last vector, xmm4 = full vector or zero
  202         shl     $m4_log2(GMP_LIMB_BYTES), n     C limbs to bytes
  203         and     $12, n                  C valid bytes in xmm0 modulo 16, where 0 means all 16
  204         pand    (breg,n,4), %xmm0       C high-end mask: drop bytes past the operand end
  205 C Same three reduction steps as in the loop, applied to xmm0 and xmm4.
  206         movdqa  %xmm0, %xmm1
  207         movdqa  %xmm4, %xmm5
  208         psrld   $1, %xmm0
  209         psrld   $1, %xmm4
  210         pand    mm01010101, %xmm0
  211         pand    mm01010101, %xmm4
  212         psubd   %xmm0, %xmm1
  213         psubd   %xmm4, %xmm5
  214
  215         movdqa  %xmm1, %xmm0
  216         movdqa  %xmm5, %xmm4
  217         psrlq   $2, %xmm1
  218         psrlq   $2, %xmm5
  219         pand    mm00110011, %xmm0
  220         pand    mm00110011, %xmm4
  221         pand    mm00110011, %xmm1
  222         pand    mm00110011, %xmm5
  223         paddq   %xmm0, %xmm1
  224         paddq   %xmm4, %xmm5
  225
  226 LIMB32(`pxor    zero, zero      ')
  227
  228         paddq   %xmm5, %xmm1
  229         movdqa  %xmm1, %xmm0
  230         psrlq   $4, %xmm1
  231         pand    mm00001111, %xmm0
  232         pand    mm00001111, %xmm1
  233         paddq   %xmm0, %xmm1
  234
  235         psadbw  zero, %xmm1
  236         paddq   %xmm1, %xmm3            C add to grand total
  237
  238
  239 C Add the two 64-bit halves of the grand total counter
  240 L(rt):  movdqa  %xmm3, %xmm0
  241         psrldq  $8, %xmm3
  242         paddq   %xmm3, %xmm0
  243         movd    %xmm0, areg             C movq avoided due to gas bug
  244
  245 LIMB32(`pop     %ebx            ')
  246         ret
  247
  248 EPILOGUE()
  249 DEF_OBJECT(dummy,16)
  250 C Three magic constants used for masking out bits, read at cnsts-48, cnsts-32 and cnsts-16
  251         .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55         C low bit of every bit pair
  252         .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
  253
  254         .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33         C low pair of every nibble
  255         .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
  256
  257         .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f         C low nibble of every byte
  258         .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
  259 cnsts:
  260 C Masks for high end of number, selected by the valid byte count of the last vector modulo 16
  261         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff         C count 0: keep all 16 bytes
  262         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  263
  264         .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00         C count 4: keep bytes 0-3
  265         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  266
  267         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff         C count 8: keep bytes 0-7
  268         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
  269
  270         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff         C count 12: keep bytes 0-11
  271         .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
  272 C Masks for low end of number, at cnsts+64, selected by the byte misalignment of the pointer
  273         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff         C offset 0: keep all 16 bytes
  274         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  275
  276         .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff         C offset 4: clear bytes 0-3
  277         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  278
  279         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00         C offset 8: clear bytes 0-7
  280         .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
  281
  282         .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00         C offset 12: clear bytes 0-11
  283         .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
  284 END_OBJECT(dummy)
  285 ASM_END()