dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
dnl  mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
dnl  logical operations.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   and,ior,andn,nior,xor    iorn,xnor       nand
C                        cycles/limb        cycles/limb   cycles/limb
C 7400,7410 (G4):           1.39                 ?             ?
C 744x,745x (G4+):          1.14                1.39          1.39
C 970:                      1.7                 2.0           2.0

C STATUS
C  * Works for all sizes and alignments for 32-bit limbs.
C  * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
C  * Current performance makes this pointless for 970.

C TODO
C  * Might want to make variants when just one of the source operands needs
C    vperm, and when neither needs it.  The latter runs 50% faster on 7400.
C  * Idea: If the source operands are equally aligned, we could do the logops
C    first, then vperm before storing!  That means we never need more than one
C    vperm, ever!  (See the sketch after this list.)
C  * Perhaps align `rp' after initial alignment loop?
C  * Instead of having scalar code in the beginning and end, consider using
C    read-modify-write vector code.
C  * Software pipeline?  Hopefully not too important, this is hairy enough
C    already.
C  * At least be more clever about operand loading, i.e., load v operands before
C    u operands, since v operands are sometimes negated.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
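
C (For instance: with 32-bit limbs GMP_LIMB_BYTES is 4, giving LIMBS_PER_VR = 4
C and LIMBS_PER_2VR = 8; with 64-bit limbs the values are 2 and 4, i.e. a
C 16-byte vector register always holds 16/GMP_LIMB_BYTES limbs.)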

define(`vnegb', `')		C default neg-before to null
define(`vnega', `')		C default neg-after to null

ifdef(`OPERATION_and_n',
`	define(`func',	`mpn_and_n')
	define(`logopS',`and	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')')
ifdef(`OPERATION_andn_n',
`	define(`func',	`mpn_andn_n')
	define(`logopS',`andc	$1,$2,$3')
	define(`logop',	`vandc	$1,$2,$3')')
ifdef(`OPERATION_nand_n',
`	define(`func',	`mpn_nand_n')
	define(`logopS',`nand	$1,$2,$3')
	define(`logop',	`vand	$1,$2,$3')
	define(`vnega',	`vnor	$1,$2,$2')')
ifdef(`OPERATION_ior_n',
`	define(`func',	`mpn_ior_n')
	define(`logopS',`or	$1,$2,$3')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_iorn_n',
`	define(`func',	`mpn_iorn_n')
	define(`logopS',`orc	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vor	$1,$2,$3')')
ifdef(`OPERATION_nior_n',
`	define(`func',	`mpn_nior_n')
	define(`logopS',`nor	$1,$2,$3')
	define(`logop',	`vnor	$1,$2,$3')')
ifdef(`OPERATION_xor_n',
`	define(`func',	`mpn_xor_n')
	define(`logopS',`xor	$1,$2,$3')
	define(`logop',	`vxor	$1,$2,$3')')
ifdef(`OPERATION_xnor_n',
`	define(`func',	`mpn_xnor_n')
	define(`logopS',`eqv	$1,$2,$3')
	define(`vnegb',	`vnor	$1,$2,$2')
	define(`logop',	`vxor	$1,$2,$3')')
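
C (Illustrative summary, not in the original: the scalar path logopS always
C has a native instruction available, while the VMX path synthesizes the
C three operations lacking a direct vector opcode from vand/vor/vxor plus a
C vnor-based complement:
C	nand:	vnega after  vand	->  ~(u & v)
C	iorn:	vnegb before vor	->   u | ~v
C	xnor:	vnegb before vxor	->   u ^ ~v  ==  ~(u ^ v) )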

ifelse(GMP_LIMB_BITS,`32',`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`vp',	`r5')
define(`n',	`r6')

define(`us',	`v8')
define(`vs',	`v9')
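
C (us and vs hold the permute control vectors produced by lvsl for up and vp
C respectively; they steer the vperm-based unaligned loads below.)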

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

ASM_START()
PROLOGUE(func)

LIMB32(`cmpwi	cr0, n, 8	')
LIMB64(`cmpdi	cr0, n, 4	')
	bge	L(big)
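
C (Fewer than two vector registers' worth of limbs, i.e. n < 8 with 32-bit
C limbs or n < 4 with 64-bit limbs, is not worth the VMX setup; such operands
C are handled entirely by the scalar loop below.)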

	mtctr	n

LIMB32(`lwz	r8, 0(up)	')
LIMB32(`lwz	r9, 0(vp)	')
LIMB32(`logopS(	r0, r8, r9)	')
LIMB32(`stw	r0, 0(rp)	')
LIMB32(`bdz	L(endS)	')

L(topS):
LIMB32(`lwzu	r8, 4(up)	')
LIMB64(`ld	r8, 0(up)	')
LIMB64(`addi	up, up, GMP_LIMB_BYTES	')
LIMB32(`lwzu	r9, 4(vp)	')
LIMB64(`ld	r9, 0(vp)	')
LIMB64(`addi	vp, vp, GMP_LIMB_BYTES	')
	logopS(	r0, r8, r9)
LIMB32(`stwu	r0, 4(rp)	')
LIMB64(`std	r0, 0(rp)	')
LIMB64(`addi	rp, rp, GMP_LIMB_BYTES	')
	bdnz	L(topS)
L(endS):
	blr

L(big):	mfspr	r12, 256
	oris	r0, r12, 0xfffc		C set VRSAVE bits 0-13  FIXME
	mtspr	256, r0

C First loop until the destination is 16-byte aligned.  This will execute
C 0 or 1 times for 64-bit machines, and 0 to 3 times for 32-bit machines.
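C (Worked example: on a 32-bit machine with rp % 16 == 8, the rlwinm below
C extracts r0 = 2, so r7 = LIMBS_PER_VR - r0 = 2 limbs are processed here
C before the vector code takes over.)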

LIMB32(`rlwinm.	r0, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r0, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(aligned)

	subfic	r7, r0, LIMBS_PER_VR
LIMB32(`li	r10, 0	')
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r8, 0(up)	')
LIMB64(`ld	r8, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`lwz	r9, 0(vp)	')
LIMB64(`ld	r9, 0(vp)	')
	addi	vp, vp, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top0)	')

	addi	rp, rp, 16		C update rp, but preserve its alignment
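C (This works because lvx/stvx ignore the low four bits of the effective
C address: the vector stores below snap to the 16-byte boundary no matter
C what scalar offset is still encoded in rp.)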

L(aligned):
LIMB64(`srdi	r7, n, 1	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 2	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

	li	r10, 16
	lvsl	us, 0, up
	lvsl	vs, 0, vp
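
C (lvsl derives a permute mask from the low four address bits; a vperm of two
C adjacent aligned lvx loads through that mask reconstructs the unaligned
C 16-byte word.  This is the standard AltiVec unaligned-load idiom.)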

	lvx	v2, 0, up
	lvx	v3, 0, vp
	bdnz	L(gt1)
	lvx	v0, r10, up
	lvx	v1, r10, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 4
	b	L(tail)

L(gt1):	addi	up, up, 16
	addi	vp, vp, 16

L(top):	lvx	v0, 0, up
	lvx	v1, 0, vp
	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	bdz	L(end)
	lvx	v2, r10, up
	lvx	v3, r10, vp
	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp
	addi	up, up, 32
	addi	vp, vp, 32
	addi	rp, rp, 32
	bdnz	L(top)
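
C (Fell out of the loop: one last result vector must still be assembled from
C v2/v3 plus the first bytes of the next 16-byte block.  When a source
C pointer is 16-byte aligned, that block holds no operand data and loading it
C could fault, so zeros are substituted; the vperm mask then selects bytes
C from v2/v3 only.)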
	andi.	r0, up, 15
	vxor	v0, v0, v0
	beq	1f
	lvx	v0, 0, up
1:	andi.	r0, vp, 15
	vxor	v1, v1, v1
	beq	1f
	lvx	v1, 0, vp
1:	vperm	v4, v2, v0, us
	vperm	v5, v3, v1, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, 0, rp
	addi	rp, rp, 4
	b	L(tail)

L(end):	andi.	r0, up, 15
	vxor	v2, v2, v2
	beq	1f
	lvx	v2, r10, up
1:	andi.	r0, vp, 15
	vxor	v3, v3, v3
	beq	1f
	lvx	v3, r10, vp
1:	vperm	v4, v0, v2, us
	vperm	v5, v1, v3, vs
	vnegb(	v5, v5)
	logop(	v6, v4, v5)
	vnega(	v6, v6)
	stvx	v6, r10, rp

	addi	up, up, 16
	addi	vp, vp, 16
	addi	rp, rp, 20

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
	addi	rp, rp, 15
LIMB32(`rlwinm	rp, rp, 0,0,27	')
LIMB64(`rldicr	rp, rp, 0,59	')
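C (Rounds rp up to the next 16-byte boundary.  Together with the rp
C adjustments of 4 and 20 above, this lands exactly past the last vector
C store, where the remaining scalar limbs belong.)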
	li	r10, 0
L(top2):
LIMB32(`lwzx	r8, r10, up	')
LIMB64(`ldx	r8, r10, up	')
LIMB32(`lwzx	r9, r10, vp	')
LIMB64(`ldx	r9, r10, vp	')
LIMB32(`addic.	r7, r7, -1	')
	logopS(	r0, r8, r9)
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()

C This works for 64-bit PowerPC, since a limb ptr can only be aligned in two
C relevant ways, which means that among rp, up, and vp we can always find a
C pair of equally aligned pointers.
C	process words until rp is 16-byte aligned
C	if (((up | vp) & 15) == 0)
C	  process with VMX without any vperm
C	else if ((up & 15) != 0 && (vp & 15) != 0)
C	  process with VMX using vperm on store data
C	else if ((up & 15) != 0)
C	  process with VMX using vperm on up data
C	else
C	  process with VMX using vperm on vp data

C	rlwinm.	r0, up, 0,28,31
C	rlwinm	r0, vp, 0,28,31
C	cmpwi	cr7, r0, 0
C	cror	cr6, cr0, cr7
C	crand	cr0, cr0, cr7
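
C (One possible reading of the fragment above: with cr0's EQ bit set when up
C is 16-byte aligned and cr7's EQ bit set when vp is, crand yields "both
C aligned" and cror "at least one aligned", which is enough to select among
C the four cases listed.)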