dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                    16-byte coaligned      unaligned
C                       cycles/limb        cycles/limb
C 7400,7410 (G4):           0.5               0.64
C 744x,745x (G4+):          0.75              0.82
C 970 (G5):                 0.78              1.02       (64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
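
C A VMX vector register (VR) holds 16 bytes, so LIMBS_PER_VR is 4 limbs
C with 32-bit limbs and 2 limbs with 64-bit limbs; the unrolled loops
C below move LIMBS_PER_2VR limbs (two VRs) per iteration.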

ifelse(GMP_LIMB_BITS,32,`
        define(`LIMB32',`       $1')
        define(`LIMB64',`')
',`
        define(`LIMB32',`')
        define(`LIMB64',`       $1')
')
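
C LIMB32(x) and LIMB64(x) expand their argument only under the matching
C limb size, letting this one source file carry both the 32-bit and the
C 64-bit instruction sequences side by side.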

C INPUT PARAMETERS
define(`rp',  `r3')
define(`up',  `r4')
define(`n',   `r5')

define(`us',  `v4')
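
C us holds the permute control vector produced by lvsl below; it stays
C live across the whole unaligned copy loop.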

ASM_START()
PROLOGUE(mpn_copyi)

LIMB32(`cmpi    cr7, n, 11      ')
LIMB64(`cmpdi   cr7, n, 5       ')
        bge     cr7, L(big)
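
C Below 11 limbs (32-bit) or 5 limbs (64-bit) the VMX setup presumably
C does not pay off, so such operands take the scalar loop below.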

        or.     r0, n, n
        beqlr   cr0

C Handle small cases with plain operations
        mtctr   n
L(topS):
LIMB32(`lwz     r0, 0(up)       ')
LIMB64(`ld      r0, 0(up)       ')
        addi    up, up, GMP_LIMB_BYTES
LIMB32(`stw     r0, 0(rp)       ')
LIMB64(`std     r0, 0(rp)       ')
        addi    rp, rp, GMP_LIMB_BYTES
        bdnz    L(topS)
        blr

C Handle large cases with VMX operations
L(big):
        mfspr   r12, 256
        oris    r0, r12, 0xf800         C set VRSAVE bits 0-4
        mtspr   256, r0
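
C VRSAVE (SPR 256) advertises which vector registers are live so the OS
C preserves them across context switches; bits 0-4 cover v0-v4.  The
C caller's mask is saved in r12 and restored at L(ret).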

LIMB32(`rlwinm. r7, rp, 30,30,31')      C (rp >> 2) mod 4
LIMB64(`rlwinm. r7, rp, 29,31,31')      C (rp >> 3) mod 2
        beq     L(rp_aligned)

        subfic  r7, r7, LIMBS_PER_VR
        subf    n, r7, n
L(top0):
LIMB32(`lwz     r0, 0(up)       ')
LIMB64(`ld      r0, 0(up)       ')
        addi    up, up, GMP_LIMB_BYTES
LIMB32(`addic.  r7, r7, -1      ')
LIMB32(`stw     r0, 0(rp)       ')
LIMB64(`std     r0, 0(rp)       ')
        addi    rp, rp, GMP_LIMB_BYTES
LIMB32(`bne     L(top0) ')
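
C The head loop above copies scalar limbs until rp reaches a 16-byte
C boundary.  With 64-bit limbs r7 can only be 1 at L(top0), so that build
C needs no counter decrement or backward branch.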

L(rp_aligned):

LIMB32(`rlwinm. r0, up, 30,30,31')      C (up >> 2) mod 4
LIMB64(`rlwinm. r0, up, 29,31,31')      C (up >> 3) mod 2

LIMB64(`srdi    r7, n, 2        ')      C loop count corresponding to n
LIMB32(`srwi    r7, n, 3        ')      C loop count corresponding to n
        mtctr   r7                      C copy loop count to count register
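
C Each iteration of L(lpu)/L(lpa) moves 32 bytes, i.e. LIMBS_PER_2VR
C limbs, so the count is n/8 with 32-bit limbs and n/4 with 64-bit limbs;
C the entry code and the tail absorb the remainder.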

        li      r10, 16

        beq     L(up_aligned)

        lvsl    us, 0, up
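
C lvx ignores the low four address bits, so an unaligned source is read
C as the two straddling aligned quadwords and merged with vperm; lvsl
C derives the required permute mask from up's low address bits.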

LIMB32(`andi.   r0, n, 0x4      ')
LIMB64(`andi.   r0, n, 0x2      ')
        beq     L(1)
        lvx     v0, 0, up
        lvx     v2, r10, up
        vperm   v3, v0, v2, us
        stvx    v3, 0, rp
        addi    up, up, 32
        addi    rp, rp, 16
        b       L(lpu)
L(1):   lvx     v2, 0, up
        addi    up, up, 16
        b       L(lpu)
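
C Both entry paths prime v2 with the most recently read quadword and leave
C an even number of quadwords for the 2-way unrolled loop; the first path
C also copies one quadword when the remaining quadword count is odd.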

        ALIGN(32)
L(lpu): lvx     v0, 0, up
        vperm   v3, v2, v0, us
        stvx    v3, 0, rp
        lvx     v2, r10, up
        addi    up, up, 32
        vperm   v3, v0, v2, us
        stvx    v3, r10, rp
        addi    rp, rp, 32
        bdnz    L(lpu)

        addi    up, up, -16
        b       L(tail)
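
C The loop is software pipelined: v2 always carries the last quadword
C loaded, so each source quadword is loaded exactly once.  Because the
C loop thus reads 16 bytes ahead, up is backed up before the scalar tail.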

L(up_aligned):

LIMB32(`andi.   r0, n, 0x4      ')
LIMB64(`andi.   r0, n, 0x2      ')
        beq     L(lpa)
        lvx     v0, 0, up
        stvx    v0, 0, rp
        addi    up, up, 16
        addi    rp, rp, 16
        b       L(lpa)

        ALIGN(32)
L(lpa): lvx     v0, 0, up
        lvx     v1, r10, up
        addi    up, up, 32
        nop
        stvx    v0, 0, rp
        stvx    v1, r10, rp
        addi    rp, rp, 32
        bdnz    L(lpa)
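
C Coaligned case: plain lvx/stvx pairs, 2-way unrolled; no permute is
C needed since both pointers are now 16-byte aligned.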

L(tail):
LIMB32(`rlwinm. r7, n, 0,30,31  ')      C r7 = n mod 4
LIMB64(`rlwinm. r7, n, 0,31,31  ')      C r7 = n mod 2
        beq     L(ret)
LIMB32(`li      r10, 0  ')
L(top2):
LIMB32(`lwzx    r0, r10, up     ')
LIMB64(`ld      r0, 0(up)       ')
LIMB32(`addic.  r7, r7, -1      ')
LIMB32(`stwx    r0, r10, rp     ')
LIMB64(`std     r0, 0(rp)       ')
LIMB32(`addi    r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne     L(top2) ')
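
C The tail copies the n mod LIMBS_PER_VR leftover limbs with scalar
C loads/stores.  In the 64-bit build at most one limb remains, so a
C single ld/std pair suffices and no loop control is emitted.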

L(ret): mtspr   256, r12
        blr
EPILOGUE()