dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C                   16-byte coaligned      unaligned
C                      cycles/limb        cycles/limb
C 7400,7410 (G4):          0.5                0.64
C 744x,745x (G4+):         0.75               0.82
C 970 (G5):                0.78               1.02	(64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.
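
C For reference, the job of mpn_copyd is roughly the following C loop (a
C sketch, using GMP's mp_limb_t/mp_size_t types); copying from the highest
C limb downwards makes the routine usable for overlapping operands with
C rp >= up:
C
C	void
C	mpn_copyd (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = n - 1; i >= 0; i--)	/* highest limb first */
C	    rp[i] = up[i];
C	}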

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')

define(`us', `v4')
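C us holds the permute control produced by lvsl on the unaligned-source path below.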

ASM_START()
PROLOGUE(mpn_copyd)

LIMB32(`slwi. r0, n, 2 ')
LIMB64(`sldi. r0, n, 3 ')
	add rp, rp, r0
	add up, up, r0

LIMB32(`cmpi cr7, n, 11 ')
LIMB64(`cmpdi cr7, n, 5 ')
	bge cr7, L(big)

	beqlr cr0

C Handle small cases with plain operations
	mtctr n
L(topS):
LIMB32(`lwz r0, -4(up) ')
LIMB64(`ld r0, -8(up) ')
	addi up, up, -GMP_LIMB_BYTES
LIMB32(`stw r0, -4(rp) ')
LIMB64(`std r0, -8(rp) ')
	addi rp, rp, -GMP_LIMB_BYTES
	bdnz L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	addi rp, rp, -16
	addi up, up, -16
	mfspr r12, 256
	oris r0, r12, 0xf800	C Set VRSAVE bit 0-4
	mtspr 256, r0
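
C Copy limbs one at a time until rp is 16-byte aligned, so that the vector
C stores below are all aligned.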
LIMB32(`rlwinm. r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm. r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq L(rp_aligned)

	subf n, r7, n
L(top0):
LIMB32(`lwz r0, 12(up) ')
LIMB64(`ld r0, 8(up) ')
	addi up, up, -GMP_LIMB_BYTES
LIMB32(`addic. r7, r7, -1 ')
LIMB32(`stw r0, 12(rp) ')
LIMB64(`std r0, 8(rp) ')
	addi rp, rp, -GMP_LIMB_BYTES
LIMB32(`bne L(top0) ')

L(rp_aligned):

LIMB32(`rlwinm. r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm. r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi r7, n, 2 ')	C loop count corresponding to n
LIMB32(`srwi r7, n, 3 ')	C loop count corresponding to n
	mtctr r7		C copy n to count register

	li r10, -16

	beq L(up_aligned)
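
C up is not 16-byte aligned: lvsl derives a permute control from up's
C misalignment, and each iteration loads two aligned quadwords and merges
C them with vperm, so every store to rp stays aligned.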
	lvsl us, 0, up

	addi up, up, 16
LIMB32(`andi. r0, n, 0x4 ')
LIMB64(`andi. r0, n, 0x2 ')
	beq L(1)
	lvx v0, 0, up
	lvx v2, r10, up
	vperm v3, v2, v0, us
	stvx v3, 0, rp
	addi up, up, -32
	addi rp, rp, -16
	b L(lpu)
L(1):	lvx v2, 0, up
	addi up, up, -16
	b L(lpu)

	ALIGN(32)
L(lpu):	lvx v0, 0, up
	vperm v3, v0, v2, us
	stvx v3, 0, rp
	lvx v2, r10, up
	addi up, up, -32
	vperm v3, v2, v0, us
	stvx v3, r10, rp
	addi rp, rp, -32
	bdnz L(lpu)

	b L(tail)
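
C rp and up are both 16-byte aligned: plain lvx/stvx, moving 32 bytes per
C loop iteration.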
L(up_aligned):

LIMB32(`andi. r0, n, 0x4 ')
LIMB64(`andi. r0, n, 0x2 ')
	beq L(lpa)
	lvx v0, 0, up
	stvx v0, 0, rp
	addi up, up, -16
	addi rp, rp, -16
	b L(lpa)

	ALIGN(32)
L(lpa):	lvx v0, 0, up
	lvx v1, r10, up
	addi up, up, -32

	stvx v0, 0, rp
	stvx v1, r10, rp
	addi rp, rp, -32
	bdnz L(lpa)
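
C Copy the final n mod LIMBS_PER_VR limbs with scalar loads and stores.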
L(tail):
LIMB32(`rlwinm. r7, n, 0,30,31 ')	C r7 = n mod 4
LIMB64(`rlwinm. r7, n, 0,31,31 ')	C r7 = n mod 2
	beq L(ret)
LIMB32(`li r10, 12 ')
L(top2):
LIMB32(`lwzx r0, r10, up ')
LIMB64(`ld r0, 8(up) ')
LIMB32(`addic. r7, r7, -1 ')
LIMB32(`stwx r0, r10, rp ')
LIMB64(`std r0, 8(rp) ')
LIMB32(`addi r10, r10, -GMP_LIMB_BYTES')
LIMB32(`bne L(top2) ')

L(ret):	mtspr 256, r12		C restore caller's VRSAVE
	blr
EPILOGUE()