source/libs/gmp/gmp-src/mpn/alpha/com.asm

   1 dnl  Alpha mpn_com -- mpn one's complement.
   2
   3 dnl  Copyright 2003 Free Software Foundation, Inc.
   4
   5 dnl  This file is part of the GNU MP Library.
   6 dnl
   7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
   8 dnl  it under the terms of either:
   9 dnl
  10 dnl    * the GNU Lesser General Public License as published by the Free
  11 dnl      Software Foundation; either version 3 of the License, or (at your
  12 dnl      option) any later version.
  13 dnl
  14 dnl  or
  15 dnl
  16 dnl    * the GNU General Public License as published by the Free Software
  17 dnl      Foundation; either version 2 of the License, or (at your option) any
  18 dnl      later version.
  19 dnl
  20 dnl  or both in parallel, as here.
  21 dnl
  22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
  23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  25 dnl  for more details.
  26 dnl
  27 dnl  You should have received copies of the GNU General Public License and the
  28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
  29 dnl  see https://www.gnu.org/licenses/.
  30
  31 include(`../config.m4')
  32
  33
  34 C      cycles/limb
  35 C EV4:    4.75
  36 C EV5:    2.0
  37 C EV6:    1.5
  38
  39
  40 C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
  41 C
  42 C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
  43 C 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
  44 C will be 1.5+2/N c/l.
  45 C
  46 C 2 cycles of loop control are unavoidable, for pointer updates and the
  47 C taken branch bubble, but also since ldq cannot issue two cycles after stq
  48 C (and with a run of stqs that means neither of two cycles at the end of the
  49 C loop).
  50 C
  51 C The fbeq is forced into the second cycle of the loop using unops, since
  52 C the first time through it must wait for the cvtqt result.  Once that
  53 C result is ready (a 1 cycle stall) then both the branch and following loads
  54 C can issue together.
  55 C
  56 C The main loop handles an odd count of limbs, being two limbs loaded before
  57 C each size test, plus one pipelined around from the previous iteration (or
  58 C setup in the entry sequence).
  59 C
  60 C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
  61 C entry sequence, and an increment of the pointers.  For an odd size there's
  62 C no increment and the first store in the loop (r24) is a repeat of dst[0].
  63 C
  64 C Note that the load for r24 after the possible pointer increment is done
  65 C before the explicit store to dst[0], in case src==dst.
  66
  67
  68 ASM_START()
  69
  70 FLOAT64(L(dat), 2.0)                    C constant 2.0, loaded into f1 as the loop decrement
  71
  72         ALIGN(16)
  73
  74 PROLOGUE(mpn_com,gp)
  75
  76         C r16   dst  (destination limb pointer)
  77         C r17   src  (source limb pointer)
  78         C r18   size (number of limbs)
  79
  80         lda     r30, -16(r30)           C temporary stack space
  81         lda     r7, -3(r18)             C size - 3
  82
  83         ldq     r20, 0(r17)             C src[0]
  84         srl     r7, 1, r6               C (size-3)/2
  85
  86         stq     r6, 8(r30)              C (size-3)/2
  87         and     r7, 1, r5               C 1 if size even
  88
  89         LEA(    r8, L(dat))             C r8 = address of the 2.0 constant
  90         s8addq  r5, r17, r17            C skip src[0] if even
  91
  92         ornot   r31, r20, r20           C ~src[0]
  93         unop
  94
  95         ldt     f0, 8(r30)              C (size-3)/2
  96         ldq     r24, 0(r17)             C src[0 or 1]
  97
  98         stq     r20, 0(r16)             C dst[0]
  99         s8addq  r5, r16, r19            C skip dst[0] if even
 100
 101         ldt     f1, 0(r8)               C data 2.0
 102         lda     r30, 16(r30)            C restore stack
 103         unop
 104         cvtqt   f0, f0                  C (size-3)/2 as float
 105
 106         ornot   r31, r24, r24           C ~src[0 or 1], the pending store for loop or done_1
 107         blt     r7, L(done_1)           C if size<=2
 108         unop
 109         unop
 110
 111
 112         C 16-byte alignment here
 113 L(top):
 114         C r17   src, incrementing
 115         C r19   dst, incrementing
 116         C r24   dst[i] result, ready to store
 117         C f0    (size-3)/2, decrementing
 118         C f1    2.0
 119
 120         ldq     r20, 8(r17)             C src[i+1]
 121         ldq     r21, 16(r17)            C src[i+2]
 122         unop
 123         unop
 124
 125         fbeq    f0, L(done_2)           C count == 0: exactly 3 limbs remain (r24, r20, r21)
 126         unop
 127         ldq     r22, 24(r17)            C src[i+3]
 128         ldq     r23, 32(r17)            C src[i+4]
 129
 130         stq     r24, 0(r19)             C dst[i]
 131         ornot   r31, r20, r20           C ~src[i+1]
 132         subt    f0, f1, f0              C count -= 2
 133         unop
 134
 135         stq     r20, 8(r19)             C dst[i+1]
 136         ornot   r31, r21, r21           C ~src[i+2]
 137         unop
 138         unop
 139
 140         stq     r21, 16(r19)            C dst[i+2]
 141         ornot   r31, r22, r22           C ~src[i+3]
 142
 143         stq     r22, 24(r19)            C dst[i+3]
 144         ornot   r31, r23, r24           C ~src[i+4], carried in r24 as the next pending store
 145
 146         lda     r17, 32(r17)            C src += 4
 147         lda     r19, 32(r19)            C dst += 4
 148         unop
 149         fbge    f0, L(top)              C loop while count >= 0, else fall into done_1
 150
 151
 152 L(done_1):
 153         C r19   &dst[size-1]
 154         C r24   result for dst[size-1]
 155
 156         stq     r24, 0(r19)             C dst[size-1]
 157         ret     r31, (r26), 1
 158
 159
 160 L(done_2):
 161         C r19   &dst[size-3]
 162         C r20   src[size-2]
 163         C r21   src[size-1]
 164         C r24   result for dst[size-3]
 165
 166         stq     r24, 0(r19)             C dst[size-3]
 167         ornot   r31, r20, r20           C ~src[size-2]
 168
 169         stq     r20, 8(r19)             C dst[size-2]
 170         ornot   r31, r21, r21           C ~src[size-1]
 171
 172         stq     r21, 16(r19)            C dst[size-1]
 173         ret     r31, (r26), 1
 174
 175 EPILOGUE()
 176 ASM_END()