sysdeps/ia64/memcpy.S

   1 /* Optimized version of the standard memcpy() function.
   2    This file is part of the GNU C Library.
   3    Copyright (C) 2000, 2001 Free Software Foundation, Inc.
   4    Contributed by Dan Pop <Dan.Pop@cern.ch>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 /* Return: dest
  22
  23    Inputs:
  24         in0:    dest
  25         in1:    src
  26         in2:    byte count
  27
  28    An assembly implementation of the algorithm used by the generic C
  29    version from glibc.  The case when all three arguments are multiples
  30    of 8 is treated separately, for extra performance.
  31
  32    In this form, it assumes little endian mode.  For big endian mode,
  33    sh1 must be computed using an extra instruction: sub sh1 = 64, sh1
  34    and the order of r[MEMLAT] and r[MEMLAT+1] must be reversed in the
  35    shrp instruction.  */
  36
  37 #include <sysdep.h>
  38 #undef ret
  39
  40 #define OP_T_THRES      16      /* when dest/src/len are not all multiples of 8, copies of at most this many bytes go straight to .cpyfew */
  41 #define OPSIZ           8       /* word size; in the code visible here it is referenced only from comments */
  42
  43 #define adest           r15     /* second store pointer (dest + 8) in the aligned loop .l0 */
  44 #define saved_pr        r17     /* copy of pr taken on entry, restored before returning */
  45 #define saved_lc        r18     /* copy of ar.lc taken on entry, restored before returning */
  46 #define dest            r19     /* working copy of in0, advanced as bytes are stored */
  47 #define src             r20     /* working copy of in1, advanced as bytes are loaded */
  48 #define len             r21     /* working copy of in2; reduced to the byte tail before .cpyfew */
  49 #define asrc            r22     /* src + 8 in .l0; src rounded down to 8 for the LOOP(shift) loops */
  50 #define tmp2            r23     /* scratch: -dest, later len & -8 */
  51 #define tmp3            r24     /* scratch */
  52 #define tmp4            r25     /* scratch */
  53 #define ptable          r26     /* address of .table */
  54 #define ploop56         r27     /* address of .loop56 */
  55 #define loopaddr        r28     /* address of the selected .loopN, moved to b6 */
  56 #define sh1             r29     /* src % 8, then 8 * (src % 8) used as the .table byte offset */
  57 #define loopcnt         r30     /* iteration count, decremented before being written to ar.lc */
  58 #define value           r31     /* data in flight between a load (or shrp) and its store */
  59
  60 #define LOOP(shift)     /* pipelined word-copy loop for a src that is not 8-byte aligned */ \
  61                 .align  32 ;                                            \
  62 .loop##shift##:         /* reached through b6; selected when 8 * (src % 8) == shift */ \
  63 (p[0])          ld8     r[0] = [asrc], 8 ;      /* w1: next aligned source word */ \
  64 (p[MEMLAT+1])   st8     [dest] = value, 8 ;     /* store the word built by shrp in the previous stage */ \
  65 (p[MEMLAT])     shrp    value = r[MEMLAT], r[MEMLAT+1], shift ; /* combine two consecutively loaded words */ \
  66                 nop.b   0 ;                                             \
  67                 nop.b   0 ;                                             \
  68                 br.ctop.sptk .loop##shift ;                             \
  69                 br.cond.sptk .cpyfew ; /* deal with the remaining bytes */
  70
  71 #define MEMLAT  21      /* number of pipeline stages between a load issued under p[0] and its consumer under p[MEMLAT] */
  72 #define Nrot    (((2*MEMLAT+3) + 7) & ~7)       /* r[MEMLAT+2] plus q[MEMLAT+1] registers, rounded up to a multiple of 8 */
  73
  74 ENTRY(memcpy)
     // memcpy: in0 = dest, in1 = src, in2 = byte count; dest is returned in ret0.
     //
     // Three strategies are used:
     //   - dest, src and len all multiples of 8: pipelined loop .l0 moving two
     //     8-byte words per iteration;
     //   - otherwise, len <= OP_T_THRES: plain byte loop (.cpyfew);
     //   - otherwise: copy single bytes until dest is 8-byte aligned (.l1), then
     //     copy whole words with .l3 (src aligned too) or with one of the
     //     LOOP(shift) loops (src misaligned), and let .cpyfew copy the final
     //     len % 8 bytes.
     //
     // pr and ar.lc are saved on entry and restored on both return paths.
  75         .prologue
  76         alloc   r2 = ar.pfs, 3, Nrot - 3, 0, Nrot       // 3 inputs, Nrot - 3 locals, 0 outputs, Nrot rotating
  77         .rotr   r[MEMLAT + 2], q[MEMLAT + 1]            // names for the rotating register arrays
  78         .rotp   p[MEMLAT + 2]                           // names for the rotating stage predicates
  79         mov     ret0 = in0              // return value = dest
  80         .save pr, saved_pr
  81         mov     saved_pr = pr           // save the predicate registers
  82         .save ar.lc, saved_lc
  83         mov     saved_lc = ar.lc        // save the loop counter
  84         .body
  85         or      tmp3 = in0, in1 ;;      // tmp3 = dest | src
  86         or      tmp3 = tmp3, in2        // tmp3 = dest | src | len
  87         mov     dest = in0              // dest
  88         mov     src = in1               // src
  89         mov     len = in2               // len
  90         sub     tmp2 = r0, in0          // tmp2 = -dest
  91         cmp.eq  p6, p0 = in2, r0        // if (len == 0)
  92 (p6)    br.cond.spnt .restore_and_exit;;//      return dest;
  93         and     tmp4 = 7, tmp3          // tmp4 = (dest | src | len) & 7
  94         shr.u   loopcnt = len, 4 ;;     // loopcnt = len / 16
  95         cmp.ne  p6, p0 = tmp4, r0       // if (((dest | src | len) & 7) != 0)
  96 (p6)    br.cond.sptk .next              //      goto next;
  97
  98 // The optimal case, when dest, src and len are all multiples of 8
  99
 100         and     tmp3 = 0xf, len         // tmp3 = len % 16
 101         mov     pr.rot = 1 << 16        // set rotating predicates
 102         mov     ar.ec = MEMLAT + 1 ;;   // set the epilog counter
 103         cmp.ne  p6, p0 = tmp3, r0       // do we have to copy an extra word?
 104         adds    loopcnt = -1, loopcnt;; // --loopcnt
 105 (p6)    ld8     value = [src], 8;;      // len is an odd number of words: move one word up front
 106 (p6)    st8     [dest] = value, 8       // copy the "extra" word
 107         mov     ar.lc = loopcnt         // set the loop counter
 108         cmp.eq  p6, p0 = 8, len         // if (len == 8)
 109 (p6)    br.cond.spnt .restore_and_exit;;// there was only one word to copy
 110         adds    adest = 8, dest         // adest = dest + 8: second store stream
 111         adds    asrc = 8, src ;;        // asrc = src + 8: second load stream
 112         .align  32
     // 16 bytes per iteration: one word through src/dest, the next through
     // asrc/adest; every pointer advances by 16.  Each store runs MEMLAT
     // stages after the load that produced its data.
 113 .l0:
 114 (p[0])          ld8     r[0] = [src], 16
 115 (p[0])          ld8     q[0] = [asrc], 16
 116 (p[MEMLAT])     st8     [dest] = r[MEMLAT], 16
 117 (p[MEMLAT])     st8     [adest] = q[MEMLAT], 16
 118                 br.ctop.dptk .l0 ;;
 119
     // Same epilogue as .restore_and_exit, placed inline for the aligned path.
 120         mov     pr = saved_pr, -1       // restore the predicate registers
 121         mov     ar.lc = saved_lc        // restore the loop counter
 122         br.ret.sptk.many b0
     // General case: at least one of dest, src, len is not a multiple of 8.
 123 .next:
 124         cmp.ge  p6, p0 = OP_T_THRES, len        // is len <= OP_T_THRES ?
 125         and     loopcnt = 7, tmp2               // loopcnt = -dest % 8
 126 (p6)    br.cond.spnt    .cpyfew                 // copy byte by byte
 127         ;;
 128         cmp.eq  p6, p0 = loopcnt, r0    // is dest already 8-byte aligned?
 129 (p6)    br.cond.sptk    .dest_aligned   //      then skip the byte prologue
 130         sub     len = len, loopcnt      // len -= -dest % 8
 131         adds    loopcnt = -1, loopcnt   // --loopcnt
 132         ;;
 133         mov     ar.lc = loopcnt         // ar.lc = (-dest % 8) - 1
 134 .l1:                                    // copy -dest % 8 bytes
 135         ld1     value = [src], 1        // value = *src++
 136         ;;
 137         st1     [dest] = value, 1       // *dest++ = value
 138         br.cloop.dptk .l1 ;;
     // dest is now a multiple of 8.  Split len into whole words (loopcnt) and a
     // byte tail (len % 8), then choose the word loop from src's alignment.
 139 .dest_aligned:
 140         and     sh1 = 7, src            // sh1 = src % 8
 141         and     tmp2 = -8, len          // tmp2 = len & -OPSIZ
 142         and     asrc = -8, src          // asrc = src & -OPSIZ  -- align src
 143         shr.u   loopcnt = len, 3        // loopcnt = len / 8
 144         and     len = 7, len;;          // len = len % 8
 145         adds    loopcnt = -1, loopcnt   // --loopcnt
 146         addl    tmp4 = @ltoff(.table), gp       // tmp4 = address of the slot holding &table
 147         addl    tmp3 = @ltoff(.loop56), gp      // tmp3 = address of the slot holding &loop56
 148         mov     ar.ec = MEMLAT + 1      // set EC
 149         mov     pr.rot = 1 << 16;;      // set rotating predicates
 150         mov     ar.lc = loopcnt         // set LC
 151         cmp.eq  p6, p0 = sh1, r0        // is the src aligned?
 152 (p6)    br.cond.sptk .src_aligned
     // src is misaligned: the LOOP(shift) loops read through asrc, so src is
     // moved past the whole words now and is only used again by .cpyfew.
 153         add     src = src, tmp2         // src += len & -OPSIZ
 154         shl     sh1 = sh1, 3            // sh1 = 8 * (src % 8)
 155         ld8     ploop56 = [tmp3]        // ploop56 = &loop56
 156         ld8     ptable = [tmp4];;       // ptable = &table
 157         add     tmp3 = ptable, sh1;;    // tmp3 = &table + sh1
 158         mov     ar.ec = MEMLAT + 1 + 1 // one more pass needed
 159         ld8     tmp4 = [tmp3];;         // tmp4 = loop offset
 160         sub     loopaddr = ploop56,tmp4 // loopaddr = &loop56 - loop offset
 161         ld8     r[1] = [asrc], 8;;      // w0: first aligned source word, loaded ahead of the loop
 162         mov     b6 = loopaddr;;
 163         br      b6                      // jump to the appropriate loop
 164
     // One loop per possible shift; each ends with a branch to .cpyfew.
 165         LOOP(8)
 166         LOOP(16)
 167         LOOP(24)
 168         LOOP(32)
 169         LOOP(40)
 170         LOOP(48)
 171         LOOP(56)
 172
     // src and dest both 8-byte aligned: one word per iteration, then fall
     // through to .cpyfew for the byte tail.
 173 .src_aligned:
 174 .l3:
 175 (p[0])          ld8     r[0] = [src], 8
 176 (p[MEMLAT])     st8     [dest] = r[MEMLAT], 8
 177                 br.ctop.dptk .l3 ;;
     // Copy the remaining len bytes one at a time (nothing to do if len == 0).
 178 .cpyfew:
 179         cmp.eq  p6, p0 = len, r0        // is len == 0 ?
 180         adds    len = -1, len           // --len;
 181 (p6)    br.cond.spnt    .restore_and_exit ;;
 182         mov     ar.lc = len             // ar.lc = remaining bytes - 1
 183 .l4:
 184         ld1     value = [src], 1        // value = *src++
 185         ;;
 186         st1     [dest] = value, 1       // *dest++ = value
 187         br.cloop.dptk   .l4 ;;
 188 .restore_and_exit:
 189         mov     pr = saved_pr, -1       // restore the predicate registers
 190         mov     ar.lc = saved_lc        // restore the loop counter
 191         br.ret.sptk.many b0
 192         .align 8
     // Dispatch table read at byte offset 8 * (src % 8).  Each entry is the
     // distance from a .loopN label to .loop56 and is subtracted from &loop56
     // to obtain the loop address.  Entry 0 is never read because an aligned
     // src branches to .src_aligned instead.
 193 .table:
 194         data8   0                       // dummy entry
 195         data8   .loop56 - .loop8
 196         data8   .loop56 - .loop16
 197         data8   .loop56 - .loop24
 198         data8   .loop56 - .loop32
 199         data8   .loop56 - .loop40
 200         data8   .loop56 - .loop48
 201         data8   .loop56 - .loop56
 202
 203 END(memcpy)