arch/sh/lib64/memcpy.S

   1 /* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
   2 /* Modified by SuperH, Inc. September 2003 */
   3 !
   4 ! Fast SH memcpy
   5 !
   6 ! by Toshiyasu Morita (tm@netcom.com)
   7 ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
   8 ! SH5 code Copyright 2002 SuperH Ltd.
   9 !
  10 ! Entry: ARG0: destination pointer
  11 !        ARG1: source pointer
  12 !        ARG2: byte count
  13 !
  14 ! Exit:  RESULT: destination pointer
  15 !        any other registers in the range r0-r7: trashed
  16 !
  17 ! Notes: Usually one wants to do small reads and write a longword, but
  18 !        unfortunately it is difficult in some cases to concatenate bytes
  19 !        into a longword on the SH, so this does a longword read and small
  20 !        writes.
  21 !
  22 ! This implementation makes two assumptions about how it is called:
  23 !
  24 ! 1.: If the byte count is nonzero, the address of the last byte to be
  25 !     copied is unsigned greater than the address of the first byte to
  26 !     be copied.  This could be easily swapped for a signed comparison,
  27 !     but the algorithm used needs some comparison.
  28 !
  29 ! 2.: When there are two or three bytes in the last word of an 11-or-more
  30 !     bytes memory chunk to be copied, the rest of the word can be read
  31 !     without side effects.
  32 !     This could be easily changed by increasing the minimum size of
  33 !     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
  34 !     however, this would cost a few extra cycles on average.
  35 !     For SHmedia, the assumption is that any quadword can be read in its
  36 !     entirety if at least one byte is included in the copy.
  37 !
  38
  39         .section .text..SHmedia32,"ax"
  40         .globl  memcpy
  41         .type   memcpy, @function
  42         .align  5
  43
     /*
      * memcpy -- SHmedia byte copy.
      *
      * Register use, as read from the code below:
      *   r2   destination pointer; no instruction writes it, so it still
      *        holds the destination when the routine exits
      *   r3   source pointer
      *   r4   byte count
      *   r18  copied into tr1 (ptabs) and branched to on every exit path,
      *        i.e. treated as the return address
      *
      * Counts below 25 are dispatched by a computed branch into a table of
      * 32-byte code slots that starts at L1; counts of 25 or more are
      * handled at Large.
      *
      * NOTE(review): besides r0-r7 this code writes r8 and r9 (16..24 byte
      * case) and r19-r25, r27 and r36 (Large path), which is more than the
      * "r0-r7: trashed" statement in the file header says -- confirm that
      * these are caller-saved under the ABI in use.
      */
  44 memcpy:
  45
     /*
      * Helper macros for unaligned accesses: each one pairs a "lo" and a
      * "hi" access for one longword (.l, hi offset O+3) or one quadword
      * (.q, hi offset O+7).  The users of the load macros OR the two result
      * registers together to obtain the complete value.  STUAQ and STUAL
      * are defined but not used in the code visible below.
      */
  46 #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  47 #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  48 #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  49 #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  50
  51         ld.b r3,0,r63                  /* read src[0] into r63, i.e. the value is not kept; presumably a prefetch of the source */
  52         pta/l Large,tr0                /* tr0 = Large */
  53         movi 25,r0
  54         bgeu/u r4,r0,tr0               /* count >= 25 (unsigned compare): take the Large path */
  55         nsb r4,r0                      /* r0 = sign-bit count of the byte count: 63 for 0, 62 for 1, 61 for 2..3, ..., 58 for 16..24 */
  56         shlli r0,5,r0                  /* scale by the 32-byte slot size */
  57         movi (L1-L0+63*32 + 1) & 0xffff,r1     /* distance L0 -> L1 plus 63 slots; the +1 presumably marks the target as SHmedia code -- confirm */
  58         sub r1, r0, r0                 /* r0 = (L1 - L0) + 32 * (63 - nsb(count)) + 1 */
  59 L0:     ptrel r0,tr0                   /* tr0 = L0 + r0 = table slot for this count */
  60         add r2,r4,r5                   /* r5 = dst + count (one past the last destination byte) */
  61         ptabs r18,tr1                  /* tr1 = exit address taken from r18 */
  62         add r3,r4,r6                   /* r6 = src + count (one past the last source byte) */
  63         blink tr0,r63                  /* jump into the table; the link value goes to r63 and is not kept */
  64
  65 /* Rearranged to make cut2 safe */
  66         .balign 8
  67 L4_7:   /* 4..7 byte memcpy cntd. */
  68         stlo.l r2, 0, r0               /* lo part of the first 4 bytes at dst (hi part already stored by the 4..7 slot) */
  69         or r6, r7, r6                  /* r6 = last 4 source bytes (hi part | lo part) */
  70         sthi.l r5, -1, r6              /* store them at dst+count-4 .. dst+count-1: hi part */
  71         stlo.l r5, -4, r6              /* ... and lo part; may overlap the first 4 bytes */
  72         blink tr1,r63                  /* exit through tr1 */
  73
     /*
      * Dispatch table.  Slot k = 63 - nsb(count) starts at L1 + 32*k:
      *   slot 0: 0 bytes     slot 1: 1 byte      slot 2: 2..3 bytes
      *   slot 3: 4..7 bytes  slot 4: 8..15 bytes slot 5: 16..24 bytes
      * Every slot except the last is exactly eight 4-byte instructions; the
      * room left over in the short slots holds the continuation code L2_3
      * and L8_15.  Adding or removing an instruction in slots 0..4 shifts
      * the later slots and breaks the computed branch at L0.
      */
  74         .balign 8
  75 L1:     /* 0 byte memcpy */
  76         nop
  77         blink tr1,r63                  /* nothing to copy: exit */
  78         nop                            /* padding up to L2_3 */
  79         nop
  80         nop
  81         nop
  82
  83 L2_3:   /* 2 or 3 byte memcpy cntd. */
  84         st.b r5,-1,r6                  /* dst[count-1] = last source byte (loaded into r6 by the 2..3 slot) */
  85         blink tr1,r63                  /* exit through tr1 */
  86
  87         /* 1 byte memcpy */
  88         ld.b r3,0,r0                   /* r0 = src[0] */
  89         st.b r2,0,r0                   /* dst[0] = r0 */
  90         blink tr1,r63                  /* exit through tr1 */
  91
  92 L8_15:  /* 8..15 byte memcpy cntd. */
  93         stlo.q r2, 0, r0               /* lo part of the first 8 bytes at dst (hi part already stored by the 8..15 slot) */
  94         or r6, r7, r6                  /* r6 = last 8 source bytes (hi part | lo part) */
  95         sthi.q r5, -1, r6              /* store them at dst+count-8 .. dst+count-1: hi part */
  96         stlo.q r5, -8, r6              /* ... and lo part; may overlap the first 8 bytes */
  97         blink tr1,r63                  /* exit through tr1 */
  98
  99         /* 2 or 3 byte memcpy */
 100         ld.b r3,0,r0                   /* r0 = src[0] */
 101         ld.b r2,0,r63                  /* read dst[0], value not kept; presumably touches the destination early */
 102         ld.b r3,1,r1                   /* r1 = src[1] */
 103         st.b r2,0,r0                   /* dst[0] = src[0] */
 104         pta/l L2_3,tr0                 /* tr0 = L2_3 (continuation) */
 105         ld.b r6,-1,r6                  /* r6 = src[count-1]; for count == 2 this is src[1] again */
 106         st.b r2,1,r1                   /* dst[1] = src[1] */
 107         blink tr0, r63                 /* continue at L2_3 */
 108
 109         /* 4 .. 7 byte memcpy */
 110         LDUAL (r3, 0, r0, r1)          /* r0 / r1 = lo / hi part of the first source longword */
 111         pta L4_7, tr0                  /* tr0 = L4_7 (continuation) */
 112         ldlo.l r6, -4, r7              /* r7 = lo part of the last source longword (src+count-4) */
 113         or r0, r1, r0                  /* r0 = first 4 source bytes */
 114         sthi.l r2, 3, r0               /* hi part of the first 4 bytes at dst */
 115         ldhi.l r6, -1, r6              /* r6 = hi part of the last source longword */
 116         blink tr0, r63                 /* continue at L4_7 */
 117
 118         /* 8 .. 15 byte memcpy */
 119         LDUAQ (r3, 0, r0, r1)          /* r0 / r1 = lo / hi part of the first source quadword */
 120         pta L8_15, tr0                 /* tr0 = L8_15 (continuation) */
 121         ldlo.q r6, -8, r7              /* r7 = lo part of the last source quadword (src+count-8) */
 122         or r0, r1, r0                  /* r0 = first 8 source bytes */
 123         sthi.q r2, 7, r0               /* hi part of the first 8 bytes at dst */
 124         ldhi.q r6, -1, r6              /* r6 = hi part of the last source quadword */
 125         blink tr0, r63                 /* continue at L8_15 */
 126
 127         /* 16 .. 24 byte memcpy */
 128         LDUAQ (r3, 0, r0, r1)          /* r0 / r1 = lo / hi part of source bytes 0..7 */
 129         LDUAQ (r3, 8, r8, r9)          /* r8 / r9 = lo / hi part of source bytes 8..15 */
 130         or r0, r1, r0                  /* r0 = source bytes 0..7 */
 131         sthi.q r2, 7, r0               /* hi part of bytes 0..7 at dst */
 132         or r8, r9, r8                  /* r8 = source bytes 8..15 */
 133         sthi.q r2, 15, r8              /* hi part of bytes 8..15 at dst+8 */
 134         ldlo.q r6, -8, r7              /* r7 = lo part of the last source quadword (src+count-8) */
 135         ldhi.q r6, -1, r6              /* r6 = hi part of the last source quadword */
 136         stlo.q r2, 8, r8               /* lo part of bytes 8..15 at dst+8 */
 137         stlo.q r2, 0, r0               /* lo part of bytes 0..7 at dst */
 138         or r6, r7, r6                  /* r6 = last 8 source bytes */
 139         sthi.q r5, -1, r6              /* store them at dst+count-8 .. dst+count-1: hi part */
 140         stlo.q r5, -8, r6              /* ... and lo part; may overlap bytes 8..15 */
 141         blink tr1,r63                  /* exit through tr1 */
 142
     /*
      * Large: count >= 25.  The source is read in 8-byte-aligned quadwords
      * with ldx.q while the destination is written with stlo.q/sthi.q pairs.
      *   r22  destination cursor
      *   r6   src - dst, so r22 + r6 is the source address matching r22
      *   r0   source quadword loaded for r22 but not yet stored
      *   r5   dst + count - 16
      */
 143 Large:
 144         ld.b r2, 0, r63                /* read dst[0], value not kept; presumably touches the destination early */
 145         pta/l  Loop_ua, tr1            /* tr1 = Loop_ua (tr1 is reloaded from r18 before the exit) */
 146         ori r3, -8, r7                 /* r7 = (src & 7) - 8, in the range -8..-1 */
 147         sub r2, r7, r22                /* r22 = dst + 8 - (src & 7) */
 148         sub r3, r2, r6                 /* r6 = src - dst; r22 + r6 is a multiple of 8 */
 149         add r2, r4, r5                 /* r5 = dst + count */
 150         ldlo.q r3, 0, r0               /* r0 = lo part of the quadword at src (no matching ldhi.q follows) */
 151         addi r5, -16, r5               /* r5 = dst + count - 16 */
 152         movi 64+8, r27 // could subtract r7 from that.
 153         stlo.q r2, 0, r0               /* store r0 as an unaligned quadword at dst: lo part */
 154         sthi.q r2, 7, r0               /* ... hi part; bytes from r22 onwards are written again by the stores below */
 155         ldx.q r22, r6, r0              /* r0 = source quadword matching destination address r22 */
 156         bgtu/l r27, r4, tr1            /* 72 > count: skip the 32-byte loop, go to Loop_ua */
 157
 158         addi r5, -48, r27              /* r27 = dst + count - 64: limit for Loop_line */
 159         pta/l Loop_line, tr0           /* tr0 = Loop_line */
 160         addi r6, 64, r36               /* r36 = source offset 64 bytes ahead, used by the look-ahead load */
 161         addi r6, -24, r19              /* r19, r20, r21 = source offsets for the quadwords at r22-24, r22-16, r22-8 */
 162         addi r6, -16, r20
 163         addi r6, -8, r21
 164
     /* Loop_line: copies 32 bytes per iteration while r22 <= r27. */
 165 Loop_line:
 166         ldx.q r22, r36, r63            /* read the source 64 bytes ahead, value not kept; presumably a prefetch */
 167         alloco r22, 32                 /* cache operation on the destination line at r22+32; presumably allocates it without reading memory -- confirm */
 168         addi r22, 32, r22              /* advance the destination cursor by 32 */
 169         ldx.q r22, r19, r23            /* r23 = source quadword for r22-24 */
 170         sthi.q r22, -25, r0            /* store pending r0 at r22-32: hi part */
 171         ldx.q r22, r20, r24            /* r24 = source quadword for r22-16 */
 172         ldx.q r22, r21, r25            /* r25 = source quadword for r22-8 */
 173         stlo.q r22, -32, r0            /* store pending r0 at r22-32: lo part */
 174         ldx.q r22, r6,  r0             /* r0 = source quadword for r22 (pending for the next pass) */
 175         sthi.q r22, -17, r23           /* hi parts of the three quadwords at r22-24, r22-16, r22-8 */
 176         sthi.q r22,  -9, r24
 177         sthi.q r22,  -1, r25
 178         stlo.q r22, -24, r23           /* lo parts of the same three quadwords */
 179         stlo.q r22, -16, r24
 180         stlo.q r22,  -8, r25
 181         bgeu r27, r22, tr0             /* r27 >= r22 (unsigned): another 32-byte pass */
 182
     /* Loop_ua: copies 8 bytes per iteration while r22 < r5. */
 183 Loop_ua:
 184         addi r22, 8, r22               /* advance the destination cursor by 8 */
 185         sthi.q r22, -1, r0             /* store pending r0 at r22-8: hi part */
 186         stlo.q r22, -8, r0             /* ... and lo part */
 187         ldx.q r22, r6, r0              /* r0 = source quadword for r22 */
 188         bgtu/l r5, r22, tr1            /* r5 > r22 (unsigned): next quadword */
 189
     /* Tail: store the pending quadword, then the last 8 bytes of the copy. */
 190         add r3, r4, r7                 /* r7 = src + count */
 191         ldlo.q r7, -8, r1              /* r1 = lo part of the last source quadword (src+count-8) */
 192         sthi.q r22, 7, r0              /* store pending r0 at r22: hi part */
 193         ldhi.q r7, -1, r7              /* r7 = hi part of the last source quadword */
 194         ptabs r18,tr1                  /* tr1 = exit address from r18 (tr1 held Loop_ua until now) */
 195         stlo.q r22, 0, r0              /* store pending r0 at r22: lo part */
 196         or r1, r7, r1                  /* r1 = last 8 source bytes */
 197         sthi.q r5, 15, r1              /* r5 = dst+count-16, so this writes dst+count-8 .. dst+count-1: hi part */
 198         stlo.q r5, 8, r1               /* ... and lo part */
 199         blink tr1, r63                 /* exit through tr1 */
 200
 201         .size memcpy,.-memcpy