 * Copyright (C) 2013, 2014-2015, 2017, 2022 Synopsys, Inc. (www.synopsys.com)
 * Copyright (C) 2007 ARC International (UK) LTD
 *
 * Licensed under the LGPL v2.1 or later, see the file COPYING.LIB in this tarball.
#if defined(__ARC700__)
/* This memcpy implementation does not support objects of 1GB or larger -
   the check for alignment does not work then.  */
/* We assume that most sources and destinations are aligned, and
   that lengths are also mostly a multiple of four, although to a lesser
   extent.  */
	brls.d	r2,r3,.Lcopy_bytewise
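	;; A minimal reading, assuming the elided setup above put
	;; (dst | src) << 30 into r3: if either pointer is unaligned, r3 is
	;; at least 1GB and the bytewise path is taken for any len below
	;; that; if both are word-aligned, r3 is 0 and only len == 0
	;; branches.  This is where the 1GB limit noted above comes from.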
#ifdef __LITTLE_ENDIAN__
#else /* BIG ENDIAN */
#elif defined(__ARCHS__)
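;; The SHIFT_*/MERGE_*/EXTRACT_* macros below hide the byte-lane
;; arithmetic used by the unaligned-copy loops: MERGE_* positions the
;; priming bytes read before the main loop, SHIFT_* splices each aligned
;; word load with the bytes carried over in r5, and EXTRACT_* recovers
;; the tail bytes afterwards.  The shift directions swap with endianness
;; so the same loop bodies work in both configurations.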
#ifdef __LITTLE_ENDIAN__
# define SHIFT_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define SHIFT_2(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM
# define MERGE_2(RX,RY,IMM)
# define EXTRACT_1(RX,RY,IMM)	and	RX, RY, 0xFFFF
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, IMM
#else
# define SHIFT_1(RX,RY,IMM)	lsr	RX, RY, IMM	; >>
# define SHIFT_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_1(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define MERGE_2(RX,RY,IMM)	asl	RX, RY, IMM	; <<
# define EXTRACT_1(RX,RY,IMM)	lsr	RX, RY, IMM
# define EXTRACT_2(RX,RY,IMM)	lsr	RX, RY, 0x08
#endif
#if defined(__LL64__) || defined(__ARC_LL64__)
# define PREFETCH_READ(RX)	prefetch	[RX, 56]
# define PREFETCH_WRITE(RX)	prefetchw	[RX, 64]
# define LOADX(DST,RX)		ldd.ab	DST, [RX, 8]
# define STOREX(SRC,RX)		std.ab	SRC, [RX, 8]
# define ZOLSHFT		5
# define ZOLAND			0x1F
#else
# define PREFETCH_READ(RX)	prefetch	[RX, 28]
# define PREFETCH_WRITE(RX)	prefetchw	[RX, 32]
# define LOADX(DST,RX)		ld.ab	DST, [RX, 4]
# define STOREX(SRC,RX)		st.ab	SRC, [RX, 4]
# define ZOLSHFT		4
# define ZOLAND			0xF
#endif
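;; NOTE: the ZOLSHFT/ZOLAND values above are restored by inference: the
;; main loop issues four LOADX/STOREX pairs per iteration, i.e. 32 bytes
;; with 64-bit ldd/std or 16 bytes with 32-bit ld/st, so len >> ZOLSHFT
;; counts whole chunks and len & ZOLAND is the tail.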
	prefetch [r1]		; Prefetch the read location
	prefetchw [r0]		; Prefetch the write location
	mov	r3, r0		; don't clobber ret val
	lpnz	@.Laligndestination
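	;; lpnz opens an ARC zero-overhead loop: the body up to the
	;; .Laligndestination label repeats LP_COUNT times (set in the elided
	;; lines above, presumably to the number of bytes needed to word-align
	;; the destination) without a branch instruction, and is skipped
	;; entirely if the preceding flag-setting instruction produced zero.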
;;; Check the alignment of the source
	and.f	r4, r1, 0x03
	bnz.d	@.Lsourceunaligned
;;; CASE 0: Both source and destination are 32-bit aligned
;;; Convert len to Dwords, unroll x4
	lsr.f	lp_count, r2, ZOLSHFT
	lpnz	@.Lcopy32_64bytes
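	;; (Loop body elided: each iteration is assumed to issue four
	;; LOADX/STOREX pairs plus one PREFETCH_READ/PREFETCH_WRITE, moving
	;; one ZOL chunk of 32 or 16 bytes.)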
	and.f	lp_count, r2, ZOLAND	; the remaining (len & ZOLAND) tail bytes
	lpnz	@.Lcopyremainingbytes
.Lcopyremainingbytes:
	beq.d	@.LunalignedOffby2
	bhi.d	@.LunalignedOffby3
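	;; Dispatch on the source misalignment in r4; a cmp r4, 2 is assumed
	;; in the elided lines: equal goes to .LunalignedOffby2, higher to
	;; .LunalignedOffby3, and fall-through handles off-by-1.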
;;; CASE 1: The source is unaligned, off by 1
	;; Read 1 byte to reach 16-bit alignment,
	;; then 2 bytes to reach 32-bit alignment
	;; Convert to words, unroll x2
	lsr.f	lp_count, r2, 3
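	;; len >> 3 iterations: each pass of the word loop below copies two
	;; 32-bit words, i.e. 8 bytes.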
	;; Both src and dst are aligned
	prefetch [r1, 28]	;Prefetch the next read location
	prefetchw [r3, 32]	;Prefetch the next write location
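	;; (Elided body: SHIFT_1/SHIFT_2 are assumed to combine each newly
	;; loaded word with the bytes carried over in r5, so every st.ab
	;; writes a fully formed aligned word.)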
	;; Write back the remaining 16 bits
	EXTRACT_1 (r6, r5, 16)
	;; Write back the remaining 8 bits
	EXTRACT_2 (r5, r5, 16)
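	;; The off-by-1 prologue consumed 3 source bytes (1 + 2) before the
	;; loop, so 3 bytes of the final word still sit in r5; they are
	;; flushed as a halfword (EXTRACT_1) plus a byte (EXTRACT_2).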
	and.f	lp_count, r2, 0x07	; the last (len & 7) bytes
	lpnz	@.Lcopybytewise_1
;;; CASE 2: The source is unaligned, off by 2
	;; Both src and dst are aligned
	;; Convert to words, unroll x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
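	;; (Elided: on big-endian the priming halfword is presumably
	;; pre-shifted into the upper byte lanes so the merges in the loop
	;; line up.)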
	prefetch [r1, 28]	;Prefetch the next read location
	prefetchw [r3, 32]	;Prefetch the next write location
#ifdef __BIG_ENDIAN__
	and.f	lp_count, r2, 0x07	; the last (len & 7) bytes
	lpnz	@.Lcopybytewise_2
;;; CASE 3: The source is unaligned, off by 3
;;; Hence, read 1 byte to achieve 32-bit alignment
	;; Both src and dst are aligned
	;; Convert to words, unroll x2
	lsr.f	lp_count, r2, 3
#ifdef __BIG_ENDIAN__
	prefetch [r1, 28]	;Prefetch the next read location
	prefetchw [r3, 32]	;Prefetch the next write location
#ifdef __BIG_ENDIAN__
	and.f	lp_count, r2, 0x07	; the last (len & 7) bytes
	lpnz	@.Lcopybytewise_3
#elif defined(__ARC64_ARCH32__)
;; Based on Synopsys code from newlib's arc64/memcpy.S
	lsr.f	r11, r2, 4	; counter for 16-byte chunks
	beq.d	@.L_write_15_bytes
	mov	r3, r0		; work on a copy of "r0"
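	;; lsr.f sets the Z flag, so beq.d branches to the tail when
	;; len < 16; the mov in its delay slot executes either way, leaving
	;; r0 untouched as memcpy's return value.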
#if defined(__ARC64_LL64__)
	dbnz	r11, @.L_write_16_bytes
	dbnz.d	r11, @.L_write_16_bytes
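	;; dbnz decrements r11 and branches while the result is non-zero; the
	;; .d form exposes a delay slot, which the non-LL64 path presumably
	;; fills with the final st.ab of the unrolled group so the branch
	;; costs no extra cycle.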
#error "Unsupported ARC CPU type"
libc_hidden_def(memcpy)