sysdeps/powerpc/powerpc32/a2/memcpy.S

   1 /* Optimized memcpy implementation for PowerPC A2.
   2    Copyright (C) 2010-2014 Free Software Foundation, Inc.
   3    Contributed by Michael Brutman <brutman@us.ibm.com>.
   4    This file is part of the GNU C Library.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <sysdep.h>
  21
  22 #define PREFETCH_AHEAD 4        /* Number of SRC cache lines to prefetch ahead.  */
  23 #define ZERO_AHEAD 2            /* Number of DST cache lines to zero (dcbz) ahead.  */
  24
  25         .machine  a2
  26 EALIGN (memcpy, 5, 0)
  27         CALL_MCOUNT
  28         /* In: r3 = dest, r4 = src, r5 = length.  r3 is returned untouched.  */
  29         dcbt    0,r4            /* Prefetch ONE SRC cacheline  */
  30         cmplwi  cr1,r5,16       /* is size < 16 ?  */
  31         mr      r6,r3           /* r6 = working dest pointer  */
  32         blt+    cr1,L(shortcopy)
  33
  34
  35         /* Big copy (16 bytes or more)
  36
  37            Figure out how far to the nearest quadword boundary, or if we are
  38            on one already.
  39
  40            r3 - return value (always)
  41            r4 - current source addr
  42            r5 - copy length
  43            r6 - current dest addr
  44         */
  45
  46         neg     r8,r3           /* LS 4 bits = # bytes to 16-byte dest bdry  */
  47         clrlwi  r8,r8,32-4      /* keep just those 4 bits (0..15)  */
  48         sub     r7,r4,r3        /* compute offset to src from dest */
  49         cmplwi  cr0,r8,0        /* Were we aligned on a 16 byte bdy? */
  50         beq+    L(dst_aligned)
  51
  52
  53
  54         /* Destination is not aligned on quadword boundary.  Get us to one.
  55            One copy per set bit of r8 (1, 2, 4, 8 bytes); src = dest + r7.
  56            r3 - return value (always)
  57            r4 - current source addr
  58            r5 - copy length
  59            r6 - current dest addr
  60            r7 - offset to src from dest
  61            r8 - number of bytes to quadword boundary
  62         */
  63
  64         mtcrf   0x01,r8         /* put #bytes to boundary into cr7  */
  65         subf    r5,r8,r5        /* adjust remaining len */
  66
  67         bf      cr7*4+3,1f
  68         lbzx    r0,r7,r6        /* copy 1 byte (bit value 1 of r8) */
  69         stb     r0,0(r6)
  70         addi    r6,r6,1
  71 1:
  72         bf      cr7*4+2,2f
  73         lhzx    r0,r7,r6        /* copy 2 bytes (bit value 2 of r8) */
  74         sth     r0,0(r6)
  75         addi    r6,r6,2
  76 2:
  77         bf      cr7*4+1,4f
  78         lwzx    r0,r7,r6        /* copy 4 bytes (bit value 4 of r8) */
  79         stw     r0,0(r6)
  80         addi    r6,r6,4
  81 4:
  82         bf      cr7*4+0,8f
  83         lfdx    fp9,r7,r6       /* copy 8 bytes through an FPR (not a GPR) */
  84         stfd    fp9,0(r6)
  85         addi    r6,r6,8
  86 8:
  87         add     r4,r7,r6        /* update src addr */
  88
  89
  90
  91         /* Dest is quadword aligned now.
  92
  93            Lots of decisions to make.  If we are copying less than a cache
  94            line we won't be here long.  If we are not on a cache line
  95            boundary we need to get there.  And then we need to figure out
  96            how many cache lines ahead to pre-touch.
  97
  98            r3 - return value (always)
  99            r4 - current source addr
 100            r5 - copy length
 101            r6 - current dest addr
 102         */
 103
 104
 105         .align  4
 106 L(dst_aligned):
 107
 108         /* Fetch __cache_line_size into r9.  Zero means it was never set.  */
 109 #ifdef SHARED
 110         mflr    r0              /* Keep LR in r0; put back by the mtlr below.  */
 111 /* Establish GOT addressability so that __cache_line_size can be loaded
 112    from static storage.  It was set from the aux vector during startup.  */
 113         SETUP_GOT_ACCESS(r9,got_label)
 114         addis   r9,r9,__cache_line_size-got_label@ha
 115         lwz     r9,__cache_line_size-got_label@l(r9)
 116         mtlr    r0
 117 #else
 118 /* Load __cache_line_size from static storage.  It was set from the
 119    aux vector during startup.  */
 120         lis     r9,__cache_line_size@ha
 121         lwz     r9,__cache_line_size@l(r9)
 122 #endif
 123
 124         cmplwi  cr5, r9, 0
 125         bne+    cr5,L(cachelineset)
 126
 127 /* __cache_line_size not set: plain byte copy, two bytes per iteration.  */
 128         andi.   r0,r5,1         /* If length is odd copy one byte.  */
 129         beq     L(cachelinenotset_align)
 130         lbz     r7,0(r4)        /* Read one byte from source.  */
 131         addi    r5,r5,-1        /* Update length.  */
 132         addi    r4,r4,1         /* Update source pointer address.  */
 133         stb     r7,0(r6)        /* Store one byte on dest.  */
 134         addi    r6,r6,1         /* Update dest pointer address.  */
 135 L(cachelinenotset_align):
 136         cmpwi   cr7,r5,0        /* If length is 0 return.  */
 137         beqlr   cr7
 138         ori     r2,r2,0         /* Force a new dispatch group.  */
 139 L(cachelinenotset_loop):        /* r5 is even and nonzero on entry.  */
 140         addic.  r5,r5,-2        /* Update length; sets cr0 for the bne.  */
 141         lbz     r7,0(r4)        /* Load 2 bytes from source.  */
 142         lbz     r8,1(r4)
 143         addi    r4,r4,2         /* Update source pointer address.  */
 144         stb     r7,0(r6)        /* Store 2 bytes on dest.  */
 145         stb     r8,1(r6)
 146         addi    r6,r6,2         /* Update dest pointer address.  */
 147         bne     L(cachelinenotset_loop)
 148         blr
 149
 150
 151 L(cachelineset):                /* r9 = __cache_line_size (nonzero)  */
 152
 153         addi    r10,r9,-1       /* r10 = line size - 1  */
 154         /* Length is an unsigned size_t, so compare it unsigned.  */
 155         cmplw   cr5,r5,r10      /* Less than a cacheline to go? */
 156
 157         neg     r7,r6           /* How far to next cacheline bdy? */
 158
 159         addi    r6,r6,-8        /* bias dest by -8 for the stfd/stfdu pairs  */
 160         cmpwi   cr0,r9,128
 161         addi    r4,r4,-8        /* bias src by -8 for the lfd/lfdu pairs  */
 162
 163
 164         ble+    cr5,L(lessthancacheline)
 165
 166         beq-    cr0,L(big_lines) /* 128 byte line code; else fall through  */
 167
 168
 169
 170
 171         /* More than a cacheline left to go.  This path assumes 64 byte
 172            cachelines; every line size other than 128 ends up here.  */
 173         clrlwi  r7,r7,32-6      /* How far to next cacheline bdy? */
 174
 175         cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already? */
 176
 177         /* Reduce total len by what it takes to get to the next cache line */
 178         subf    r5,r7,r5
 179         srwi    r7,r7,4         /* How many qws to get to the line bdy? */
 180
 181         /* How many full cache lines to copy after getting to a line bdy? */
 182         srwi    r10,r5,6
 183
 184         cmplwi  r10,0           /* If no full cache lines to copy ... */
 185         li      r11,0           /* number of cachelines to copy with prefetch  */
 186         beq     L(nocacheprefetch)
 187
 188
 189         /* We are here because we have at least one full cache line to copy,
 190            and therefore some pre-touching to do. */
 191
 192         cmplwi  r10,PREFETCH_AHEAD
 193         li      r12,64+8        /* prefetch distance: one line, +8 for the r4 bias  */
 194         ble     L(lessthanmaxprefetch)
 195
 196         /* We can only do so much pre-fetching.  R11 will have the count of
 197            lines left to prefetch after the initial batch of prefetches
 198            are executed. */
 199
 200         subi    r11,r10,PREFETCH_AHEAD
 201         li      r10,PREFETCH_AHEAD
 202
 203 L(lessthanmaxprefetch):
 204         mtctr   r10
 205
 206         /* At this point r10/ctr hold the number of lines to prefetch in this
 207            initial batch, and r11 holds any remainder. */
 208
 209 L(prefetchSRC):
 210         dcbt    r12,r4
 211         addi    r12,r12,64      /* step the prefetch offset to the next line  */
 212         bdnz    L(prefetchSRC)
 213
 214
 215         /* Prefetching is done, or was not needed.
 216
 217            cr6 - are we on a cacheline boundary already?
 218            r7  - number of quadwords to the next cacheline boundary
 219         */
 220
 221 L(nocacheprefetch):
 222         mtctr   r7
 223
 224         cmplwi  cr1,r5,64   /* Less than a cache line to copy? */
 225
 226         /* How many bytes are left after we copy whatever full
 227            cache lines we can get? */
 228         clrlwi  r5,r5,32-6
 229
 230         beq     cr6,L(cachelinealigned)
 231
 232
 233         /* Copy quadwords up to the next cacheline boundary */
 234
 235 L(aligntocacheline):
 236         lfd     fp9,0x08(r4)
 237         lfdu    fp10,0x10(r4)   /* update form: r4 += 16  */
 238         stfd    fp9,0x08(r6)
 239         stfdu   fp10,0x10(r6)   /* update form: r6 += 16  */
 240         bdnz    L(aligntocacheline)
 241
 242
 243         .align 4
 244 L(cachelinealigned):            /* copy whole cache lines  */
 245         /* cr1 was set from the length left once dest is line aligned.  */
 246         blt-    cr1,L(lessthancacheline) /* size <64  */
 247
 248 L(outerloop):
 249         cmpwi   r11,0           /* any lines to copy with prefetch + dcbz?  */
 250         mtctr   r11
 251         beq-    L(endloop)
 252         /* ctr now holds that count, so r11 is reused as the dcbz offset.  */
 253         li      r11,64*ZERO_AHEAD +8    /* DCBZ dist  */
 254
 255         .align  4
 256         /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
 257 L(loop):                        /* Copy aligned body  */
 258         dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
 259         lfd     fp9,  0x08(r4)
 260         dcbz    r11,r6          /* zero a DST line ahead of the stores  */
 261         lfd     fp10, 0x10(r4)
 262         lfd     fp11, 0x18(r4)
 263         lfd     fp12, 0x20(r4)
 264         stfd    fp9,  0x08(r6)
 265         stfd    fp10, 0x10(r6)
 266         stfd    fp11, 0x18(r6)
 267         stfd    fp12, 0x20(r6)
 268         lfd     fp9,  0x28(r4)
 269         lfd     fp10, 0x30(r4)
 270         lfd     fp11, 0x38(r4)
 271         lfdu    fp12, 0x40(r4)  /* last load of the line; r4 += 64  */
 272         stfd    fp9,  0x28(r6)
 273         stfd    fp10, 0x30(r6)
 274         stfd    fp11, 0x38(r6)
 275         stfdu   fp12, 0x40(r6)  /* last store of the line; r6 += 64  */
 276
 277         bdnz    L(loop)
 278
 279         /* Copy the last r10 lines with no further dcbt/dcbz.  */
 280 L(endloop):
 281         cmpwi   r10,0
 282         beq-    L(endloop2)
 283         mtctr   r10
 284
 285 L(loop2):                       /* Copy aligned body  */
 286         lfd     fp9,  0x08(r4)
 287         lfd     fp10, 0x10(r4)
 288         lfd     fp11, 0x18(r4)
 289         lfd     fp12, 0x20(r4)
 290         stfd    fp9,  0x08(r6)
 291         stfd    fp10, 0x10(r6)
 292         stfd    fp11, 0x18(r6)
 293         stfd    fp12, 0x20(r6)
 294         lfd     fp9,  0x28(r4)
 295         lfd     fp10, 0x30(r4)
 296         lfd     fp11, 0x38(r4)
 297         lfdu    fp12, 0x40(r4)
 298         stfd    fp9,  0x28(r6)
 299         stfd    fp10, 0x30(r6)
 300         stfd    fp11, 0x38(r6)
 301         stfdu   fp12, 0x40(r6)
 302
 303         bdnz    L(loop2)
 304 L(endloop2):                    /* falls through to the tail copy  */
 305
 306
 307         .align  4
 308 L(lessthancacheline):           /* Tail: r5 bytes left, r4/r6 biased by -8  */
 309         cmplwi  cr0,r5,16
 310         srwi    r7,r5,4         /* divide size by 16  */
 311         blt-    L(do_lt16)
 312         mtctr   r7
 313
 314 L(copy_remaining):              /* 16 bytes per iteration  */
 315         lfd     fp9,  0x08(r4)
 316         lfdu    fp10, 0x10(r4)
 317         stfd    fp9,  0x08(r6)
 318         stfdu   fp10, 0x10(r6)
 319         bdnz    L(copy_remaining)
 320
 321 L(do_lt16):                     /* whole quadwords are done  */
 322         cmplwi  cr0,r5,0        /* r5 still holds the tail length  */
 323         beqlr+                  /* no rest to copy  */
 324         addi    r4,r4,8         /* undo the -8 bias on src  */
 325         addi    r6,r6,8         /* undo the -8 bias on dest  */
 326
 327 L(shortcopy):                   /* SIMPLE COPY to handle size <= 15 bytes  */
 328         mtcrf   0x01,r5         /* low 4 bits of the length into cr7  */
 329         sub     r7,r4,r6        /* r7 = offset to src from dest  */
 330         bf-     cr7*4+0,8f
 331         lfdx    fp9,r7,r6       /* copy 8 byte  */
 332         stfd    fp9,0(r6)
 333         addi    r6,r6,8
 334 8:
 335         bf      cr7*4+1,4f
 336         lwzx    r0,r7,r6        /* copy 4 byte  */
 337         stw     r0,0(r6)
 338         addi    r6,r6,4
 339 4:
 340         bf      cr7*4+2,2f
 341         lhzx    r0,r7,r6        /* copy 2 byte  */
 342         sth     r0,0(r6)
 343         addi    r6,r6,2
 344 2:
 345         bf      cr7*4+3,1f
 346         lbzx    r0,r7,r6        /* copy 1 byte  */
 347         stb     r0,0(r6)
 348 1:
 349         blr
 350
 351
 352
 353
 354
 355         /* Same scheme as the 64 byte line code, but for 128 byte lines. */
 356
 357
 358 L(big_lines):
 359
 360         clrlwi  r7,r7,32-7      /* How far to next cacheline bdy? */
 361
 362         cmplwi  cr6,r7,0        /* Are we on a cacheline bdy already? */
 363
 364         /* Reduce total len by what it takes to get to the next cache line */
 365         subf    r5,r7,r5
 366         srwi    r7,r7,4         /* How many qw to get to the line bdy? */
 367
 368         /* How many full cache lines to copy after getting to a line bdy? */
 369         srwi    r10,r5,7
 370
 371         cmplwi  r10,0           /* If no full cache lines to copy ... */
 372         li      r11,0           /* number of cachelines to copy with prefetch  */
 373         beq     L(nocacheprefetch_128)
 374
 375
 376         /* We are here because we have at least one full cache line to copy,
 377            and therefore some pre-touching to do. */
 378
 379         cmplwi  r10,PREFETCH_AHEAD
 380         li      r12,128+8       /* prefetch distance: one line, +8 for the r4 bias  */
 381         ble     L(lessthanmaxprefetch_128)
 382
 383         /* We can only do so much pre-fetching.  R11 will have the count of
 384            lines left to prefetch after the initial batch of prefetches
 385            are executed. */
 386
 387         subi    r11,r10,PREFETCH_AHEAD
 388         li      r10,PREFETCH_AHEAD
 389
 390 L(lessthanmaxprefetch_128):
 391         mtctr   r10
 392
 393         /* At this point r10/ctr hold the number of lines to prefetch in this
 394            initial batch, and r11 holds any remainder. */
 395
 396 L(prefetchSRC_128):
 397         dcbt    r12,r4
 398         addi    r12,r12,128     /* step the prefetch offset to the next line  */
 399         bdnz    L(prefetchSRC_128)
 400
 401
 402         /* Prefetching is done, or was not needed.
 403
 404            cr6 - are we on a cacheline boundary already?
 405            r7  - number of quadwords to the next cacheline boundary
 406         */
 407
 408 L(nocacheprefetch_128):
 409         mtctr   r7
 410
 411         cmplwi  cr1,r5,128  /* Less than a cache line to copy? */
 412
 413         /* How many bytes are left after we copy whatever full
 414            cache lines we can get? */
 415         clrlwi  r5,r5,32-7
 416
 417         beq     cr6,L(cachelinealigned_128)
 418
 419
 420         /* Copy quadwords up to the next cacheline boundary */
 421
 422 L(aligntocacheline_128):
 423         lfd     fp9,0x08(r4)
 424         lfdu    fp10,0x10(r4)   /* update form: r4 += 16  */
 425         stfd    fp9,0x08(r6)
 426         stfdu   fp10,0x10(r6)   /* update form: r6 += 16  */
 427         bdnz    L(aligntocacheline_128)
 428
 429
 430 L(cachelinealigned_128):        /* copy whole cache lines  */
 431         /* cr1 was set from the length left once dest is line aligned.  */
 432         blt-    cr1,L(lessthancacheline) /* size <128  */
 433
 434 L(outerloop_128):
 435         cmpwi   r11,0           /* any lines to copy with prefetch + dcbz?  */
 436         mtctr   r11
 437         beq-    L(endloop_128)
 438         /* ctr now holds that count, so r11 is reused as the dcbz offset.  */
 439         li      r11,128*ZERO_AHEAD +8    /* DCBZ dist  */
 440
 441         .align  4
 442         /* Copy whole cachelines, optimized by prefetching SRC cacheline  */
 443 L(loop_128):                    /* Copy aligned body  */
 444         dcbt    r12,r4          /* PREFETCH SOURCE some cache lines ahead  */
 445         lfd     fp9,  0x08(r4)
 446         dcbz    r11,r6          /* zero a DST line ahead of the stores  */
 447         lfd     fp10, 0x10(r4)
 448         lfd     fp11, 0x18(r4)
 449         lfd     fp12, 0x20(r4)
 450         stfd    fp9,  0x08(r6)
 451         stfd    fp10, 0x10(r6)
 452         stfd    fp11, 0x18(r6)
 453         stfd    fp12, 0x20(r6)
 454         lfd     fp9,  0x28(r4)
 455         lfd     fp10, 0x30(r4)
 456         lfd     fp11, 0x38(r4)
 457         lfd     fp12, 0x40(r4)
 458         stfd    fp9,  0x28(r6)
 459         stfd    fp10, 0x30(r6)
 460         stfd    fp11, 0x38(r6)
 461         stfd    fp12, 0x40(r6)
 462         lfd     fp9,  0x48(r4)
 463         lfd     fp10, 0x50(r4)
 464         lfd     fp11, 0x58(r4)
 465         lfd     fp12, 0x60(r4)
 466         stfd    fp9,  0x48(r6)
 467         stfd    fp10, 0x50(r6)
 468         stfd    fp11, 0x58(r6)
 469         stfd    fp12, 0x60(r6)
 470         lfd     fp9,  0x68(r4)
 471         lfd     fp10, 0x70(r4)
 472         lfd     fp11, 0x78(r4)
 473         lfdu    fp12, 0x80(r4)  /* last load of the line; r4 += 128  */
 474         stfd    fp9,  0x68(r6)
 475         stfd    fp10, 0x70(r6)
 476         stfd    fp11, 0x78(r6)
 477         stfdu   fp12, 0x80(r6)  /* last store of the line; r6 += 128  */
 478
 479         bdnz    L(loop_128)
 480
 481         /* Copy the last r10 lines with no further dcbt/dcbz.  */
 482 L(endloop_128):
 483         cmpwi   r10,0
 484         beq-    L(endloop2_128)
 485         mtctr   r10
 486
 487 L(loop2_128):                   /* Copy aligned body  */
 488         lfd     fp9,  0x08(r4)
 489         lfd     fp10, 0x10(r4)
 490         lfd     fp11, 0x18(r4)
 491         lfd     fp12, 0x20(r4)
 492         stfd    fp9,  0x08(r6)
 493         stfd    fp10, 0x10(r6)
 494         stfd    fp11, 0x18(r6)
 495         stfd    fp12, 0x20(r6)
 496         lfd     fp9,  0x28(r4)
 497         lfd     fp10, 0x30(r4)
 498         lfd     fp11, 0x38(r4)
 499         lfd     fp12, 0x40(r4)
 500         stfd    fp9,  0x28(r6)
 501         stfd    fp10, 0x30(r6)
 502         stfd    fp11, 0x38(r6)
 503         stfd    fp12, 0x40(r6)
 504         lfd     fp9,  0x48(r4)
 505         lfd     fp10, 0x50(r4)
 506         lfd     fp11, 0x58(r4)
 507         lfd     fp12, 0x60(r4)
 508         stfd    fp9,  0x48(r6)
 509         stfd    fp10, 0x50(r6)
 510         stfd    fp11, 0x58(r6)
 511         stfd    fp12, 0x60(r6)
 512         lfd     fp9,  0x68(r4)
 513         lfd     fp10, 0x70(r4)
 514         lfd     fp11, 0x78(r4)
 515         lfdu    fp12, 0x80(r4)
 516         stfd    fp9,  0x68(r6)
 517         stfd    fp10, 0x70(r6)
 518         stfd    fp11, 0x78(r6)
 519         stfdu   fp12, 0x80(r6)
 520         bdnz    L(loop2_128)
 521 L(endloop2_128):
 522         /* Whatever is left (r5 bytes) goes through the shared tail copy.  */
 523         b       L(lessthancacheline)
 524
 525
 526 END (memcpy)
 527 libc_hidden_builtin_def (memcpy)