sysdeps/sparc/sparc64/sparcv9v/memcpy.S

   1 /* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara.
   2    Copyright (C) 2006, 2008 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by David S. Miller (davem@davemloft.net)
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <sysdep.h>
  22
  23 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2
  24 #define ASI_P                   0x80
  25 #define ASI_PNF                 0x82
  26
  27 #define LOAD(type,addr,dest)    type##a [addr] ASI_P, dest
  28 #define LOAD_TWIN(addr_reg,dest0,dest1) \
  29         ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
  30
  31 #define STORE(type,src,addr)    type src, [addr]
  32 #define STORE_INIT(src,addr)    stxa src, [addr] %asi
  33
  34 #ifndef XCC
  35 #define USE_BPR
  36 #define XCC xcc
  37 #endif
  38
  39         .register       %g2,#scratch
  40         .register       %g3,#scratch
  41         .register       %g6,#scratch
  42
  43         .text
  44         .align          32
  45
  46 ENTRY(bcopy)
  47         sub             %o1, %o0, %o4
  48         mov             %o0, %g4
  49         cmp             %o4, %o2
  50         mov             %o1, %o0
  51         bgeu,pt         %XCC, 100f
  52          mov            %g4, %o1
  53 #ifndef USE_BPR
  54         srl             %o2, 0, %o2
  55 #endif
  56         brnz,pn         %o2, 220f
  57          add            %o0, %o2, %o0
  58         retl
  59          nop
  60 END(bcopy)
  61
  62         .align          32
  63 ENTRY(memcpy)
  64 #ifndef USE_BPR
  65         srl             %o2, 0, %o2
  66 #endif
  67 100:    /* %o0=dst, %o1=src, %o2=len */
  68         mov             %o0, %g5
  69         cmp             %o2, 0
  70         be,pn           %XCC, 85f
  71 218:     or             %o0, %o1, %o3
  72         cmp             %o2, 16
  73         blu,a,pn        %XCC, 80f
  74          or             %o3, %o2, %o3
  75
  76         /* 2 blocks (128 bytes) is the minimum we can do the block
  77          * copy with.  We need to ensure that we'll iterate at least
  78          * once in the block copy loop.  At worst we'll need to align
  79          * the destination to a 64-byte boundary which can chew up
  80          * to (64 - 1) bytes from the length before we perform the
  81          * block copy loop.
  82          */
  83         cmp             %o2, (2 * 64)
  84         blu,pt          %XCC, 70f
  85          andcc          %o3, 0x7, %g0
  86
  87         /* %o0: dst
  88          * %o1: src
  89          * %o2: len  (known to be >= 128)
  90          *
  91          * The block copy loops will use %o4/%o5,%g2/%g3 as
  92          * temporaries while copying the data.
  93          */
  94
  95         LOAD(prefetch, %o1, #one_read)
  96         wr              %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
  97
  98         /* Align destination on 64-byte boundary.  */
  99         andcc           %o0, (64 - 1), %o4
 100         be,pt           %XCC, 2f
 101          sub            %o4, 64, %o4
 102         sub             %g0, %o4, %o4   ! bytes to align dst
 103         sub             %o2, %o4, %o2
 104 1:      subcc           %o4, 1, %o4
 105         LOAD(ldub, %o1, %g1)
 106         STORE(stb, %g1, %o0)
 107         add             %o1, 1, %o1
 108         bne,pt          %XCC, 1b
 109         add             %o0, 1, %o0
 110
 111         /* If the source is on a 16-byte boundary we can do
 112          * the direct block copy loop.  If it is 8-byte aligned
 113          * we can do the 16-byte loads offset by -8 bytes and the
 114          * init stores offset by one register.
 115          *
 116          * If the source is not even 8-byte aligned, we need to do
 117          * shifting and masking (basically integer faligndata).
 118          *
 119          * The careful bit with init stores is that if we store
 120          * to any part of the cache line we have to store the whole
 121          * cacheline else we can end up with corrupt L2 cache line
 122          * contents.  Since the loop works on 64-bytes of 64-byte
 123          * aligned store data at a time, this is easy to ensure.
 124          */
 125 2:
 126         andcc           %o1, (16 - 1), %o4
 127         andn            %o2, (64 - 1), %g1      ! block copy loop iterator
 128         sub             %o2, %g1, %o2           ! final sub-block copy bytes
 129         be,pt           %XCC, 50f
 130          cmp            %o4, 8
 131         be,a,pt         %XCC, 10f
 132          sub            %o1, 0x8, %o1
 133
 134         /* Neither 8-byte nor 16-byte aligned, shift and mask.  */
 135         mov             %g1, %o4
 136         and             %o1, 0x7, %g1
 137         sll             %g1, 3, %g1
 138         mov             64, %o3
 139         andn            %o1, 0x7, %o1
 140         LOAD(ldx, %o1, %g2)
 141         sub             %o3, %g1, %o3
 142         sllx            %g2, %g1, %g2
 143
 144 #define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
 145         LOAD(ldx, SRC, TMP1); \
 146         srlx            TMP1, PRE_SHIFT, TMP2; \
 147         or              TMP2, PRE_VAL, TMP2; \
 148         STORE_INIT(TMP2, DST); \
 149         sllx            TMP1, POST_SHIFT, PRE_VAL;
 150
 151 1:      add             %o1, 0x8, %o1
 152         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
 153         add             %o1, 0x8, %o1
 154         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
 155         add             %o1, 0x8, %o1
 156         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
 157         add             %o1, 0x8, %o1
 158         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
 159         add             %o1, 32, %o1
 160         LOAD(prefetch, %o1, #one_read)
 161         sub             %o1, 32 - 8, %o1
 162         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
 163         add             %o1, 8, %o1
 164         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
 165         add             %o1, 8, %o1
 166         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
 167         add             %o1, 8, %o1
 168         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
 169         subcc           %o4, 64, %o4
 170         bne,pt          %XCC, 1b
 171          add            %o0, 64, %o0
 172
 173 #undef SWIVEL_ONE_DWORD
 174
 175         srl             %g1, 3, %g1
 176         ba,pt           %XCC, 60f
 177          add            %o1, %g1, %o1
 178
 179 10:     /* Destination is 64-byte aligned, source was only 8-byte
 180          * aligned but it has been subtracted by 8 and we perform
 181          * one twin load ahead, then add 8 back into source when
 182          * we finish the loop.
 183          */
 184         LOAD_TWIN(%o1, %o4, %o5)
 185 1:      add             %o1, 16, %o1
 186         LOAD_TWIN(%o1, %g2, %g3)
 187         add             %o1, 16 + 32, %o1
 188         LOAD(prefetch, %o1, #one_read)
 189         sub             %o1, 32, %o1
 190         STORE_INIT(%o5, %o0 + 0x00)             ! initializes cache line
 191         STORE_INIT(%g2, %o0 + 0x08)
 192         LOAD_TWIN(%o1, %o4, %o5)
 193         add             %o1, 16, %o1
 194         STORE_INIT(%g3, %o0 + 0x10)
 195         STORE_INIT(%o4, %o0 + 0x18)
 196         LOAD_TWIN(%o1, %g2, %g3)
 197         add             %o1, 16, %o1
 198         STORE_INIT(%o5, %o0 + 0x20)
 199         STORE_INIT(%g2, %o0 + 0x28)
 200         LOAD_TWIN(%o1, %o4, %o5)
 201         STORE_INIT(%g3, %o0 + 0x30)
 202         STORE_INIT(%o4, %o0 + 0x38)
 203         subcc           %g1, 64, %g1
 204         bne,pt          %XCC, 1b
 205          add            %o0, 64, %o0
 206
 207         ba,pt           %XCC, 60f
 208          add            %o1, 0x8, %o1
 209
 210 50:     /* Destination is 64-byte aligned, and source is 16-byte
 211          * aligned.
 212          */
 213 1:      LOAD_TWIN(%o1, %o4, %o5)
 214         add     %o1, 16, %o1
 215         LOAD_TWIN(%o1, %g2, %g3)
 216         add     %o1, 16 + 32, %o1
 217         LOAD(prefetch, %o1, #one_read)
 218         sub     %o1, 32, %o1
 219         STORE_INIT(%o4, %o0 + 0x00)             ! initializes cache line
 220         STORE_INIT(%o5, %o0 + 0x08)
 221         LOAD_TWIN(%o1, %o4, %o5)
 222         add     %o1, 16, %o1
 223         STORE_INIT(%g2, %o0 + 0x10)
 224         STORE_INIT(%g3, %o0 + 0x18)
 225         LOAD_TWIN(%o1, %g2, %g3)
 226         add     %o1, 16, %o1
 227         STORE_INIT(%o4, %o0 + 0x20)
 228         STORE_INIT(%o5, %o0 + 0x28)
 229         STORE_INIT(%g2, %o0 + 0x30)
 230         STORE_INIT(%g3, %o0 + 0x38)
 231         subcc   %g1, 64, %g1
 232         bne,pt  %XCC, 1b
 233          add    %o0, 64, %o0
 234         /* fall through */
 235
 236 60:
 237         /* %o2 contains any final bytes still needed to be copied
 238          * over. If anything is left, we copy it one byte at a time.
 239          */
 240         wr              %g0, ASI_PNF, %asi
 241         brz,pt          %o2, 85f
 242          sub            %o0, %o1, %o3
 243         ba,a,pt         %XCC, 90f
 244
 245         .align          64
 246 70: /* 16 < len <= 64 */
 247         bne,pn          %XCC, 75f
 248          sub            %o0, %o1, %o3
 249
 250 72:
 251         andn            %o2, 0xf, %o4
 252         and             %o2, 0xf, %o2
 253 1:      subcc           %o4, 0x10, %o4
 254         LOAD(ldx, %o1, %o5)
 255         add             %o1, 0x08, %o1
 256         LOAD(ldx, %o1, %g1)
 257         sub             %o1, 0x08, %o1
 258         STORE(stx, %o5, %o1 + %o3)
 259         add             %o1, 0x8, %o1
 260         STORE(stx, %g1, %o1 + %o3)
 261         bgu,pt          %XCC, 1b
 262          add            %o1, 0x8, %o1
 263 73:     andcc           %o2, 0x8, %g0
 264         be,pt           %XCC, 1f
 265          nop
 266         sub             %o2, 0x8, %o2
 267         LOAD(ldx, %o1, %o5)
 268         STORE(stx, %o5, %o1 + %o3)
 269         add             %o1, 0x8, %o1
 270 1:      andcc           %o2, 0x4, %g0
 271         be,pt           %XCC, 1f
 272          nop
 273         sub             %o2, 0x4, %o2
 274         LOAD(lduw, %o1, %o5)
 275         STORE(stw, %o5, %o1 + %o3)
 276         add             %o1, 0x4, %o1
 277 1:      cmp             %o2, 0
 278         be,pt           %XCC, 85f
 279          nop
 280         ba,pt           %XCC, 90f
 281          nop
 282
 283 75:
 284         andcc           %o0, 0x7, %g1
 285         sub             %g1, 0x8, %g1
 286         be,pn           %icc, 2f
 287          sub            %g0, %g1, %g1
 288         sub             %o2, %g1, %o2
 289
 290 1:      subcc           %g1, 1, %g1
 291         LOAD(ldub, %o1, %o5)
 292         STORE(stb, %o5, %o1 + %o3)
 293         bgu,pt          %icc, 1b
 294          add            %o1, 1, %o1
 295
 296 2:      add             %o1, %o3, %o0
 297         andcc           %o1, 0x7, %g1
 298         bne,pt          %icc, 8f
 299          sll            %g1, 3, %g1
 300
 301         cmp             %o2, 16
 302         bgeu,pt         %icc, 72b
 303          nop
 304         ba,a,pt         %XCC, 73b
 305
 306 8:      mov             64, %o3
 307         andn            %o1, 0x7, %o1
 308         LOAD(ldx, %o1, %g2)
 309         sub             %o3, %g1, %o3
 310         andn            %o2, 0x7, %o4
 311         sllx            %g2, %g1, %g2
 312 1:      add             %o1, 0x8, %o1
 313         LOAD(ldx, %o1, %g3)
 314         subcc           %o4, 0x8, %o4
 315         srlx            %g3, %o3, %o5
 316         or              %o5, %g2, %o5
 317         STORE(stx, %o5, %o0)
 318         add             %o0, 0x8, %o0
 319         bgu,pt          %icc, 1b
 320          sllx           %g3, %g1, %g2
 321
 322         srl             %g1, 3, %g1
 323         andcc           %o2, 0x7, %o2
 324         be,pn           %icc, 85f
 325          add            %o1, %g1, %o1
 326         ba,pt           %XCC, 90f
 327          sub            %o0, %o1, %o3
 328
 329         .align          64
 330 80: /* 0 < len <= 16 */
 331         andcc           %o3, 0x3, %g0
 332         bne,pn          %XCC, 90f
 333          sub            %o0, %o1, %o3
 334
 335 1:
 336         subcc           %o2, 4, %o2
 337         LOAD(lduw, %o1, %g1)
 338         STORE(stw, %g1, %o1 + %o3)
 339         bgu,pt          %XCC, 1b
 340          add            %o1, 4, %o1
 341
 342 85:     retl
 343          mov            %g5, %o0
 344
 345         .align          32
 346 90:
 347         subcc           %o2, 1, %o2
 348         LOAD(ldub, %o1, %g1)
 349         STORE(stb, %g1, %o1 + %o3)
 350         bgu,pt          %XCC, 90b
 351          add            %o1, 1, %o1
 352         retl
 353          mov            %g5, %o0
 354
 355 END(memcpy)
 356
 357 #define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3)        \
 358         ldx             [%src - offset - 0x20], %t0;            \
 359         ldx             [%src - offset - 0x18], %t1;            \
 360         ldx             [%src - offset - 0x10], %t2;            \
 361         ldx             [%src - offset - 0x08], %t3;            \
 362         stw             %t0, [%dst - offset - 0x1c];            \
 363         srlx            %t0, 32, %t0;                           \
 364         stw             %t0, [%dst - offset - 0x20];            \
 365         stw             %t1, [%dst - offset - 0x14];            \
 366         srlx            %t1, 32, %t1;                           \
 367         stw             %t1, [%dst - offset - 0x18];            \
 368         stw             %t2, [%dst - offset - 0x0c];            \
 369         srlx            %t2, 32, %t2;                           \
 370         stw             %t2, [%dst - offset - 0x10];            \
 371         stw             %t3, [%dst - offset - 0x04];            \
 372         srlx            %t3, 32, %t3;                           \
 373         stw             %t3, [%dst - offset - 0x08];
 374
 375 #define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3)   \
 376         ldx             [%src - offset - 0x20], %t0;            \
 377         ldx             [%src - offset - 0x18], %t1;            \
 378         ldx             [%src - offset - 0x10], %t2;            \
 379         ldx             [%src - offset - 0x08], %t3;            \
 380         stx             %t0, [%dst - offset - 0x20];            \
 381         stx             %t1, [%dst - offset - 0x18];            \
 382         stx             %t2, [%dst - offset - 0x10];            \
 383         stx             %t3, [%dst - offset - 0x08];            \
 384         ldx             [%src - offset - 0x40], %t0;            \
 385         ldx             [%src - offset - 0x38], %t1;            \
 386         ldx             [%src - offset - 0x30], %t2;            \
 387         ldx             [%src - offset - 0x28], %t3;            \
 388         stx             %t0, [%dst - offset - 0x40];            \
 389         stx             %t1, [%dst - offset - 0x38];            \
 390         stx             %t2, [%dst - offset - 0x30];            \
 391         stx             %t3, [%dst - offset - 0x28];
 392
 393 #define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3)       \
 394         ldx             [%src + offset + 0x00], %t0;            \
 395         ldx             [%src + offset + 0x08], %t1;            \
 396         stw             %t0, [%dst + offset + 0x04];            \
 397         srlx            %t0, 32, %t2;                           \
 398         stw             %t2, [%dst + offset + 0x00];            \
 399         stw             %t1, [%dst + offset + 0x0c];            \
 400         srlx            %t1, 32, %t3;                           \
 401         stw             %t3, [%dst + offset + 0x08];
 402
 403 #define RMOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1)          \
 404         ldx             [%src + offset + 0x00], %t0;            \
 405         ldx             [%src + offset + 0x08], %t1;            \
 406         stx             %t0, [%dst + offset + 0x00];            \
 407         stx             %t1, [%dst + offset + 0x08];
 408
 409         .align          32
 410 228:    andcc           %o2, 1, %g0
 411         be,pt           %icc, 2f+4
 412 1:       ldub           [%o1 - 1], %o5
 413         sub             %o1, 1, %o1
 414         sub             %o0, 1, %o0
 415         subcc           %o2, 1, %o2
 416         be,pn           %xcc, 229f
 417          stb            %o5, [%o0]
 418 2:      ldub            [%o1 - 1], %o5
 419         sub             %o0, 2, %o0
 420         ldub            [%o1 - 2], %g5
 421         sub             %o1, 2, %o1
 422         subcc           %o2, 2, %o2
 423         stb             %o5, [%o0 + 1]
 424         bne,pt          %xcc, 2b
 425          stb            %g5, [%o0]
 426 229:    retl
 427          mov            %g4, %o0
 428 out:    retl
 429          mov            %g5, %o0
 430
 431         .align          32
 432 ENTRY(memmove)
 433         mov             %o0, %g5
 434 #ifndef USE_BPR
 435         srl             %o2, 0, %o2
 436 #endif
 437         brz,pn          %o2, out
 438          sub            %o0, %o1, %o4
 439         cmp             %o4, %o2
 440         bgeu,pt         %XCC, 218b
 441          mov            %o0, %g4
 442         add             %o0, %o2, %o0
 443 220:    add             %o1, %o2, %o1
 444         cmp             %o2, 15
 445         bleu,pn         %xcc, 228b
 446          andcc          %o0, 7, %g2
 447         sub             %o0, %o1, %g5
 448         andcc           %g5, 3, %o5
 449         bne,pn          %xcc, 232f
 450          andcc          %o1, 3, %g0
 451         be,a,pt         %xcc, 236f
 452          andcc          %o1, 4, %g0
 453         andcc           %o1, 1, %g0
 454         be,pn           %xcc, 4f
 455          andcc          %o1, 2, %g0
 456         ldub            [%o1 - 1], %g2
 457         sub             %o1, 1, %o1
 458         sub             %o0, 1, %o0
 459         sub             %o2, 1, %o2
 460         be,pn           %xcc, 5f
 461          stb            %g2, [%o0]
 462 4:      lduh            [%o1 - 2], %g2
 463         sub             %o1, 2, %o1
 464         sub             %o0, 2, %o0
 465         sub             %o2, 2, %o2
 466         sth             %g2, [%o0]
 467 5:      andcc           %o1, 4, %g0
 468 236:    be,a,pn         %xcc, 2f
 469          andcc          %o2, -128, %g6
 470         lduw            [%o1 - 4], %g5
 471         sub             %o1, 4, %o1
 472         sub             %o0, 4, %o0
 473         sub             %o2, 4, %o2
 474         stw             %g5, [%o0]
 475         andcc           %o2, -128, %g6
 476 2:      be,pn           %xcc, 235f
 477          andcc          %o0, 4, %g0
 478         be,pn           %xcc, 282f + 4
 479 5:      RMOVE_BIGCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
 480         RMOVE_BIGCHUNK(o1, o0, 0x20, g1, g3, g5, o5)
 481         RMOVE_BIGCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
 482         RMOVE_BIGCHUNK(o1, o0, 0x60, g1, g3, g5, o5)
 483         subcc           %g6, 128, %g6
 484         sub             %o1, 128, %o1
 485         bne,pt          %xcc, 5b
 486          sub            %o0, 128, %o0
 487 235:    andcc           %o2, 0x70, %g6
 488 41:     be,pn           %xcc, 280f
 489          andcc          %o2, 8, %g0
 490
 491 279:    rd              %pc, %o5
 492         sll             %g6, 1, %g5
 493         sub             %o1, %g6, %o1
 494         sub             %o5, %g5, %o5
 495         jmpl            %o5 + %lo(280f - 279b), %g0
 496          sub            %o0, %g6, %o0
 497         RMOVE_LASTCHUNK(o1, o0, 0x60, g2, g3, g5, o5)
 498         RMOVE_LASTCHUNK(o1, o0, 0x50, g2, g3, g5, o5)
 499         RMOVE_LASTCHUNK(o1, o0, 0x40, g2, g3, g5, o5)
 500         RMOVE_LASTCHUNK(o1, o0, 0x30, g2, g3, g5, o5)
 501         RMOVE_LASTCHUNK(o1, o0, 0x20, g2, g3, g5, o5)
 502         RMOVE_LASTCHUNK(o1, o0, 0x10, g2, g3, g5, o5)
 503         RMOVE_LASTCHUNK(o1, o0, 0x00, g2, g3, g5, o5)
 504 280:    be,pt           %xcc, 281f
 505          andcc          %o2, 4, %g0
 506         ldx             [%o1 - 8], %g2
 507         sub             %o0, 8, %o0
 508         stw             %g2, [%o0 + 4]
 509         sub             %o1, 8, %o1
 510         srlx            %g2, 32, %g2
 511         stw             %g2, [%o0]
 512 281:    be,pt           %xcc, 1f
 513          andcc          %o2, 2, %g0
 514         lduw            [%o1 - 4], %g2
 515         sub             %o1, 4, %o1
 516         stw             %g2, [%o0 - 4]
 517         sub             %o0, 4, %o0
 518 1:      be,pt           %xcc, 1f
 519          andcc          %o2, 1, %g0
 520         lduh            [%o1 - 2], %g2
 521         sub             %o1, 2, %o1
 522         sth             %g2, [%o0 - 2]
 523         sub             %o0, 2, %o0
 524 1:      be,pt           %xcc, 211f
 525          nop
 526         ldub            [%o1 - 1], %g2
 527         stb             %g2, [%o0 - 1]
 528 211:    retl
 529          mov            %g4, %o0
 530
 531 282:    RMOVE_BIGALIGNCHUNK(o1, o0, 0x00, g1, g3, g5, o5)
 532         RMOVE_BIGALIGNCHUNK(o1, o0, 0x40, g1, g3, g5, o5)
 533         subcc           %g6, 128, %g6
 534         sub             %o1, 128, %o1
 535         bne,pt          %xcc, 282b
 536          sub            %o0, 128, %o0
 537         andcc           %o2, 0x70, %g6
 538         be,pn           %xcc, 284f
 539          andcc          %o2, 8, %g0
 540
 541 283:    rd              %pc, %o5
 542         sub             %o1, %g6, %o1
 543         sub             %o5, %g6, %o5
 544         jmpl            %o5 + %lo(284f - 283b), %g0
 545          sub            %o0, %g6, %o0
 546         RMOVE_LASTALIGNCHUNK(o1, o0, 0x60, g2, g3)
 547         RMOVE_LASTALIGNCHUNK(o1, o0, 0x50, g2, g3)
 548         RMOVE_LASTALIGNCHUNK(o1, o0, 0x40, g2, g3)
 549         RMOVE_LASTALIGNCHUNK(o1, o0, 0x30, g2, g3)
 550         RMOVE_LASTALIGNCHUNK(o1, o0, 0x20, g2, g3)
 551         RMOVE_LASTALIGNCHUNK(o1, o0, 0x10, g2, g3)
 552         RMOVE_LASTALIGNCHUNK(o1, o0, 0x00, g2, g3)
 553 284:    be,pt           %xcc, 285f
 554          andcc          %o2, 4, %g0
 555         ldx             [%o1 - 8], %g2
 556         sub             %o0, 8, %o0
 557         sub             %o1, 8, %o1
 558         stx             %g2, [%o0]
 559 285:    be,pt           %xcc, 1f
 560          andcc          %o2, 2, %g0
 561         lduw            [%o1 - 4], %g2
 562         sub             %o0, 4, %o0
 563         sub             %o1, 4, %o1
 564         stw             %g2, [%o0]
 565 1:      be,pt           %xcc, 1f
 566          andcc          %o2, 1, %g0
 567         lduh            [%o1 - 2], %g2
 568         sub             %o0, 2, %o0
 569         sub             %o1, 2, %o1
 570         sth             %g2, [%o0]
 571 1:      be,pt           %xcc, 1f
 572          nop
 573         ldub            [%o1 - 1], %g2
 574         stb             %g2, [%o0 - 1]
 575 1:      retl
 576          mov            %g4, %o0
 577
 578 232:    ldub            [%o1 - 1], %g5
 579         sub             %o1, 1, %o1
 580         sub             %o0, 1, %o0
 581         subcc           %o2, 1, %o2
 582         bne,pt          %xcc, 232b
 583          stb            %g5, [%o0]
 584 234:    retl
 585          mov            %g4, %o0
 586 END(memmove)
 587
 588 #ifdef USE_BPR
 589 weak_alias (memcpy, __align_cpy_1)
 590 weak_alias (memcpy, __align_cpy_2)
 591 weak_alias (memcpy, __align_cpy_4)
 592 weak_alias (memcpy, __align_cpy_8)
 593 weak_alias (memcpy, __align_cpy_16)
 594 #endif
 595 libc_hidden_builtin_def (memcpy)
 596 libc_hidden_builtin_def (memmove)