ports/sysdeps/arm/armv7/multiarch/memcpy_impl.S

   1 /* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   2    Copyright (C) 2013-2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <http://www.gnu.org/licenses/>.
  18
  19    This memcpy routine is optimised for Cortex-A15 cores and takes advantage
  20    of VFP or NEON when built with the appropriate flags.
  21
  22    Assumptions:
  23
  24     ARMv6 (ARMv7-a if using Neon)
  25     ARM state
  26     Unaligned accesses
  27
  28  */
  29
  30 /* Thumb cannot encode negative immediate offsets in memory operations.  */
  31 #ifndef NO_THUMB
  32 #define NO_THUMB
  33 #endif
  34 #include <sysdep.h>
  35 #include <arm-features.h>
  36
  37         .syntax unified
  38         /* This implementation requires ARM state.  */
  39         .arm
  40
     /* Build-variant selection.  MEMCPY_NEON enables both the VFP and the
        NEON code paths, MEMCPY_VFP enables the VFP paths only, and with
        neither defined only core registers are used.

        FRAME_SIZE is the number of bytes memcpy pushes for copies of 64
        bytes or more.  The NEON build only ever spills tmp2, so 4 bytes
        are enough.  The other two builds contain core-register bulk loops
        that additionally save the B/C/D register pairs at [sp, #8] through
        [sp, #28], hence 32.  */
  41 #ifdef MEMCPY_NEON
  42
  43         .fpu    neon
  44         .arch   armv7-a
  45 # define FRAME_SIZE     4
  46 # define USE_VFP
  47 # define USE_NEON
  48
  49 #elif defined (MEMCPY_VFP)
  50
  51         .arch   armv6
  52         .fpu    vfpv2
  53 # define FRAME_SIZE     32
  54 # define USE_VFP
  55
  56 #else
  57         .arch   armv6
  58 # define FRAME_SIZE    32
  59
  60 #endif
  61
     /* Spell an address operand with an alignment qualifier (reg:align).
        Used by neon_store_multi further down for its 64-bit aligned
        destination.  */
  62 #define ALIGN(addr, align) addr:align
  63
     /* Size in bytes of one instruction; enters the jump-offset arithmetic
        of the non-BX dispatch macros.  */
  64 #define INSN_SIZE       4
  65
  66 /* Call parameters.  */
  67 #define dstin   r0
  68 #define src     r1
  69 #define count   r2
  70
  71 /* Locals.  */
     /* dst is the working destination pointer, so that dstin (r0) is left
        untouched for the return value.  tmp2 (r8) is stored to the stack
        by memcpy before the paths that use it and reloaded before
        returning.  */
  72 #define tmp1    r3
  73 #define dst     ip
  74 #define tmp2    r8
  75
  76 /* These two macros both work by repeated invocation of the macro
  77    dispatch_step (not defined here).  That macro performs one "step",
  78    doing one load instruction and one store instruction to copy one
  79    "unit".  On entry, TMP1 contains the number of bytes to be copied,
  80    a multiple of the unit size.  The macro clobbers TMP1 in the
  81    process of doing a computed jump to the tail containing the
  82    appropriate number of steps.
  83
  84    In dispatch_7_dword, dispatch_step is invoked seven times, with an
  85    argument that is 7 for the first and 1 for the last.  Units are
  86    double-words (8 bytes).  TMP1 is at most 56.
  87
  88    In dispatch_15_word, dispatch_step is invoked fifteen times,
  89    with an argument that is 15 for the first and 1 for the last.
  90    Units are words (4 bytes).  TMP1 is at most 60.  */
  91
  92 #ifndef ARM_ALWAYS_BX
  93 # if ARM_BX_ALIGN_LOG2 != 2
  94 #  error case not handled
  95 # endif
  96         .macro dispatch_7_dword
             /* Computed jump into a run of seven dispatch_step expansions.
                In:  tmp1 = bytes to copy (a multiple of 8, at most 56).
                Clobbers tmp1.

                The arithmetic relies on every step assembling to 8 bytes
                of code (one load plus one store), the same as the 8 data
                bytes it copies.  Skipping (7 * 8 - tmp1) bytes of code
                therefore leaves exactly tmp1 / 8 steps to run.  The first
                step starts INSN_SIZE bytes after the ADD, while the PC
                value the ADD reads is further ahead by PC_OFS (defined
                outside this file -- presumably 8 in ARM state, confirm in
                sysdep.h), so the constant adds the former and removes the
                latter.  */
  97         rsb     tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
  98         add     pc, pc, tmp1
  99         dispatch_step 7
 100         dispatch_step 6
 101         dispatch_step 5
 102         dispatch_step 4
 103         dispatch_step 3
 104         dispatch_step 2
 105         dispatch_step 1
             /* Drop the caller's dispatch_step so the next user of a
                dispatch macro can define its own.  */
 106         .purgem dispatch_step
 107         .endm
 108
 109         .macro dispatch_15_word
             /* Computed jump into a run of fifteen dispatch_step expansions.
                In:  tmp1 = bytes to copy (a multiple of 4, at most 60).
                Clobbers tmp1.

                Each step copies 4 bytes, but the arithmetic relies on it
                assembling to 8 bytes of code (one load plus one store).
                The number of data bytes to skip, (15 * 4 - tmp1), is
                therefore doubled by the LSL #1 in the ADD to get the code
                distance.  The PC_OFS and INSN_SIZE corrections are halved
                here because that doubling applies to them as well.  */
 110         rsb     tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
 111         add     pc, pc, tmp1, lsl #1
 112         dispatch_step 15
 113         dispatch_step 14
 114         dispatch_step 13
 115         dispatch_step 12
 116         dispatch_step 11
 117         dispatch_step 10
 118         dispatch_step 9
 119         dispatch_step 8
 120         dispatch_step 7
 121         dispatch_step 6
 122         dispatch_step 5
 123         dispatch_step 4
 124         dispatch_step 3
 125         dispatch_step 2
 126         dispatch_step 1
             /* Drop the caller's dispatch_step so the next user of a
                dispatch macro can define its own.  */
 127         .purgem dispatch_step
 128         .endm
 129 #else
 130 # if ARM_BX_ALIGN_LOG2 < 3
 131 #  error case not handled
 132 # endif
 133         .macro dispatch_helper steps, log2_bytes_per_step
             /* Shared front end of the ARM_ALWAYS_BX dispatch macros.
                STEPS is the number of dispatch_step slots that follow and
                LOG2_BYTES_PER_STEP is log2 of the data bytes one step
                copies.
                In:  tmp1 = bytes to copy, a multiple of the step size.
                Clobbers tmp1 and leaves through BX, never by writing PC
                directly.  The macro ends at local label 1, aligned to
                (1 << ARM_BX_ALIGN_LOG2) bytes; the caller places the first
                step there and keeps every step in its own aligned slot of
                that size.  */
 134         /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
 135            (STEPS << LOG2_BYTES_PER_STEP).
 136            So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
 137            Then it needs further adjustment to compensate for the
 138            distance between the PC value taken below (0f + PC_OFS)
 139            and the first step's instructions (1f).  */
 140         rsb     tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
 141                               + ((1f - PC_OFS - 0f) \
 142                                  >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
 143         /* Shifting down LOG2_BYTES_PER_STEP gives us the number of
 144            steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
 145            the (byte) distance to add to the PC.  */
 146 0:      add     tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
 147         bx      tmp1
 148         .p2align ARM_BX_ALIGN_LOG2
 149 1:
 150         .endm
 151
 152         .macro dispatch_7_dword
             /* ARM_ALWAYS_BX form of the seven-step double-word dispatch.
                In:  tmp1 = bytes to copy (a multiple of 8, at most 56).
                Clobbers tmp1.  dispatch_helper computes the target and
                jumps with BX; each step then sits in its own slot aligned
                to (1 << ARM_BX_ALIGN_LOG2) bytes so the slots are evenly
                spaced.  */
 153         dispatch_helper 7, 3
             /* dispatch_helper already ends on an aligned boundary, so
                this first alignment adds no padding.  */
 154         .p2align ARM_BX_ALIGN_LOG2
 155         dispatch_step 7
 156         .p2align ARM_BX_ALIGN_LOG2
 157         dispatch_step 6
 158         .p2align ARM_BX_ALIGN_LOG2
 159         dispatch_step 5
 160         .p2align ARM_BX_ALIGN_LOG2
 161         dispatch_step 4
 162         .p2align ARM_BX_ALIGN_LOG2
 163         dispatch_step 3
 164         .p2align ARM_BX_ALIGN_LOG2
 165         dispatch_step 2
 166         .p2align ARM_BX_ALIGN_LOG2
 167         dispatch_step 1
 168         .p2align ARM_BX_ALIGN_LOG2
             /* Drop the caller's dispatch_step so the next user of a
                dispatch macro can define its own.  */
 169         .purgem dispatch_step
 170         .endm
 171
 172         .macro dispatch_15_word
             /* ARM_ALWAYS_BX form of the fifteen-step word dispatch.
                In:  tmp1 = bytes to copy (a multiple of 4, at most 60).
                Clobbers tmp1.  dispatch_helper computes the target, jumps
                with BX and ends on an aligned boundary, so the first step
                needs no alignment of its own; every later step is padded
                to a (1 << ARM_BX_ALIGN_LOG2)-byte slot.  */
 173         dispatch_helper 15, 2
 174         dispatch_step 15
 175         .p2align ARM_BX_ALIGN_LOG2
 176         dispatch_step 14
 177         .p2align ARM_BX_ALIGN_LOG2
 178         dispatch_step 13
 179         .p2align ARM_BX_ALIGN_LOG2
 180         dispatch_step 12
 181         .p2align ARM_BX_ALIGN_LOG2
 182         dispatch_step 11
 183         .p2align ARM_BX_ALIGN_LOG2
 184         dispatch_step 10
 185         .p2align ARM_BX_ALIGN_LOG2
 186         dispatch_step 9
 187         .p2align ARM_BX_ALIGN_LOG2
 188         dispatch_step 8
 189         .p2align ARM_BX_ALIGN_LOG2
 190         dispatch_step 7
 191         .p2align ARM_BX_ALIGN_LOG2
 192         dispatch_step 6
 193         .p2align ARM_BX_ALIGN_LOG2
 194         dispatch_step 5
 195         .p2align ARM_BX_ALIGN_LOG2
 196         dispatch_step 4
 197         .p2align ARM_BX_ALIGN_LOG2
 198         dispatch_step 3
 199         .p2align ARM_BX_ALIGN_LOG2
 200         dispatch_step 2
 201         .p2align ARM_BX_ALIGN_LOG2
 202         dispatch_step 1
 203         .p2align ARM_BX_ALIGN_LOG2
             /* Drop the caller's dispatch_step so the next user of a
                dispatch macro can define its own.  */
 204         .purgem dispatch_step
 205         .endm
 206
 207 #endif
 208
 209 #ifndef USE_NEON
 210 /* For bulk copies using GP registers.  */
     /* A_l/A_h are the same registers as count (r2) and tmp1 (r3), which
        is why the loops that use them keep the remaining byte count in
        tmp2 instead.  The B, C and D pairs are saved by memcpy at
        [sp, #8] through [sp, #28] before first use and reloaded from
        there afterwards.  */
 211 #define A_l     r2              /* Call-clobbered.  */
 212 #define A_h     r3              /* Call-clobbered.  */
 213 #define B_l     r4
 214 #define B_h     r5
 215 #define C_l     r6
 216 #define C_h     r7
 217 /* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
 218 #define D_l     r10
 219 #define D_h     r11
 220 #endif
 221
 222 /* Number of lines ahead to pre-fetch data.  If you change this the code
 223    below will need adjustment to compensate.  */
     /* A line is 64 bytes in this file: cpy_line_vfp reads ahead by
        prefetch_lines * 64 bytes and the long VFP loop counts down in the
        same unit.  */
 224
 225 #define prefetch_lines  5
 226
 227 #ifdef USE_VFP
 228         .macro  cpy_line_vfp vreg, base
             /* Copy one 64-byte line at offset BASE with software
                pipelining: each double-word register is stored to dst and
                then immediately refilled from src.  The caller advances
                src by 32 before the first use, so src runs 32 bytes ahead
                of dst and every register already holds the data for the
                store that precedes its reload.
                In:  VREG = first double-word of this line, d0-d2 = the
                     next three double-words.
                Out: VREG = first double-word of the line prefetch_lines
                     ahead, d0-d2 = double-words 1-3 of the following line.
                sfi_breg is defined outside this file; \B presumably
                stands for the base register named in its first argument.
                The instruction order is deliberate scheduling; keep it.  */
 229         sfi_breg dst, \
 230         vstr    \vreg, [\B, #\base]
 231         sfi_breg src, \
 232         vldr    \vreg, [\B, #\base]
 233         sfi_breg dst, \
 234         vstr    d0, [\B, #\base + 8]
 235         sfi_breg src, \
 236         vldr    d0, [\B, #\base + 8]
 237         sfi_breg dst, \
 238         vstr    d1, [\B, #\base + 16]
 239         sfi_breg src, \
 240         vldr    d1, [\B, #\base + 16]
 241         sfi_breg dst, \
 242         vstr    d2, [\B, #\base + 24]
 243         sfi_breg src, \
 244         vldr    d2, [\B, #\base + 24]
 245         sfi_breg dst, \
 246         vstr    \vreg, [\B, #\base + 32]
             /* Read-ahead: with src biased by 32 this fetches the first
                double-word of the line prefetch_lines further on, standing
                in for a PLD.  */
 247         sfi_breg src, \
 248         vldr    \vreg, [\B, #\base + prefetch_lines * 64 - 32]
 249         sfi_breg dst, \
 250         vstr    d0, [\B, #\base + 40]
 251         sfi_breg src, \
 252         vldr    d0, [\B, #\base + 40]
 253         sfi_breg dst, \
 254         vstr    d1, [\B, #\base + 48]
 255         sfi_breg src, \
 256         vldr    d1, [\B, #\base + 48]
 257         sfi_breg dst, \
 258         vstr    d2, [\B, #\base + 56]
 259         sfi_breg src, \
 260         vldr    d2, [\B, #\base + 56]
 261         .endm
 262
 263         .macro  cpy_tail_vfp vreg, base
             /* Same store-then-reload sequence as cpy_line_vfp for one
                64-byte line at offset BASE, but without the read-ahead:
                after the second use of VREG nothing is loaded into it, so
                no data beyond the lines already fetched is touched.  Used
                to drain the pipeline once the main loop has finished.
                In:  VREG = first double-word of this line, d0-d2 = the
                     next three double-words.
                Out: d0-d2 = double-words 1-3 of the following line; VREG
                     holds double-word 4 of this line, already stored.  */
 264         sfi_breg dst, \
 265         vstr    \vreg, [\B, #\base]
 266         sfi_breg src, \
 267         vldr    \vreg, [\B, #\base]
 268         sfi_breg dst, \
 269         vstr    d0, [\B, #\base + 8]
 270         sfi_breg src, \
 271         vldr    d0, [\B, #\base + 8]
 272         sfi_breg dst, \
 273         vstr    d1, [\B, #\base + 16]
 274         sfi_breg src, \
 275         vldr    d1, [\B, #\base + 16]
 276         sfi_breg dst, \
 277         vstr    d2, [\B, #\base + 24]
 278         sfi_breg src, \
 279         vldr    d2, [\B, #\base + 24]
 280         sfi_breg dst, \
 281         vstr    \vreg, [\B, #\base + 32]
 282         sfi_breg dst, \
 283         vstr    d0, [\B, #\base + 40]
 284         sfi_breg src, \
 285         vldr    d0, [\B, #\base + 40]
 286         sfi_breg dst, \
 287         vstr    d1, [\B, #\base + 48]
 288         sfi_breg src, \
 289         vldr    d1, [\B, #\base + 48]
 290         sfi_breg dst, \
 291         vstr    d2, [\B, #\base + 56]
 292         sfi_breg src, \
 293         vldr    d2, [\B, #\base + 56]
 294         .endm
 295 #endif
 296
 297         .p2align 6
 298 ENTRY(memcpy)
             /* memcpy: copy COUNT bytes from SRC to DSTIN.
                In:    r0 = dstin, r1 = src, r2 = count.
                Out:   r0 is never written here, so it still holds dstin.
                Stack: copies of 64 bytes or more push a FRAME_SIZE-byte
                       frame with tmp2 at [sp]; the core-register bulk
                       loops also save B/C/D at [sp, #8] .. [sp, #28].

                COUNT is an unsigned size, so every branch that tests the
                remaining length uses an unsigned condition (hs/lo).  The
                signed forms (ge/lt/mi/pl) would misread any length with
                bit 31 set as negative and take the wrong path.  The
                length-tracking SUBS/CMP instructions set the carry flag
                exactly when no borrow occurred, i.e. when the unsigned
                value was at least the amount subtracted.  */
 299
 300         mov     dst, dstin      /* Preserve dstin, we need to return it.  */
 301         cmp     count, #64
 302         bhs     .Lcpy_not_short
 303         /* Deal with small copies quickly by dropping straight into the
 304            exit block.  */
 305
 306 .Ltail63unaligned:
 307 #ifdef USE_NEON
 308         /* These need an extra layer of macro just to work around a
 309            bug in the assembler's parser when an operand starts with
 310            a {...}.  http://sourceware.org/bugzilla/show_bug.cgi?id=15647
 311            tracks that bug; it was not fixed as of binutils-2.23.2.  */
 312         .macro neon_load_d0 reg
 313         vld1.8  {d0}, [\reg]!
 314         .endm
 315         .macro neon_store_d0 reg
 316         vst1.8  {d0}, [\reg]!
 317         .endm
 318
 319         /* These are used by the NaCl sfi_breg macro.  */
 320         .macro _sfi_breg_dmask_neon_load_d0 reg
 321         _sfi_dmask \reg
 322         .endm
 323         .macro _sfi_breg_dmask_neon_store_d0 reg
 324         _sfi_dmask \reg
 325         .endm
 326
 327         and     tmp1, count, #0x38
 328         .macro dispatch_step i
 329         sfi_breg src, neon_load_d0 \B
 330         sfi_breg dst, neon_store_d0 \B
 331         .endm
 332         dispatch_7_dword
 333
 334         tst     count, #4
 335         sfi_breg src, \
 336         ldrne   tmp1, [\B], #4
 337         sfi_breg dst, \
 338         strne   tmp1, [\B], #4
 339 #else
 340         /* Copy up to 15 full words of data.  May not be aligned.  */
 341         /* Cannot use VFP for unaligned data.  */
 342         and     tmp1, count, #0x3c
 343         add     dst, dst, tmp1
 344         add     src, src, tmp1
 345         /* Jump directly into the sequence below at the correct offset.  */
 346         .macro dispatch_step i
 347         sfi_breg src, \
 348         ldr     tmp1, [\B, #-(\i * 4)]
 349         sfi_breg dst, \
 350         str     tmp1, [\B, #-(\i * 4)]
 351         .endm
 352         dispatch_15_word
 353 #endif
 354
             /* Shift bit 1 of count into C and leave bit 0 deciding Z:
                CS copies a trailing half-word, NE a trailing byte.  */
 355         lsls    count, count, #31
 356         sfi_breg src, \
 357         ldrhcs  tmp1, [\B], #2
 358         sfi_breg src, \
 359         ldrbne  src, [\B]               /* Src is dead, use as a scratch.  */
 360         sfi_breg dst, \
 361         strhcs  tmp1, [\B], #2
 362         sfi_breg dst, \
 363         strbne  src, [\B]
 364         bx      lr
 365
 366 .Lcpy_not_short:
 367         /* At least 64 bytes to copy, but don't know the alignment yet.  */
 368         str     tmp2, [sp, #-FRAME_SIZE]!
 369         cfi_adjust_cfa_offset (FRAME_SIZE)
 370         cfi_rel_offset (tmp2, 0)
 371         cfi_remember_state
 372         and     tmp2, src, #7
 373         and     tmp1, dst, #7
 374         cmp     tmp1, tmp2
 375         bne     .Lcpy_notaligned
 376
 377 #ifdef USE_VFP
 378         /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
 379            that the FP pipeline is much better at streaming loads and
 380            stores.  This is outside the critical loop.  */
 381         vmov.f32        s0, s0
 382 #endif
 383
 384         /* SRC and DST have the same mutual 64-bit alignment, but we may
 385            still need to pre-copy some bytes to get to natural alignment.
 386            We bring SRC and DST into full 64-bit alignment.  */
             /* The MI/CS/NE conditions below test bits of the alignment
                distance held in tmp2, not the length, so they stay as
                they are.  */
 387         lsls    tmp2, dst, #29
 388         beq     1f
 389         rsbs    tmp2, tmp2, #0
 390         sub     count, count, tmp2, lsr #29
 391         sfi_breg src, \
 392         ldrmi   tmp1, [\B], #4
 393         sfi_breg dst, \
 394         strmi   tmp1, [\B], #4
 395         lsls    tmp2, tmp2, #2
 396         sfi_breg src, \
 397         ldrhcs  tmp1, [\B], #2
 398         sfi_breg src, \
 399         ldrbne  tmp2, [\B], #1
 400         sfi_breg dst, \
 401         strhcs  tmp1, [\B], #2
 402         sfi_breg dst, \
 403         strbne  tmp2, [\B], #1
 404
 405 1:
 406         subs    tmp2, count, #64        /* Use tmp2 for count.  */
 407         blo     .Ltail63aligned
 408
 409         cmp     tmp2, #512
 410         bhs     .Lcpy_body_long
 411
 412 .Lcpy_body_medium:                      /* Count in tmp2.  */
 413 #ifdef USE_VFP
 414 1:
 415         sfi_breg src, \
 416         vldr    d0, [\B, #0]
             /* The flags set here are consumed by the BHS at the bottom of
                the loop; nothing in between may alter them.  */
 417         subs    tmp2, tmp2, #64
 418         sfi_breg src, \
 419         vldr    d1, [\B, #8]
 420         sfi_breg dst, \
 421         vstr    d0, [\B, #0]
 422         sfi_breg src, \
 423         vldr    d0, [\B, #16]
 424         sfi_breg dst, \
 425         vstr    d1, [\B, #8]
 426         sfi_breg src, \
 427         vldr    d1, [\B, #24]
 428         sfi_breg dst, \
 429         vstr    d0, [\B, #16]
 430         sfi_breg src, \
 431         vldr    d0, [\B, #32]
 432         sfi_breg dst, \
 433         vstr    d1, [\B, #24]
 434         sfi_breg src, \
 435         vldr    d1, [\B, #40]
 436         sfi_breg dst, \
 437         vstr    d0, [\B, #32]
 438         sfi_breg src, \
 439         vldr    d0, [\B, #48]
 440         sfi_breg dst, \
 441         vstr    d1, [\B, #40]
 442         sfi_breg src, \
 443         vldr    d1, [\B, #56]
 444         sfi_breg dst, \
 445         vstr    d0, [\B, #48]
 446         add     src, src, #64
 447         sfi_breg dst, \
 448         vstr    d1, [\B, #56]
 449         add     dst, dst, #64
 450         bhs     1b
 451         tst     tmp2, #0x3f
 452         beq     .Ldone
 453
 454 .Ltail63aligned:                        /* Count in tmp2.  */
 455         and     tmp1, tmp2, #0x38
 456         add     dst, dst, tmp1
 457         add     src, src, tmp1
 458         .macro dispatch_step i
 459         sfi_breg src, \
 460         vldr    d0, [\B, #-(\i * 8)]
 461         sfi_breg dst, \
 462         vstr    d0, [\B, #-(\i * 8)]
 463         .endm
 464         dispatch_7_dword
 465 #else
 466         sub     src, src, #8
 467         sub     dst, dst, #8
 468 1:
 469         sfi_breg src, \
 470         ldrd    A_l, A_h, [\B, #8]
 471         sfi_breg dst, \
 472         strd    A_l, A_h, [\B, #8]
 473         sfi_breg src, \
 474         ldrd    A_l, A_h, [\B, #16]
 475         sfi_breg dst, \
 476         strd    A_l, A_h, [\B, #16]
 477         sfi_breg src, \
 478         ldrd    A_l, A_h, [\B, #24]
 479         sfi_breg dst, \
 480         strd    A_l, A_h, [\B, #24]
 481         sfi_breg src, \
 482         ldrd    A_l, A_h, [\B, #32]
 483         sfi_breg dst, \
 484         strd    A_l, A_h, [\B, #32]
 485         sfi_breg src, \
 486         ldrd    A_l, A_h, [\B, #40]
 487         sfi_breg dst, \
 488         strd    A_l, A_h, [\B, #40]
 489         sfi_breg src, \
 490         ldrd    A_l, A_h, [\B, #48]
 491         sfi_breg dst, \
 492         strd    A_l, A_h, [\B, #48]
 493         sfi_breg src, \
 494         ldrd    A_l, A_h, [\B, #56]
 495         sfi_breg dst, \
 496         strd    A_l, A_h, [\B, #56]
 497         sfi_breg src, \
 498         ldrd    A_l, A_h, [\B, #64]!
 499         sfi_breg dst, \
 500         strd    A_l, A_h, [\B, #64]!
 501         subs    tmp2, tmp2, #64
 502         bhs     1b
 503         tst     tmp2, #0x3f
 504         bne     1f
 505         ldr     tmp2,[sp], #FRAME_SIZE
 506         cfi_adjust_cfa_offset (-FRAME_SIZE)
 507         cfi_restore (tmp2)
 508         bx      lr
 509
 510         cfi_restore_state
 511         cfi_remember_state
 512 1:
 513         add     src, src, #8
 514         add     dst, dst, #8
 515
 516 .Ltail63aligned:                        /* Count in tmp2.  */
 517         /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
 518            we know that the src and dest are 64-bit aligned so we can use
 519            LDRD/STRD to improve efficiency.  */
 520         /* TMP2 is now negative, but we don't care about that.  The bottom
 521            six bits still tell us how many bytes are left to copy.  */
 522
 523         and     tmp1, tmp2, #0x38
 524         add     dst, dst, tmp1
 525         add     src, src, tmp1
 526         .macro dispatch_step i
 527         sfi_breg src, \
 528         ldrd    A_l, A_h, [\B, #-(\i * 8)]
 529         sfi_breg dst, \
 530         strd    A_l, A_h, [\B, #-(\i * 8)]
 531         .endm
 532         dispatch_7_dword
 533 #endif
 534
 535         tst     tmp2, #4
 536         sfi_breg src, \
 537         ldrne   tmp1, [\B], #4
 538         sfi_breg dst, \
 539         strne   tmp1, [\B], #4
 540         lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead. */
 541         sfi_breg src, \
 542         ldrhcs  tmp1, [\B], #2
 543         sfi_breg src, \
 544         ldrbne  tmp2, [\B]
 545         sfi_breg dst, \
 546         strhcs  tmp1, [\B], #2
 547         sfi_breg dst, \
 548         strbne  tmp2, [\B]
 549
 550 .Ldone:
 551         ldr     tmp2, [sp], #FRAME_SIZE
 552         cfi_adjust_cfa_offset (-FRAME_SIZE)
 553         cfi_restore (tmp2)
 554         bx      lr
 555
 556         cfi_restore_state
 557         cfi_remember_state
 558
 559 .Lcpy_body_long:                        /* Count in tmp2.  */
 560
 561         /* Long copy.  We know that there's at least (prefetch_lines * 64)
 562            bytes to go.  */
 563 #ifdef USE_VFP
 564         /* Don't use PLD.  Instead, read some data in advance of the current
 565            copy position into a register.  This should act like a PLD
 566            operation but we won't have to repeat the transfer.  */
 567
 568         sfi_breg src, \
 569         vldr    d3, [\B, #0]
 570         sfi_breg src, \
 571         vldr    d4, [\B, #64]
 572         sfi_breg src, \
 573         vldr    d5, [\B, #128]
 574         sfi_breg src, \
 575         vldr    d6, [\B, #192]
 576         sfi_breg src, \
 577         vldr    d7, [\B, #256]
 578
 579         sfi_breg src, \
 580         vldr    d0, [\B, #8]
 581         sfi_breg src, \
 582         vldr    d1, [\B, #16]
 583         sfi_breg src, \
 584         vldr    d2, [\B, #24]
             /* From here on src runs 32 bytes ahead of dst, which is what
                cpy_line_vfp and cpy_tail_vfp expect.  */
 585         add     src, src, #32
 586
 587         subs    tmp2, tmp2, #prefetch_lines * 64 * 2
 588         blo     2f
 589 1:
 590         cpy_line_vfp    d3, 0
 591         cpy_line_vfp    d4, 64
 592         cpy_line_vfp    d5, 128
 593         add     dst, dst, #3 * 64
 594         add     src, src, #3 * 64
 595         cpy_line_vfp    d6, 0
 596         cpy_line_vfp    d7, 64
 597         add     dst, dst, #2 * 64
 598         add     src, src, #2 * 64
 599         subs    tmp2, tmp2, #prefetch_lines * 64
 600         bhs     1b
 601
 602 2:
 603         cpy_tail_vfp    d3, 0
 604         cpy_tail_vfp    d4, 64
 605         cpy_tail_vfp    d5, 128
 606         add     src, src, #3 * 64
 607         add     dst, dst, #3 * 64
 608         cpy_tail_vfp    d6, 0
 609         sfi_breg dst, \
 610         vstr    d7, [\B, #64]
 611         sfi_breg src, \
 612         vldr    d7, [\B, #64]
 613         sfi_breg dst, \
 614         vstr    d0, [\B, #64 + 8]
 615         sfi_breg src, \
 616         vldr    d0, [\B, #64 + 8]
 617         sfi_breg dst, \
 618         vstr    d1, [\B, #64 + 16]
 619         sfi_breg src, \
 620         vldr    d1, [\B, #64 + 16]
 621         sfi_breg dst, \
 622         vstr    d2, [\B, #64 + 24]
 623         sfi_breg src, \
 624         vldr    d2, [\B, #64 + 24]
 625         sfi_breg dst, \
 626         vstr    d7, [\B, #64 + 32]
             /* 96 rather than 128: this also removes the 32-byte lead src
                had over dst, so both pointers are level again for
                .Lcpy_body_medium.  */
 627         add     src, src, #96
 628         sfi_breg dst, \
 629         vstr    d0, [\B, #64 + 40]
 630         sfi_breg dst, \
 631         vstr    d1, [\B, #64 + 48]
 632         sfi_breg dst, \
 633         vstr    d2, [\B, #64 + 56]
 634         add     dst, dst, #128
 635         add     tmp2, tmp2, #prefetch_lines * 64
 636         b       .Lcpy_body_medium
 637 #else
 638         /* Long copy.  Use an SMS style loop to maximize the I/O
 639            bandwidth of the core.  We don't have enough spare registers
 640            to synthesise prefetching, so use PLD operations.  */
 641         /* Pre-bias src and dst.  */
 642         sub     src, src, #8
 643         sub     dst, dst, #8
 644         sfi_pld src, #8
 645         sfi_pld src, #72
             /* The flags set here are consumed by the BCS at the bottom of
                the loop (carry set = no borrow, an unsigned test).  */
 646         subs    tmp2, tmp2, #64
 647         sfi_pld src, #136
 648         sfi_breg src, \
 649         ldrd    A_l, A_h, [\B, #8]
 650         strd    B_l, B_h, [sp, #8]
 651         cfi_rel_offset (B_l, 8)
 652         cfi_rel_offset (B_h, 12)
 653         sfi_breg src, \
 654         ldrd    B_l, B_h, [\B, #16]
 655         strd    C_l, C_h, [sp, #16]
 656         cfi_rel_offset (C_l, 16)
 657         cfi_rel_offset (C_h, 20)
 658         sfi_breg src, \
 659         ldrd    C_l, C_h, [\B, #24]
 660         strd    D_l, D_h, [sp, #24]
 661         cfi_rel_offset (D_l, 24)
 662         cfi_rel_offset (D_h, 28)
 663         sfi_pld src, #200
 664         sfi_breg src, \
 665         ldrd    D_l, D_h, [\B, #32]!
 666         b       1f
 667         .p2align        6
 668 2:
 669         sfi_pld src, #232
 670         sfi_breg dst, \
 671         strd    A_l, A_h, [\B, #40]
 672         sfi_breg src, \
 673         ldrd    A_l, A_h, [\B, #40]
 674         sfi_breg dst, \
 675         strd    B_l, B_h, [\B, #48]
 676         sfi_breg src, \
 677         ldrd    B_l, B_h, [\B, #48]
 678         sfi_breg dst, \
 679         strd    C_l, C_h, [\B, #56]
 680         sfi_breg src, \
 681         ldrd    C_l, C_h, [\B, #56]
 682         sfi_breg dst, \
 683         strd    D_l, D_h, [\B, #64]!
 684         sfi_breg src, \
 685         ldrd    D_l, D_h, [\B, #64]!
 686         subs    tmp2, tmp2, #64
 687 1:
 688         sfi_breg dst, \
 689         strd    A_l, A_h, [\B, #8]
 690         sfi_breg src, \
 691         ldrd    A_l, A_h, [\B, #8]
 692         sfi_breg dst, \
 693         strd    B_l, B_h, [\B, #16]
 694         sfi_breg src, \
 695         ldrd    B_l, B_h, [\B, #16]
 696         sfi_breg dst, \
 697         strd    C_l, C_h, [\B, #24]
 698         sfi_breg src, \
 699         ldrd    C_l, C_h, [\B, #24]
 700         sfi_breg dst, \
 701         strd    D_l, D_h, [\B, #32]
 702         sfi_breg src, \
 703         ldrd    D_l, D_h, [\B, #32]
 704         bcs     2b
 705         /* Save the remaining bytes and restore the callee-saved regs.  */
 706         sfi_breg dst, \
 707         strd    A_l, A_h, [\B, #40]
 708         add     src, src, #40
 709         sfi_breg dst, \
 710         strd    B_l, B_h, [\B, #48]
 711         ldrd    B_l, B_h, [sp, #8]
 712         cfi_restore (B_l)
 713         cfi_restore (B_h)
 714         sfi_breg dst, \
 715         strd    C_l, C_h, [\B, #56]
 716         ldrd    C_l, C_h, [sp, #16]
 717         cfi_restore (C_l)
 718         cfi_restore (C_h)
 719         sfi_breg dst, \
 720         strd    D_l, D_h, [\B, #64]
 721         ldrd    D_l, D_h, [sp, #24]
 722         cfi_restore (D_l)
 723         cfi_restore (D_h)
 724         add     dst, dst, #72
 725         tst     tmp2, #0x3f
 726         bne     .Ltail63aligned
 727         ldr     tmp2, [sp], #FRAME_SIZE
 728         cfi_adjust_cfa_offset (-FRAME_SIZE)
 729         cfi_restore (tmp2)
 730         bx      lr
 731 #endif
 732
 733         cfi_restore_state
 734         cfi_remember_state
 735
 736 .Lcpy_notaligned:
 737         sfi_pld src
 738         sfi_pld src, #64
 739         /* There's at least 64 bytes to copy, but there is no mutual
 740            alignment.  */
 741         /* Bring DST to 64-bit alignment.  */
 742         lsls    tmp2, dst, #29
 743         sfi_pld src, #(2 * 64)
 744         beq     1f
 745         rsbs    tmp2, tmp2, #0
 746         sub     count, count, tmp2, lsr #29
 747         sfi_breg src, \
 748         ldrmi   tmp1, [\B], #4
 749         sfi_breg dst, \
 750         strmi   tmp1, [\B], #4
 751         lsls    tmp2, tmp2, #2
 752         sfi_breg src, \
 753         ldrbne  tmp1, [\B], #1
 754         sfi_breg src, \
 755         ldrhcs  tmp2, [\B], #2
 756         sfi_breg dst, \
 757         strbne  tmp1, [\B], #1
 758         sfi_breg dst, \
 759         strhcs  tmp2, [\B], #2
 760 1:
 761         sfi_pld src, #(3 * 64)
 762         subs    count, count, #64
             /* Fewer than 64 bytes left: pop the frame and finish in the
                short-copy tail, which returns without touching the stack.  */
 763         ldrlo   tmp2, [sp], #FRAME_SIZE
 764         blo     .Ltail63unaligned
 765         sfi_pld src, #(4 * 64)
 766
 767 #ifdef USE_NEON
 768         /* These need an extra layer of macro just to work around a
 769            bug in the assembler's parser when an operand starts with
 770            a {...}.  */
 771         .macro neon_load_multi reglist, basereg
 772         vld1.8  {\reglist}, [\basereg]!
 773         .endm
 774         .macro neon_store_multi reglist, basereg
 775         vst1.8  {\reglist}, [ALIGN (\basereg, 64)]!
 776         .endm
 777
 778         /* These are used by the NaCl sfi_breg macro.  */
 779         .macro _sfi_breg_dmask_neon_load_multi reg
 780         _sfi_dmask \reg
 781         .endm
 782         .macro _sfi_breg_dmask_neon_store_multi reg
 783         _sfi_dmask \reg
 784         .endm
 785
 786         sfi_breg src, neon_load_multi d0-d3, \B
 787         sfi_breg src, neon_load_multi d4-d7, \B
 788         subs    count, count, #64
 789         blo     2f
 790 1:
 791         sfi_pld src, #(4 * 64)
 792         sfi_breg dst, neon_store_multi d0-d3, \B
 793         sfi_breg src, neon_load_multi d0-d3, \B
 794         sfi_breg dst, neon_store_multi d4-d7, \B
 795         sfi_breg src, neon_load_multi d4-d7, \B
 796         subs    count, count, #64
 797         bhs     1b
 798 2:
 799         sfi_breg dst, neon_store_multi d0-d3, \B
 800         sfi_breg dst, neon_store_multi d4-d7, \B
 801         ands    count, count, #0x3f
 802 #else
 803         /* Use an SMS style loop to maximize the I/O bandwidth.  */
 804         sub     src, src, #4
 805         sub     dst, dst, #8
 806         subs    tmp2, count, #64        /* Use tmp2 for count.  */
 807         sfi_breg src, \
 808         ldr     A_l, [\B, #4]
 809         sfi_breg src, \
 810         ldr     A_h, [\B, #8]
 811         strd    B_l, B_h, [sp, #8]
 812         cfi_rel_offset (B_l, 8)
 813         cfi_rel_offset (B_h, 12)
 814         sfi_breg src, \
 815         ldr     B_l, [\B, #12]
 816         sfi_breg src, \
 817         ldr     B_h, [\B, #16]
 818         strd    C_l, C_h, [sp, #16]
 819         cfi_rel_offset (C_l, 16)
 820         cfi_rel_offset (C_h, 20)
 821         sfi_breg src, \
 822         ldr     C_l, [\B, #20]
 823         sfi_breg src, \
 824         ldr     C_h, [\B, #24]
 825         strd    D_l, D_h, [sp, #24]
 826         cfi_rel_offset (D_l, 24)
 827         cfi_rel_offset (D_h, 28)
 828         sfi_breg src, \
 829         ldr     D_l, [\B, #28]
 830         sfi_breg src, \
 831         ldr     D_h, [\B, #32]!
 832         b       1f
 833         .p2align        6
 834 2:
 835         sfi_pld src, #(5 * 64) - (32 - 4)
 836         sfi_breg dst, \
 837         strd    A_l, A_h, [\B, #40]
 838         sfi_breg src, \
 839         ldr     A_l, [\B, #36]
 840         sfi_breg src, \
 841         ldr     A_h, [\B, #40]
 842         sfi_breg dst, \
 843         strd    B_l, B_h, [\B, #48]
 844         sfi_breg src, \
 845         ldr     B_l, [\B, #44]
 846         sfi_breg src, \
 847         ldr     B_h, [\B, #48]
 848         sfi_breg dst, \
 849         strd    C_l, C_h, [\B, #56]
 850         sfi_breg src, \
 851         ldr     C_l, [\B, #52]
 852         sfi_breg src, \
 853         ldr     C_h, [\B, #56]
 854         sfi_breg dst, \
 855         strd    D_l, D_h, [\B, #64]!
 856         sfi_breg src, \
 857         ldr     D_l, [\B, #60]
 858         sfi_breg src, \
 859         ldr     D_h, [\B, #64]!
 860         subs    tmp2, tmp2, #64
 861 1:
 862         sfi_breg dst, \
 863         strd    A_l, A_h, [\B, #8]
 864         sfi_breg src, \
 865         ldr     A_l, [\B, #4]
 866         sfi_breg src, \
 867         ldr     A_h, [\B, #8]
 868         sfi_breg dst, \
 869         strd    B_l, B_h, [\B, #16]
 870         sfi_breg src, \
 871         ldr     B_l, [\B, #12]
 872         sfi_breg src, \
 873         ldr     B_h, [\B, #16]
 874         sfi_breg dst, \
 875         strd    C_l, C_h, [\B, #24]
 876         sfi_breg src, \
 877         ldr     C_l, [\B, #20]
 878         sfi_breg src, \
 879         ldr     C_h, [\B, #24]
 880         sfi_breg dst, \
 881         strd    D_l, D_h, [\B, #32]
 882         sfi_breg src, \
 883         ldr     D_l, [\B, #28]
 884         sfi_breg src, \
 885         ldr     D_h, [\B, #32]
 886         bcs     2b
 887
 888         /* Save the remaining bytes and restore the callee-saved regs.  */
 889         sfi_breg dst, \
 890         strd    A_l, A_h, [\B, #40]
 891         add     src, src, #36
 892         sfi_breg dst, \
 893         strd    B_l, B_h, [\B, #48]
 894         ldrd    B_l, B_h, [sp, #8]
 895         cfi_restore (B_l)
 896         cfi_restore (B_h)
 897         sfi_breg dst, \
 898         strd    C_l, C_h, [\B, #56]
 899         ldrd    C_l, C_h, [sp, #16]
 900         cfi_restore (C_l)
 901         cfi_restore (C_h)
 902         sfi_breg dst, \
 903         strd    D_l, D_h, [\B, #64]
 904         ldrd    D_l, D_h, [sp, #24]
 905         cfi_restore (D_l)
 906         cfi_restore (D_h)
 907         add     dst, dst, #72
 908         ands    count, tmp2, #0x3f
 909 #endif
             /* The Z flag from the ANDS above survives the LDR: NE means
                1-63 bytes are still left for the short-copy tail.  */
 910         ldr     tmp2, [sp], #FRAME_SIZE
 911         cfi_adjust_cfa_offset (-FRAME_SIZE)
 912         cfi_restore (tmp2)
 913         bne     .Ltail63unaligned
 914         bx      lr
 915
 916 END(memcpy)
 917 libc_hidden_builtin_def (memcpy)