/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
   Copyright (C) 2013-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses  */

/* Thumb cannot encode negative immediate offsets in memory operations.  */

#include <arm-features.h>

/* This implementation requires ARM state.  */

#elif defined (MEMCPY_VFP)
# define FRAME_SIZE 32
# define FRAME_SIZE 32

#define ALIGN(addr, align) addr:align
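
/* ALIGN(addr, align) expands to the addr:align form used for NEON
   address-alignment hints.  For illustration only (these operands are
   assumed, not taken from the code below):
       vst1.8  {d0-d3}, [ALIGN (r0, 64)]!
   becomes
       vst1.8  {d0-d3}, [r0:64]!
   which tells the core that the address in r0 is 64-bit aligned.  */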

/* Call parameters.  */

/* These two macros both work by repeated invocation of the macro
   dispatch_step (not defined here).  That macro performs one "step",
   doing one load instruction and one store instruction to copy one
   "unit".  On entry, TMP1 contains the number of bytes to be copied,
   a multiple of the unit size.  The macro clobbers TMP1 in the
   process of doing a computed jump to the tail containing the
   appropriate number of steps.

   In dispatch_7_dword, dispatch_step is invoked seven times, with an
   argument that is 7 for the first and 1 for the last.  Units are
   double-words (8 bytes).  TMP1 is at most 56.

   In dispatch_15_word, dispatch_step is invoked fifteen times,
   with an argument that is 15 for the first and 1 for the last.
   Units are words (4 bytes).  TMP1 is at most 60.  */
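
/* Worked example (illustrative, not from the original source): in ARM state
   PC_OFS is 8 and INSN_SIZE is 4, and each dispatch_step expands to one
   load/store pair, i.e. 8 bytes of code in the double-word case.  With 24
   bytes left, dispatch_7_dword therefore computes
   tmp1 = 56 - 8 + 4 - 24 = 28; adding that to the pc (which reads as the
   address of the add plus 8) lands 32 bytes past the first step, skipping
   four of the seven steps so that the last three ldrd/strd pairs copy the
   final 24 bytes.  */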

# if ARM_BX_ALIGN_LOG2 != 2
# error case not handled

        .macro dispatch_7_dword
        rsb     tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
        .purgem dispatch_step

        .macro dispatch_15_word
        rsb     tmp1, tmp1, #((15 * 4) - PC_OFS/2 + INSN_SIZE/2)
        add     pc, pc, tmp1, lsl #1
        .purgem dispatch_step

# if ARM_BX_ALIGN_LOG2 < 3
# error case not handled

        .macro dispatch_helper steps, log2_bytes_per_step
        /* TMP1 gets (max_bytes - bytes_to_copy), where max_bytes is
           (STEPS << LOG2_BYTES_PER_STEP).
           So this is (steps_to_skip << LOG2_BYTES_PER_STEP).
           Then it needs further adjustment to compensate for the
           distance between the PC value taken below (0f + PC_OFS)
           and the first step's instructions (1f).  */
        rsb     tmp1, tmp1, #((\steps << \log2_bytes_per_step) \
                              + ((1f - PC_OFS - 0f) \
                                 >> (ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)))
        /* Shifting down LOG2_BYTES_PER_STEP gives us the number of
           steps to skip, then shifting up ARM_BX_ALIGN_LOG2 gives us
           the (byte) distance to add to the PC.  */
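        /* Illustration (assumed values, not from the original source): with
           ARM_BX_ALIGN_LOG2 == 3 each step's code is padded to 8 bytes.  A
           dispatch_15_word call that has 20 bytes left must skip
           15 - 5 = 10 word steps; TMP1 becomes (60 - 20) plus the label
           correction, the lsl #(3 - 2) in the add below scales that to
           10 * 8 = 80 bytes, and the computed branch lands at 1f + 80, the
           eleventh slot, so only the last five ldr/str steps execute.  */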
0:      add     tmp1, pc, tmp1, lsl #(ARM_BX_ALIGN_LOG2 - \log2_bytes_per_step)
        .p2align ARM_BX_ALIGN_LOG2

        .macro dispatch_7_dword
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .purgem dispatch_step

        .macro dispatch_15_word
        dispatch_helper 15, 2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .p2align ARM_BX_ALIGN_LOG2
        .purgem dispatch_step

/* For bulk copies using GP registers.  */
#define A_l     r2      /* Call-clobbered.  */
#define A_h     r3      /* Call-clobbered.  */

/* Don't use the pair r8,r9 because in some EABI variants r9 is reserved.  */
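/* Under the AAPCS r9 may be given a platform-specific role (static base,
   thread register, or simply callee-saved), which is presumably why the
   r8/r9 pair is avoided for the bulk-copy register set.  */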

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */
#define prefetch_lines  5
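
/* With the 64-byte lines used by the copy loop this is a read-ahead of
   5 * 64 = 320 bytes: cpy_line_vfp below pulls one double-word from
   prefetch_lines * 64 - 32 beyond the current position, which is what
   stands in for an explicit PLD (see the comment at .Lcpy_body_long).  */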

        .macro cpy_line_vfp vreg, base
        vstr    \vreg, [\B, #\base]
        vldr    \vreg, [\B, #\base]
        vstr    d0, [\B, #\base + 8]
        vldr    d0, [\B, #\base + 8]
        vstr    d1, [\B, #\base + 16]
        vldr    d1, [\B, #\base + 16]
        vstr    d2, [\B, #\base + 24]
        vldr    d2, [\B, #\base + 24]
        vstr    \vreg, [\B, #\base + 32]
        vldr    \vreg, [\B, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [\B, #\base + 40]
        vldr    d0, [\B, #\base + 40]
        vstr    d1, [\B, #\base + 48]
        vldr    d1, [\B, #\base + 48]
        vstr    d2, [\B, #\base + 56]
        vldr    d2, [\B, #\base + 56]
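
        /* In cpy_line_vfp each vstr writes out data that an earlier vldr
           (from a previous invocation of the macro) brought in, and each
           vldr refills that register for a later store; the single load
           taken from prefetch_lines * 64 - 32 ahead is the read-ahead noted
           at prefetch_lines above.  Here \B is the sfi_breg-supplied base
           register (dst for the stores, src for the loads in the full
           routine).  */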

        .macro cpy_tail_vfp vreg, base
        vstr    \vreg, [\B, #\base]
        vldr    \vreg, [\B, #\base]
        vstr    d0, [\B, #\base + 8]
        vldr    d0, [\B, #\base + 8]
        vstr    d1, [\B, #\base + 16]
        vldr    d1, [\B, #\base + 16]
        vstr    d2, [\B, #\base + 24]
        vldr    d2, [\B, #\base + 24]
        vstr    \vreg, [\B, #\base + 32]
        vstr    d0, [\B, #\base + 40]
        vldr    d0, [\B, #\base + 40]
        vstr    d1, [\B, #\base + 48]
        vldr    d1, [\B, #\base + 48]
        vstr    d2, [\B, #\base + 56]
        vldr    d2, [\B, #\base + 56]
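
        /* cpy_tail_vfp is the wind-down variant of cpy_line_vfp: same
           store/load interleaving, but the far look-ahead load of \vreg is
           dropped because no further read-ahead is wanted near the end of
           the buffer.  */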

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */

        /* Deal with small copies quickly by dropping straight into the
           tail handlers.  */

        /* These need an extra layer of macro just to work around a
           bug in the assembler's parser when an operand starts with
           a {...}.  http://sourceware.org/bugzilla/show_bug.cgi?id=15647
           tracks that bug; it was not fixed as of binutils-2.23.2.  */
        .macro neon_load_d0 reg
        .macro neon_store_d0 reg

        /* These are used by the NaCl sfi_breg macro.  */
        .macro _sfi_breg_dmask_neon_load_d0 reg
        .macro _sfi_breg_dmask_neon_store_d0 reg

        and     tmp1, count, #0x38
        .macro dispatch_step i
        sfi_breg src, neon_load_d0 \B
        sfi_breg dst, neon_store_d0 \B

        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c

        /* Jump directly into the sequence below at the correct offset.  */
        .macro dispatch_step i
        ldr     tmp1, [\B, #-(\i * 4)]
        str     tmp1, [\B, #-(\i * 4)]

        lsls    count, count, #31
        ldrhcs  tmp1, [\B], #2
        ldrbne  src, [\B]       /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [\B], #2
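        /* The lsls #31 above is a flag trick: it shifts bit 1 of count into
           the carry flag and bit 0 into the N/Z flags, so the CS halfword
           access and the NE byte access between them copy the last 0-3
           bytes without any explicit compare.  */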

        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        cfi_adjust_cfa_offset (FRAME_SIZE)
        cfi_rel_offset (tmp2, 0)

        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment.  */
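        /* A sketch of the arithmetic below, assuming (as in the full
           routine) that tmp2 has been set to -(dst << 29), i.e. the low
           three bits of dst negated and moved to the top of the register:
           tmp2, lsr #29 is then 8 - (dst & 7), the number of leading bytes
           to pre-copy, and the sub trims exactly that much from count
           before the conditional byte/halfword copies run.  */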
        sub     count, count, tmp2, lsr #29
        ldrhcs  tmp1, [\B], #2
        ldrbne  tmp2, [\B], #1
        strhcs  tmp1, [\B], #2
        strbne  tmp2, [\B], #1

        subs    tmp2, count, #64        /* Use tmp2 for count.  */

.Lcpy_body_medium:                      /* Count in tmp2.  */

.Ltail63aligned:                        /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        .macro dispatch_step i
        vldr    d0, [\B, #-(\i * 8)]
        vstr    d0, [\B, #-(\i * 8)]

        ldrd    A_l, A_h, [\B, #8]
        strd    A_l, A_h, [\B, #8]
        ldrd    A_l, A_h, [\B, #16]
        strd    A_l, A_h, [\B, #16]
        ldrd    A_l, A_h, [\B, #24]
        strd    A_l, A_h, [\B, #24]
        ldrd    A_l, A_h, [\B, #32]
        strd    A_l, A_h, [\B, #32]
        ldrd    A_l, A_h, [\B, #40]
        strd    A_l, A_h, [\B, #40]
        ldrd    A_l, A_h, [\B, #48]
        strd    A_l, A_h, [\B, #48]
        ldrd    A_l, A_h, [\B, #56]
        strd    A_l, A_h, [\B, #56]
        ldrd    A_l, A_h, [\B, #64]!
        strd    A_l, A_h, [\B, #64]!

        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)

.Ltail63aligned:                        /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */
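        /* For illustration: if 28 bytes remain after the 64-byte loop, tmp2
           holds 28 - 64 = -36; AND-ing with 0x38 below still yields 24,
           selecting three LDRD/STRD steps, and the remaining 4 bytes fall
           through to the smaller tail copies.  */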
        and     tmp1, tmp2, #0x38
        .macro dispatch_step i
        ldrd    A_l, A_h, [\B, #-(\i * 8)]
        strd    A_l, A_h, [\B, #-(\i * 8)]

        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [\B], #2
        strhcs  tmp1, [\B], #2

        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)

.Lcpy_body_long:                        /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */

        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
        subs    tmp2, tmp2, #prefetch_lines * 64 * 2

        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64

        add     src, src, #3 * 64
        add     dst, dst, #3 * 64

        vstr    d0, [\B, #64 + 8]
        vldr    d0, [\B, #64 + 8]
        vstr    d1, [\B, #64 + 16]
        vldr    d1, [\B, #64 + 16]
        vstr    d2, [\B, #64 + 24]
        vldr    d2, [\B, #64 + 24]
        vstr    d7, [\B, #64 + 32]
        vstr    d0, [\B, #64 + 40]
        vstr    d1, [\B, #64 + 48]
        vstr    d2, [\B, #64 + 56]
        add     tmp2, tmp2, #prefetch_lines * 64

        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
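        /* "SMS style" refers to software pipelining (software modulo
           scheduling): in the loop below the STRD instructions write out
           the A/B/C/D pairs that were loaded on the previous iteration
           while the LDRD instructions refill them for the next one, so
           loads and stores stay overlapped throughout the copy.  */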
        /* Pre-bias src and dst.  */

        ldrd    A_l, A_h, [\B, #8]
        strd    B_l, B_h, [sp, #8]
        cfi_rel_offset (B_l, 8)
        cfi_rel_offset (B_h, 12)
        ldrd    B_l, B_h, [\B, #16]
        strd    C_l, C_h, [sp, #16]
        cfi_rel_offset (C_l, 16)
        cfi_rel_offset (C_h, 20)
        ldrd    C_l, C_h, [\B, #24]
        strd    D_l, D_h, [sp, #24]
        cfi_rel_offset (D_l, 24)
        cfi_rel_offset (D_h, 28)
        ldrd    D_l, D_h, [\B, #32]!

        strd    A_l, A_h, [\B, #40]
        ldrd    A_l, A_h, [\B, #40]
        strd    B_l, B_h, [\B, #48]
        ldrd    B_l, B_h, [\B, #48]
        strd    C_l, C_h, [\B, #56]
        ldrd    C_l, C_h, [\B, #56]
        strd    D_l, D_h, [\B, #64]!
        ldrd    D_l, D_h, [\B, #64]!

        strd    A_l, A_h, [\B, #8]
        ldrd    A_l, A_h, [\B, #8]
        strd    B_l, B_h, [\B, #16]
        ldrd    B_l, B_h, [\B, #16]
        strd    C_l, C_h, [\B, #24]
        ldrd    C_l, C_h, [\B, #24]
        strd    D_l, D_h, [\B, #32]
        ldrd    D_l, D_h, [\B, #32]

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [\B, #40]
        strd    B_l, B_h, [\B, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [\B, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [\B, #64]
        ldrd    D_l, D_h, [sp, #24]

        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)

        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
        sfi_pld src, #(2 * 64)
        sub     count, count, tmp2, lsr #29
        ldrbne  tmp1, [\B], #1
        ldrhcs  tmp2, [\B], #2
        strbne  tmp1, [\B], #1
        strhcs  tmp2, [\B], #2

        sfi_pld src, #(3 * 64)
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned
        sfi_pld src, #(4 * 64)

        /* These need an extra layer of macro just to work around a
           bug in the assembler's parser when an operand starts with
           a {...}.  */
        .macro neon_load_multi reglist, basereg
        vld1.8  {\reglist}, [\basereg]!
        .macro neon_store_multi reglist, basereg
        vst1.8  {\reglist}, [ALIGN (\basereg, 64)]!
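        /* The loads above make no alignment claim because SRC may be
           arbitrarily aligned, while the stores go through
           ALIGN (\basereg, 64): the resulting :64 qualifier promises 64-bit
           alignment, which is safe because DST was brought to 64-bit
           alignment before this copy loop.  */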

        /* These are used by the NaCl sfi_breg macro.  */
        .macro _sfi_breg_dmask_neon_load_multi reg
        .macro _sfi_breg_dmask_neon_store_multi reg

        sfi_breg src, neon_load_multi d0-d3, \B
        sfi_breg src, neon_load_multi d4-d7, \B
        subs    count, count, #64

        sfi_pld src, #(4 * 64)
        sfi_breg dst, neon_store_multi d0-d3, \B
        sfi_breg src, neon_load_multi d0-d3, \B
        sfi_breg dst, neon_store_multi d4-d7, \B
        sfi_breg src, neon_load_multi d4-d7, \B
        subs    count, count, #64

        sfi_breg dst, neon_store_multi d0-d3, \B
        sfi_breg dst, neon_store_multi d4-d7, \B
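        /* The NEON path above is double-buffered: d0-d3 and d4-d7 are
           loaded before the loop is entered, each pass then stores one
           32-byte group while reloading it behind the store, and the two
           trailing stores drain the data still in flight once count runs
           out.  */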
        ands    count, count, #0x3f

        /* Use an SMS style loop to maximize the I/O bandwidth.  */
        subs    tmp2, count, #64        /* Use tmp2 for count.  */

        strd    B_l, B_h, [sp, #8]
        cfi_rel_offset (B_l, 8)
        cfi_rel_offset (B_h, 12)
        strd    C_l, C_h, [sp, #16]
        cfi_rel_offset (C_l, 16)
        cfi_rel_offset (C_h, 20)
        strd    D_l, D_h, [sp, #24]
        cfi_rel_offset (D_l, 24)
        cfi_rel_offset (D_h, 28)

        sfi_pld src, #(5 * 64) - (32 - 4)
        strd    A_l, A_h, [\B, #40]
        strd    B_l, B_h, [\B, #48]
        strd    C_l, C_h, [\B, #56]
        strd    D_l, D_h, [\B, #64]!

        strd    A_l, A_h, [\B, #8]
        strd    B_l, B_h, [\B, #16]
        strd    C_l, C_h, [\B, #24]
        strd    D_l, D_h, [\B, #32]

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [\B, #40]
        strd    B_l, B_h, [\B, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [\B, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [\B, #64]
        ldrd    D_l, D_h, [sp, #24]

        ands    count, tmp2, #0x3f
        ldr     tmp2, [sp], #FRAME_SIZE
        cfi_adjust_cfa_offset (-FRAME_SIZE)
        bne     .Ltail63unaligned

libc_hidden_builtin_def (memcpy)