sysdeps/aarch64/multiarch/memmove_falkor.S

   1 /* Copyright (C) 2017-2018 Free Software Foundation, Inc.
   2
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library.  If not, see
  17    <http://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 /* Assumptions: ARMv8-a, AArch64, falkor, unaligned accesses.  */
  22
  23 #define dstin   x0      /* Destination start; only read by the code below.  */
  24 #define src     x1      /* Source pointer.  */
  25 #define count   x2      /* Number of bytes to move.  */
  26 #define dstlen  x3      /* Same register as dst; not referenced in the visible code.  */
  27 #define dst     x3      /* Working destination pointer of the forward long copy.  */
  28 #define srcend  x4      /* Set to src + count on entry.  */
  29 #define dstend  x5      /* Set to dstin + count on entry.  */
     /* Data registers.  The _l/_h names form the pairs used with ldp/stp; the
        _lw/_hw names are the 32-bit views of the same registers, used for the
        4-, 2- and 1-byte copies.  */
  30 #define A_l     x6
  31 #define A_lw    w6
  32 #define A_h     x7
  33 #define A_hw    w7
  34 #define B_l     x8      /* B_l/B_h also hold the saved x22/x23 in the long paths.  */
  35 #define B_lw    w8
  36 #define B_h     x9
  37 #define C_l     x10
  38 #define C_h     x11
  39 #define D_l     x12
  40 #define D_h     x13
     /* E and F reuse the pointer and count registers as data registers; they
        are only used in L(copy96), where loading them overwrites src, count,
        srcend and dst.  */
  41 #define E_l     src
  42 #define E_h     count
  43 #define F_l     srcend
  44 #define F_h     dst
  45 #define tmp1    x14     /* Scratch.  */
  46
  47 /* Second data pair, used alternately with A_l and A_h in the long-copy
        loops to train the prefetcher.  x22 and x23 are callee-saved under
        AAPCS64, so the long paths copy them to B_l/B_h first and move them
        back before returning.  */
  48 #define Q_l     x22
  49 #define Q_h     x23
  50
  51 /* RATIONALE:
  52
  53    The copy has 4 distinct parts:
  54    * Small copies of 16 bytes and under
  55    * Medium sized copies of 17-96 bytes
  56    * Large copies where the source address is higher than the destination
  57      (forward copies)
  58    * Large copies where the destination address is higher than the source
  59      (copy backward, or move).
  60
  61    The large-copy loops use only two register pairs, x6,x7 and x22,x23, and
  62    copy 32 bytes per iteration to correctly train the hardware prefetcher
  63    for better throughput.  */
  64 ENTRY_ALIGN (__memmove_falkor, 6)
  65
             /* Entry: dstin (x0) = destination, src (x1) = source, count (x2) =
                number of bytes.  dstin is only ever read below.  The code
                derives both end pointers and then dispatches on count and on
                the unsigned distance dstin - src.

                NOTE(review): the two long paths temporarily overwrite x22/x23
                (Q_l/Q_h) after copying them to B_l/B_h.  No unwind (CFI)
                directive describing that is visible in this block; confirm
                whether one is needed.  */
  66         sub     tmp1, dstin, src        /* tmp1 = dstin - src.  */
  67         add     srcend, src, count
  68         add     dstend, dstin, count
             /* When count > 96 the ccmp compares tmp1 with count; otherwise it
                sets NZCV to 2 (carry set) so that b.lo falls through.  Net
                effect: branch when count > 96 and dstin - src < count
                (unsigned), which includes dstin == src.  */
  69         cmp     count, 96
  70         ccmp    tmp1, count, 2, hi
  71         b.lo    L(move_long)
  72
  73         cmp     count, 16
  74         b.ls    L(copy16)               /* count <= 16.  */
  75         cmp     count, 96
  76         b.hi    L(copy_long)            /* count > 96.  */
  77
  78         /* Medium copies: 17..96 bytes.  tmp1 = count - 1 is bit-tested to
                pick the size class.  On every path below all loads are issued
                before the first store.  */
  79         sub     tmp1, count, 1
  80         ldp     A_l, A_h, [src]
  81         tbnz    tmp1, 6, L(copy96)      /* count - 1 >= 64: 65..96 bytes.  */
  82         ldp     D_l, D_h, [srcend, -16]
  83         tbz     tmp1, 5, 1f             /* count - 1 < 32: 17..32 bytes.  */
             /* 33..64 bytes: additionally copy 16 bytes at offset 16 from the
                start and 16 bytes at offset 32 back from the end.  */
  84         ldp     B_l, B_h, [src, 16]
  85         ldp     C_l, C_h, [srcend, -32]
  86         stp     B_l, B_h, [dstin, 16]
  87         stp     C_l, C_h, [dstend, -32]
  88 1:
             /* First and last 16 bytes; the two ranges may overlap.  */
  89         stp     A_l, A_h, [dstin]
  90         stp     D_l, D_h, [dstend, -16]
  91         ret
  92
  93         .p2align 4
  94         /* Small copies: 0..16 bytes.  Each size class loads from the start
                and from the end before storing either value.  */
  95 L(copy16):
  96         cmp     count, 8
  97         b.lo    1f
             /* 8..16 bytes: first and last 8 bytes, possibly overlapping.  */
  98         ldr     A_l, [src]
  99         ldr     A_h, [srcend, -8]
 100         str     A_l, [dstin]
 101         str     A_h, [dstend, -8]
 102         ret
 103         .p2align 4
 104 1:
 105         /* 4-7 bytes (count < 8 here and bit 2 set): first and last word.  */
 106         tbz     count, 2, 1f
 107         ldr     A_lw, [src]
 108         ldr     A_hw, [srcend, -4]
 109         str     A_lw, [dstin]
 110         str     A_hw, [dstend, -4]
 111         ret
 112         .p2align 4
 113 1:
 114         /* 2-3 bytes: first and last halfword.  */
 115         tbz     count, 1, 1f
 116         ldrh    A_lw, [src]
 117         ldrh    A_hw, [srcend, -2]
 118         strh    A_lw, [dstin]
 119         strh    A_hw, [dstend, -2]
 120         ret
 121         .p2align 4
 122 1:
 123         /* 0-1 bytes: copy one byte if bit 0 is set, otherwise do nothing.  */
 124         tbz     count, 0, 1f
 125         ldrb    A_lw, [src]
 126         strb    A_lw, [dstin]
 127 1:      ret
 128
 129         .p2align 4
 130         /* Copy 65..96 bytes (bit 6 of count - 1 is set).  Copy 64 bytes from
 131            the start and 32 bytes from the end.  A_l/A_h were loaded before
                the branch to this label.  E_* and F_* are #defined onto src,
                count, srcend and dst, so the last two loads overwrite those
                registers; the F load uses srcend as its base and replaces it.
                All loads come before the first store.  */
 132 L(copy96):
 133         ldp     B_l, B_h, [src, 16]
 134         ldp     C_l, C_h, [src, 32]
 135         ldp     D_l, D_h, [src, 48]
 136         ldp     E_l, E_h, [srcend, -32]
 137         ldp     F_l, F_h, [srcend, -16]
 138         stp     A_l, A_h, [dstin]
 139         stp     B_l, B_h, [dstin, 16]
 140         stp     C_l, C_h, [dstin, 32]
 141         stp     D_l, D_h, [dstin, 48]
 142         stp     E_l, E_h, [dstend, -32]
 143         stp     F_l, F_h, [dstend, -16]
 144         ret
 145
 146         /* Align SRC to 16 byte alignment so that we don't cross cache line
 147            boundaries on both loads and stores.  There are at least 96 bytes
 148            to copy, so copy 16 bytes unaligned and then align.  The loop
 149            copies 32 bytes per iteration and prefetches one iteration ahead.  */
 150
 151         .p2align 4
 152 L(copy_long):
 153         sub     count, count, 64 + 16   /* Test and readjust count.  */
 154         mov     B_l, Q_l                /* Park x22/x23 in B_l/B_h.  */
 155         mov     B_h, Q_h
 156         ldp     A_l, A_h, [src]         /* First 16 bytes, unaligned.  */
 157         and     tmp1, src, 15           /* tmp1 = src modulo 16.  */
 158         bic     src, src, 15            /* Round src down to 16 bytes.  */
 159         sub     dst, dstin, tmp1        /* Move dst back by the same amount.  */
 160         add     count, count, tmp1      /* Count is now 16 too large.  */
 161         ldp     Q_l, Q_h, [src, 16]!    /* Pre-index: src += 16, then load.  */
 162         stp     A_l, A_h, [dstin]
 163         ldp     A_l, A_h, [src, 16]!
 164
             /* Each pass stores the 32 bytes currently held in Q and A and
                reloads both pairs with the next 32 source bytes.  b.hi tests the
                flags from the subs at the top: the loop repeats while count was
                greater than 32 (unsigned) before that subtraction.  */
 165 L(loop64):
 166         subs    count, count, 32
 167         stp     Q_l, Q_h, [dst, 16]
 168         ldp     Q_l, Q_h, [src, 16]!
 169         stp     A_l, A_h, [dst, 32]!    /* Pre-index: dst += 32, then store.  */
 170         ldp     A_l, A_h, [src, 16]!
 171         b.hi    L(loop64)
 172
 173         /* Write the last full set of 32 bytes.  The remainder is at most 32
 174            bytes, so it is safe to always copy 32 bytes from the end even if
 175            there is just 1 byte left.  This label is reached by fall-through
                only in the visible code.  */
 176 L(last64):
 177         ldp     C_l, C_h, [srcend, -32]
 178         stp     Q_l, Q_h, [dst, 16]
 179         ldp     Q_l, Q_h, [srcend, -16]
 180         stp     A_l, A_h, [dst, 32]
 181         stp     C_l, C_h, [dstend, -32]
 182         stp     Q_l, Q_h, [dstend, -16]
 183         mov     Q_l, B_l                /* Put the saved x22/x23 back.  */
 184         mov     Q_h, B_h
 185         ret
 186
 187         .p2align 4
             /* Backward copy, taken from the entry dispatch when count > 96 and
                dstin - src < count (unsigned).  It mirrors L(copy_long) but walks
                down from the end of both buffers.  */
 188 L(move_long):
 189         cbz     tmp1, 3f                /* tmp1 is still dstin - src; zero means nothing to do.  */
 190
 191         mov     B_l, Q_l                /* Park x22/x23 in B_l/B_h.  */
 192         mov     B_h, Q_h
 193
 194         /* Align SRCEND to 16 byte alignment so that we don't cross cache line
 195            boundaries on both loads and stores.  There are at least 96 bytes
 196            to copy, so copy 16 bytes unaligned and then align.  The loop
 197            copies 32 bytes per iteration and prefetches one iteration ahead.  */
 198
 199         ldp     A_l, A_h, [srcend, -16]         /* Last 16 bytes, unaligned.  */
 200         and     tmp1, srcend, 15                /* tmp1 = srcend modulo 16.  */
 201         sub     srcend, srcend, tmp1            /* Round srcend down to 16 bytes.  */
 202         ldp     Q_l, Q_h, [srcend, -16]!        /* Pre-index: srcend -= 16, then load.  */
 203         stp     A_l, A_h, [dstend, -16]
 204         sub     count, count, tmp1
 205         ldp     A_l, A_h, [srcend, -16]!
 206         sub     dstend, dstend, tmp1            /* Move dstend down by the same amount.  */
 207         sub     count, count, 64
 208
             /* Each pass stores the 32 bytes currently held in Q and A just
                below dstend and reloads both pairs with the 32 source bytes
                below srcend.  The loop repeats while count was greater than 32
                (unsigned) before the subs.  */
 209 1:
 210         subs    count, count, 32
 211         stp     Q_l, Q_h, [dstend, -16]
 212         ldp     Q_l, Q_h, [srcend, -16]!
 213         stp     A_l, A_h, [dstend, -32]!        /* Pre-index: dstend -= 32, then store.  */
 214         ldp     A_l, A_h, [srcend, -16]!
 215         b.hi    1b
 216
 217         /* Write the last full set of 32 bytes.  The remainder is at most 32
 218            bytes, so it is safe to always copy 32 bytes from the start even if
 219            there is just 1 byte left.  src and dstin still hold the original
                start addresses on this path.  */
 220 2:
 221         ldp     C_l, C_h, [src, 16]
 222         stp     Q_l, Q_h, [dstend, -16]
 223         ldp     Q_l, Q_h, [src]
 224         stp     A_l, A_h, [dstend, -32]
 225         stp     C_l, C_h, [dstin, 16]
 226         stp     Q_l, Q_h, [dstin]
 227         mov     Q_l, B_l                /* Put the saved x22/x23 back.  */
 228         mov     Q_h, B_h
 229 3:      ret
 230
 231 END (__memmove_falkor)
 232 libc_hidden_builtin_def (__memmove_falkor)