sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S

   1 /* Copy SIZE bytes from SRC to DEST.  For SUN4V Niagara.
   2    Copyright (C) 2006-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, see
  17    <https://www.gnu.org/licenses/>.  */
  18
  19 #include <sysdep.h>
  20
  21 #define ASI_BLK_INIT_QUAD_LDD_P 0xe2  /* ASI named by LOAD_TWIN; also written to %asi before any STORE_INIT */
  22 #define ASI_P                   0x80  /* ASI named explicitly by every LOAD */
  23 #define ASI_PNF                 0x82  /* ASI written to %asi once the block copy loops are finished */
  24
     /* LOAD(type, addr, dest): emit the alternate-space form of the
      * instruction `type' (an `a' is pasted onto the mnemonic) reading
      * [addr] with ASI_P.  It is also used with type = prefetch, in which
      * case `dest' is the prefetch function (e.g. #one_read).
      *
      * LOAD_TWIN(addr_reg, dest0, dest1): emit one ldda with
      * ASI_BLK_INIT_QUAD_LDD_P.  Only dest0 appears in the expansion;
      * dest1 is not used by the macro.  The callers treat the result as
      * 16 bytes landing in dest0 and dest1 (they advance the source by 16
      * and store both registers), so dest1 is presumably the odd partner
      * of an even/odd register pair -- confirm against the ldda
      * definition before changing register assignments.
      */
  25 #define LOAD(type,addr,dest)    type##a [addr] ASI_P, dest
  26 #define LOAD_TWIN(addr_reg,dest0,dest1) \
  27         ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
  28
     /* STORE(type, src, addr): plain store `type src, [addr]'.
      *
      * STORE_INIT(src, addr): stxa through whatever ASI is currently in
      * the %asi register; the macro does not set %asi itself, so the
      * caller must have written it beforehand.
      */
  29 #define STORE(type,src,addr)    type src, [addr]
  30 #define STORE_INIT(src,addr)    stxa src, [addr] %asi
  31
     /* If the includer did not pre-define XCC, branch on %xcc and define
      * USE_BPR.  When USE_BPR is left undefined the entry code below
      * clears the upper 32 bits of the length register first.
      */
  32 #ifndef XCC
  33 #define USE_BPR
  34 #define XCC xcc
  35 #endif
  36
  37 #if IS_IN (libc)
  38
  39         .register       %g2,#scratch  /* %g2 and %g3 are used as data temporaries by the copy loops */
  40         .register       %g3,#scratch
  41         .register       %g6,#scratch  /* %g6 is declared but not referenced in the code visible in this file */
  42
  43         .text
  44
  45 ENTRY(__mempcpy_niagara1)
             /* mempcpy entry point.  In: %o0 = dst, %o1 = src, %o2 = len.
              * Puts dst + len in %g5 (the value the shared copy code moves
              * into %o0 before returning) and joins __memcpy_niagara1 at
              * local label 101, i.e. just past the instruction that would
              * otherwise load %g5 with plain dst.
              */
  46         ba,pt           %XCC, 101f
  47          add            %o0, %o2, %g5  /* delay slot: %g5 = dst + len */
  48 END(__mempcpy_niagara1)
  49
  50         .align          32
  51 ENTRY(__memcpy_niagara1)
             /* Copy len bytes from src to dst.
              *
              * In:   %o0 = dst, %o1 = src, %o2 = len.
              * Out:  %o0 = value saved in %g5 (dst when entered here;
              *       __mempcpy_niagara1 enters at 101 with %g5 = dst + len).
              * Writes: %o0-%o5, %g1-%g3, %g5, the condition codes, and
              *       (block path only) %asi.
              *
              * Dispatch on len:
              *   len == 0         -> 85  return
              *   0 < len < 16     -> 80  word loop if dst, src and len are
              *                           all multiples of 4, else byte loop
              *   16 <= len < 128  -> 70  8/16-byte integer copies, with a
              *                           shift-and-merge loop when src is
              *                           not 8-byte aligned
              *   len >= 128       ->     align dst to 64 bytes, then copy
              *                           64-byte blocks with init stores;
              *                           the tail goes to the byte loop
              */
  52 100:    /* %o0=dst, %o1=src, %o2=len */
  53         mov             %o0, %g5  /* %g5 = return value */
  54 101:
  55 # ifndef USE_BPR
  56         srl             %o2, 0, %o2  /* 32-bit shift by 0: keep only the low 32 bits of len */
  57 # endif
  58         cmp             %o2, 0
  59         be,pn           %XCC, 85f  /* len == 0: nothing to copy */
  60 218:     or             %o0, %o1, %o3  /* delay slot (always executed): %o3 = dst | src */
  61         cmp             %o2, 16
  62         blu,a,pn        %XCC, 80f  /* len < 16 */
  63          or             %o3, %o2, %o3  /* annulled unless taken: %o3 = dst | src | len */
  64
  65         /* 2 blocks (128 bytes) is the minimum we can do the block
  66          * copy with.  We need to ensure that we'll iterate at least
  67          * once in the block copy loop.  At worst we'll need to align
  68          * the destination to a 64-byte boundary which can chew up
  69          * to (64 - 1) bytes from the length before we perform the
  70          * block copy loop.
  71          */
  72         cmp             %o2, (2 * 64)
  73         blu,pt          %XCC, 70f  /* len < 128: medium-size path */
  74          andcc          %o3, 0x7, %g0  /* delay slot: Z set iff dst and src are both 8-byte aligned (tested at 70) */
  75
  76         /* %o0: dst
  77          * %o1: src
  78          * %o2: len  (known to be >= 128)
  79          *
  80          * The block copy loops will use %o4/%o5,%g2/%g3 as
  81          * temporaries while copying the data.
  82          */
  83
  84         LOAD(prefetch, %o1, #one_read)
  85         wr              %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi  /* STORE_INIT stores go through %asi */
  86
  87         /* Align destination on 64-byte boundary.  */
  88         andcc           %o0, (64 - 1), %o4
  89         be,pt           %XCC, 2f  /* dst already 64-byte aligned */
  90          sub            %o4, 64, %o4  /* delay slot: %o4 = (dst & 63) - 64; %o4 is reloaded at 2 if the branch was taken */
  91         sub             %g0, %o4, %o4   ! bytes to align dst
  92         sub             %o2, %o4, %o2
  93 1:      subcc           %o4, 1, %o4  /* byte loop; these flags drive the bne below */
  94         LOAD(ldub, %o1, %g1)
  95         STORE(stb, %g1, %o0)
  96         add             %o1, 1, %o1
  97         bne,pt          %XCC, 1b
  98         add             %o0, 1, %o0  /* delay slot of the bne above (not annulled, so it runs every pass) */
  99
 100         /* If the source is on a 16-byte boundary we can do
 101          * the direct block copy loop.  If it is 8-byte aligned
 102          * we can do the 16-byte loads offset by -8 bytes and the
 103          * init stores offset by one register.
 104          *
 105          * If the source is not even 8-byte aligned, we need to do
 106          * shifting and masking (basically integer faligndata).
 107          *
 108          * The careful bit with init stores is that if we store
 109          * to any part of the cache line we have to store the whole
 110          * cacheline else we can end up with corrupt L2 cache line
 111          * contents.  Since the loop works on 64-bytes of 64-byte
 112          * aligned store data at a time, this is easy to ensure.
 113          */
 114 2:
 115         andcc           %o1, (16 - 1), %o4  /* %o4 = src & 15; the next two instructions leave these flags alone */
 116         andn            %o2, (64 - 1), %g1      ! block copy loop iterator
 117         sub             %o2, %g1, %o2           ! final sub-block copy bytes
 118         be,pt           %XCC, 50f  /* src is 16-byte aligned */
 119          cmp            %o4, 8  /* delay slot: sets flags for the next branch */
 120         be,a,pt         %XCC, 10f  /* src & 15 == 8 */
 121          sub            %o1, 0x8, %o1  /* annulled unless taken: back src up to a 16-byte boundary */
 122
 123         /* Neither 8-byte nor 16-byte aligned, shift and mask.  */
             /* Register roles in this loop:
              *   %o4 = bytes left for the block loop (moved out of %g1)
              *   %g1 = left shift  = (src & 7) * 8 bits
              *   %o3 = right shift = 64 - %g1 bits
              *   %g2 = bits carried over from the previous source dword
              *   %o1 = src rounded down to an 8-byte boundary
              */
 124         mov             %g1, %o4
 125         and             %o1, 0x7, %g1
 126         sll             %g1, 3, %g1
 127         mov             64, %o3
 128         andn            %o1, 0x7, %o1
 129         LOAD(ldx, %o1, %g2)
 130         sub             %o3, %g1, %o3
 131         sllx            %g2, %g1, %g2
 132
             /* SWIVEL_ONE_DWORD: load the dword at SRC into TMP1, store
              * (TMP1 >> PRE_SHIFT) | PRE_VAL to DST with an init store, and
              * leave TMP1 << POST_SHIFT in PRE_VAL for the next invocation.
              */
 133 #define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
 134         LOAD(ldx, SRC, TMP1); \
 135         srlx            TMP1, PRE_SHIFT, TMP2; \
 136         or              TMP2, PRE_VAL, TMP2; \
 137         STORE_INIT(TMP2, DST); \
 138         sllx            TMP1, POST_SHIFT, PRE_VAL;
 139
 140 1:      add             %o1, 0x8, %o1
 141         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
 142         add             %o1, 0x8, %o1
 143         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
 144         add             %o1, 0x8, %o1
 145         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
 146         add             %o1, 0x8, %o1
 147         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
 148         add             %o1, 32, %o1  /* step 32 ahead for the prefetch address only */
 149         LOAD(prefetch, %o1, #one_read)
 150         sub             %o1, 32 - 8, %o1  /* undo the 32 and advance by 8: net +8 */
 151         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
 152         add             %o1, 8, %o1
 153         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
 154         add             %o1, 8, %o1
 155         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
 156         add             %o1, 8, %o1
 157         SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
 158         subcc           %o4, 64, %o4
 159         bne,pt          %XCC, 1b
 160          add            %o0, 64, %o0  /* delay slot: dst += 64 */
 161
 162 #undef SWIVEL_ONE_DWORD
 163
 164         srl             %g1, 3, %g1  /* shift count in bits back to the byte offset src & 7 */
 165         ba,pt           %XCC, 60f
 166          add            %o1, %g1, %o1  /* delay slot: add the byte offset back onto the rounded-down src */
 167
 168 10:     /* Destination is 64-byte aligned, source was only 8-byte
 169          * aligned but it has been subtracted by 8 and we perform
 170          * one twin load ahead, then add 8 back into source when
 171          * we finish the loop.
 172          */
 173         LOAD_TWIN(%o1, %o4, %o5)  /* first load; the first store below takes %o5 from it */
 174 1:      add             %o1, 16, %o1
 175         LOAD_TWIN(%o1, %g2, %g3)
 176         add             %o1, 16 + 32, %o1  /* +16 for the load above, +32 more for the prefetch address */
 177         LOAD(prefetch, %o1, #one_read)
 178         sub             %o1, 32, %o1
 179         STORE_INIT(%o5, %o0 + 0x00)             ! initializes cache line
 180         STORE_INIT(%g2, %o0 + 0x08)
 181         LOAD_TWIN(%o1, %o4, %o5)
 182         add             %o1, 16, %o1
 183         STORE_INIT(%g3, %o0 + 0x10)
 184         STORE_INIT(%o4, %o0 + 0x18)
 185         LOAD_TWIN(%o1, %g2, %g3)
 186         add             %o1, 16, %o1
 187         STORE_INIT(%o5, %o0 + 0x20)
 188         STORE_INIT(%g2, %o0 + 0x28)
 189         LOAD_TWIN(%o1, %o4, %o5)  /* load for the last store of this block; %o5 is kept for the next iteration */
 190         STORE_INIT(%g3, %o0 + 0x30)
 191         STORE_INIT(%o4, %o0 + 0x38)
 192         subcc           %g1, 64, %g1
 193         bne,pt          %XCC, 1b
 194          add            %o0, 64, %o0  /* delay slot: dst += 64 */
 195
 196         ba,pt           %XCC, 60f
 197          add            %o1, 0x8, %o1  /* delay slot: add back the 8 taken off src before entering 10 */
 198
 199 50:     /* Destination is 64-byte aligned, and source is 16-byte
 200          * aligned.
 201          */
 202 1:      LOAD_TWIN(%o1, %o4, %o5)
 203         add     %o1, 16, %o1
 204         LOAD_TWIN(%o1, %g2, %g3)
 205         add     %o1, 16 + 32, %o1  /* +16 for the load above, +32 more for the prefetch address */
 206         LOAD(prefetch, %o1, #one_read)
 207         sub     %o1, 32, %o1
 208         STORE_INIT(%o4, %o0 + 0x00)             ! initializes cache line
 209         STORE_INIT(%o5, %o0 + 0x08)
 210         LOAD_TWIN(%o1, %o4, %o5)
 211         add     %o1, 16, %o1
 212         STORE_INIT(%g2, %o0 + 0x10)
 213         STORE_INIT(%g3, %o0 + 0x18)
 214         LOAD_TWIN(%o1, %g2, %g3)
 215         add     %o1, 16, %o1
 216         STORE_INIT(%o4, %o0 + 0x20)
 217         STORE_INIT(%o5, %o0 + 0x28)
 218         STORE_INIT(%g2, %o0 + 0x30)
 219         STORE_INIT(%g3, %o0 + 0x38)
 220         subcc   %g1, 64, %g1
 221         bne,pt  %XCC, 1b
 222          add    %o0, 64, %o0  /* delay slot: dst += 64 */
 223         /* fall through */
 224
 225 60:
 226         /* %o2 contains any final bytes still needed to be copied
 227          * over. If anything is left, we copy it one byte at a time.
 228          */
             /* NOTE(review): no membar is issued here between the init-store
              * loops above and the code that follows.  Confirm whether the
              * block-init stores require one before returning to the caller.
              */
 229         wr              %g0, ASI_PNF, %asi  /* block loops are done; presumably ASI_PNF is the value %asi should hold on return -- confirm */
 230         brz,pt          %o2, 85f  /* no tail bytes: return */
 231          sub            %o0, %o1, %o3  /* delay slot: %o3 = dst - src, so [%o1 + %o3] addresses dst in loop 90 */
 232         ba,a,pt         %XCC, 90f  /* annulled unconditional branch: no delay-slot instruction is executed */
 233
 234         .align          64
 235 70: /* 16 <= len < 128 */
 236         bne,pn          %XCC, 75f  /* flags from the andcc in the delay slot of the branch to 70: dst or src not 8-byte aligned */
 237          sub            %o0, %o1, %o3  /* delay slot: %o3 = dst - src; stores below address dst as [%o1 + %o3] */
 238
 239 72:
             /* dst and src are both 8-byte aligned, len >= 16: copy 16 bytes
              * per pass.  %o4 = len rounded down to a multiple of 16,
              * %o2 = len & 15 left for 73.
              */
 240         andn            %o2, 0xf, %o4
 241         and             %o2, 0xf, %o2
 242 1:      subcc           %o4, 0x10, %o4  /* flags consumed by the bgu below */
 243         LOAD(ldx, %o1, %o5)
 244         add             %o1, 0x08, %o1
 245         LOAD(ldx, %o1, %g1)
 246         sub             %o1, 0x08, %o1
 247         STORE(stx, %o5, %o1 + %o3)
 248         add             %o1, 0x8, %o1
 249         STORE(stx, %g1, %o1 + %o3)
 250         bgu,pt          %XCC, 1b
 251          add            %o1, 0x8, %o1  /* delay slot: src += 8 (16 in total per pass) */
 252 73:     andcc           %o2, 0x8, %g0  /* fewer than 16 bytes left: copy 8 if bit 3 of len is set */
 253         be,pt           %XCC, 1f
 254          nop
 255         sub             %o2, 0x8, %o2
 256         LOAD(ldx, %o1, %o5)
 257         STORE(stx, %o5, %o1 + %o3)
 258         add             %o1, 0x8, %o1
 259 1:      andcc           %o2, 0x4, %g0  /* then copy 4 if bit 2 of len is set */
 260         be,pt           %XCC, 1f
 261          nop
 262         sub             %o2, 0x4, %o2
 263         LOAD(lduw, %o1, %o5)
 264         STORE(stw, %o5, %o1 + %o3)
 265         add             %o1, 0x4, %o1
 266 1:      cmp             %o2, 0
 267         be,pt           %XCC, 85f  /* nothing left: return */
 268          nop
 269         ba,pt           %XCC, 90f  /* otherwise finish with the byte loop */
 270          nop
 271
 272 75:
             /* dst and/or src is not 8-byte aligned.  First copy single
              * bytes until dst is 8-byte aligned.
              */
 273         andcc           %o0, 0x7, %g1  /* Z set iff dst is already 8-byte aligned */
 274         sub             %g1, 0x8, %g1  /* sub (not subcc): the flags above are preserved */
 275         be,pn           %icc, 2f
 276          sub            %g0, %g1, %g1  /* delay slot: %g1 = 8 - (dst & 7); reloaded at 2 if the branch was taken */
 277         sub             %o2, %g1, %o2
 278
 279 1:      subcc           %g1, 1, %g1
 280         LOAD(ldub, %o1, %o5)
 281         STORE(stb, %o5, %o1 + %o3)
 282         bgu,pt          %icc, 1b
 283          add            %o1, 1, %o1  /* delay slot: src += 1 */
 284
 285 2:      add             %o1, %o3, %o0  /* %o0 = src + (dst - src) = current dst */
 286         andcc           %o1, 0x7, %g1  /* %g1 = src & 7 */
 287         bne,pt          %icc, 8f  /* src still misaligned: shift-and-merge loop */
 288          sll            %g1, 3, %g1  /* delay slot: byte offset -> bit count */
 289
             /* src is 8-byte aligned as well: reuse the aligned code above. */
 290         cmp             %o2, 16
 291         bgeu,pt         %icc, 72b
 292          nop
 293         ba,a,pt         %XCC, 73b  /* annulled unconditional branch: the instruction at 8 is not executed */
 294
             /* Shift-and-merge copy, 8 bytes per pass:
              *   %g1 = left shift in bits, %o3 = 64 - %g1 = right shift,
              *   %g2 = bits carried over from the previous source dword,
              *   %o4 = len rounded down to a multiple of 8.
              * %o3 no longer holds dst - src here; it is recomputed before
              * branching to 90.
              */
 295 8:      mov             64, %o3
 296         andn            %o1, 0x7, %o1
 297         LOAD(ldx, %o1, %g2)
 298         sub             %o3, %g1, %o3
 299         andn            %o2, 0x7, %o4
 300         sllx            %g2, %g1, %g2
 301 1:      add             %o1, 0x8, %o1
 302         LOAD(ldx, %o1, %g3)
 303         subcc           %o4, 0x8, %o4  /* flags consumed by the bgu below */
 304         srlx            %g3, %o3, %o5
 305         or              %o5, %g2, %o5
 306         STORE(stx, %o5, %o0)
 307         add             %o0, 0x8, %o0
 308         bgu,pt          %icc, 1b
 309          sllx           %g3, %g1, %g2  /* delay slot: carry for the next pass */
 310
 311         srl             %g1, 3, %g1  /* bit count back to the byte offset src & 7 */
 312         andcc           %o2, 0x7, %o2  /* tail bytes still to copy */
 313         be,pn           %icc, 85f
 314          add            %o1, %g1, %o1  /* delay slot: add the byte offset back onto the rounded-down src */
 315         ba,pt           %XCC, 90f
 316          sub            %o0, %o1, %o3  /* delay slot: %o3 = dst - src for the byte loop */
 317
 318         .align          64
 319 80: /* 0 < len < 16 */
 320         andcc           %o3, 0x3, %g0  /* %o3 = dst | src | len (set in the delay slot of the branch to 80) */
 321         bne,pn          %XCC, 90f  /* something is not a multiple of 4: byte loop */
 322          sub            %o0, %o1, %o3  /* delay slot: %o3 = dst - src */
 323
 324 1:
 325         subcc           %o2, 4, %o2  /* word loop; len is a non-zero multiple of 4 here */
 326         LOAD(lduw, %o1, %g1)
 327         STORE(stw, %g1, %o1 + %o3)
 328         bgu,pt          %XCC, 1b
 329          add            %o1, 4, %o1  /* delay slot: src += 4 */
 330
 331 85:     retl
 332          mov            %g5, %o0  /* delay slot: return value saved at entry */
 333
 334         .align          32
 335 90:
             /* Byte loop.  Expects %o2 = bytes left (non-zero) and
              * %o3 = dst - src.
              */
 336         subcc           %o2, 1, %o2
 337         LOAD(ldub, %o1, %g1)
 338         STORE(stb, %g1, %o1 + %o3)
 339         bgu,pt          %XCC, 90b
 340          add            %o1, 1, %o1  /* delay slot: src += 1 */
 341         retl
 342          mov            %g5, %o0  /* delay slot: return value saved at entry */
 343
 344 END(__memcpy_niagara1)
 345
 346 #endif