arch/mips/lib/memcpy-inatomic.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Unified implementation of memcpy, memmove and the __copy_user backend.
   7  *
   8  * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
   9  * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
  10  * Copyright (C) 2002 Broadcom, Inc.
  11  *   memcpy/copy_user author: Mark Vandevoorde
  12  * Copyright (C) 2007  Maciej W. Rozycki
  13  *
  14  * Mnemonic names for arguments to memcpy/__copy_user
  15  */
  16
  17 /*
  18  * Hack to resolve a longstanding prefetch issue.
  19  *
  20  * Prefetching may be fatal on some systems if we prefetch beyond the
  21  * end of memory.  It's also a seriously bad idea on non
  22  * dma-coherent systems.
      *
      * For both configurations tested below, CONFIG_CPU_HAS_PREFETCH is
      * undefined before <asm/asm.h> is included.
      * NOTE(review): presumably this makes the PREF() uses in this file expand
      * to nothing -- confirm against the PREF definition.
  23  */
  24 #ifdef CONFIG_DMA_NONCOHERENT
  25 #undef CONFIG_CPU_HAS_PREFETCH
  26 #endif
  27 #ifdef CONFIG_MIPS_MALTA
  28 #undef CONFIG_CPU_HAS_PREFETCH
  29 #endif
  30
  31 #include <asm/asm.h>
  32 #include <asm/asm-offsets.h>
  33 #include <asm/regdef.h>
  34
  35 #define dst a0  /* destination pointer; advanced as data is stored */
  36 #define src a1  /* source pointer; advanced as data is loaded */
  37 #define len a2  /* bytes still to copy; holds the uncopied count on return */
  38
  39 /*
  40  * Spec
  41  *
  42  * memcpy copies len bytes from src to dst and sets v0 to dst.
  43  * It assumes that
  44  *   - src and dst don't overlap
  45  *   - src is readable
  46  *   - dst is writable
  47  * memcpy uses the standard calling convention
  48  *
  49  * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
  50  * the number of uncopied bytes due to an exception caused by a read or write.
  51  * __copy_user assumes that src and dst don't overlap, and that the call is
  52  * implementing one of the following:
  53  *   copy_to_user
  54  *     - src is readable  (no exceptions when reading src)
  55  *   copy_from_user
  56  *     - dst is writable  (no exceptions when writing dst)
  57  * __copy_user uses a non-standard calling convention; see
  58  * include/asm-mips/uaccess.h
  59  *
  60  * When an exception happens on a load, the handler must
  61  * ensure that all of the destination buffer is overwritten to prevent
  62  * leaking information to user mode programs.
  63  */
  64
  65 /*
  66  * Implementation
  67  */
  68
  69 /*
  70  * The exception handler for loads requires that:
  71  *  1- AT contain the address of the byte just past the end of the source
  72  *     of the copy,
  73  *  2- src_entry <= src < AT, and
  74  *  3- (dst - src) == (dst_entry - src_entry),
  75  * The _entry suffix denotes values when __copy_user was called.
  76  *
  77  * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
  78  * (2) is met by incrementing src by the number of bytes copied
  79  * (3) is met by not doing loads between a pair of increments of dst and src
  80  *
  81  * The exception handlers for stores adjust len (if necessary) and return.
  82  * These handlers do not need to overwrite any data.
      * NOTE(review): no store in this file is wrapped in EXC(), so no store
      * handler is registered here.
  83  *
  84  * For __rmemcpy and memmove an exception is always a kernel bug, therefore
  85  * they're not protected.
      *
      * EXC(inst_reg, addr, handler) emits the instruction "inst_reg, addr" at
      * local label 9 and records the pair (9b, handler) in the __ex_table
      * section, tying that single instruction to the given handler label.
      * .previous then returns to the section that was being assembled.
  86  */
  87
  88 #define EXC(inst_reg,addr,handler)              \
  89 9:      inst_reg, addr;                         \
  90         .section __ex_table,"a";                \
  91         PTR     9b, handler;                    \
  92         .previous
  93
  94 /*
  95  * Only on a 64-bit kernel can we make use of 64-bit registers.
  96  */
  97 #ifdef CONFIG_64BIT
  98 #define USE_DOUBLE
  99 #endif
 100
 101 #ifdef USE_DOUBLE
 102
 103 #define LOAD   ld      /* load one whole NBYTES-sized unit */
 104 #define LOADL  ldl     /* LOADL/LOADR: the two partial loads of an unaligned unit */
 105 #define LOADR  ldr
 106 #define STOREL sdl     /* STOREL/STORER: the two partial stores of an unaligned unit */
 107 #define STORER sdr
 108 #define STORE  sd
 109 #define ADD    daddu
 110 #define SUB    dsubu
 111 #define SRL    dsrl
 112 #define SRA    dsra
 113 #define SLL    dsll
 114 #define SLLV   dsllv
 115 #define SRLV   dsrlv
 116 #define NBYTES 8       /* bytes moved by one LOAD/STORE */
 117 #define LOG_NBYTES 3   /* log2(NBYTES); used as a shift count */
 118
 119 /*
 120  * As we are sharing a code base with the mips32 tree (which uses the o32 ABI
 121  * register definitions), we need to redefine the register definitions from
 122  * the n64 ABI register naming to the o32 ABI register naming.
 123  */
 124 #undef t0
 125 #undef t1
 126 #undef t2
 127 #undef t3
 128 #define t0      $8
 129 #define t1      $9
 130 #define t2      $10
 131 #define t3      $11
 132 #define t4      $12
 133 #define t5      $13
 134 #define t6      $14
 135 #define t7      $15
 136
 137 #else
 138
 139 #define LOAD   lw      /* load one whole NBYTES-sized unit */
 140 #define LOADL  lwl     /* LOADL/LOADR: the two partial loads of an unaligned unit */
 141 #define LOADR  lwr
 142 #define STOREL swl     /* STOREL/STORER: the two partial stores of an unaligned unit */
 143 #define STORER swr
 144 #define STORE  sw
 145 #define ADD    addu
 146 #define SUB    subu
 147 #define SRL    srl
 148 #define SLL    sll
 149 #define SRA    sra
 150 #define SLLV   sllv
 151 #define SRLV   srlv
 152 #define NBYTES 4       /* bytes moved by one LOAD/STORE */
 153 #define LOG_NBYTES 2   /* log2(NBYTES); used as a shift count */
 154
 155 #endif /* USE_DOUBLE */
 156
 157 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 158 #define LDFIRST LOADR  /* partial load issued at FIRST(unit) */
 159 #define LDREST  LOADL  /* partial load issued at REST(unit) */
 160 #define STFIRST STORER /* partial store issued at FIRST(unit) */
 161 #define STREST  STOREL /* partial store issued at the last byte to be written */
 162 #define SHIFT_DISCARD SLLV /* variable shift that drops the bits not to be stored */
 163 #else
 164 #define LDFIRST LOADL  /* partial load issued at FIRST(unit) */
 165 #define LDREST  LOADR  /* partial load issued at REST(unit) */
 166 #define STFIRST STOREL /* partial store issued at FIRST(unit) */
 167 #define STREST  STORER /* partial store issued at the last byte to be written */
 168 #define SHIFT_DISCARD SRLV /* variable shift that drops the bits not to be stored */
 169 #endif
 170
 171 #define FIRST(unit) ((unit)*NBYTES)        /* offset of the first byte of unit number "unit" */
 172 #define REST(unit)  (FIRST(unit)+NBYTES-1) /* offset of the last byte of that unit */
 173 #define UNIT(unit)  FIRST(unit)            /* offset used for whole-unit LOAD/STORE */
 174
 175 #define ADDRMASK (NBYTES-1)                /* address bits below the unit size; nonzero => unaligned */
 176
 177         .text
 178         .set    noreorder       /* branch delay slots below are filled by hand */
 179 #ifndef CONFIG_CPU_DADDI_WORKAROUNDS
 180         .set    noat            /* assembler must not use AT as its scratch register */
 181 #else
 182         .set    at=v1           /* assembler uses v1 instead of AT as its scratch register */
 183 #endif
 184
 185 /*
 186  * __copy_user_inatomic: copy len bytes from src to dst.
 187  * Sets len to 0 for success; else to an upper bound of
 188  * the number of uncopied bytes.
 189  * NOTE(review): no instruction in this routine writes v0.
 190  */
 191         .align  5                       /* 2^5 = 32-byte alignment of the entry point */
 192 LEAF(__copy_user_inatomic)
 193         /*
 194          * Note: dst & src may be unaligned, len may be 0
 195          * Temps
              *   rem (t8)       bytes to leave over for the next, finer-grained stage
              *   t0-t4, t7      data being copied and scratch values
              *   bits, match    later aliases for t2 and rem respectively
              *
              * Instructions indented by one extra space sit in a branch delay
              * slot and execute whether or not the branch is taken.
 196          */
 197 #define rem t8
 198
 199         /*
 200          * The "issue break"s below are very approximate.
 201          * Issue delays for dcache fills will perturb the schedule, as will
 202          * load queue full replay traps, etc.
 203          *
 204          * If len < NBYTES use byte operations.
 205          */
 206         PREF(   0, 0(src) )
 207         PREF(   1, 0(dst) )
 208         sltu    t2, len, NBYTES         /* t2 = (len < NBYTES) */
 209         and     t1, dst, ADDRMASK       /* t1 = misalignment of dst */
 210         PREF(   0, 1*32(src) )
 211         PREF(   1, 1*32(dst) )
 212         bnez    t2, .Lcopy_bytes_checklen
 213          and    t0, src, ADDRMASK       /* t0 = misalignment of src */
 214         PREF(   0, 2*32(src) )
 215         PREF(   1, 2*32(dst) )
 216         bnez    t1, .Ldst_unaligned
 217          nop
 218         bnez    t0, .Lsrc_unaligned_dst_aligned
 219         /*
 220          * use delay slot for fall-through
 221          * src and dst are aligned; need to compute rem
 222          */
 223 .Lboth_aligned:
 224          SRL    t0, len, LOG_NBYTES+3           # +3 for 8 units/iter
 225         beqz    t0, .Lcleanup_both_aligned      # len < 8*NBYTES
 226          and    rem, len, (8*NBYTES-1)          # rem = len % (8*NBYTES)
 227         PREF(   0, 3*32(src) )
 228         PREF(   1, 3*32(dst) )
 229         .align  4
 230 1:      /* main aligned loop: 8 units per iteration */
 231 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 232 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 233 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 234 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 235         SUB     len, len, 8*NBYTES      /* account for this iteration up front */
 236 EXC(    LOAD    t4, UNIT(4)(src),       .Ll_exc_copy)
 237 EXC(    LOAD    t7, UNIT(5)(src),       .Ll_exc_copy)
 238         STORE   t0, UNIT(0)(dst)
 239         STORE   t1, UNIT(1)(dst)
 240 EXC(    LOAD    t0, UNIT(6)(src),       .Ll_exc_copy)
 241 EXC(    LOAD    t1, UNIT(7)(src),       .Ll_exc_copy)
 242         ADD     src, src, 8*NBYTES
 243         ADD     dst, dst, 8*NBYTES
 244         STORE   t2, UNIT(-6)(dst)       /* dst already advanced: -6..-1 are units 2..7 */
 245         STORE   t3, UNIT(-5)(dst)
 246         STORE   t4, UNIT(-4)(dst)
 247         STORE   t7, UNIT(-3)(dst)
 248         STORE   t0, UNIT(-2)(dst)
 249         STORE   t1, UNIT(-1)(dst)
 250         PREF(   0, 8*32(src) )
 251         PREF(   1, 8*32(dst) )
 252         bne     len, rem, 1b            /* repeat until only rem bytes are left */
 253          nop
 254
 255         /*
 256          * len == rem == the number of bytes left to copy < 8*NBYTES
 257          */
 258 .Lcleanup_both_aligned:
 259         beqz    len, .Ldone
 260          sltu   t0, len, 4*NBYTES       /* t0 = (len < 4*NBYTES) */
 261         bnez    t0, .Lless_than_4units
 262          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 263         /*
 264          * len >= 4*NBYTES
 265          */
 266 EXC(    LOAD    t0, UNIT(0)(src),       .Ll_exc)
 267 EXC(    LOAD    t1, UNIT(1)(src),       .Ll_exc_copy)
 268 EXC(    LOAD    t2, UNIT(2)(src),       .Ll_exc_copy)
 269 EXC(    LOAD    t3, UNIT(3)(src),       .Ll_exc_copy)
 270         SUB     len, len, 4*NBYTES
 271         ADD     src, src, 4*NBYTES
 272         STORE   t0, UNIT(0)(dst)
 273         STORE   t1, UNIT(1)(dst)
 274         STORE   t2, UNIT(2)(dst)
 275         STORE   t3, UNIT(3)(dst)
 276         .set    reorder                         /* DADDI_WAR */
 277         ADD     dst, dst, 4*NBYTES
 278         beqz    len, .Ldone             /* delay slot is left to the assembler here */
 279         .set    noreorder               /* back to hand-filled delay slots */
 280 .Lless_than_4units:
 281         /*
 282          * rem = len % NBYTES
 283          */
 284         beq     rem, len, .Lcopy_bytes  /* len < NBYTES: no whole unit left */
 285          nop
 286 1:      /* aligned loop: one unit per iteration */
 287 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 288         ADD     src, src, NBYTES
 289         SUB     len, len, NBYTES
 290         STORE   t0, 0(dst)
 291         .set    reorder                         /* DADDI_WAR */
 292         ADD     dst, dst, NBYTES
 293         bne     rem, len, 1b
 294         .set    noreorder
 295
 296         /*
 297          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 298          * A loop would do only a byte at a time with possible branch
 299          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 300          * because can't assume read-access to dst.  Instead, use
 301          * STREST dst, which doesn't require read access to dst.
 302          *
 303          * This code should perform better than a simple loop on modern,
 304          * wide-issue mips processors because the code has fewer branches and
 305          * more instruction-level parallelism.
 306          */
 307 #define bits t2
 308         beqz    len, .Ldone
 309          ADD    t1, dst, len    # t1 is just past last byte of dst
 310         li      bits, 8*NBYTES          /* number of bits in one unit */
 311         SLL     rem, len, 3     # rem = number of bits to keep
 312 EXC(    LOAD    t0, 0(src),             .Ll_exc)
 313         SUB     bits, bits, rem # bits = number of bits to discard
 314         SHIFT_DISCARD t0, t0, bits
 315         STREST  t0, -1(t1)              /* partial store ending at the last byte of dst */
 316         jr      ra
 317          move   len, zero               /* nothing left uncopied */
 318 .Ldst_unaligned:
 319         /*
 320          * dst is unaligned
 321          * t0 = src & ADDRMASK
 322          * t1 = dst & ADDRMASK; T1 > 0
 323          * len >= NBYTES
 324          *
 325          * Copy enough bytes to align dst
 326          * Set match = (src and dst have same alignment)
 327          */
 328 #define match rem
 329 EXC(    LDFIRST t3, FIRST(0)(src),      .Ll_exc)
 330         ADD     t2, zero, NBYTES
 331 EXC(    LDREST  t3, REST(0)(src),       .Ll_exc_copy)
 332         SUB     t2, t2, t1      # t2 = number of bytes copied
 333         xor     match, t0, t1           /* zero iff src and dst are misaligned by the same amount */
 334         STFIRST t3, FIRST(0)(dst)
 335         beq     len, t2, .Ldone         /* those bytes were the whole copy */
 336          SUB    len, len, t2            /* runs on both paths; len is 0 when the branch is taken */
 337         ADD     dst, dst, t2
 338         beqz    match, .Lboth_aligned   /* src is now aligned as well */
 339          ADD    src, src, t2
 340
 341 .Lsrc_unaligned_dst_aligned:
 342         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 343         PREF(   0, 3*32(src) )
 344         beqz    t0, .Lcleanup_src_unaligned
 345          and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
 346         PREF(   1, 3*32(dst) )
 347 1:
 348 /*
 349  * Avoid consecutive LD*'s to the same register since some mips
 350  * implementations can't issue them in the same cycle.
 351  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 352  * are to the same unit (unless src is aligned, but it's not).
 353  */
 354 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 355 EXC(    LDFIRST t1, FIRST(1)(src),      .Ll_exc_copy)
 356         SUB     len, len, 4*NBYTES
 357 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 358 EXC(    LDREST  t1, REST(1)(src),       .Ll_exc_copy)
 359 EXC(    LDFIRST t2, FIRST(2)(src),      .Ll_exc_copy)
 360 EXC(    LDFIRST t3, FIRST(3)(src),      .Ll_exc_copy)
 361 EXC(    LDREST  t2, REST(2)(src),       .Ll_exc_copy)
 362 EXC(    LDREST  t3, REST(3)(src),       .Ll_exc_copy)
 363         PREF(   0, 9*32(src) )          # 0 is PREF_LOAD  (not streamed)
 364         ADD     src, src, 4*NBYTES
 365 #ifdef CONFIG_CPU_SB1
 366         nop                             # improves slotting
 367 #endif
 368         STORE   t0, UNIT(0)(dst)        /* dst is aligned here, so whole-unit stores are used */
 369         STORE   t1, UNIT(1)(dst)
 370         STORE   t2, UNIT(2)(dst)
 371         STORE   t3, UNIT(3)(dst)
 372         PREF(   1, 9*32(dst) )          # 1 is PREF_STORE (not streamed)
 373         .set    reorder                         /* DADDI_WAR */
 374         ADD     dst, dst, 4*NBYTES
 375         bne     len, rem, 1b
 376         .set    noreorder
 377
 378 .Lcleanup_src_unaligned:
 379         beqz    len, .Ldone
 380          and    rem, len, NBYTES-1  # rem = len % NBYTES
 381         beq     rem, len, .Lcopy_bytes  /* len < NBYTES: no whole unit left */
 382          nop
 383 1:      /* unaligned-source loop: one unit per iteration */
 384 EXC(    LDFIRST t0, FIRST(0)(src),      .Ll_exc)
 385 EXC(    LDREST  t0, REST(0)(src),       .Ll_exc_copy)
 386         ADD     src, src, NBYTES
 387         SUB     len, len, NBYTES
 388         STORE   t0, 0(dst)
 389         .set    reorder                         /* DADDI_WAR */
 390         ADD     dst, dst, NBYTES
 391         bne     len, rem, 1b
 392         .set    noreorder
 393
 394 .Lcopy_bytes_checklen:
 395         beqz    len, .Ldone
 396          nop
 397 .Lcopy_bytes:
 398         /* 0 < len < NBYTES  */
 399 #define COPY_BYTE(N)                    \
 400 EXC(    lb      t0, N(src), .Ll_exc);   \
 401         SUB     len, len, 1;            \
 402         beqz    len, .Ldone;            \
 403          sb     t0, N(dst)
 404
 405         COPY_BYTE(0)
 406         COPY_BYTE(1)
 407 #ifdef USE_DOUBLE
 408         COPY_BYTE(2)
 409         COPY_BYTE(3)
 410         COPY_BYTE(4)
 411         COPY_BYTE(5)
 412 #endif
 413 EXC(    lb      t0, NBYTES-2(src), .Ll_exc)     /* only reached when NBYTES-1 bytes were left */
 414         SUB     len, len, 1
 415         jr      ra
 416          sb     t0, NBYTES-2(dst)
 417 .Ldone:
 418         jr      ra
 419          nop
 420         END(__copy_user_inatomic)
 421
 422 .Ll_exc_copy:
 423         /*
 424          * Copy bytes from src until faulting load address (or until a
 425          * lb faults)
 426          *
 427          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 428          * may be more than a byte beyond the last address.
 429          * Hence, the lb below may get an exception.
 430          *
 431          * Assumes src < THREAD_BUADDR($28)
              *
              * The loop stops when src reaches the address read into t0, or
              * earlier if the lb itself faults.  Either way control continues
              * at .Ll_exc, which follows this code directly.
 432          */
 433         LOAD    t0, TI_TASK($28)        /* t0 = value stored at offset TI_TASK from $28 */
 434          nop
 435         LOAD    t0, THREAD_BUADDR(t0)   /* t0 = THREAD_BUADDR value: where to stop */
 436 1:
 437 EXC(    lb      t1, 0(src),     .Ll_exc)
 438         ADD     src, src, 1
 439         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 440         .set    reorder                         /* DADDI_WAR */
 441         ADD     dst, dst, 1
 442         bne     src, t0, 1b             /* delay slot is left to the assembler here */
 443         .set    noreorder
 444 .Ll_exc:
             /*
              * Final fixup for a faulting load: set len to AT minus the
              * THREAD_BUADDR value and return to the caller.
              * NOTE(review): this handler does not write to dst, although the
              * Spec comment near the top of the file says a load-fault handler
              * must overwrite the whole destination buffer -- confirm that
              * callers of the inatomic variant do not depend on that.
              */
 445         LOAD    t0, TI_TASK($28)        /* t0 = value stored at offset TI_TASK from $28 */
 446          nop
 447         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 448          nop
 449         SUB     len, AT, t0             # len number of uncopied bytes
 450         jr      ra
 451          nop