release/src-rt-6.x/linux/linux-2.6/arch/mips/lib/csum_partial.S

   1 /*
   2  * This file is subject to the terms and conditions of the GNU General Public
   3  * License.  See the file "COPYING" in the main directory of this archive
   4  * for more details.
   5  *
   6  * Quick'n'dirty IP checksum ...
   7  *
   8  * Copyright (C) 1998, 1999 Ralf Baechle
   9  * Copyright (C) 1999 Silicon Graphics, Inc.
  10  */
  11 #include <linux/errno.h>
  12 #include <asm/asm.h>
  13 #include <asm/asm-offsets.h>
  14 #include <asm/regdef.h>
  15
  16 #ifdef CONFIG_64BIT
  17 /*
  18  * As we are sharing code base with the mips32 tree (which use the o32 ABI
  19  * register definitions). We need to redefine the register definitions from
  20  * the n64 ABI register naming to the o32 ABI register naming.
  21  */
  22 #undef t0
  23 #undef t1
  24 #undef t2
  25 #undef t3
  26 #define t0      $8
  27 #define t1      $9
  28 #define t2      $10
  29 #define t3      $11
  30 #define t4      $12
  31 #define t5      $13
  32 #define t6      $14
  33 #define t7      $15
  34
  35 #define USE_DOUBLE
  36 #endif
  37
  38 #ifdef USE_DOUBLE
  39
  40 #define LOAD   ld
  41 #define LOAD32 lwu
  42 #define ADD    daddu
  43 #define NBYTES 8
  44
  45 #else
  46
  47 #define LOAD   lw
  48 #define LOAD32 lw
  49 #define ADD    addu
  50 #define NBYTES 4
  51
  52 #endif /* USE_DOUBLE */
  53
  54 #define UNIT(unit)  ((unit)*NBYTES)
  55
  56 #define ADDC(sum,reg)                                           \
  57         ADD     sum, reg;                                       \
  58         sltu    v1, sum, reg;                                   \
  59         ADD     sum, v1
  60
  61 #define ADDC32(sum,reg)                                         \
  62         .set    push;                                           \
  63         .set    noat;                                           \
  64         addu    sum, reg;                                       \
  65         sltu    v1, sum, reg;                                   \
  66         addu    sum, v1;                                        \
  67         .set    pop
  68
  69 #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)    \
  70         LOAD    _t0, (offset + UNIT(0))(src);                   \
  71         LOAD    _t1, (offset + UNIT(1))(src);                   \
  72         LOAD    _t2, (offset + UNIT(2))(src);                   \
  73         LOAD    _t3, (offset + UNIT(3))(src);                   \
  74         ADDC(sum, _t0);                                         \
  75         ADDC(sum, _t1);                                         \
  76         ADDC(sum, _t2);                                         \
  77         ADDC(sum, _t3)
  78
  79 #ifdef USE_DOUBLE
  80 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
  81         CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
  82 #else
  83 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)     \
  84         CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);   \
  85         CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
  86 #endif
  87
  88 /*
  89  * a0: source address
  90  * a1: length of the area to checksum
  91  * a2: partial checksum
  92  */
  93
  94 #define src a0
  95 #define sum v0
  96
  97         .text
  98         .set    noreorder
  99         .align  5
 100 LEAF(csum_partial)
 101         move    sum, zero
 102         move    t7, zero
 103
 104         sltiu   t8, a1, 0x8
 105         bnez    t8, small_csumcpy               /* < 8 bytes to copy */
 106          move   t2, a1
 107
 108         andi    t7, src, 0x1                    /* odd buffer? */
 109
 110 hword_align:
 111         beqz    t7, word_align
 112          andi   t8, src, 0x2
 113
 114         lbu     t0, (src)
 115         LONG_SUBU       a1, a1, 0x1
 116 #ifdef __MIPSEL__
 117         sll     t0, t0, 8
 118 #endif
 119         ADDC(sum, t0)
 120         PTR_ADDU        src, src, 0x1
 121         andi    t8, src, 0x2
 122
 123 word_align:
 124         beqz    t8, dword_align
 125          sltiu  t8, a1, 56
 126
 127         lhu     t0, (src)
 128         LONG_SUBU       a1, a1, 0x2
 129         ADDC(sum, t0)
 130         sltiu   t8, a1, 56
 131         PTR_ADDU        src, src, 0x2
 132
 133 dword_align:
 134         bnez    t8, do_end_words
 135          move   t8, a1
 136
 137         andi    t8, src, 0x4
 138         beqz    t8, qword_align
 139          andi   t8, src, 0x8
 140
 141         LOAD32  t0, 0x00(src)
 142         LONG_SUBU       a1, a1, 0x4
 143         ADDC(sum, t0)
 144         PTR_ADDU        src, src, 0x4
 145         andi    t8, src, 0x8
 146
 147 qword_align:
 148         beqz    t8, oword_align
 149          andi   t8, src, 0x10
 150
 151 #ifdef USE_DOUBLE
 152         ld      t0, 0x00(src)
 153         LONG_SUBU       a1, a1, 0x8
 154         ADDC(sum, t0)
 155 #else
 156         lw      t0, 0x00(src)
 157         lw      t1, 0x04(src)
 158         LONG_SUBU       a1, a1, 0x8
 159         ADDC(sum, t0)
 160         ADDC(sum, t1)
 161 #endif
 162         PTR_ADDU        src, src, 0x8
 163         andi    t8, src, 0x10
 164
 165 oword_align:
 166         beqz    t8, begin_movement
 167          LONG_SRL       t8, a1, 0x7
 168
 169 #ifdef USE_DOUBLE
 170         ld      t0, 0x00(src)
 171         ld      t1, 0x08(src)
 172         ADDC(sum, t0)
 173         ADDC(sum, t1)
 174 #else
 175         CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)
 176 #endif
 177         LONG_SUBU       a1, a1, 0x10
 178         PTR_ADDU        src, src, 0x10
 179         LONG_SRL        t8, a1, 0x7
 180
 181 begin_movement:
 182         beqz    t8, 1f
 183          andi   t2, a1, 0x40
 184
 185 move_128bytes:
 186         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 187         CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 188         CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
 189         CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
 190         LONG_SUBU       t8, t8, 0x01
 191         bnez    t8, move_128bytes
 192          PTR_ADDU       src, src, 0x80
 193
 194 1:
 195         beqz    t2, 1f
 196          andi   t2, a1, 0x20
 197
 198 move_64bytes:
 199         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 200         CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
 201         PTR_ADDU        src, src, 0x40
 202
 203 1:
 204         beqz    t2, do_end_words
 205          andi   t8, a1, 0x1c
 206
 207 move_32bytes:
 208         CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
 209         andi    t8, a1, 0x1c
 210         PTR_ADDU        src, src, 0x20
 211
 212 do_end_words:
 213         beqz    t8, small_csumcpy
 214          andi   t2, a1, 0x3
 215         LONG_SRL        t8, t8, 0x2
 216
 217 end_words:
 218         LOAD32  t0, (src)
 219         LONG_SUBU       t8, t8, 0x1
 220         ADDC(sum, t0)
 221         bnez    t8, end_words
 222          PTR_ADDU       src, src, 0x4
 223
 224 /* unknown src alignment and < 8 bytes to go  */
 225 small_csumcpy:
 226         move    a1, t2
 227
 228         andi    t0, a1, 4
 229         beqz    t0, 1f
 230          andi   t0, a1, 2
 231
 232         /* Still a full word to go  */
 233         ulw     t1, (src)
 234         PTR_ADDIU       src, 4
 235 #ifdef USE_DOUBLE
 236         dsll    t1, t1, 32                      /* clear lower 32bit */
 237 #endif
 238         ADDC(sum, t1)
 239
 240 1:      move    t1, zero
 241         beqz    t0, 1f
 242          andi   t0, a1, 1
 243
 244         /* Still a halfword to go  */
 245         ulhu    t1, (src)
 246         PTR_ADDIU       src, 2
 247
 248 1:      beqz    t0, 1f
 249          sll    t1, t1, 16
 250
 251         lbu     t2, (src)
 252          nop
 253
 254 #ifdef __MIPSEB__
 255         sll     t2, t2, 8
 256 #endif
 257         or      t1, t2
 258
 259 1:      ADDC(sum, t1)
 260
 261         /* fold checksum */
 262 #ifdef USE_DOUBLE
 263         dsll32  v1, sum, 0
 264         daddu   sum, v1
 265         sltu    v1, sum, v1
 266         dsra32  sum, sum, 0
 267         addu    sum, v1
 268 #endif
 269         sll     v1, sum, 16
 270         addu    sum, v1
 271         sltu    v1, sum, v1
 272         srl     sum, sum, 16
 273         addu    sum, v1
 274
 275         /* odd buffer alignment? */
 276         beqz    t7, 1f
 277          nop
 278         sll     v1, sum, 8
 279         srl     sum, sum, 8
 280         or      sum, v1
 281         andi    sum, 0xffff
 282 1:
 283         .set    reorder
 284         /* Add the passed partial csum.  */
 285         ADDC32(sum, a2)
 286         jr      ra
 287         .set    noreorder
 288         END(csum_partial)
 289
 290
 291 /*
 292  * checksum and copy routines based on memcpy.S
 293  *
 294  *      csum_partial_copy_nocheck(src, dst, len, sum)
 295  *      __csum_partial_copy_user(src, dst, len, sum, errp)
 296  *
 297  * See "Spec" in memcpy.S for details.  Unlike __copy_user, all
 298  * function in this file use the standard calling convention.
 299  */
 300
 301 #define src a0
 302 #define dst a1
 303 #define len a2
 304 #define psum a3
 305 #define sum v0
 306 #define odd t8
 307 #define errptr t9
 308
 309 /*
 310  * The exception handler for loads requires that:
 311  *  1- AT contain the address of the byte just past the end of the source
 312  *     of the copy,
 313  *  2- src_entry <= src < AT, and
 314  *  3- (dst - src) == (dst_entry - src_entry),
 315  * The _entry suffix denotes values when __copy_user was called.
 316  *
 317  * (1) is set up up by __csum_partial_copy_from_user and maintained by
 318  *      not writing AT in __csum_partial_copy
 319  * (2) is met by incrementing src by the number of bytes copied
 320  * (3) is met by not doing loads between a pair of increments of dst and src
 321  *
 322  * The exception handlers for stores stores -EFAULT to errptr and return.
 323  * These handlers do not need to overwrite any data.
 324  */
 325
 326 #define EXC(inst_reg,addr,handler)              \
 327 9:      inst_reg, addr;                         \
 328         .section __ex_table,"a";                \
 329         PTR     9b, handler;                    \
 330         .previous
 331
 332 #ifdef USE_DOUBLE
 333
 334 #define LOAD   ld
 335 #define LOADL  ldl
 336 #define LOADR  ldr
 337 #define STOREL sdl
 338 #define STORER sdr
 339 #define STORE  sd
 340 #define ADD    daddu
 341 #define SUB    dsubu
 342 #define SRL    dsrl
 343 #define SLL    dsll
 344 #define SLLV   dsllv
 345 #define SRLV   dsrlv
 346 #define NBYTES 8
 347 #define LOG_NBYTES 3
 348
 349 #else
 350
 351 #define LOAD   lw
 352 #define LOADL  lwl
 353 #define LOADR  lwr
 354 #define STOREL swl
 355 #define STORER swr
 356 #define STORE  sw
 357 #define ADD    addu
 358 #define SUB    subu
 359 #define SRL    srl
 360 #define SLL    sll
 361 #define SLLV   sllv
 362 #define SRLV   srlv
 363 #define NBYTES 4
 364 #define LOG_NBYTES 2
 365
 366 #endif /* USE_DOUBLE */
 367
 368 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 369 #define LDFIRST LOADR
 370 #define LDREST  LOADL
 371 #define STFIRST STORER
 372 #define STREST  STOREL
 373 #define SHIFT_DISCARD SLLV
 374 #define SHIFT_DISCARD_REVERT SRLV
 375 #else
 376 #define LDFIRST LOADL
 377 #define LDREST  LOADR
 378 #define STFIRST STOREL
 379 #define STREST  STORER
 380 #define SHIFT_DISCARD SRLV
 381 #define SHIFT_DISCARD_REVERT SLLV
 382 #endif
 383
 384 #define FIRST(unit) ((unit)*NBYTES)
 385 #define REST(unit)  (FIRST(unit)+NBYTES-1)
 386
 387 #define ADDRMASK (NBYTES-1)
 388
 389         .set    noat
 390
 391 LEAF(__csum_partial_copy_user)
 392         PTR_ADDU        AT, src, len    /* See (1) above. */
 393 #ifdef CONFIG_64BIT
 394         move    errptr, a4
 395 #else
 396         lw      errptr, 16(sp)
 397 #endif
 398 FEXPORT(csum_partial_copy_nocheck)
 399         move    sum, zero
 400         move    odd, zero
 401         /*
 402          * Note: dst & src may be unaligned, len may be 0
 403          * Temps
 404          */
 405         /*
 406          * The "issue break"s below are very approximate.
 407          * Issue delays for dcache fills will perturb the schedule, as will
 408          * load queue full replay traps, etc.
 409          *
 410          * If len < NBYTES use byte operations.
 411          */
 412         sltu    t2, len, NBYTES
 413         and     t1, dst, ADDRMASK
 414         bnez    t2, copy_bytes_checklen
 415          and    t0, src, ADDRMASK
 416         andi    odd, dst, 0x1                   /* odd buffer? */
 417         bnez    t1, dst_unaligned
 418          nop
 419         bnez    t0, src_unaligned_dst_aligned
 420         /*
 421          * use delay slot for fall-through
 422          * src and dst are aligned; need to compute rem
 423          */
 424 both_aligned:
 425          SRL    t0, len, LOG_NBYTES+3    # +3 for 8 units/iter
 426         beqz    t0, cleanup_both_aligned # len < 8*NBYTES
 427          nop
 428         SUB     len, 8*NBYTES           # subtract here for bgez loop
 429         .align  4
 430 1:
 431 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 432 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 433 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 434 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 435 EXC(    LOAD    t4, UNIT(4)(src),       l_exc_copy)
 436 EXC(    LOAD    t5, UNIT(5)(src),       l_exc_copy)
 437 EXC(    LOAD    t6, UNIT(6)(src),       l_exc_copy)
 438 EXC(    LOAD    t7, UNIT(7)(src),       l_exc_copy)
 439         SUB     len, len, 8*NBYTES
 440         ADD     src, src, 8*NBYTES
 441 EXC(    STORE   t0, UNIT(0)(dst),       s_exc)
 442         ADDC(sum, t0)
 443 EXC(    STORE   t1, UNIT(1)(dst),       s_exc)
 444         ADDC(sum, t1)
 445 EXC(    STORE   t2, UNIT(2)(dst),       s_exc)
 446         ADDC(sum, t2)
 447 EXC(    STORE   t3, UNIT(3)(dst),       s_exc)
 448         ADDC(sum, t3)
 449 EXC(    STORE   t4, UNIT(4)(dst),       s_exc)
 450         ADDC(sum, t4)
 451 EXC(    STORE   t5, UNIT(5)(dst),       s_exc)
 452         ADDC(sum, t5)
 453 EXC(    STORE   t6, UNIT(6)(dst),       s_exc)
 454         ADDC(sum, t6)
 455 EXC(    STORE   t7, UNIT(7)(dst),       s_exc)
 456         ADDC(sum, t7)
 457         bgez    len, 1b
 458          ADD    dst, dst, 8*NBYTES
 459         ADD     len, 8*NBYTES           # revert len (see above)
 460
 461         /*
 462          * len == the number of bytes left to copy < 8*NBYTES
 463          */
 464 cleanup_both_aligned:
 465 #define rem t7
 466         beqz    len, done
 467          sltu   t0, len, 4*NBYTES
 468         bnez    t0, less_than_4units
 469          and    rem, len, (NBYTES-1)    # rem = len % NBYTES
 470         /*
 471          * len >= 4*NBYTES
 472          */
 473 EXC(    LOAD    t0, UNIT(0)(src),       l_exc)
 474 EXC(    LOAD    t1, UNIT(1)(src),       l_exc_copy)
 475 EXC(    LOAD    t2, UNIT(2)(src),       l_exc_copy)
 476 EXC(    LOAD    t3, UNIT(3)(src),       l_exc_copy)
 477         SUB     len, len, 4*NBYTES
 478         ADD     src, src, 4*NBYTES
 479 EXC(    STORE   t0, UNIT(0)(dst),       s_exc)
 480         ADDC(sum, t0)
 481 EXC(    STORE   t1, UNIT(1)(dst),       s_exc)
 482         ADDC(sum, t1)
 483 EXC(    STORE   t2, UNIT(2)(dst),       s_exc)
 484         ADDC(sum, t2)
 485 EXC(    STORE   t3, UNIT(3)(dst),       s_exc)
 486         ADDC(sum, t3)
 487         beqz    len, done
 488          ADD    dst, dst, 4*NBYTES
 489 less_than_4units:
 490         /*
 491          * rem = len % NBYTES
 492          */
 493         beq     rem, len, copy_bytes
 494          nop
 495 1:
 496 EXC(    LOAD    t0, 0(src),             l_exc)
 497         ADD     src, src, NBYTES
 498         SUB     len, len, NBYTES
 499 EXC(    STORE   t0, 0(dst),             s_exc)
 500         ADDC(sum, t0)
 501         bne     rem, len, 1b
 502          ADD    dst, dst, NBYTES
 503
 504         /*
 505          * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
 506          * A loop would do only a byte at a time with possible branch
 507          * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
 508          * because can't assume read-access to dst.  Instead, use
 509          * STREST dst, which doesn't require read access to dst.
 510          *
 511          * This code should perform better than a simple loop on modern,
 512          * wide-issue mips processors because the code has fewer branches and
 513          * more instruction-level parallelism.
 514          */
 515 #define bits t2
 516         beqz    len, done
 517          ADD    t1, dst, len    # t1 is just past last byte of dst
 518         li      bits, 8*NBYTES
 519         SLL     rem, len, 3     # rem = number of bits to keep
 520 EXC(    LOAD    t0, 0(src),             l_exc)
 521         SUB     bits, bits, rem # bits = number of bits to discard
 522         SHIFT_DISCARD t0, t0, bits
 523 EXC(    STREST  t0, -1(t1),             s_exc)
 524         SHIFT_DISCARD_REVERT t0, t0, bits
 525         .set reorder
 526         ADDC(sum, t0)
 527         b       done
 528         .set noreorder
 529 dst_unaligned:
 530         /*
 531          * dst is unaligned
 532          * t0 = src & ADDRMASK
 533          * t1 = dst & ADDRMASK; T1 > 0
 534          * len >= NBYTES
 535          *
 536          * Copy enough bytes to align dst
 537          * Set match = (src and dst have same alignment)
 538          */
 539 #define match rem
 540 EXC(    LDFIRST t3, FIRST(0)(src),      l_exc)
 541         ADD     t2, zero, NBYTES
 542 EXC(    LDREST  t3, REST(0)(src),       l_exc_copy)
 543         SUB     t2, t2, t1      # t2 = number of bytes copied
 544         xor     match, t0, t1
 545 EXC(    STFIRST t3, FIRST(0)(dst),      s_exc)
 546         SLL     t4, t1, 3               # t4 = number of bits to discard
 547         SHIFT_DISCARD t3, t3, t4
 548         /* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
 549         ADDC(sum, t3)
 550         beq     len, t2, done
 551          SUB    len, len, t2
 552         ADD     dst, dst, t2
 553         beqz    match, both_aligned
 554          ADD    src, src, t2
 555
 556 src_unaligned_dst_aligned:
 557         SRL     t0, len, LOG_NBYTES+2    # +2 for 4 units/iter
 558         beqz    t0, cleanup_src_unaligned
 559          and    rem, len, (4*NBYTES-1)   # rem = len % 4*NBYTES
 560 1:
 561 /*
 562  * Avoid consecutive LD*'s to the same register since some mips
 563  * implementations can't issue them in the same cycle.
 564  * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 565  * are to the same unit (unless src is aligned, but it's not).
 566  */
 567 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 568 EXC(    LDFIRST t1, FIRST(1)(src),      l_exc_copy)
 569         SUB     len, len, 4*NBYTES
 570 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 571 EXC(    LDREST  t1, REST(1)(src),       l_exc_copy)
 572 EXC(    LDFIRST t2, FIRST(2)(src),      l_exc_copy)
 573 EXC(    LDFIRST t3, FIRST(3)(src),      l_exc_copy)
 574 EXC(    LDREST  t2, REST(2)(src),       l_exc_copy)
 575 EXC(    LDREST  t3, REST(3)(src),       l_exc_copy)
 576         ADD     src, src, 4*NBYTES
 577 #ifdef CONFIG_CPU_SB1
 578         nop                             # improves slotting
 579 #endif
 580 EXC(    STORE   t0, UNIT(0)(dst),       s_exc)
 581         ADDC(sum, t0)
 582 EXC(    STORE   t1, UNIT(1)(dst),       s_exc)
 583         ADDC(sum, t1)
 584 EXC(    STORE   t2, UNIT(2)(dst),       s_exc)
 585         ADDC(sum, t2)
 586 EXC(    STORE   t3, UNIT(3)(dst),       s_exc)
 587         ADDC(sum, t3)
 588         bne     len, rem, 1b
 589          ADD    dst, dst, 4*NBYTES
 590
 591 cleanup_src_unaligned:
 592         beqz    len, done
 593          and    rem, len, NBYTES-1  # rem = len % NBYTES
 594         beq     rem, len, copy_bytes
 595          nop
 596 1:
 597 EXC(    LDFIRST t0, FIRST(0)(src),      l_exc)
 598 EXC(    LDREST  t0, REST(0)(src),       l_exc_copy)
 599         ADD     src, src, NBYTES
 600         SUB     len, len, NBYTES
 601 EXC(    STORE   t0, 0(dst),             s_exc)
 602         ADDC(sum, t0)
 603         bne     len, rem, 1b
 604          ADD    dst, dst, NBYTES
 605
 606 copy_bytes_checklen:
 607         beqz    len, done
 608          nop
 609 copy_bytes:
 610         /* 0 < len < NBYTES  */
 611 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 612 #define SHIFT_START 0
 613 #define SHIFT_INC 8
 614 #else
 615 #define SHIFT_START 8*(NBYTES-1)
 616 #define SHIFT_INC -8
 617 #endif
 618         move    t2, zero        # partial word
 619         li      t3, SHIFT_START # shift
 620 /* use l_exc_copy here to return correct sum on fault */
 621 #define COPY_BYTE(N)                    \
 622 EXC(    lbu     t0, N(src), l_exc_copy);        \
 623         SUB     len, len, 1;            \
 624 EXC(    sb      t0, N(dst), s_exc);     \
 625         SLLV    t0, t0, t3;             \
 626         addu    t3, SHIFT_INC;          \
 627         beqz    len, copy_bytes_done;   \
 628          or     t2, t0
 629
 630         COPY_BYTE(0)
 631         COPY_BYTE(1)
 632 #ifdef USE_DOUBLE
 633         COPY_BYTE(2)
 634         COPY_BYTE(3)
 635         COPY_BYTE(4)
 636         COPY_BYTE(5)
 637 #endif
 638 EXC(    lbu     t0, NBYTES-2(src), l_exc_copy)
 639         SUB     len, len, 1
 640 EXC(    sb      t0, NBYTES-2(dst), s_exc)
 641         SLLV    t0, t0, t3
 642         or      t2, t0
 643 copy_bytes_done:
 644         ADDC(sum, t2)
 645 done:
 646         /* fold checksum */
 647 #ifdef USE_DOUBLE
 648         dsll32  v1, sum, 0
 649         daddu   sum, v1
 650         sltu    v1, sum, v1
 651         dsra32  sum, sum, 0
 652         addu    sum, v1
 653 #endif
 654         sll     v1, sum, 16
 655         addu    sum, v1
 656         sltu    v1, sum, v1
 657         srl     sum, sum, 16
 658         addu    sum, v1
 659
 660         /* odd buffer alignment? */
 661         beqz    odd, 1f
 662          nop
 663         sll     v1, sum, 8
 664         srl     sum, sum, 8
 665         or      sum, v1
 666         andi    sum, 0xffff
 667 1:
 668         .set reorder
 669         ADDC32(sum, psum)
 670         jr      ra
 671         .set noreorder
 672
 673 l_exc_copy:
 674         /*
 675          * Copy bytes from src until faulting load address (or until a
 676          * lb faults)
 677          *
 678          * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
 679          * may be more than a byte beyond the last address.
 680          * Hence, the lb below may get an exception.
 681          *
 682          * Assumes src < THREAD_BUADDR($28)
 683          */
 684         LOAD    t0, TI_TASK($28)
 685          li     t2, SHIFT_START
 686         LOAD    t0, THREAD_BUADDR(t0)
 687 1:
 688 EXC(    lbu     t1, 0(src),     l_exc)
 689         ADD     src, src, 1
 690         sb      t1, 0(dst)      # can't fault -- we're copy_from_user
 691         SLLV    t1, t1, t2
 692         addu    t2, SHIFT_INC
 693         ADDC(sum, t1)
 694         bne     src, t0, 1b
 695          ADD    dst, dst, 1
 696 l_exc:
 697         LOAD    t0, TI_TASK($28)
 698          nop
 699         LOAD    t0, THREAD_BUADDR(t0)   # t0 is just past last good address
 700          nop
 701         SUB     len, AT, t0             # len number of uncopied bytes
 702         /*
 703          * Here's where we rely on src and dst being incremented in tandem,
 704          *   See (3) above.
 705          * dst += (fault addr - src) to put dst at first byte to clear
 706          */
 707         ADD     dst, t0                 # compute start address in a1
 708         SUB     dst, src
 709         /*
 710          * Clear len bytes starting at dst.  Can't call __bzero because it
 711          * might modify len.  An inefficient loop for these rare times...
 712          */
 713         beqz    len, done
 714          SUB    src, len, 1
 715 1:      sb      zero, 0(dst)
 716         ADD     dst, dst, 1
 717         bnez    src, 1b
 718          SUB    src, src, 1
 719         li      v1, -EFAULT
 720         b       done
 721          sw     v1, (errptr)
 722
 723 s_exc:
 724         li      v0, -1 /* invalid checksum */
 725         li      v1, -EFAULT
 726         jr      ra
 727          sw     v1, (errptr)
 728         END(__csum_partial_copy_user)