/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Quick'n'dirty IP checksum ...
 *
 * Copyright (C) 1998, 1999 Ralf Baechle
 * Copyright (C) 1999 Silicon Graphics, Inc.
 */
#include <linux/errno.h>
#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>
/*
 * As we are sharing code with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the register definitions from
 * the n64 ABI register naming to the o32 ABI register naming.
 */
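/*
 * For instance (illustrative only, the full table is not reproduced
 * here): o32 names $12-$15 t4-t7, while the n64 regdef.h calls those
 * same registers t0-t3, so the o32-style names used below would be
 * pinned down with definitions along the lines of
 *
 *	#define t4	$12
 *	#define t5	$13
 *	#define t6	$14
 *	#define t7	$15
 */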
#endif /* USE_DOUBLE */

#define UNIT(unit)	((unit)*NBYTES)
#define ADDC(sum,reg)						\
	ADD	sum, reg;					\
	sltu	v1, sum, reg;					\
	ADD	sum, v1

#define ADDC32(sum,reg)						\
	addu	sum, reg;					\
	sltu	v1, sum, reg;					\
	addu	sum, v1
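/*
 * In rough C terms (a sketch, not the literal expansion), each ADDC
 * step adds reg into sum and feeds the carry-out back in:
 *
 *	sum += reg;
 *	if (sum < reg)		(unsigned overflow == carry out)
 *		sum += 1;	(end-around carry)
 *
 * Re-absorbing carries this way is what allows the wide partial sum
 * to be folded down to a 16-bit ones-complement checksum at the end.
 */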
#define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)	\
	LOAD	_t0, (offset + UNIT(0))(src);			\
	LOAD	_t1, (offset + UNIT(1))(src);			\
	LOAD	_t2, (offset + UNIT(2))(src);			\
	LOAD	_t3, (offset + UNIT(3))(src);			\
	ADDC(sum, _t0);						\
	ADDC(sum, _t1);						\
	ADDC(sum, _t2);						\
	ADDC(sum, _t3)
#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3)
#else
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
	CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3);	\
	CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3)
#endif
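/*
 * Either way, one CSUM_BIGCHUNK consumes 32 bytes: a single pass of
 * four 8-byte loads with USE_DOUBLE, or two passes of four 4-byte
 * loads (the second pass at offset + 0x10) without it.
 */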
/*
 * a0: source address
 * a1: length of the area to checksum
 * a2: partial checksum
 */
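/*
 * At the C level this corresponds to the usual kernel prototype
 * (a sketch; the exact typedefs vary by kernel version):
 *
 *	unsigned int csum_partial(const unsigned char *buff,
 *				  int len, unsigned int sum);
 *
 * with buff in a0, len in a1, the partial sum to continue from in a2,
 * and the updated sum returned in v0.
 */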
	bnez	t8, small_csumcpy	/* < 8 bytes to copy */

	andi	t7, src, 0x1		/* odd buffer? */

	LONG_SUBU	a1, a1, 0x1

	PTR_ADDU	src, src, 0x1

	LONG_SUBU	a1, a1, 0x2

	PTR_ADDU	src, src, 0x2

	bnez	t8, do_end_words

	LONG_SUBU	a1, a1, 0x4

	PTR_ADDU	src, src, 0x4

	LONG_SUBU	a1, a1, 0x8

	LONG_SUBU	a1, a1, 0x8

	PTR_ADDU	src, src, 0x8

	beqz	t8, begin_movement

	CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4)

	LONG_SUBU	a1, a1, 0x10
	PTR_ADDU	src, src, 0x10
move_128bytes:
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4)
	LONG_SUBU	t8, t8, 0x01
	bnez	t8, move_128bytes
	 PTR_ADDU	src, src, 0x80
	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)
	CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4)
	PTR_ADDU	src, src, 0x40

	beqz	t2, do_end_words

	CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4)

	PTR_ADDU	src, src, 0x20
	beqz	t8, small_csumcpy

	LONG_SUBU	t8, t8, 0x1

	PTR_ADDU	src, src, 0x4
small_csumcpy:
	/* unknown src alignment and < 8 bytes to go */

	/* Still a full word to go */

	dsll	t1, t1, 32		/* clear lower 32bit */

	/* Still a halfword to go */
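	/*
	 * The remaining word/halfword/byte are added in and the wide
	 * sum is then folded down to 16 bits; in C terms, roughly:
	 *
	 *	sum = (sum >> 16) + (sum & 0xffff);
	 *	sum += sum >> 16;	(absorb any final carry)
	 */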
	/* odd buffer alignment? */
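	/*
	 * A ones-complement sum is endian-agnostic except that a buffer
	 * starting on an odd address leaves the accumulated bytes
	 * swapped; in that case (t7 != 0) the 16-bit result is
	 * byte-swapped before the caller's partial sum is added,
	 * roughly:
	 *
	 *	sum = ((sum & 0xff) << 8) | (sum >> 8);
	 */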
	/* Add the passed partial csum. */
/*
 * checksum and copy routines based on memcpy.S
 *
 *	csum_partial_copy_nocheck(src, dst, len, sum)
 *	__csum_partial_copy_user(src, dst, len, sum, errp)
 *
 * See "Spec" in memcpy.S for details. Unlike __copy_user, all
 * functions in this file use the standard calling convention.
 */
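/*
 * At the C level, roughly (a sketch; exact types vary by version):
 *
 *	unsigned int csum_partial_copy_nocheck(const unsigned char *src,
 *			unsigned char *dst, int len, unsigned int sum);
 *
 * __csum_partial_copy_user takes the same arguments plus an int *errp,
 * through which -EFAULT is reported if a user access faults.
 */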
/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by __csum_partial_copy_from_user and maintained by
 *	not writing AT in __csum_partial_copy
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores store -EFAULT to errptr and return.
 * These handlers do not need to overwrite any data.
 */
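/*
 * In other words: (1) and (2) let a load handler compute the number
 * of uncopied bytes as AT - (faulting address), and (3) lets it
 * recover the matching dst position as dst + (faulting address - src);
 * that is exactly what l_exc does below.
 */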
#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
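/*
 * Each EXC use thus emits the access at local label 9 and records a
 * (faulting-instruction, fixup-handler) address pair in the kernel's
 * __ex_table section; if the access faults on a bad user pointer, the
 * trap code looks up the faulting PC there and resumes execution at
 * the named handler instead of oopsing.
 */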
#endif /* USE_DOUBLE */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD	SLLV
#define SHIFT_DISCARD_REVERT	SRLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD	SRLV
#define SHIFT_DISCARD_REVERT	SLLV
#endif
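/*
 * LOADL/LOADR are lwl/lwr (ldl/ldr with USE_DOUBLE). For example, in
 * the 32-bit big-endian case an unaligned word at src is picked up by
 *
 *	LDFIRST	t0, FIRST(0)(src)	# lwl t0, 0(src)
 *	LDREST	t0, REST(0)(src)	# lwr t0, 3(src)
 *
 * lwl fills the most-significant bytes and lwr the least-significant
 * ones, so the pair assembles the word regardless of alignment.
 */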
#define FIRST(unit)	((unit)*NBYTES)
#define REST(unit)	(FIRST(unit)+NBYTES-1)

#define ADDRMASK	(NBYTES-1)
LEAF(__csum_partial_copy_user)
	PTR_ADDU	AT, src, len	/* See (1) above. */

FEXPORT(csum_partial_copy_nocheck)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 *
	 * The "issue break"s below are very approximate.
	 * Issue delays for dcache fills will perturb the schedule, as will
	 * load queue full replay traps, etc.
	 *
	 * If len < NBYTES use byte operations.
	 */
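	/*
	 * Dispatch: a short length (len < NBYTES, flagged in t2 by an
	 * earlier test) goes to copy_bytes_checklen; an unaligned dst
	 * goes to dst_unaligned; an unaligned src with dst aligned goes
	 * to src_unaligned_dst_aligned; otherwise both are aligned and
	 * we fall into the 8*NBYTES-per-iteration main loop.
	 */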
	and	t1, dst, ADDRMASK
	bnez	t2, copy_bytes_checklen
	 and	t0, src, ADDRMASK
	andi	odd, dst, 0x1			/* odd buffer? */
	bnez	t1, dst_unaligned
	 nop
	bnez	t0, src_unaligned_dst_aligned
	/*
	 * use delay slot for fall-through
	 * src and dst are aligned; need to compute rem
	 */
both_aligned:
	SRL	t0, len, LOG_NBYTES+3	# +3 for 8 units/iter
	beqz	t0, cleanup_both_aligned # len < 8*NBYTES
	 nop
	SUB	len, 8*NBYTES		# subtract here for bgez loop
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
EXC(	LOAD	t4, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t5, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t6, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t7, UNIT(7)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
	ADD	src, src, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc)
	ADDC(sum, t3)
EXC(	STORE	t4, UNIT(4)(dst),	s_exc)
	ADDC(sum, t4)
EXC(	STORE	t5, UNIT(5)(dst),	s_exc)
	ADDC(sum, t5)
EXC(	STORE	t6, UNIT(6)(dst),	s_exc)
	ADDC(sum, t6)
EXC(	STORE	t7, UNIT(7)(dst),	s_exc)
	ADDC(sum, t7)
	bgez	len, 1b
	 ADD	dst, dst, 8*NBYTES
	ADD	len, 8*NBYTES		# revert len (see above)
	/*
	 * len == the number of bytes left to copy < 8*NBYTES
	 */
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc)
	ADDC(sum, t3)
	ADD	dst, dst, 4*NBYTES
less_than_4units:
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LOAD	t0, 0(src),		l_exc)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc)
	/*
	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
	 * A loop would do only a byte at a time with possible branch
	 * mispredicts.  Can't do an explicit LOAD dst,mask,or,STORE
	 * because can't assume read-access to dst.  Instead, use
	 * STREST dst, which doesn't require read access to dst.
	 *
	 * This code should perform better than a simple loop on modern,
	 * wide-issue mips processors because the code has fewer branches and
	 * more instruction-level parallelism.
	 */
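	/*
	 * For example, in the 32-bit case with len == 3: rem = 24 bits
	 * to keep, bits = 8 to discard. SHIFT_DISCARD shifts the
	 * unwanted byte out of t0, STREST writes just the 3 surviving
	 * bytes ending at dst+len-1, and SHIFT_DISCARD_REVERT restores
	 * t0 so the same 3 bytes are added into the checksum.
	 */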
#define bits t2
	ADD	t1, dst, len	# t1 is just past last byte of dst
	li	bits, 8*NBYTES
	SLL	rem, len, 3	# rem = number of bits to keep
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	bits, bits, rem	# bits = number of bits to discard
	SHIFT_DISCARD	t0, t0, bits
EXC(	STREST	t0, -1(t1),		s_exc)
	SHIFT_DISCARD_REVERT	t0, t0, bits
	ADDC(sum, t0)
dst_unaligned:
	/*
	 * t0 = src & ADDRMASK
	 * t1 = dst & ADDRMASK; t1 > 0
	 *
	 * Copy enough bytes to align dst
	 * Set match = (src and dst have same alignment)
	 */
#define match rem
EXC(	LDFIRST	t3, FIRST(0)(src),	l_exc)
	ADD	t2, zero, NBYTES
EXC(	LDREST	t3, REST(0)(src),	l_exc_copy)
	SUB	t2, t2, t1	# t2 = number of bytes copied
	xor	match, t0, t1
EXC(	STFIRST	t3, FIRST(0)(dst),	s_exc)
	SLL	t4, t1, 3		# t4 = number of bits to discard
	SHIFT_DISCARD	t3, t3, t4
	/* no SHIFT_DISCARD_REVERT to handle odd buffer properly */
	ADDC(sum, t3)
	beq	len, t2, done
	 SUB	len, len, t2
	ADD	dst, dst, t2
	beqz	match, both_aligned
	 ADD	src, src, t2
src_unaligned_dst_aligned:
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
#ifdef CONFIG_CPU_SB1
	nop				# improves slotting
#endif
EXC(	STORE	t0, UNIT(0)(dst),	s_exc)
	ADDC(sum, t0)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc)
	ADDC(sum, t1)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc)
	ADDC(sum, t2)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc)
	ADDC(sum, t3)
	ADD	dst, dst, 4*NBYTES
cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	ADD	src, src, NBYTES
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc)
copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define SHIFT_START 0
#define SHIFT_INC 8
#else
#define SHIFT_START 8*(NBYTES-1)
#define SHIFT_INC -8
#endif
	move	t2, zero	# partial word
	li	t3, SHIFT_START	# shift
/* use l_exc_copy here to return correct sum on fault */
#define COPY_BYTE(N)			\
EXC(	lbu	t0, N(src), l_exc_copy);	\
	SUB	len, len, 1;		\
EXC(	sb	t0, N(dst), s_exc);	\
	SLLV	t0, t0, t3;		\
	addu	t3, SHIFT_INC;		\
	beqz	len, copy_bytes_done;	\
	 or	t2, t0
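/*
 * I.e. each COPY_BYTE(N) copies one byte and also deposits it into
 * the partial word t2 at the current shift position, so t2 can be
 * ADDC'd into the checksum afterwards; in C terms, roughly:
 *
 *	t2 |= (unsigned long)byte << t3;
 *	t3 += SHIFT_INC;
 *
 * SHIFT_START/SHIFT_INC walk the byte lanes in memory order for
 * either endianness.
 */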
EXC(	lbu	t0, NBYTES-2(src), l_exc_copy)

EXC(	sb	t0, NBYTES-2(dst), s_exc)
	/* odd buffer alignment? */
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lbu	t1, 0(src), l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
l_exc:
	LOAD	t0, TI_TASK($28)
	 nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	 nop
	SUB	len, AT, t0		# len number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	 SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
s_exc:
	li	v0, -1			/* invalid checksum */
	END(__csum_partial_copy_user)