arch/sparc/lib/checksum_32.S

   1 /* checksum.S: Sparc optimized checksum code.
   2  *
   3  *  Copyright(C) 1995 Linus Torvalds
   4  *  Copyright(C) 1995 Miguel de Icaza
   5  *  Copyright(C) 1996 David S. Miller
   6  *  Copyright(C) 1997 Jakub Jelinek
   7  *
   8  * derived from:
   9  *      Linux/Alpha checksum c-code
  10  *      Linux/ix86 inline checksum assembly
  11  *      RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
  12  *      David Mosberger-Tang for optimized reference c-code
  13  *      BSD4.4 portable checksum routine
  14  */
  15
  16 #include <asm/errno.h>
   17
      /*
       * CSUM_BIGCHUNK: add the 0x20 bytes at [buf + offset] into sum.
       *
       * The data is fetched with four ldd's, each filling a register pair
       * (t0/t1, t2/t3, t4/t5, then t0/t1 again for the last doubleword),
       * and the eight words are accumulated with addxcc.  Since every add
       * is an addxcc, the carry flag on entry is added in and the carry out
       * of the last add is left in the condition codes: csum_partial clears
       * the carry before the first chunk and collects the final carry with
       * an addx after the last one.
       */
   18 #define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \
   19         ldd     [buf + offset + 0x00], t0;                      \
   20         ldd     [buf + offset + 0x08], t2;                      \
   21         addxcc  t0, sum, sum;                                   \
   22         addxcc  t1, sum, sum;                                   \
   23         ldd     [buf + offset + 0x10], t4;                      \
   24         addxcc  t2, sum, sum;                                   \
   25         addxcc  t3, sum, sum;                                   \
   26         ldd     [buf + offset + 0x18], t0;                      \
   27         addxcc  t4, sum, sum;                                   \
   28         addxcc  t5, sum, sum;                                   \
   29         addxcc  t0, sum, sum;                                   \
   30         addxcc  t1, sum, sum;
   31
      /*
       * CSUM_LASTCHUNK: add the 16 bytes starting at [buf - offset - 0x08]
       * into sum, carrying through the condition codes like CSUM_BIGCHUNK.
       *
       * Each expansion is exactly six instructions (24 bytes of code for 16
       * bytes of data).  csum_partial jumps into a table of these at
       * (cpte - 8) - (len & 0x70) * 3 / 2, so the instruction count of this
       * macro must not change without adjusting that computation.
       */
   32 #define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3)        \
   33         ldd     [buf - offset - 0x08], t0;                      \
   34         ldd     [buf - offset - 0x00], t2;                      \
   35         addxcc  t0, sum, sum;                                   \
   36         addxcc  t1, sum, sum;                                   \
   37         addxcc  t2, sum, sum;                                   \
   38         addxcc  t3, sum, sum;
  39
   40         /* Do end cruft out of band to get better cache patterns. */
              /*
               * csum_partial_end_cruft: sum the last (len & 0xf) bytes.
               *
               * Reached from cpte with %o0 = buf, %o1 = len, %o2 = running
               * sum, and the condition codes set by "andcc %o1, 8, %g0" in
               * the delay slot of the branch that got us here.  Each stage
               * tests the next lower length bit in the delay slot of the
               * previous branch: 8 bytes (ldd), then 4 bytes (ld), then the
               * 1-3 trailing bytes.
               *
               * Trailing bytes are assembled in %o4 before being added:
               *   halfword only    -> hword
               *   halfword + byte  -> (hword << 16) | (byte << 8)
               *   byte only        -> byte << 8
               *
               * Returns the sum, with the last carry added in, in %o0.
               */
   41 csum_partial_end_cruft:
   42         be      1f                              ! caller asks %o1 & 0x8
   43          andcc  %o1, 4, %g0                     ! nope, check for word remaining
   44         ldd     [%o0], %g2                      ! load two
   45         addcc   %g2, %o2, %o2                   ! add first word to sum
   46         addxcc  %g3, %o2, %o2                   ! add second word as well
   47         add     %o0, 8, %o0                     ! advance buf ptr
   48         addx    %g0, %o2, %o2                   ! add in final carry
   49         andcc   %o1, 4, %g0                     ! check again for word remaining
   50 1:      be      1f                              ! nope, skip this code
   51          andcc  %o1, 3, %o1                     ! check for trailing bytes
   52         ld      [%o0], %g2                      ! load it
   53         addcc   %g2, %o2, %o2                   ! add to sum
   54         add     %o0, 4, %o0                     ! advance buf ptr
   55         addx    %g0, %o2, %o2                   ! add in final carry
   56         andcc   %o1, 3, %g0                     ! check again for trailing bytes
   57 1:      be      1f                              ! no trailing bytes, return
   58          addcc  %o1, -1, %g0                    ! only one byte remains?
   59         bne     2f                              ! at least two bytes more
   60          subcc  %o1, 2, %o1                     ! only two bytes more?
   61         b       4f                              ! only one byte remains
   62          or     %g0, %g0, %o4                   ! clear fake hword value
   63 2:      lduh    [%o0], %o4                      ! get hword
   64         be      6f                              ! jmp if only hword remains
   65          add    %o0, 2, %o0                     ! advance buf ptr either way
   66         sll     %o4, 16, %o4                    ! create upper hword
   67 4:      ldub    [%o0], %o5                      ! get final byte
   68         sll     %o5, 8, %o5                     ! put into place
   69         or      %o5, %o4, %o4                   ! coalese with hword (if any)
   70 6:      addcc   %o4, %o2, %o2                   ! add to sum
   71 1:      retl                                    ! get outta here
   72          addx   %g0, %o2, %o0                   ! add final carry into retval
  73
   74         /* Also do alignment out of band to get better cache patterns. */
              /*
               * csum_partial_fix_alignment: entered from csum_partial when
               * (buf & 7) != 0, with %o0 = buf, %o1 = len, %o2 = sum.
               *
               * If len < 6 (signed compare) nothing is aligned here; control
               * goes to the "andcc %o1, 0xf, %g0" located one instruction
               * before cpte, which then dispatches to the end-cruft code.
               *
               * Otherwise:
               *  - if (buf & 2), consume one halfword.  It is added as
               *    hword << 16 and the carry out of that add is added back
               *    into the upper 16 bits of the sum, leaving the lower 16
               *    bits unchanged;
               *  - if (buf & 4), consume one word (addcc followed by addx).
               *
               * Both exits go to cpa with %o3 = len & 0xffffff80 and the
               * condition codes set from that andcc, exactly as csum_partial
               * itself would have arrived there.
               */
   75 csum_partial_fix_alignment:
   76         cmp     %o1, 6
   77         bl      cpte - 0x4
   78          andcc  %o0, 0x2, %g0
   79         be      1f
   80          andcc  %o0, 0x4, %g0
   81         lduh    [%o0 + 0x00], %g2
   82         sub     %o1, 2, %o1
   83         add     %o0, 2, %o0
   84         sll     %g2, 16, %g2
   85         addcc   %g2, %o2, %o2
   86         srl     %o2, 16, %g3
   87         addx    %g0, %g3, %g2
   88         sll     %o2, 16, %o2
   89         sll     %g2, 16, %g3
   90         srl     %o2, 16, %o2
   91         andcc   %o0, 0x4, %g0
   92         or      %g3, %o2, %o2
   93 1:      be      cpa
   94          andcc  %o1, 0xffffff80, %o3
   95         ld      [%o0 + 0x00], %g2
   96         sub     %o1, 4, %o1
   97         addcc   %g2, %o2, %o2
   98         add     %o0, 4, %o0
   99         addx    %g0, %o2, %o2
  100         b       cpa
  101          andcc  %o1, 0xffffff80, %o3
 102
  103         /* The common case is to get called with a nicely aligned
  104          * buffer of size 0x20.  Follow the code path for that case.
  105          */
              /*
               * csum_partial: add the 32-bit words of a buffer into a
               * running sum with end-around carry.
               *
               * In:   %o0 = buf, %o1 = len, %o2 = initial sum
               * Out:  %o0 = resulting sum
               * Uses: %o3-%o5, %g1-%g5, %g7 and the condition codes
               *
               * Structure:
               *  1. buffers with (buf & 7) != 0 detour through
               *     csum_partial_fix_alignment and rejoin at cpa;
               *  2. the loop at 5: consumes 128 bytes per iteration
               *     (%o3 = len & 0xffffff80 is the byte count for the loop);
               *  3. the remaining (len & 0x70) bytes are handled by jumping
               *     into the cptbl table: the target is (cpte - 8), the addx
               *     that follows the table, minus 24 bytes of code for every
               *     16 bytes of data (%g1 + %g1 / 2);
               *  4. cpte hands any remaining (len & 0xf) bytes to
               *     csum_partial_end_cruft.
               *
               * The chunk macros add with addxcc throughout, so the carry
               * flag must be clear when the loop or the table is entered;
               * the andcc instructions in the delay slots take care of that.
               */
  106         .globl  csum_partial
  107 csum_partial:                   /* %o0=buf, %o1=len, %o2=sum */
  108         andcc   %o0, 0x7, %g0                           ! alignment problems?
  109         bne     csum_partial_fix_alignment              ! yep, handle it
  110          sethi  %hi(cpte - 8), %g7                      ! prepare table jmp ptr
  111         andcc   %o1, 0xffffff80, %o3                    ! num loop iterations
  112 cpa:    be      3f                                      ! none to do
  113          andcc  %o1, 0x70, %g1                          ! clears carry flag too
  114 5:      CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
  115         CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
  116         CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
  117         CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
  118         addx    %g0, %o2, %o2                           ! sink in final carry
  119         subcc   %o3, 128, %o3                           ! detract from loop iters
  120         bne     5b                                      ! more to do
  121          add    %o0, 128, %o0                           ! advance buf ptr
  122         andcc   %o1, 0x70, %g1                          ! clears carry flag too
  123 3:      be      cpte                                    ! nope
  124          andcc  %o1, 0xf, %g0                           ! anything left at all?
  125         srl     %g1, 1, %o4                             ! compute offset
  126         sub     %g7, %g1, %g7                           ! adjust jmp ptr
  127         sub     %g7, %o4, %g7                           ! final jmp ptr adjust
  128         jmp     %g7 + %lo(cpte - 8)                     ! enter the table
  129          add    %o0, %g1, %o0                           ! advance buf ptr
              /*
               * Table of 16-byte chunks.  %o0 already points past the data,
               * hence the negative offsets; entering N entries before the
               * end sums the last N * 16 bytes below %o0.
               */
  130 cptbl:  CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5)
  131         CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5)
  132         CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5)
  133         CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5)
  134         CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5)
  135         CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5)
  136         CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5)
  137         addx    %g0, %o2, %o2                           ! fetch final carry
  138         andcc   %o1, 0xf, %g0                           ! anything left at all?
  139 cpte:   bne     csum_partial_end_cruft                  ! yep, handle it
  140          andcc  %o1, 8, %g0                             ! check how much
  141 cpout:  retl                                            ! get outta here
  142          mov    %o2, %o0                                ! return computed csum
 143
  144         .globl __csum_partial_copy_start, __csum_partial_copy_end
  145 __csum_partial_copy_start:
  146
  147 /* Work around cpp -rob */
  148 #define ALLOC #alloc
  149 #define EXECINSTR #execinstr
      /*
       * EX(x,y,a,b): emit the instruction "x,y" at local label 98 and give it
       * a private fixup stub.  The stub (label 99, placed in .fixup) branches
       * to the common handler at 30f and executes "a, b, %o3" in the delay
       * slot, so a and b name an operation and its two source operands whose
       * result becomes %o3 (documented at 30: as the number of bytes to zero
       * out).  The __ex_table entry pairs the instruction (98b) with the stub
       * (99b).
       */
  150 #define EX(x,y,a,b)                             \
  151 98:     x,y;                                    \
  152         .section .fixup,ALLOC,EXECINSTR;        \
  153         .align  4;                              \
  154 99:     ba 30f;                                 \
  155          a, b, %o3;                             \
  156         .section __ex_table,ALLOC;              \
  157         .align  4;                              \
  158         .word   98b, 99b;                       \
  159         .text;                                  \
  160         .align  4
  161
      /*
       * EX2(x,y): emit the instruction "x,y" with an __ex_table entry that
       * points straight at the common handler 30f.  There is no stub, so %o3
       * is not recomputed on this path.
       */
  162 #define EX2(x,y)                                \
  163 98:     x,y;                                    \
  164         .section __ex_table,ALLOC;              \
  165         .align  4;                              \
  166         .word   98b, 30f;                       \
  167         .text;                                  \
  168         .align  4
  169
      /*
       * EX3(x,y): like EX2, but the fixup target is 96f, which derives %o3
       * from %g1 and %g4 before falling into 30.
       */
  170 #define EX3(x,y)                                \
  171 98:     x,y;                                    \
  172         .section __ex_table,ALLOC;              \
  173         .align  4;                              \
  174         .word   98b, 96f;                       \
  175         .text;                                  \
  176         .align  4
  177
      /*
       * EXT(start,end,handler): emit a four-word __ex_table entry
       * "start, 0, end, handler" describing a whole range of instructions
       * instead of a single one.  It is used for the unrolled loops and the
       * jump table, whose handlers are 20f, 21f and 22f.
       * NOTE(review): how the trap code interprets this entry format is not
       * visible in this file - confirm against the exception table search
       * code before changing it.
       */
  178 #define EXT(start,end,handler)                  \
  179         .section __ex_table,ALLOC;              \
  180         .align  4;                              \
  181         .word   start, 0, end, handler;         \
  182         .text;                                  \
  183         .align  4
 184
  185         /* This aligned version executes typically in 8.5 superscalar cycles, this
  186          * is the best I can do.  I say 8.5 because the final add will pair with
  187          * the next ldd in the main unrolled loop.  Thus the pipe is always full.
  188          * If you change these macros (including order of instructions),
  189          * please check the fixup code below as well.
  190          */
              /*
               * CSUMCOPY_BIGCHUNK_ALIGNED: copy the 0x20 bytes at [src + off]
               * to [dst + off] and add them into sum.  Loads use ldd and
               * stores use std, so registers are consumed as the pairs t0/t1,
               * t2/t3, t4/t5 and t6/t7.  The adds are all addxcc, so the
               * carry chains in from, and out to, the surrounding code.
               *
               * One expansion is 16 instructions, with ldd's at positions
               * 0, 1, 3 and 5; the fixup code at 21: decodes a fault position
               * using exactly these numbers.
               */
  191 #define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)   \
  192         ldd     [src + off + 0x00], t0;                                                 \
  193         ldd     [src + off + 0x08], t2;                                                 \
  194         addxcc  t0, sum, sum;                                                           \
  195         ldd     [src + off + 0x10], t4;                                                 \
  196         addxcc  t1, sum, sum;                                                           \
  197         ldd     [src + off + 0x18], t6;                                                 \
  198         addxcc  t2, sum, sum;                                                           \
  199         std     t0, [dst + off + 0x00];                                                 \
  200         addxcc  t3, sum, sum;                                                           \
  201         std     t2, [dst + off + 0x08];                                                 \
  202         addxcc  t4, sum, sum;                                                           \
  203         std     t4, [dst + off + 0x10];                                                 \
  204         addxcc  t5, sum, sum;                                                           \
  205         std     t6, [dst + off + 0x18];                                                 \
  206         addxcc  t6, sum, sum;                                                           \
  207         addxcc  t7, sum, sum;
 208
  209         /* 12 superscalar cycles seems to be the limit for this case,
  210          * because of this we thus do all the ldd's together to get
  211          * Viking MXCC into streaming mode.  Ho hum...
  212          */
              /*
               * CSUMCOPY_BIGCHUNK: copy the 0x20 bytes at [src + off] to
               * [dst + off] and add them into sum.  The source is read with
               * ldd (register pairs t0/t1 ... t6/t7) but the destination is
               * written one word at a time with st; the caller uses this
               * variant when (dst & 4) is set.
               *
               * One expansion is 20 instructions, the first four of which
               * are the ldd's; the fixup code at 20: relies on both facts.
               */
  213 #define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)   \
  214         ldd     [src + off + 0x00], t0;                                         \
  215         ldd     [src + off + 0x08], t2;                                         \
  216         ldd     [src + off + 0x10], t4;                                         \
  217         ldd     [src + off + 0x18], t6;                                         \
  218         st      t0, [dst + off + 0x00];                                         \
  219         addxcc  t0, sum, sum;                                                   \
  220         st      t1, [dst + off + 0x04];                                         \
  221         addxcc  t1, sum, sum;                                                   \
  222         st      t2, [dst + off + 0x08];                                         \
  223         addxcc  t2, sum, sum;                                                   \
  224         st      t3, [dst + off + 0x0c];                                         \
  225         addxcc  t3, sum, sum;                                                   \
  226         st      t4, [dst + off + 0x10];                                         \
  227         addxcc  t4, sum, sum;                                                   \
  228         st      t5, [dst + off + 0x14];                                         \
  229         addxcc  t5, sum, sum;                                                   \
  230         st      t6, [dst + off + 0x18];                                         \
  231         addxcc  t6, sum, sum;                                                   \
  232         st      t7, [dst + off + 0x1c];                                         \
  233         addxcc  t7, sum, sum;
 234
  235         /* Yuck, 6 superscalar cycles... */
              /*
               * CSUMCOPY_LASTCHUNK: copy the 16 bytes starting at
               * [src - off - 0x08] to [dst - off - 0x08] and add them into
               * sum (addxcc throughout, so the carry chains through).
               *
               * One expansion is 10 instructions (40 bytes of code for 16
               * bytes of data).  Both the computed jump into cctbl and the
               * fixup code at 22: depend on that count.
               */
  236 #define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3)  \
  237         ldd     [src - off - 0x08], t0;                         \
  238         ldd     [src - off - 0x00], t2;                         \
  239         addxcc  t0, sum, sum;                                   \
  240         st      t0, [dst - off - 0x08];                         \
  241         addxcc  t1, sum, sum;                                   \
  242         st      t1, [dst - off - 0x04];                         \
  243         addxcc  t2, sum, sum;                                   \
  244         st      t2, [dst - off - 0x00];                         \
  245         addxcc  t3, sum, sum;                                   \
  246         st      t3, [dst - off + 0x04];
 247
  248         /* Handle the end cruft code out of band for better cache patterns. */
              /*
               * cc_end_cruft: copy and sum the last (len & 0xf) bytes.
               *
               * Reached from ccte with %o0 = src, %o1 = dst, %o3 = len & 0xf,
               * %g7 = running sum, and the condition codes set by
               * "andcc %o3, 8, %g0" in the delay slot of the branch at ccte.
               * The stages mirror csum_partial_end_cruft: 8 bytes, 4 bytes,
               * then 1-3 trailing bytes assembled in %o4 as
               *   halfword only    -> hword
               *   halfword + byte  -> (hword << 16) | (byte << 8)
               *   byte only        -> byte << 8
               *
               * Every load is wrapped in EX() with an expression giving the
               * value to put in %o3 if that load faults; every store is
               * wrapped in EX2().  Returns the sum in %o0.
               */
  249 cc_end_cruft:
  250         be      1f
  251          andcc  %o3, 4, %g0
  252         EX(ldd  [%o0 + 0x00], %g2, and %o3, 0xf)
  253         add     %o1, 8, %o1
  254         addcc   %g2, %g7, %g7
  255         add     %o0, 8, %o0
  256         addxcc  %g3, %g7, %g7
  257         EX2(st  %g2, [%o1 - 0x08])
  258         addx    %g0, %g7, %g7
  259         andcc   %o3, 4, %g0
  260         EX2(st  %g3, [%o1 - 0x04])
  261 1:      be      1f
  262          andcc  %o3, 3, %o3
  263         EX(ld   [%o0 + 0x00], %g2, add %o3, 4)
  264         add     %o1, 4, %o1
  265         addcc   %g2, %g7, %g7
  266         EX2(st  %g2, [%o1 - 0x04])
  267         addx    %g0, %g7, %g7
  268         andcc   %o3, 3, %g0
  269         add     %o0, 4, %o0
  270 1:      be      1f
  271          addcc  %o3, -1, %g0
  272         bne     2f
  273          subcc  %o3, 2, %o3
  274         b       4f
  275          or     %g0, %g0, %o4
  276 2:      EX(lduh [%o0 + 0x00], %o4, add %o3, 2)
  277         add     %o0, 2, %o0
  278         EX2(sth %o4, [%o1 + 0x00])
  279         be      6f
  280          add    %o1, 2, %o1
  281         sll     %o4, 16, %o4
  282 4:      EX(ldub [%o0 + 0x00], %o5, add %g0, 1)
  283         EX2(stb %o5, [%o1 + 0x00])
  284         sll     %o5, 8, %o5
  285         or      %o5, %o4, %o4
  286 6:      addcc   %o4, %g7, %g7
  287 1:      retl
  288          addx   %g0, %g7, %o0
 289
  290         /* Also, handle the alignment code out of band. */
              /*
               * cc_dword_align: entered from __csum_partial_copy_sparc_generic
               * when (src & 7) != 0, with %o0 = src, %o1 = dst, %g1 = len,
               * %g7 = sum.  The caller has already checked that src and dst
               * agree in their low two bits.
               *
               * Short copies (len < 16): the loop at 2: walks
               * %o3 = len >> 1, len >> 2, ... and leaves for the alignment
               * code at 1: as soon as (%o3 & src) is non-zero.  If %o3
               * reaches zero first, control goes directly to ccte with
               * %o3 = len & 0xf (set in the annulled delay slot).
               *
               * Alignment code at 1:
               *  - odd src goes to the byte-wise ccslow path;
               *  - if (src & 2), copy one halfword; it is added to the sum
               *    as hword << 16 with the carry added back into the upper
               *    16 bits;
               *  - if (src & 4), copy one word.
               * Both exits branch to the "3:" label in the main routine with
               * the condition codes set from "andcc %g1, 0xffffff80, %g0".
               */
  291 cc_dword_align:
  292         cmp     %g1, 16
  293         bge     1f
  294          srl    %g1, 1, %o3
  295 2:      cmp     %o3, 0
  296         be,a    ccte
  297          andcc  %g1, 0xf, %o3
  298         andcc   %o3, %o0, %g0   ! Check %o0 only (%o1 has the same last 2 bits)
  299         be,a    2b
  300          srl    %o3, 1, %o3
  301 1:      andcc   %o0, 0x1, %g0
  302         bne     ccslow
  303          andcc  %o0, 0x2, %g0
  304         be      1f
  305          andcc  %o0, 0x4, %g0
  306         EX(lduh [%o0 + 0x00], %g4, add %g1, 0)
  307         sub     %g1, 2, %g1
  308         EX2(sth %g4, [%o1 + 0x00])
  309         add     %o0, 2, %o0
  310         sll     %g4, 16, %g4
  311         addcc   %g4, %g7, %g7
  312         add     %o1, 2, %o1
  313         srl     %g7, 16, %g3
  314         addx    %g0, %g3, %g4
  315         sll     %g7, 16, %g7
  316         sll     %g4, 16, %g3
  317         srl     %g7, 16, %g7
  318         andcc   %o0, 0x4, %g0
  319         or      %g3, %g7, %g7
  320 1:      be      3f
  321          andcc  %g1, 0xffffff80, %g0
  322         EX(ld   [%o0 + 0x00], %g4, add %g1, 0)
  323         sub     %g1, 4, %g1
  324         EX2(st  %g4, [%o1 + 0x00])
  325         add     %o0, 4, %o0
  326         addcc   %g4, %g7, %g7
  327         add     %o1, 4, %o1
  328         addx    %g0, %g7, %g7
  329         b       3f
  330          andcc  %g1, 0xffffff80, %g0
 331
  332         /* Sun, you just can't beat me, you just can't.  Stop trying,
  333          * give up.  I'm serious, I am going to kick the living shit
  334          * out of you, game over, lights out.
  335          */
              /*
               * __csum_partial_copy_sparc_generic: copy a buffer and compute
               * its checksum in the same pass.
               *
               * In:   %o0 = src, %o1 = dst, %g1 = len, %g7 = initial sum
               *       (note that len and sum arrive in global registers)
               * Out:  %o0 = resulting sum
               *
               * Dispatch:
               *  - src and dst differ in their low two bits -> ccslow;
               *  - (src & 7) != 0 -> cc_dword_align, which rejoins at "3:";
               *  - len >= 128: unrolled 128-byte loop, using the std based
               *    variant at ccdbl when (dst & 4) == 0 and the st based
               *    variant at 5: otherwise;
               *  - (len & 0x70) bytes via a computed jump into cctbl;
               *  - (len & 0xf) bytes via cc_end_cruft.
               *
               * The EXT() lines attach the range handlers 20f, 21f and 22f
               * to the two loops and to the table.
               */
  336         .align  8
  337         .globl  __csum_partial_copy_sparc_generic
  338 __csum_partial_copy_sparc_generic:
  339                                         /* %o0=src, %o1=dest, %g1=len, %g7=sum */
  340         xor     %o0, %o1, %o4           ! get changing bits
  341         andcc   %o4, 3, %g0             ! check for mismatched alignment
  342         bne     ccslow                  ! better this than unaligned/fixups
  343          andcc  %o0, 7, %g0             ! need to align things?
  344         bne     cc_dword_align          ! yes, we check for short lengths there
  345          andcc  %g1, 0xffffff80, %g0    ! can we use unrolled loop?
  346 3:      be      3f                      ! nope, less than one loop remains
  347          andcc  %o1, 4, %g0             ! dest aligned on 4 or 8 byte boundary?
              /*
               * The delay slot of the branch below is the first instruction
               * of the macro at 5:, "ldd [%o0 + 0x00], %o4".  The macro at
               * ccdbl starts with the very same instruction, which is why
               * the branch targets ccdbl + 4 rather than ccdbl.
               */
  348         be      ccdbl + 4               ! 8 byte aligned, kick ass
  349 5:      CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  350         CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  351         CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  352         CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  353 10:     EXT(5b, 10b, 20f)               ! note for exception handling
  354         sub     %g1, 128, %g1           ! detract from length
  355         addx    %g0, %g7, %g7           ! add in last carry bit
  356         andcc   %g1, 0xffffff80, %g0    ! more to csum?
  357         add     %o0, 128, %o0           ! advance src ptr
  358         bne     5b                      ! we did not go negative, continue looping
  359          add    %o1, 128, %o1           ! advance dest ptr
  360 3:      andcc   %g1, 0x70, %o2          ! can use table?
              /*
               * Computed jump into cctbl: the target is 12f (the end of the
               * table) minus 40 bytes of code for every 16 bytes of data,
               * i.e. 12f - (%o2 / 2 + %o2 * 2).  src and dst are advanced
               * past the data first; the table uses negative offsets.
               */
  361 ccmerge:be      ccte                    ! nope, go and check for end cruft
  362          andcc  %g1, 0xf, %o3           ! get low bits of length (clears carry btw)
  363         srl     %o2, 1, %o4             ! begin negative offset computation
  364         sethi   %hi(12f), %o5           ! set up table ptr end
  365         add     %o0, %o2, %o0           ! advance src ptr
  366         sub     %o5, %o4, %o5           ! continue table calculation
  367         sll     %o2, 1, %g2             ! constant multiplies are fun...
  368         sub     %o5, %g2, %o5           ! some more adjustments
  369         jmp     %o5 + %lo(12f)          ! jump into it, duff style, wheee...
  370          add    %o1, %o2, %o1           ! advance dest ptr (carry is clear btw)
  371 cctbl:  CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
  372         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5)
  373         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5)
  374         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5)
  375         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5)
  376         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5)
  377         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5)
  378 12:     EXT(cctbl, 12b, 22f)            ! note for exception table handling
  379         addx    %g0, %g7, %g7
  380         andcc   %o3, 0xf, %g0           ! check for low bits set
  381 ccte:   bne     cc_end_cruft            ! something left, handle it out of band
  382          andcc  %o3, 8, %g0             ! begin checks for that code
  383         retl                            ! return
  384          mov    %g7, %o0                ! give em the computed checksum
              /*
               * 128-byte loop for a destination with (dst & 4) == 0.  When
               * the loop is done it rejoins the common tail at ccmerge.
               */
  385 ccdbl:  CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  386         CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  387         CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  388         CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
  389 11:     EXT(ccdbl, 11b, 21f)            ! note for exception table handling
  390         sub     %g1, 128, %g1           ! detract from length
  391         addx    %g0, %g7, %g7           ! add in last carry bit
  392         andcc   %g1, 0xffffff80, %g0    ! more to csum?
  393         add     %o0, 128, %o0           ! advance src ptr
  394         bne     ccdbl                   ! we did not go negative, continue looping
  395          add    %o1, 128, %o1           ! advance dest ptr
  396         b       ccmerge                 ! finish it off, above
  397          andcc  %g1, 0x70, %o2          ! can use table? (clears carry btw)
 396         b       ccmerge                 ! finish it off, above
 397          andcc  %g1, 0x70, %o2          ! can use table? (clears carry btw)
  398
              /*
               * ccslow: byte-store fallback, used when src and dst differ in
               * their low two bits or when src is odd.
               *
               * In:  %o0 = src, %o1 = dst, %g1 = len, %g7 = incoming sum
               * Out: %o0 = %g7 plus the sum of the copied data
               *
               * Working registers:
               *   %g5  partial sum of the data copied here (starts at 0)
               *   %o5  src & 1 as seen on entry; tested again at the end
               *   %g4  remaining count: halfwords first (len >> 1), then
               *        words once src has been brought to a 4-byte boundary
               *
               * Steps:
               *   1. len == 0 goes straight to 4: with %g5 = 0;
               *   2. if src is odd, copy one byte (added to %g5 as is);
               *   3. if (src & 2), copy one halfword;
               *   4. loop at 5: load a word, store it as four bytes, add it
               *      with carry; afterwards fold %g5 to
               *      (%g5 & 0xffff) + (%g5 >> 16);
               *   5. copy a trailing halfword if (len & 2) and a trailing
               *      byte if (len & 1), the byte being added as byte << 8;
               *   6. at 1: fold %g5 to 16 bits; if %o5 is non-zero swap the
               *      two bytes of that result;
               *   7. at 4: add %g5 to %g7 with end-around carry and return.
               */
  399 ccslow: cmp     %g1, 0
  400         mov     0, %g5
  401         bleu    4f
  402          andcc  %o0, 1, %o5
  403         be,a    1f
  404          srl    %g1, 1, %g4
  405         sub     %g1, 1, %g1
  406         EX(ldub [%o0], %g5, add %g1, 1)
  407         add     %o0, 1, %o0
  408         EX2(stb %g5, [%o1])
  409         srl     %g1, 1, %g4
  410         add     %o1, 1, %o1
  411 1:      cmp     %g4, 0
  412         be,a    3f
  413          andcc  %g1, 1, %g0
  414         andcc   %o0, 2, %g0
  415         be,a    1f
  416          srl    %g4, 1, %g4
  417         EX(lduh [%o0], %o4, add %g1, 0)
  418         sub     %g1, 2, %g1
  419         srl     %o4, 8, %g2
  420         sub     %g4, 1, %g4
  421         EX2(stb %g2, [%o1])
  422         add     %o4, %g5, %g5
  423         EX2(stb %o4, [%o1 + 1])
  424         add     %o0, 2, %o0
  425         srl     %g4, 1, %g4
  426         add     %o1, 2, %o1
  427 1:      cmp     %g4, 0
  428         be,a    2f
  429          andcc  %g1, 2, %g0
  430         EX3(ld  [%o0], %o4)
  431 5:      srl     %o4, 24, %g2
  432         srl     %o4, 16, %g3
  433         EX2(stb %g2, [%o1])
  434         srl     %o4, 8, %g2
  435         EX2(stb %g3, [%o1 + 1])
  436         add     %o0, 4, %o0
  437         EX2(stb %g2, [%o1 + 2])
  438         addcc   %o4, %g5, %g5
  439         EX2(stb %o4, [%o1 + 3])
  440         addx    %g5, %g0, %g5   ! I am now to lazy to optimize this (question it
  441         add     %o1, 4, %o1     ! is worthy). Maybe some day - with the sll/srl
  442         subcc   %g4, 1, %g4     ! tricks
              /*
               * The next word is loaded in the annulled delay slot, i.e.
               * only when the loop is actually going round again.
               */
  443         bne,a   5b
  444          EX3(ld [%o0], %o4)
  445         sll     %g5, 16, %g2
  446         srl     %g5, 16, %g5
  447         srl     %g2, 16, %g2
  448         andcc   %g1, 2, %g0
  449         add     %g2, %g5, %g5
  450 2:      be,a    3f
  451          andcc  %g1, 1, %g0
  452         EX(lduh [%o0], %o4, and %g1, 3)
  453         andcc   %g1, 1, %g0
  454         srl     %o4, 8, %g2
  455         add     %o0, 2, %o0
  456         EX2(stb %g2, [%o1])
  457         add     %g5, %o4, %g5
  458         EX2(stb %o4, [%o1 + 1])
  459         add     %o1, 2, %o1
  460 3:      be,a    1f
  461          sll    %g5, 16, %o4
  462         EX(ldub [%o0], %g2, add %g0, 1)
  463         sll     %g2, 8, %o4
  464         EX2(stb %g2, [%o1])
  465         add     %g5, %o4, %g5
  466         sll     %g5, 16, %o4
  467 1:      addcc   %o4, %g5, %g5
  468         srl     %g5, 16, %o4
  469         addx    %g0, %o4, %g5
  470         orcc    %o5, %g0, %g0
  471         be      4f
  472          srl    %g5, 8, %o4
  473         and     %g5, 0xff, %g2
  474         and     %o4, 0xff, %o4
  475         sll     %g2, 8, %g2
  476         or      %g2, %o4, %g5
  477 4:      addcc   %g7, %g5, %g7
  478         retl
  479          addx   %g0, %g7, %o0
  480 __csum_partial_copy_end:
 481
  482 /* We do these strange calculations for the csum_*_from_user case only, ie.
  483  * we only bother with faults on loads... */
  484
  485 /* o2 = ((g2%20)&3)*8
  486  * o3 = g1 - (g2/20)*32 - o2 */
      /*
       * Range handler for EXT(5b, 10b, 20f), the CSUMCOPY_BIGCHUNK loop.
       * One macro expansion is 20 instructions and moves 32 bytes, so the
       * loop below takes 20 off %g2 and 32 off %g1 until %g2 < 20.  The
       * remainder, masked with 3, is multiplied by 8 (one ldd = 8 bytes)
       * to give %o2; %o3 is what is left of %g1.  Continues at 31.
       * NOTE(review): %g2 is presumably the index of the faulting
       * instruction within the range, set up by the trap code outside this
       * file - confirm.
       */
  487 20:
  488         cmp     %g2, 20
  489         blu,a   1f
  490          and    %g2, 3, %o2
  491         sub     %g1, 32, %g1
  492         b       20b
  493          sub    %g2, 20, %g2
  494 1:
  495         sll     %o2, 3, %o2
  496         b       31f
  497          sub    %g1, %o2, %o3
 498
  499 /* o2 = (!(g2 & 15) ? 0 : (((g2 & 15) + 1) & ~1)*8)
  500  * o3 = g1 - (g2/16)*32 - o2 */
      /*
       * Range handler for EXT(ccdbl, 11b, 21f), the
       * CSUMCOPY_BIGCHUNK_ALIGNED loop.  One expansion is 16 instructions
       * and moves 32 bytes, hence the "& 15", ">> 4" and "<< 5" below.
       * A non-zero position within the expansion is rounded up to an even
       * number and multiplied by 8 to give %o2.  Continues at 31.
       * NOTE(review): as for handler 20, %g2 is presumably supplied by the
       * trap code - confirm.
       */
  501 21:
  502         andcc   %g2, 15, %o3
  503         srl     %g2, 4, %g2
  504         be,a    1f
  505          clr    %o2
  506         add     %o3, 1, %o3
  507         and     %o3, 14, %o3
  508         sll     %o3, 3, %o2
  509 1:
  510         sll     %g2, 5, %g2
  511         sub     %g1, %g2, %o3
  512         b       31f
  513          sub    %o3, %o2, %o3
 514
  515 /* o0 += (g2/10)*16 - 0x70
  516  * o1 += (g2/10)*16 - 0x70
  517  * o2 = (g2 % 10) ? 8 : 0
  518  * o3 += 0x70 - (g2/10)*16 - o2 */
      /*
       * Range handler for EXT(cctbl, 12b, 22f), the CSUMCOPY_LASTCHUNK
       * table.  One expansion is 10 instructions and moves 16 bytes, so the
       * loop below advances %o0 and %o1 by 16 and takes 16 off %o3 for every
       * 10 taken off %g2.  Because the table addresses its data with
       * negative offsets of up to 0x70 from %o0 and %o1, both pointers are
       * then moved back by 0x70 and %o3 is raised by the same amount.
       * %o2 becomes 8 if the remaining position is non-zero and 0 otherwise.
       * Continues at 31.
       * NOTE(review): as for handler 20, %g2 is presumably supplied by the
       * trap code - confirm.
       */
  519 22:
  520         cmp     %g2, 10
  521         blu,a   1f
  522          sub    %o0, 0x70, %o0
  523         add     %o0, 16, %o0
  524         add     %o1, 16, %o1
  525         sub     %o3, 16, %o3
  526         b       22b
  527          sub    %g2, 10, %g2
  528 1:
  529         sub     %o1, 0x70, %o1
  530         add     %o3, 0x70, %o3
  531         clr     %o2
  532         tst     %g2
  533         bne,a   1f
  534          mov    8, %o2
  535 1:
  536         b       31f
  537          sub    %o3, %o2, %o3
  538 96:
      /*
       * 96: fixup target of EX3(), i.e. of the word load in the ccslow loop.
       * Sets %o3 = (%g1 & 3) + %g4 * 4, where %g4 is the loop's remaining
       * word count, then falls into 30.
       */
  539         and     %g1, 3, %g1
  540         sll     %g4, 2, %g4
  541         add     %g1, %g4, %o3
  542 30:
  543 /* %o1 is dst
  544  * %o3 is # bytes to zero out
  545  * %o4 is faulting address
  546  * %o5 is %pc where fault occurred */
  547         clr     %o2
  548 31:
  549 /* %o0 is src
  550  * %o1 is dst
  551  * %o2 is # of bytes to copy from src to dst
  552  * %o3 is # bytes to zero out
  553  * %o4 is faulting address
  554  * %o5 is %pc where fault occurred */
      /*
       * Common fault handler.  After the save the inputs above are visible
       * as %i0-%i5.  Sequence:
       *   - call lookup_fault(%i5, %i7, %i4); the delay slot stores %g7 in
       *     %i4;
       *   - %i5 is set to -EFAULT;
       *   - if lookup_fault did not return 2, skip to 1:;
       *   - if %i2 is non-zero, call memcpy(%i1, %i0, %i2).  A zero return
       *     advances dst (%i1) by %i2; a non-zero return adds %i2 to the
       *     zeroing count (%i3) instead;
       *   - call __bzero(%i1, %i3);
       *   - at 1: store -EFAULT through the pointer found at [%sp + 168],
       *     which is offset 64 from the stack pointer of the frame that
       *     was current before the "save %sp, -104, %sp" above.
       * NOTE(review): the meaning of lookup_fault's return value 2 and of
       * a non-zero memcpy return is defined outside this file - confirm.
       */
  555         save    %sp, -104, %sp
  556         mov     %i5, %o0
  557         mov     %i7, %o1
  558         mov     %i4, %o2
  559         call    lookup_fault
  560          mov    %g7, %i4
  561         cmp     %o0, 2
  562         bne     1f
  563          add    %g0, -EFAULT, %i5
  564         tst     %i2
  565         be      2f
  566          mov    %i0, %o1
  567         mov     %i1, %o0
  568 5:
  569         call    memcpy
  570          mov    %i2, %o2
  571         tst     %o0
  572         bne,a   2f
  573          add    %i3, %i2, %i3
  574         add     %i1, %i2, %i1
  575 2:
  576         mov     %i1, %o0
  577 6:
  578         call    __bzero
  579          mov    %i3, %o1
  580 1:
  581         ld      [%sp + 168], %o2                ! struct_ptr of parent
  582         st      %i5, [%o2]
  583         ret
  584          restore
  585
      /*
       * Exception table entries for the two calls above (labels 5 and 6),
       * both with the value 2 in the fixup slot.
       * NOTE(review): presumably a special marker rather than an address -
       * confirm against the exception table lookup code.
       */
  586         .section __ex_table,#alloc
  587         .align 4
  588         .word 5b,2
  589         .word 6b,2