Import 2.3.41pre2
[davej-history.git] / arch / sparc64 / lib / VIScsumcopy.S
blob3f89eea29e5002ea37d3987cca579121e064bbc2
1 /* $Id: VIScsumcopy.S,v 1.7 2000/01/19 04:06:03 davem Exp $
2  * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous
3  *            copying utilizing the UltraSparc Visual Instruction Set.
4  *
5  * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
6  *
7  * Based on older sparc32/sparc64 checksum.S, which is:
8  *
9  *      Copyright(C) 1995 Linus Torvalds
10  *      Copyright(C) 1995 Miguel de Icaza
11  *      Copyright(C) 1996,1997 David S. Miller
12  *    derived from:
13  *        Linux/Alpha checksum c-code
14  *        Linux/ix86 inline checksum assembly
15  *        RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
16  *        David Mosberger-Tang for optimized reference c-code
17  *        BSD4.4 portable checksum routine
18  */
/* Offset from %sp of the 8-byte scratch slot that END_THE_TRICK2 uses to
 * move the final FP sum into an integer register (std, then ldx).
 * NOTE(review): 0x7ff looks like the SPARC V9 stack bias (2047) -- confirm.
 */
20 #ifdef __sparc_v9__
21 #define STACKOFF        0x7ff+128
22 #else
23 #define STACKOFF        64
24 #endif
/* ASI / FPRS constants.
 * Kernel builds use ASI_P and ASI_BLK_P without defining them here, so they
 * must come from the headers included below; standalone builds define the
 * raw values themselves.
 * ASI_BLK_XOR is the immediate of "wr %g2, ASI_BLK_XOR, %asi" at vis0s
 * (per SPARC V9, wr stores rs1 XOR operand2).  In kernel builds %g2 has
 * already been ORed with ASI_BLK_OR, so the XOR value is 0; in standalone
 * builds the XOR itself turns ASI_P into ASI_BLK_P.
 * The FEF bit was originally spelled FRPS_FEF, unlike FPRS_DU/FPRS_DL.
 * The correctly spelled FPRS_FEF is now the definition and the old spelling
 * is kept as an alias so any existing user still builds.
 */
26 #ifdef __KERNEL__
27 #include <asm/head.h>
28 #include <asm/asi.h>
29 #include <asm/page.h>
30 #include <asm/visasm.h>
31 #define ASI_BLK_XOR     0
32 #define ASI_BLK_XOR1    (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
33 #define ASI_BLK_OR      (ASI_BLK_P & ~ASI_P)
34 #else
35 #define ASI_P           0x80
36 #define ASI_BLK_P       0xf0
37 #define FPRS_FEF        0x04
#define FRPS_FEF        FPRS_FEF        /* historical misspelling, kept as an alias */
38 #define FPRS_DU         0x02
39 #define FPRS_DL         0x01
40 #define ASI_BLK_XOR     (ASI_BLK_P ^ ASI_P)
41 #endif
/* Register aliases.
 * src/dst/len/sum name the registers the code uses as source pointer,
 * destination pointer, remaining byte count and running checksum.
 * x1..x8 are the integer registers that receive fcmpgt32 results inside the
 * *_THE_TRICK macros.  The non-macro code also uses several of the same
 * registers under their raw names (%g1, %g2, %g3, %g5, %g7, %o4, %o5), so
 * this mapping cannot be changed in isolation.
 */
43 #define src             o0
44 #define dst             o1
45 #define len             o2
46 #define sum             o3
47 #define x1              g1
48 #define x2              g2
49 #define x3              o4
50 #define x4              g4
51 #define x5              g5
52 #define x6              g7
53 #define x7              g3
54 #define x8              o5
56 /* Dobrou noc, SunSoft engineers. Spete sladce.
 * (Czech for "Good night, SunSoft engineers. Sleep sweetly.")
57  * This has a couple of tricks in it, and those
58  * tricks are UltraLinux trade secrets :))
59  * Once AGAIN, the SunSoft engineers are caught
60  * asleep at the keyboard :)).
61  * The main loop does about 20 superscalar cycles
62  * per 64bytes checksummed/copied.
63  */
/* LDBLK(O0): load from [%src] into %O0 through whatever address space %asi
 * currently selects (ldda with the %asi form). */
65 #define LDBLK(O0)                                                                               \
66         ldda            [%src] %asi, %O0        /*  Load        Group                   */
/* STBLK: store starting at %f48 to [%dst] using the fixed ASI_BLK_P address
 * space (presumably a 64-byte block store of %f48..%f62 -- confirm against
 * the block-ASI definition). */
68 #define STBLK                                                                                   \
69         stda            %f48, [%dst] ASI_BLK_P  /*  Store                               */
/* ST(fx,off): store the double register %fx to [%dst + off]. */
71 #define ST(fx,off)                                                                              \
72         std             %fx, [%dst + off]       /*  Store                               */
/* SYNC: issue a "membar #Sync" barrier. */
74 #define SYNC                                                                                    \
75         membar          #Sync
/*
 * DO_THE_TRICK: one 64-byte step of the checksum-and-copy loop.
 *
 *   f0..f14    eight double registers holding the running fpadd32 sums
 *   F0..F14    eight double registers holding this step's data; each Fn is
 *              overwritten with Fn + fn, so afterwards the F set carries the
 *              running sums
 *   A0..A14    destinations of the faligndata results.  A14 is completed
 *              first, from its incoming value and F0; A0..A12 are then built
 *              from neighbouring F registers
 *   B14        receives a copy of F14 taken before F14 is summed
 *   LOAD       statement issued first (e.g. LDBLK)
 *   STORE1..8  optional statements interleaved with the arithmetic
 *   BRANCH     statement(s) issued right after %len is decremented; the last
 *              fcmpgt32 follows it, i.e. sits in the delay slot when BRANCH
 *              is a branch
 *   DUMMY1..3  not referenced in the body; they only separate argument groups
 *
 * Each fcmpgt32 compares an old sum with the new one and leaves its result
 * mask in one of x1..x8.  The mask is later turned into (mask + 1) >> 1 by
 * "inc; srl 1" and added to %sum.  The values consumed at the top (x4 as is,
 * x5..x8 after inc/srl) must have been produced before the macro is entered;
 * on exit x4 is already reduced and x5..x8 hold raw masks for the next step.
 * %src and %dst advance by 64; %len drops by 64 and sets the condition codes
 * seen by BRANCH.
 */
78 #define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DUMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...)  \
79         LOAD                                    /*  Load        Group                   */;     \
80         faligndata      %A14, %F0, %A14         /*  FPA         Group                   */;     \
81         inc             %x5                     /*  IEU0                                */;     \
82         STORE1                                  /*  Store (optional)                    */;     \
83         faligndata      %F0, %F2, %A0           /*  FPA         Group                   */;     \
84         srl             %x5, 1, %x5             /*  IEU0                                */;     \
85         add             %sum, %x4, %sum         /*  IEU1                                */;     \
86         fpadd32         %F0, %f0, %F0           /*  FPA         Group                   */;     \
87         inc             %x6                     /*  IEU0                                */;     \
88         STORE2                                  /*  Store (optional)                    */;     \
89         faligndata      %F2, %F4, %A2           /*  FPA         Group                   */;     \
90         srl             %x6, 1, %x6             /*  IEU0                                */;     \
91         add             %sum, %x5, %sum         /*  IEU1                                */;     \
92         fpadd32         %F2, %f2, %F2           /*  FPA         Group                   */;     \
93         add             %src, 64, %src          /*  IEU0                                */;     \
94         add             %dst, 64, %dst          /*  IEU1                                */;     \
95         fcmpgt32        %f0, %F0, %x1           /*  FPM         Group                   */;     \
96         inc             %x7                     /*  IEU0                                */;     \
97         STORE3                                  /*  Store (optional)                    */;     \
98         faligndata      %F4, %F6, %A4           /*  FPA                                 */;     \
99         srl             %x7, 1, %x7             /*  IEU0        Group                   */;     \
100         add             %sum, %x6, %sum         /*  IEU1                                */;     \
101         fpadd32         %F4, %f4, %F4           /*  FPA                                 */;     \
102         fcmpgt32        %f2, %F2, %x2           /*  FPM         Group                   */;     \
103         inc             %x8                     /*  IEU0                                */;     \
104         STORE4                                  /*  Store (optional)                    */;     \
105         faligndata      %F6, %F8, %A6           /*  FPA                                 */;     \
106         srl             %x8, 1, %x8             /*  IEU0        Group                   */;     \
107         add             %sum, %x7, %sum         /*  IEU1                                */;     \
108         fpadd32         %F6, %f6, %F6           /*  FPA                                 */;     \
109         fcmpgt32        %f4, %F4, %x3           /*  FPM         Group                   */;     \
110         inc             %x1                     /*  IEU0                                */;     \
111         STORE5                                  /*  Store (optional)                    */;     \
112         faligndata      %F8, %F10, %A8          /*  FPA                                 */;     \
113         srl             %x1, 1, %x1             /*  IEU0        Group                   */;     \
114         add             %sum, %x8, %sum         /*  IEU1                                */;     \
115         fpadd32         %F8, %f8, %F8           /*  FPA                                 */;     \
116         fcmpgt32        %f6, %F6, %x4           /*  FPM         Group                   */;     \
117         inc             %x2                     /*  IEU0                                */;     \
118         STORE6                                  /*  Store (optional)                    */;     \
119         faligndata      %F10, %F12, %A10        /*  FPA                                 */;     \
120         srl             %x2, 1, %x2             /*  IEU0        Group                   */;     \
121         add             %sum, %x1, %sum         /*  IEU1                                */;     \
122         fpadd32         %F10, %f10, %F10        /*  FPA                                 */;     \
123         fcmpgt32        %f8, %F8, %x5           /*  FPM         Group                   */;     \
124         inc             %x3                     /*  IEU0                                */;     \
125         STORE7                                  /*  Store (optional)                    */;     \
126         faligndata      %F12, %F14, %A12        /*  FPA                                 */;     \
127         srl             %x3, 1, %x3             /*  IEU0        Group                   */;     \
128         add             %sum, %x2, %sum         /*  IEU1                                */;     \
129         fpadd32         %F12, %f12, %F12        /*  FPA                                 */;     \
130         fcmpgt32        %f10, %F10, %x6         /*  FPM         Group                   */;     \
131         inc             %x4                     /*  IEU0                                */;     \
132         STORE8                                  /*  Store (optional)                    */;     \
133         fmovd           %F14, %B14              /*  FPA                                 */;     \
134         srl             %x4, 1, %x4             /*  IEU0        Group                   */;     \
135         add             %sum, %x3, %sum         /*  IEU1                                */;     \
136         fpadd32         %F14, %f14, %F14        /*  FPA                                 */;     \
137         fcmpgt32        %f12, %F12, %x7         /*  FPM         Group                   */;     \
138         subcc           %len, 64, %len          /*  IEU1                                */;     \
139         BRANCH                                  /*  CTI                                 */;     \
140         fcmpgt32        %f14, %F14, %x8         /*  FPM         Group                   */;     \
/*
 * END_THE_TRICK: first half of the reduction that follows the last
 * DO_THE_TRICK step.
 *
 *   f0..f14   the eight running-sum registers
 *   FA, FB    FA is copied to FB in the delay slot of the closing branch
 *   S0..S3    pairwise sums: S0 = f2+f0, S1 = f6+f4, S2 = f10+f8, S3 = f14+f12
 *   T0, T1    T0 = S0+S1, T1 = S2+S3
 *   U0        U0 = T0+T1
 *   fz        scratch register cleared with fzero and used as the first
 *             operand of the "fz > reg" comparisons
 *
 * The x4..x8 values left pending by the loop are folded into %sum first.
 * Masks from "old > new" comparisons are then added to %sum as
 * (mask + 1) >> 1.  The "fz > reg" masks (x7, x8, x1, x2) and the last
 * "old > new" mask (x6) are left pending when the macro ends.
 * Control leaves through "ba ett"; ett is not defined inside this macro.
 */
142 #define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
143         inc             %x5                     /*  IEU0        Group                   */;     \
144         fpadd32         %f2, %f0, %S0           /*  FPA                                 */;     \
145         srl             %x5, 1, %x5             /*  IEU0        Group                   */;     \
146         add             %sum, %x4, %sum         /*  IEU1                                */;     \
147         fpadd32         %f6, %f4, %S1           /*  FPA                                 */;     \
148         inc             %x6                     /*  IEU0        Group                   */;     \
149         add             %sum, %x5, %sum         /*  IEU1                                */;     \
150         fcmpgt32        %f0, %S0, %x1           /*  FPM         Group                   */;     \
151         srl             %x6, 1, %x6             /*  IEU0                                */;     \
152         inc             %x7                     /*  IEU1                                */;     \
153         fpadd32         %f10, %f8, %S2          /*  FPA                                 */;     \
154         fcmpgt32        %f4, %S1, %x2           /*  FPM         Group                   */;     \
155         srl             %x7, 1, %x7             /*  IEU0                                */;     \
156         add             %sum, %x6, %sum         /*  IEU1                                */;     \
157         fpadd32         %f14, %f12, %S3         /*  FPA                                 */;     \
158         inc             %x8                     /*  IEU0        Group                   */;     \
159         add             %sum, %x7, %sum         /*  IEU1                                */;     \
160         fzero           %fz                     /*  FPA                                 */;     \
161         fcmpgt32        %f8, %S2, %x3           /*  FPM         Group                   */;     \
162         srl             %x8, 1, %x8             /*  IEU0                                */;     \
163         inc             %x1                     /*  IEU1                                */;     \
164         fpadd32         %S0, %S1, %T0           /*  FPA                                 */;     \
165         fcmpgt32        %f12, %S3, %x4          /*  FPM         Group                   */;     \
166         srl             %x1, 1, %x1             /*  IEU0                                */;     \
167         add             %sum, %x8, %sum         /*  IEU1                                */;     \
168         fpadd32         %S2, %S3, %T1           /*  FPA                                 */;     \
169         inc             %x2                     /*  IEU0        Group                   */;     \
170         add             %sum, %x1, %sum         /*  IEU1                                */;     \
171         fcmpgt32        %S0, %T0, %x5           /*  FPM         Group                   */;     \
172         srl             %x2, 1, %x2             /*  IEU0                                */;     \
173         inc             %x3                     /*  IEU1                                */;     \
174         fcmpgt32        %S2, %T1, %x6           /*  FPM         Group                   */;     \
175         srl             %x3, 1, %x3             /*  IEU0                                */;     \
176         add             %sum, %x2, %sum         /*  IEU1                                */;     \
177         inc             %x4                     /*  IEU0        Group                   */;     \
178         add             %sum, %x3, %sum         /*  IEU1                                */;     \
179         fcmpgt32        %fz, %f2, %x7           /*  FPM         Group                   */;     \
180         srl             %x4, 1, %x4             /*  IEU0                                */;     \
181         inc             %x5                     /*  IEU1                                */;     \
182         fpadd32         %T0, %T1, %U0           /*  FPA                                 */;     \
183         fcmpgt32        %fz, %f6, %x8           /*  FPM         Group                   */;     \
184         srl             %x5, 1, %x5             /*  IEU0                                */;     \
185         add             %sum, %x4, %sum         /*  IEU1                                */;     \
186         inc             %x6                     /*  IEU0        Group                   */;     \
187         add             %sum, %x5, %sum         /*  IEU1                                */;     \
188         fcmpgt32        %fz, %f10, %x1          /*  FPM         Group                   */;     \
189         srl             %x6, 1, %x6             /*  IEU0                                */;     \
190         inc             %x7                     /*  IEU1                                */;     \
191         fcmpgt32        %fz, %f14, %x2          /*  FPM         Group                   */;     \
192         ba,pt           %xcc, ett               /*  CTI                                 */;     \
193          fmovd          %FA, %FB                /*  FPA                                 */;     \
/* END_THE_TRICK1: END_THE_TRICK with the temporaries fixed to
 * S0..S3 = %f48..%f54, T0/T1 = %f56/%f58, U0 = %f60 and fz = %f62. */
195 #define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB)                                        \
196         END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)
/*
 * END_THE_TRICK2: second half of the reduction; leaves the result in %sum.
 *
 *   U0, U1    added to form V0
 *   S1, S3, T1, U1, V0   each compared as "fz > reg"
 *   T0/U0 and U0/V0      compared as "old > new"
 *   fz        zero operand for the "fz > reg" comparisons
 *   S0, S2    accepted for symmetry; not referenced in the body
 *
 * Every mask is reduced to (mask + 1) >> 1.  Masks from "old > new"
 * comparisons are added to %sum; masks from "fz > reg" comparisons are
 * subtracted, including the x7, x8, x1 and x2 values the macro expects to
 * find pending on entry.
 * V0 is spilled to [%sp + STACKOFF] and read back with ldx into x8 so it can
 * be added to %sum with integer arithmetic.  If that add carries out (bcs on
 * %xcc), 1 is added in the annulled delay slot.  Local label 33 marks the end.
 */
198 #define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz)                                           \
199         fpadd32         %U0, %U1, %V0           /*  FPA         Group                   */;     \
200         srl             %x7, 1, %x7             /*  IEU0                                */;     \
201         add             %sum, %x6, %sum         /*  IEU1                                */;     \
202         std             %V0, [%sp + STACKOFF]   /*  Store       Group                   */;     \
203         inc             %x8                     /*  IEU0                                */;     \
204         sub             %sum, %x7, %sum         /*  IEU1                                */;     \
205         fcmpgt32        %fz, %S1, %x3           /*  FPM         Group                   */;     \
206         srl             %x8, 1, %x8             /*  IEU0                                */;     \
207         inc             %x1                     /*  IEU1                                */;     \
208         fcmpgt32        %fz, %S3, %x4           /*  FPM         Group                   */;     \
209         srl             %x1, 1, %x1             /*  IEU0                                */;     \
210         sub             %sum, %x8, %sum         /*  IEU1                                */;     \
211         ldx             [%sp + STACKOFF], %x8   /*  Load        Group                   */;     \
212         inc             %x2                     /*  IEU0                                */;     \
213         sub             %sum, %x1, %sum         /*  IEU1                                */;     \
214         fcmpgt32        %fz, %T1, %x5           /*  FPM         Group                   */;     \
215         srl             %x2, 1, %x2             /*  IEU0                                */;     \
216         inc             %x3                     /*  IEU1                                */;     \
217         fcmpgt32        %T0, %U0, %x6           /*  FPM         Group                   */;     \
218         srl             %x3, 1, %x3             /*  IEU0                                */;     \
219         sub             %sum, %x2, %sum         /*  IEU1                                */;     \
220         inc             %x4                     /*  IEU0        Group                   */;     \
221         sub             %sum, %x3, %sum         /*  IEU1                                */;     \
222         fcmpgt32        %fz, %U1, %x7           /*  FPM         Group                   */;     \
223         srl             %x4, 1, %x4             /*  IEU0                                */;     \
224         inc             %x5                     /*  IEU1                                */;     \
225         fcmpgt32        %U0, %V0, %x1           /*  FPM         Group                   */;     \
226         srl             %x5, 1, %x5             /*  IEU0                                */;     \
227         sub             %sum, %x4, %sum         /*  IEU1                                */;     \
228         fcmpgt32        %fz, %V0, %x2           /*  FPM         Group                   */;     \
229         inc             %x6                     /*  IEU0                                */;     \
230         sub             %sum, %x5, %sum         /*  IEU1                                */;     \
231         srl             %x6, 1, %x6             /*  IEU0        Group                   */;     \
232         inc             %x7                     /*  IEU1                                */;     \
233         srl             %x7, 1, %x7             /*  IEU0        Group                   */;     \
234         add             %sum, %x6, %sum         /*  IEU1                                */;     \
235         inc             %x1                     /*  IEU0        Group                   */;     \
236         sub             %sum, %x7, %sum         /*  IEU1                                */;     \
237         srl             %x1, 1, %x1             /*  IEU0        Group                   */;     \
238         inc             %x2                     /*  IEU1                                */;     \
239         srl             %x2, 1, %x2             /*  IEU0        Group                   */;     \
240         add             %sum, %x1, %sum         /*  IEU1                                */;     \
241         sub             %sum, %x2, %sum         /*  IEU0        Group                   */;     \
242         addcc           %sum, %x8, %sum         /*  IEU         Group                   */;     \
243         bcs,a,pn        %xcc, 33f               /*  CTI                                 */;     \
244          add            %sum, 1, %sum           /*  IEU0                                */;     \
245 33:                                             /*  That's it                           */;
247         .text
248         .globl          csum_partial_copy_vis
249         .align          32
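250 /* %asi should be ASI_P for csum_partial_copy and ASI_AIUS for csum_partial_copy_from_user. */
251 /* This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256,
 * i.e. src and dst have the same offset within a 4-byte word, both are at
 * least 2-byte aligned, and there are at least 256 bytes to process. */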
/*
 * Entry: advance %dst to an 8-byte boundary before the VIS code starts.
 *  - %dst already 8-byte aligned: branch straight to 4: below.
 *  - %dst & 2: copy one halfword and add it, shifted left by 16, to %sum.
 *  - %dst & 4 (tested after the halfword step): copy one word and add it
 *    to %sum.
 * A carry out of either 32-bit add is folded back in by the annulled
 * delay-slot add (1 << 16 for the halfword, 1 for the word).
 * Every path arrives at 4: with %o4 = %dst & 0x38.
 */
252 csum_partial_copy_vis:
253         andcc           %dst, 7, %g0            /*  IEU1        Group                   */
254         be,pt           %icc, 4f                /*  CTI                                 */
255          and            %dst, 0x38, %o4         /*  IEU0                                */
256         mov             1, %g5                  /*  IEU0        Group                   */
257         andcc           %dst, 2, %g0            /*  IEU1                                */
258         be,pt           %icc, 1f                /*  CTI                                 */
259          and            %dst, 4, %g7            /*  IEU0        Group                   */
260         lduha           [%src] %asi, %g2        /*  Load                                */
261         sub             %len, 2, %len           /*  IEU0        Group                   */
262         add             %dst, 2, %dst           /*  IEU1                                */
263         andcc           %dst, 4, %g7            /*  IEU1        Group                   */
264         sll             %g5, 16, %g5            /*  IEU0                                */
265         sth             %g2, [%dst - 2]         /*  Store       Group                   */
266         sll             %g2, 16, %g2            /*  IEU0                                */
267         add             %src, 2, %src           /*  IEU1                                */
268         addcc           %g2, %sum, %sum         /*  IEU1        Group                   */
269         bcs,a,pn        %icc, 1f                /*  CTI                                 */
270          add            %sum, %g5, %sum         /*  IEU0                                */
271 1:      lduwa           [%src] %asi, %g2        /*  Load                                */
272         brz,a,pn        %g7, 4f                 /*  CTI+IEU1    Group                   */
273          and            %dst, 0x38, %o4         /*  IEU0                                */
274         add             %dst, 4, %dst           /*  IEU0        Group                   */
275         sub             %len, 4, %len           /*  IEU1                                */
276         addcc           %g2, %sum, %sum         /*  IEU1        Group                   */
277         bcs,a,pn        %icc, 1f                /*  CTI                                 */
278          add            %sum, 1, %sum           /*  IEU0                                */
279 1:      and             %dst, 0x38, %o4         /*  IEU0        Group                   */
280         stw             %g2, [%dst - 4]         /*  Store                               */
281         add             %src, 4, %src           /*  IEU1                                */
/* Join point for both "4f" branches above.  The label has to sit in front of
 * VISEntry: without it those branches bind to the next 4: further down (the
 * vis0s dispatch) and skip VISEntry, the fzero/alignaddr setup and the read
 * of %asi into %g2.
 * NOTE(review): VISEntry comes from <asm/visasm.h>; presumably it prepares
 * the FPU for use by kernel code -- confirm. */
282 4:
283 #ifdef __KERNEL__
284         VISEntry
285 #endif
/* Remember the incoming %src in %g7, clear the FP accumulator %f48 and let
 * alignaddr round %src (per the VIS definition it also latches the low
 * address bits for the faligndata instructions below).  %g7 then holds how
 * far %src moved back.  If that is non-zero, the word now at [%src] is
 * subtracted from %sum with a borrow fix-up -- presumably because the 8-byte
 * loads that follow start at the rounded address and would otherwise count
 * it.  %g1 = 0x40 on both paths (the delay slot is not annulled). */
286         mov             %src, %g7               /*  IEU1        Group                   */
287         fzero           %f48                    /*  FPA                                 */
288         alignaddr       %src, %g0, %src         /*  Single      Group                   */
289         subcc           %g7, %src, %g7          /*  IEU1        Group                   */
290         be,pt           %xcc, 1f                /*  CTI                                 */
291          mov            0x40, %g1               /*  IEU0                                */
292         lduwa           [%src] %asi, %g2        /*  Load        Group                   */
293         subcc           %sum, %g2, %sum         /*  IEU1        Group+load stall        */
294         bcs,a,pn        %icc, 1f                /*  CTI                                 */
295          sub            %sum, 1, %sum           /*  IEU0                                */
/* Keep only the low 32 bits of %sum and clear %g5.  %o4 still holds
 * %dst & 0x38 from the entry code: zero means there is nothing to pre-copy,
 * so go to 3:.  %g1 becomes 0x40 - %o4 in the delay slot. */
296 1:      srl             %sum, 0, %sum           /*  IEU0        Group                   */
297         clr             %g5                     /*  IEU1                                */
298         brz,pn          %o4, 3f                 /*  CTI+IEU1    Group                   */
299          sub            %g1, %o4, %g1           /*  IEU0                                */
/* Preload the first two source doublewords into %f0/%f2 and reuse %o4 as a
 * compare-mask register.  The 8-byte step below runs only when %dst & 8. */
300         ldda            [%src] %asi, %f0        /*  Load                                */
301         clr             %o4                     /*  IEU0        Group                   */
302         andcc           %dst, 8, %g0            /*  IEU1                                */
303         be,pn           %icc, 1f                /*  CTI                                 */
304          ldda           [%src + 8] %asi, %f2    /*  Load        Group                   */
/* 8-byte step: add %f0 into the running sum (%f48 -> %f50 -> %f48), store
 * the realigned doubleword at [%dst - 8], and leave the compare mask in %o4. */
305         add             %src, 8, %src           /*  IEU0                                */
306         sub             %len, 8, %len           /*  IEU1                                */
307         fpadd32         %f0, %f48, %f50         /*  FPA                                 */
308         addcc           %dst, 8, %dst           /*  IEU1        Group                   */
309         faligndata      %f0, %f2, %f16          /*  FPA                                 */
310         fcmpgt32        %f48, %f50, %o4         /*  FPM         Group                   */
311         fmovd           %f2, %f0                /*  FPA         Group                   */
312         ldda            [%src + 8] %asi, %f2    /*  Load                                */
313         std             %f16, [%dst - 8]        /*  Store                               */
314         fmovd           %f50, %f48              /*  FPA                                 */
/* 16-byte step, taken when %g1 & 0x10.  %g1 is narrowed to its 0x20 bit in
 * the delay slot either way.  Two doublewords are summed and stored.  The
 * earlier %o4 mask is reduced into %o5 and added to %sum; new masks are left
 * in %g5 and %o4. */
315 1:      andcc           %g1, 0x10, %g0          /*  IEU1        Group                   */
316         be,pn           %icc, 1f                /*  CTI                                 */
317          and            %g1, 0x20, %g1          /*  IEU0                                */
318         fpadd32         %f0, %f48, %f50         /*  FPA                                 */
319         ldda            [%src + 16] %asi, %f4   /*  Load        Group                   */
320         add             %src, 16, %src          /*  IEU0                                */
321         add             %dst, 16, %dst          /*  IEU1                                */
322         faligndata      %f0, %f2, %f16          /*  FPA                                 */
323         fcmpgt32        %f48, %f50, %g5         /*  FPM         Group                   */
324         sub             %len, 16, %len          /*  IEU0                                */
325         inc             %o4                     /*  IEU1                                */
326         std             %f16, [%dst - 16]       /*  Store       Group                   */
327         fpadd32         %f2, %f50, %f48         /*  FPA                                 */
328         srl             %o4, 1, %o5             /*  IEU0                                */
329         faligndata      %f2, %f4, %f18          /*  FPA         Group                   */
330         std             %f18, [%dst - 8]        /*  Store                               */
331         fcmpgt32        %f50, %f48, %o4         /*  FPM         Group                   */
332         add             %o5, %sum, %sum         /*  IEU0                                */
333         ldda            [%src + 8] %asi, %f2    /*  Load                                */
334         fmovd           %f4, %f0                /*  FPA                                 */
/* 32-byte step, taken when the 0x20 bit survived in %g1.  Otherwise branch
 * to 4: and read %asi into %g2 in the annulled delay slot, which executes
 * only when the branch is taken.  Four doublewords are summed and stored;
 * masks from %g5, %o4, %o5 and %g3 are reduced and added to %sum, and fresh
 * masks are left in %g5 and %o4 for the code at 4:. */
335 1:      brz,a,pn        %g1, 4f                 /*  CTI+IEU1    Group                   */
336          rd             %asi, %g2               /*  LSU         Group + 4 bubbles       */
337         inc             %g5                     /*  IEU0                                */
338         fpadd32         %f0, %f48, %f50         /*  FPA                                 */
339         ldda            [%src + 16] %asi, %f4   /*  Load        Group                   */
340         srl             %g5, 1, %g5             /*  IEU0                                */
341         add             %dst, 32, %dst          /*  IEU1                                */
342         faligndata      %f0, %f2, %f16          /*  FPA                                 */
343         fcmpgt32        %f48, %f50, %o5         /*  FPM         Group                   */
344         inc             %o4                     /*  IEU0                                */
345         ldda            [%src + 24] %asi, %f6   /*  Load                                */
346         srl             %o4, 1, %o4             /*  IEU0        Group                   */
347         add             %g5, %sum, %sum         /*  IEU1                                */
348         ldda            [%src + 32] %asi, %f8   /*  Load                                */
349         fpadd32         %f2, %f50, %f48         /*  FPA                                 */
350         faligndata      %f2, %f4, %f18          /*  FPA         Group                   */
351         sub             %len, 32, %len          /*  IEU0                                */
352         std             %f16, [%dst - 32]       /*  Store                               */
353         fcmpgt32        %f50, %f48, %g3         /*  FPM         Group                   */
354         inc             %o5                     /*  IEU0                                */
355         add             %o4, %sum, %sum         /*  IEU1                                */
356         fpadd32         %f4, %f48, %f50         /*  FPA                                 */
357         faligndata      %f4, %f6, %f20          /*  FPA         Group                   */
358         srl             %o5, 1, %o5             /*  IEU0                                */
359         fcmpgt32        %f48, %f50, %g5         /*  FPM         Group                   */
360         add             %o5, %sum, %sum         /*  IEU0                                */
361         std             %f18, [%dst - 24]       /*  Store                               */
362         fpadd32         %f6, %f50, %f48         /*  FPA                                 */
363         inc             %g3                     /*  IEU0        Group                   */
364         std             %f20, [%dst - 16]       /*  Store                               */
365         add             %src, 32, %src          /*  IEU1                                */
366         faligndata      %f6, %f8, %f22          /*  FPA                                 */
367         fcmpgt32        %f50, %f48, %o4         /*  FPM         Group                   */
368         srl             %g3, 1, %g3             /*  IEU0                                */
369         std             %f22, [%dst - 8]        /*  Store                               */      
370         add             %g3, %sum, %sum         /*  IEU0        Group                   */
/* Dispatch to the main loop.  %g2 holds the current %asi, read at 3: or in
 * the delay slot of the branch to 4:; kernel builds OR ASI_BLK_OR into it.
 * The pending masks in %g5 and %o4 are reduced and added to %sum, and %len
 * is lowered by 0xc0.  The jump target is vis0s + ((%src & 0x38) << 8).
 * Kernel builds form the vis0s address with sethi/%lo; standalone builds use
 * the PC read at 4: plus the assembly-time distance (vis0s - 4b).
 * NOTE(review): vis0s is aligned to 2048, so this presumably selects one of
 * eight code variants spaced 2048 bytes apart, one per 8-byte source offset;
 * only vis0s itself is visible in this part of the file -- confirm.
 * %f32 is zeroed in the delay slot of the jump. */
371 3:      rd              %asi, %g2               /*  LSU         Group + 4 bubbles       */
372 #ifdef __KERNEL__
373 4:      sethi           %hi(vis0s), %g7         /*  IEU0        Group                   */
374         or              %g2, ASI_BLK_OR, %g2    /*  IEU1                                */
375 #else
376 4:      rd              %pc, %g7                /*  LSU         Group + 4 bubbles       */
377 #endif
378         inc             %g5                     /*  IEU0        Group                   */
379         and             %src, 0x38, %g3         /*  IEU1                                */      
380         membar          #StoreLoad              /*  LSU         Group                   */
381         srl             %g5, 1, %g5             /*  IEU0                                */
382         inc             %o4                     /*  IEU1                                */
383         sll             %g3, 8, %g3             /*  IEU0        Group                   */
384         sub             %len, 0xc0, %len        /*  IEU1                                */
385         addcc           %g5, %sum, %sum         /*  IEU1        Group                   */
386         srl             %o4, 1, %o4             /*  IEU0                                */
387         add             %g7, %g3, %g7           /*  IEU0        Group                   */
388         add             %o4, %sum, %sum         /*  IEU1                                */
389 #ifdef __KERNEL__
390         jmpl            %g7 + %lo(vis0s), %g0   /*  CTI+IEU1    Group                   */
391 #else
392         jmpl            %g7 + (vis0s - 4b), %g0 /*  CTI+IEU1    Group                   */
393 #endif
394          fzero          %f32                    /*  FPA                                 */
396         .align          2048
397 vis0s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group                   */
398         add             %src, 128, %src         /*  IEU0        Group                   */
399         ldda            [%src-128] %asi, %f0    /*  Load        Group                   */
400         ldda            [%src-64] %asi, %f16    /*  Load        Group                   */
401         fmovd           %f48, %f62              /*  FPA         Group   f0 available    */
402         faligndata      %f0, %f2, %f48          /*  FPA         Group   f2 available    */
403         fcmpgt32        %f32, %f2, %x1          /*  FPM         Group   f4 available    */
404         fpadd32         %f0, %f62, %f0          /*  FPA                                 */
405         fcmpgt32        %f32, %f4, %x2          /*  FPM         Group   f6 available    */
406         faligndata      %f2, %f4, %f50          /*  FPA                                 */
407         fcmpgt32        %f62, %f0, %x3          /*  FPM         Group   f8 available    */
408         faligndata      %f4, %f6, %f52          /*  FPA                                 */
409         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group   f10 available   */
410         inc             %x1                     /*  IEU0                                */
411         faligndata      %f6, %f8, %f54          /*  FPA                                 */
412         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group   f12 available   */
413         srl             %x1, 1, %x1             /*  IEU0                                */
414         inc             %x2                     /*  IEU1                                */
415         faligndata      %f8, %f10, %f56         /*  FPA                                 */
416         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group   f14 available   */
417         srl             %x2, 1, %x2             /*  IEU0                                */
418         add             %sum, %x1, %sum         /*  IEU1                                */
419         faligndata      %f10, %f12, %f58        /*  FPA                                 */
420         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group                   */
421         inc             %x3                     /*  IEU0                                */
422         add             %sum, %x2, %sum         /*  IEU1                                */
423         faligndata      %f12, %f14, %f60        /*  FPA                                 */
424         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group                   */
425         srl             %x3, 1, %x3             /*  IEU0                                */
426         inc             %x4                     /*  IEU1                                */
427         fmovd           %f14, %f62              /*  FPA                                 */
428         srl             %x4, 1, %x4             /*  IEU0        Group                   */
429         add             %sum, %x3, %sum         /*  IEU1                                */
430 vis0:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
431                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,                                                           
432                         ,LDBLK(f32),    STBLK,,,,,,,,                                                                   
433                         ,bcs,pn %icc, vis0e1)
434         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
435                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,                                                           
436                         ,LDBLK(f0),     STBLK,,,,,,,,                                                                   
437                         ,bcs,pn %icc, vis0e2)
438         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
439                         ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
440                         ,LDBLK(f16),    STBLK,,,,,,,,
441                         ,bcc,pt %icc, vis0)
442 vis0e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
443                         ,f48,f50,f52,f54,f56,f58,f60,f62,f32,
444                         ,SYNC,          STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
445                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
446 vis0e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
447                         ,f48,f50,f52,f54,f56,f58,f60,f62,f0,
448                         ,SYNC,          STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
449                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
450 vis0e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
451                         ,f48,f50,f52,f54,f56,f58,f60,f62,f16,
452                         ,SYNC,          STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
453                         ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
454         .align          2048
455 vis1s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group                   */
456         add             %src, 128 - 8, %src     /*  IEU0        Group                   */
457         ldda            [%src-128] %asi, %f0    /*  Load        Group                   */
458         ldda            [%src-64] %asi, %f16    /*  Load        Group                   */
459         fmovd           %f0, %f58               /*  FPA         Group                   */
460         fmovd           %f48, %f0               /*  FPA         Group                   */
461         fcmpgt32        %f32, %f2, %x2          /*  FPM         Group                   */
462         faligndata      %f2, %f4, %f48          /*  FPA                                 */
463         fcmpgt32        %f32, %f4, %x3          /*  FPM         Group                   */
464         faligndata      %f4, %f6, %f50          /*  FPA                                 */
465         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group                   */
466         faligndata      %f6, %f8, %f52          /*  FPA                                 */
467         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group                   */
468         inc             %x2                     /*  IEU1                                */
469         faligndata      %f8, %f10, %f54         /*  FPA                                 */
470         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group                   */
471         srl             %x2, 1, %x2             /*  IEU0                                */
472         faligndata      %f10, %f12, %f56        /*  FPA                                 */
473         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group                   */
474         inc             %x3                     /*  IEU0                                */
475         add             %sum, %x2, %sum         /*  IEU1                                */
476         faligndata      %f12, %f14, %f58        /*  FPA                                 */
477         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group                   */
478         srl             %x3, 1, %x3             /*  IEU0                                */
479         inc             %x4                     /*  IEU1                                */
480         fmovd           %f14, %f60              /*  FPA                                 */
481         srl             %x4, 1, %x4             /*  IEU0        Group                   */
482         add             %sum, %x3, %sum         /*  IEU1                                */
483 vis1:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
484                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
485                         ,LDBLK(f32),    ,STBLK,,,,,,,
486                         ,bcs,pn %icc, vis1e1)
487         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
488                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
489                         ,LDBLK(f0),     ,STBLK,,,,,,,
490                         ,bcs,pn %icc, vis1e2)
491         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
492                         ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
493                         ,LDBLK(f16),    ,STBLK,,,,,,,
494                         ,bcc,pt %icc, vis1)
495 vis1e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
496                         ,f62,f48,f50,f52,f54,f56,f58,f60,f32,
497                         ,SYNC,          ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
498                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
499 vis1e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
500                         ,f62,f48,f50,f52,f54,f56,f58,f60,f0,
501                         ,SYNC,          ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
502                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
503 vis1e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
504                         ,f62,f48,f50,f52,f54,f56,f58,f60,f16,
505                         ,SYNC,          ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
506                         ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
507         .align          2048
508 vis2s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group                   */
509         add             %src, 128 - 16, %src    /*  IEU0        Group                   */
510         ldda            [%src-128] %asi, %f0    /*  Load        Group                   */
511         ldda            [%src-64] %asi, %f16    /*  Load        Group                   */
512         fmovd           %f0, %f56               /*  FPA         Group                   */
513         fmovd           %f48, %f0               /*  FPA         Group                   */      
514         sub             %dst, 64, %dst          /*  IEU0                                */
515         fpsub32         %f2, %f2, %f2           /*  FPA         Group                   */
516         fcmpgt32        %f32, %f4, %x3          /*  FPM         Group                   */
517         faligndata      %f4, %f6, %f48          /*  FPA                                 */
518         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group                   */
519         faligndata      %f6, %f8, %f50          /*  FPA                                 */
520         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group                   */
521         faligndata      %f8, %f10, %f52         /*  FPA                                 */
522         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group                   */
523         faligndata      %f10, %f12, %f54        /*  FPA                                 */
524         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group                   */
525         inc             %x3                     /*  IEU0                                */
526         faligndata      %f12, %f14, %f56        /*  FPA                                 */
527         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group                   */
528         srl             %x3, 1, %x3             /*  IEU0                                */
529         inc             %x4                     /*  IEU1                                */
530         fmovd           %f14, %f58              /*  FPA                                 */
531         srl             %x4, 1, %x4             /*  IEU0        Group                   */
532         add             %sum, %x3, %sum         /*  IEU1                                */
533 vis2:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
534                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
535                         ,LDBLK(f32),    ,,STBLK,,,,,,
536                         ,bcs,pn %icc, vis2e1)
537         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
538                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
539                         ,LDBLK(f0),     ,,STBLK,,,,,,
540                         ,bcs,pn %icc, vis2e2)
541         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
542                         ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
543                         ,LDBLK(f16),    ,,STBLK,,,,,,
544                         ,bcc,pt %icc, vis2)
545 vis2e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
546                         ,f60,f62,f48,f50,f52,f54,f56,f58,f32,
547                         ,SYNC,          ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
548                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
549 vis2e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
550                         ,f60,f62,f48,f50,f52,f54,f56,f58,f0,
551                         ,SYNC,          ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
552                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
553 vis2e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
554                         ,f60,f62,f48,f50,f52,f54,f56,f58,f16,
555                         ,SYNC,          ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
556                         ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
557         .align          2048
558 vis3s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group                   */
559         add             %src, 128 - 24, %src    /*  IEU0        Group                   */
560         ldda            [%src-128] %asi, %f0    /*  Load        Group                   */
561         ldda            [%src-64] %asi, %f16    /*  Load        Group                   */
562         fmovd           %f0, %f54               /*  FPA         Group                   */
563         fmovd           %f48, %f0               /*  FPA         Group                   */
564         sub             %dst, 64, %dst          /*  IEU0                                */
565         fpsub32         %f2, %f2, %f2           /*  FPA         Group                   */
566         fpsub32         %f4, %f4, %f4           /*  FPA         Group                   */
567         fcmpgt32        %f32, %f6, %x4          /*  FPM         Group                   */
568         faligndata      %f6, %f8, %f48          /*  FPA                                 */
569         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group                   */
570         faligndata      %f8, %f10, %f50         /*  FPA                                 */
571         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group                   */
572         faligndata      %f10, %f12, %f52        /*  FPA                                 */
573         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group                   */
574         faligndata      %f12, %f14, %f54        /*  FPA                                 */
575         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group                   */
576         fmovd           %f14, %f56              /*  FPA                                 */
577         inc             %x4                     /*  IEU0                                */
578         srl             %x4, 1, %x4             /*  IEU0        Group                   */
579 vis3:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
580                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
581                         ,LDBLK(f32),    ,,,STBLK,,,,,
582                         ,bcs,pn %icc, vis3e1)
583         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
584                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
585                         ,LDBLK(f0),     ,,,STBLK,,,,,
586                         ,bcs,pn %icc, vis3e2)
587         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
588                         ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
589                         ,LDBLK(f16),    ,,,STBLK,,,,,
590                         ,bcc,pt %icc, vis3)
591 vis3e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
592                         ,f58,f60,f62,f48,f50,f52,f54,f56,f32,
593                         ,SYNC,          ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
594                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
595 vis3e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
596                         ,f58,f60,f62,f48,f50,f52,f54,f56,f0,
597                         ,SYNC,          ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
598                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
599 vis3e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
600                         ,f58,f60,f62,f48,f50,f52,f54,f56,f16,
601                         ,SYNC,          ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
602                         ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
603         .align          2048
604 vis4s:  wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group                   */
605         add             %src, 128 - 32, %src    /*  IEU0        Group                   */
606         ldda            [%src-128] %asi, %f0    /*  Load        Group                   */
607         ldda            [%src-64] %asi, %f16    /*  Load        Group                   */
608         fmovd           %f0, %f52               /*  FPA         Group                   */
609         fmovd           %f48, %f0               /*  FPA         Group                   */
610         sub             %dst, 64, %dst          /*  IEU0                                */
611         fpsub32         %f2, %f2, %f2           /*  FPA         Group                   */
612         fpsub32         %f4, %f4, %f4           /*  FPA         Group                   */
613         fpsub32         %f6, %f6, %f6           /*  FPA         Group                   */
614         clr             %x4                     /*  IEU0                                */
615         fcmpgt32        %f32, %f8, %x5          /*  FPM         Group                   */
616         faligndata      %f8, %f10, %f48         /*  FPA                                 */
617         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group                   */
618         faligndata      %f10, %f12, %f50        /*  FPA                                 */
619         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group                   */
620         faligndata      %f12, %f14, %f52        /*  FPA                                 */
621         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group                   */
622         fmovd           %f14, %f54              /*  FPA                                 */
623 vis4:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
624                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
625                         ,LDBLK(f32),    ,,,,STBLK,,,,
626                         ,bcs,pn %icc, vis4e1)
627         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
628                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
629                         ,LDBLK(f0),     ,,,,STBLK,,,,
630                         ,bcs,pn %icc, vis4e2)
631         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
632                         ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
633                         ,LDBLK(f16),    ,,,,STBLK,,,,
634                         ,bcc,pt %icc, vis4)
635 vis4e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
636                         ,f56,f58,f60,f62,f48,f50,f52,f54,f32,
637                         ,SYNC,          ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
638                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
639 vis4e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
640                         ,f56,f58,f60,f62,f48,f50,f52,f54,f0,
641                         ,SYNC,          ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
642                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
643 vis4e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
644                         ,f56,f58,f60,f62,f48,f50,f52,f54,f16,
645                         ,SYNC,          ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
646                         ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
647         .align          2048
648 vis5s:  add             %src, 128 - 40, %src    /*  IEU0        Group                   */
649         ldda            [%src-88] %asi, %f10    /*  Load        Group                   */
650         ldda            [%src-80] %asi, %f12    /*  Load        Group                   */
651         ldda            [%src-72] %asi, %f14    /*  Load        Group                   */
652         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group                   */
653         ldda            [%src-64] %asi, %f16    /*  Load        Group                   */
654         fmovd           %f48, %f0               /*  FPA         Group                   */
655         fmuld           %f32, %f32, %f2         /*  FPM                                 */
656         clr             %x4                     /*  IEU0                                */
657         faddd           %f32, %f32, %f4         /*  FPA         Group                   */
658         fmuld           %f32, %f32, %f6         /*  FPM                                 */
659         clr             %x5                     /*  IEU0                                */
660         faddd           %f32, %f32, %f8         /*  FPA         Group                   */
661         fcmpgt32        %f32, %f10, %x6         /*  FPM         Group                   */
662         sub             %dst, 64, %dst          /*  IEU0                                */
663         faligndata      %f10, %f12, %f48        /*  FPA                                 */
664         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group                   */
665         faligndata      %f12, %f14, %f50        /*  FPA                                 */
666         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group                   */
667         fmovd           %f14, %f52              /*  FPA                                 */
668 vis5:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
669                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
670                         ,LDBLK(f32),    ,,,,,STBLK,,,
671                         ,bcs,pn %icc, vis5e1)
672         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
673                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
674                         ,LDBLK(f0),     ,,,,,STBLK,,,
675                         ,bcs,pn %icc, vis5e2)
676         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
677                         ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
678                         ,LDBLK(f16),    ,,,,,STBLK,,,
679                         ,bcc,pt %icc, vis5)
680 vis5e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
681                         ,f54,f56,f58,f60,f62,f48,f50,f52,f32,
682                         ,SYNC,          ,,,,,STBLK,ST(f48,64),ST(f50,72),
683                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
684 vis5e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
685                         ,f54,f56,f58,f60,f62,f48,f50,f52,f0,
686                         ,SYNC,          ,,,,,STBLK,ST(f48,64),ST(f50,72),
687                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
688 vis5e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
689                         ,f54,f56,f58,f60,f62,f48,f50,f52,f16,
690                         ,SYNC,          ,,,,,STBLK,ST(f48,64),ST(f50,72),
691                         ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
692         .align          2048
693 vis6s:  add             %src, 128 - 48, %src    /*  IEU0        Group                   */
694         ldda            [%src-80] %asi, %f12    /*  Load        Group                   */
695         ldda            [%src-72] %asi, %f14    /*  Load        Group                   */
696         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group                   */
697         ldda            [%src-64] %asi, %f16    /*  Load        Group                   */
698         fmovd           %f48, %f0               /*  FPA         Group                   */
699         fmuld           %f32, %f32, %f2         /*  FPM                                 */
700         clr             %x4                     /*  IEU0                                */
701         faddd           %f32, %f32, %f4         /*  FPA         Group                   */
702         fmuld           %f32, %f32, %f6         /*  FPM                                 */
703         clr             %x5                     /*  IEU0                                */
704         faddd           %f32, %f32, %f8         /*  FPA         Group                   */
705         fmuld           %f32, %f32, %f10        /*  FPM                                 */
706         clr             %x6                     /*  IEU0                                */
707         fcmpgt32        %f32, %f12, %x7         /*  FPM         Group                   */
708         sub             %dst, 64, %dst          /*  IEU0                                */
709         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group                   */
710         faligndata      %f12, %f14, %f48        /*  FPA                                 */
711         fmovd           %f14, %f50              /*  FPA         Group                   */
712 vis6:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
713                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
714                         ,LDBLK(f32),    ,,,,,,STBLK,,
715                         ,bcs,pn %icc, vis6e1)
716         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
717                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
718                         ,LDBLK(f0),     ,,,,,,STBLK,,
719                         ,bcs,pn %icc, vis6e2)
720         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
721                         ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
722                         ,LDBLK(f16),    ,,,,,,STBLK,,
723                         ,bcc,pt %icc, vis6)
724 vis6e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
725                         ,f52,f54,f56,f58,f60,f62,f48,f50,f32,
726                         ,SYNC,          ,,,,,,STBLK,ST(f48,64),
727                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
728 vis6e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
729                         ,f52,f54,f56,f58,f60,f62,f48,f50,f0,
730                         ,SYNC,          ,,,,,,STBLK,ST(f48,64),
731                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
732 vis6e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
733                         ,f52,f54,f56,f58,f60,f62,f48,f50,f16,
734                         ,SYNC,          ,,,,,,STBLK,ST(f48,64),
735                         ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
736         .align          2048
737 vis7s:  add             %src, 128 - 56, %src    /*  IEU0        Group                   */
738         ldda            [%src-72] %asi, %f14    /*  Load        Group                   */
739         wr              %g2, ASI_BLK_XOR, %asi  /*  LSU         Group                   */
740         ldda            [%src-64] %asi, %f16    /*  Load        Group                   */
741         fmovd           %f48, %f0               /*  FPA         Group                   */
742         fmuld           %f32, %f32, %f2         /*  FPM                                 */
743         clr             %x4                     /*  IEU0                                */
744         faddd           %f32, %f32, %f4         /*  FPA         Group                   */
745         fmuld           %f32, %f32, %f6         /*  FPM                                 */
746         clr             %x5                     /*  IEU0                                */
747         faddd           %f32, %f32, %f8         /*  FPA         Group                   */
748         fmuld           %f32, %f32, %f10        /*  FPM                                 */
749         clr             %x6                     /*  IEU0                                */
750         faddd           %f32, %f32, %f12        /*  FPA         Group                   */
751         clr             %x7                     /*  IEU0                                */
752         fcmpgt32        %f32, %f14, %x8         /*  FPM         Group                   */
753         sub             %dst, 64, %dst          /*  IEU0                                */
754         fmovd           %f14, %f48              /*  FPA                                 */
755 vis7:   DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
756                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
757                         ,LDBLK(f32),    ,,,,,,,STBLK,
758                         ,bcs,pn %icc, vis7e1)
759         DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
760                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
761                         ,LDBLK(f0),     ,,,,,,,STBLK,
762                         ,bcs,pn %icc, vis7e2)
763         DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
764                         ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
765                         ,LDBLK(f16),    ,,,,,,,STBLK,
766                         ,bcc,pt %icc, vis7)
767 vis7e3: DO_THE_TRICK(   f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,     
768                         ,f50,f52,f54,f56,f58,f60,f62,f48,f32,
769                         ,SYNC,          ,,,,,,,STBLK,
770                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
771 vis7e1: DO_THE_TRICK(   f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
772                         ,f50,f52,f54,f56,f58,f60,f62,f48,f0,
773                         ,SYNC,          ,,,,,,,STBLK,
774                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
775 vis7e2: DO_THE_TRICK(   f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,     
776                         ,f50,f52,f54,f56,f58,f60,f62,f48,f16,
777                         ,SYNC,          ,,,,,,,STBLK,
778                         ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
779 e1:     END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
780 e2:     END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
781 e3:     END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
782 ett:    rd              %asi, %x4               /*  LSU         Group+4bubbles          */
783         rd              %gsr, %x3               /*  LSU         Group+4bubbles          */
784 #ifdef __KERNEL__
785         srl             %x4, 3, %x5             /*  IEU0        Group                   */
786         xor             %x4, ASI_BLK_XOR1, %x4  /*  IEU1                                */
787         wr              %x4, %x5, %asi          /*  LSU         Group+4bubbles          */
788 #else
789         wr              %x4, ASI_BLK_XOR, %asi  /*  LSU         Group+4bubbles          */
790 #endif
791         andcc           %x3, 7, %x3             /*  IEU1        Group                   */
792         add             %dst, 8, %dst           /*  IEU0                                */
793         bne,pn          %icc, 1f                /*  CTI                                 */
794          fzero          %f10                    /*  FPA                                 */
795         brz,a,pn        %len, 2f                /*  CTI+IEU1    Group                   */
796          std            %f6, [%dst - 8]         /*  Store                               */
797 1:      cmp             %len, 8                 /*  IEU1                                */
798         blu,pn          %icc, 3f                /*  CTI                                 */
799          sub            %src, 64, %src          /*  IEU0        Group                   */
800 1:      ldda            [%src] %asi, %f2        /*  Load        Group                   */
801         fpadd32         %f10, %f2, %f12         /*  FPA         Group+load stall        */
802         add             %src, 8, %src           /*  IEU0                                */
803         add             %dst, 8, %dst           /*  IEU1                                */
804         faligndata      %f6, %f2, %f14          /*  FPA         Group                   */
805         fcmpgt32        %f10, %f12, %x5         /*  FPM         Group                   */
806         std             %f14, [%dst - 16]       /*  Store                               */
807         fmovd           %f2, %f6                /*  FPA                                 */
808         fmovd           %f12, %f10              /*  FPA         Group                   */
809         sub             %len, 8, %len           /*  IEU1                                */
810         fzero           %f16                    /*  FPA         Group - FPU nop         */
811         fzero           %f18                    /*  FPA         Group - FPU nop         */
812         inc             %x5                     /*  IEU0                                */
813         srl             %x5, 1, %x5             /*  IEU0        Group (regdep)          */
814         cmp             %len, 8                 /*  IEU1                                */
815         bgeu,pt         %icc, 1b                /*  CTI                                 */
816          add            %x5, %sum, %sum         /*  IEU0        Group                   */
817 3:      brz,a,pt        %x3, 2f                 /*  CTI+IEU1                            */
818          std            %f6, [%dst - 8]         /*  Store       Group                   */
819         st              %f7, [%dst - 8]         /*  Store       Group                   */
820         sub             %dst, 4, %dst           /*  IEU0                                */
821         add             %len, 4, %len           /*  IEU1                                */
823 #ifdef __KERNEL__
824         sub             %sp, 8, %sp             /*  IEU0        Group                   */
825 #endif
826         END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
827         membar          #Sync                   /*  LSU         Group                   */
828 #ifdef __KERNEL__
829         VISExit
830         add             %sp, 8, %sp             /*  IEU0        Group                   */
831 #endif
/* Final fold and return (local labels 23/24/25).
 *
 * If %len is still non-zero, branch to 26f to process the remaining
 * bytes; that path re-enters at 25 below.  Otherwise fold the 64-bit
 * accumulator in %sum down to 32 bits and return it in %src (%o0):
 *
 *   %g1  = %sum << 32
 *   %src = %sum + %g1     upper word = high32(sum) + low32(sum),
 *                         carry out of bit 63 recorded in %xcc
 *   %src = %src >> 32     srlx leaves the condition codes untouched
 *   if carry: %src += 1   end-around carry
 */
832 23:     brnz,pn         %len, 26f               /*  CTI+IEU1    Group                   */
/* Delay slot is not annulled: %g1 is computed whether or not the branch is taken. */
833 24:      sllx           %sum, 32, %g1           /*  IEU0                                */
834 25:     addcc           %sum, %g1, %src         /*  IEU1        Group                   */
835         srlx            %src, 32, %src          /*  IEU0        Group (regdep)          */
/* Annulled delay slot: the +1 executes only when the addcc above carried. */
836         bcs,a,pn        %xcc, 1f                /*  CTI                                 */
837          add            %src, 1, %src           /*  IEU1                                */
838 #ifndef __KERNEL__
/* Non-kernel build: srl by 0 in the retl delay slot clears the upper 32 bits of the result. */
839 1:      retl                                    /*  CTI         Group brk forced        */
840          srl            %src, 0, %src           /*  IEU0                                */
841 #else
/* Kernel build: %g4 was used as scratch (x4) by this routine; rebuild the
 * upper part of PAGE_OFFSET in it (sethi %uhi, then shift left by 32 in the
 * retl delay slot) before returning.
 * NOTE(review): presumably the kernel expects %g4 to hold this value on
 * return - confirm against the callers / register conventions.
 */
842 1:      sethi           %uhi(PAGE_OFFSET), %g4  /*  IEU0        Group                   */
843         retl                                    /*  CTI         Group brk forced        */
844          sllx           %g4, 32, %g4            /*  IEU0                                */
845 #endif
/* Tail handling (local label 26): copy and checksum the bytes selected by
 * bits 8, 4, 2 and 1 of %len, using integer loads/stores.
 *
 * Loads go through the current %asi (lduwa/lduha/lduba); stores are plain
 * stw/sth/stb to %dst.  Each piece is added into the 64-bit accumulator
 * %sum with an end-around carry, then control returns to label 25 for the
 * final fold.
 *
 * Scratch registers written here: %o4, %g2, %g3, %g5, %g7, %o5, %g1.
 */
/* --- 8-byte piece --------------------------------------------------- */
846 26:     andcc           %len, 8, %g0            /*  IEU1        Group                   */
847         be,pn           %icc, 1f                /*  CTI                                 */
/* Delay slot is NOT annulled: this word load is issued even when the
 * 8-byte bit is clear (the value is then simply unused).
 */
848          lduwa          [%src] %asi, %o4        /*  Load                                */
849         lduwa           [%src+4] %asi, %g2      /*  Load        Group                   */
850         add             %src, 8, %src           /*  IEU0                                */
851         add             %dst, 8, %dst           /*  IEU1                                */
/* %g5 = (first word << 32) | second word, i.e. the 8 bytes as one 64-bit value. */
852         sllx            %o4, 32, %g5            /*  IEU0        Group                   */
853         stw             %o4, [%dst - 8]         /*  Store                               */
854         or              %g5, %g2, %g5           /*  IEU0        Group                   */
855         stw             %g2, [%dst - 4]         /*  Store                               */
856         addcc           %g5, %sum, %sum         /*  IEU1        Group                   */
/* Annulled delay slot: +1 only when the 64-bit add carried. */
857         bcs,a,pn        %xcc, 1f                /*  CTI                                 */
858          add            %sum, 1, %sum           /*  IEU0                                */
/* --- 4-byte piece: %g2 = word << 32, or 0 when the bit is clear ------ */
859 1:      andcc           %len, 4, %g0            /*  IEU1        Group                   */
/* Annulled delay slot: the clr runs only when the branch is taken (piece absent). */
860         be,a,pn         %icc, 1f                /*  CTI                                 */
861          clr            %g2                     /*  IEU0                                */
862         lduwa           [%src] %asi, %g7        /*  Load                                */
863         add             %src, 4, %src           /*  IEU0        Group                   */
864         add             %dst, 4, %dst           /*  IEU1                                */
865         sllx            %g7, 32, %g2            /*  IEU0        Group                   */
866         stw             %g7, [%dst - 4]         /*  Store                               */
/* --- 2-byte piece: %g3 = halfword << 16, or 0 when the bit is clear -- */
867 1:      andcc           %len, 2, %g0            /*  IEU1                                */
868         be,a,pn         %icc, 1f                /*  CTI                                 */
869          clr            %g3                     /*  IEU0        Group                   */
870         lduha           [%src] %asi, %g7        /*  Load                                */
871         add             %src, 2, %src           /*  IEU1                                */
872         add             %dst, 2, %dst           /*  IEU0        Group                   */
873         sll             %g7, 16, %g3            /*  IEU0        Group                   */
874         sth             %g7, [%dst - 2]         /*  Store                               */
/* --- 1-byte piece: %o5 = byte << 8, or 0 when the bit is clear ------- */
/* %src/%dst are not advanced after this last piece. */
875 1:      andcc           %len, 1, %g0            /*  IEU1                                */
876         be,a,pn         %icc, 1f                /*  CTI                                 */
877          clr            %o5                     /*  IEU0        Group                   */
878         lduba           [%src] %asi, %g7        /*  Load                                */
879         sll             %g7, 8, %o5             /*  IEU0        Group                   */
880         stb             %g7, [%dst]             /*  Store                               */
/* Merge the 4-, 2- and 1-byte pieces (they occupy disjoint bit ranges)
 * and add them to %sum with an end-around carry.
 */
881 1:      or              %g2, %g3, %g3           /*  IEU1                                */
882         or              %o5, %g3, %g3           /*  IEU0        Group (regdep)          */
883         addcc           %g3, %sum, %sum         /*  IEU1        Group (regdep)          */
884         bcs,a,pn        %xcc, 1f                /*  CTI                                 */
885          add            %sum, 1, %sum           /*  IEU0                                */
/* Rejoin the final fold at label 25; the delay slot recomputes
 * %g1 = %sum << 32, which the code at 25 consumes.
 */
886 1:      ba,pt           %xcc, 25b               /*  CTI         Group                   */
887          sllx           %sum, 32, %g1           /*  IEU0                                */
/* Kernel builds only: `end` labels the address immediately after the code
 * above, and one record of four words is emitted into the __ex_table
 * section: csum_partial_copy_vis, 0, end, cpc_handler.
 * NOTE(review): this looks like a range-style exception-table entry that
 * sends faults taken between csum_partial_copy_vis and end to cpc_handler -
 * confirm the record layout against the exception-table lookup code.
 */
889 #ifdef __KERNEL__
890 end:
892         .section        __ex_table
893         .align          4
894         .word           csum_partial_copy_vis, 0, end, cpc_handler
895 #endif