3 * Optimized version of the copy_user() routine.
4 * It is used to copy data across the kernel/user boundary.
6 * The source and destination are always on opposite sides of
7 * the boundary. When reading from user space we must catch
8 * faults on loads. When writing to user space we must catch
9 * errors on stores. Note that because of the nature of the copy
10 * we don't need to worry about overlapping regions.
14 * in0 address of source buffer
15 * in1 address of destination buffer
16 * in2 number of bytes to copy
19 * ret0 0 in case of success. The number of bytes NOT copied in
22 * Copyright (C) 2000 Hewlett-Packard Co
23 * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
26 * - handle the case where we have more than 16 bytes and the alignment
29 * - fix extraneous stop bit introduced by the EX() macro.
32 #include <asm/asmmacro.h>
34 // The label comes first because our store instruction contains a comma
35 // and confuses the preprocessor otherwise
43 .section __ex_table,"a"; \
51 // Tuneable parameters
53 #define COPY_BREAK 16 // we do byte copy below (must be >=16)
54 #define PIPE_DEPTH 4 // pipe depth
56 #define EPI p[PIPE_DEPTH-1] // PASTE(p,16+PIPE_DEPTH-1)
86 GLOBAL_ENTRY(__copy_user)
// in0=src, in1=dst, in2=len; ret0=0 on success, otherwise the number of
// bytes NOT copied (see the header comment at the top of this file).
88 UNW(.save ar.pfs, saved_pfs)
89 alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)
// Rotating registers for the software-pipelined copy loops below.
91 .rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
94 adds len2=-1,len // br.ctop is repeat/until
97 ;; // RAW of cfm when len=0
98 cmp.eq p8,p0=r0,len // check for zero length
99 UNW(.save ar.lc, saved_lc)
100 mov saved_lc=ar.lc // preserve ar.lc (slow)
101 (p8) br.ret.spnt.few rp // empty memcpy()
103 add enddst=dst,len // first byte after end of destination
104 add endsrc=src,len // first byte after end of source
105 UNW(.save pr, saved_pr)
106 mov saved_pr=pr // preserve predicates
110 mov dst1=dst // copy because of rotation
112 mov pr.rot=1<<16 // p16=true all others are false
114 mov src1=src // copy because of rotation
115 mov ar.lc=len2 // initialize lc for small count
116 cmp.lt p10,p7=COPY_BREAK,len // if len > COPY_BREAK then long copy
118 xor tmp=src,dst // same alignment test prepare
119 (p10) br.cond.dptk.few long_copy_user
120 ;; // RAW pr.rot/p16 ?
122 // Now we do the byte by byte loop with software pipeline
124 // p7 is necessarily false by now
126 EX(failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
128 EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
// Restore caller state (predicates, ar.pfs/ar.ec) before returning.
132 mov pr=saved_pr,0xffffffffffff0000
133 mov ar.pfs=saved_pfs // restore ar.ec
134 br.ret.sptk.few rp // end of short memcpy
137 // Beginning of long memcpy (i.e. > 16 bytes)
140 tbit.nz p6,p7=src1,0 // odd alignment
144 mov len1=len // copy because of rotation
145 (p8) br.cond.dpnt.few 1b // XXX Fixme. memcpy_diff_align
147 // At this point we know we have more than 16 bytes to copy
148 // and also that both src and dest have the same alignment
149 // which may not be the one we want. So for now we must move
150 // forward slowly until we reach 16byte alignment: no need to
151 // worry about reaching the end of buffer.
153 EX(failure_in1,(p6) ld1 val1[0]=[src1],1) // 1-byte aligned
154 (p6) adds len1=-1,len1;;
157 EX(failure_in1,(p7) ld2 val1[1]=[src1],2) // 2-byte aligned
158 (p7) adds len1=-2,len1;;
162 // Stop bit not required after ld4 because if we fail on ld4
163 // we have never executed the ld1, therefore st1 is not executed.
165 EX(failure_in1,(p8) ld4 val2[0]=[src1],4) // 4-byte aligned
166 EX(failure_out,(p6) st1 [dst1]=val1[0],1)
170 // Stop bit not required after ld8 because if we fail on ld8
171 // we have never executed the ld2, therefore st2 is not executed.
173 EX(failure_in1,(p9) ld8 val2[1]=[src1],8) // 8-byte aligned
174 EX(failure_out,(p7) st2 [dst1]=val1[1],2)
175 (p8) adds len1=-4,len1
177 EX(failure_out, (p8) st4 [dst1]=val2[0],4)
178 (p9) adds len1=-8,len1;;
179 shr.u cnt=len1,4 // number of 128-bit (2x64bit) words
181 EX(failure_out, (p9) st8 [dst1]=val2[1],8)
184 adds tmp=-1,cnt // br.ctop is repeat/until
185 (p7) br.cond.dpnt.few .dotail // we have less than 16 bytes left
// Main pipelined loop: two ld8/st8 streams, 16 bytes per iteration.
195 EX(failure_in3,(p16) ld8 val1[0]=[src1],16)
196 (p16) ld8 val2[0]=[src2],16
198 EX(failure_out, (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16)
199 (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
201 ;; // RAW on src1 when fall through from loop
203 // Tail correction based on len only
205 // No matter where we come from (loop or test) the src1 pointer
206 // is 16 byte aligned AND we have less than 16 bytes to copy.
209 EX(failure_in1,(p6) ld8 val1[0]=[src1],8) // at least 8 bytes
212 EX(failure_in1,(p7) ld4 val1[1]=[src1],4) // at least 4 bytes
215 EX(failure_in1,(p8) ld2 val2[0]=[src1],2) // at least 2 bytes
218 EX(failure_out, (p6) st8 [dst1]=val1[0],8)
220 EX(failure_in1,(p9) ld1 val2[1]=[src1]) // only 1 byte left
223 EX(failure_out,(p7) st4 [dst1]=val1[1],4)
224 mov pr=saved_pr,0xffffffffffff0000
226 EX(failure_out, (p8) st2 [dst1]=val2[0],2)
229 EX(failure_out, (p9) st1 [dst1]=val2[1])
235 // Here we handle the case where the byte by byte copy fails
237 // Several factors make the zeroing of the rest of the buffer kind of
239 // - the pipeline: loads/stores are not in sync (pipeline)
241 // In the same loop iteration, the dst1 pointer does not directly
242 // reflect where the faulty load was.
245 // When you get a fault on load, you may have valid data from
246 // previous loads not yet stored (still in transit). Such data must be
247 // stored normally before moving onto zeroing the rest.
249 // - single/multi dispersal independence.
252 // - we don't disrupt the pipeline, i.e. data in transit in
253 // the software pipeline will eventually be moved to memory.
254 // We simply replace the load with a simple mov and keep the
255 // pipeline going. We can't really do this inline because
256 // p16 is always reset to 1 when lc > 0.
259 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
262 (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
265 mov pr=saved_pr,0xffffffffffff0000
272 // Here we handle the head & tail part when we check for alignment.
273 // The following code handles only the load failures. The
274 // main difficulty comes from the fact that loads/stores are
275 // scheduled. So when you fail on a load, the stores corresponding
276 // to previous successful loads must be executed.
278 // However some simplifications are possible given the way
282 // Theory of operation:
296 // page_size >= 4k (2^12). (x means 4, 2, 1)
297 // Here we suppose Page A exists and Page B does not.
299 // As we move towards eight byte alignment we may encounter faults.
300 // The numbers on each page show the size of the load (current alignment).
303 // - if you fail on 1, 2, 4 then you have never executed any smaller
304 // size loads, e.g. failing ld4 means no ld1 nor ld2 executed
307 // This allows us to simplify the cleanup code, because basically you
308 // only have to worry about "pending" stores in the case of a failing
309 // ld8(). Given the way the code is written today, this means only
310 // worry about st2, st4. There we can use the information encapsulated
311 // into the predicates.
314 // - if you fail on the ld8 in the head, it means you went straight
315 // to it, i.e. 8byte alignment within an unexisting page.
316 // Again this comes from the fact that if you crossed just for the ld8 then
317 // you are 8byte aligned but also 16byte aligned, therefore you would
318 // either go for the 16byte copy loop OR the ld8 in the tail part.
319 // The combination ld1, ld2, ld4, ld8 where you fail on ld8 is impossible
320 // because it would mean you had 15bytes to copy in which case you
321 // would have defaulted to the byte by byte copy.
325 // Here we know we have less than 16 bytes AND we are either 8 or 16 byte
329 // This means that we either:
330 // - are right on a page boundary
332 // - are at more than 16 bytes from a page boundary with
333 // at most 15 bytes to copy: no chance of crossing.
335 // This allows us to assume that if we fail on a load we haven't possibly
336 // executed any of the previous (tail) ones, so we don't need to do
337 // any stores. For instance, if we fail on ld2, this means we had
338 // 2 or 3 bytes left to copy and we did not execute the ld8 nor ld4.
340 // This means that we are in a situation similar to a fault in the
341 // head part. That's nice!
344 // sub ret0=enddst,dst1 // number of bytes to zero, i.e. not copied
345 // sub len=enddst,dst1,1
346 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
347 sub len=endsrc,src1,1
349 // we know that ret0 can never be zero at this point
350 // because we failed while trying to do a load, i.e. there is still
352 // The failure_in1bis and length problem is taken care of at the
356 failure_in1bis: // from (failure_in3)
357 mov ar.lc=len // Continue with a stupid byte store.
364 mov pr=saved_pr,0xffffffffffff0000
370 // Here we simply restart the loop but instead
371 // of doing loads we fill the pipeline with zeroes
372 // We can't simply store r0 because we may have valid
373 // data in transit in the pipeline.
374 // ar.lc and ar.ec are setup correctly at this point
376 // we MUST use src1/endsrc here and not dst1/enddst because
377 // of the pipeline effect.
380 sub ret0=endsrc,src1 // number of bytes to zero, i.e. not copied
385 (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],16
386 (EPI) st8 [dst2]=val2[PIPE_DEPTH-1],16
389 cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail ?
390 sub len=enddst,dst1,1 // precompute len
391 (p6) br.cond.dptk.few failure_in1bis
393 mov pr=saved_pr,0xffffffffffff0000
399 // handling of failures on stores: that's the easy part
403 mov pr=saved_pr,0xffffffffffff0000