beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / x86 / pentium / mmx / lshift.asm
blob04b0ddcc8f09e986e394d462acd965e612f64655
1 dnl Intel P5 mpn_lshift -- mpn left shift.
3 dnl Copyright 2000-2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
6 dnl
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
9 dnl
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
13 dnl
14 dnl or
15 dnl
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
18 dnl later version.
19 dnl
20 dnl or both in parallel, as here.
21 dnl
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
25 dnl for more details.
26 dnl
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
C P5: 1.75 cycles/limb.


C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  Return the bits shifted out at the
C left.
C
C Arguments are read from the stack through the PARAM_* frame offsets
C defined below and the return value is left in %eax.  %ebx and %edi are
C pushed on entry and popped again on every exit path.  The MMX paths use
C mm0-mm3 and mm5-mm7 and execute emms before returning.
C
C NOTE(review): nothing here checks the arguments.  The code relies on
C size >= 1 and on a shift of 1 to 31 bits, which is what the GMP manual
C documents for mpn_lshift -- confirm against callers.
C
C Three code paths are selected by size:
C
C   size == 1                     integer shld/shl, no MMX
C   2 <= size < UNROLL_THRESHOLD  L(simple), one limb per iteration
C   size >= UNROLL_THRESHOLD      L(unroll), one qword per store, with
C                                 fixups to 8-byte align src and dst
C
C Every path works from the high limb down towards the low limb.
C
C The comments in mpn_rshift apply here too.

defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)
deflit(`FRAME',0)

dnl  minimum 5, because the unrolled loop can't handle less
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(8)

PROLOGUE(mpn_lshift)

	pushl	%ebx
	pushl	%edi
deflit(`FRAME',8)

	movl	PARAM_SIZE, %eax
	movl	PARAM_DST, %edx

	movl	PARAM_SRC, %ebx
	movl	PARAM_SHIFT, %ecx

	cmp	$UNROLL_THRESHOLD, %eax
	jae	L(unroll)

	movl	-4(%ebx,%eax,4), %edi	C src high limb
	decl	%eax

	jnz	L(simple)

	C size == 1: the single limb is in edi, shld collects the bits
	C shifted out of it into eax as the return value.

	shldl(	%cl, %edi, %eax)	C eax was decremented to zero

	shll	%cl, %edi

	movl	%edi, (%edx)		C dst low limb
	popl	%edi			C risk of data cache bank clash

	popl	%ebx

	ret


C -----------------------------------------------------------------------------
L(simple):
	C eax	size-1
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi
	C ebp
deflit(`FRAME',8)

	movd	(%ebx,%eax,4), %mm5	C src high limb

	movd	%ecx, %mm6		C lshift
	negl	%ecx

	psllq	%mm6, %mm5
	addl	$32, %ecx		C 32-shift

	movd	%ecx, %mm7
	psrlq	$32, %mm5		C retval


L(simple_top):
	C eax	counter, index of the dst limb to form, size-1 down to 1
	C ebx	src
	C ecx
	C edx	dst
	C esi
	C edi
	C
	C mm0	scratch
	C mm5	return value
	C mm6	shift
	C mm7	32-shift

	C Load src limbs eax-1 and eax as one qword.  Shifting it right by
	C 32-shift leaves the finished dst limb in the low dword.

	movq	-4(%ebx,%eax,4), %mm0
	decl	%eax

	psrlq	%mm7, %mm0

	C the MMX instructions leave the flags from decl intact for jnz

	movd	%mm0, 4(%edx,%eax,4)
	jnz	L(simple_top)


	movd	(%ebx), %mm0		C src low limb

	movd	%mm5, %eax		C retval
	psllq	%mm6, %mm0

	popl	%edi
	popl	%ebx

	movd	%mm0, (%edx)		C dst low limb

	emms

	C Return here.  Without this ret, execution would run on into
	C L(unroll) with ebx and edi already restored.
	ret


C -----------------------------------------------------------------------------
	ALIGN(8)
L(unroll):
	C eax	size
	C ebx	src
	C ecx	shift
	C edx	dst
	C esi
	C edi
	C ebp
deflit(`FRAME',8)

	movd	-4(%ebx,%eax,4), %mm5	C src high limb
	leal	(%ebx,%eax,4), %edi

	movd	%ecx, %mm6		C lshift
	andl	$4, %edi

	psllq	%mm6, %mm5
	jz	L(start_src_aligned)


	C src isn't aligned, process high limb separately (marked xxx) to
	C make it so.
	C
	C  source     -8(ebx,%eax,4)
	C                  |
	C  +-------+-------+-------+--
	C  |               |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8
	C
	C  dest
	C     -4(edx,%eax,4)
	C          |
	C  +-------+-------+--
	C  |  xxx  |       |
	C  +-------+-------+--

	movq	-8(%ebx,%eax,4), %mm0	C unaligned load

	psllq	%mm6, %mm0
	decl	%eax

	psrlq	$32, %mm0

	C

	movd	%mm0, (%edx,%eax,4)
L(start_src_aligned):

	movq	-8(%ebx,%eax,4), %mm1	C src high qword
	leal	(%edx,%eax,4), %edi

	andl	$4, %edi
	psrlq	$32, %mm5		C return value

	movq	-16(%ebx,%eax,4), %mm3	C src second highest qword
	jz	L(start_dst_aligned)

	C dst isn't aligned, subtract 4 to make it so, and pretend the shift
	C is 32 bits extra.  High limb of dst (marked xxx) handled here
	C separately.
	C
	C  source     -8(ebx,%eax,4)
	C                  |
	C  +-------+-------+--
	C  |      mm1      |
	C  +-------+-------+--
	C        0mod8   4mod8
	C
	C  dest
	C     -4(edx,%eax,4)
	C          |
	C  +-------+-------+-------+--
	C  |  xxx  |               |
	C  +-------+-------+-------+--
	C        0mod8   4mod8   0mod8

	movq	%mm1, %mm0
	addl	$32, %ecx		C new shift

	psllq	%mm6, %mm0

	movd	%ecx, %mm6
	psrlq	$32, %mm0

	C wasted cycle here waiting for %mm0

	movd	%mm0, -4(%edx,%eax,4)
	subl	$4, %edx
L(start_dst_aligned):


	psllq	%mm6, %mm1
	negl	%ecx			C -shift

	addl	$64, %ecx		C 64-shift
	movq	%mm3, %mm2

	movd	%ecx, %mm7
	subl	$8, %eax		C size-8

	psrlq	%mm7, %mm3

	por	%mm1, %mm3		C mm3 ready to store
	jc	L(finish)		C carry from the subl, fewer than 8 limbs


	C The comments in mpn_rshift apply here too.

	ALIGN(8)
L(unroll_loop):
	C eax	counter, limbs
	C ebx	src
	C ecx
	C edx	dst
	C esi
	C edi
	C
	C mm0
	C mm1
	C mm2	src qword from 16(%ebx,%eax,4)
	C mm3	dst qword ready to store to 24(%edx,%eax,4)
	C
	C mm5	return value
	C mm6	lshift
	C mm7	rshift

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	(%ebx,%eax,4), %mm3	C
	psllq	%mm6, %mm1		C

	movq	%mm0, 16(%edx,%eax,4)
	movq	%mm3, %mm2		C

	psrlq	%mm7, %mm3		C
	subl	$4, %eax

	por	%mm1, %mm3		C
	jnc	L(unroll_loop)



L(finish):
	C eax	-4 to -1 representing respectively 0 to 3 limbs remaining

	testb	$2, %al

	jz	L(finish_no_two)

	movq	8(%ebx,%eax,4), %mm0
	psllq	%mm6, %mm2

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	movq	%mm3, 24(%edx,%eax,4)	C prev
	por	%mm2, %mm0

	movq	%mm1, %mm2
	movq	%mm0, %mm3

	subl	$2, %eax
L(finish_no_two):


	C eax	-4 or -3 representing respectively 0 or 1 limbs remaining
	C
	C mm2	src prev qword, from 16(%ebx,%eax,4)
	C mm3	dst qword, for 24(%edx,%eax,4)

	testb	$1, %al
	movd	%mm5, %eax	C retval

	popl	%edi
	jz	L(finish_zero)	C flags still from the testb above


	C One extra src limb, destination was aligned.
	C
	C                 source                  ebx
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C dest         edx+12           edx+4     edx
	C --+---------------+---------------+-------+
	C   |      mm3      |               |       |
	C --+---------------+---------------+-------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C One extra src limb, destination was unaligned.
	C
	C                 source                  ebx
	C                 --+---------------+-------+
	C                   |      mm2      |       |
	C                 --+---------------+-------+
	C
	C         dest         edx+12           edx+4
	C         --+---------------+---------------+
	C           |      mm3      |               |
	C         --+---------------+---------------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C In both cases there's one extra limb of src to fetch and combine
	C with mm2 to make a qword at 4(%edx), and in the aligned case
	C there's an extra limb of dst to be formed from that extra src limb
	C left shifted.


	movd	(%ebx), %mm0
	psllq	%mm6, %mm2

	movq	%mm3, 12(%edx)
	psllq	$32, %mm0

	movq	%mm0, %mm1
	psrlq	%mm7, %mm0

	por	%mm2, %mm0
	psllq	%mm6, %mm1

	movq	%mm0, 4(%edx)
	psrlq	$32, %mm1

	andl	$32, %ecx	C bit 5 of 64-shift is set only if dst was aligned
	popl	%ebx

	jz	L(finish_one_unaligned)

	movd	%mm1, (%edx)
L(finish_one_unaligned):

	emms

	C Return here.  Without this ret, execution would run on into
	C L(finish_zero), which stores again and pops ebx a second time.
	ret


L(finish_zero):

	C No extra src limbs, destination was aligned.
	C
	C                 source          ebx
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C dest          edx+8             edx
	C --+---------------+---------------+
	C   |      mm3      |               |
	C --+---------------+---------------+
	C
	C mm6 = shift
	C mm7 = ecx = 64-shift


	C No extra src limbs, destination was unaligned.
	C
	C               source            ebx
	C                 --+---------------+
	C                   |      mm2      |
	C                 --+---------------+
	C
	C         dest          edx+8   edx+4
	C         --+---------------+-------+
	C           |      mm3      |       |
	C         --+---------------+-------+
	C
	C mm6 = shift+32
	C mm7 = ecx = 64-(shift+32)


	C The movd for the unaligned case writes the same data to 4(%edx)
	C that the movq does for the aligned case.


	movq	%mm3, 8(%edx)
	andl	$32, %ecx	C bit 5 of 64-shift is set only if dst was aligned

	psllq	%mm6, %mm2
	jz	L(finish_zero_unaligned)

	movq	%mm2, (%edx)
L(finish_zero_unaligned):

	psrlq	$32, %mm2
	popl	%ebx

	movd	%mm5, %eax	C retval

	movd	%mm2, 4(%edx)

	emms

	C Return here.  Without this ret, execution would run off the end
	C of the function.
	ret

EPILOGUE()