dnl  AMD K7 mpn_lshift -- mpn left shift.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')
C           K7: 1.21 cycles/limb (at 16 limbs/loop).


dnl  K7: UNROLL_COUNT cycles/limb
dnl           4           1.51
dnl           8           1.26
dnl          16           1.21
dnl          32           1.2
dnl  Maximum possible with the current code is 64.

deflit(UNROLL_COUNT, 16)
C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
C                       unsigned shift);
C
C Shift src,size left by shift many bits and store the result in dst,size.
C Zeros are shifted in at the right.  The bits shifted out at the left are
C the return value.
C
C The comments in mpn_rshift apply here too.
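C
C As a rough reference only (an illustrative sketch, not the code below;
C ref_lshift is a placeholder name, and 32-bit limbs with 1 <= shift <= 31
C are assumed), the operation is
C
C   mp_limb_t
C   ref_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift)
C   {
C     mp_limb_t retval = src[size-1] >> (32 - shift);
C     mp_size_t i;
C     for (i = size-1; i > 0; i--)
C       dst[i] = (src[i] << shift) | (src[i-1] >> (32 - shift));
C     dst[0] = src[0] << shift;
C     return retval;
C   }
C
C running from the high end down, as the code below also does.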
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 10)
',`
deflit(UNROLL_THRESHOLD, 10)
')
defframe(PARAM_SHIFT,16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

defframe(SAVE_EDI, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
deflit(SAVE_SIZE, 12)
        TEXT
        ALIGN(32)

PROLOGUE(mpn_lshift)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %eax
        movl    PARAM_SRC, %edx
        subl    $SAVE_SIZE, %esp
deflit(`FRAME',SAVE_SIZE)

        movl    PARAM_SHIFT, %ecx
        movl    %edi, SAVE_EDI

        movl    PARAM_DST, %edi
        decl    %eax
        jnz     L(more_than_one_limb)

        movl    (%edx), %edx

        shldl(  %cl, %edx, %eax)        C eax was decremented to zero

        shll    %cl, %edx

        movl    %edx, (%edi)
        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp

        ret
C -----------------------------------------------------------------------------
L(more_than_one_limb):
        C eax   size-1
        C ebx
        C ecx   shift
        C edx   src
        C esi
        C edi   dst
        C ebp
        movd    PARAM_SHIFT, %mm6
        movd    (%edx,%eax,4), %mm5     C src high limb
        cmp     $UNROLL_THRESHOLD-1, %eax

        jae     L(unroll)
        negl    %ecx
        movd    (%edx), %mm4            C src low limb

        addl    $32, %ecx

        movd    %ecx, %mm7
L(simple_top):
        C eax   loop counter, limbs
        C ebx
        C ecx
        C edx   src
        C esi
        C edi   dst
        C ebp
        C
        C mm0   scratch
        C mm4   src low limb
        C mm5   src high limb
        C mm6   shift
        C mm7   32-shift
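        C
        C In effect each pass below computes (illustrative note only)
        C
        C   dst[i] = (src[i] << shift) | (src[i-1] >> (32-shift));
        C
        C the movq fetches src[i-1],src[i] as one qword and the psrlq by
        C 32-shift leaves exactly that limb in the low half of mm0.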
        movq    -4(%edx,%eax,4), %mm0
        decl    %eax

        psrlq   %mm7, %mm0

        movd    %mm0, 4(%edi,%eax,4)
        jnz     L(simple_top)


        psllq   %mm6, %mm5
        psllq   %mm6, %mm4

        psrlq   $32, %mm5
        movd    %mm4, (%edi)            C dst low limb

        movd    %mm5, %eax              C return value

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret
C -----------------------------------------------------------------------------
        ALIGN(16)
L(unroll):
        C eax   size-1
        C ebx   (saved)
        C ecx   shift
        C edx   src
        C esi
        C edi   dst
        C ebp
        C
        C mm5   src high limb, for return value
        C mm6   lshift
        movl    %esi, SAVE_ESI
        movl    %ebx, SAVE_EBX
        leal    -4(%edx,%eax,4), %edx   C &src[size-2]

        testb   $4, %dl
        movq    (%edx), %mm1            C src high qword

        jz      L(start_src_aligned)
        C src isn't aligned, process high limb (marked xxx) separately to
        C make it so
        C
        C  source     -4(edx,%eax,4)
        C                  |
        C  +-------+-------+-------+--
        C  |  xxx          |
        C  +-------+-------+-------+--
        C        0mod8   4mod8   0mod8
        C
        C  dest         -4(edi,%eax,4)
        C                  |
        C  +-------+-------+--
        C  |  xxx  |       |
        C  +-------+-------+--
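        C
        C In C terms this fixup amounts to (illustrative note only)
        C
        C   dst[size-1] = (src[size-1] << shift) | (src[size-2] >> (32-shift));
        C   size--;             /* the remaining high qword is now 0mod8 */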
        psllq   %mm6, %mm1
        subl    $4, %edx
        movl    %eax, PARAM_SIZE        C size-1

        psrlq   $32, %mm1
        decl    %eax                    C size-2 is new size-1

        movd    %mm1, 4(%edi,%eax,4)
        movq    (%edx), %mm1            C new src high qword
L(start_src_aligned):
        leal    -4(%edi,%eax,4), %edi   C &dst[size-2]
        psllq   %mm6, %mm5

        testl   $4, %edi
        psrlq   $32, %mm5               C return value

        jz      L(start_dst_aligned)
        C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
        C shift is 32 bits extra.  High limb of dst (marked xxx) handled
        C here separately.
        C
        C  source       %edx
        C  +-------+-------+--
        C  |      mm1      |
        C  +-------+-------+--
        C                0mod8   4mod8
        C
        C  dest         %edi
        C  +-------+-------+-------+--
        C  |  xxx          |
        C  +-------+-------+-------+--
        C        0mod8   4mod8   0mod8
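        C
        C Why this works (illustrative note only): stepping dst back one
        C limb makes every later qword store land one limb lower, while
        C adding 32 to the shift moves every bit of the result one limb
        C higher, since for any 64-bit piece x
        C
        C   x << (shift+32)  ==  (x << shift) << 32
        C
        C so the two adjustments cancel and the stored limbs are unchanged.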
        movq    %mm1, %mm0
        psllq   %mm6, %mm1
        addl    $32, %ecx               C shift+32

        psrlq   $32, %mm1

        movd    %mm1, 4(%edi)
        movq    %mm0, %mm1
        subl    $4, %edi

        movd    %ecx, %mm6              C new lshift
L(start_dst_aligned):
        decl    %eax                    C size-2, two last limbs handled at end
        movq    %mm1, %mm2              C copy of src high qword
        negl    %ecx

        andl    $-2, %eax               C round size down to even
        addl    $64, %ecx

        movl    %eax, %ebx
        negl    %eax

        andl    $UNROLL_MASK, %eax
        decl    %ebx

        shll    %eax

        movd    %ecx, %mm7              C rshift = 64-lshift
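        C
        C What follows computes the entry point for a partial first pass
        C (illustrative note only): %eax is now twice the number of unneeded
        C limbs, modulo UNROLL_COUNT, so L(entry) + 5*%eax skips 10 code
        C bytes per unneeded limb (see "10 code bytes/limb" at L(top)), and
        C the two leal adjustments below step %edx and %edi by a matching
        C number of limbs.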
ifdef(`PIC',`
        call    L(pic_calc)
L(here):
',`
        leal    L(entry) (%eax,%eax,4), %esi
')

        shrl    $UNROLL_LOG2, %ebx      C loop counter

        leal    ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
        leal    ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
        movl    PARAM_SIZE, %eax        C for use at end

        jmp     *%esi
ifdef(`PIC',`
L(pic_calc):
        C See mpn/x86/README about old gas bugs
        leal    (%eax,%eax,4), %esi
        addl    $L(entry)-L(here), %esi
        addl    (%esp), %esi

        ret_internal
')
C -----------------------------------------------------------------------------
        ALIGN(32)
L(top):
        C eax   size (for use at end)
        C ebx   loop counter
        C ecx   rshift
        C edx   src
        C esi   computed jump
        C edi   dst
        C ebp
        C
        C mm0   scratch
        C mm1   \ carry (alternating, mm2 first)
        C mm2   /
        C mm6   lshift
        C mm7   rshift
        C
        C 10 code bytes/limb
        C
        C The two chunks differ in whether mm1 or mm2 hold the carry.
        C The computed jump puts the initial carry in both mm1 and mm2.
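        C
        C In C terms, with s[] and d[] viewed as 64-bit pieces of src and
        C dst and q running from high to low, each half-chunk amounts to
        C (illustrative sketch only)
        C
        C   d[q] = (s[q] << lshift) | (s[q-1] >> rshift);  /* rshift = 64-lshift */
        C
        C where s[q] is the qword carried over in mm1 or mm2 from the
        C previous (higher) iteration and s[q-1] is the fresh movq load.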
L(entry):
deflit(CHUNK_COUNT, 4)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
        deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
        deflit(`disp1', eval(disp0 - 8))

Zdisp(  movq,   disp0,(%edx), %mm0)
        psllq   %mm6, %mm2

        movq    %mm0, %mm1
        psrlq   %mm7, %mm0

        por     %mm2, %mm0
Zdisp(  movq,   %mm0, disp0,(%edi))


Zdisp(  movq,   disp1,(%edx), %mm0)
        psllq   %mm6, %mm1

        movq    %mm0, %mm2
        psrlq   %mm7, %mm0

        por     %mm1, %mm0
Zdisp(  movq,   %mm0, disp1,(%edi))
')

        subl    $UNROLL_BYTES, %edx
        subl    $UNROLL_BYTES, %edi
        decl    %ebx

        jns     L(top)
define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

L(end):
        testb   $1, %al
        movl    SAVE_EBX, %ebx
        psllq   %mm6, %mm2      C wanted left shifted in all cases below

        movd    %mm5, %eax

        movl    SAVE_ESI, %esi
        jz      L(end_even)
L(end_odd):

        C Size odd, destination was aligned.
        C
        C                 source        edx+8   edx+4
        C                 --+---------------+-------+
        C                   |      mm2      |       |
        C                 --+---------------+-------+
        C
        C dest                            edi
        C --+---------------+---------------+-------+
        C   |    written    |               |       |
        C --+---------------+---------------+-------+
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift


        C Size odd, destination was unaligned.
        C
        C                 source        edx+8   edx+4
        C                 --+---------------+-------+
        C                   |      mm2      |       |
        C                 --+---------------+-------+
        C
        C         dest                     edi
        C         --+---------------+---------------+
        C           |    written    |               |
        C         --+---------------+---------------+
        C
        C mm6 = shift+32
        C mm7 = ecx = 64-(shift+32)


        C In both cases there's one extra limb of src to fetch and combine
        C with mm2 to make a qword at (%edi), and in the aligned case
        C there's an extra limb of dst to be formed from that extra src limb
        C left shifted.
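        C
        C Concretely (illustrative note only): the qword stored at (%edi)
        C below is mm2 (already left shifted) combined with the extra src
        C limb placed in the high dword and shifted right by mm7, and in
        C the aligned case the movd also stores that extra src limb left
        C shifted by mm6 as one more dst limb just below it.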
        movd    disp(4) (%edx), %mm0
        testb   $32, %cl

        movq    %mm0, %mm1
        psllq   $32, %mm0

        psrlq   %mm7, %mm0
        psllq   %mm6, %mm1

        por     %mm2, %mm0

        movq    %mm0, disp(0) (%edi)
        jz      L(end_odd_unaligned)
        movd    %mm1, disp(-4) (%edi)
L(end_odd_unaligned):

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret

L(end_even):

        C Size even, destination was aligned.
        C
        C source          edx+8
        C --+---------------+
        C   |      mm2      |
        C --+---------------+
        C
        C dest            edi
        C --+---------------+---------------+
        C   |    written    |               |
        C --+---------------+---------------+
        C
        C mm6 = shift
        C mm7 = ecx = 64-shift


        C Size even, destination was unaligned.
        C
        C  source          edx+8
        C  --+---------------+
        C    |      mm2      |
        C  --+---------------+
        C
        C  dest         edi+4
        C  --+---------------+-------+
        C    |    written    |       |
        C  --+---------------+-------+
        C
        C  mm6 = shift+32
        C  mm7 = ecx = 64-(shift+32)


        C The movq for the aligned case overwrites the movd for the
        C unaligned case.
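        C
        C That is (illustrative note only): the movd below unconditionally
        C writes the high half of mm2 as the top dst limb, which is all the
        C unaligned case needs; in the aligned case a full qword is wanted,
        C so the movq then stores both halves over it rather than branching
        C around the movd.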
        movq    %mm2, %mm0
        psrlq   $32, %mm2

        testb   $32, %cl
        movd    %mm2, disp(4) (%edi)

        jz      L(end_even_unaligned)
        movq    %mm0, disp(0) (%edi)
L(end_even_unaligned):

        movl    SAVE_EDI, %edi
        addl    $SAVE_SIZE, %esp
        emms

        ret

EPILOGUE()