beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / pa32 / hppa1_1 / mul_1.asm
blob6e60c2f61fec0ac96a8491408a471e08ab8fb63e
1 dnl HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector by a limb and store the
2 dnl result in a second limb vector.
4 dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C INPUT PARAMETERS
35 C res_ptr r26
36 C s1_ptr r25
37 C size r24
38 C s2_limb r23
40 C This runs at 9 cycles/limb on a PA7000. With the instructions used, it
41 C cannot become faster due to data cache contention after a store. On the
42 C PA7100 it runs at 7 cycles/limb.
44 C We could use fldds to read two limbs at a time from the S1 array, and that
45 C could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
46 C PA7100, respectively. We don't do that since it does not seem worth the
47 C (alignment) troubles...
49 C At least the PA7100 is rumored to be able to deal with cache misses without
50 C stalling instruction issue. If this is true, and the cache is actually also
51 C lockup-free, we should use a deeper software pipeline, and load from S1 very
52 C early! (The loads and stores to -16(sp)/-12(sp) will surely be in the cache.)
54 ASM_START()
C mpn_mul_1: multiply the size-limb vector at s1_ptr (r25) by s2_limb (r23)
C and store the resulting limbs, with carry propagated between them, at
C res_ptr (r26). The most significant limb (plus any final carry) is left in
C r28 -- presumably the function return value; confirm against the PA-RISC
C calling convention.
C
C The first source limb is loaded unconditionally and the count in r24 is
C decremented before it is tested, so size must be at least 1.
C
C Each multiply is done in the FP unit (xmpyu) and the 64-bit product is
C passed to the integer registers through the scratch doubleword at -16(sp):
C the high word is read back from -16(sp) and the low word from -12(sp).
C
C Register roles in the pipelined code:
C   fr4 = s2_limb
C   r19 = result limb waiting to be stored
C   r28 = high word of the older product
C   r1  = low word of the newer product
55 PROLOGUE(mpn_mul_1)
56 C .callinfo frame=64,no_calls
58 ldo 64(%r30),%r30 C sp += 64; the scratch area is addressed at negative offsets from sp
59 fldws,ma 4(%r25),%fr5 C fr5 = first s1 limb; s1_ptr += 4
60 stw %r23,-16(%r30) C move s2_limb ...
61 addib,= -1,%r24,L(just_one_limb) C size -= 1; branch if that leaves zero (size was 1)
62 fldws -16(%r30),%fr4 C ... into fr4 (delay slot, needed on both paths)
63 add %r0,%r0,%r0 C clear carry for the addc chain below
64 xmpyu %fr4,%fr5,%fr6 C fr6 = s2_limb * first limb (64-bit product 0)
65 fldws,ma 4(%r25),%fr7 C fr7 = second s1 limb; s1_ptr += 4
66 fstds %fr6,-16(%r30) C spill product 0 to the scratch doubleword
67 xmpyu %fr4,%fr7,%fr8 C fr8 = s2_limb * second limb (product 1)
68 ldw -12(%r30),%r19 C least significant limb in product
69 ldw -16(%r30),%r28 C most significant limb of product 0
71 fstds %fr8,-16(%r30) C spill product 1 over product 0 (both halves already read)
72 addib,= -1,%r24,L(end) C size -= 1; skip the loop if there were only two limbs
73 ldw -12(%r30),%r1 C r1 = low limb of product 1 (delay slot)
75 C Main loop
76 LDEF(loop)
77 fldws,ma 4(%r25),%fr5 C fr5 = next s1 limb; s1_ptr += 4
78 stws,ma %r19,4(%r26) C store the finished result limb; res_ptr += 4
79 addc %r28,%r1,%r19 C r19 = older high limb + newer low limb + carry
80 xmpyu %fr4,%fr5,%fr6 C fr6 = s2_limb * next limb
81 ldw -16(%r30),%r28 C r28 = high limb of the product in scratch, read before it is overwritten
82 fstds %fr6,-16(%r30) C spill the new product
83 addib,<> -1,%r24,L(loop) C size -= 1; loop while limbs remain (the carry from addc must survive this)
84 ldw -12(%r30),%r1 C r1 = low limb of the new product (delay slot)
86 LDEF(end)
87 stws,ma %r19,4(%r26) C store the pending result limb; res_ptr += 4
88 addc %r28,%r1,%r19 C r19 = older high limb + last low limb + carry
89 ldw -16(%r30),%r28 C r28 = high limb of the last product
90 stws,ma %r19,4(%r26) C store the last result limb
91 addc %r0,%r28,%r28 C r28 = last high limb + carry
92 bv 0(%r2) C return through r2
93 ldo -64(%r30),%r30 C sp -= 64, undoing the entry adjustment (delay slot)
95 LDEF(just_one_limb)
96 xmpyu %fr4,%fr5,%fr6 C fr6 = s2_limb * the only s1 limb
97 fstds %fr6,-16(%r30) C spill the product to the scratch doubleword
98 ldw -16(%r30),%r28 C r28 = high limb of the product
99 ldo -64(%r30),%r30 C sp -= 64, undoing the entry adjustment
100 bv 0(%r2) C return through r2
101 fstws %fr6R,0(%r26) C store the right (low) word of the product to res_ptr[0] (delay slot)
102 EPILOGUE()