beta-0.89.2
[luatex.git] / source / libs / gmp / gmp-src / mpn / pa32 / hppa1_1 / submul_1.asm
bloba9b11d24a893c160e285037c2bc48dd8bbed1a64
1 dnl HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
2 dnl the result from a second limb vector.
4 dnl Copyright 1992-1994, 2000-2002 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
7 dnl
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of either:
10 dnl
11 dnl * the GNU Lesser General Public License as published by the Free
12 dnl Software Foundation; either version 3 of the License, or (at your
13 dnl option) any later version.
14 dnl
15 dnl or
16 dnl
17 dnl * the GNU General Public License as published by the Free Software
18 dnl Foundation; either version 2 of the License, or (at your option) any
19 dnl later version.
20 dnl
21 dnl or both in parallel, as here.
22 dnl
23 dnl The GNU MP Library is distributed in the hope that it will be useful, but
24 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
26 dnl for more details.
27 dnl
28 dnl You should have received copies of the GNU General Public License and the
29 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
30 dnl see https://www.gnu.org/licenses/.
32 include(`../config.m4')
34 C INPUT PARAMETERS
35 C res_ptr r26
36 C s1_ptr r25
37 C size r24
38 C s2_limb r23
40 C This runs at 12 cycles/limb on a PA7000. With the instructions used, it
41 C cannot become faster, due to data cache contention after a store. On the
42 C PA7100 it runs at 11 cycles/limb.
44 C There are some ideas described in mul_1.asm that apply to this code too.
46 C It seems possible to make this run as fast as mpn_addmul_1, if we use
47 C sub,>>= %r29,%r19,%r22
48 C addi 1,%r28,%r28
49 C but that requires reworking the hairy software pipeline...
51 ASM_START()
52 PROLOGUE(mpn_submul_1)
C Computes res_ptr[i] -= s1_ptr[i] * s2_limb for each of the size limbs, walking
C both vectors upward in 4-byte steps, and leaves the final borrow limb in %r28.
C NOTE(review): the first addib below only detects size == 1, so size >= 1 is
C assumed -- confirm against callers.
C
C Register roles in the pipelined path:
C   %fr4        s2_limb (the multiplier)
C   %r19        value subtracted from the current res limb
C   %r1         low word of the product most recently stored to the scratch slot
C   %r28        high word of the previous product, plus any carry folded in
C   %r29, %r22  res limb before and after the subtraction
C
C Products are moved from the FP registers to the integer registers through
C the 8-byte scratch slot at -16(%r30): high word at -16, low word at -12.
53 C .callinfo frame=64,no_calls
55 ldo 64(%r30),%r30 C open the 64-byte frame; -16(%r30) is the scratch slot
56 fldws,ma 4(%r25),%fr5 C fr5 = first s1 limb, then advance s1_ptr by 4
57 stw %r23,-16(%r30) C move s2_limb ...
58 addib,= -1,%r24,L(just_one_limb) C size--; take the short path if it hit zero
59 fldws -16(%r30),%fr4 C ... into fr4 (delay slot, so also done on the one-limb path)
60 add %r0,%r0,%r0 C clear carry
61 xmpyu %fr4,%fr5,%fr6 C fr6 = 64-bit product s2_limb * first s1 limb
62 fldws,ma 4(%r25),%fr7 C fr7 = second s1 limb, advance s1_ptr
63 fstds %fr6,-16(%r30) C spill first product to the scratch slot
64 xmpyu %fr4,%fr7,%fr8 C fr8 = second product
65 ldw -12(%r30),%r19 C least significant limb in product
66 ldw -16(%r30),%r28 C most significant limb of the same product
68 fstds %fr8,-16(%r30) C scratch slot now holds the second product
69 addib,= -1,%r24,L(end) C size--; exactly two limbs means no loop iterations
70 ldw -12(%r30),%r1 C (delay slot) low limb of the second product
72 C Main loop
73 LDEF(loop)
74 ldws 0(%r26),%r29 C r29 = current res limb
75 fldws,ma 4(%r25),%fr5 C fetch the next s1 limb, advance s1_ptr
76 sub %r29,%r19,%r22 C r22 = res limb - r19
77 add %r22,%r19,%r0 C result discarded; carry out equals the borrow of the sub above
78 stws,ma %r22,4(%r26) C store the result limb, advance res_ptr
79 addc %r28,%r1,%r19 C next subtrahend = previous high + next low + borrow
80 xmpyu %fr4,%fr5,%fr6 C start the product for the limb just fetched
81 ldw -16(%r30),%r28 C high word of the product still in the scratch slot
82 fstds %fr6,-16(%r30) C replace the scratch slot contents with the new product
83 addc %r0,%r28,%r28 C fold the carry of the addc above into the high word
84 addib,<> -1,%r24,L(loop) C size--; keep looping while nonzero
85 ldw -12(%r30),%r1 C (delay slot) low word of the new product
87 LDEF(end)
C Pipeline drain: two res limbs remain. %r19 is the pending subtrahend and the
C scratch slot holds the product for the last limb.
88 ldw 0(%r26),%r29 C r29 = second-to-last res limb
89 sub %r29,%r19,%r22 C r22 = res limb - r19
90 add %r22,%r19,%r0 C recover the borrow as a carry, result discarded
91 stws,ma %r22,4(%r26) C store it, advance res_ptr to the last limb
92 addc %r28,%r1,%r19 C last subtrahend = previous high + last low + borrow
93 ldw -16(%r30),%r28 C high word of the last product
94 ldws 0(%r26),%r29 C r29 = last res limb
95 addc %r0,%r28,%r28 C add the carry of the addc above to the high word
96 sub %r29,%r19,%r22 C r22 = last res limb - r19
97 add %r22,%r19,%r0 C recover the final borrow as a carry
98 stws,ma %r22,4(%r26) C store the last result limb
99 addc %r0,%r28,%r28 C r28 = high word + final borrow, the value left for the caller
100 bv 0(%r2) C return through %r2
101 ldo -64(%r30),%r30 C (delay slot) release the 64-byte frame
103 LDEF(just_one_limb)
C size == 1: no pipelining, a single multiply-and-subtract.
104 xmpyu %fr4,%fr5,%fr6 C fr6 = s2_limb * the only s1 limb
105 ldw 0(%r26),%r29 C r29 = the only res limb
106 fstds %fr6,-16(%r30) C spill the product to the scratch slot
107 ldw -12(%r30),%r1 C r1 = low word of the product
108 ldw -16(%r30),%r28 C r28 = high word of the product
109 sub %r29,%r1,%r22 C r22 = res limb - low word
110 add %r22,%r1,%r0 C recover the borrow as a carry, result discarded
111 stw %r22,0(%r26) C store the result limb (res_ptr not advanced)
112 addc %r0,%r28,%r28 C r28 = high word + borrow, the value left for the caller
113 bv 0(%r2) C return through %r2
114 ldo -64(%r30),%r30 C (delay slot) release the 64-byte frame
115 EPILOGUE()