/* mpn_toom3_sqr -- Square {ap,an}.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
/*
  Things to work on:

  1. Trim allocation.  The allocations for as1 and asm1 could be
     avoided by instead reusing the pp area and the scratch area.
  2. Use new toom functions for the recursive calls.
*/
#include "gmp.h"
#include "gmp-impl.h"
/* Evaluate in: -1, 0, +1, +2, +inf

  <-s--><--n--><--n-->
   ____ ______ ______
  |_a2_|___a1_|___a0_|

  v0  =  a0         ^2  #   A(0)^2
  v1  = (a0+ a1+ a2)^2  #   A(1)^2    ah  <= 2
  vm1 = (a0- a1+ a2)^2  #  A(-1)^2   |ah| <= 1
  v2  = (a0+2a1+4a2)^2  #   A(2)^2    ah  <= 6
  vinf=          a2 ^2  # A(inf)^2
*/
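/* Toy example in polynomial form: for A(x) = 3x^2 + 2x + 1 the point
   values square to v0 = 1, v1 = 36, vm1 = 4, v2 = 289, vinf = 9, and
   interpolation recovers A(x)^2 = 9x^4 + 12x^3 + 10x^2 + 4x + 1 (check:
   C(0) = 1, C(1) = 36, C(-1) = 4, C(2) = 289, leading coefficient 9).
   In the code, x stands for 2^(n*GMP_NUMB_BITS).  */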
#if TUNE_PROGRAM_BUILD
#define MAYBE_sqr_basecase 1
#define MAYBE_sqr_toom3 1
#else
#define MAYBE_sqr_basecase						\
  (SQR_TOOM3_THRESHOLD < 3 * SQR_KARATSUBA_THRESHOLD)
#define MAYBE_sqr_toom3							\
  (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD)
#endif
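/* The MAYBE_* flags are compile-time constants saying whether a branch of
   the dispatch macro below can ever be taken with the configured
   thresholds; tests that cannot fire then fold away to nothing.  */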
#define TOOM3_SQR_N_REC(p, a, n, ws)					\
  do {									\
    if (MAYBE_sqr_basecase						\
	&& BELOW_THRESHOLD (n, SQR_KARATSUBA_THRESHOLD))		\
      mpn_sqr_basecase (p, a, n);					\
    else if (! MAYBE_sqr_toom3						\
	     || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))		\
      mpn_kara_sqr_n (p, a, n, ws);					\
    else								\
      mpn_toom3_sqr_n (p, a, n, ws);					\
  } while (0)
void
mpn_toom3_sqr (mp_ptr pp,
	       mp_srcptr ap, mp_size_t an,
	       mp_ptr scratch)
{
  mp_size_t n, s;
  mp_limb_t cy, vinf0;
  mp_ptr gp;
  mp_ptr as1, asm1, as2;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)

  n = (an + 2) / (size_t) 3;
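  /* n = ceil(an/3); the (size_t) cast makes the division unsigned, which
     compiles to cheaper code than a signed division by 3.  */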
  s = an - 2 * n;

  ASSERT (0 < s && s <= n);

  TMP_MARK;

  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);

  gp = pp;
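  /* Nothing has been written to the product area yet, so its low limbs
     can serve as temporary space for the evaluation sums.  */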
  /* Compute as1 and asm1.  */
  cy = mpn_add (gp, a0, n, a2, s);
#if HAVE_NATIVE_mpn_addsub_n
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      cy = mpn_addsub_n (as1, asm1, a1, gp, n);
      as1[n] = 0;
      asm1[n] = 0;
    }
  else
    {
      mp_limb_t cy2;
      cy2 = mpn_addsub_n (as1, asm1, gp, a1, n);
      as1[n] = cy + (cy2 >> 1);
      asm1[n] = cy - (cy2 & 1);
    }
#else
  as1[n] = cy + mpn_add_n (as1, gp, a1, n);
  if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
    {
      mpn_sub_n (asm1, a1, gp, n);
      asm1[n] = 0;
    }
  else
    {
      cy -= mpn_sub_n (asm1, gp, a1, n);
      asm1[n] = cy;
    }
#endif
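  /* Since we square, the sign of A(-1) is irrelevant, so asm1 only needs
     to hold the absolute value |a0 - a1 + a2|.  */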
  /* Compute as2.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (as2, a1, a2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy = mpn_lshift (as2, a2, s, 1);
  cy += mpn_add_n (as2, a1, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a0, as2, n);
#endif
  as2[n] = cy;
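  /* as2 = a0 + 2*a1 + 4*a2, evaluated Horner style as 2*(2*a2 + a1) + a0,
     so only shift-and-add (or native addlsh1) steps are needed.  */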
  ASSERT (as1[n] <= 2);
  ASSERT (asm1[n] <= 1);
#define v0    pp				/* 2n */
#define v1    (pp + 2 * n)			/* 2n+1 */
#define vinf  (pp + 4 * n)			/* s+s */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 1)		/* 2n+2 */
#define scratch_out  (scratch + 4 * n + 4)
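/* v0, v1 and vinf live in the product area pp; vm1 and v2 live in the
   caller-supplied scratch, followed at scratch_out by work space for the
   recursive calls.  The scratch area before scratch_out is 4*n + 4 limbs.  */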
  /* vm1, 2n+1 limbs */
#ifdef SMALLER_RECURSION
  TOOM3_SQR_N_REC (vm1, asm1, n, scratch_out);
  cy = 0;
  if (asm1[n] != 0)
    cy = asm1[n] + mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  if (asm1[n] != 0)
    cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
  vm1[2 * n] = cy;
#else
  TOOM3_SQR_N_REC (vm1, asm1, n + 1, scratch_out);
#endif
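  /* With SMALLER_RECURSION only n limbs are squared recursively and the
     cross terms involving the (at most 1) top limb asm1[n] are patched in
     by hand: asm1 is added twice at position n and asm1[n]^2 lands in
     vm1[2n].  Otherwise the full n+1 limbs are squared directly.  */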
  TOOM3_SQR_N_REC (v2, as2, n + 1, scratch_out);	/* v2, 2n+1 limbs */

  TOOM3_SQR_N_REC (vinf, a2, s, scratch_out);		/* vinf, s+s limbs */

  vinf0 = vinf[0];				/* v1 overlaps with this */
#ifdef SMALLER_RECURSION
  /* v1, 2n+1 limbs */
  TOOM3_SQR_N_REC (v1, as1, n, scratch_out);
  if (as1[n] == 1)
    {
      cy = as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * as1[n] + mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy = 2 * as1[n] + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  else
    cy = 0;
  if (as1[n] == 1)
    {
      cy += mpn_add_n (v1 + n, v1 + n, as1, n);
    }
  else if (as1[n] != 0)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
#else
      cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
#endif
    }
  v1[2 * n] = cy;
#else
  cy = vinf[1];
  TOOM3_SQR_N_REC (v1, as1, n + 1, scratch_out);
  vinf[1] = cy;
#endif
  TOOM3_SQR_N_REC (v0, ap, n, scratch_out);	/* v0, 2n limbs */
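  /* Combine the five point values into the an + an limb square; the
     result ends up in pp, with v2, vm1 and scratch_out used as work
     space by the interpolation.  */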
  mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 1, vinf0, scratch_out);
  TMP_FREE;
}
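/* Usage sketch (hypothetical sizes; this function is internal and is
   normally reached through the public multiplication interfaces):

     mp_limb_t a[12];                   // number to square, an = 12 limbs
     mp_limb_t p[24];                   // product area, 2*an limbs
     mp_limb_t ws[200];                 // scratch, generously oversized
     ...fill a{0..11}...
     mpn_toom3_sqr (p, a, 12, ws);      // p{0..23} = a^2

   The scratch must cover the 4*n + 4 limbs laid out above plus whatever
   the recursive TOOM3_SQR_N_REC calls consume.  */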