/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * If compiled without -DRF_INLINE_MACROS then needs -lm at link time
 * If compiled with -DRF_INLINE_MACROS then needs conv.il at compile time
 * (i.e. cc <compiler_flags> -DRF_INLINE_MACROS conv.il mont_mulf.c )
 */

#include <sys/types.h>
#include <math.h>
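
/*
 * These routines operate on multi-precision integers kept in floating-point
 * form.  A number is stored either as one double per 32-bit word ("d32"
 * form) or as two doubles per word holding its low and high 16 bits ("d16"
 * form).  A product of a 16-bit digit with a 32-bit word fits exactly in
 * the 53-bit mantissa of a double, and cleanup() periodically renormalizes
 * the accumulators so that sums of such products also stay exact.
 */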
static const double TwoTo16 = 65536.0;
static const double TwoToMinus16 = 1.0 / 65536.0;
static const double Zero = 0.0;
static const double TwoTo32 = 65536.0 * 65536.0;
static const double TwoToMinus32 = 1.0 / (65536.0 * 65536.0);

#ifdef RF_INLINE_MACROS

double upper32(double);
double lower32(double, double);
double mod(double, double, double);

#else

static double
upper32(double x)
{
	return (floor(x * TwoToMinus32));
}

static double
lower32(double x, double y)
{
	/* second parameter exists only for the conv.il inline version */
	return (x - TwoTo32 * floor(x * TwoToMinus32));
}

static double
mod(double x, double oneoverm, double m)
{
	return (x - m * floor(x * oneoverm));
}

#endif
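
/*
 * cleanup() renormalizes a stretch of the temporary accumulator array dt
 * (kept in d16 layout): each entry is reduced to its low 32 bits and the
 * high part is carried into the entry two positions later (even and odd
 * indices form two independent digit streams).  This keeps the accumulated
 * values small enough for exact double arithmetic.
 */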
static void
cleanup(double *dt, int from, int tlen)
{
	int i;
	double tmp, tmp1, x, x1;

	tmp = tmp1 = Zero;

	for (i = 2 * from; i < 2 * tlen; i += 2) {
		x = dt[i];
		x1 = dt[i + 1];
		dt[i] = lower32(x, Zero) + tmp;
		dt[i + 1] = lower32(x1, Zero) + tmp1;
		tmp = upper32(x);
		tmp1 = upper32(x1);
	}
}
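
/*
 * conv_d16_to_i32() converts ilen words from the d16 representation (two
 * doubles per 32-bit word, each of which may have grown beyond 16 bits
 * during accumulation) into an array of uint32_t words, propagating
 * carries along the way.
 */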
static void
conv_d16_to_i32(uint32_t *i32, double *d16, int64_t *tmp, int ilen)
{
	int i;
	int64_t t, t1,		/* Using int64_t and not uint64_t */
	    a, b, c, d;		/* because more efficient code is */
				/* generated this way, and there */
				/* is no overflow. */
	t1 = 0;
	a = (int64_t)d16[0];
	b = (int64_t)d16[1];

	for (i = 0; i < ilen - 1; i++) {
		c = (int64_t)d16[2 * i + 2];
		t1 += a & 0xffffffff;
		t = (a >> 32);
		d = (int64_t)d16[2 * i + 3];
		t1 += (b & 0xffff) << 16;
		t += (b >> 16) + (t1 >> 32);
		i32[i] = t1 & 0xffffffff;
		t1 = t;
		a = c;
		b = d;
	}
	t1 += a & 0xffffffff;
	t1 += (b & 0xffff) << 16;
	i32[i] = t1 & 0xffffffff;
}
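
/*
 * conv_i32_to_d32() converts len 32-bit words into the d32 form:
 * one double per word.
 */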
static void
conv_i32_to_d32(double *d32, uint32_t *i32, int len)
{
	int i;

	for (i = 0; i < len; i++)
		d32[i] = (double)(i32[i]);
}
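
/*
 * conv_i32_to_d16() converts len 32-bit words into the d16 form:
 * two doubles per word, holding its low and high 16 bits.
 */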
static void
conv_i32_to_d16(double *d16, uint32_t *i32, int len)
{
	int i;
	uint32_t a;

	for (i = 0; i < len; i++) {
		a = i32[i];
		d16[2 * i] = (double)(a & 0xffff);
		d16[2 * i + 1] = (double)(a >> 16);
	}
}
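
/*
 * i16_to_d16_and_d32x4() converts four consecutive 32-bit words into both
 * the d16 and the d32 forms in one call.  With RF_INLINE_MACROS defined its
 * implementation is expected to come from the conv.il inline templates, and
 * the first three arguments pass those templates the constants 1/(2^16),
 * 2^16 and 0; otherwise the portable C version below is used and the three
 * constants are ignored.
 */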
#ifdef RF_INLINE_MACROS

void
i16_to_d16_and_d32x4(const double *,	/* 1/(2^16) */
	const double *,			/* 2^16 */
	const double *,			/* 0 */
	double *,			/* result16 */
	double *,			/* result32 */
	float *);			/* source - should be unsigned int* */
					/* converted to float* */

#else

static void
i16_to_d16_and_d32x4(const double *dummy1,	/* 1/(2^16) */
	const double *dummy2,			/* 2^16 */
	const double *dummy3,			/* 0 */
	double *result16, double *result32,
	float *src)			/* source - should be unsigned int* */
					/* converted to float* */
{
	uint32_t *i32;
	uint32_t a, b, c, d;

	i32 = (uint32_t *)src;
	a = i32[0];
	b = i32[1];
	c = i32[2];
	d = i32[3];
	result16[0] = (double)(a & 0xffff);
	result16[1] = (double)(a >> 16);
	result32[0] = (double)a;
	result16[2] = (double)(b & 0xffff);
	result16[3] = (double)(b >> 16);
	result32[1] = (double)b;
	result16[4] = (double)(c & 0xffff);
	result16[5] = (double)(c >> 16);
	result32[2] = (double)c;
	result16[6] = (double)(d & 0xffff);
	result16[7] = (double)(d >> 16);
	result32[3] = (double)d;
}

#endif
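
/*
 * conv_i32_to_d32_and_d16() converts len 32-bit words into both the d32
 * and the d16 forms, handling four words per iteration and finishing any
 * remainder with a scalar loop.
 */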
static void
conv_i32_to_d32_and_d16(double *d32, double *d16,
	uint32_t *i32, int len)
{
	int i;
	uint32_t a;

	for (i = 0; i < len - 3; i += 4) {
		i16_to_d16_and_d32x4(&TwoToMinus16, &TwoTo16, &Zero,
		    &(d16[2 * i]), &(d32[i]), (float *)(&(i32[i])));
	}
	for (; i < len; i++) {
		a = i32[i];
		d32[i] = (double)(i32[i]);
		d16[2 * i] = (double)(a & 0xffff);
		d16[2 * i + 1] = (double)(a >> 16);
	}
}
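
/*
 * adjust_montf_result() performs the final Montgomery correction: if the
 * (len + 1)-word value in i32 is not smaller than the modulus in nint, the
 * modulus is subtracted once (over the low len words) so that the result
 * is fully reduced.
 */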
static void
adjust_montf_result(uint32_t *i32, uint32_t *nint, int len)
{
	int64_t acc;
	int i;

	if (i32[len] > 0) {
		i = -1;
	} else {
		for (i = len - 1; i >= 0; i--) {
			if (i32[i] != nint[i])
				break;
		}
	}
	if ((i < 0) || (i32[i] > nint[i])) {
		acc = 0;
		for (i = 0; i < len; i++) {
			acc = acc + (uint64_t)(i32[i]) - (uint64_t)(nint[i]);
			i32[i] = acc & 0xffffffff;
			acc = acc >> 32;
		}
	}
}

/*
 * the lengths of the input arrays should be at least the following:
 * result[nlen+1], dm1[nlen], dm2[2*nlen+1], dt[4*nlen+2], dn[nlen], nint[nlen]
 * all of them should be different from one another
 */
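
/*
 * mont_mulf_noconv() computes a Montgomery product: with dm1 and dn given
 * in d32 form, dm2 in d16 form, and nint holding the modulus n as 32-bit
 * words, it accumulates dm1 * dm2 into dt while folding in multiples of n
 * chosen via dn0 (the low 16 bits of -1/n, as a double), so that the value
 * stored in result is, after the final correction in adjust_montf_result(),
 * dm1 * dm2 * 2^(-32*nlen) mod n.
 */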
void mont_mulf_noconv(uint32_t *result,
	double *dm1, double *dm2, double *dt,
	double *dn, uint32_t *nint,
	int nlen, double dn0)
{
	int i, j, jj;
	double digit, m2j, a, b;
	double *pdm1, *pdm2, *pdn, *pdtj, pdn_0, pdm1_0;

	pdm1 = &(dm1[0]);
	pdm2 = &(dm2[0]);
	pdn = &(dn[0]);
	pdm2[2 * nlen] = Zero;
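
	/*
	 * Generic path for arbitrary nlen; a fully unrolled version of the
	 * same loop handles the common nlen == 16 (512-bit operand) case
	 * in the else branch below.
	 */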
	if (nlen != 16) {
		for (i = 0; i < 4 * nlen + 2; i++)
			dt[i] = Zero;

		a = dt[0] = pdm1[0] * pdm2[0];
		digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16);

		pdtj = &(dt[0]);
		for (j = jj = 0; j < 2 * nlen; j++, jj++, pdtj++) {
			m2j = pdm2[j];
			a = pdtj[0] + pdn[0] * digit;
			b = pdtj[1] + pdm1[0] * pdm2[j + 1] + a * TwoToMinus16;
			pdtj[1] = b;

			for (i = 1; i < nlen; i++) {
				pdtj[2 * i] += pdm1[i] * m2j + pdn[i] * digit;
			}
			if (jj == 30) {
				cleanup(dt, j / 2 + 1, 2 * nlen + 1);
				jj = 0;
			}

			digit = mod(lower32(b, Zero) * dn0,
			    TwoToMinus16, TwoTo16);
		}
	} else {
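		/*
		 * Fully unrolled variant of the loop above for nlen == 16:
		 * 4 * nlen + 2 = 66 accumulator slots, 2 * nlen = 32 outer
		 * iterations, and the inner i-loop expanded in place.
		 */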
		a = dt[0] = pdm1[0] * pdm2[0];

		dt[65] = dt[64] = dt[63] = dt[62] = dt[61] = dt[60] =
		    dt[59] = dt[58] = dt[57] = dt[56] = dt[55] =
		    dt[54] = dt[53] = dt[52] = dt[51] = dt[50] =
		    dt[49] = dt[48] = dt[47] = dt[46] = dt[45] =
		    dt[44] = dt[43] = dt[42] = dt[41] = dt[40] =
		    dt[39] = dt[38] = dt[37] = dt[36] = dt[35] =
		    dt[34] = dt[33] = dt[32] = dt[31] = dt[30] =
		    dt[29] = dt[28] = dt[27] = dt[26] = dt[25] =
		    dt[24] = dt[23] = dt[22] = dt[21] = dt[20] =
		    dt[19] = dt[18] = dt[17] = dt[16] = dt[15] =
		    dt[14] = dt[13] = dt[12] = dt[11] = dt[10] =
		    dt[9] = dt[8] = dt[7] = dt[6] = dt[5] = dt[4] =
		    dt[3] = dt[2] = dt[1] = Zero;

		pdn_0 = pdn[0];
		pdm1_0 = pdm1[0];

		digit = mod(lower32(a, Zero) * dn0, TwoToMinus16, TwoTo16);
		pdtj = &(dt[0]);

		for (j = 0; j < 32; j++, pdtj++) {
			m2j = pdm2[j];
			a = pdtj[0] + pdn_0 * digit;
			b = pdtj[1] + pdm1_0 * pdm2[j + 1] + a * TwoToMinus16;
			pdtj[1] = b;

			pdtj[2] += pdm1[1] * m2j + pdn[1] * digit;
			pdtj[4] += pdm1[2] * m2j + pdn[2] * digit;
			pdtj[6] += pdm1[3] * m2j + pdn[3] * digit;
			pdtj[8] += pdm1[4] * m2j + pdn[4] * digit;
			pdtj[10] += pdm1[5] * m2j + pdn[5] * digit;
			pdtj[12] += pdm1[6] * m2j + pdn[6] * digit;
			pdtj[14] += pdm1[7] * m2j + pdn[7] * digit;
			pdtj[16] += pdm1[8] * m2j + pdn[8] * digit;
			pdtj[18] += pdm1[9] * m2j + pdn[9] * digit;
			pdtj[20] += pdm1[10] * m2j + pdn[10] * digit;
			pdtj[22] += pdm1[11] * m2j + pdn[11] * digit;
			pdtj[24] += pdm1[12] * m2j + pdn[12] * digit;
			pdtj[26] += pdm1[13] * m2j + pdn[13] * digit;
			pdtj[28] += pdm1[14] * m2j + pdn[14] * digit;
			pdtj[30] += pdm1[15] * m2j + pdn[15] * digit;

			/* no need for cleanup, cannot overflow */
			digit = mod(lower32(b, Zero) * dn0,
			    TwoToMinus16, TwoTo16);
		}
	}
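
	/*
	 * Convert the d16 accumulator back into 32-bit words and subtract
	 * the modulus once if the result is not yet fully reduced.
	 */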
	conv_d16_to_i32(result, dt + 2 * nlen, (int64_t *)dt, nlen + 1);
	adjust_montf_result(result, nint, nlen);
}