@ vim: set tabstop=8 softtabstop=8 shiftwidth=8 noexpandtab syntax=armasm:
2 /**********************************************************************
3 * Copyright (c) 2014 Wladimir J. van der Laan *
4 * Distributed under the MIT software license, see the accompanying *
5 * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
6 **********************************************************************/
/* ARM implementation of field_10x26 inner loops.

Note:

- To avoid unnecessary loads and make use of available registers, two
  'passes' have every time been interleaved, with the odd passes accumulating c' and d'
  which will be added to c and d respectively in the even passes

*/
20 @ eabi attributes
- see readelf
-A
21 .eabi_attribute 8, 1 @ Tag_ARM_ISA_use = yes
22 .eabi_attribute 9, 0 @ Tag_Thumb_ISA_use = no
23 .eabi_attribute 10, 0 @ Tag_FP_arch = none
24 .eabi_attribute 24, 1 @ Tag_ABI_align_needed = 8-byte
25 .eabi_attribute 25, 1 @ Tag_ABI_align_preserved = 8-byte, except leaf SP
26 .eabi_attribute 30, 2 @ Tag_ABI_optimization_goals = Aggressive Speed
27 .eabi_attribute 34, 1 @ Tag_CPU_unaligned_access = v6
33 .set field_not_M, 0xfc000000 @ ~M = ~0x3ffffff
36 .global secp256k1_fe_mul_inner
37 .type secp256k1_fe_mul_inner, %function
39 @
r0 r Restrict
: can overlap with
a, not with
b
42 @ Stack
(total
4+10*4 = 44)
43 @ sp
+ #0 saved 'r' pointer
44 @ sp
+ #4 + 4*X t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
45 secp256k1_fe_mul_inner
:
46 stmfd sp
!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
47 sub sp
, sp
, #48 @ frame=44 + alignment
48 str
r0, [sp
, #0] @ save result address, we need it only at the end
50 /******************************************
51 * Main computation code.
52 ******************************************
63 Note: do not write to r[] here, it may overlap with a[]
66 /* A - interleaved with B */
67 ldr
r7, [r1, #0*4] @ a[0]
68 ldr
r8, [r2, #9*4] @ b[9]
69 ldr
r0, [r1, #1*4] @ a[1]
70 umull
r5, r6, r7, r8 @ d
= a[0] * b[9]
71 ldr
r14, [r2, #8*4] @ b[8]
72 umull
r9, r10, r0, r8 @ d
' = a[1] * b[9]
73 ldr r7, [r1, #2*4] @ a[2]
74 umlal r5, r6, r0, r14 @ d += a[1] * b[8]
75 ldr r8, [r2, #7*4] @ b[7]
76 umlal r9, r10, r7, r14 @ d' += a[2] * b[8]
77 ldr
r0, [r1, #3*4] @ a[3]
78 umlal
r5, r6, r7, r8 @ d
+= a[2] * b[7]
79 ldr
r14, [r2, #6*4] @ b[6]
80 umlal
r9, r10, r0, r8 @ d
' += a[3] * b[7]
81 ldr r7, [r1, #4*4] @ a[4]
82 umlal r5, r6, r0, r14 @ d += a[3] * b[6]
83 ldr r8, [r2, #5*4] @ b[5]
84 umlal r9, r10, r7, r14 @ d' += a[4] * b[6]
85 ldr
r0, [r1, #5*4] @ a[5]
86 umlal
r5, r6, r7, r8 @ d
+= a[4] * b[5]
87 ldr
r14, [r2, #4*4] @ b[4]
88 umlal
r9, r10, r0, r8 @ d
' += a[5] * b[5]
89 ldr r7, [r1, #6*4] @ a[6]
90 umlal r5, r6, r0, r14 @ d += a[5] * b[4]
91 ldr r8, [r2, #3*4] @ b[3]
92 umlal r9, r10, r7, r14 @ d' += a[6] * b[4]
93 ldr
r0, [r1, #7*4] @ a[7]
94 umlal
r5, r6, r7, r8 @ d
+= a[6] * b[3]
95 ldr
r14, [r2, #2*4] @ b[2]
96 umlal
r9, r10, r0, r8 @ d
' += a[7] * b[3]
97 ldr r7, [r1, #8*4] @ a[8]
98 umlal r5, r6, r0, r14 @ d += a[7] * b[2]
99 ldr r8, [r2, #1*4] @ b[1]
100 umlal r9, r10, r7, r14 @ d' += a[8] * b[2]
101 ldr
r0, [r1, #9*4] @ a[9]
102 umlal
r5, r6, r7, r8 @ d
+= a[8] * b[1]
103 ldr
r14, [r2, #0*4] @ b[0]
104 umlal
r9, r10, r0, r8 @ d
' += a[9] * b[1]
105 ldr r7, [r1, #0*4] @ a[0]
106 umlal r5, r6, r0, r14 @ d += a[9] * b[0]
109 bic r0, r5, field_not_M @ t9 = d & M
110 str r0, [sp, #4 + 4*9]
111 mov r5, r5, lsr #26 @ d >>= 26
112 orr r5, r5, r6, asl #6
116 umull r3, r4, r7, r14 @ c = a[0] * b[0]
117 adds r5, r5, r9 @ d += d'
120 bic
r0, r5, field_not_M @ u0
= d
& M
121 mov
r5, r5, lsr
#26 @ d >>= 26
122 orr
r5, r5, r6, asl
#6
124 movw
r14, field_R0 @ c
+= u0
* R0
125 umlal
r3, r4, r0, r14
127 bic
r14, r3, field_not_M @ t0
= c
& M
128 str
r14, [sp
, #4 + 0*4]
129 mov
r3, r3, lsr
#26 @ c >>= 26
130 orr
r3, r3, r4, asl
#6
132 mov
r14, field_R1 @ c
+= u0
* R1
133 umlal
r3, r4, r0, r14
135 /* C - interleaved with D */
136 ldr
r7, [r1, #0*4] @ a[0]
137 ldr
r8, [r2, #2*4] @ b[2]
138 ldr
r14, [r2, #1*4] @ b[1]
139 umull
r11, r12, r7, r8 @ c
' = a[0] * b[2]
140 ldr r0, [r1, #1*4] @ a[1]
141 umlal r3, r4, r7, r14 @ c += a[0] * b[1]
142 ldr r8, [r2, #0*4] @ b[0]
143 umlal r11, r12, r0, r14 @ c' += a[1] * b[1]
144 ldr
r7, [r1, #2*4] @ a[2]
145 umlal
r3, r4, r0, r8 @ c
+= a[1] * b[0]
146 ldr
r14, [r2, #9*4] @ b[9]
147 umlal
r11, r12, r7, r8 @ c
' += a[2] * b[0]
148 ldr r0, [r1, #3*4] @ a[3]
149 umlal r5, r6, r7, r14 @ d += a[2] * b[9]
150 ldr r8, [r2, #8*4] @ b[8]
151 umull r9, r10, r0, r14 @ d' = a[3] * b[9]
152 ldr
r7, [r1, #4*4] @ a[4]
153 umlal
r5, r6, r0, r8 @ d
+= a[3] * b[8]
154 ldr
r14, [r2, #7*4] @ b[7]
155 umlal
r9, r10, r7, r8 @ d
' += a[4] * b[8]
156 ldr r0, [r1, #5*4] @ a[5]
157 umlal r5, r6, r7, r14 @ d += a[4] * b[7]
158 ldr r8, [r2, #6*4] @ b[6]
159 umlal r9, r10, r0, r14 @ d' += a[5] * b[7]
160 ldr
r7, [r1, #6*4] @ a[6]
161 umlal
r5, r6, r0, r8 @ d
+= a[5] * b[6]
162 ldr
r14, [r2, #5*4] @ b[5]
163 umlal
r9, r10, r7, r8 @ d
' += a[6] * b[6]
164 ldr r0, [r1, #7*4] @ a[7]
165 umlal r5, r6, r7, r14 @ d += a[6] * b[5]
166 ldr r8, [r2, #4*4] @ b[4]
167 umlal r9, r10, r0, r14 @ d' += a[7] * b[5]
168 ldr
r7, [r1, #8*4] @ a[8]
169 umlal
r5, r6, r0, r8 @ d
+= a[7] * b[4]
170 ldr
r14, [r2, #3*4] @ b[3]
171 umlal
r9, r10, r7, r8 @ d
' += a[8] * b[4]
172 ldr r0, [r1, #9*4] @ a[9]
173 umlal r5, r6, r7, r14 @ d += a[8] * b[3]
174 ldr r8, [r2, #2*4] @ b[2]
175 umlal r9, r10, r0, r14 @ d' += a[9] * b[3]
176 umlal
r5, r6, r0, r8 @ d
+= a[9] * b[2]
178 bic
r0, r5, field_not_M @ u1
= d
& M
179 mov
r5, r5, lsr
#26 @ d >>= 26
180 orr
r5, r5, r6, asl
#6
182 movw
r14, field_R0 @ c
+= u1
* R0
183 umlal
r3, r4, r0, r14
185 bic
r14, r3, field_not_M @ t1
= c
& M
186 str
r14, [sp
, #4 + 1*4]
187 mov
r3, r3, lsr
#26 @ c >>= 26
188 orr
r3, r3, r4, asl
#6
190 mov
r14, field_R1 @ c
+= u1
* R1
191 umlal
r3, r4, r0, r14
194 adds
r3, r3, r11 @ c
+= c
'
196 adds r5, r5, r9 @ d += d'
199 bic
r0, r5, field_not_M @ u2
= d
& M
200 mov
r5, r5, lsr
#26 @ d >>= 26
201 orr
r5, r5, r6, asl
#6
203 movw
r14, field_R0 @ c
+= u2
* R0
204 umlal
r3, r4, r0, r14
206 bic
r14, r3, field_not_M @ t2
= c
& M
207 str
r14, [sp
, #4 + 2*4]
208 mov
r3, r3, lsr
#26 @ c >>= 26
209 orr
r3, r3, r4, asl
#6
211 mov
r14, field_R1 @ c
+= u2
* R1
212 umlal
r3, r4, r0, r14
214 /* E - interleaved with F */
215 ldr
r7, [r1, #0*4] @ a[0]
216 ldr
r8, [r2, #4*4] @ b[4]
217 umull
r11, r12, r7, r8 @ c
' = a[0] * b[4]
218 ldr r8, [r2, #3*4] @ b[3]
219 umlal r3, r4, r7, r8 @ c += a[0] * b[3]
220 ldr r7, [r1, #1*4] @ a[1]
221 umlal r11, r12, r7, r8 @ c' += a[1] * b[3]
222 ldr
r8, [r2, #2*4] @ b[2]
223 umlal
r3, r4, r7, r8 @ c
+= a[1] * b[2]
224 ldr
r7, [r1, #2*4] @ a[2]
225 umlal
r11, r12, r7, r8 @ c
' += a[2] * b[2]
226 ldr r8, [r2, #1*4] @ b[1]
227 umlal r3, r4, r7, r8 @ c += a[2] * b[1]
228 ldr r7, [r1, #3*4] @ a[3]
229 umlal r11, r12, r7, r8 @ c' += a[3] * b[1]
230 ldr
r8, [r2, #0*4] @ b[0]
231 umlal
r3, r4, r7, r8 @ c
+= a[3] * b[0]
232 ldr
r7, [r1, #4*4] @ a[4]
233 umlal
r11, r12, r7, r8 @ c
' += a[4] * b[0]
234 ldr r8, [r2, #9*4] @ b[9]
235 umlal r5, r6, r7, r8 @ d += a[4] * b[9]
236 ldr r7, [r1, #5*4] @ a[5]
237 umull r9, r10, r7, r8 @ d' = a[5] * b[9]
238 ldr
r8, [r2, #8*4] @ b[8]
239 umlal
r5, r6, r7, r8 @ d
+= a[5] * b[8]
240 ldr
r7, [r1, #6*4] @ a[6]
241 umlal
r9, r10, r7, r8 @ d
' += a[6] * b[8]
242 ldr r8, [r2, #7*4] @ b[7]
243 umlal r5, r6, r7, r8 @ d += a[6] * b[7]
244 ldr r7, [r1, #7*4] @ a[7]
245 umlal r9, r10, r7, r8 @ d' += a[7] * b[7]
246 ldr
r8, [r2, #6*4] @ b[6]
247 umlal
r5, r6, r7, r8 @ d
+= a[7] * b[6]
248 ldr
r7, [r1, #8*4] @ a[8]
249 umlal
r9, r10, r7, r8 @ d
' += a[8] * b[6]
250 ldr r8, [r2, #5*4] @ b[5]
251 umlal r5, r6, r7, r8 @ d += a[8] * b[5]
252 ldr r7, [r1, #9*4] @ a[9]
253 umlal r9, r10, r7, r8 @ d' += a[9] * b[5]
254 ldr
r8, [r2, #4*4] @ b[4]
255 umlal
r5, r6, r7, r8 @ d
+= a[9] * b[4]
257 bic
r0, r5, field_not_M @ u3
= d
& M
258 mov
r5, r5, lsr
#26 @ d >>= 26
259 orr
r5, r5, r6, asl
#6
261 movw
r14, field_R0 @ c
+= u3
* R0
262 umlal
r3, r4, r0, r14
264 bic
r14, r3, field_not_M @ t3
= c
& M
265 str
r14, [sp
, #4 + 3*4]
266 mov
r3, r3, lsr
#26 @ c >>= 26
267 orr
r3, r3, r4, asl
#6
269 mov
r14, field_R1 @ c
+= u3
* R1
270 umlal
r3, r4, r0, r14
273 adds
r3, r3, r11 @ c
+= c
'
275 adds r5, r5, r9 @ d += d'
278 bic
r0, r5, field_not_M @ u4
= d
& M
279 mov
r5, r5, lsr
#26 @ d >>= 26
280 orr
r5, r5, r6, asl
#6
282 movw
r14, field_R0 @ c
+= u4
* R0
283 umlal
r3, r4, r0, r14
285 bic
r14, r3, field_not_M @ t4
= c
& M
286 str
r14, [sp
, #4 + 4*4]
287 mov
r3, r3, lsr
#26 @ c >>= 26
288 orr
r3, r3, r4, asl
#6
290 mov
r14, field_R1 @ c
+= u4
* R1
291 umlal
r3, r4, r0, r14
293 /* G - interleaved with H */
294 ldr
r7, [r1, #0*4] @ a[0]
295 ldr
r8, [r2, #6*4] @ b[6]
296 ldr
r14, [r2, #5*4] @ b[5]
297 umull
r11, r12, r7, r8 @ c
' = a[0] * b[6]
298 ldr r0, [r1, #1*4] @ a[1]
299 umlal r3, r4, r7, r14 @ c += a[0] * b[5]
300 ldr r8, [r2, #4*4] @ b[4]
301 umlal r11, r12, r0, r14 @ c' += a[1] * b[5]
302 ldr
r7, [r1, #2*4] @ a[2]
303 umlal
r3, r4, r0, r8 @ c
+= a[1] * b[4]
304 ldr
r14, [r2, #3*4] @ b[3]
305 umlal
r11, r12, r7, r8 @ c
' += a[2] * b[4]
306 ldr r0, [r1, #3*4] @ a[3]
307 umlal r3, r4, r7, r14 @ c += a[2] * b[3]
308 ldr r8, [r2, #2*4] @ b[2]
309 umlal r11, r12, r0, r14 @ c' += a[3] * b[3]
310 ldr
r7, [r1, #4*4] @ a[4]
311 umlal
r3, r4, r0, r8 @ c
+= a[3] * b[2]
312 ldr
r14, [r2, #1*4] @ b[1]
313 umlal
r11, r12, r7, r8 @ c
' += a[4] * b[2]
314 ldr r0, [r1, #5*4] @ a[5]
315 umlal r3, r4, r7, r14 @ c += a[4] * b[1]
316 ldr r8, [r2, #0*4] @ b[0]
317 umlal r11, r12, r0, r14 @ c' += a[5] * b[1]
318 ldr
r7, [r1, #6*4] @ a[6]
319 umlal
r3, r4, r0, r8 @ c
+= a[5] * b[0]
320 ldr
r14, [r2, #9*4] @ b[9]
321 umlal
r11, r12, r7, r8 @ c
' += a[6] * b[0]
322 ldr r0, [r1, #7*4] @ a[7]
323 umlal r5, r6, r7, r14 @ d += a[6] * b[9]
324 ldr r8, [r2, #8*4] @ b[8]
325 umull r9, r10, r0, r14 @ d' = a[7] * b[9]
326 ldr
r7, [r1, #8*4] @ a[8]
327 umlal
r5, r6, r0, r8 @ d
+= a[7] * b[8]
328 ldr
r14, [r2, #7*4] @ b[7]
329 umlal
r9, r10, r7, r8 @ d
' += a[8] * b[8]
330 ldr r0, [r1, #9*4] @ a[9]
331 umlal r5, r6, r7, r14 @ d += a[8] * b[7]
332 ldr r8, [r2, #6*4] @ b[6]
333 umlal r9, r10, r0, r14 @ d' += a[9] * b[7]
334 umlal
r5, r6, r0, r8 @ d
+= a[9] * b[6]
336 bic
r0, r5, field_not_M @ u5
= d
& M
337 mov
r5, r5, lsr
#26 @ d >>= 26
338 orr
r5, r5, r6, asl
#6
340 movw
r14, field_R0 @ c
+= u5
* R0
341 umlal
r3, r4, r0, r14
343 bic
r14, r3, field_not_M @ t5
= c
& M
344 str
r14, [sp
, #4 + 5*4]
345 mov
r3, r3, lsr
#26 @ c >>= 26
346 orr
r3, r3, r4, asl
#6
348 mov
r14, field_R1 @ c
+= u5
* R1
349 umlal
r3, r4, r0, r14
352 adds
r3, r3, r11 @ c
+= c
'
354 adds r5, r5, r9 @ d += d'
357 bic
r0, r5, field_not_M @ u6
= d
& M
358 mov
r5, r5, lsr
#26 @ d >>= 26
359 orr
r5, r5, r6, asl
#6
361 movw
r14, field_R0 @ c
+= u6
* R0
362 umlal
r3, r4, r0, r14
364 bic
r14, r3, field_not_M @ t6
= c
& M
365 str
r14, [sp
, #4 + 6*4]
366 mov
r3, r3, lsr
#26 @ c >>= 26
367 orr
r3, r3, r4, asl
#6
369 mov
r14, field_R1 @ c
+= u6
* R1
370 umlal
r3, r4, r0, r14
372 /* I - interleaved with J */
373 ldr
r8, [r2, #8*4] @ b[8]
374 ldr
r7, [r1, #0*4] @ a[0]
375 ldr
r14, [r2, #7*4] @ b[7]
376 umull
r11, r12, r7, r8 @ c
' = a[0] * b[8]
377 ldr r0, [r1, #1*4] @ a[1]
378 umlal r3, r4, r7, r14 @ c += a[0] * b[7]
379 ldr r8, [r2, #6*4] @ b[6]
380 umlal r11, r12, r0, r14 @ c' += a[1] * b[7]
381 ldr
r7, [r1, #2*4] @ a[2]
382 umlal
r3, r4, r0, r8 @ c
+= a[1] * b[6]
383 ldr
r14, [r2, #5*4] @ b[5]
384 umlal
r11, r12, r7, r8 @ c
' += a[2] * b[6]
385 ldr r0, [r1, #3*4] @ a[3]
386 umlal r3, r4, r7, r14 @ c += a[2] * b[5]
387 ldr r8, [r2, #4*4] @ b[4]
388 umlal r11, r12, r0, r14 @ c' += a[3] * b[5]
389 ldr
r7, [r1, #4*4] @ a[4]
390 umlal
r3, r4, r0, r8 @ c
+= a[3] * b[4]
391 ldr
r14, [r2, #3*4] @ b[3]
392 umlal
r11, r12, r7, r8 @ c
' += a[4] * b[4]
393 ldr r0, [r1, #5*4] @ a[5]
394 umlal r3, r4, r7, r14 @ c += a[4] * b[3]
395 ldr r8, [r2, #2*4] @ b[2]
396 umlal r11, r12, r0, r14 @ c' += a[5] * b[3]
397 ldr
r7, [r1, #6*4] @ a[6]
398 umlal
r3, r4, r0, r8 @ c
+= a[5] * b[2]
399 ldr
r14, [r2, #1*4] @ b[1]
400 umlal
r11, r12, r7, r8 @ c
' += a[6] * b[2]
401 ldr r0, [r1, #7*4] @ a[7]
402 umlal r3, r4, r7, r14 @ c += a[6] * b[1]
403 ldr r8, [r2, #0*4] @ b[0]
404 umlal r11, r12, r0, r14 @ c' += a[7] * b[1]
405 ldr
r7, [r1, #8*4] @ a[8]
406 umlal
r3, r4, r0, r8 @ c
+= a[7] * b[0]
407 ldr
r14, [r2, #9*4] @ b[9]
408 umlal
r11, r12, r7, r8 @ c
' += a[8] * b[0]
409 ldr r0, [r1, #9*4] @ a[9]
410 umlal r5, r6, r7, r14 @ d += a[8] * b[9]
411 ldr r8, [r2, #8*4] @ b[8]
412 umull r9, r10, r0, r14 @ d' = a[9] * b[9]
413 umlal
r5, r6, r0, r8 @ d
+= a[9] * b[8]
415 bic
r0, r5, field_not_M @ u7
= d
& M
416 mov
r5, r5, lsr
#26 @ d >>= 26
417 orr
r5, r5, r6, asl
#6
419 movw
r14, field_R0 @ c
+= u7
* R0
420 umlal
r3, r4, r0, r14
422 bic
r14, r3, field_not_M @ t7
= c
& M
423 str
r14, [sp
, #4 + 7*4]
424 mov
r3, r3, lsr
#26 @ c >>= 26
425 orr
r3, r3, r4, asl
#6
427 mov
r14, field_R1 @ c
+= u7
* R1
428 umlal
r3, r4, r0, r14
431 adds
r3, r3, r11 @ c
+= c
'
433 adds r5, r5, r9 @ d += d'
436 bic
r0, r5, field_not_M @ u8
= d
& M
437 str
r0, [sp
, #4 + 8*4]
438 mov
r5, r5, lsr
#26 @ d >>= 26
439 orr
r5, r5, r6, asl
#6
441 movw
r14, field_R0 @ c
+= u8
* R0
442 umlal
r3, r4, r0, r14
444 /******************************************
445 * compute and write back result
446 ******************************************
456 r1,r2,r10,r14 scratch
458 Note: do not read from a[] after here, it may overlap with r[]
461 add r1, sp
, #4 + 3*4 @ r[3..7] = t3..7, r11=u8, r12=t9
462 ldmia
r1, {r2,r7,r8,r9,r10,r11,r12}
464 stmia
r1, {r2,r7,r8,r9,r10}
466 bic
r2, r3, field_not_M @ r
[8] = c
& M
468 mov
r3, r3, lsr
#26 @ c >>= 26
469 orr
r3, r3, r4, asl
#6
471 mov
r14, field_R1 @ c
+= u8
* R1
472 umlal
r3, r4, r11, r14
473 movw
r14, field_R0 @ c
+= d
* R0
474 umlal
r3, r4, r5, r14
475 adds
r3, r3, r12 @ c
+= t9
478 add r1, sp
, #4 + 0*4 @ r7,r8,r9 = t0,t1,t2
481 ubfx
r2, r3, #0, #22 @ r[9] = c & (M >> 4)
483 mov
r3, r3, lsr
#22 @ c >>= 22
484 orr
r3, r3, r4, asl
#10
486 movw
r14, field_R1
<< 4 @ c
+= d
* (R1 << 4)
487 umlal
r3, r4, r5, r14
489 movw
r14, field_R0
>> 4 @ d
= c
* (R0 >> 4) + t0
(64x64 multiply+
add)
490 umull
r5, r6, r3, r14 @ d
= c.lo
* (R0 >> 4)
491 adds
r5, r5, r7 @ d.lo
+= t0
492 mla
r6, r14, r4, r6 @ d.hi
+= c.hi
* (R0 >> 4)
493 adc
r6, r6, 0 @ d.hi
+= carry
495 bic
r2, r5, field_not_M @ r
[0] = d
& M
498 mov
r5, r5, lsr
#26 @ d >>= 26
499 orr
r5, r5, r6, asl
#6
502 movw
r14, field_R1
>> 4 @ d
+= c
* (R1 >> 4) + t1
(64x64 multiply+
add)
503 umull
r1, r2, r3, r14 @ tmp
= c.lo
* (R1 >> 4)
504 adds
r5, r5, r8 @ d.lo
+= t1
505 adc
r6, r6, #0 @ d.hi += carry
506 adds
r5, r5, r1 @ d.lo
+= tmp.lo
507 mla
r2, r14, r4, r2 @ tmp.hi
+= c.hi
* (R1 >> 4)
508 adc
r6, r6, r2 @ d.hi
+= carry
+ tmp.hi
510 bic
r2, r5, field_not_M @ r
[1] = d
& M
512 mov
r5, r5, lsr
#26 @ d >>= 26 (ignore hi)
513 orr
r5, r5, r6, asl
#6
515 add r5, r5, r9 @ d
+= t2
516 str
r5, [r0, #2*4] @ r[2] = d
519 ldmfd sp
!, {r4, r5, r6, r7, r8, r9, r10, r11, pc
}
520 .size secp256k1_fe_mul_inner, .-secp256k1_fe_mul_inner
523 .global secp256k1_fe_sqr_inner
524 .type secp256k1_fe_sqr_inner, %function
526 @
r0 r Can overlap with
a
528 @ Stack
(total
4+10*4 = 44)
529 @ sp
+ #0 saved 'r' pointer
530 @ sp
+ #4 + 4*X t0,t1,t2,t3,t4,t5,t6,t7,u8,t9
531 secp256k1_fe_sqr_inner
:
532 stmfd sp
!, {r4, r5, r6, r7, r8, r9, r10, r11, r14}
533 sub sp
, sp
, #48 @ frame=44 + alignment
534 str
r0, [sp
, #0] @ save result address, we need it only at the end
535 /******************************************
536 * Main computation code.
537 ******************************************
540 r0,r14,r2,r7,r8 scratch
547 Note: do not write to r[] here, it may overlap with a[]
549 /* A interleaved with B */
550 ldr
r0, [r1, #1*4] @ a[1]*2
551 ldr
r7, [r1, #0*4] @ a[0]
553 ldr
r14, [r1, #9*4] @ a[9]
554 umull
r3, r4, r7, r7 @ c
= a[0] * a[0]
555 ldr
r8, [r1, #8*4] @ a[8]
557 umull
r5, r6, r7, r14 @ d
= a[0]*2 * a[9]
558 ldr
r7, [r1, #2*4] @ a[2]*2
559 umull
r9, r10, r0, r14 @ d
' = a[1]*2 * a[9]
560 ldr r14, [r1, #7*4] @ a[7]
561 umlal r5, r6, r0, r8 @ d += a[1]*2 * a[8]
563 ldr r0, [r1, #3*4] @ a[3]*2
564 umlal r9, r10, r7, r8 @ d' += a[2]*2 * a[8]
565 ldr
r8, [r1, #6*4] @ a[6]
566 umlal
r5, r6, r7, r14 @ d
+= a[2]*2 * a[7]
568 ldr
r7, [r1, #4*4] @ a[4]*2
569 umlal
r9, r10, r0, r14 @ d
' += a[3]*2 * a[7]
570 ldr r14, [r1, #5*4] @ a[5]
572 umlal r5, r6, r0, r8 @ d += a[3]*2 * a[6]
573 umlal r9, r10, r7, r8 @ d' += a[4]*2 * a[6]
574 umlal
r5, r6, r7, r14 @ d
+= a[4]*2 * a[5]
575 umlal
r9, r10, r14, r14 @ d
' += a[5] * a[5]
577 bic r0, r5, field_not_M @ t9 = d & M
578 str r0, [sp, #4 + 9*4]
579 mov r5, r5, lsr #26 @ d >>= 26
580 orr r5, r5, r6, asl #6
584 adds r5, r5, r9 @ d += d'
587 bic
r0, r5, field_not_M @ u0
= d
& M
588 mov
r5, r5, lsr
#26 @ d >>= 26
589 orr
r5, r5, r6, asl
#6
591 movw
r14, field_R0 @ c
+= u0
* R0
592 umlal
r3, r4, r0, r14
593 bic
r14, r3, field_not_M @ t0
= c
& M
594 str
r14, [sp
, #4 + 0*4]
595 mov
r3, r3, lsr
#26 @ c >>= 26
596 orr
r3, r3, r4, asl
#6
598 mov
r14, field_R1 @ c
+= u0
* R1
599 umlal
r3, r4, r0, r14
601 /* C interleaved with D */
602 ldr
r0, [r1, #0*4] @ a[0]*2
603 ldr
r14, [r1, #1*4] @ a[1]
605 ldr
r8, [r1, #2*4] @ a[2]
606 umlal
r3, r4, r0, r14 @ c
+= a[0]*2 * a[1]
607 mov
r7, r8, asl
#1 @ a[2]*2
608 umull
r11, r12, r14, r14 @ c
' = a[1] * a[1]
609 ldr r14, [r1, #9*4] @ a[9]
610 umlal r11, r12, r0, r8 @ c' += a[0]*2 * a[2]
611 ldr
r0, [r1, #3*4] @ a[3]*2
612 ldr
r8, [r1, #8*4] @ a[8]
613 umlal
r5, r6, r7, r14 @ d
+= a[2]*2 * a[9]
615 ldr
r7, [r1, #4*4] @ a[4]*2
616 umull
r9, r10, r0, r14 @ d
' = a[3]*2 * a[9]
617 ldr r14, [r1, #7*4] @ a[7]
618 umlal r5, r6, r0, r8 @ d += a[3]*2 * a[8]
620 ldr r0, [r1, #5*4] @ a[5]*2
621 umlal r9, r10, r7, r8 @ d' += a[4]*2 * a[8]
622 ldr
r8, [r1, #6*4] @ a[6]
624 umlal
r5, r6, r7, r14 @ d
+= a[4]*2 * a[7]
625 umlal
r9, r10, r0, r14 @ d
' += a[5]*2 * a[7]
626 umlal r5, r6, r0, r8 @ d += a[5]*2 * a[6]
627 umlal r9, r10, r8, r8 @ d' += a[6] * a[6]
629 bic
r0, r5, field_not_M @ u1
= d
& M
630 mov
r5, r5, lsr
#26 @ d >>= 26
631 orr
r5, r5, r6, asl
#6
633 movw
r14, field_R0 @ c
+= u1
* R0
634 umlal
r3, r4, r0, r14
635 bic
r14, r3, field_not_M @ t1
= c
& M
636 str
r14, [sp
, #4 + 1*4]
637 mov
r3, r3, lsr
#26 @ c >>= 26
638 orr
r3, r3, r4, asl
#6
640 mov
r14, field_R1 @ c
+= u1
* R1
641 umlal
r3, r4, r0, r14
644 adds
r3, r3, r11 @ c
+= c
'
646 adds r5, r5, r9 @ d += d'
649 bic
r0, r5, field_not_M @ u2
= d
& M
650 mov
r5, r5, lsr
#26 @ d >>= 26
651 orr
r5, r5, r6, asl
#6
653 movw
r14, field_R0 @ c
+= u2
* R0
654 umlal
r3, r4, r0, r14
655 bic
r14, r3, field_not_M @ t2
= c
& M
656 str
r14, [sp
, #4 + 2*4]
657 mov
r3, r3, lsr
#26 @ c >>= 26
658 orr
r3, r3, r4, asl
#6
660 mov
r14, field_R1 @ c
+= u2
* R1
661 umlal
r3, r4, r0, r14
663 /* E interleaved with F */
664 ldr
r7, [r1, #0*4] @ a[0]*2
665 ldr
r0, [r1, #1*4] @ a[1]*2
666 ldr
r14, [r1, #2*4] @ a[2]
668 ldr
r8, [r1, #3*4] @ a[3]
670 umlal
r3, r4, r7, r8 @ c
+= a[0]*2 * a[3]
672 umull
r11, r12, r7, r2 @ c
' = a[0]*2 * a[4]
673 mov r2, r2, asl #1 @ a[4]*2
674 umlal r11, r12, r0, r8 @ c' += a[1]*2 * a[3]
675 ldr
r8, [r1, #9*4] @ a[9]
676 umlal
r3, r4, r0, r14 @ c
+= a[1]*2 * a[2]
677 ldr
r0, [r1, #5*4] @ a[5]*2
678 umlal
r11, r12, r14, r14 @ c
' += a[2] * a[2]
679 ldr r14, [r1, #8*4] @ a[8]
681 umlal r5, r6, r2, r8 @ d += a[4]*2 * a[9]
682 ldr r7, [r1, #6*4] @ a[6]*2
683 umull r9, r10, r0, r8 @ d' = a[5]*2 * a[9]
685 ldr
r8, [r1, #7*4] @ a[7]
686 umlal
r5, r6, r0, r14 @ d
+= a[5]*2 * a[8]
687 umlal
r9, r10, r7, r14 @ d
' += a[6]*2 * a[8]
688 umlal r5, r6, r7, r8 @ d += a[6]*2 * a[7]
689 umlal r9, r10, r8, r8 @ d' += a[7] * a[7]
691 bic
r0, r5, field_not_M @ u3
= d
& M
692 mov
r5, r5, lsr
#26 @ d >>= 26
693 orr
r5, r5, r6, asl
#6
695 movw
r14, field_R0 @ c
+= u3
* R0
696 umlal
r3, r4, r0, r14
697 bic
r14, r3, field_not_M @ t3
= c
& M
698 str
r14, [sp
, #4 + 3*4]
699 mov
r3, r3, lsr
#26 @ c >>= 26
700 orr
r3, r3, r4, asl
#6
702 mov
r14, field_R1 @ c
+= u3
* R1
703 umlal
r3, r4, r0, r14
706 adds
r3, r3, r11 @ c
+= c
'
708 adds r5, r5, r9 @ d += d'
711 bic
r0, r5, field_not_M @ u4
= d
& M
712 mov
r5, r5, lsr
#26 @ d >>= 26
713 orr
r5, r5, r6, asl
#6
715 movw
r14, field_R0 @ c
+= u4
* R0
716 umlal
r3, r4, r0, r14
717 bic
r14, r3, field_not_M @ t4
= c
& M
718 str
r14, [sp
, #4 + 4*4]
719 mov
r3, r3, lsr
#26 @ c >>= 26
720 orr
r3, r3, r4, asl
#6
722 mov
r14, field_R1 @ c
+= u4
* R1
723 umlal
r3, r4, r0, r14
725 /* G interleaved with H */
726 ldr
r7, [r1, #0*4] @ a[0]*2
727 ldr
r0, [r1, #1*4] @ a[1]*2
729 ldr
r8, [r1, #5*4] @ a[5]
730 ldr
r2, [r1, #6*4] @ a[6]
731 umlal
r3, r4, r7, r8 @ c
+= a[0]*2 * a[5]
732 ldr
r14, [r1, #4*4] @ a[4]
734 umull
r11, r12, r7, r2 @ c
' = a[0]*2 * a[6]
735 ldr r7, [r1, #2*4] @ a[2]*2
736 umlal r11, r12, r0, r8 @ c' += a[1]*2 * a[5]
738 ldr
r8, [r1, #3*4] @ a[3]
739 umlal
r3, r4, r0, r14 @ c
+= a[1]*2 * a[4]
740 mov
r0, r2, asl
#1 @ a[6]*2
741 umlal
r11, r12, r7, r14 @ c
' += a[2]*2 * a[4]
742 ldr r14, [r1, #9*4] @ a[9]
743 umlal r3, r4, r7, r8 @ c += a[2]*2 * a[3]
744 ldr r7, [r1, #7*4] @ a[7]*2
745 umlal r11, r12, r8, r8 @ c' += a[3] * a[3]
747 ldr
r8, [r1, #8*4] @ a[8]
748 umlal
r5, r6, r0, r14 @ d
+= a[6]*2 * a[9]
749 umull
r9, r10, r7, r14 @ d
' = a[7]*2 * a[9]
750 umlal r5, r6, r7, r8 @ d += a[7]*2 * a[8]
751 umlal r9, r10, r8, r8 @ d' += a[8] * a[8]
753 bic
r0, r5, field_not_M @ u5
= d
& M
754 mov
r5, r5, lsr
#26 @ d >>= 26
755 orr
r5, r5, r6, asl
#6
757 movw
r14, field_R0 @ c
+= u5
* R0
758 umlal
r3, r4, r0, r14
759 bic
r14, r3, field_not_M @ t5
= c
& M
760 str
r14, [sp
, #4 + 5*4]
761 mov
r3, r3, lsr
#26 @ c >>= 26
762 orr
r3, r3, r4, asl
#6
764 mov
r14, field_R1 @ c
+= u5
* R1
765 umlal
r3, r4, r0, r14
768 adds
r3, r3, r11 @ c
+= c
'
770 adds r5, r5, r9 @ d += d'
773 bic
r0, r5, field_not_M @ u6
= d
& M
774 mov
r5, r5, lsr
#26 @ d >>= 26
775 orr
r5, r5, r6, asl
#6
777 movw
r14, field_R0 @ c
+= u6
* R0
778 umlal
r3, r4, r0, r14
779 bic
r14, r3, field_not_M @ t6
= c
& M
780 str
r14, [sp
, #4 + 6*4]
781 mov
r3, r3, lsr
#26 @ c >>= 26
782 orr
r3, r3, r4, asl
#6
784 mov
r14, field_R1 @ c
+= u6
* R1
785 umlal
r3, r4, r0, r14
787 /* I interleaved with J */
788 ldr
r7, [r1, #0*4] @ a[0]*2
789 ldr
r0, [r1, #1*4] @ a[1]*2
791 ldr
r8, [r1, #7*4] @ a[7]
792 ldr
r2, [r1, #8*4] @ a[8]
793 umlal
r3, r4, r7, r8 @ c
+= a[0]*2 * a[7]
794 ldr
r14, [r1, #6*4] @ a[6]
796 umull
r11, r12, r7, r2 @ c
' = a[0]*2 * a[8]
797 ldr r7, [r1, #2*4] @ a[2]*2
798 umlal r11, r12, r0, r8 @ c' += a[1]*2 * a[7]
799 ldr
r8, [r1, #5*4] @ a[5]
800 umlal
r3, r4, r0, r14 @ c
+= a[1]*2 * a[6]
801 ldr
r0, [r1, #3*4] @ a[3]*2
803 umlal
r11, r12, r7, r14 @ c
' += a[2]*2 * a[6]
804 ldr r14, [r1, #4*4] @ a[4]
806 umlal r3, r4, r7, r8 @ c += a[2]*2 * a[5]
807 mov r2, r2, asl #1 @ a[8]*2
808 umlal r11, r12, r0, r8 @ c' += a[3]*2 * a[5]
809 umlal
r3, r4, r0, r14 @ c
+= a[3]*2 * a[4]
810 umlal
r11, r12, r14, r14 @ c
' += a[4] * a[4]
811 ldr r8, [r1, #9*4] @ a[9]
812 umlal r5, r6, r2, r8 @ d += a[8]*2 * a[9]
813 @ r8 will be used in J
815 bic r0, r5, field_not_M @ u7 = d & M
816 mov r5, r5, lsr #26 @ d >>= 26
817 orr r5, r5, r6, asl #6
819 movw r14, field_R0 @ c += u7 * R0
820 umlal r3, r4, r0, r14
821 bic r14, r3, field_not_M @ t7 = c & M
822 str r14, [sp, #4 + 7*4]
823 mov r3, r3, lsr #26 @ c >>= 26
824 orr r3, r3, r4, asl #6
826 mov r14, field_R1 @ c += u7 * R1
827 umlal r3, r4, r0, r14
830 adds r3, r3, r11 @ c += c'
832 umlal
r5, r6, r8, r8 @ d
+= a[9] * a[9]
834 bic
r0, r5, field_not_M @ u8
= d
& M
835 str
r0, [sp
, #4 + 8*4]
836 mov
r5, r5, lsr
#26 @ d >>= 26
837 orr
r5, r5, r6, asl
#6
839 movw
r14, field_R0 @ c
+= u8
* R0
840 umlal
r3, r4, r0, r14
842 /******************************************
843 * compute and write back result
844 ******************************************
854 r1,r2,r10,r14 scratch
856 Note: do not read from a[] after here, it may overlap with r[]
859 add r1, sp
, #4 + 3*4 @ r[3..7] = t3..7, r11=u8, r12=t9
860 ldmia
r1, {r2,r7,r8,r9,r10,r11,r12}
862 stmia
r1, {r2,r7,r8,r9,r10}
864 bic
r2, r3, field_not_M @ r
[8] = c
& M
866 mov
r3, r3, lsr
#26 @ c >>= 26
867 orr
r3, r3, r4, asl
#6
869 mov
r14, field_R1 @ c
+= u8
* R1
870 umlal
r3, r4, r11, r14
871 movw
r14, field_R0 @ c
+= d
* R0
872 umlal
r3, r4, r5, r14
873 adds
r3, r3, r12 @ c
+= t9
876 add r1, sp
, #4 + 0*4 @ r7,r8,r9 = t0,t1,t2
879 ubfx
r2, r3, #0, #22 @ r[9] = c & (M >> 4)
881 mov
r3, r3, lsr
#22 @ c >>= 22
882 orr
r3, r3, r4, asl
#10
884 movw
r14, field_R1
<< 4 @ c
+= d
* (R1 << 4)
885 umlal
r3, r4, r5, r14
887 movw
r14, field_R0
>> 4 @ d
= c
* (R0 >> 4) + t0
(64x64 multiply+
add)
888 umull
r5, r6, r3, r14 @ d
= c.lo
* (R0 >> 4)
889 adds
r5, r5, r7 @ d.lo
+= t0
890 mla
r6, r14, r4, r6 @ d.hi
+= c.hi
* (R0 >> 4)
891 adc
r6, r6, 0 @ d.hi
+= carry
893 bic
r2, r5, field_not_M @ r
[0] = d
& M
896 mov
r5, r5, lsr
#26 @ d >>= 26
897 orr
r5, r5, r6, asl
#6
900 movw
r14, field_R1
>> 4 @ d
+= c
* (R1 >> 4) + t1
(64x64 multiply+
add)
901 umull
r1, r2, r3, r14 @ tmp
= c.lo
* (R1 >> 4)
902 adds
r5, r5, r8 @ d.lo
+= t1
903 adc
r6, r6, #0 @ d.hi += carry
904 adds
r5, r5, r1 @ d.lo
+= tmp.lo
905 mla
r2, r14, r4, r2 @ tmp.hi
+= c.hi
* (R1 >> 4)
906 adc
r6, r6, r2 @ d.hi
+= carry
+ tmp.hi
908 bic
r2, r5, field_not_M @ r
[1] = d
& M
910 mov
r5, r5, lsr
#26 @ d >>= 26 (ignore hi)
911 orr
r5, r5, r6, asl
#6
913 add r5, r5, r9 @ d
+= t2
914 str
r5, [r0, #2*4] @ r[2] = d
917 ldmfd sp
!, {r4, r5, r6, r7, r8, r9, r10, r11, pc
}
918 .size secp256k1_fe_sqr_inner, .-secp256k1_fe_sqr_inner