1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Hash code using AES intrinsics.
// Prototype for aeshashbody. The __asm__ label makes the function's
// symbol name GOSYM_PREFIX "runtime.aeshashbody" rather than the plain
// C name. The definitions in this file name the four arguments, in
// order, p, seed, size and aeskeysched.
9 uintptr
aeshashbody(void*, uintptr
, uintptr
, Slice
)
10 __asm__(GOSYM_PREFIX
"runtime.aeshashbody");
// Second declaration of the same function; it only adds the
// no_split_stack attribute to the prototype above.
12 uintptr
aeshashbody(void*, uintptr
, uintptr
, Slice
)
13 __attribute__((no_split_stack
));
15 #if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_AS_X86_AES)
17 #include <emmintrin.h>
18 #include <tmmintrin.h>
19 #include <wmmintrin.h>
21 // Force appropriate CPU level. We won't call here unless the CPU
// supports the SSSE3 and AES instructions enabled by the pragma below.
24 #pragma GCC target("ssse3", "aes")
28 // aeshashbody implements a hash function using AES instructions
29 // available in recent x86 processors. Note this is not encryption,
// just hashing.
32 // This is written to produce exactly the same results as the gc
33 // implementation, not because that matters, but just to ensure that
34 // this does something reasonable.
35 uintptr
aeshashbody(void* p
, uintptr seed
, uintptr size
, Slice aeskeysched
) {
36 __m128i mseed
, mseed2
, mseed3
, mseed4
, mseed5
, mseed6
, mseed7
, mseed8
;
37 __m128i mval
, mval2
, mval3
, mval4
, mval5
, mval6
, mval7
, mval8
;
39 // Start with hash seed.
40 mseed
= _mm_cvtsi64_si128(seed
);
41 // Get 16 bits of length.
42 mseed
= _mm_insert_epi16(mseed
, size
, 4);
43 // Repeat length 4 times total.
44 mseed
= _mm_shufflehi_epi16(mseed
, 0);
45 // Save unscrambled seed.
47 // XOR in per-process seed.
48 mseed
^= _mm_loadu_si128(aeskeysched
.__values
);
50 mseed
= _mm_aesenc_si128(mseed
, mseed
);
54 // Return scrambled input seed.
55 return _mm_cvtsi128_si64(_mm_aesenc_si128(mseed
, mseed
));
56 } else if (size
< 16) {
57 if ((((uintptr
)(p
) + 16) & 0xff0) != 0) {
58 static const uint64 masks
[32]
59 __attribute__ ((aligned(16))) =
61 0x0000000000000000, 0x0000000000000000,
62 0x00000000000000ff, 0x0000000000000000,
63 0x000000000000ffff, 0x0000000000000000,
64 0x0000000000ffffff, 0x0000000000000000,
65 0x00000000ffffffff, 0x0000000000000000,
66 0x000000ffffffffff, 0x0000000000000000,
67 0x0000ffffffffffff, 0x0000000000000000,
68 0x00ffffffffffffff, 0x0000000000000000,
69 0xffffffffffffffff, 0x0000000000000000,
70 0xffffffffffffffff, 0x00000000000000ff,
71 0xffffffffffffffff, 0x000000000000ffff,
72 0xffffffffffffffff, 0x0000000000ffffff,
73 0xffffffffffffffff, 0x00000000ffffffff,
74 0xffffffffffffffff, 0x000000ffffffffff,
75 0xffffffffffffffff, 0x0000ffffffffffff,
76 0xffffffffffffffff, 0x00ffffffffffffff
79 // 16 bytes loaded at p won't cross a page
80 // boundary, so we can load directly.
81 mval
= _mm_loadu_si128(p
);
82 mval
&= *(const __m128i
*)(&masks
[size
*2]);
84 static const uint64 shifts
[32]
85 __attribute__ ((aligned(16))) =
87 0x0000000000000000, 0x0000000000000000,
88 0xffffffffffffff0f, 0xffffffffffffffff,
89 0xffffffffffff0f0e, 0xffffffffffffffff,
90 0xffffffffff0f0e0d, 0xffffffffffffffff,
91 0xffffffff0f0e0d0c, 0xffffffffffffffff,
92 0xffffff0f0e0d0c0b, 0xffffffffffffffff,
93 0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
94 0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
95 0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
96 0x0e0d0c0b0a090807, 0xffffffffffffff0f,
97 0x0d0c0b0a09080706, 0xffffffffffff0f0e,
98 0x0c0b0a0908070605, 0xffffffffff0f0e0d,
99 0x0b0a090807060504, 0xffffffff0f0e0d0c,
100 0x0a09080706050403, 0xffffff0f0e0d0c0b,
101 0x0908070605040302, 0xffff0f0e0d0c0b0a,
102 0x0807060504030201, 0xff0f0e0d0c0b0a09,
105 // address ends in 1111xxxx. Might be
106 // up against a page boundary, so load
107 // ending at last byte. Then shift
108 // bytes down using pshufb.
109 mval
= _mm_loadu_si128((void*)((char*)p
- 16 + size
));
110 mval
= _mm_shuffle_epi8(mval
, *(const __m128i
*)(&shifts
[size
*2]));
113 mval
= _mm_loadu_si128(p
);
116 // XOR data with seed.
118 // Scramble combo 3 times.
119 mval
= _mm_aesenc_si128(mval
, mval
);
120 mval
= _mm_aesenc_si128(mval
, mval
);
121 mval
= _mm_aesenc_si128(mval
, mval
);
122 return _mm_cvtsi128_si64(mval
);
123 } else if (size
<= 32) {
124 // Make second starting seed.
125 mseed2
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 16));
126 mseed2
= _mm_aesenc_si128(mseed2
, mseed2
);
127 // Load data to be hashed.
128 mval
= _mm_loadu_si128(p
);
129 mval2
= _mm_loadu_si128((void*)((char*)p
+ size
- 16));
134 mval
= _mm_aesenc_si128(mval
, mval
);
135 mval2
= _mm_aesenc_si128(mval2
, mval2
);
136 mval
= _mm_aesenc_si128(mval
, mval
);
137 mval2
= _mm_aesenc_si128(mval2
, mval2
);
138 mval
= _mm_aesenc_si128(mval
, mval
);
139 mval2
= _mm_aesenc_si128(mval2
, mval2
);
142 return _mm_cvtsi128_si64(mval
);
143 } else if (size
<= 64) {
144 // Make 3 more starting seeds.
147 mseed2
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 16));
148 mseed3
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 32));
149 mseed4
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 48));
150 mseed2
= _mm_aesenc_si128(mseed2
, mseed2
);
151 mseed3
= _mm_aesenc_si128(mseed3
, mseed3
);
152 mseed4
= _mm_aesenc_si128(mseed4
, mseed4
);
154 mval
= _mm_loadu_si128(p
);
155 mval2
= _mm_loadu_si128((void*)((char*)p
+ 16));
156 mval3
= _mm_loadu_si128((void*)((char*)p
+ size
- 32));
157 mval4
= _mm_loadu_si128((void*)((char*)p
+ size
- 16));
164 mval
= _mm_aesenc_si128(mval
, mval
);
165 mval2
= _mm_aesenc_si128(mval2
, mval2
);
166 mval3
= _mm_aesenc_si128(mval3
, mval3
);
167 mval4
= _mm_aesenc_si128(mval4
, mval4
);
169 mval
= _mm_aesenc_si128(mval
, mval
);
170 mval2
= _mm_aesenc_si128(mval2
, mval2
);
171 mval3
= _mm_aesenc_si128(mval3
, mval3
);
172 mval4
= _mm_aesenc_si128(mval4
, mval4
);
174 mval
= _mm_aesenc_si128(mval
, mval
);
175 mval2
= _mm_aesenc_si128(mval2
, mval2
);
176 mval3
= _mm_aesenc_si128(mval3
, mval3
);
177 mval4
= _mm_aesenc_si128(mval4
, mval4
);
182 return _mm_cvtsi128_si64(mval
);
183 } else if (size
<= 128) {
184 // Make 7 more starting seeds.
191 mseed2
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 16));
192 mseed3
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 32));
193 mseed4
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 48));
194 mseed5
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 64));
195 mseed6
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 80));
196 mseed7
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 96));
197 mseed8
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 112));
198 mseed2
= _mm_aesenc_si128(mseed2
, mseed2
);
199 mseed3
= _mm_aesenc_si128(mseed3
, mseed3
);
200 mseed4
= _mm_aesenc_si128(mseed4
, mseed4
);
201 mseed5
= _mm_aesenc_si128(mseed5
, mseed5
);
202 mseed6
= _mm_aesenc_si128(mseed6
, mseed6
);
203 mseed7
= _mm_aesenc_si128(mseed7
, mseed7
);
204 mseed8
= _mm_aesenc_si128(mseed8
, mseed8
);
207 mval
= _mm_loadu_si128(p
);
208 mval2
= _mm_loadu_si128((void*)((char*)p
+ 16));
209 mval3
= _mm_loadu_si128((void*)((char*)p
+ 32));
210 mval4
= _mm_loadu_si128((void*)((char*)p
+ 48));
211 mval5
= _mm_loadu_si128((void*)((char*)p
+ size
- 64));
212 mval6
= _mm_loadu_si128((void*)((char*)p
+ size
- 48));
213 mval7
= _mm_loadu_si128((void*)((char*)p
+ size
- 32));
214 mval8
= _mm_loadu_si128((void*)((char*)p
+ size
- 16));
227 mval
= _mm_aesenc_si128(mval
, mval
);
228 mval2
= _mm_aesenc_si128(mval2
, mval2
);
229 mval3
= _mm_aesenc_si128(mval3
, mval3
);
230 mval4
= _mm_aesenc_si128(mval4
, mval4
);
231 mval5
= _mm_aesenc_si128(mval5
, mval5
);
232 mval6
= _mm_aesenc_si128(mval6
, mval6
);
233 mval7
= _mm_aesenc_si128(mval7
, mval7
);
234 mval8
= _mm_aesenc_si128(mval8
, mval8
);
236 mval
= _mm_aesenc_si128(mval
, mval
);
237 mval2
= _mm_aesenc_si128(mval2
, mval2
);
238 mval3
= _mm_aesenc_si128(mval3
, mval3
);
239 mval4
= _mm_aesenc_si128(mval4
, mval4
);
240 mval5
= _mm_aesenc_si128(mval5
, mval5
);
241 mval6
= _mm_aesenc_si128(mval6
, mval6
);
242 mval7
= _mm_aesenc_si128(mval7
, mval7
);
243 mval8
= _mm_aesenc_si128(mval8
, mval8
);
245 mval
= _mm_aesenc_si128(mval
, mval
);
246 mval2
= _mm_aesenc_si128(mval2
, mval2
);
247 mval3
= _mm_aesenc_si128(mval3
, mval3
);
248 mval4
= _mm_aesenc_si128(mval4
, mval4
);
249 mval5
= _mm_aesenc_si128(mval5
, mval5
);
250 mval6
= _mm_aesenc_si128(mval6
, mval6
);
251 mval7
= _mm_aesenc_si128(mval7
, mval7
);
252 mval8
= _mm_aesenc_si128(mval8
, mval8
);
262 return _mm_cvtsi128_si64(mval
);
264 // Make 7 more starting seeds.
271 mseed2
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 16));
272 mseed3
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 32));
273 mseed4
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 48));
274 mseed5
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 64));
275 mseed6
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 80));
276 mseed7
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 96));
277 mseed8
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 112));
278 mseed2
= _mm_aesenc_si128(mseed2
, mseed2
);
279 mseed3
= _mm_aesenc_si128(mseed3
, mseed3
);
280 mseed4
= _mm_aesenc_si128(mseed4
, mseed4
);
281 mseed5
= _mm_aesenc_si128(mseed5
, mseed5
);
282 mseed6
= _mm_aesenc_si128(mseed6
, mseed6
);
283 mseed7
= _mm_aesenc_si128(mseed7
, mseed7
);
284 mseed8
= _mm_aesenc_si128(mseed8
, mseed8
);
286 // Start with last (possibly overlapping) block.
287 mval
= _mm_loadu_si128((void*)((char*)p
+ size
- 128));
288 mval2
= _mm_loadu_si128((void*)((char*)p
+ size
- 112));
289 mval3
= _mm_loadu_si128((void*)((char*)p
+ size
- 96));
290 mval4
= _mm_loadu_si128((void*)((char*)p
+ size
- 80));
291 mval5
= _mm_loadu_si128((void*)((char*)p
+ size
- 64));
292 mval6
= _mm_loadu_si128((void*)((char*)p
+ size
- 48));
293 mval7
= _mm_loadu_si128((void*)((char*)p
+ size
- 32));
294 mval8
= _mm_loadu_si128((void*)((char*)p
+ size
- 16));
306 // Compute number of remaining 128-byte blocks.
311 mval
= _mm_aesenc_si128(mval
, mval
);
312 mval2
= _mm_aesenc_si128(mval2
, mval2
);
313 mval3
= _mm_aesenc_si128(mval3
, mval3
);
314 mval4
= _mm_aesenc_si128(mval4
, mval4
);
315 mval5
= _mm_aesenc_si128(mval5
, mval5
);
316 mval6
= _mm_aesenc_si128(mval6
, mval6
);
317 mval7
= _mm_aesenc_si128(mval7
, mval7
);
318 mval8
= _mm_aesenc_si128(mval8
, mval8
);
320 // Scramble state, XOR in a block.
321 mval
= _mm_aesenc_si128(mval
, _mm_loadu_si128(p
));
322 mval2
= _mm_aesenc_si128(mval2
, _mm_loadu_si128((void*)((char*)p
+ 16)));
323 mval3
= _mm_aesenc_si128(mval3
, _mm_loadu_si128((void*)((char*)p
+ 32)));
324 mval4
= _mm_aesenc_si128(mval4
, _mm_loadu_si128((void*)((char*)p
+ 48)));
325 mval5
= _mm_aesenc_si128(mval5
, _mm_loadu_si128((void*)((char*)p
+ 64)));
326 mval6
= _mm_aesenc_si128(mval6
, _mm_loadu_si128((void*)((char*)p
+ 80)));
327 mval7
= _mm_aesenc_si128(mval7
, _mm_loadu_si128((void*)((char*)p
+ 96)));
328 mval8
= _mm_aesenc_si128(mval8
, _mm_loadu_si128((void*)((char*)p
+ 112)));
330 p
= (void*)((char*)p
+ 128);
331 } while (--size
> 0);
333 // 3 more scrambles to finish.
334 mval
= _mm_aesenc_si128(mval
, mval
);
335 mval2
= _mm_aesenc_si128(mval2
, mval2
);
336 mval3
= _mm_aesenc_si128(mval3
, mval3
);
337 mval4
= _mm_aesenc_si128(mval4
, mval4
);
338 mval5
= _mm_aesenc_si128(mval5
, mval5
);
339 mval6
= _mm_aesenc_si128(mval6
, mval6
);
340 mval7
= _mm_aesenc_si128(mval7
, mval7
);
341 mval8
= _mm_aesenc_si128(mval8
, mval8
);
342 mval
= _mm_aesenc_si128(mval
, mval
);
343 mval2
= _mm_aesenc_si128(mval2
, mval2
);
344 mval3
= _mm_aesenc_si128(mval3
, mval3
);
345 mval4
= _mm_aesenc_si128(mval4
, mval4
);
346 mval5
= _mm_aesenc_si128(mval5
, mval5
);
347 mval6
= _mm_aesenc_si128(mval6
, mval6
);
348 mval7
= _mm_aesenc_si128(mval7
, mval7
);
349 mval8
= _mm_aesenc_si128(mval8
, mval8
);
350 mval
= _mm_aesenc_si128(mval
, mval
);
351 mval2
= _mm_aesenc_si128(mval2
, mval2
);
352 mval3
= _mm_aesenc_si128(mval3
, mval3
);
353 mval4
= _mm_aesenc_si128(mval4
, mval4
);
354 mval5
= _mm_aesenc_si128(mval5
, mval5
);
355 mval6
= _mm_aesenc_si128(mval6
, mval6
);
356 mval7
= _mm_aesenc_si128(mval7
, mval7
);
357 mval8
= _mm_aesenc_si128(mval8
, mval8
);
366 return _mm_cvtsi128_si64(mval
);
370 #else // !defined(__x86_64__)
372 // The 32-bit version of aeshashbody.
374 uintptr
aeshashbody(void* p
, uintptr seed
, uintptr size
, Slice aeskeysched
) {
375 __m128i mseed
, mseed2
, mseed3
, mseed4
;
376 __m128i mval
, mval2
, mval3
, mval4
;
378 // Start with hash seed.
379 mseed
= _mm_cvtsi32_si128(seed
);
380 // Get 16 bits of length.
381 mseed
= _mm_insert_epi16(mseed
, size
, 4);
382 // Replace size with its low 2 bytes repeated 4 times.
383 mseed
= _mm_shufflehi_epi16(mseed
, 0);
384 // Save unscrambled seed.
386 // XOR in per-process seed.
387 mseed
^= _mm_loadu_si128(aeskeysched
.__values
);
389 mseed
= _mm_aesenc_si128(mseed
, mseed
);
393 // Return scrambled input seed.
394 return _mm_cvtsi128_si32(_mm_aesenc_si128(mseed
, mseed
));
395 } else if (size
< 16) {
396 if ((((uintptr
)(p
) + 16) & 0xff0) != 0) {
397 static const uint64 masks
[32]
398 __attribute__ ((aligned(16))) =
400 0x0000000000000000, 0x0000000000000000,
401 0x00000000000000ff, 0x0000000000000000,
402 0x000000000000ffff, 0x0000000000000000,
403 0x0000000000ffffff, 0x0000000000000000,
404 0x00000000ffffffff, 0x0000000000000000,
405 0x000000ffffffffff, 0x0000000000000000,
406 0x0000ffffffffffff, 0x0000000000000000,
407 0x00ffffffffffffff, 0x0000000000000000,
408 0xffffffffffffffff, 0x0000000000000000,
409 0xffffffffffffffff, 0x00000000000000ff,
410 0xffffffffffffffff, 0x000000000000ffff,
411 0xffffffffffffffff, 0x0000000000ffffff,
412 0xffffffffffffffff, 0x00000000ffffffff,
413 0xffffffffffffffff, 0x000000ffffffffff,
414 0xffffffffffffffff, 0x0000ffffffffffff,
415 0xffffffffffffffff, 0x00ffffffffffffff
418 // 16 bytes loaded at p won't cross a page
419 // boundary, so we can load it directly.
420 mval
= _mm_loadu_si128(p
);
421 mval
&= *(const __m128i
*)(&masks
[size
*2]);
423 static const uint64 shifts
[32]
424 __attribute__ ((aligned(16))) =
426 0x0000000000000000, 0x0000000000000000,
427 0xffffffffffffff0f, 0xffffffffffffffff,
428 0xffffffffffff0f0e, 0xffffffffffffffff,
429 0xffffffffff0f0e0d, 0xffffffffffffffff,
430 0xffffffff0f0e0d0c, 0xffffffffffffffff,
431 0xffffff0f0e0d0c0b, 0xffffffffffffffff,
432 0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
433 0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
434 0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
435 0x0e0d0c0b0a090807, 0xffffffffffffff0f,
436 0x0d0c0b0a09080706, 0xffffffffffff0f0e,
437 0x0c0b0a0908070605, 0xffffffffff0f0e0d,
438 0x0b0a090807060504, 0xffffffff0f0e0d0c,
439 0x0a09080706050403, 0xffffff0f0e0d0c0b,
440 0x0908070605040302, 0xffff0f0e0d0c0b0a,
441 0x0807060504030201, 0xff0f0e0d0c0b0a09,
444 // address ends in 1111xxxx. Might be
445 // up against a page boundary, so load
446 // ending at last byte. Then shift
447 // bytes down using pshufb.
448 mval
= _mm_loadu_si128((void*)((char*)p
- 16 + size
));
449 mval
= _mm_shuffle_epi8(mval
, *(const __m128i
*)(&shifts
[size
*2]));
452 mval
= _mm_loadu_si128(p
);
455 // Scramble input, XOR in seed.
456 mval
= _mm_aesenc_si128(mval
, mseed
);
457 mval
= _mm_aesenc_si128(mval
, mval
);
458 mval
= _mm_aesenc_si128(mval
, mval
);
459 return _mm_cvtsi128_si32(mval
);
460 } else if (size
<= 32) {
461 // Make second starting seed.
462 mseed2
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 16));
463 mseed2
= _mm_aesenc_si128(mseed2
, mseed2
);
464 // Load data to be hashed.
465 mval
= _mm_loadu_si128(p
);
466 mval2
= _mm_loadu_si128((void*)((char*)p
+ size
- 16));
469 mval
= _mm_aesenc_si128(mval
, mseed
);
470 mval2
= _mm_aesenc_si128(mval2
, mseed2
);
471 mval
= _mm_aesenc_si128(mval
, mval
);
472 mval2
= _mm_aesenc_si128(mval2
, mval2
);
473 mval
= _mm_aesenc_si128(mval
, mval
);
474 mval2
= _mm_aesenc_si128(mval2
, mval2
);
478 return _mm_cvtsi128_si32(mval
);
479 } else if (size
<= 64) {
480 // Make 3 more starting seeds.
483 mseed2
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 16));
484 mseed3
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 32));
485 mseed4
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 48));
486 mseed2
= _mm_aesenc_si128(mseed2
, mseed2
);
487 mseed3
= _mm_aesenc_si128(mseed3
, mseed3
);
488 mseed4
= _mm_aesenc_si128(mseed4
, mseed4
);
490 mval
= _mm_loadu_si128(p
);
491 mval2
= _mm_loadu_si128((void*)((char*)p
+ 16));
492 mval3
= _mm_loadu_si128((void*)((char*)p
+ size
- 32));
493 mval4
= _mm_loadu_si128((void*)((char*)p
+ size
- 16));
495 mval
= _mm_aesenc_si128(mval
, mseed
);
496 mval2
= _mm_aesenc_si128(mval2
, mseed2
);
497 mval3
= _mm_aesenc_si128(mval3
, mseed3
);
498 mval4
= _mm_aesenc_si128(mval4
, mseed4
);
500 mval
= _mm_aesenc_si128(mval
, mval
);
501 mval2
= _mm_aesenc_si128(mval2
, mval2
);
502 mval3
= _mm_aesenc_si128(mval3
, mval3
);
503 mval4
= _mm_aesenc_si128(mval4
, mval4
);
505 mval
= _mm_aesenc_si128(mval
, mval
);
506 mval2
= _mm_aesenc_si128(mval2
, mval2
);
507 mval3
= _mm_aesenc_si128(mval3
, mval3
);
508 mval4
= _mm_aesenc_si128(mval4
, mval4
);
513 return _mm_cvtsi128_si32(mval
);
515 // Make 3 more starting seeds.
518 mseed2
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 16));
519 mseed3
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 32));
520 mseed4
^= _mm_loadu_si128((void*)((char*)aeskeysched
.__values
+ 48));
521 mseed2
= _mm_aesenc_si128(mseed2
, mseed2
);
522 mseed3
= _mm_aesenc_si128(mseed3
, mseed3
);
523 mseed4
= _mm_aesenc_si128(mseed4
, mseed4
);
525 // Start with last (possibly overlapping) block.
526 mval
= _mm_loadu_si128((void*)((char*)p
+ size
- 64));
527 mval2
= _mm_loadu_si128((void*)((char*)p
+ size
- 48));
528 mval3
= _mm_loadu_si128((void*)((char*)p
+ size
- 32));
529 mval4
= _mm_loadu_si128((void*)((char*)p
+ size
- 16));
531 // Scramble state once.
532 mval
= _mm_aesenc_si128(mval
, mseed
);
533 mval2
= _mm_aesenc_si128(mval2
, mseed2
);
534 mval3
= _mm_aesenc_si128(mval3
, mseed3
);
535 mval4
= _mm_aesenc_si128(mval4
, mseed4
);
537 // Compute number of remaining 64-byte blocks.
541 // Scramble state, XOR in a block.
542 mval
= _mm_aesenc_si128(mval
, _mm_loadu_si128(p
));
543 mval2
= _mm_aesenc_si128(mval2
, _mm_loadu_si128((void*)((char*)p
+ 16)));
544 mval3
= _mm_aesenc_si128(mval3
, _mm_loadu_si128((void*)((char*)p
+ 32)));
545 mval4
= _mm_aesenc_si128(mval4
, _mm_loadu_si128((void*)((char*)p
+ 48)));
548 mval
= _mm_aesenc_si128(mval
, mval
);
549 mval2
= _mm_aesenc_si128(mval2
, mval2
);
550 mval3
= _mm_aesenc_si128(mval3
, mval3
);
551 mval4
= _mm_aesenc_si128(mval4
, mval4
);
553 p
= (void*)((char*)p
+ 64);
554 } while (--size
> 0);
556 // 2 more scrambles to finish.
557 mval
= _mm_aesenc_si128(mval
, mval
);
558 mval2
= _mm_aesenc_si128(mval2
, mval2
);
559 mval3
= _mm_aesenc_si128(mval3
, mval3
);
560 mval4
= _mm_aesenc_si128(mval4
, mval4
);
562 mval
= _mm_aesenc_si128(mval
, mval
);
563 mval2
= _mm_aesenc_si128(mval2
, mval2
);
564 mval3
= _mm_aesenc_si128(mval3
, mval3
);
565 mval4
= _mm_aesenc_si128(mval4
, mval4
);
570 return _mm_cvtsi128_si32(mval
);
574 #endif // !defined(__x86_64__)
576 #elif defined(__aarch64__)
578 // Undefine some identifiers that we pick up from the Go runtime package that
579 // are used in arm_neon.h.
588 #include <arm_neon.h>
590 // Force appropriate CPU level. We won't call here unless the CPU
// supports the crypto extension enabled by the pragma below.
593 #pragma GCC target("+crypto")
595 // The arm64 version of aeshashbody.
597 uintptr
aeshashbody(void* p
, uintptr seed
, uintptr size
, Slice aeskeysched
) {
601 uint8x16_t vseed
, vseed2
, vseed3
, vseed4
;
602 uint8x16_t vseed5
, vseed6
, vseed7
, vseed8
;
603 uint8x16_t vval
, vval2
, vval3
, vval4
;
604 uint8x16_t vval5
, vval6
, vval7
, vval8
;
605 uint8x16_t vvalLoop
, vvalLoop2
, vvalLoop3
, vvalLoop4
;
606 uint8x16_t vvalLoop5
, vvalLoop6
, vvalLoop7
, vvalLoop8
;
608 uint8x16x3_t avseed3
;
610 pseed
= (uint8x16_t
*)(aeskeysched
.__values
);
612 // Combined hash seed and length.
613 vinit32
= vdupq_n_u32(0);
614 vinit32
[0] = (uint32
)seed
;
615 vinit32
[1] = (uint32
)size
;
616 vinit
= vreinterpretq_u8_u32(vinit32
);
618 // Mix in per-process seed.
619 vseed
= vaeseq_u8(*pseed
, vinit
);
622 vseed
= vaesmcq_u8(vseed
);
626 // Return 64 bits of scrambled input seed.
627 return vreinterpretq_u64_u8(vseed
)[0];
628 } else if (size
< 16) {
629 vval
= vreinterpretq_u8_u32(vdupq_n_u32(0));
630 if ((size
& 8) != 0) {
631 vval
= vreinterpretq_u8_u64(vld1q_lane_u64((uint64_t*)(p
), vreinterpretq_u64_u8(vval
), 0));
632 p
= (void*)((uint64_t*)(p
) + 1);
634 if ((size
& 4) != 0) {
635 vval
= vreinterpretq_u8_u32(vld1q_lane_u32((uint32_t*)(p
), vreinterpretq_u32_u8(vval
), 2));
636 p
= (void*)((uint32_t*)(p
) + 1);
638 if ((size
& 2) != 0) {
639 vval
= vreinterpretq_u8_u16(vld1q_lane_u16((uint16_t*)(p
), vreinterpretq_u16_u8(vval
), 6));
640 p
= (void*)((uint16_t*)(p
) + 1);
642 if ((size
& 1) != 0) {
643 vval
= vld1q_lane_u8((uint8
*)(p
), vval
, 14);
646 vval
= *(uint8x16_t
*)(p
);
648 vval
= vaeseq_u8(vval
, vseed
);
649 vval
= vaesmcq_u8(vval
);
650 vval
= vaeseq_u8(vval
, vseed
);
651 vval
= vaesmcq_u8(vval
);
652 vval
= vaeseq_u8(vval
, vseed
);
653 return vreinterpretq_u64_u8(vval
)[0];
654 } else if (size
<= 32) {
655 // Make a second seed.
656 vseed2
= vaeseq_u8(*pseed
, vinit
);
657 vseed2
= vaesmcq_u8(vseed2
);
658 vval
= *(uint8x16_t
*)(p
);
659 vval2
= *(uint8x16_t
*)((char*)(p
) + (size
- 16));
661 vval
= vaeseq_u8(vval
, vseed
);
662 vval
= vaesmcq_u8(vval
);
663 vval2
= vaeseq_u8(vval2
, vseed2
);
664 vval2
= vaesmcq_u8(vval2
);
666 vval
= vaeseq_u8(vval
, vseed
);
667 vval
= vaesmcq_u8(vval
);
668 vval2
= vaeseq_u8(vval2
, vseed2
);
669 vval2
= vaesmcq_u8(vval2
);
671 vval
= vaeseq_u8(vval
, vseed
);
672 vval2
= vaeseq_u8(vval2
, vseed2
);
676 return vreinterpretq_u64_u8(vval
)[0];
677 } else if (size
<= 64) {
678 avseed3
= vld1q_u8_x3((uint8
*)(pseed
));
679 vseed2
= avseed3
.val
[0];
680 vseed3
= avseed3
.val
[1];
681 vseed4
= avseed3
.val
[2];
683 vseed2
= vaeseq_u8(vseed2
, vinit
);
684 vseed2
= vaesmcq_u8(vseed2
);
685 vseed3
= vaeseq_u8(vseed3
, vinit
);
686 vseed3
= vaesmcq_u8(vseed3
);
687 vseed4
= vaeseq_u8(vseed4
, vinit
);
688 vseed4
= vaesmcq_u8(vseed4
);
690 avval2
= vld1q_u8_x2((uint8
*)(p
));
691 vval
= avval2
.val
[0];
692 vval2
= avval2
.val
[1];
693 avval2
= vld1q_u8_x2((uint8
*)(p
) + (size
- 32));
694 vval3
= avval2
.val
[0];
695 vval4
= avval2
.val
[1];
697 vval
= vaeseq_u8(vval
, vseed
);
698 vval
= vaesmcq_u8(vval
);
699 vval2
= vaeseq_u8(vval2
, vseed2
);
700 vval2
= vaesmcq_u8(vval2
);
701 vval3
= vaeseq_u8(vval3
, vseed3
);
702 vval3
= vaesmcq_u8(vval3
);
703 vval4
= vaeseq_u8(vval4
, vseed4
);
704 vval4
= vaesmcq_u8(vval4
);
706 vval
= vaeseq_u8(vval
, vseed
);
707 vval
= vaesmcq_u8(vval
);
708 vval2
= vaeseq_u8(vval2
, vseed2
);
709 vval2
= vaesmcq_u8(vval2
);
710 vval3
= vaeseq_u8(vval3
, vseed3
);
711 vval3
= vaesmcq_u8(vval3
);
712 vval4
= vaeseq_u8(vval4
, vseed4
);
713 vval4
= vaesmcq_u8(vval4
);
715 vval
= vaeseq_u8(vval
, vseed
);
716 vval2
= vaeseq_u8(vval2
, vseed2
);
717 vval3
= vaeseq_u8(vval3
, vseed3
);
718 vval4
= vaeseq_u8(vval4
, vseed4
);
724 return vreinterpretq_u64_u8(vval
)[0];
725 } else if (size
<= 128) {
726 // For some reason vld1q_u8_x4 is missing.
727 avseed3
= vld1q_u8_x3((uint8
*)(pseed
));
728 vseed2
= avseed3
.val
[0];
729 vseed3
= avseed3
.val
[1];
730 vseed4
= avseed3
.val
[2];
731 avseed3
= vld1q_u8_x3((uint8
*)(pseed
+ 3));
732 vseed5
= avseed3
.val
[0];
733 vseed6
= avseed3
.val
[1];
734 vseed7
= avseed3
.val
[2];
735 vseed8
= *(pseed
+ 6);
737 vseed2
= vaeseq_u8(vseed2
, vinit
);
738 vseed2
= vaesmcq_u8(vseed2
);
739 vseed3
= vaeseq_u8(vseed3
, vinit
);
740 vseed3
= vaesmcq_u8(vseed3
);
741 vseed4
= vaeseq_u8(vseed4
, vinit
);
742 vseed4
= vaesmcq_u8(vseed4
);
743 vseed5
= vaeseq_u8(vseed5
, vinit
);
744 vseed5
= vaesmcq_u8(vseed5
);
745 vseed6
= vaeseq_u8(vseed6
, vinit
);
746 vseed6
= vaesmcq_u8(vseed6
);
747 vseed7
= vaeseq_u8(vseed7
, vinit
);
748 vseed7
= vaesmcq_u8(vseed7
);
749 vseed8
= vaeseq_u8(vseed8
, vinit
);
750 vseed8
= vaesmcq_u8(vseed8
);
752 avval2
= vld1q_u8_x2((uint8
*)(p
));
753 vval
= avval2
.val
[0];
754 vval2
= avval2
.val
[1];
755 avval2
= vld1q_u8_x2((uint8
*)(p
) + 32);
756 vval3
= avval2
.val
[0];
757 vval4
= avval2
.val
[1];
758 avval2
= vld1q_u8_x2((uint8
*)(p
) + (size
- 64));
759 vval5
= avval2
.val
[0];
760 vval6
= avval2
.val
[1];
761 avval2
= vld1q_u8_x2((uint8
*)(p
) + (size
- 32));
762 vval7
= avval2
.val
[0];
763 vval8
= avval2
.val
[1];
765 vval
= vaeseq_u8(vval
, vseed
);
766 vval
= vaesmcq_u8(vval
);
767 vval2
= vaeseq_u8(vval2
, vseed2
);
768 vval2
= vaesmcq_u8(vval2
);
769 vval3
= vaeseq_u8(vval3
, vseed3
);
770 vval3
= vaesmcq_u8(vval3
);
771 vval4
= vaeseq_u8(vval4
, vseed4
);
772 vval4
= vaesmcq_u8(vval4
);
773 vval5
= vaeseq_u8(vval5
, vseed5
);
774 vval5
= vaesmcq_u8(vval5
);
775 vval6
= vaeseq_u8(vval6
, vseed6
);
776 vval6
= vaesmcq_u8(vval6
);
777 vval7
= vaeseq_u8(vval7
, vseed7
);
778 vval7
= vaesmcq_u8(vval7
);
779 vval8
= vaeseq_u8(vval8
, vseed8
);
780 vval8
= vaesmcq_u8(vval8
);
782 vval
= vaeseq_u8(vval
, vseed
);
783 vval
= vaesmcq_u8(vval
);
784 vval2
= vaeseq_u8(vval2
, vseed2
);
785 vval2
= vaesmcq_u8(vval2
);
786 vval3
= vaeseq_u8(vval3
, vseed3
);
787 vval3
= vaesmcq_u8(vval3
);
788 vval4
= vaeseq_u8(vval4
, vseed4
);
789 vval4
= vaesmcq_u8(vval4
);
790 vval5
= vaeseq_u8(vval5
, vseed5
);
791 vval5
= vaesmcq_u8(vval5
);
792 vval6
= vaeseq_u8(vval6
, vseed6
);
793 vval6
= vaesmcq_u8(vval6
);
794 vval7
= vaeseq_u8(vval7
, vseed7
);
795 vval7
= vaesmcq_u8(vval7
);
796 vval8
= vaeseq_u8(vval8
, vseed8
);
797 vval8
= vaesmcq_u8(vval8
);
799 vval
= vaeseq_u8(vval
, vseed
);
800 vval2
= vaeseq_u8(vval2
, vseed2
);
801 vval3
= vaeseq_u8(vval3
, vseed3
);
802 vval4
= vaeseq_u8(vval4
, vseed4
);
803 vval5
= vaeseq_u8(vval5
, vseed5
);
804 vval6
= vaeseq_u8(vval6
, vseed6
);
805 vval7
= vaeseq_u8(vval7
, vseed7
);
806 vval8
= vaeseq_u8(vval8
, vseed8
);
816 return vreinterpretq_u64_u8(vval
)[0];
818 // For some reason vld1q_u8_x4 is missing.
819 avseed3
= vld1q_u8_x3((uint8
*)(pseed
));
820 vseed2
= avseed3
.val
[0];
821 vseed3
= avseed3
.val
[1];
822 vseed4
= avseed3
.val
[2];
823 avseed3
= vld1q_u8_x3((uint8
*)(pseed
+ 3));
824 vseed5
= avseed3
.val
[0];
825 vseed6
= avseed3
.val
[1];
826 vseed7
= avseed3
.val
[2];
827 vseed8
= *(pseed
+ 6);
829 vseed2
= vaeseq_u8(vseed2
, vinit
);
830 vseed2
= vaesmcq_u8(vseed2
);
831 vseed3
= vaeseq_u8(vseed3
, vinit
);
832 vseed3
= vaesmcq_u8(vseed3
);
833 vseed4
= vaeseq_u8(vseed4
, vinit
);
834 vseed4
= vaesmcq_u8(vseed4
);
835 vseed5
= vaeseq_u8(vseed5
, vinit
);
836 vseed5
= vaesmcq_u8(vseed5
);
837 vseed6
= vaeseq_u8(vseed6
, vinit
);
838 vseed6
= vaesmcq_u8(vseed6
);
839 vseed7
= vaeseq_u8(vseed7
, vinit
);
840 vseed7
= vaesmcq_u8(vseed7
);
841 vseed8
= vaeseq_u8(vseed8
, vinit
);
842 vseed8
= vaesmcq_u8(vseed8
);
844 avval2
= vld1q_u8_x2((uint8
*)(p
) + (size
- 128));
845 vval
= avval2
.val
[0];
846 vval2
= avval2
.val
[1];
847 avval2
= vld1q_u8_x2((uint8
*)(p
) + (size
- 96));
848 vval3
= avval2
.val
[0];
849 vval4
= avval2
.val
[1];
850 avval2
= vld1q_u8_x2((uint8
*)(p
) + (size
- 64));
851 vval5
= avval2
.val
[0];
852 vval6
= avval2
.val
[1];
853 avval2
= vld1q_u8_x2((uint8
*)(p
) + (size
- 32));
854 vval7
= avval2
.val
[0];
855 vval8
= avval2
.val
[1];
869 vval
= vaeseq_u8(vval
, vvalLoop
);
870 vval
= vaesmcq_u8(vval
);
871 vval2
= vaeseq_u8(vval2
, vvalLoop2
);
872 vval2
= vaesmcq_u8(vval2
);
873 vval3
= vaeseq_u8(vval3
, vvalLoop3
);
874 vval3
= vaesmcq_u8(vval3
);
875 vval4
= vaeseq_u8(vval4
, vvalLoop4
);
876 vval4
= vaesmcq_u8(vval4
);
877 vval5
= vaeseq_u8(vval5
, vvalLoop5
);
878 vval5
= vaesmcq_u8(vval5
);
879 vval6
= vaeseq_u8(vval6
, vvalLoop6
);
880 vval6
= vaesmcq_u8(vval6
);
881 vval7
= vaeseq_u8(vval7
, vvalLoop7
);
882 vval7
= vaesmcq_u8(vval7
);
883 vval8
= vaeseq_u8(vval8
, vvalLoop8
);
884 vval8
= vaesmcq_u8(vval8
);
886 avval2
= vld1q_u8_x2((uint8
*)(p
));
887 vvalLoop
= avval2
.val
[0];
888 vvalLoop2
= avval2
.val
[1];
889 avval2
= vld1q_u8_x2((uint8
*)(p
) + 32);
890 vvalLoop3
= avval2
.val
[0];
891 vvalLoop4
= avval2
.val
[1];
892 avval2
= vld1q_u8_x2((uint8
*)(p
) + 64);
893 vvalLoop5
= avval2
.val
[0];
894 vvalLoop6
= avval2
.val
[1];
895 avval2
= vld1q_u8_x2((uint8
*)(p
) + 96);
896 vvalLoop7
= avval2
.val
[0];
897 vvalLoop8
= avval2
.val
[1];
899 p
= (void *)((uint8
*)(p
) + 128);
901 vval
= vaeseq_u8(vval
, vvalLoop
);
902 vval
= vaesmcq_u8(vval
);
903 vval2
= vaeseq_u8(vval2
, vvalLoop2
);
904 vval2
= vaesmcq_u8(vval2
);
905 vval3
= vaeseq_u8(vval3
, vvalLoop3
);
906 vval3
= vaesmcq_u8(vval3
);
907 vval4
= vaeseq_u8(vval4
, vvalLoop4
);
908 vval4
= vaesmcq_u8(vval4
);
909 vval5
= vaeseq_u8(vval5
, vvalLoop5
);
910 vval5
= vaesmcq_u8(vval5
);
911 vval6
= vaeseq_u8(vval6
, vvalLoop6
);
912 vval6
= vaesmcq_u8(vval6
);
913 vval7
= vaeseq_u8(vval7
, vvalLoop7
);
914 vval7
= vaesmcq_u8(vval7
);
915 vval8
= vaeseq_u8(vval8
, vvalLoop8
);
916 vval8
= vaesmcq_u8(vval8
);
917 } while (--size
> 0);
919 vval
= vaeseq_u8(vval
, vvalLoop
);
920 vval
= vaesmcq_u8(vval
);
921 vval2
= vaeseq_u8(vval2
, vvalLoop2
);
922 vval2
= vaesmcq_u8(vval2
);
923 vval3
= vaeseq_u8(vval3
, vvalLoop3
);
924 vval3
= vaesmcq_u8(vval3
);
925 vval4
= vaeseq_u8(vval4
, vvalLoop4
);
926 vval4
= vaesmcq_u8(vval4
);
927 vval5
= vaeseq_u8(vval5
, vvalLoop5
);
928 vval5
= vaesmcq_u8(vval5
);
929 vval6
= vaeseq_u8(vval6
, vvalLoop6
);
930 vval6
= vaesmcq_u8(vval6
);
931 vval7
= vaeseq_u8(vval7
, vvalLoop7
);
932 vval7
= vaesmcq_u8(vval7
);
933 vval8
= vaeseq_u8(vval8
, vvalLoop8
);
934 vval8
= vaesmcq_u8(vval8
);
937 vval
= vaeseq_u8(vval
, vvalLoop
);
938 vval
= vaesmcq_u8(vval
);
939 vval2
= vaeseq_u8(vval2
, vvalLoop2
);
940 vval2
= vaesmcq_u8(vval2
);
941 vval3
= vaeseq_u8(vval3
, vvalLoop3
);
942 vval3
= vaesmcq_u8(vval3
);
943 vval4
= vaeseq_u8(vval4
, vvalLoop4
);
944 vval4
= vaesmcq_u8(vval4
);
945 vval5
= vaeseq_u8(vval5
, vvalLoop5
);
946 vval5
= vaesmcq_u8(vval5
);
947 vval6
= vaeseq_u8(vval6
, vvalLoop6
);
948 vval6
= vaesmcq_u8(vval6
);
949 vval7
= vaeseq_u8(vval7
, vvalLoop7
);
950 vval7
= vaesmcq_u8(vval7
);
951 vval8
= vaeseq_u8(vval8
, vvalLoop8
);
952 vval8
= vaesmcq_u8(vval8
);
954 vval
= vaeseq_u8(vval
, vvalLoop
);
955 vval2
= vaeseq_u8(vval2
, vvalLoop2
);
956 vval3
= vaeseq_u8(vval3
, vvalLoop3
);
957 vval4
= vaeseq_u8(vval4
, vvalLoop4
);
958 vval5
= vaeseq_u8(vval5
, vvalLoop5
);
959 vval6
= vaeseq_u8(vval6
, vvalLoop6
);
960 vval7
= vaeseq_u8(vval7
, vvalLoop7
);
961 vval8
= vaeseq_u8(vval8
, vvalLoop8
);
971 return vreinterpretq_u64_u8(vval
)[0];
975 #else // (!defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)) && !defined(__aarch64__)
977 uintptr
aeshashbody(void* p
__attribute__((unused
)),
978 uintptr seed
__attribute__((unused
)),
979 uintptr size
__attribute__((unused
)),
980 Slice aeskeysched
__attribute__((unused
))) {
981 // We should never get here on a non-x86, non-arm64 system.
982 runtime_throw("impossible call to aeshashbody");
985 #endif // (!defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)) && !defined(__aarch64__)