// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Hash code using AES intrinsics.

#include "runtime.h"

uintptr aeshashbody(void*, uintptr, uintptr, Slice)
	__asm__(GOSYM_PREFIX "runtime.aeshashbody");

uintptr aeshashbody(void*, uintptr, uintptr, Slice)
	__attribute__((no_split_stack));

#if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_AS_X86_AES)

#include <emmintrin.h>
#include <tmmintrin.h>
#include <wmmintrin.h>

// Force appropriate CPU level. We won't call here unless the CPU
// supports it.

#pragma GCC target("ssse3", "aes")

#ifdef __x86_64__

// aeshashbody implements a hash function using AES instructions
// available in recent x86 processors. Note this is not encryption,
// just hashing.
//
// This is written to produce exactly the same results as the gc
// implementation, not because that matters, but just to ensure that
// this does something reasonable.
uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
	__m128i mseed, mseed2, mseed3, mseed4, mseed5, mseed6, mseed7, mseed8;
	__m128i mval, mval2, mval3, mval4, mval5, mval6, mval7, mval8;

	// Start with hash seed.
	mseed = _mm_cvtsi64_si128(seed);
	// Get 16 bits of length.
	mseed = _mm_insert_epi16(mseed, size, 4);
	// Repeat length 4 times total.
	mseed = _mm_shufflehi_epi16(mseed, 0);
	// Save unscrambled seed.
	mseed2 = mseed;
	// XOR in per-process seed.
	mseed ^= _mm_loadu_si128(aeskeysched.__values);
	// Scramble seed.
	mseed = _mm_aesenc_si128(mseed, mseed);
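
	// mseed2 now holds the caller's seed in its low 64 bits with the low
	// 16 bits of size repeated in its top four 16-bit lanes; mseed is the
	// same value XORed with the per-process key and put through one AES
	// round before it is combined with the data below.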

	if (size <= 16) {
		if (size == 0) {
			// Return scrambled input seed.
			return _mm_cvtsi128_si64(_mm_aesenc_si128(mseed, mseed));
		} else if (size < 16) {
			if ((((uintptr)(p) + 16) & 0xff0) != 0) {
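				// ((uintptr)p + 16) & 0xff0 is zero only when p lies in
				// the last 16 bytes of a 4 KB page, the only case where
				// a 16-byte load at p might touch the next page. Here
				// it is safe to over-read and mask: masks[size*2] has
				// exactly `size` low bytes set, so the bytes past the
				// end of the input are zeroed.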
				static const uint64 masks[32]
				  __attribute__ ((aligned(16))) =
				  {
				    0x0000000000000000, 0x0000000000000000,
				    0x00000000000000ff, 0x0000000000000000,
				    0x000000000000ffff, 0x0000000000000000,
				    0x0000000000ffffff, 0x0000000000000000,
				    0x00000000ffffffff, 0x0000000000000000,
				    0x000000ffffffffff, 0x0000000000000000,
				    0x0000ffffffffffff, 0x0000000000000000,
				    0x00ffffffffffffff, 0x0000000000000000,
				    0xffffffffffffffff, 0x0000000000000000,
				    0xffffffffffffffff, 0x00000000000000ff,
				    0xffffffffffffffff, 0x000000000000ffff,
				    0xffffffffffffffff, 0x0000000000ffffff,
				    0xffffffffffffffff, 0x00000000ffffffff,
				    0xffffffffffffffff, 0x000000ffffffffff,
				    0xffffffffffffffff, 0x0000ffffffffffff,
				    0xffffffffffffffff, 0x00ffffffffffffff
				  };

				// 16 bytes loaded at p won't cross a page
				// boundary, so we can load directly.
				mval = _mm_loadu_si128(p);
				mval &= *(const __m128i*)(&masks[size*2]);
			} else {
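				// p is within 16 bytes of a page boundary, so a 16-byte
				// load at p could fault. Instead load the 16 bytes that
				// end at the last input byte and shift the input down to
				// the low bytes with pshufb: shifts[size*2] holds byte
				// indices 16-size..15 in its low `size` lanes and 0xff
				// (which pshufb turns into zero) elsewhere.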
				static const uint64 shifts[32]
				  __attribute__ ((aligned(16))) =
				  {
				    0x0000000000000000, 0x0000000000000000,
				    0xffffffffffffff0f, 0xffffffffffffffff,
				    0xffffffffffff0f0e, 0xffffffffffffffff,
				    0xffffffffff0f0e0d, 0xffffffffffffffff,
				    0xffffffff0f0e0d0c, 0xffffffffffffffff,
				    0xffffff0f0e0d0c0b, 0xffffffffffffffff,
				    0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
				    0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
				    0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
				    0x0e0d0c0b0a090807, 0xffffffffffffff0f,
				    0x0d0c0b0a09080706, 0xffffffffffff0f0e,
				    0x0c0b0a0908070605, 0xffffffffff0f0e0d,
				    0x0b0a090807060504, 0xffffffff0f0e0d0c,
				    0x0a09080706050403, 0xffffff0f0e0d0c0b,
				    0x0908070605040302, 0xffff0f0e0d0c0b0a,
				    0x0807060504030201, 0xff0f0e0d0c0b0a09,
				  };

				// address ends in 1111xxxx. Might be
				// up against a page boundary, so load
				// ending at last byte. Then shift
				// bytes down using pshufb.
				mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
				mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
			}
		} else {
			mval = _mm_loadu_si128(p);
		}

		// XOR data with seed.
		mval ^= mseed;
		// Scramble combo 3 times.
		mval = _mm_aesenc_si128(mval, mval);
		mval = _mm_aesenc_si128(mval, mval);
		mval = _mm_aesenc_si128(mval, mval);
		return _mm_cvtsi128_si64(mval);
	} else if (size <= 32) {
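		// Hash the first 16 and the last 16 bytes with independent
		// seeds. For sizes 17..31 the two loads overlap; every byte is
		// still covered and no variable-length load is needed.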
		// Make second starting seed.
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		// Load data to be hashed.
		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
		// XOR with seed.
		mval ^= mseed;
		mval2 ^= mseed2;
		// Scramble 3 times.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		// Combine results.
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	} else if (size <= 64) {
		// Make 3 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);

		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));

		mval ^= mseed;
		mval2 ^= mseed2;
		mval3 ^= mseed3;
		mval4 ^= mseed4;

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	} else if (size <= 128) {
		// Make 7 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed5 = mseed2;
		mseed6 = mseed2;
		mseed7 = mseed2;
		mseed8 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
		mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
		mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
		mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
		mseed5 = _mm_aesenc_si128(mseed5, mseed5);
		mseed6 = _mm_aesenc_si128(mseed6, mseed6);
		mseed7 = _mm_aesenc_si128(mseed7, mseed7);
		mseed8 = _mm_aesenc_si128(mseed8, mseed8);

		// Load data.
		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
		mval3 = _mm_loadu_si128((void*)((char*)p + 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + 48));
		mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
		mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
		mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// XOR with seed.
		mval ^= mseed;
		mval2 ^= mseed2;
		mval3 ^= mseed3;
		mval4 ^= mseed4;
		mval5 ^= mseed5;
		mval6 ^= mseed6;
		mval7 ^= mseed7;
		mval8 ^= mseed8;

		// Scramble 3 times.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		// Combine results.
		mval ^= mval5;
		mval2 ^= mval6;
		mval3 ^= mval7;
		mval4 ^= mval8;
		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	} else {
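		// More than 128 bytes: run eight 16-byte lanes in parallel.
		// Seed each lane, start the state from the last (possibly
		// overlapping) 128-byte block, then fold in the remaining full
		// 128-byte blocks from the front of the buffer in a loop.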
		// Make 7 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed5 = mseed2;
		mseed6 = mseed2;
		mseed7 = mseed2;
		mseed8 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
		mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
		mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
		mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
		mseed5 = _mm_aesenc_si128(mseed5, mseed5);
		mseed6 = _mm_aesenc_si128(mseed6, mseed6);
		mseed7 = _mm_aesenc_si128(mseed7, mseed7);
		mseed8 = _mm_aesenc_si128(mseed8, mseed8);

		// Start with last (possibly overlapping) block.
		mval = _mm_loadu_si128((void*)((char*)p + size - 128));
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 112));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 96));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 80));
		mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
		mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
		mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// XOR in seed.
		mval ^= mseed;
		mval2 ^= mseed2;
		mval3 ^= mseed3;
		mval4 ^= mseed4;
		mval5 ^= mseed5;
		mval6 ^= mseed6;
		mval7 ^= mseed7;
		mval8 ^= mseed8;

		// Compute number of remaining 128-byte blocks.
		size--;
		size >>= 7;
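		// (size-1) >> 7 full blocks from the start of the buffer,
		// together with the trailing block loaded above, cover every
		// input byte; e.g. sizes 129..256 give one loop iteration.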
		do {
			// Scramble state.
			mval = _mm_aesenc_si128(mval, mval);
			mval2 = _mm_aesenc_si128(mval2, mval2);
			mval3 = _mm_aesenc_si128(mval3, mval3);
			mval4 = _mm_aesenc_si128(mval4, mval4);
			mval5 = _mm_aesenc_si128(mval5, mval5);
			mval6 = _mm_aesenc_si128(mval6, mval6);
			mval7 = _mm_aesenc_si128(mval7, mval7);
			mval8 = _mm_aesenc_si128(mval8, mval8);

			// Scramble state, XOR in a block.
			mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
			mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
			mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
			mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
			mval5 = _mm_aesenc_si128(mval5, _mm_loadu_si128((void*)((char*)p + 64)));
			mval6 = _mm_aesenc_si128(mval6, _mm_loadu_si128((void*)((char*)p + 80)));
			mval7 = _mm_aesenc_si128(mval7, _mm_loadu_si128((void*)((char*)p + 96)));
			mval8 = _mm_aesenc_si128(mval8, _mm_loadu_si128((void*)((char*)p + 112)));

			p = (void*)((char*)p + 128);
		} while (--size > 0);

		// 3 more scrambles to finish.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		mval ^= mval5;
		mval2 ^= mval6;
		mval3 ^= mval7;
		mval4 ^= mval8;
		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	}
}

#else // !defined(__x86_64__)

// The 32-bit version of aeshashbody.

uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
	__m128i mseed, mseed2, mseed3, mseed4;
	__m128i mval, mval2, mval3, mval4;

	// Start with hash seed.
	mseed = _mm_cvtsi32_si128(seed);
	// Get 16 bits of length.
	mseed = _mm_insert_epi16(mseed, size, 4);
	// Replace size with its low 2 bytes repeated 4 times.
	mseed = _mm_shufflehi_epi16(mseed, 0);
	// Save unscrambled seed.
	mseed2 = mseed;
	// XOR in per-process seed.
	mseed ^= _mm_loadu_si128(aeskeysched.__values);
	// Scramble seed.
	mseed = _mm_aesenc_si128(mseed, mseed);
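
	// Same structure as the 64-bit version, but the seed is only 32 bits,
	// results are truncated with _mm_cvtsi128_si32, and the data is mixed
	// with its seed via _mm_aesenc_si128 rather than an XOR followed by a
	// self-keyed round.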

	if (size <= 16) {
		if (size == 0) {
			// Return scrambled input seed.
			return _mm_cvtsi128_si32(_mm_aesenc_si128(mseed, mseed));
		} else if (size < 16) {
			if ((((uintptr)(p) + 16) & 0xff0) != 0) {
				static const uint64 masks[32]
				  __attribute__ ((aligned(16))) =
				  {
				    0x0000000000000000, 0x0000000000000000,
				    0x00000000000000ff, 0x0000000000000000,
				    0x000000000000ffff, 0x0000000000000000,
				    0x0000000000ffffff, 0x0000000000000000,
				    0x00000000ffffffff, 0x0000000000000000,
				    0x000000ffffffffff, 0x0000000000000000,
				    0x0000ffffffffffff, 0x0000000000000000,
				    0x00ffffffffffffff, 0x0000000000000000,
				    0xffffffffffffffff, 0x0000000000000000,
				    0xffffffffffffffff, 0x00000000000000ff,
				    0xffffffffffffffff, 0x000000000000ffff,
				    0xffffffffffffffff, 0x0000000000ffffff,
				    0xffffffffffffffff, 0x00000000ffffffff,
				    0xffffffffffffffff, 0x000000ffffffffff,
				    0xffffffffffffffff, 0x0000ffffffffffff,
				    0xffffffffffffffff, 0x00ffffffffffffff
				  };

				// 16 bytes loaded at p won't cross a page
				// boundary, so we can load it directly.
				mval = _mm_loadu_si128(p);
				mval &= *(const __m128i*)(&masks[size*2]);
			} else {
				static const uint64 shifts[32]
				  __attribute__ ((aligned(16))) =
				  {
				    0x0000000000000000, 0x0000000000000000,
				    0xffffffffffffff0f, 0xffffffffffffffff,
				    0xffffffffffff0f0e, 0xffffffffffffffff,
				    0xffffffffff0f0e0d, 0xffffffffffffffff,
				    0xffffffff0f0e0d0c, 0xffffffffffffffff,
				    0xffffff0f0e0d0c0b, 0xffffffffffffffff,
				    0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
				    0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
				    0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
				    0x0e0d0c0b0a090807, 0xffffffffffffff0f,
				    0x0d0c0b0a09080706, 0xffffffffffff0f0e,
				    0x0c0b0a0908070605, 0xffffffffff0f0e0d,
				    0x0b0a090807060504, 0xffffffff0f0e0d0c,
				    0x0a09080706050403, 0xffffff0f0e0d0c0b,
				    0x0908070605040302, 0xffff0f0e0d0c0b0a,
				    0x0807060504030201, 0xff0f0e0d0c0b0a09,
				  };

				// address ends in 1111xxxx. Might be
				// up against a page boundary, so load
				// ending at last byte. Then shift
				// bytes down using pshufb.
				mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
				mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
			}
		} else {
			mval = _mm_loadu_si128(p);
		}

		// Scramble input, XOR in seed.
		mval = _mm_aesenc_si128(mval, mseed);
		mval = _mm_aesenc_si128(mval, mval);
		mval = _mm_aesenc_si128(mval, mval);
		return _mm_cvtsi128_si32(mval);
	} else if (size <= 32) {
		// Make second starting seed.
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		// Load data to be hashed.
		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// Scramble 3 times.
		mval = _mm_aesenc_si128(mval, mseed);
		mval2 = _mm_aesenc_si128(mval2, mseed2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);

		// Combine results.
		mval ^= mval2;
		return _mm_cvtsi128_si32(mval);
	} else if (size <= 64) {
		// Make 3 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);

		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));

		mval = _mm_aesenc_si128(mval, mseed);
		mval2 = _mm_aesenc_si128(mval2, mseed2);
		mval3 = _mm_aesenc_si128(mval3, mseed3);
		mval4 = _mm_aesenc_si128(mval4, mseed4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si32(mval);
	} else {
		// Make 3 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);

		// Start with last (possibly overlapping) block.
		mval = _mm_loadu_si128((void*)((char*)p + size - 64));
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 48));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// Scramble state once.
		mval = _mm_aesenc_si128(mval, mseed);
		mval2 = _mm_aesenc_si128(mval2, mseed2);
		mval3 = _mm_aesenc_si128(mval3, mseed3);
		mval4 = _mm_aesenc_si128(mval4, mseed4);

		// Compute number of remaining 64-byte blocks.
		size--;
		size >>= 6;
		do {
			// Scramble state, XOR in a block.
			mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
			mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
			mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
			mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));

			// Scramble state.
			mval = _mm_aesenc_si128(mval, mval);
			mval2 = _mm_aesenc_si128(mval2, mval2);
			mval3 = _mm_aesenc_si128(mval3, mval3);
			mval4 = _mm_aesenc_si128(mval4, mval4);

			p = (void*)((char*)p + 64);
		} while (--size > 0);

		// 2 more scrambles to finish.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si32(mval);
	}
}

#endif // !defined(__x86_64__)

#elif defined(__aarch64__)

// Undefine some identifiers that we pick up from the Go runtime package that
// are used in arm_neon.h.

#undef t1
#undef tx
#undef t2
#undef t3
#undef t4
#undef t5

#include <arm_neon.h>

// Force appropriate CPU level. We won't call here unless the CPU
// supports it.

#pragma GCC target("+crypto")

// The arm64 version of aeshashbody.

uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
	uint8x16_t *pseed;
	uint32x4_t vinit32;
	uint8x16_t vinit;
	uint8x16_t vseed, vseed2, vseed3, vseed4;
	uint8x16_t vseed5, vseed6, vseed7, vseed8;
	uint8x16_t vval, vval2, vval3, vval4;
	uint8x16_t vval5, vval6, vval7, vval8;
	uint8x16_t vvalLoop, vvalLoop2, vvalLoop3, vvalLoop4;
	uint8x16_t vvalLoop5, vvalLoop6, vvalLoop7, vvalLoop8;
	uint8x16x2_t avval2;
	uint8x16x3_t avseed3;

	pseed = (uint8x16_t*)(aeskeysched.__values);

	// Combined hash seed and length.
	vinit32 = vdupq_n_u32(0);
	vinit32[0] = (uint32)seed;
	vinit32[1] = (uint32)size;
	vinit = vreinterpretq_u8_u32(vinit32);

	// Mix in per-process seed.
	vseed = vaeseq_u8(*pseed, vinit);
	++pseed;
	// Scramble seed.
	vseed = vaesmcq_u8(vseed);
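
	// On AArch64 the mixing primitive is the AESE/AESMC pair: vaeseq_u8
	// XORs its second argument into the state and then applies SubBytes
	// and ShiftRows, and vaesmcq_u8 applies MixColumns. There is no
	// separate XOR step as in the SSE code; seed and data are folded in
	// as the key argument of vaeseq_u8.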

	if (size <= 16) {
		if (size == 0) {
			// Return 64 bits of scrambled input seed.
			return vreinterpretq_u64_u8(vseed)[0];
		} else if (size < 16) {
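			// Build the value with lane loads instead of a masked
			// over-read: the 8-, 4-, 2- and 1-byte pieces selected by
			// the bits of size go into fixed, non-overlapping lanes of
			// vval, so no byte past the end of the input is read.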
			vval = vreinterpretq_u8_u32(vdupq_n_u32(0));
			if ((size & 8) != 0) {
				vval = vreinterpretq_u8_u64(vld1q_lane_u64((uint64_t*)(p), vreinterpretq_u64_u8(vval), 0));
				p = (void*)((uint64_t*)(p) + 1);
			}
			if ((size & 4) != 0) {
				vval = vreinterpretq_u8_u32(vld1q_lane_u32((uint32_t*)(p), vreinterpretq_u32_u8(vval), 2));
				p = (void*)((uint32_t*)(p) + 1);
			}
			if ((size & 2) != 0) {
				vval = vreinterpretq_u8_u16(vld1q_lane_u16((uint16_t*)(p), vreinterpretq_u16_u8(vval), 6));
				p = (void*)((uint16_t*)(p) + 1);
			}
			if ((size & 1) != 0) {
				vval = vld1q_lane_u8((uint8*)(p), vval, 14);
			}
		} else {
			vval = *(uint8x16_t*)(p);
		}
		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval = vaeseq_u8(vval, vseed);
		return vreinterpretq_u64_u8(vval)[0];
	} else if (size <= 32) {
		// Make a second seed.
		vseed2 = vaeseq_u8(*pseed, vinit);
		vseed2 = vaesmcq_u8(vseed2);
		vval = *(uint8x16_t*)(p);
		vval2 = *(uint8x16_t*)((char*)(p) + (size - 16));

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);

		vval = vaeseq_u8(vval, vseed);
		vval2 = vaeseq_u8(vval2, vseed2);

		vval ^= vval2;

		return vreinterpretq_u64_u8(vval)[0];
	} else if (size <= 64) {
		avseed3 = vld1q_u8_x3((uint8*)(pseed));
		vseed2 = avseed3.val[0];
		vseed3 = avseed3.val[1];
		vseed4 = avseed3.val[2];

		vseed2 = vaeseq_u8(vseed2, vinit);
		vseed2 = vaesmcq_u8(vseed2);
		vseed3 = vaeseq_u8(vseed3, vinit);
		vseed3 = vaesmcq_u8(vseed3);
		vseed4 = vaeseq_u8(vseed4, vinit);
		vseed4 = vaesmcq_u8(vseed4);

		avval2 = vld1q_u8_x2((uint8*)(p));
		vval = avval2.val[0];
		vval2 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
		vval3 = avval2.val[0];
		vval4 = avval2.val[1];

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval4 = vaesmcq_u8(vval4);

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval4 = vaesmcq_u8(vval4);

		vval = vaeseq_u8(vval, vseed);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval4 = vaeseq_u8(vval4, vseed4);

		vval ^= vval3;
		vval2 ^= vval4;
		vval ^= vval2;

		return vreinterpretq_u64_u8(vval)[0];
	} else if (size <= 128) {
		// For some reason vld1q_u8_x4 is missing.
		avseed3 = vld1q_u8_x3((uint8*)(pseed));
		vseed2 = avseed3.val[0];
		vseed3 = avseed3.val[1];
		vseed4 = avseed3.val[2];
		avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
		vseed5 = avseed3.val[0];
		vseed6 = avseed3.val[1];
		vseed7 = avseed3.val[2];
		vseed8 = *(pseed + 6);

		vseed2 = vaeseq_u8(vseed2, vinit);
		vseed2 = vaesmcq_u8(vseed2);
		vseed3 = vaeseq_u8(vseed3, vinit);
		vseed3 = vaesmcq_u8(vseed3);
		vseed4 = vaeseq_u8(vseed4, vinit);
		vseed4 = vaesmcq_u8(vseed4);
		vseed5 = vaeseq_u8(vseed5, vinit);
		vseed5 = vaesmcq_u8(vseed5);
		vseed6 = vaeseq_u8(vseed6, vinit);
		vseed6 = vaesmcq_u8(vseed6);
		vseed7 = vaeseq_u8(vseed7, vinit);
		vseed7 = vaesmcq_u8(vseed7);
		vseed8 = vaeseq_u8(vseed8, vinit);
		vseed8 = vaesmcq_u8(vseed8);

		avval2 = vld1q_u8_x2((uint8*)(p));
		vval = avval2.val[0];
		vval2 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + 32);
		vval3 = avval2.val[0];
		vval4 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
		vval5 = avval2.val[0];
		vval6 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
		vval7 = avval2.val[0];
		vval8 = avval2.val[1];

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval4 = vaesmcq_u8(vval4);
		vval5 = vaeseq_u8(vval5, vseed5);
		vval5 = vaesmcq_u8(vval5);
		vval6 = vaeseq_u8(vval6, vseed6);
		vval6 = vaesmcq_u8(vval6);
		vval7 = vaeseq_u8(vval7, vseed7);
		vval7 = vaesmcq_u8(vval7);
		vval8 = vaeseq_u8(vval8, vseed8);
		vval8 = vaesmcq_u8(vval8);

		vval = vaeseq_u8(vval, vseed);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval4 = vaesmcq_u8(vval4);
		vval5 = vaeseq_u8(vval5, vseed5);
		vval5 = vaesmcq_u8(vval5);
		vval6 = vaeseq_u8(vval6, vseed6);
		vval6 = vaesmcq_u8(vval6);
		vval7 = vaeseq_u8(vval7, vseed7);
		vval7 = vaesmcq_u8(vval7);
		vval8 = vaeseq_u8(vval8, vseed8);
		vval8 = vaesmcq_u8(vval8);

		vval = vaeseq_u8(vval, vseed);
		vval2 = vaeseq_u8(vval2, vseed2);
		vval3 = vaeseq_u8(vval3, vseed3);
		vval4 = vaeseq_u8(vval4, vseed4);
		vval5 = vaeseq_u8(vval5, vseed5);
		vval6 = vaeseq_u8(vval6, vseed6);
		vval7 = vaeseq_u8(vval7, vseed7);
		vval8 = vaeseq_u8(vval8, vseed8);

		vval ^= vval5;
		vval2 ^= vval6;
		vval3 ^= vval7;
		vval4 ^= vval8;
		vval ^= vval3;
		vval2 ^= vval4;
		vval ^= vval2;

		return vreinterpretq_u64_u8(vval)[0];
	} else {
		// For some reason vld1q_u8_x4 is missing.
		avseed3 = vld1q_u8_x3((uint8*)(pseed));
		vseed2 = avseed3.val[0];
		vseed3 = avseed3.val[1];
		vseed4 = avseed3.val[2];
		avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
		vseed5 = avseed3.val[0];
		vseed6 = avseed3.val[1];
		vseed7 = avseed3.val[2];
		vseed8 = *(pseed + 6);

		vseed2 = vaeseq_u8(vseed2, vinit);
		vseed2 = vaesmcq_u8(vseed2);
		vseed3 = vaeseq_u8(vseed3, vinit);
		vseed3 = vaesmcq_u8(vseed3);
		vseed4 = vaeseq_u8(vseed4, vinit);
		vseed4 = vaesmcq_u8(vseed4);
		vseed5 = vaeseq_u8(vseed5, vinit);
		vseed5 = vaesmcq_u8(vseed5);
		vseed6 = vaeseq_u8(vseed6, vinit);
		vseed6 = vaesmcq_u8(vseed6);
		vseed7 = vaeseq_u8(vseed7, vinit);
		vseed7 = vaesmcq_u8(vseed7);
		vseed8 = vaeseq_u8(vseed8, vinit);
		vseed8 = vaesmcq_u8(vseed8);

		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 128));
		vval = avval2.val[0];
		vval2 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 96));
		vval3 = avval2.val[0];
		vval4 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
		vval5 = avval2.val[0];
		vval6 = avval2.val[1];
		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
		vval7 = avval2.val[0];
		vval8 = avval2.val[1];

		vvalLoop = vseed;
		vvalLoop2 = vseed2;
		vvalLoop3 = vseed3;
		vvalLoop4 = vseed4;
		vvalLoop5 = vseed5;
		vvalLoop6 = vseed6;
		vvalLoop7 = vseed7;
		vvalLoop8 = vseed8;
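
		// The vvalLoop registers hold the value mixed in on the next
		// round: the seeds on the first pass, then each freshly loaded
		// 128-byte block. Each iteration scrambles the state with the
		// carried values, loads the next block, and mixes it in; the
		// extra rounds after the loop reuse the last block as the key.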

		size--;
		size >>= 7;
		do {
			vval = vaeseq_u8(vval, vvalLoop);
			vval = vaesmcq_u8(vval);
			vval2 = vaeseq_u8(vval2, vvalLoop2);
			vval2 = vaesmcq_u8(vval2);
			vval3 = vaeseq_u8(vval3, vvalLoop3);
			vval3 = vaesmcq_u8(vval3);
			vval4 = vaeseq_u8(vval4, vvalLoop4);
			vval4 = vaesmcq_u8(vval4);
			vval5 = vaeseq_u8(vval5, vvalLoop5);
			vval5 = vaesmcq_u8(vval5);
			vval6 = vaeseq_u8(vval6, vvalLoop6);
			vval6 = vaesmcq_u8(vval6);
			vval7 = vaeseq_u8(vval7, vvalLoop7);
			vval7 = vaesmcq_u8(vval7);
			vval8 = vaeseq_u8(vval8, vvalLoop8);
			vval8 = vaesmcq_u8(vval8);

			avval2 = vld1q_u8_x2((uint8*)(p));
			vvalLoop = avval2.val[0];
			vvalLoop2 = avval2.val[1];
			avval2 = vld1q_u8_x2((uint8*)(p) + 32);
			vvalLoop3 = avval2.val[0];
			vvalLoop4 = avval2.val[1];
			avval2 = vld1q_u8_x2((uint8*)(p) + 64);
			vvalLoop5 = avval2.val[0];
			vvalLoop6 = avval2.val[1];
			avval2 = vld1q_u8_x2((uint8*)(p) + 96);
			vvalLoop7 = avval2.val[0];
			vvalLoop8 = avval2.val[1];

			p = (void *)((uint8*)(p) + 128);

			vval = vaeseq_u8(vval, vvalLoop);
			vval = vaesmcq_u8(vval);
			vval2 = vaeseq_u8(vval2, vvalLoop2);
			vval2 = vaesmcq_u8(vval2);
			vval3 = vaeseq_u8(vval3, vvalLoop3);
			vval3 = vaesmcq_u8(vval3);
			vval4 = vaeseq_u8(vval4, vvalLoop4);
			vval4 = vaesmcq_u8(vval4);
			vval5 = vaeseq_u8(vval5, vvalLoop5);
			vval5 = vaesmcq_u8(vval5);
			vval6 = vaeseq_u8(vval6, vvalLoop6);
			vval6 = vaesmcq_u8(vval6);
			vval7 = vaeseq_u8(vval7, vvalLoop7);
			vval7 = vaesmcq_u8(vval7);
			vval8 = vaeseq_u8(vval8, vvalLoop8);
			vval8 = vaesmcq_u8(vval8);
		} while (--size > 0);

		vval = vaeseq_u8(vval, vvalLoop);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vvalLoop2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vvalLoop3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vvalLoop4);
		vval4 = vaesmcq_u8(vval4);
		vval5 = vaeseq_u8(vval5, vvalLoop5);
		vval5 = vaesmcq_u8(vval5);
		vval6 = vaeseq_u8(vval6, vvalLoop6);
		vval6 = vaesmcq_u8(vval6);
		vval7 = vaeseq_u8(vval7, vvalLoop7);
		vval7 = vaesmcq_u8(vval7);
		vval8 = vaeseq_u8(vval8, vvalLoop8);
		vval8 = vaesmcq_u8(vval8);

		vval = vaeseq_u8(vval, vvalLoop);
		vval = vaesmcq_u8(vval);
		vval2 = vaeseq_u8(vval2, vvalLoop2);
		vval2 = vaesmcq_u8(vval2);
		vval3 = vaeseq_u8(vval3, vvalLoop3);
		vval3 = vaesmcq_u8(vval3);
		vval4 = vaeseq_u8(vval4, vvalLoop4);
		vval4 = vaesmcq_u8(vval4);
		vval5 = vaeseq_u8(vval5, vvalLoop5);
		vval5 = vaesmcq_u8(vval5);
		vval6 = vaeseq_u8(vval6, vvalLoop6);
		vval6 = vaesmcq_u8(vval6);
		vval7 = vaeseq_u8(vval7, vvalLoop7);
		vval7 = vaesmcq_u8(vval7);
		vval8 = vaeseq_u8(vval8, vvalLoop8);
		vval8 = vaesmcq_u8(vval8);

		vval = vaeseq_u8(vval, vvalLoop);
		vval2 = vaeseq_u8(vval2, vvalLoop2);
		vval3 = vaeseq_u8(vval3, vvalLoop3);
		vval4 = vaeseq_u8(vval4, vvalLoop4);
		vval5 = vaeseq_u8(vval5, vvalLoop5);
		vval6 = vaeseq_u8(vval6, vvalLoop6);
		vval7 = vaeseq_u8(vval7, vvalLoop7);
		vval8 = vaeseq_u8(vval8, vvalLoop8);

		vval ^= vval5;
		vval2 ^= vval6;
		vval3 ^= vval7;
		vval4 ^= vval8;
		vval ^= vval3;
		vval2 ^= vval4;
		vval ^= vval2;

		return vreinterpretq_u64_u8(vval)[0];
	}
}

#else // (!defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)) && !defined(__aarch64__)

uintptr aeshashbody(void* p __attribute__((unused)),
		    uintptr seed __attribute__((unused)),
		    uintptr size __attribute__((unused)),
		    Slice aeskeysched __attribute__((unused))) {
	// We should never get here on a non-x86, non-arm64 system.
	runtime_throw("impossible call to aeshashbody");
}

#endif // !defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)