libgo/runtime/aeshash.c
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Hash code using AES intrinsics.

#include "runtime.h"

uintptr aeshashbody(void*, uintptr, uintptr, Slice)
	__asm__(GOSYM_PREFIX "runtime.aeshashbody");

uintptr aeshashbody(void*, uintptr, uintptr, Slice)
	__attribute__((no_split_stack));

#if (defined(__i386__) || defined(__x86_64__)) && defined(HAVE_AS_X86_AES)

#include <emmintrin.h>
#include <tmmintrin.h>
#include <wmmintrin.h>

// Force appropriate CPU level. We won't call here unless the CPU
// supports it.

#pragma GCC target("ssse3", "aes")
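// (Note: the target pragma lets GCC accept the SSSE3/AES intrinsics
// below even when the translation unit itself is not compiled with
// -mssse3 -maes; per the comment above, callers only reach this code
// after a runtime CPU-feature check.)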

#ifdef __x86_64__

// aeshashbody implements a hash function using AES instructions
// available in recent x86 processors. Note this is not encryption,
// just hashing.

// This is written to produce exactly the same results as the gc
// implementation, not because that matters, but just to ensure that
// this does something reasonable.
uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
	__m128i mseed, mseed2, mseed3, mseed4, mseed5, mseed6, mseed7, mseed8;
	__m128i mval, mval2, mval3, mval4, mval5, mval6, mval7, mval8;

	// Start with hash seed.
	mseed = _mm_cvtsi64_si128(seed);
	// Get 16 bits of length.
	mseed = _mm_insert_epi16(mseed, size, 4);
	// Repeat length 4 times total.
	mseed = _mm_shufflehi_epi16(mseed, 0);
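	// (_mm_insert_epi16 puts the low 16 bits of size into 16-bit lane 4;
	// _mm_shufflehi_epi16 with control 0 then broadcasts lane 4 across
	// lanes 4-7, so the high quadword holds the length four times.)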
45 // Save unscrambled seed.
46 mseed2 = mseed;
47 // XOR in per-process seed.
48 mseed ^= _mm_loadu_si128(aeskeysched.__values);
49 // Scramble seed.
50 mseed = _mm_aesenc_si128(mseed, mseed);

	if (size <= 16) {
		if (size == 0) {
			// Return scrambled input seed.
			return _mm_cvtsi128_si64(_mm_aesenc_si128(mseed, mseed));
		} else if (size < 16) {
			if ((((uintptr)(p) + 16) & 0xff0) != 0) {
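				// (If any of bits 4-11 of p+16 are set, then
				// (p+16) mod 4096 >= 16, so the 16 bytes at p
				// cannot cross a page boundary. This assumes
				// pages are at least 4 KB.)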
				static const uint64 masks[32]
				  __attribute__ ((aligned(16))) =
					{
					  0x0000000000000000, 0x0000000000000000,
					  0x00000000000000ff, 0x0000000000000000,
					  0x000000000000ffff, 0x0000000000000000,
					  0x0000000000ffffff, 0x0000000000000000,
					  0x00000000ffffffff, 0x0000000000000000,
					  0x000000ffffffffff, 0x0000000000000000,
					  0x0000ffffffffffff, 0x0000000000000000,
					  0x00ffffffffffffff, 0x0000000000000000,
					  0xffffffffffffffff, 0x0000000000000000,
					  0xffffffffffffffff, 0x00000000000000ff,
					  0xffffffffffffffff, 0x000000000000ffff,
					  0xffffffffffffffff, 0x0000000000ffffff,
					  0xffffffffffffffff, 0x00000000ffffffff,
					  0xffffffffffffffff, 0x000000ffffffffff,
					  0xffffffffffffffff, 0x0000ffffffffffff,
					  0xffffffffffffffff, 0x00ffffffffffffff
					};

				// 16 bytes loaded at p won't cross a page
				// boundary, so we can load directly.
				mval = _mm_loadu_si128(p);
				mval &= *(const __m128i*)(&masks[size*2]);
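				// (masks[size*2] is a 16-byte mask whose low
				// size bytes are 0xff, so the AND keeps exactly
				// the bytes being hashed.)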
			} else {
				static const uint64 shifts[32]
				  __attribute__ ((aligned(16))) =
					{
					  0x0000000000000000, 0x0000000000000000,
					  0xffffffffffffff0f, 0xffffffffffffffff,
					  0xffffffffffff0f0e, 0xffffffffffffffff,
					  0xffffffffff0f0e0d, 0xffffffffffffffff,
					  0xffffffff0f0e0d0c, 0xffffffffffffffff,
					  0xffffff0f0e0d0c0b, 0xffffffffffffffff,
					  0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
					  0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
					  0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
					  0x0e0d0c0b0a090807, 0xffffffffffffff0f,
					  0x0d0c0b0a09080706, 0xffffffffffff0f0e,
					  0x0c0b0a0908070605, 0xffffffffff0f0e0d,
					  0x0b0a090807060504, 0xffffffff0f0e0d0c,
					  0x0a09080706050403, 0xffffff0f0e0d0c0b,
					  0x0908070605040302, 0xffff0f0e0d0c0b0a,
					  0x0807060504030201, 0xff0f0e0d0c0b0a09,
					};

				// address ends in 1111xxxx. Might be
				// up against a page boundary, so load
				// ending at last byte. Then shift
				// bytes down using pshufb.
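				// (shifts[size*2] is a pshufb control: result
				// byte i takes loaded byte 16-size+i, and the
				// 0xff control bytes zero the rest.)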
				mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
				mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
			}
		} else {
			mval = _mm_loadu_si128(p);
		}

		// XOR data with seed.
		mval ^= mseed;
		// Scramble combo 3 times.
		mval = _mm_aesenc_si128(mval, mval);
		mval = _mm_aesenc_si128(mval, mval);
		mval = _mm_aesenc_si128(mval, mval);
		return _mm_cvtsi128_si64(mval);
	} else if (size <= 32) {
		// Make second starting seed.
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		// Load data to be hashed.
		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
		// XOR with seed.
		mval ^= mseed;
		mval2 ^= mseed2;
		// Scramble 3 times.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		// Combine results.
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	} else if (size <= 64) {
		// Make 3 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);

		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));

		mval ^= mseed;
		mval2 ^= mseed2;
		mval3 ^= mseed3;
		mval4 ^= mseed4;

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	} else if (size <= 128) {
		// Make 7 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed5 = mseed2;
		mseed6 = mseed2;
		mseed7 = mseed2;
		mseed8 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
		mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
		mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
		mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
		mseed5 = _mm_aesenc_si128(mseed5, mseed5);
		mseed6 = _mm_aesenc_si128(mseed6, mseed6);
		mseed7 = _mm_aesenc_si128(mseed7, mseed7);
		mseed8 = _mm_aesenc_si128(mseed8, mseed8);

		// Load data.
		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
		mval3 = _mm_loadu_si128((void*)((char*)p + 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + 48));
		mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
		mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
		mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// XOR with seed.
		mval ^= mseed;
		mval2 ^= mseed2;
		mval3 ^= mseed3;
		mval4 ^= mseed4;
		mval5 ^= mseed5;
		mval6 ^= mseed6;
		mval7 ^= mseed7;
		mval8 ^= mseed8;

		// Scramble 3 times.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		// Combine results.
		mval ^= mval5;
		mval2 ^= mval6;
		mval3 ^= mval7;
		mval4 ^= mval8;
		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	} else {
		// Make 7 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed5 = mseed2;
		mseed6 = mseed2;
		mseed7 = mseed2;
		mseed8 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
		mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
		mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
		mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);
		mseed5 = _mm_aesenc_si128(mseed5, mseed5);
		mseed6 = _mm_aesenc_si128(mseed6, mseed6);
		mseed7 = _mm_aesenc_si128(mseed7, mseed7);
		mseed8 = _mm_aesenc_si128(mseed8, mseed8);

		// Start with last (possibly overlapping) block.
		mval = _mm_loadu_si128((void*)((char*)p + size - 128));
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 112));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 96));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 80));
		mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
		mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
		mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// XOR in seed.
		mval ^= mseed;
		mval2 ^= mseed2;
		mval3 ^= mseed3;
		mval4 ^= mseed4;
		mval5 ^= mseed5;
		mval6 ^= mseed6;
		mval7 ^= mseed7;
		mval8 ^= mseed8;

		// Compute number of remaining 128-byte blocks.
		size--;
		size >>= 7;
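		// (That is, size becomes (original size - 1) / 128: the
		// number of loop iterations needed to consume the leading
		// 128-byte blocks; the final, possibly overlapping block
		// was already loaded above.)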
		do {
			// Scramble state.
			mval = _mm_aesenc_si128(mval, mval);
			mval2 = _mm_aesenc_si128(mval2, mval2);
			mval3 = _mm_aesenc_si128(mval3, mval3);
			mval4 = _mm_aesenc_si128(mval4, mval4);
			mval5 = _mm_aesenc_si128(mval5, mval5);
			mval6 = _mm_aesenc_si128(mval6, mval6);
			mval7 = _mm_aesenc_si128(mval7, mval7);
			mval8 = _mm_aesenc_si128(mval8, mval8);

			// Scramble state, XOR in a block.
			mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
			mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
			mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
			mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
			mval5 = _mm_aesenc_si128(mval5, _mm_loadu_si128((void*)((char*)p + 64)));
			mval6 = _mm_aesenc_si128(mval6, _mm_loadu_si128((void*)((char*)p + 80)));
			mval7 = _mm_aesenc_si128(mval7, _mm_loadu_si128((void*)((char*)p + 96)));
			mval8 = _mm_aesenc_si128(mval8, _mm_loadu_si128((void*)((char*)p + 112)));

			p = (void*)((char*)p + 128);
		} while (--size > 0);

		// 3 more scrambles to finish.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);
		mval5 = _mm_aesenc_si128(mval5, mval5);
		mval6 = _mm_aesenc_si128(mval6, mval6);
		mval7 = _mm_aesenc_si128(mval7, mval7);
		mval8 = _mm_aesenc_si128(mval8, mval8);

		mval ^= mval5;
		mval2 ^= mval6;
		mval3 ^= mval7;
		mval4 ^= mval8;
		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si64(mval);
	}
}

#else // !defined(__x86_64__)

// The 32-bit version of aeshashbody.

uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
	__m128i mseed, mseed2, mseed3, mseed4;
	__m128i mval, mval2, mval3, mval4;

	// Start with hash seed.
	mseed = _mm_cvtsi32_si128(seed);
	// Get 16 bits of length.
	mseed = _mm_insert_epi16(mseed, size, 4);
	// Replace size with its low 2 bytes repeated 4 times.
	mseed = _mm_shufflehi_epi16(mseed, 0);
	// Save unscrambled seed.
	mseed2 = mseed;
	// XOR in per-process seed.
	mseed ^= _mm_loadu_si128(aeskeysched.__values);
	// Scramble seed.
	mseed = _mm_aesenc_si128(mseed, mseed);

	if (size <= 16) {
		if (size == 0) {
			// Return scrambled input seed.
			return _mm_cvtsi128_si32(_mm_aesenc_si128(mseed, mseed));
		} else if (size < 16) {
			if ((((uintptr)(p) + 16) & 0xff0) != 0) {
				static const uint64 masks[32]
				  __attribute__ ((aligned(16))) =
					{
					  0x0000000000000000, 0x0000000000000000,
					  0x00000000000000ff, 0x0000000000000000,
					  0x000000000000ffff, 0x0000000000000000,
					  0x0000000000ffffff, 0x0000000000000000,
					  0x00000000ffffffff, 0x0000000000000000,
					  0x000000ffffffffff, 0x0000000000000000,
					  0x0000ffffffffffff, 0x0000000000000000,
					  0x00ffffffffffffff, 0x0000000000000000,
					  0xffffffffffffffff, 0x0000000000000000,
					  0xffffffffffffffff, 0x00000000000000ff,
					  0xffffffffffffffff, 0x000000000000ffff,
					  0xffffffffffffffff, 0x0000000000ffffff,
					  0xffffffffffffffff, 0x00000000ffffffff,
					  0xffffffffffffffff, 0x000000ffffffffff,
					  0xffffffffffffffff, 0x0000ffffffffffff,
					  0xffffffffffffffff, 0x00ffffffffffffff
					};

				// 16 bytes loaded at p won't cross a page
				// boundary, so we can load it directly.
				mval = _mm_loadu_si128(p);
				mval &= *(const __m128i*)(&masks[size*2]);
			} else {
				static const uint64 shifts[32]
				  __attribute__ ((aligned(16))) =
					{
					  0x0000000000000000, 0x0000000000000000,
					  0xffffffffffffff0f, 0xffffffffffffffff,
					  0xffffffffffff0f0e, 0xffffffffffffffff,
					  0xffffffffff0f0e0d, 0xffffffffffffffff,
					  0xffffffff0f0e0d0c, 0xffffffffffffffff,
					  0xffffff0f0e0d0c0b, 0xffffffffffffffff,
					  0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
					  0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
					  0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
					  0x0e0d0c0b0a090807, 0xffffffffffffff0f,
					  0x0d0c0b0a09080706, 0xffffffffffff0f0e,
					  0x0c0b0a0908070605, 0xffffffffff0f0e0d,
					  0x0b0a090807060504, 0xffffffff0f0e0d0c,
					  0x0a09080706050403, 0xffffff0f0e0d0c0b,
					  0x0908070605040302, 0xffff0f0e0d0c0b0a,
					  0x0807060504030201, 0xff0f0e0d0c0b0a09,
					};

				// address ends in 1111xxxx. Might be
				// up against a page boundary, so load
				// ending at last byte. Then shift
				// bytes down using pshufb.
				mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
				mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
			}
		} else {
			mval = _mm_loadu_si128(p);
		}

		// Scramble input, XOR in seed.
		mval = _mm_aesenc_si128(mval, mseed);
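		// (Unlike the 64-bit version, the seed is folded in here as
		// the aesenc round key rather than with a separate XOR.)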
		mval = _mm_aesenc_si128(mval, mval);
		mval = _mm_aesenc_si128(mval, mval);
		return _mm_cvtsi128_si32(mval);
	} else if (size <= 32) {
		// Make second starting seed.
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		// Load data to be hashed.
		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// Scramble 3 times.
		mval = _mm_aesenc_si128(mval, mseed);
		mval2 = _mm_aesenc_si128(mval2, mseed2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);

		// Combine results.
		mval ^= mval2;
		return _mm_cvtsi128_si32(mval);
	} else if (size <= 64) {
		// Make 3 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);

		mval = _mm_loadu_si128(p);
		mval2 = _mm_loadu_si128((void*)((char*)p + 16));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));

		mval = _mm_aesenc_si128(mval, mseed);
		mval2 = _mm_aesenc_si128(mval2, mseed2);
		mval3 = _mm_aesenc_si128(mval3, mseed3);
		mval4 = _mm_aesenc_si128(mval4, mseed4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si32(mval);
	} else {
		// Make 3 more starting seeds.
		mseed3 = mseed2;
		mseed4 = mseed2;
		mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
		mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
		mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
		mseed2 = _mm_aesenc_si128(mseed2, mseed2);
		mseed3 = _mm_aesenc_si128(mseed3, mseed3);
		mseed4 = _mm_aesenc_si128(mseed4, mseed4);

		// Start with last (possibly overlapping) block.
		mval = _mm_loadu_si128((void*)((char*)p + size - 64));
		mval2 = _mm_loadu_si128((void*)((char*)p + size - 48));
		mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
		mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));

		// Scramble state once.
		mval = _mm_aesenc_si128(mval, mseed);
		mval2 = _mm_aesenc_si128(mval2, mseed2);
		mval3 = _mm_aesenc_si128(mval3, mseed3);
		mval4 = _mm_aesenc_si128(mval4, mseed4);

		// Compute number of remaining 64-byte blocks.
		size--;
		size >>= 6;
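		// (size becomes (original size - 1) / 64: the loop count
		// for the leading 64-byte blocks.)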
		do {
			// Scramble state, XOR in a block.
			mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
			mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
			mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
			mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));

			// Scramble state.
			mval = _mm_aesenc_si128(mval, mval);
			mval2 = _mm_aesenc_si128(mval2, mval2);
			mval3 = _mm_aesenc_si128(mval3, mval3);
			mval4 = _mm_aesenc_si128(mval4, mval4);

			p = (void*)((char*)p + 64);
		} while (--size > 0);

		// 2 more scrambles to finish.
		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval = _mm_aesenc_si128(mval, mval);
		mval2 = _mm_aesenc_si128(mval2, mval2);
		mval3 = _mm_aesenc_si128(mval3, mval3);
		mval4 = _mm_aesenc_si128(mval4, mval4);

		mval ^= mval3;
		mval2 ^= mval4;
		mval ^= mval2;
		return _mm_cvtsi128_si32(mval);
	}
}

#endif // !defined(__x86_64__)

#else // !defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)

uintptr aeshashbody(void* p __attribute__((unused)),
		    uintptr seed __attribute__((unused)),
		    uintptr size __attribute__((unused)),
		    Slice aeskeysched __attribute__((unused))) {
	// We should never get here on a non-x86 system.
	runtime_throw("impossible call to aeshashbody");
}

#endif // !defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)