split: port ‘split -n N /dev/null’ better to macOS
[coreutils.git] / src / cksum_pclmul.c
blob9c6e4df54d95297da78e4022caf69c8416580255
1 /* cksum -- calculate and print POSIX checksums and sizes of files
2 Copyright (C) 1992-2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 #include <config.h>
19 #include <stdio.h>
20 #include <sys/types.h>
21 #include <stdint.h>
22 #include <x86intrin.h>
23 #include "system.h"
25 /* Number of bytes to read at once. */
26 #define BUFLEN (1 << 16)
28 extern uint_fast32_t const crctab[8][256];
30 extern bool
31 cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
33 /* Calculate CRC32 using PCLMULQDQ CPU instruction found in x86/x64 CPUs */
35 bool
36 cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
38 __m128i buf[BUFLEN / sizeof (__m128i)];
39 uint_fast32_t crc = 0;
40 uintmax_t length = 0;
41 size_t bytes_read;
42 __m128i single_mult_constant;
43 __m128i four_mult_constant;
44 __m128i shuffle_constant;
46 if (!fp || !crc_out || !length_out)
47 return false;
49 /* These constants and general algorithms are taken from the Intel whitepaper
50 "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
52 single_mult_constant = _mm_set_epi64x (0xC5B9CD4C, 0xE8A45605);
53 four_mult_constant = _mm_set_epi64x (0x8833794C, 0xE6228B11);
55 /* Constant to byteswap a full SSE register */
56 shuffle_constant = _mm_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
57 9, 10, 11, 12, 13, 14, 15);
59 while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
61 __m128i *datap;
62 __m128i data;
63 __m128i data2;
64 __m128i data3;
65 __m128i data4;
66 __m128i data5;
67 __m128i data6;
68 __m128i data7;
69 __m128i data8;
70 __m128i fold_data;
71 __m128i xor_crc;
73 if (length + bytes_read < length)
75 errno = EOVERFLOW;
76 return false;
78 length += bytes_read;
80 if (bytes_read == 0)
82 if (ferror (fp))
83 return false;
86 datap = (__m128i *)buf;
88 /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
89 if (bytes_read >= 16 * 8)
91 data = _mm_loadu_si128 (datap);
92 data = _mm_shuffle_epi8 (data, shuffle_constant);
93 /* XOR in initial CRC value (for us 0 so no effect), or CRC value
94 calculated for previous BUFLEN buffer from fread */
95 xor_crc = _mm_set_epi32 (crc, 0, 0, 0);
96 crc = 0;
97 data = _mm_xor_si128 (data, xor_crc);
98 data3 = _mm_loadu_si128 (datap + 1);
99 data3 = _mm_shuffle_epi8 (data3, shuffle_constant);
100 data5 = _mm_loadu_si128 (datap + 2);
101 data5 = _mm_shuffle_epi8 (data5, shuffle_constant);
102 data7 = _mm_loadu_si128 (datap + 3);
103 data7 = _mm_shuffle_epi8 (data7, shuffle_constant);
106 while (bytes_read >= 16 * 8)
108 datap += 4;
110 /* Do multiplication here for four consecutive 16 byte blocks */
111 data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
112 data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
113 data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
114 data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
115 data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
116 data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
117 data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
118 data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
120 /* Now multiplication results for the four blocks is xor:ed with
121 next four 16 byte blocks from the buffer. This effectively
122 "consumes" the first four blocks from the buffer.
123 Keep xor result in variables for multiplication in next
124 round of loop. */
125 data = _mm_xor_si128 (data, data2);
126 data2 = _mm_loadu_si128 (datap);
127 data2 = _mm_shuffle_epi8 (data2, shuffle_constant);
128 data = _mm_xor_si128 (data, data2);
130 data3 = _mm_xor_si128 (data3, data4);
131 data4 = _mm_loadu_si128 (datap + 1);
132 data4 = _mm_shuffle_epi8 (data4, shuffle_constant);
133 data3 = _mm_xor_si128 (data3, data4);
135 data5 = _mm_xor_si128 (data5, data6);
136 data6 = _mm_loadu_si128 (datap + 2);
137 data6 = _mm_shuffle_epi8 (data6, shuffle_constant);
138 data5 = _mm_xor_si128 (data5, data6);
140 data7 = _mm_xor_si128 (data7, data8);
141 data8 = _mm_loadu_si128 (datap + 3);
142 data8 = _mm_shuffle_epi8 (data8, shuffle_constant);
143 data7 = _mm_xor_si128 (data7, data8);
145 bytes_read -= (16 * 4);
147 /* At end of loop we write out results from variables back into
148 the buffer, for use in single fold loop */
149 data = _mm_shuffle_epi8 (data, shuffle_constant);
150 _mm_storeu_si128 (datap, data);
151 data3 = _mm_shuffle_epi8 (data3, shuffle_constant);
152 _mm_storeu_si128 (datap + 1, data3);
153 data5 = _mm_shuffle_epi8 (data5, shuffle_constant);
154 _mm_storeu_si128 (datap + 2, data5);
155 data7 = _mm_shuffle_epi8 (data7, shuffle_constant);
156 _mm_storeu_si128 (datap + 3, data7);
159 /* Fold two 16-byte blocks into one 16-byte block */
160 if (bytes_read >= 32)
162 data = _mm_loadu_si128 (datap);
163 data = _mm_shuffle_epi8 (data, shuffle_constant);
164 xor_crc = _mm_set_epi32 (crc, 0, 0, 0);
165 crc = 0;
166 data = _mm_xor_si128 (data, xor_crc);
167 while (bytes_read >= 32)
169 datap++;
171 data2 = _mm_clmulepi64_si128 (data, single_mult_constant, 0x00);
172 data = _mm_clmulepi64_si128 (data, single_mult_constant, 0x11);
173 fold_data = _mm_loadu_si128 (datap);
174 fold_data = _mm_shuffle_epi8 (fold_data, shuffle_constant);
175 data = _mm_xor_si128 (data, data2);
176 data = _mm_xor_si128 (data, fold_data);
177 bytes_read -= 16;
179 data = _mm_shuffle_epi8 (data, shuffle_constant);
180 _mm_storeu_si128 (datap, data);
183 /* And finish up last 0-31 bytes in a byte by byte fashion */
184 unsigned char *cp = (unsigned char *)datap;
185 while (bytes_read--)
186 crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
187 if (feof (fp))
188 break;
191 *crc_out = crc;
192 *length_out = length;
194 return true;