1 /* cksum -- calculate and print POSIX checksums and sizes of files
2 Copyright (C) 1992-2022 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
20 #include <sys/types.h>
22 #include <x86intrin.h>
25 /* Number of bytes to read at once. */
26 #define BUFLEN (1 << 16)
28 extern uint_fast32_t const crctab
[8][256];
31 cksum_pclmul (FILE *fp
, uint_fast32_t *crc_out
, uintmax_t *length_out
);
33 /* Calculate CRC32 using PCLMULQDQ CPU instruction found in x86/x64 CPUs */
36 cksum_pclmul (FILE *fp
, uint_fast32_t *crc_out
, uintmax_t *length_out
)
38 __m128i buf
[BUFLEN
/ sizeof (__m128i
)];
39 uint_fast32_t crc
= 0;
42 __m128i single_mult_constant
;
43 __m128i four_mult_constant
;
44 __m128i shuffle_constant
;
46 if (!fp
|| !crc_out
|| !length_out
)
49 /* These constants and general algorithms are taken from the Intel whitepaper
50 "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
52 single_mult_constant
= _mm_set_epi64x (0xC5B9CD4C, 0xE8A45605);
53 four_mult_constant
= _mm_set_epi64x (0x8833794C, 0xE6228B11);
55 /* Constant to byteswap a full SSE register */
56 shuffle_constant
= _mm_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
57 9, 10, 11, 12, 13, 14, 15);
59 while ((bytes_read
= fread (buf
, 1, BUFLEN
, fp
)) > 0)
73 if (length
+ bytes_read
< length
)
86 datap
= (__m128i
*)buf
;
88 /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
89 if (bytes_read
>= 16 * 8)
91 data
= _mm_loadu_si128 (datap
);
92 data
= _mm_shuffle_epi8 (data
, shuffle_constant
);
93 /* XOR in initial CRC value (for us 0 so no effect), or CRC value
94 calculated for previous BUFLEN buffer from fread */
95 xor_crc
= _mm_set_epi32 (crc
, 0, 0, 0);
97 data
= _mm_xor_si128 (data
, xor_crc
);
98 data3
= _mm_loadu_si128 (datap
+ 1);
99 data3
= _mm_shuffle_epi8 (data3
, shuffle_constant
);
100 data5
= _mm_loadu_si128 (datap
+ 2);
101 data5
= _mm_shuffle_epi8 (data5
, shuffle_constant
);
102 data7
= _mm_loadu_si128 (datap
+ 3);
103 data7
= _mm_shuffle_epi8 (data7
, shuffle_constant
);
106 while (bytes_read
>= 16 * 8)
110 /* Do multiplication here for four consecutive 16 byte blocks */
111 data2
= _mm_clmulepi64_si128 (data
, four_mult_constant
, 0x00);
112 data
= _mm_clmulepi64_si128 (data
, four_mult_constant
, 0x11);
113 data4
= _mm_clmulepi64_si128 (data3
, four_mult_constant
, 0x00);
114 data3
= _mm_clmulepi64_si128 (data3
, four_mult_constant
, 0x11);
115 data6
= _mm_clmulepi64_si128 (data5
, four_mult_constant
, 0x00);
116 data5
= _mm_clmulepi64_si128 (data5
, four_mult_constant
, 0x11);
117 data8
= _mm_clmulepi64_si128 (data7
, four_mult_constant
, 0x00);
118 data7
= _mm_clmulepi64_si128 (data7
, four_mult_constant
, 0x11);
120 /* Now multiplication results for the four blocks is xor:ed with
121 next four 16 byte blocks from the buffer. This effectively
122 "consumes" the first four blocks from the buffer.
123 Keep xor result in variables for multiplication in next
125 data
= _mm_xor_si128 (data
, data2
);
126 data2
= _mm_loadu_si128 (datap
);
127 data2
= _mm_shuffle_epi8 (data2
, shuffle_constant
);
128 data
= _mm_xor_si128 (data
, data2
);
130 data3
= _mm_xor_si128 (data3
, data4
);
131 data4
= _mm_loadu_si128 (datap
+ 1);
132 data4
= _mm_shuffle_epi8 (data4
, shuffle_constant
);
133 data3
= _mm_xor_si128 (data3
, data4
);
135 data5
= _mm_xor_si128 (data5
, data6
);
136 data6
= _mm_loadu_si128 (datap
+ 2);
137 data6
= _mm_shuffle_epi8 (data6
, shuffle_constant
);
138 data5
= _mm_xor_si128 (data5
, data6
);
140 data7
= _mm_xor_si128 (data7
, data8
);
141 data8
= _mm_loadu_si128 (datap
+ 3);
142 data8
= _mm_shuffle_epi8 (data8
, shuffle_constant
);
143 data7
= _mm_xor_si128 (data7
, data8
);
145 bytes_read
-= (16 * 4);
147 /* At end of loop we write out results from variables back into
148 the buffer, for use in single fold loop */
149 data
= _mm_shuffle_epi8 (data
, shuffle_constant
);
150 _mm_storeu_si128 (datap
, data
);
151 data3
= _mm_shuffle_epi8 (data3
, shuffle_constant
);
152 _mm_storeu_si128 (datap
+ 1, data3
);
153 data5
= _mm_shuffle_epi8 (data5
, shuffle_constant
);
154 _mm_storeu_si128 (datap
+ 2, data5
);
155 data7
= _mm_shuffle_epi8 (data7
, shuffle_constant
);
156 _mm_storeu_si128 (datap
+ 3, data7
);
159 /* Fold two 16-byte blocks into one 16-byte block */
160 if (bytes_read
>= 32)
162 data
= _mm_loadu_si128 (datap
);
163 data
= _mm_shuffle_epi8 (data
, shuffle_constant
);
164 xor_crc
= _mm_set_epi32 (crc
, 0, 0, 0);
166 data
= _mm_xor_si128 (data
, xor_crc
);
167 while (bytes_read
>= 32)
171 data2
= _mm_clmulepi64_si128 (data
, single_mult_constant
, 0x00);
172 data
= _mm_clmulepi64_si128 (data
, single_mult_constant
, 0x11);
173 fold_data
= _mm_loadu_si128 (datap
);
174 fold_data
= _mm_shuffle_epi8 (fold_data
, shuffle_constant
);
175 data
= _mm_xor_si128 (data
, data2
);
176 data
= _mm_xor_si128 (data
, fold_data
);
179 data
= _mm_shuffle_epi8 (data
, shuffle_constant
);
180 _mm_storeu_si128 (datap
, data
);
183 /* And finish up last 0-31 bytes in a byte by byte fashion */
184 unsigned char *cp
= (unsigned char *)datap
;
186 crc
= (crc
<< 8) ^ crctab
[0][((crc
>> 24) ^ *cp
++) & 0xFF];
192 *length_out
= length
;