/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
#ifndef incl_HPHP_WORD_MEM_H_
#define incl_HPHP_WORD_MEM_H_

#include <cstring>  // memcpy/memcmp, used by the portable fallbacks below
#include <limits>

#include <folly/Portability.h>

#include "hphp/util/assertions.h"

extern "C" void* _memcpy8(void* dst, const void* src, size_t len);
extern "C" void* _memcpy16(void* dst, const void* src, size_t len);
extern "C" void _bcopy32(void* dst, const void* src, size_t len);
extern "C" void _bcopy_in_64(void* dst, const void* src, size_t lenIn64);
namespace HPHP {

/*
 * Specialized memcpy implementations that take advantage of known
 * properties of the length and alignment.
 *
 *  o memcpy8(dst, src, len) is equivalent to
 *        static_cast<char*>(memcpy(dst, src, (len + 7) / 8 * 8)) + len;
 *    It returns a char* pointing to dst[len] instead of dst, in order to
 *    ease its use in string operations.
 *
 *    Note that it could overrun the buffer by up to 7 bytes, depending on
 *    len and alignment of the buffers.  When both src and dst are aligned
 *    to 8 bytes, it is safe.  It can also be used in other situations given
 *    sufficient readable space after the buffers.
 *
 *  o memcpy16(dst, src, len) is equivalent to
 *        assert(len > 0 && len % 16 == 0);
 *        memcpy(dst, src, len);
 *
 *  o bcopy32(dst, src, len) is equivalent to
 *        assert(len >= 32);
 *        memcpy(dst, src, len / 32 * 32);
 *    except that it returns void.
 *
 *  o bcopy_in_64(dst, src, lenIn64) is equivalent to
 *        assert(lenIn64 > 0);
 *        memcpy(dst, src, 64 * lenIn64);
 *    except that it returns void.
 */
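
/*
 * Usage sketch (illustrative only, not part of this header's API):
 * memcpy8's return value points one past the last byte copied, so appends
 * chain naturally.  Both buffers below have readable/writable slack past
 * the bytes in use, per the overrun caveat above.
 *
 *   char src[16] = "hello world";
 *   char dst[16];
 *   char* p = memcpy8(dst, src, 5);      // p == dst + 5
 *   p = memcpy8(p, src + 5, 6);          // p == dst + 11
 */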
inline char* memcpy8(void* dst, const void* src, uint32_t len) {
#if defined(__x86_64__)
  return reinterpret_cast<char*>(_memcpy8(dst, src, len));
#else
  memcpy(dst, src, len);
  return reinterpret_cast<char*>(dst) + len;
#endif
}

inline char* memcpy16(void* dst, const void* src, uint32_t len) {
  assertx(len > 0 && len % 16 == 0);
#if defined(__x86_64__)
  return reinterpret_cast<char*>(_memcpy16(dst, src, len));
#else
  return reinterpret_cast<char*>(memcpy(dst, src, len));
#endif
}

inline void bcopy32(void* dst, const void* src, uint32_t len) {
  assertx(len >= 32);
#if defined(__x86_64__)
  _bcopy32(dst, src, len);
#else
  memcpy(dst, src, len / 32 * 32);
#endif
}

inline void bcopy_in_64(void* dst, const void* src, uint32_t lenIn64) {
  assertx(lenIn64 != 0);
#if defined(__x86_64__)
  _bcopy_in_64(dst, src, lenIn64);
#else
  memcpy(dst, src, lenIn64 * 64);
#endif
}
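
/*
 * Note on units (illustrative sketch with a hypothetical `nBytes'):
 * bcopy_in_64 takes a count of 64-byte blocks, not bytes.  A caller with
 * a byte count that is a known positive multiple of 64 would write:
 *
 *   assertx(nBytes > 0 && nBytes % 64 == 0);
 *   bcopy_in_64(dst, src, nBytes / 64);   // lenIn64 == number of blocks
 */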
// Inline assembly version to avoid a function call.
inline void bcopy32_inline(void* dst, const void* src, uint32_t len) {
  assertx(len >= 32);
#if defined(__x86_64__)
  __asm__ __volatile__("shr    $5, %0\n"
                       ".LBCP32%=:\n"
                       "movdqu (%1), %%xmm0\n"
                       "movdqu 16(%1), %%xmm1\n"
                       "add    $32, %1\n"
                       "movdqu %%xmm0, (%2)\n"
                       "movdqu %%xmm1, 16(%2)\n"
                       "add    $32, %2\n"
                       "dec    %0\n"
                       "jg     .LBCP32%=\n"
                       : "+r"(len), "+r"(src), "+r"(dst)
                       :: "xmm0", "xmm1"
                      );
#elif defined(__aarch64__)
  int64_t t3, t4, t5, t6, t7;
  __asm__ __volatile__("lsr    %x0, %x0, #5\n"
                       "sub    %x1, %x1, #16\n"
                       "sub    %x2, %x2, #16\n"
                       ".LBCP32%=:\n"
                       "ldp    %x3, %x4, [%x1, #16]\n"
                       "ldp    %x5, %x6, [%x1, #32]!\n"
                       "stp    %x3, %x4, [%x2, #16]\n"
                       "stp    %x5, %x6, [%x2, #32]!\n"
                       "subs   %x0, %x0, #1\n"
                       "bgt    .LBCP32%=\n"
                       : "+r"(len), "+r"(src), "+r"(dst),
                         "=r"(t3), "=r"(t4), "=r"(t5), "=r"(t6), "=r"(t7)
                      );
#else
  bcopy32(dst, src, len);
#endif
}
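
/*
 * Portable sketch of what the asm loops above compute (illustrative, with
 * hypothetical locals): copy floor(len / 32) 32-byte chunks from src to
 * dst -- two 16-byte vector moves (x86) or two load/store pairs (ARM) per
 * iteration -- leaving any final len % 32 bytes untouched.
 *
 *   auto s = static_cast<const char*>(src);
 *   auto d = static_cast<char*>(dst);
 *   for (uint32_t n = len >> 5; n > 0; --n, s += 32, d += 32) {
 *     memcpy(d, s, 32);
 *   }
 */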
inline void memcpy16_inline(void* dst, const void* src, uint64_t len) {
  assertx(len >= 16 && len % 16 == 0);
#if defined(__x86_64__)
  __asm__ __volatile__("movdqu -16(%1, %0), %%xmm0\n"
                       "movdqu %%xmm0, -16(%2, %0)\n"
                       "shr    $5, %0\n"
                       "jz     .LEND%=\n"
                       ".LR32%=:\n"
                       "movdqu (%1), %%xmm0\n"
                       "movdqu 16(%1), %%xmm1\n"
                       "add    $32, %1\n"
                       "movdqu %%xmm0, (%2)\n"
                       "movdqu %%xmm1, 16(%2)\n"
                       "add    $32, %2\n"
                       "dec    %0\n"
                       "jg     .LR32%=\n"
                       ".LEND%=:\n"
                       : "+r"(len), "+r"(src), "+r"(dst)
                       :: "xmm0", "xmm1"
                      );
#elif defined(__aarch64__)
  int64_t t3, t4, t5, t6, s1, d1, d2;
  __asm__ __volatile__("mov    %x7, %x1\n"
                       "add    %x1, %x1, %x0\n"
                       "ldp    %x3, %x4, [%x1, #-16]!\n"
                       "mov    %x8, %x2\n"
                       "add    %x2, %x2, %x0\n"
                       "stp    %x3, %x4, [%x2, #-16]!\n"
                       "lsr    %x0, %x0, #5\n"
                       "cbz    %x0, .LEND%=\n"
                       "sub    %x7, %x7, #16\n"
                       "sub    %x8, %x8, #16\n"
                       ".LR32%=:\n"
                       "ldp    %x3, %x4, [%x7, #16]\n"
                       "ldp    %x5, %x6, [%x7, #32]!\n"
                       "stp    %x3, %x4, [%x8, #16]\n"
                       "stp    %x5, %x6, [%x8, #32]!\n"
                       "subs   %x0, %x0, #1\n"
                       "bgt    .LR32%=\n"
                       ".LEND%=:\n"
                       : "+r"(len), "+r"(src), "+r"(dst),
                         "=r"(t3), "=r"(t4), "=r"(t5), "=r"(t6),
                         "=r"(s1), "=r"(d1), "=r"(d2)
                      );
#else
  memcpy16(dst, src, len);
#endif
}
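
/*
 * Sketch of the trick used above (our reading of the asm): the last 16
 * bytes are copied first, then floor(len / 32) 32-byte chunks are copied
 * from the front.  When len is an odd multiple of 16, the loop stops 16
 * bytes short and the up-front tail copy supplies them; e.g. for len == 48
 * the tail copy covers bytes [32, 48) and the single loop iteration covers
 * [0, 32).  When len is a multiple of 32, the loop simply re-copies the
 * tail, which is harmless for non-overlapping src and dst.
 */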
//////////////////////////////////////////////////////////////////////

// ASan is less precise than valgrind and believes this function overruns reads
#ifndef FOLLY_SANITIZE_ADDRESS

/*
 * Word-at-a-time comparison for two strings of length `lenBytes'.  Returns
 * true if the regions are the same.  This should be invoked only when we
 * know the two strings have the same length.  It will not check for the
 * null terminator.
 *
 * Assumes it can load more words than the size to compare (this is often
 * possible in HPHP when you know you're dealing with request-allocated
 * memory).  The final word compare is adjusted to handle the slack in
 * lenBytes so only the bytes we care about are compared.
 *
 * Assumes that the buffer addresses are 8-byte aligned.
 */
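
/*
 * Worked example of the final-word masking (illustrative): for
 * lenBytes == 12, nBits starts at -96.  After the first 8-byte word,
 * nBits == -32, so the loop continues while the words match; after the
 * second word, nBits == 32, and `data << 32' discards the high 4 bytes of
 * the XOR -- exactly the bytes beyond offset 12 that must be ignored.
 */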
ALWAYS_INLINE
bool wordsame(const void* mem1, const void* mem2, uint32_t lenBytes) {
  using T = uint64_t;
  auto constexpr W = sizeof(T);

  assert(reinterpret_cast<const uintptr_t>(mem1) % W == 0);
  assert(reinterpret_cast<const uintptr_t>(mem2) % W == 0);

  // Inverse of lenBytes.  Do the negation here to avoid doing it later on
  // the critical path.
  int32_t const nBytes = -lenBytes;
  // Check if `lenBytes' is 0 right here, to reuse the flags of the neg
  // instruction.  This saves a test instruction.
  if (UNLIKELY(nBytes == 0)) return true;
  // Do the shift here to avoid doing it later on the critical path.  But we
  // will have to switch to 64 bit here to support very long strings.
  int64_t nBits = static_cast<int64_t>(nBytes) * 8u;

  // Use the base+index addressing mode in x86, so that we only need to
  // increment the base pointer in the loop.
  auto p1 = reinterpret_cast<intptr_t>(mem1);
  auto const diff = reinterpret_cast<intptr_t>(mem2) - p1;

  T data;
  do {
    data = *(reinterpret_cast<const T*>(p1));
    data ^= *(reinterpret_cast<const T*>(p1 + diff));
    p1 += W;
    nBits += W * 8;
    if (nBits >= 0) {
      // As a note for future consideration: we could precompute a 64-bit
      // mask, so that the fraction of the last qword can be checked faster.
      // But that would require an additional register for the mask, so it
      // depends on register pressure at the call site.
      return !(data << nBits);
    }
  } while (data == 0);
  return false;
}
#else // FOLLY_SANITIZE_ADDRESS

ALWAYS_INLINE
bool wordsame(const void* mem1, const void* mem2, size_t lenBytes) {
  assert(reinterpret_cast<const uintptr_t>(mem1) % 4 == 0);
  return !memcmp(mem1, mem2, lenBytes);
}

#endif
/*
 * Like memcpy, but copies numT POD values 8 bytes at a time.
 * The actual number of bytes copied must be a multiple of 8.
 */
template<class T>
T* wordcpy(T* to, const T* from, size_t numT) {
  assert(numT < std::numeric_limits<int64_t>::max() &&
         (numT * sizeof(T)) % 8 == 0);
  size_t numWords = numT * sizeof(T) / 8;
  assert(numWords != 0);
  auto d = (int64_t*)to;
  auto s = (int64_t*)from;
  do {
    *d++ = *s++;
  } while (--numWords);
  return to;
}
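
/*
 * Usage sketch (illustrative, with a hypothetical element type): numT
 * counts elements, not words.  A 16-byte element is two words, so copying
 * 4 elements moves 8 words.
 *
 *   struct Pair { int64_t a, b; };   // sizeof(Pair) == 16
 *   Pair src[4], dst[4];
 *   wordcpy(dst, src, 4);            // copies 4 * 16 == 64 bytes
 */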
/*
 * Fills a memory area with ones, 8 bytes at a time.
 */
template<class T>
T* wordfillones(T* ptr, size_t numT) {
  assert(numT < std::numeric_limits<int64_t>::max() &&
         (numT * sizeof(T)) % 8 == 0);
  assert(numT != 0);
  auto numWords = numT * sizeof(T) / 8;
  auto d = (int64_t*)ptr;
  do {
    *d++ = -1;
  } while (--numWords);
  return ptr;
}
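
/*
 * Usage sketch (illustrative): -1 in two's complement is all one bits, so
 * this sets every byte of the area to 0xff.
 *
 *   int64_t v[8];
 *   wordfillones(v, 8);   // each v[i] == -1, i.e. all bytes 0xff
 */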
//////////////////////////////////////////////////////////////////////

}

#endif