/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/
16 #ifndef incl_HPHP_WORD_MEM_H_
17 #define incl_HPHP_WORD_MEM_H_
#include <cstdint>
#include <cstring>
#include <limits>

#include <folly/Portability.h>

#include "hphp/util/assertions.h"
#include "hphp/util/portability.h"
// Hand-tuned copy routines with out-of-line definitions provided elsewhere in
// the project (presumably in assembly — note the leading-underscore C names).
// They are only ever called from the x86_64 branches below; see the wrapper
// functions for their contracts.
extern "C" void* _memcpy8(void* dst, const void* src, size_t len);
extern "C" void* _memcpy16(void* dst, const void* src, size_t len);
extern "C" void _bcopy32(void* dst, const void* src, size_t len);
extern "C" void _bcopy_in_64(void* dst, const void* src, size_t lenIn64);
/*
 * Specialized memcpy implementations that take advantage of the known
 * properties in length and alignment.
 *
 *  o memcpy8(dst, src, len) is equivalent to
 *        static_cast<char*>(memcpy(dst, src, (len + 7) / 8 * 8)) + len;
 *    i.e. it returns a char* pointing to dst[len] instead of dst, in order
 *    to ease its use in string operations.
 *
 *    Note that it could overrun the buffer by up to 7 bytes, depending on
 *    len and alignment of the buffers.  When both src and dst are aligned
 *    to 8 bytes, it is safe.  It can also be used in other situations given
 *    sufficient readable space after the buffers.
 *
 *  o memcpy16(dst, src, len) is equivalent to
 *        assert(len > 0 && len % 16 == 0);
 *        memcpy(dst, src, len);
 *
 *  o bcopy32(dst, src, len) is equivalent to
 *        assert(len >= 32);
 *        memcpy(dst, src, len / 32 * 32);
 *    except that it returns void.
 *
 *  o bcopy_in_64(dst, src, lenIn64) is equivalent to
 *        assert(lenIn64 > 0);
 *        memcpy(dst, src, 64 * lenIn64);
 *    except that it returns void.
 */
61 inline char* memcpy8(void* dst
, const void* src
, size_t len
) {
62 #if defined(__x86_64__)
63 return reinterpret_cast<char*>(_memcpy8(dst
, src
, len
));
65 memcpy(dst
, src
, len
);
66 return reinterpret_cast<char*>(dst
) + len
;
70 inline char* memcpy16(void* dst
, const void* src
, size_t len
) {
71 assertx(len
> 0 && len
% 16 == 0);
72 #if defined(__x86_64__)
73 return reinterpret_cast<char*>(_memcpy16(dst
, src
, len
));
75 return reinterpret_cast<char*>(memcpy(dst
, src
, len
));
79 inline void bcopy32(void* dst
, const void* src
, size_t len
) {
81 #if defined(__x86_64__)
82 _bcopy32(dst
, src
, len
);
84 memcpy(dst
, src
, len
/ 32 * 32);
88 inline void bcopy_in_64(void* dst
, const void* src
, size_t lenIn64
) {
89 assertx(lenIn64
!= 0);
90 #if defined(__x86_64__)
91 _bcopy_in_64(dst
, src
, lenIn64
);
93 memcpy(dst
, src
, lenIn64
* 64);
97 // Inline assembly version to avoid a function call.
98 inline void bcopy32_inline(void* dst
, const void* src
, size_t len
) {
100 #if defined(__x86_64__)
101 __asm__
__volatile__("shr $5, %0\n"
102 ASM_LOCAL_LABEL("BCP32%=") ":\n"
103 "movdqu (%1), %%xmm0\n"
104 "movdqu 16(%1), %%xmm1\n"
106 "movdqu %%xmm0, (%2)\n"
107 "movdqu %%xmm1, 16(%2)\n"
110 "jg " ASM_LOCAL_LABEL("BCP32%=") "\n"
111 : "+r"(len
), "+r"(src
), "+r"(dst
)
114 #elif defined(__aarch64__)
115 int64_t t3
, t4
, t5
, t6
, t7
;
116 __asm__
__volatile__("lsr %x0, %x0, #5\n"
117 "sub %x1, %x1, #16\n"
118 "sub %x2, %x2, #16\n"
119 ASM_LOCAL_LABEL("BCP32%=") ":\n"
120 "ldp %x3, %x4, [%x1, #16]\n"
121 "ldp %x5, %x6, [%x1, #32]!\n"
122 "stp %x3, %x4, [%x2, #16]\n"
123 "stp %x5, %x6, [%x2, #32]!\n"
124 "subs %x0, %x0, #1\n"
125 "bgt " ASM_LOCAL_LABEL("BCP32%=") "\n"
126 : "+r"(len
), "+r"(src
), "+r"(dst
),
127 "=r"(t3
), "=r"(t4
), "=r"(t5
), "=r"(t6
), "=r"(t7
)
131 bcopy32(dst
, src
, len
);
135 inline void memcpy16_inline(void* dst
, const void* src
, size_t len
) {
136 assertx(len
>=16 && len
% 16 == 0);
137 #if defined(__x86_64__)
138 __asm__
__volatile__("movdqu -16(%1, %0), %%xmm0\n"
139 "movdqu %%xmm0, -16(%2, %0)\n"
141 "jz " ASM_LOCAL_LABEL("END%=") "\n"
142 ASM_LOCAL_LABEL("R32%=") ":\n"
143 "movdqu (%1), %%xmm0\n"
144 "movdqu 16(%1), %%xmm1\n"
146 "movdqu %%xmm0, (%2)\n"
147 "movdqu %%xmm1, 16(%2)\n"
150 "jg " ASM_LOCAL_LABEL("R32%=") "\n"
151 ASM_LOCAL_LABEL("END%=") ":\n"
152 : "+r"(len
), "+r"(src
), "+r"(dst
)
155 #elif defined(__aarch64__)
156 int64_t t3
, t4
, t5
, t6
, s1
, d1
, d2
;
157 __asm__
__volatile__("mov %x7, %x1\n"
158 "add %x1, %x1, %x0\n"
159 "ldp %x3, %x4, [%x1, #-16]!\n"
161 "add %x2, %x2, %x0\n"
162 "stp %x3, %x4, [%x2, #-16]!\n"
164 "cbz %x0, " ASM_LOCAL_LABEL("END%=") "\n"
165 "sub %x7, %x7, #16\n"
166 "sub %x8, %x8, #16\n"
167 ASM_LOCAL_LABEL("R32%=") ":\n"
168 "ldp %x3, %x4, [%x7, #16]\n"
169 "ldp %x5, %x6, [%x7, #32]!\n"
170 "stp %x3, %x4, [%x8, #16]\n"
171 "stp %x5, %x6, [%x8, #32]!\n"
172 "subs %x0, %x0, #1\n"
173 "bgt " ASM_LOCAL_LABEL("R32%=") "\n"
174 ASM_LOCAL_LABEL("END%=") ":\n"
175 : "+r"(len
), "+r"(src
), "+r"(dst
),
176 "=r"(t3
), "=r"(t4
), "=r"(t5
), "=r"(t6
),
177 "=r"(s1
), "=r"(d1
), "=r"(d2
)
181 memcpy16(dst
, src
, len
);
185 //////////////////////////////////////////////////////////////////////
/*
 * Word at a time comparison for two strings of length `lenBytes'.  Returns
 * true if the regions are the same.  This should be invoked only when we
 * know the two strings have the same length.  It will not check for a null
 * terminator.
 *
 * Assumes that the buffer addresses are word aligned, and that it can read
 * lenBytes rounded up to a whole word.  This is possible in HPHP because we
 * always allocate whole numbers of words.  The final word compare is adjusted
 * to handle the slack in lenBytes so only the bytes we care about are
 * compared.
 */
/*
 * Compare `lenBytes' bytes at word-aligned `mem1' and `mem2'; returns true
 * iff they are equal.  May read up to a whole word past lenBytes (see the
 * comment above for why that is safe in HPHP).  lenBytes == 0 returns true.
 */
inline bool wordsame(const void* mem1, const void* mem2, uint32_t lenBytes) {
  using T = uint64_t;                 // compare one 64-bit word at a time
  constexpr auto W = sizeof(T);

  assert(reinterpret_cast<uintptr_t>(mem1) % W == 0);
  assert(reinterpret_cast<uintptr_t>(mem2) % W == 0);

#if !defined(FOLLY_SANITIZE)
  // ASan is less precise than valgrind and believes this function overruns
  // reads, so the word-at-a-time path is disabled under sanitizers.
  //
  // For speed, we count up towards 0 from -lenBytes * 8 in units of a word of
  // bits.  When we reach a value >= 0, that is the number of bits we need to
  // ignore in the last compare.  Since we're on a little-endian architecture,
  // we can do the ignoring by shifting left by that many bits.  We also unroll
  // the nBits increment from the first iteration, because we can fold that
  // calculation together with the multiply by 8 into a single lea instruction.
  const int32_t nBytes = -lenBytes;
  // We need to bail out early if len is 0, and we can save a test instruction
  // if we reuse the flags from the negation we just did.
  if (nBytes == 0) return true;
  int64_t nBits = int64_t(nBytes) * 8 + (W * 8);

  // Use the base+index addressing mode in x86, so that we only need to
  // increment the base pointer in the loop.
  auto p1 = reinterpret_cast<intptr_t>(mem1);
  auto const diff = reinterpret_cast<intptr_t>(mem2) - p1;

  for (;;) {
    T data = *(reinterpret_cast<const T*>(p1));
    data ^= *(reinterpret_cast<const T*>(p1 + diff));
    if (nBits >= 0) {
      // Last word: nBits (in [0, 63]) counts the high-order slack bits past
      // lenBytes; shift them out (little-endian) before testing.
      //
      // As a note for future consideration, we could consider precomputing a
      // 64-bit mask, so that the fraction of the last qword can be checked
      // faster.  But that would require an additional register for the
      // mask.  So it depends on register pressure of the call site.
      return !(data << nBits);
    }
    if (data != 0) return false;      // mismatch in a fully-compared word
    p1 += W;
    nBits += W * 8;
  }
#else // FOLLY_SANITIZE
  return !memcmp(mem1, mem2, lenBytes);
#endif
}
/*
 * Like memcpy, but copies numT POD values 8 bytes at a time.
 * The actual number of bytes copied must be a nonzero multiple of 8.
 */
// Copies `numT' T values from `from' to `to' one 64-bit word at a time.
// numT * sizeof(T) must be a nonzero multiple of 8, and both buffers must be
// 8-byte aligned.  Returns `to'.
template<class T>
T* wordcpy(T* to, const T* from, size_t numT) {
  assert(numT < std::numeric_limits<int64_t>::max() &&
         (numT * sizeof(T)) % 8 == 0);
  size_t numWords = numT * sizeof(T) / 8;
  assert(numWords != 0);              // the do-while below runs at least once
  auto d = reinterpret_cast<int64_t*>(to);
  auto s = reinterpret_cast<const int64_t*>(from);
  do {
    *d++ = *s++;
  } while (--numWords);
  return to;
}
/*
 * Fills a memory area with ones, 8 bytes at a time.
 */
// Sets every bit in the `numT' T values at `ptr', one 64-bit word at a time.
// numT * sizeof(T) must be a nonzero multiple of 8, and `ptr' must be 8-byte
// aligned.  Returns `ptr'.
template<class T>
T* wordfillones(T* ptr, size_t numT) {
  assert(numT < std::numeric_limits<int64_t>::max() &&
         (numT * sizeof(T)) % 8 == 0);
  auto numWords = numT * sizeof(T) / 8;
  assert(numWords != 0);              // numT == 0 would underflow the do-while
  auto d = reinterpret_cast<int64_t*>(ptr);
  do {
    *d++ = -1;                        // -1 is all-ones in two's complement
  } while (--numWords);
  return ptr;
}
284 //////////////////////////////////////////////////////////////////////