/*
 * Copyright (C) 2008 Denys Vlasenko <vda.linux@googlemail.com>
 *
 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball
 */

#ifndef _STRING_H
#error "Never use <libc-string_i386.h> directly; include <string.h> instead"
#endif

#ifndef _LIBC_STRING_i386_H
#define _LIBC_STRING_i386_H 1
static __always_inline
void *inlined_memset_const_c_count4(void *s, unsigned eax, unsigned count)
{
	int ecx;
	void *edi;

	if (count == 0)
		return s;

	/* Very small (2 stores or less) are best done with direct
	 * mov <const>,<mem> instructions (they do not clobber registers) */
	if (count == 1) {
		*(char *)(s + 0) = eax;
		return s;
	}

	/* You wonder why & 0xff is needed? Try memset(p, '\xff', size).
	 * If char is signed, '\xff' == -1! */
	eax = (eax & 0xff) * 0x01010101; /* done at compile time */
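	/* Worked example of the line above (illustrative note): memset(p, '\xfe', n)
	 * with signed char passes c == -2, i.e. eax arrives as 0xfffffffe;
	 * (eax & 0xff) == 0xfe and 0xfe * 0x01010101 == 0xfefefefe, the intended
	 * fill pattern, whereas multiplying the unmasked value would give
	 * 0xfdfdfdfe. */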
	if (count == 2) {
		*(short *)(s + 0) = eax;
		return s;
	}
	if (count == 3) {
		*(short *)(s + 0) = eax;
		*(char *) (s + 2) = eax;
		return s;
	}
	if (count == 1*4 + 0) {
		*(int *)(s + 0) = eax;
		return s;
	}
	if (count == 1*4 + 1) {
		*(int *) (s + 0) = eax;
		*(char *)(s + 4) = eax;
		return s;
	}
	if (count == 1*4 + 2) {
		*(int *)  (s + 0) = eax;
		*(short *)(s + 4) = eax;
		return s;
	}
	/* Small string stores: don't clobber ecx
	 * (clobbers only eax and edi) */
#define small_store(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&D" (edi) \
		: "a" (eax), "0" (s) \
		: "memory" \
	); \
	return s; \
}
	if (count == 1*4 + 3) small_store("stosl; stosw; stosb");
	if (count == 2*4 + 0) {
		*(int *)(s + 0) = eax;
		*(int *)(s + 4) = eax;
		return s;
	}
	if (count == 2*4 + 1) small_store("stosl; stosl; stosb");
	if (count == 2*4 + 2) small_store("stosl; stosl; stosw");
	if (count == 2*4 + 3) small_store("stosl; stosl; stosw; stosb");
	if (count == 3*4 + 0) small_store("stosl; stosl; stosl");
	if (count == 3*4 + 1) small_store("stosl; stosl; stosl; stosb");
	if (count == 3*4 + 2) small_store("stosl; stosl; stosl; stosw");
	if (count == 3*4 + 3) small_store("stosl; stosl; stosl; stosw; stosb");
	if (count == 4*4 + 0) small_store("stosl; stosl; stosl; stosl");
	if (count == 4*4 + 1) small_store("stosl; stosl; stosl; stosl; stosb");
	/* going over 7 bytes is suboptimal */
	/* stosw is 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_store("stosl; stosl; stosl; stosl; stosw");
	if (count == 4*4 + 3) small_store("stosl; stosl; stosl; stosl; stosw; stosb");
	if (count == 5*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl");
	if (count == 5*4 + 1) small_store("stosl; stosl; stosl; stosl; stosl; stosb");
	if (count == 5*4 + 2) small_store("stosl; stosl; stosl; stosl; stosl; stosw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_store("stosl; stosl; stosl; stosl; stosl; stosw; stosb");
	if (count == 6*4 + 0) small_store("stosl; stosl; stosl; stosl; stosl; stosl");
	/* the rest would be 7+ bytes and is handled below instead */
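	/* For reference on the size reasoning above: stosb and stosl are
	 * 1-byte instructions, stosw is 2 bytes (operand-size prefix), while
	 * the "movl $CONST,%ecx; rep; stosl" fallback below is 5+2 = 7 bytes,
	 * so unrolled stos sequences only win while they stay at or under
	 * roughly 7 bytes of code. */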

	/* Not small, but multiple-of-4 store.
	 * "mov <const>,%ecx; rep; stosl" sequence is 7 bytes */
	__asm__ __volatile__(
	"	rep; stosl\n"
	: "=&c" (ecx), "=&D" (edi)
	: "a" (eax), "0" (count / 4), "1" (s)
	: "memory"
	);
	return s;
}
#if 1 /* -51 bytes on shared i386 build with gcc 4.3.0 */
#define memset(s, c, count) ( \
	( !(__builtin_constant_p(c) && __builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memset((s), (c), (count)) \
	: inlined_memset_const_c_count4((s), (c), (count)) \
	)
#endif
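
/* Usage sketch (illustrative, not from the original header): with both
 * arguments constant, e.g.
 *
 *	char buf[8];
 *	memset(buf, ' ', sizeof(buf));
 *
 * the macro resolves to inlined_memset_const_c_count4(buf, ' ', 8), which
 * compiles down to two inlined 4-byte stores of 0x20202020; a non-constant
 * c or count, or a constant count above 24 that is not a multiple of 4,
 * falls back to the out-of-line memset(). */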
static __always_inline
void *inlined_mempcpy_const_count4(void *d, const void *s, unsigned count)
{
	int ecx;
	void *esi, *edi;

	if (count == 0)
		return d;

	if (count == 1) {
		*(char *)d = *(char *)s;
		return d + 1;
	}
	if (count == 2) {
		*(short *)d = *(short *)s;
		return d + 2;
	}
	/* Small string moves: don't clobber ecx
	 * (clobbers only esi and edi) */
#define small_move(arg) { \
	__asm__ __volatile__( \
		arg \
		: "=&S" (esi), "=&D" (edi) \
		: "0" (s), "1" (d) \
		: "memory" \
	); \
	return d + count; \
}
	if (count == 3) small_move("movsw; movsb");
	if (count == 1*4 + 0) {
		*(int *)d = *(int *)s;
		return d + 4;
	}
	if (count == 1*4 + 1) small_move("movsl; movsb");
	if (count == 1*4 + 2) small_move("movsl; movsw");
	if (count == 1*4 + 3) small_move("movsl; movsw; movsb");
	if (count == 2*4 + 0) small_move("movsl; movsl");
	if (count == 2*4 + 1) small_move("movsl; movsl; movsb");
	if (count == 2*4 + 2) small_move("movsl; movsl; movsw");
	if (count == 2*4 + 3) small_move("movsl; movsl; movsw; movsb");
	if (count == 3*4 + 0) small_move("movsl; movsl; movsl");
	if (count == 3*4 + 1) small_move("movsl; movsl; movsl; movsb");
	if (count == 3*4 + 2) small_move("movsl; movsl; movsl; movsw");
	if (count == 3*4 + 3) small_move("movsl; movsl; movsl; movsw; movsb");
	if (count == 4*4 + 0) small_move("movsl; movsl; movsl; movsl");
	if (count == 4*4 + 1) small_move("movsl; movsl; movsl; movsl; movsb");
	/* going over 7 bytes is suboptimal */
	/* movsw is 2-byte insn, so this one takes 6 bytes: */
	if (count == 4*4 + 2) small_move("movsl; movsl; movsl; movsl; movsw");
	if (count == 4*4 + 3) small_move("movsl; movsl; movsl; movsl; movsw; movsb");
	if (count == 5*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl");
	if (count == 5*4 + 1) small_move("movsl; movsl; movsl; movsl; movsl; movsb");
	if (count == 5*4 + 2) small_move("movsl; movsl; movsl; movsl; movsl; movsw");
	/* 8 bytes, but oh well... */
	if (count == 5*4 + 3) small_move("movsl; movsl; movsl; movsl; movsl; movsw; movsb");
	if (count == 6*4 + 0) small_move("movsl; movsl; movsl; movsl; movsl; movsl");
	/* the rest would be 7+ bytes and is handled below instead */

	/* Not small, but multiple-of-4 move.
	 * "mov <const>,%ecx; rep; movsl" sequence is 7 bytes */
	__asm__ __volatile__(
	"	rep; movsl\n"
	: "=&c" (ecx), "=&S" (esi), "=&D" (edi)
	: "0" (count / 4), "1" (s), "2" (d)
	: "memory"
	);
	return d + count;
}
static __always_inline
void *inlined_memcpy_const_count4(void *d, const void *s, unsigned count)
{
	inlined_mempcpy_const_count4(d, s, count);
	return d;
}
#if 1 /* +34 bytes on shared i386 build with gcc 4.3.0 */
#define mempcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? mempcpy((d), (s), (count)) \
	: inlined_mempcpy_const_count4((d), (s), (count)) \
	)
#define memcpy(d, s, count) ( \
	( !(__builtin_constant_p(count)) \
	|| ((count) > (6*4 + 0) && ((count) % 4) != 0) \
	) \
	? memcpy((d), (s), (count)) \
	: inlined_memcpy_const_count4((d), (s), (count)) \
	)
#endif
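
/* Usage sketch (illustrative only): memcpy(dst, src, 6) with the constant
 * length 6 expands to inlined_memcpy_const_count4() and becomes one 4-byte
 * plus one 2-byte inlined move ("movsl; movsw"); a length known only at run
 * time, or e.g. a constant 27 (above 24 and not a multiple of 4), still calls
 * the out-of-line memcpy(). */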
static __always_inline
size_t inlined_strlen(const char *s)
{
	int ecx;
	void *edi;
	__asm__ __volatile__(
	"	repne; scasb\n"
	: "=c" (ecx), "=&D" (edi)
	: "1" (s), "a" (0), "0" (0xffffffffu)
	);
	return -ecx - 2;
}
#if 0 /* +1108 bytes on shared i386 build with gcc 4.3.0 */
#define strlen(s) inlined_strlen(s)
#endif
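
/* Note on inlined_strlen() above: %ecx starts at 0xffffffff (-1) and
 * "repne; scasb" decrements it once per byte examined, including the
 * terminating NUL, so for a string of length n it ends up as
 * -1 - (n + 1) == -(n + 2); hence the return value -ecx - 2. */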
static __always_inline
char *inlined_stpcpy(char *dest, const char *src)
{
	char *esi, *edi;
	int eax;
	__asm__ __volatile__(
	"1:	lodsb\n"
	"	stosb\n"
	"	testb %%al, %%al\n"
	"	jnz 1b\n"
	: "=&S" (esi), "=&D" (edi), "=&a" (eax)
	: "0" (src), "1" (dest)
	: "memory"
	);
	/* edi points one past the copied NUL */
	return edi - 1;
}
static __always_inline
char *inlined_strcpy(char *dest, const char *src)
{
	inlined_stpcpy(dest, src);
	return dest;
}
#if 0 /* +562 bytes on shared i386 build with gcc 4.3.0 */
#define stpcpy(dest, src) inlined_stpcpy(dest, src)
#define strcpy(dest, src) inlined_strcpy(dest, src)
#endif
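
/* Usage sketch (illustrative only): both copies include the terminating NUL;
 * the two functions differ only in what they return:
 *
 *	char buf[16];
 *	char *end = inlined_stpcpy(buf, "abc");	// end == buf + 3, at the NUL
 *	char *ret = inlined_strcpy(buf, "abc");	// ret == buf
 */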
static __always_inline
void *inlined_memchr(const void *s, int c, size_t count)
{
	void *edi;
	size_t ecx;
	/* Unfortunately, c gets loaded to %eax (wide insn), not %al */
	__asm__ __volatile__(
	"	jecxz 1f\n"
	"	repne; scasb\n"
	"	leal -1(%%edi), %%edi\n"
	"	je 2f\n"
	"1:\n"
	"	xorl %%edi, %%edi\n"
	"2:\n"
	: "=&D" (edi), "=&c" (ecx)
	: "a" (c), "0" (s), "1" (count)
	);
	return edi;
}
static __always_inline
void *inlined_memchr_const_c(const void *s, int c, size_t count)
{
#if defined __OPTIMIZE__
	void *edi;
	size_t ecx;
	int eax;
	__asm__ __volatile__(
	"	jecxz 1f\n"
	"	movb %4, %%al\n" /* const c to %%al */
	"	repne; scasb\n"
	"	leal -1(%%edi), %%edi\n"
	"	je 2f\n"
	"1:\n"
	"	xorl %%edi, %%edi\n"
	"2:\n"
	: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
	: "0" (s), "i" (c), "1" (count)
	);
	return edi;
#else
	/* With -O0, gcc can't figure out how to encode CONST c
	 * as an immediate operand. Generating slightly bigger code
	 * (usually "movl CONST,%eax", 3 bytes bigger than needed): */
	void *edi;
	size_t ecx;
	int eax;
	__asm__ __volatile__(
	"	jecxz 1f\n"
	"	repne; scasb\n"
	"	leal -1(%%edi), %%edi\n"
	"	je 2f\n"
	"1:\n"
	"	xorl %%edi, %%edi\n"
	"2:\n"
	: "=&D" (edi), "=&c" (ecx), "=&a" (eax)
	: "0" (s), "2" (c), "1" (count)
	);
	return edi;
#endif
}
#if 1 /* +2 bytes on shared i386 build with gcc 4.3.0 */
#define memchr(s, c, count) ( \
	__builtin_constant_p(c) \
	? inlined_memchr_const_c(s, (c) & 0xff, count) \
	: inlined_memchr(s, c, count) \
	)
#endif
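
/* Note (illustrative only): memchr() compares against (unsigned char)c and
 * scasb only ever compares %al, so a negative char behaves as its low byte:
 *
 *	static const char tbl[] = "\x01\x02\xfe\x04";
 *	void *hit = memchr(tbl, '\xfe', sizeof(tbl));	// yields &tbl[2]
 */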
#endif /* _LIBC_STRING_i386_H */