4 * This file is part of OpenTTD.
5 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
6 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
7 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
10 /** @file string.cpp Handling of C-type strings (char*). */
14 #include "core/alloc_func.hpp"
15 #include "core/math_func.hpp"
18 #include "table/control_codes.h"
21 #include <ctype.h> /* required for tolower() */
24 #include <errno.h> // required by vsnprintf implementation for MSVC
28 /* Required by strnatcmp. */
29 #include <unicode/ustring.h>
32 #endif /* WITH_ICU_SORT */
36 /* Since version 3.14, MinGW Runtime has snprintf() and vsnprintf() conform to C99 but it's not the case for older versions */
37 #if (__MINGW32_MAJOR_VERSION < 3) || ((__MINGW32_MAJOR_VERSION == 3) && (__MINGW32_MINOR_VERSION < 14))
38 int CDECL
snprintf(char *str
, size_t size
, const char *format
, ...)
44 ret
= vsnprintf(str
, size
, format
, ap
);
48 #endif /* MinGW Runtime < 3.14 */
50 #if defined(_MSC_VER) && _MSC_VER < 1900
52 * Almost POSIX compliant implementation of \c vsnprintf for VC compiler.
53 * The difference is in the value returned on output truncation. This
54 * implementation returns size whereas a POSIX implementation returns
55 * size or more (the number of bytes that would be written to str
56 * had size been sufficiently large excluding the terminating null byte).
58 int CDECL
vsnprintf(char *str
, size_t size
, const char *format
, va_list ap
)
60 if (size
== 0) return 0;
63 int ret
= _vsnprintf(str
, size
, format
, ap
);
66 if (errno
!= ERANGE
) {
67 /* There's a formatting error, better get that looked
68 * at properly instead of ignoring it. */
71 } else if ((size_t)ret
< size
) {
72 /* The buffer is big enough for the number of
73 * characters stored (excluding null), i.e.
74 * the string has been null-terminated. */
78 /* The buffer is too small for _vsnprintf to write the
79 * null-terminator at its end and return size. */
/**
 * Copies characters from one buffer to another.
 *
 * Copies the source string to the destination buffer with respect of the
 * terminating null-character and the maximum size of the destination
 * buffer: at most \a size - 1 characters are copied and the result is
 * always null-terminated. A source that does not fit is silently truncated.
 *
 * @note usage ttd_strlcpy(dst, src, lengthof(dst));
 * @note lengthof() applies only to fixed size arrays
 *
 * @param dst The destination buffer
 * @param src The buffer containing the string to copy
 * @param size The maximum size of the destination buffer (must be > 0)
 */
void ttd_strlcpy(char *dst, const char *src, size_t size)
{
	assert(size > 0);
	/* When asserts are compiled out a zero size would make --size wrap
	 * around and the copy would run without any bound. There is not even
	 * room for the terminator then, so leave the buffer untouched. */
	if (size == 0) return;

	while (--size > 0 && *src != '\0') {
		*dst++ = *src++;
	}
	*dst = '\0';
}
111 /** Allocate a copy of a given string, and error out on failure. */
112 char *xstrdup (const char *s
)
114 return (char*) xmemdup (s
, strlen(s
) + 1);
118 * Allocate a copy of a given string, with bounded size, and error out
121 * Note! This is not the same as strndup, because it assumes that the
122 * string passed in is at least of the required size, unlike strndup,
123 * which will check if there is a null in the requested initial segment.
125 char *xstrmemdup (const char *s
, size_t n
)
127 char *p
= xmalloc (n
+ 1);
133 /** Allocate a copy of a given string, with bounded size, and error out on failure. */
134 char *xstrndup (const char *s
, size_t n
)
136 return xstrmemdup (s
, ttd_strnlen (s
, n
));
139 /** Allocate a formatted string. */
/* Takes a printf-style format plus an already started va_list and returns
 * the formatted text. Two formatting paths are visible below (vasprintf and
 * vsnprintf + xmemdup); presumably a preprocessor conditional that is not
 * visible here selects one of them -- TODO confirm. */
140 char *str_vfmt (const char *fmt
, va_list args
)
/* vasprintf() allocates the result itself; its -1 failure result is
 * treated as running out of memory. */
144 if (vasprintf (&s
, fmt
, args
) == -1) out_of_memory();
/* Format into the local buffer; len is what vsnprintf() reports, i.e.
 * the length excluding the terminator. */
148 int len
= vsnprintf (buf
, lengthof(buf
), fmt
, args
);
/* Duplicate the formatted text including its terminator.
 * NOTE(review): len is used unchecked. If vsnprintf() had to truncate
 * (len >= lengthof(buf)) or failed (len < 0), len + 1 no longer matches
 * what was actually written to buf -- confirm and clamp if needed. */
149 return (char*) xmemdup (buf
, len
+ 1);
154 * Format, "printf", into a newly allocated string.
155 * @param str The formatting string.
156 * @return The formatted string. You must free this!
158 char *CDECL
str_fmt(const char *str
, ...)
162 char *s
= str_vfmt (str
, va
);
#ifdef DEFINE_STRCASESTR
/**
 * Case insensitive search for a string within another string.
 * Only compiled when DEFINE_STRCASESTR is defined.
 * @param haystack The string to search in.
 * @param needle   The string to search for.
 * @return Pointer to the first match within \a haystack, or NULL when there is none.
 */
char *strcasestr(const char *haystack, const char *needle)
{
	const size_t needle_len = strlen(needle);

	/* Slide over the haystack for as long as the needle can still fit. */
	for (size_t remaining = strlen(haystack); remaining >= needle_len; haystack++, remaining--) {
		if (strncasecmp(haystack, needle, needle_len) == 0) return const_cast<char *>(haystack);
	}

	return NULL;
}
#endif /* DEFINE_STRCASESTR */
185 * Skip some of the 'garbage' in the string that we don't want to use
186 * to sort on. This way the alphabetical sorting will work better as
187 * we would be actually using those characters instead of some other
188 * characters such as spaces and tildes at the begin of the name.
189 * @param str The string to skip the initial garbage of.
190 * @return The string with the garbage skipped.
192 static const char *SkipGarbage(const char *str
)
194 while (*str
!= '\0' && (*str
< '0' || IsInsideMM(*str
, ';', '@' + 1) || IsInsideMM(*str
, '[', '`' + 1) || IsInsideMM(*str
, '{', '~' + 1))) str
++;
199 * Compares two strings using case insensitive natural sort.
201 * @param s1 First string to compare.
202 * @param s2 Second string to compare.
203 * @param ignore_garbage_at_front Skip punctuation characters in the front
204 * @return Less than zero if s1 < s2, zero if s1 == s2, greater than zero if s1 > s2.
206 int strnatcmp(const char *s1
, const char *s2
, bool ignore_garbage_at_front
)
208 if (ignore_garbage_at_front
) {
209 s1
= SkipGarbage(s1
);
210 s2
= SkipGarbage(s2
);
213 if (_current_collator
!= NULL
) {
214 UErrorCode status
= U_ZERO_ERROR
;
215 int result
= _current_collator
->compareUTF8(s1
, s2
, status
);
216 if (U_SUCCESS(status
)) return result
;
219 #endif /* WITH_ICU_SORT */
221 /* Do a normal comparison if ICU is missing or if we cannot create a collator. */
222 return strcasecmp(s1
, s2
);
/**
 * Convert a given ASCII string to lowercase.
 * NOTE: only support ASCII characters, no UTF8 fancy. As currently
 * the function is only used to lowercase data-filenames if they are
 * not found, this is sufficient.
 *
 * The mapping is done by hand instead of via tolower(): tolower() depends
 * on the active locale (e.g. in Turkish the uppercase 'I' is not mapped
 * to 'i') and must not be given negative \c char values, which is what
 * the bytes of UTF-8 sequences are when \c char is signed. Bytes outside
 * 'A'..'Z' are therefore always left untouched.
 *
 * @param str string to convert
 * @return String has changed.
 */
bool strtolower(char *str)
{
	bool changed = false;
	for (; *str != '\0'; str++) {
		if (*str >= 'A' && *str <= 'Z') {
			*str += 'a' - 'A';
			changed = true;
		}
	}
	return changed;
}
251 * Decode and consume the next UTF-8 encoded character.
252 * @param c Buffer to place decoded character.
253 * @param s Character stream to retrieve character from.
254 * @return Number of characters in the sequence.
256 size_t Utf8Decode(WChar
*c
, const char *s
)
260 if (!HasBit(s
[0], 7)) {
261 /* Single byte character: 0xxxxxxx */
264 } else if (GB(s
[0], 5, 3) == 6) {
265 if (IsUtf8Part(s
[1])) {
266 /* Double byte character: 110xxxxx 10xxxxxx */
267 *c
= GB(s
[0], 0, 5) << 6 | GB(s
[1], 0, 6);
268 if (*c
>= 0x80) return 2;
270 } else if (GB(s
[0], 4, 4) == 14) {
271 if (IsUtf8Part(s
[1]) && IsUtf8Part(s
[2])) {
272 /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
273 *c
= GB(s
[0], 0, 4) << 12 | GB(s
[1], 0, 6) << 6 | GB(s
[2], 0, 6);
274 if (*c
>= 0x800) return 3;
276 } else if (GB(s
[0], 3, 5) == 30) {
277 if (IsUtf8Part(s
[1]) && IsUtf8Part(s
[2]) && IsUtf8Part(s
[3])) {
278 /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
279 *c
= GB(s
[0], 0, 3) << 18 | GB(s
[1], 0, 6) << 12 | GB(s
[2], 0, 6) << 6 | GB(s
[3], 0, 6);
280 if (*c
>= 0x10000 && *c
<= 0x10FFFF) return 4;
284 /* DEBUG(misc, 1, "[utf8] invalid UTF-8 sequence"); */
291 * Encode a unicode character and place it in the buffer.
292 * @param buf Buffer to place character.
293 * @param c Unicode character to encode.
294 * @return Number of characters in the encoded sequence.
296 size_t Utf8Encode(char *buf
, WChar c
)
301 } else if (c
< 0x800) {
302 *buf
++ = 0xC0 + GB(c
, 6, 5);
303 *buf
= 0x80 + GB(c
, 0, 6);
305 } else if (c
< 0x10000) {
306 *buf
++ = 0xE0 + GB(c
, 12, 4);
307 *buf
++ = 0x80 + GB(c
, 6, 6);
308 *buf
= 0x80 + GB(c
, 0, 6);
310 } else if (c
< 0x110000) {
311 *buf
++ = 0xF0 + GB(c
, 18, 3);
312 *buf
++ = 0x80 + GB(c
, 12, 6);
313 *buf
++ = 0x80 + GB(c
, 6, 6);
314 *buf
= 0x80 + GB(c
, 0, 6);
318 /* DEBUG(misc, 1, "[utf8] can't UTF-8 encode value 0x%X", c); */
324 * Properly terminate an UTF8 string to some maximum length
325 * @param s string to check if it needs additional trimming
326 * @param maxlen the maximum length the buffer can have.
327 * @return the new length in bytes of the string (eg. strlen(new_string))
328 * @note maxlen is the string length _INCLUDING_ the terminating '\0'
330 size_t Utf8TrimString(char *s
, size_t maxlen
)
334 for (const char *ptr
= strchr(s
, '\0'); *s
!= '\0';) {
335 size_t len
= Utf8EncodedCharLen(*s
);
336 /* Silently ignore invalid UTF8 sequences, our only concern trimming */
337 if (len
== 0) len
= 1;
339 /* Take care when a hard cutoff was made for the string and
340 * the last UTF8 sequence is invalid */
341 if (length
+ len
>= maxlen
|| (s
+ len
> ptr
)) break;
351 * Get the length of an UTF-8 encoded string in number of characters
352 * and thus not the number of bytes that the encoded string contains.
353 * @param s The string to get the length for.
354 * @return The length of the string in characters.
356 size_t Utf8StringLength(const char *s
)
360 while (Utf8Consume(&t
) != 0) len
++;
365 * Only allow certain keys. You can define the filter to be used. This makes
366 * sure no invalid keys can get into an editbox, like BELL.
367 * @param key character to be checked
368 * @param afilter the filter to use
369 * @return true or false depending if the character is printable/valid or not
/* Decide per filter whether key is acceptable. Each visible case returns
 * the verdict for one CharSetFilter value.
 * NOTE(review): only three cases are visible here; whether further filter
 * values are handled, and what is returned for them, cannot be seen in
 * this view -- TODO confirm. */
371 bool IsValidChar(WChar key
, CharSetFilter afilter
)
/* Alphanumeral: anything IsPrintable() accepts. */
374 case CS_ALPHANUMERAL
: return IsPrintable(key
);
/* Numeral: the ASCII digits '0'..'9' only. */
375 case CS_NUMERAL
: return (key
>= '0' && key
<= '9');
/* Hexadecimal: ASCII digits plus 'a'..'f' in either case. */
376 case CS_HEXADECIMAL
: return (key
>= '0' && key
<= '9') || (key
>= 'a' && key
<= 'f') || (key
>= 'A' && key
<= 'F');
383 * Checks whether the given string is valid, i.e. contains only
384 * valid (printable) characters and is properly terminated.
385 * @param str The string to validate.
386 * @param last The last character of the string, i.e. the string
387 * must be terminated here or earlier.
389 bool StrValid(const char *str
, const char *last
)
391 /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
393 while (str
<= last
&& *str
!= '\0') {
394 size_t len
= Utf8EncodedCharLen(*str
);
395 /* Encoded length is 0 if the character isn't known.
396 * The length check is needed to prevent Utf8Decode to read
397 * over the terminating '\0' if that happens to be placed
398 * within the encoding of an UTF8 character. */
399 if (len
== 0 || str
+ len
> last
) return false;
402 len
= Utf8Decode(&c
, str
);
403 if (!IsPrintable(c
) || (c
>= SCC_SPRITE_START
&& c
<= SCC_SPRITE_END
)) {
414 * Scans the string for valid characters and if it finds invalid ones,
415 * replaces them with a question mark '?' (if not ignored)
416 * @param str the string to validate
417 * @param last the last valid character of str
418 * @param settings the settings for the string validation.
420 void str_validate(char *str
, const char *last
, StringValidationSettings settings
)
422 /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
425 while (str
<= last
&& *str
!= '\0') {
426 size_t len
= Utf8EncodedCharLen(*str
);
427 /* If the character is unknown, i.e. encoded length is 0
428 * we assume worst case for the length check.
429 * The length check is needed to prevent Utf8Decode to read
430 * over the terminating '\0' if that happens to be placed
431 * within the encoding of an UTF8 character. */
432 if ((len
== 0 && str
+ 4 > last
) || str
+ len
> last
) break;
435 len
= Utf8Decode(&c
, str
);
436 /* It's possible to encode the string termination character
437 * into a multiple bytes. This prevents those termination
438 * characters to be skipped */
439 if (c
== '\0') break;
441 if ((IsPrintable(c
) && (c
< SCC_SPRITE_START
|| c
> SCC_SPRITE_END
)) || ((settings
& SVS_ALLOW_CONTROL_CODE
) != 0 && c
== SCC_ENCODED
)) {
442 /* Copy the character back. Even if dst is current the same as str
443 * (i.e. no characters have been changed) this is quicker than
444 * moving the pointers ahead by len */
447 } while (--len
!= 0);
448 } else if ((settings
& SVS_ALLOW_NEWLINE
) != 0 && c
== '\n') {
451 if ((settings
& SVS_ALLOW_NEWLINE
) != 0 && c
== '\r' && str
[1] == '\n') {
455 /* Replace the undesirable character with a question mark */
457 if ((settings
& SVS_REPLACE_WITH_QUESTION_MARK
) != 0) *dst
++ = '?';
465 * Scans the string for valid characters and if it finds invalid ones,
466 * replaces them with a question mark '?'.
467 * @param str the string to validate
469 void ValidateString(const char *str
)
471 /* We know it is '\0' terminated. */
472 str_validate(const_cast<char *>(str
), str
+ strlen(str
) + 1);
476 * Scan the string for old values of SCC_ENCODED and fix it to
477 * it's new, static value.
478 * @param str the string to scan
479 * @param last the last valid character of str
481 void str_fix_scc_encoded(char *str
, const char *last
)
483 while (str
<= last
&& *str
!= '\0') {
484 size_t len
= Utf8EncodedCharLen(*str
);
485 if ((len
== 0 && str
+ 4 > last
) || str
+ len
> last
) break;
488 len
= Utf8Decode(&c
, str
);
489 if (c
== '\0') break;
491 if (c
== 0xE028 || c
== 0xE02A) {
494 str
+= Utf8Encode(str
, c
);
499 /** Scans the string for colour codes and strips them */
500 void str_strip_colours(char *str
)
506 for (len
= Utf8Decode(&c
, str
); c
!= '\0'; len
= Utf8Decode(&c
, str
)) {
507 if (c
< SCC_BLUE
|| c
> SCC_BLACK
) {
508 /* Copy the character back. Even if dst is current the same as str
509 * (i.e. no characters have been changed) this is quicker than
510 * moving the pointers ahead by len */
513 } while (--len
!= 0);
515 /* Just skip (strip) the colour codes */
523 /* buffer-aware string functions */
525 /** Set this string according to a format and args. */
526 bool stringb::fmt (const char *fmt
, ...)
529 va_start (args
, fmt
);
530 bool r
= vfmt (fmt
, args
);
535 /** Append to this string according to a format and args. */
536 bool stringb::append_fmt (const char *fmt
, ...)
539 va_start (args
, fmt
);
540 bool r
= append_vfmt (fmt
, args
);
545 /** Append a unicode character encoded as utf-8 to the string. */
546 bool stringb::append_utf8 (WChar c
)
548 assert (len
< capacity
);
549 size_t left
= capacity
- len
;
552 if (left
<= 1) return false;
554 } else if (c
< 0x800) {
555 if (left
<= 2) return false;
556 buffer
[len
++] = 0xC0 + GB(c
, 6, 5);
557 buffer
[len
++] = 0x80 + GB(c
, 0, 6);
558 } else if (c
< 0x10000) {
559 if (left
<= 3) return false;
560 buffer
[len
++] = 0xE0 + GB(c
, 12, 4);
561 buffer
[len
++] = 0x80 + GB(c
, 6, 6);
562 buffer
[len
++] = 0x80 + GB(c
, 0, 6);
563 } else if (c
< 0x110000) {
564 if (left
<= 4) return false;
565 buffer
[len
++] = 0xF0 + GB(c
, 18, 3);
566 buffer
[len
++] = 0x80 + GB(c
, 12, 6);
567 buffer
[len
++] = 0x80 + GB(c
, 6, 6);
568 buffer
[len
++] = 0x80 + GB(c
, 0, 6);
570 /* DEBUG(misc, 1, "[utf8] can't UTF-8 encode value 0x%X", c); */
571 if (left
<= 1) return false;
579 /** Append the hexadecimal representation of an md5sum. */
580 bool stringb::append_md5sum (const uint8 md5sum
[16])
582 for (uint i
= 0; i
< 16; i
++) {
583 if (!append_fmt ("%02X", md5sum
[i
])) return false;