Remove unused enum CharSetFilter values
[openttd/fttd.git] / src / string.cpp
blob4c129f583dce8319f2d7c43d841ae92684e0d30a
1 /* $Id$ */
3 /*
4 * This file is part of OpenTTD.
5 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
6 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
7 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
8 */
10 /** @file string.cpp Handling of C-type strings (char*). */
12 #include "stdafx.h"
13 #include "debug.h"
14 #include "core/alloc_func.hpp"
15 #include "core/math_func.hpp"
16 #include "string.h"
18 #include "table/control_codes.h"
20 #include <stdarg.h>
21 #include <ctype.h> /* required for tolower() */
23 #ifdef _MSC_VER
24 #include <errno.h> // required by vsnprintf implementation for MSVC
25 #endif
27 #ifdef WITH_ICU_SORT
28 /* Required by strnatcmp. */
29 #include <unicode/ustring.h>
30 #include "language.h"
31 #include "gfx_func.h"
32 #endif /* WITH_ICU_SORT */
35 #ifdef WIN32
36 /* Since version 3.14, MinGW Runtime has snprintf() and vsnprintf() conform to C99 but it's not the case for older versions */
37 #if (__MINGW32_MAJOR_VERSION < 3) || ((__MINGW32_MAJOR_VERSION == 3) && (__MINGW32_MINOR_VERSION < 14))
38 int CDECL snprintf(char *str, size_t size, const char *format, ...)
40 va_list ap;
41 int ret;
43 va_start(ap, format);
44 ret = vsnprintf(str, size, format, ap);
45 va_end(ap);
46 return ret;
48 #endif /* MinGW Runtime < 3.14 */
#if defined(_MSC_VER) && _MSC_VER < 1900
/**
 * Almost POSIX compliant implementation of \c vsnprintf for VC compiler.
 * The difference is in the value returned on output truncation. This
 * implementation returns size whereas a POSIX implementation returns
 * size or more (the number of bytes that would be written to str
 * had size been sufficiently large excluding the terminating null byte).
 * @param str    The destination buffer.
 * @param size   The size of the destination buffer; nothing is written when it is 0.
 * @param format The printf-style format string.
 * @param ap     The arguments for the format string.
 * @return The number of characters stored (excluding the terminator), or
 *         \a size when the output had to be truncated.
 */
int CDECL vsnprintf(char *str, size_t size, const char *format, va_list ap)
{
	if (size == 0) return 0;

	errno = 0;
	const int stored = _vsnprintf(str, size, format, ap);

	/* Everything, including the terminator, fitted in the buffer. */
	if (stored >= 0 && (size_t)stored < size) return stored;

	/* A negative result that is not a range error means the format itself
	 * is broken; better get that looked at than silently ignore it. */
	if (stored < 0 && errno != ERANGE) NOT_REACHED();

	/* Truncated: _vsnprintf left no room for the terminator, so write it
	 * ourselves and report the buffer size. */
	str[size - 1] = '\0';
	return (int)size;
}
#endif /* _MSC_VER */
85 #endif /* WIN32 */
/**
 * Copies characters from one buffer to another.
 *
 * At most \a size - 1 characters of \a src are copied; the destination is
 * always terminated with a null-character.
 *
 * @note usage ttd_strlcpy(dst, src, lengthof(dst));
 * @note lengthof() applies only to fixed size arrays
 *
 * @param dst The destination buffer
 * @param src The buffer containing the string to copy
 * @param size The maximum size of the destination buffer; must be non-zero
 */
void ttd_strlcpy(char *dst, const char *src, size_t size)
{
	assert(size > 0);

	/* One slot is always kept back for the terminator. */
	const size_t room = size - 1;

	size_t i = 0;
	for (; i < room && src[i] != '\0'; i++) {
		dst[i] = src[i];
	}
	dst[i] = '\0';
}
111 /** Allocate a copy of a given string, and error out on failure. */
112 char *xstrdup (const char *s)
114 return (char*) xmemdup (s, strlen(s) + 1);
118 * Allocate a copy of a given string, with bounded size, and error out
119 * on failure.
121 * Note! This is not the same as strndup, because it assumes that the
122 * string passed in is at least of the required size, unlike strndup,
123 * which will check if there is a null in the requested initial segment.
125 char *xstrmemdup (const char *s, size_t n)
127 char *p = xmalloc (n + 1);
128 memcpy (p, s, n);
129 p[n] = '\0';
130 return p;
133 /** Allocate a copy of a given string, with bounded size, and error out on failure. */
134 char *xstrndup (const char *s, size_t n)
136 return xstrmemdup (s, ttd_strnlen (s, n));
139 /** Allocate a formatted string. */
140 char *str_vfmt (const char *fmt, va_list args)
142 #ifdef _GNU_SOURCE
143 char *s;
144 if (vasprintf (&s, fmt, args) == -1) out_of_memory();
145 return s;
146 #else
147 char buf[4096];
148 int len = vsnprintf (buf, lengthof(buf), fmt, args);
149 return (char*) xmemdup (buf, len + 1);
150 #endif
154 * Format, "printf", into a newly allocated string.
155 * @param str The formatting string.
156 * @return The formatted string. You must free this!
158 char *CDECL str_fmt(const char *str, ...)
160 va_list va;
161 va_start(va, str);
162 char *s = str_vfmt (str, va);
163 va_end(va);
164 return s;
#ifdef DEFINE_STRCASESTR
/**
 * Find the first occurrence of a string in another string, comparing
 * with strncasecmp().
 * @param haystack The string to search in.
 * @param needle   The string to search for.
 * @return Pointer to the first match within \a haystack, or NULL when there is none.
 */
char *strcasestr(const char *haystack, const char *needle)
{
	const size_t needle_len = strlen(needle);

	/* Stop as soon as the remainder of the haystack is shorter than the needle. */
	for (size_t remaining = strlen(haystack); remaining >= needle_len; remaining--, haystack++) {
		if (strncasecmp(haystack, needle, needle_len) == 0) return const_cast<char *>(haystack);
	}

	return NULL;
}
#endif /* DEFINE_STRCASESTR */
/**
 * Skip some of the 'garbage' in the string that we don't want to use
 * to sort on. This way the alphabetical sorting will work better as
 * we would be actually using those characters instead of some other
 * characters such as spaces and tildes at the begin of the name.
 *
 * Garbage is every byte below '0' and the ranges ';' to '@', '[' to '`'
 * and '{' to '~'. Bytes are inspected as unsigned values, so the bytes of
 * UTF-8 encoded characters (0x80 and up) are never treated as garbage,
 * regardless of whether plain char is signed on the platform.
 *
 * @param str The string to skip the initial garbage of.
 * @return The string with the garbage skipped.
 */
static const char *SkipGarbage(const char *str)
{
	for (;; str++) {
		const unsigned char c = (unsigned char)*str;

		const bool garbage =
				(c != '\0' && c < '0') ||
				(c >= ';' && c <= '@') ||
				(c >= '[' && c <= '`') ||
				(c >= '{' && c <= '~');

		/* Also stops at the terminator, which is never garbage. */
		if (!garbage) return str;
	}
}
199 * Compares two strings using case insensitive natural sort.
201 * @param s1 First string to compare.
202 * @param s2 Second string to compare.
203 * @param ignore_garbage_at_front Skip punctuation characters in the front
204 * @return Less than zero if s1 < s2, zero if s1 == s2, greater than zero if s1 > s2.
206 int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
208 if (ignore_garbage_at_front) {
209 s1 = SkipGarbage(s1);
210 s2 = SkipGarbage(s2);
212 #ifdef WITH_ICU_SORT
213 if (_current_collator != NULL) {
214 UErrorCode status = U_ZERO_ERROR;
215 int result = _current_collator->compareUTF8(s1, s2, status);
216 if (U_SUCCESS(status)) return result;
219 #endif /* WITH_ICU_SORT */
221 /* Do a normal comparison if ICU is missing or if we cannot create a collator. */
222 return strcasecmp(s1, s2);
/**
 * Convert a given ASCII string to lowercase.
 * NOTE: only support ASCII characters, no UTF8 fancy. As currently
 * the function is only used to lowercase data-filenames if they are
 * not found, this is sufficient. If more, or general functionality is
 * needed, look to r7271 where it was removed because it was broken when
 * using certain locales: eg in Turkish the uppercase 'I' was converted to
 * '?', so just revert to the old functionality
 * @param str string to convert; it is modified in place
 * @return String has changed.
 */
bool strtolower(char *str)
{
	bool changed = false;

	for (; *str != '\0'; str++) {
		/* tolower() is only defined for values representable as unsigned
		 * char (or EOF); a negative plain char would be undefined behaviour. */
		const char lowered = (char)tolower((unsigned char)*str);
		if (lowered != *str) {
			*str = lowered;
			changed = true;
		}
	}

	return changed;
}
248 /* UTF-8 handling */
251 * Decode and consume the next UTF-8 encoded character.
252 * @param c Buffer to place decoded character.
253 * @param s Character stream to retrieve character from.
254 * @return Number of characters in the sequence.
256 size_t Utf8Decode(WChar *c, const char *s)
258 assert(c != NULL);
260 if (!HasBit(s[0], 7)) {
261 /* Single byte character: 0xxxxxxx */
262 *c = s[0];
263 return 1;
264 } else if (GB(s[0], 5, 3) == 6) {
265 if (IsUtf8Part(s[1])) {
266 /* Double byte character: 110xxxxx 10xxxxxx */
267 *c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
268 if (*c >= 0x80) return 2;
270 } else if (GB(s[0], 4, 4) == 14) {
271 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
272 /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
273 *c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
274 if (*c >= 0x800) return 3;
276 } else if (GB(s[0], 3, 5) == 30) {
277 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
278 /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
279 *c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
280 if (*c >= 0x10000 && *c <= 0x10FFFF) return 4;
284 /* DEBUG(misc, 1, "[utf8] invalid UTF-8 sequence"); */
285 *c = '?';
286 return 1;
291 * Encode a unicode character and place it in the buffer.
292 * @param buf Buffer to place character.
293 * @param c Unicode character to encode.
294 * @return Number of characters in the encoded sequence.
296 size_t Utf8Encode(char *buf, WChar c)
298 if (c < 0x80) {
299 *buf = c;
300 return 1;
301 } else if (c < 0x800) {
302 *buf++ = 0xC0 + GB(c, 6, 5);
303 *buf = 0x80 + GB(c, 0, 6);
304 return 2;
305 } else if (c < 0x10000) {
306 *buf++ = 0xE0 + GB(c, 12, 4);
307 *buf++ = 0x80 + GB(c, 6, 6);
308 *buf = 0x80 + GB(c, 0, 6);
309 return 3;
310 } else if (c < 0x110000) {
311 *buf++ = 0xF0 + GB(c, 18, 3);
312 *buf++ = 0x80 + GB(c, 12, 6);
313 *buf++ = 0x80 + GB(c, 6, 6);
314 *buf = 0x80 + GB(c, 0, 6);
315 return 4;
318 /* DEBUG(misc, 1, "[utf8] can't UTF-8 encode value 0x%X", c); */
319 *buf = '?';
320 return 1;
324 * Properly terminate an UTF8 string to some maximum length
325 * @param s string to check if it needs additional trimming
326 * @param maxlen the maximum length the buffer can have.
327 * @return the new length in bytes of the string (eg. strlen(new_string))
328 * @note maxlen is the string length _INCLUDING_ the terminating '\0'
330 size_t Utf8TrimString(char *s, size_t maxlen)
332 size_t length = 0;
334 for (const char *ptr = strchr(s, '\0'); *s != '\0';) {
335 size_t len = Utf8EncodedCharLen(*s);
336 /* Silently ignore invalid UTF8 sequences, our only concern trimming */
337 if (len == 0) len = 1;
339 /* Take care when a hard cutoff was made for the string and
340 * the last UTF8 sequence is invalid */
341 if (length + len >= maxlen || (s + len > ptr)) break;
342 s += len;
343 length += len;
346 *s = '\0';
347 return length;
351 * Get the length of an UTF-8 encoded string in number of characters
352 * and thus not the number of bytes that the encoded string contains.
353 * @param s The string to get the length for.
354 * @return The length of the string in characters.
356 size_t Utf8StringLength(const char *s)
358 size_t len = 0;
359 const char *t = s;
360 while (Utf8Consume(&t) != 0) len++;
361 return len;
365 * Only allow certain keys. You can define the filter to be used. This makes
366 * sure no invalid keys can get into an editbox, like BELL.
367 * @param key character to be checked
368 * @param afilter the filter to use
369 * @return true or false depending if the character is printable/valid or not
371 bool IsValidChar(WChar key, CharSetFilter afilter)
373 switch (afilter) {
374 case CS_ALPHANUMERAL: return IsPrintable(key);
375 case CS_NUMERAL: return (key >= '0' && key <= '9');
376 case CS_HEXADECIMAL: return (key >= '0' && key <= '9') || (key >= 'a' && key <= 'f') || (key >= 'A' && key <= 'F');
379 return false;
383 * Checks whether the given string is valid, i.e. contains only
384 * valid (printable) characters and is properly terminated.
385 * @param str The string to validate.
386 * @param last The last character of the string, i.e. the string
387 * must be terminated here or earlier.
389 bool StrValid(const char *str, const char *last)
391 /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
393 while (str <= last && *str != '\0') {
394 size_t len = Utf8EncodedCharLen(*str);
395 /* Encoded length is 0 if the character isn't known.
396 * The length check is needed to prevent Utf8Decode to read
397 * over the terminating '\0' if that happens to be placed
398 * within the encoding of an UTF8 character. */
399 if (len == 0 || str + len > last) return false;
401 WChar c;
402 len = Utf8Decode(&c, str);
403 if (!IsPrintable(c) || (c >= SCC_SPRITE_START && c <= SCC_SPRITE_END)) {
404 return false;
407 str += len;
410 return *str == '\0';
414 * Scans the string for valid characters and if it finds invalid ones,
415 * replaces them with a question mark '?' (if not ignored)
416 * @param str the string to validate
417 * @param last the last valid character of str
418 * @param settings the settings for the string validation.
420 void str_validate(char *str, const char *last, StringValidationSettings settings)
422 /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
424 char *dst = str;
425 while (str <= last && *str != '\0') {
426 size_t len = Utf8EncodedCharLen(*str);
427 /* If the character is unknown, i.e. encoded length is 0
428 * we assume worst case for the length check.
429 * The length check is needed to prevent Utf8Decode to read
430 * over the terminating '\0' if that happens to be placed
431 * within the encoding of an UTF8 character. */
432 if ((len == 0 && str + 4 > last) || str + len > last) break;
434 WChar c;
435 len = Utf8Decode(&c, str);
436 /* It's possible to encode the string termination character
437 * into a multiple bytes. This prevents those termination
438 * characters to be skipped */
439 if (c == '\0') break;
441 if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
442 /* Copy the character back. Even if dst is current the same as str
443 * (i.e. no characters have been changed) this is quicker than
444 * moving the pointers ahead by len */
445 do {
446 *dst++ = *str++;
447 } while (--len != 0);
448 } else if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\n') {
449 *dst++ = *str++;
450 } else {
451 if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\r' && str[1] == '\n') {
452 str += len;
453 continue;
455 /* Replace the undesirable character with a question mark */
456 str += len;
457 if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) *dst++ = '?';
461 *dst = '\0';
465 * Scans the string for valid characters and if it finds invalid ones,
466 * replaces them with a question mark '?'.
467 * @param str the string to validate
469 void ValidateString(const char *str)
471 /* We know it is '\0' terminated. */
472 str_validate(const_cast<char *>(str), str + strlen(str) + 1);
476 * Scan the string for old values of SCC_ENCODED and fix it to
477 * it's new, static value.
478 * @param str the string to scan
479 * @param last the last valid character of str
481 void str_fix_scc_encoded(char *str, const char *last)
483 while (str <= last && *str != '\0') {
484 size_t len = Utf8EncodedCharLen(*str);
485 if ((len == 0 && str + 4 > last) || str + len > last) break;
487 WChar c;
488 len = Utf8Decode(&c, str);
489 if (c == '\0') break;
491 if (c == 0xE028 || c == 0xE02A) {
492 c = SCC_ENCODED;
494 str += Utf8Encode(str, c);
496 *str = '\0';
499 /** Scans the string for colour codes and strips them */
500 void str_strip_colours(char *str)
502 char *dst = str;
503 WChar c;
504 size_t len;
506 for (len = Utf8Decode(&c, str); c != '\0'; len = Utf8Decode(&c, str)) {
507 if (c < SCC_BLUE || c > SCC_BLACK) {
508 /* Copy the character back. Even if dst is current the same as str
509 * (i.e. no characters have been changed) this is quicker than
510 * moving the pointers ahead by len */
511 do {
512 *dst++ = *str++;
513 } while (--len != 0);
514 } else {
515 /* Just skip (strip) the colour codes */
516 str += len;
519 *dst = '\0';
523 /* buffer-aware string functions */
525 /** Set this string according to a format and args. */
526 bool stringb::fmt (const char *fmt, ...)
528 va_list args;
529 va_start (args, fmt);
530 bool r = vfmt (fmt, args);
531 va_end (args);
532 return r;
535 /** Append to this string according to a format and args. */
536 bool stringb::append_fmt (const char *fmt, ...)
538 va_list args;
539 va_start (args, fmt);
540 bool r = append_vfmt (fmt, args);
541 va_end (args);
542 return r;
545 /** Append a unicode character encoded as utf-8 to the string. */
546 bool stringb::append_utf8 (WChar c)
548 assert (len < capacity);
549 size_t left = capacity - len;
551 if (c < 0x80) {
552 if (left <= 1) return false;
553 buffer[len++] = c;
554 } else if (c < 0x800) {
555 if (left <= 2) return false;
556 buffer[len++] = 0xC0 + GB(c, 6, 5);
557 buffer[len++] = 0x80 + GB(c, 0, 6);
558 } else if (c < 0x10000) {
559 if (left <= 3) return false;
560 buffer[len++] = 0xE0 + GB(c, 12, 4);
561 buffer[len++] = 0x80 + GB(c, 6, 6);
562 buffer[len++] = 0x80 + GB(c, 0, 6);
563 } else if (c < 0x110000) {
564 if (left <= 4) return false;
565 buffer[len++] = 0xF0 + GB(c, 18, 3);
566 buffer[len++] = 0x80 + GB(c, 12, 6);
567 buffer[len++] = 0x80 + GB(c, 6, 6);
568 buffer[len++] = 0x80 + GB(c, 0, 6);
569 } else {
570 /* DEBUG(misc, 1, "[utf8] can't UTF-8 encode value 0x%X", c); */
571 if (left <= 1) return false;
572 buffer[len++] = '?';
575 buffer[len] = '\0';
576 return true;
579 /** Append the hexadecimal representation of an md5sum. */
580 bool stringb::append_md5sum (const uint8 md5sum [16])
582 for (uint i = 0; i < 16; i++) {
583 if (!append_fmt ("%02X", md5sum[i])) return false;
586 return true;