src/string.cpp

   1 /* $Id$ */
   2
   3 /*
   4  * This file is part of OpenTTD.
   5  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
   6  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   7  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
   8  */
   9
  10 /** @file string.cpp Handling of C-type strings (char*). */
  11
  12 #include "stdafx.h"
  13 #include "debug.h"
  14 #include "core/alloc_func.hpp"
  15 #include "core/math_func.hpp"
  16 #include "string.h"
  17
  18 #include "table/control_codes.h"
  19
  20 #include <stdarg.h>
  21 #include <ctype.h> /* required for tolower() */
  22
  23 #ifdef _MSC_VER
  24 #include <errno.h> // required by vsnprintf implementation for MSVC
  25 #endif
  26
  27 #ifdef WITH_ICU_SORT
  28 /* Required by strnatcmp. */
  29 #include <unicode/ustring.h>
  30 #include "language.h"
  31 #include "gfx_func.h"
  32 #endif /* WITH_ICU_SORT */
  33
  34
  35 #ifdef WIN32
/* Since version 3.14 the MinGW runtime provides C99-conforming snprintf() and vsnprintf(); older versions do not. */
  37 #if (__MINGW32_MAJOR_VERSION < 3) || ((__MINGW32_MAJOR_VERSION == 3) && (__MINGW32_MINOR_VERSION < 14))
  38 int CDECL snprintf(char *str, size_t size, const char *format, ...)
  39 {
  40         va_list ap;
  41         int ret;
  42
  43         va_start(ap, format);
  44         ret = vsnprintf(str, size, format, ap);
  45         va_end(ap);
  46         return ret;
  47 }
  48 #endif /* MinGW Runtime < 3.14 */
  49
  50 #if defined(_MSC_VER) && _MSC_VER < 1900
  51 /**
  52  * Almost POSIX compliant implementation of \c vsnprintf for VC compiler.
  53  * The difference is in the value returned on output truncation. This
  54  * implementation returns size whereas a POSIX implementation returns
  55  * size or more (the number of bytes that would be written to str
  56  * had size been sufficiently large excluding the terminating null byte).
  57  */
  58 int CDECL vsnprintf(char *str, size_t size, const char *format, va_list ap)
  59 {
  60         if (size == 0) return 0;
  61
  62         errno = 0;
  63         int ret = _vsnprintf(str, size, format, ap);
  64
  65         if (ret < 0) {
  66                 if (errno != ERANGE) {
  67                         /* There's a formatting error, better get that looked
  68                          * at properly instead of ignoring it. */
  69                         NOT_REACHED();
  70                 }
  71         } else if ((size_t)ret < size) {
  72                 /* The buffer is big enough for the number of
  73                  * characters stored (excluding null), i.e.
  74                  * the string has been null-terminated. */
  75                 return ret;
  76         }
  77
  78         /* The buffer is too small for _vsnprintf to write the
  79          * null-terminator at its end and return size. */
  80         str[size - 1] = '\0';
  81         return (int)size;
  82 }
  83 #endif /* _MSC_VER */
  84
  85 #endif /* WIN32 */
  86
  87 /**
  88  * Copies characters from one buffer to another.
  89  *
  90  * Copies the source string to the destination buffer with respect of the
  91  * terminating null-character and the maximum size of the destination
  92  * buffer.
  93  *
  94  * @note usage ttd_strlcpy(dst, src, lengthof(dst));
  95  * @note lengthof() applies only to fixed size arrays
  96  *
  97  * @param dst The destination buffer
  98  * @param src The buffer containing the string to copy
  99  * @param size The maximum size of the destination buffer
 100  */
 101 void ttd_strlcpy(char *dst, const char *src, size_t size)
 102 {
 103         assert(size > 0);
 104         while (--size > 0 && *src != '\0') {
 105                 *dst++ = *src++;
 106         }
 107         *dst = '\0';
 108 }
 109
 110
 111 /** Allocate a copy of a given string, and error out on failure. */
 112 char *xstrdup (const char *s)
 113 {
 114         return (char*) xmemdup (s, strlen(s) + 1);
 115 }
 116
 117 /**
 118  * Allocate a copy of a given string, with bounded size, and error out
 119  * on failure.
 120  *
 121  * Note! This is not the same as strndup, because it assumes that the
 122  * string passed in is at least of the required size, unlike strndup,
 123  * which will check if there is a null in the requested initial segment.
 124  */
 125 char *xstrmemdup (const char *s, size_t n)
 126 {
 127         char *p = xmalloc (n + 1);
 128         memcpy (p, s, n);
 129         p[n] = '\0';
 130         return p;
 131 }
 132
 133 /** Allocate a copy of a given string, with bounded size, and error out on failure. */
 134 char *xstrndup (const char *s, size_t n)
 135 {
 136         return xstrmemdup (s, ttd_strnlen (s, n));
 137 }
 138
 139 /** Allocate a formatted string. */
 140 char *str_vfmt (const char *fmt, va_list args)
 141 {
 142 #ifdef _GNU_SOURCE
 143         char *s;
 144         if (vasprintf (&s, fmt, args) == -1) out_of_memory();
 145         return s;
 146 #else
 147         char buf[4096];
 148         int len = vsnprintf (buf, lengthof(buf), fmt, args);
 149         return (char*) xmemdup (buf, len + 1);
 150 #endif
 151 }
 152
 153 /**
 154  * Format, "printf", into a newly allocated string.
 155  * @param str The formatting string.
 156  * @return The formatted string. You must free this!
 157  */
 158 char *CDECL str_fmt(const char *str, ...)
 159 {
 160         va_list va;
 161         va_start(va, str);
 162         char *s = str_vfmt (str, va);
 163         va_end(va);
 164         return s;
 165 }
 166
 167
 168 #ifdef DEFINE_STRCASESTR
 169 char *strcasestr(const char *haystack, const char *needle)
 170 {
 171         size_t hay_len = strlen(haystack);
 172         size_t needle_len = strlen(needle);
 173         while (hay_len >= needle_len) {
 174                 if (strncasecmp(haystack, needle, needle_len) == 0) return const_cast<char *>(haystack);
 175
 176                 haystack++;
 177                 hay_len--;
 178         }
 179
 180         return NULL;
 181 }
 182 #endif /* DEFINE_STRCASESTR */
 183
 184 /**
 185  * Skip some of the 'garbage' in the string that we don't want to use
 186  * to sort on. This way the alphabetical sorting will work better as
 187  * we would be actually using those characters instead of some other
 188  * characters such as spaces and tildes at the begin of the name.
 189  * @param str The string to skip the initial garbage of.
 190  * @return The string with the garbage skipped.
 191  */
 192 static const char *SkipGarbage(const char *str)
 193 {
 194         while (*str != '\0' && (*str < '0' || IsInsideMM(*str, ';', '@' + 1) || IsInsideMM(*str, '[', '`' + 1) || IsInsideMM(*str, '{', '~' + 1))) str++;
 195         return str;
 196 }
 197
 198 /**
 199  * Compares two strings using case insensitive natural sort.
 200  *
 201  * @param s1 First string to compare.
 202  * @param s2 Second string to compare.
 203  * @param ignore_garbage_at_front Skip punctuation characters in the front
 204  * @return Less than zero if s1 < s2, zero if s1 == s2, greater than zero if s1 > s2.
 205  */
 206 int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
 207 {
 208         if (ignore_garbage_at_front) {
 209                 s1 = SkipGarbage(s1);
 210                 s2 = SkipGarbage(s2);
 211         }
 212 #ifdef WITH_ICU_SORT
 213         if (_current_collator != NULL) {
 214                 UErrorCode status = U_ZERO_ERROR;
 215                 int result = _current_collator->compareUTF8(s1, s2, status);
 216                 if (U_SUCCESS(status)) return result;
 217         }
 218
 219 #endif /* WITH_ICU_SORT */
 220
 221         /* Do a normal comparison if ICU is missing or if we cannot create a collator. */
 222         return strcasecmp(s1, s2);
 223 }
 224
 225 /**
 226  * Convert a given ASCII string to lowercase.
 227  * NOTE: only support ASCII characters, no UTF8 fancy. As currently
 228  * the function is only used to lowercase data-filenames if they are
 229  * not found, this is sufficient. If more, or general functionality is
 230  * needed, look to r7271 where it was removed because it was broken when
 231  * using certain locales: eg in Turkish the uppercase 'I' was converted to
 232  * '?', so just revert to the old functionality
 233  * @param str string to convert
 234  * @return String has changed.
 235  */
 236 bool strtolower(char *str)
 237 {
 238         bool changed = false;
 239         for (; *str != '\0'; str++) {
 240                 char new_str = tolower(*str);
 241                 changed |= new_str != *str;
 242                 *str = new_str;
 243         }
 244         return changed;
 245 }
 246
 247
 248 /* UTF-8 handling */
 249
 250 /**
 251  * Decode and consume the next UTF-8 encoded character.
 252  * @param c Buffer to place decoded character.
 253  * @param s Character stream to retrieve character from.
 254  * @return Number of characters in the sequence.
 255  */
 256 size_t Utf8Decode(WChar *c, const char *s)
 257 {
 258         assert(c != NULL);
 259
 260         if (!HasBit(s[0], 7)) {
 261                 /* Single byte character: 0xxxxxxx */
 262                 *c = s[0];
 263                 return 1;
 264         } else if (GB(s[0], 5, 3) == 6) {
 265                 if (IsUtf8Part(s[1])) {
 266                         /* Double byte character: 110xxxxx 10xxxxxx */
 267                         *c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
 268                         if (*c >= 0x80) return 2;
 269                 }
 270         } else if (GB(s[0], 4, 4) == 14) {
 271                 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
 272                         /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
 273                         *c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
 274                         if (*c >= 0x800) return 3;
 275                 }
 276         } else if (GB(s[0], 3, 5) == 30) {
 277                 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
 278                         /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
 279                         *c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
 280                         if (*c >= 0x10000 && *c <= 0x10FFFF) return 4;
 281                 }
 282         }
 283
 284         /* DEBUG(misc, 1, "[utf8] invalid UTF-8 sequence"); */
 285         *c = '?';
 286         return 1;
 287 }
 288
 289
 290 /**
 291  * Encode a unicode character and place it in the buffer.
 292  * @param buf Buffer to place character.
 293  * @param c   Unicode character to encode.
 294  * @return Number of characters in the encoded sequence.
 295  */
 296 size_t Utf8Encode(char *buf, WChar c)
 297 {
 298         if (c < 0x80) {
 299                 *buf = c;
 300                 return 1;
 301         } else if (c < 0x800) {
 302                 *buf++ = 0xC0 + GB(c,  6, 5);
 303                 *buf   = 0x80 + GB(c,  0, 6);
 304                 return 2;
 305         } else if (c < 0x10000) {
 306                 *buf++ = 0xE0 + GB(c, 12, 4);
 307                 *buf++ = 0x80 + GB(c,  6, 6);
 308                 *buf   = 0x80 + GB(c,  0, 6);
 309                 return 3;
 310         } else if (c < 0x110000) {
 311                 *buf++ = 0xF0 + GB(c, 18, 3);
 312                 *buf++ = 0x80 + GB(c, 12, 6);
 313                 *buf++ = 0x80 + GB(c,  6, 6);
 314                 *buf   = 0x80 + GB(c,  0, 6);
 315                 return 4;
 316         }
 317
 318         /* DEBUG(misc, 1, "[utf8] can't UTF-8 encode value 0x%X", c); */
 319         *buf = '?';
 320         return 1;
 321 }
 322
 323 /**
 324  * Properly terminate an UTF8 string to some maximum length
 325  * @param s string to check if it needs additional trimming
 326  * @param maxlen the maximum length the buffer can have.
 327  * @return the new length in bytes of the string (eg. strlen(new_string))
 328  * @note maxlen is the string length _INCLUDING_ the terminating '\0'
 329  */
 330 size_t Utf8TrimString(char *s, size_t maxlen)
 331 {
 332         size_t length = 0;
 333
 334         for (const char *ptr = strchr(s, '\0'); *s != '\0';) {
 335                 size_t len = Utf8EncodedCharLen(*s);
 336                 /* Silently ignore invalid UTF8 sequences, our only concern trimming */
 337                 if (len == 0) len = 1;
 338
 339                 /* Take care when a hard cutoff was made for the string and
 340                  * the last UTF8 sequence is invalid */
 341                 if (length + len >= maxlen || (s + len > ptr)) break;
 342                 s += len;
 343                 length += len;
 344         }
 345
 346         *s = '\0';
 347         return length;
 348 }
 349
 350 /**
 351  * Get the length of an UTF-8 encoded string in number of characters
 352  * and thus not the number of bytes that the encoded string contains.
 353  * @param s The string to get the length for.
 354  * @return The length of the string in characters.
 355  */
 356 size_t Utf8StringLength(const char *s)
 357 {
 358         size_t len = 0;
 359         const char *t = s;
 360         while (Utf8Consume(&t) != 0) len++;
 361         return len;
 362 }
 363
 364 /**
 365  * Only allow certain keys. You can define the filter to be used. This makes
 366  *  sure no invalid keys can get into an editbox, like BELL.
 367  * @param key character to be checked
 368  * @param afilter the filter to use
 369  * @return true or false depending if the character is printable/valid or not
 370  */
 371 bool IsValidChar(WChar key, CharSetFilter afilter)
 372 {
 373         switch (afilter) {
 374                 case CS_ALPHANUMERAL:  return IsPrintable(key);
 375                 case CS_NUMERAL:       return (key >= '0' && key <= '9');
 376                 case CS_HEXADECIMAL:   return (key >= '0' && key <= '9') || (key >= 'a' && key <= 'f') || (key >= 'A' && key <= 'F');
 377         }
 378
 379         return false;
 380 }
 381
 382 /**
 383  * Checks whether the given string is valid, i.e. contains only
 384  * valid (printable) characters and is properly terminated.
 385  * @param str  The string to validate.
 386  * @param last The last character of the string, i.e. the string
 387  *             must be terminated here or earlier.
 388  */
 389 bool StrValid(const char *str, const char *last)
 390 {
 391         /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
 392
 393         while (str <= last && *str != '\0') {
 394                 size_t len = Utf8EncodedCharLen(*str);
 395                 /* Encoded length is 0 if the character isn't known.
 396                  * The length check is needed to prevent Utf8Decode to read
 397                  * over the terminating '\0' if that happens to be placed
 398                  * within the encoding of an UTF8 character. */
 399                 if (len == 0 || str + len > last) return false;
 400
 401                 WChar c;
 402                 len = Utf8Decode(&c, str);
 403                 if (!IsPrintable(c) || (c >= SCC_SPRITE_START && c <= SCC_SPRITE_END)) {
 404                         return false;
 405                 }
 406
 407                 str += len;
 408         }
 409
 410         return *str == '\0';
 411 }
 412
 413 /**
 414  * Scans the string for valid characters and if it finds invalid ones,
 415  * replaces them with a question mark '?' (if not ignored)
 416  * @param str the string to validate
 417  * @param last the last valid character of str
 418  * @param settings the settings for the string validation.
 419  */
 420 void str_validate(char *str, const char *last, StringValidationSettings settings)
 421 {
 422         /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
 423
 424         char *dst = str;
 425         while (str <= last && *str != '\0') {
 426                 size_t len = Utf8EncodedCharLen(*str);
 427                 /* If the character is unknown, i.e. encoded length is 0
 428                  * we assume worst case for the length check.
 429                  * The length check is needed to prevent Utf8Decode to read
 430                  * over the terminating '\0' if that happens to be placed
 431                  * within the encoding of an UTF8 character. */
 432                 if ((len == 0 && str + 4 > last) || str + len > last) break;
 433
 434                 WChar c;
 435                 len = Utf8Decode(&c, str);
 436                 /* It's possible to encode the string termination character
 437                  * into a multiple bytes. This prevents those termination
 438                  * characters to be skipped */
 439                 if (c == '\0') break;
 440
 441                 if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
 442                         /* Copy the character back. Even if dst is current the same as str
 443                          * (i.e. no characters have been changed) this is quicker than
 444                          * moving the pointers ahead by len */
 445                         do {
 446                                 *dst++ = *str++;
 447                         } while (--len != 0);
 448                 } else if ((settings & SVS_ALLOW_NEWLINE) != 0  && c == '\n') {
 449                         *dst++ = *str++;
 450                 } else {
 451                         if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\r' && str[1] == '\n') {
 452                                 str += len;
 453                                 continue;
 454                         }
 455                         /* Replace the undesirable character with a question mark */
 456                         str += len;
 457                         if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) *dst++ = '?';
 458                 }
 459         }
 460
 461         *dst = '\0';
 462 }
 463
 464 /**
 465  * Scans the string for valid characters and if it finds invalid ones,
 466  * replaces them with a question mark '?'.
 467  * @param str the string to validate
 468  */
 469 void ValidateString(const char *str)
 470 {
 471         /* We know it is '\0' terminated. */
 472         str_validate(const_cast<char *>(str), str + strlen(str) + 1);
 473 }
 474
 475 /**
 476  * Scan the string for old values of SCC_ENCODED and fix it to
 477  * it's new, static value.
 478  * @param str the string to scan
 479  * @param last the last valid character of str
 480  */
 481 void str_fix_scc_encoded(char *str, const char *last)
 482 {
 483         while (str <= last && *str != '\0') {
 484                 size_t len = Utf8EncodedCharLen(*str);
 485                 if ((len == 0 && str + 4 > last) || str + len > last) break;
 486
 487                 WChar c;
 488                 len = Utf8Decode(&c, str);
 489                 if (c == '\0') break;
 490
 491                 if (c == 0xE028 || c == 0xE02A) {
 492                         c = SCC_ENCODED;
 493                 }
 494                 str += Utf8Encode(str, c);
 495         }
 496         *str = '\0';
 497 }
 498
 499 /** Scans the string for colour codes and strips them */
 500 void str_strip_colours(char *str)
 501 {
 502         char *dst = str;
 503         WChar c;
 504         size_t len;
 505
 506         for (len = Utf8Decode(&c, str); c != '\0'; len = Utf8Decode(&c, str)) {
 507                 if (c < SCC_BLUE || c > SCC_BLACK) {
 508                         /* Copy the character back. Even if dst is current the same as str
 509                          * (i.e. no characters have been changed) this is quicker than
 510                          * moving the pointers ahead by len */
 511                         do {
 512                                 *dst++ = *str++;
 513                         } while (--len != 0);
 514                 } else {
 515                         /* Just skip (strip) the colour codes */
 516                         str += len;
 517                 }
 518         }
 519         *dst = '\0';
 520 }
 521
 522
 523 /* buffer-aware string functions */
 524
 525 /** Set this string according to a format and args. */
 526 bool stringb::fmt (const char *fmt, ...)
 527 {
 528         va_list args;
 529         va_start (args, fmt);
 530         bool r = vfmt (fmt, args);
 531         va_end (args);
 532         return r;
 533 }
 534
 535 /** Append to this string according to a format and args. */
 536 bool stringb::append_fmt (const char *fmt, ...)
 537 {
 538         va_list args;
 539         va_start (args, fmt);
 540         bool r = append_vfmt (fmt, args);
 541         va_end (args);
 542         return r;
 543 }
 544
 545 /** Append a unicode character encoded as utf-8 to the string. */
 546 bool stringb::append_utf8 (WChar c)
 547 {
 548         assert (len < capacity);
 549         size_t left = capacity - len;
 550
 551         if (c < 0x80) {
 552                 if (left <= 1) return false;
 553                 buffer[len++] = c;
 554         } else if (c < 0x800) {
 555                 if (left <= 2) return false;
 556                 buffer[len++] = 0xC0 + GB(c,  6, 5);
 557                 buffer[len++] = 0x80 + GB(c,  0, 6);
 558         } else if (c < 0x10000) {
 559                 if (left <= 3) return false;
 560                 buffer[len++] = 0xE0 + GB(c, 12, 4);
 561                 buffer[len++] = 0x80 + GB(c,  6, 6);
 562                 buffer[len++] = 0x80 + GB(c,  0, 6);
 563         } else if (c < 0x110000) {
 564                 if (left <= 4) return false;
 565                 buffer[len++] = 0xF0 + GB(c, 18, 3);
 566                 buffer[len++] = 0x80 + GB(c, 12, 6);
 567                 buffer[len++] = 0x80 + GB(c,  6, 6);
 568                 buffer[len++] = 0x80 + GB(c,  0, 6);
 569         } else {
 570                 /* DEBUG(misc, 1, "[utf8] can't UTF-8 encode value 0x%X", c); */
 571                 if (left <= 1) return false;
 572                 buffer[len++] = '?';
 573         }
 574
 575         buffer[len] = '\0';
 576         return true;
 577 }
 578
 579 /** Append the hexadecimal representation of an md5sum. */
 580 bool stringb::append_md5sum (const uint8 md5sum [16])
 581 {
 582         for (uint i = 0; i < 16; i++) {
 583                 if (!append_fmt ("%02X", md5sum[i])) return false;
 584         }
 585
 586         return true;
 587 }