src/string.cpp

   1 /* $Id$ */
   2
   3 /*
   4  * This file is part of OpenTTD.
   5  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
   6  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   7  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
   8  */
   9
  10 /** @file string.cpp Handling of C-type strings (char*). */
  11
  12 #include "stdafx.h"
  13 #include "debug.h"
  14 #include "core/alloc_func.hpp"
  15 #include "core/math_func.hpp"
  16 #include "string.h"
  17
  18 #include "table/control_codes.h"
  19
  20 #include <stdarg.h>
  21 #include <ctype.h> /* required for tolower() */
  22
  23 #ifdef _MSC_VER
  24 #include <errno.h> // required by vsnprintf implementation for MSVC
  25 #endif
  26
  27 #ifdef WITH_ICU_SORT
  28 /* Required by strnatcmp. */
  29 #include <unicode/ustring.h>
  30 #include "language.h"
  31 #include "gfx_func.h"
  32 #endif /* WITH_ICU_SORT */
  33
  34
  35 #ifdef WIN32
/* Since version 3.14 the MinGW runtime provides snprintf() and vsnprintf() that conform to C99; older versions do not. */
  37 #if (__MINGW32_MAJOR_VERSION < 3) || ((__MINGW32_MAJOR_VERSION == 3) && (__MINGW32_MINOR_VERSION < 14))
  38 int CDECL snprintf(char *str, size_t size, const char *format, ...)
  39 {
  40         va_list ap;
  41         int ret;
  42
  43         va_start(ap, format);
  44         ret = vsnprintf(str, size, format, ap);
  45         va_end(ap);
  46         return ret;
  47 }
  48 #endif /* MinGW Runtime < 3.14 */
  49
  50 #if defined(_MSC_VER) && _MSC_VER < 1900
  51 /**
  52  * Almost POSIX compliant implementation of \c vsnprintf for VC compiler.
  53  * The difference is in the value returned on output truncation. This
  54  * implementation returns size whereas a POSIX implementation returns
  55  * size or more (the number of bytes that would be written to str
  56  * had size been sufficiently large excluding the terminating null byte).
  57  */
  58 int CDECL vsnprintf(char *str, size_t size, const char *format, va_list ap)
  59 {
  60         if (size == 0) return 0;
  61
  62         errno = 0;
  63         int ret = _vsnprintf(str, size, format, ap);
  64
  65         if (ret < 0) {
  66                 if (errno != ERANGE) {
  67                         /* There's a formatting error, better get that looked
  68                          * at properly instead of ignoring it. */
  69                         NOT_REACHED();
  70                 }
  71         } else if ((size_t)ret < size) {
  72                 /* The buffer is big enough for the number of
  73                  * characters stored (excluding null), i.e.
  74                  * the string has been null-terminated. */
  75                 return ret;
  76         }
  77
  78         /* The buffer is too small for _vsnprintf to write the
  79          * null-terminator at its end and return size. */
  80         str[size - 1] = '\0';
  81         return (int)size;
  82 }
  83 #endif /* _MSC_VER */
  84
  85 #endif /* WIN32 */
  86
  87 /**
  88  * Copies characters from one buffer to another.
  89  *
  90  * Copies the source string to the destination buffer with respect of the
  91  * terminating null-character and the maximum size of the destination
  92  * buffer.
  93  *
  94  * @note usage ttd_strlcpy(dst, src, lengthof(dst));
  95  * @note lengthof() applies only to fixed size arrays
  96  *
  97  * @param dst The destination buffer
  98  * @param src The buffer containing the string to copy
  99  * @param size The maximum size of the destination buffer
 100  */
 101 void ttd_strlcpy(char *dst, const char *src, size_t size)
 102 {
 103         assert(size > 0);
 104         while (--size > 0 && *src != '\0') {
 105                 *dst++ = *src++;
 106         }
 107         *dst = '\0';
 108 }
 109
 110
 111 /** Allocate a copy of a given string, and error out on failure. */
 112 char *xstrdup (const char *s)
 113 {
 114         return (char*) xmemdup (s, strlen(s) + 1);
 115 }
 116
 117 /**
 118  * Allocate a copy of a given string, with bounded size, and error out
 119  * on failure.
 120  *
 121  * Note! This is not the same as strndup, because it assumes that the
 122  * string passed in is at least of the required size, unlike strndup,
 123  * which will check if there is a null in the requested initial segment.
 124  */
 125 char *xstrmemdup (const char *s, size_t n)
 126 {
 127         char *p = xmalloc (n + 1);
 128         memcpy (p, s, n);
 129         p[n] = '\0';
 130         return p;
 131 }
 132
 133 /** Allocate a copy of a given string, with bounded size, and error out on failure. */
 134 char *xstrndup (const char *s, size_t n)
 135 {
 136         return xstrmemdup (s, ttd_strnlen (s, n));
 137 }
 138
 139 /** Allocate a formatted string. */
 140 char *str_vfmt (const char *fmt, va_list args)
 141 {
 142 #ifdef _GNU_SOURCE
 143         char *s;
 144         if (vasprintf (&s, fmt, args) == -1) out_of_memory();
 145         return s;
 146 #else
 147         char buf[4096];
 148         int len = vsnprintf (buf, lengthof(buf), fmt, args);
 149         return (char*) xmemdup (buf, len + 1);
 150 #endif
 151 }
 152
 153 /**
 154  * Format, "printf", into a newly allocated string.
 155  * @param str The formatting string.
 156  * @return The formatted string. You must free this!
 157  */
 158 char *CDECL str_fmt(const char *str, ...)
 159 {
 160         va_list va;
 161         va_start(va, str);
 162         char *s = str_vfmt (str, va);
 163         va_end(va);
 164         return s;
 165 }
 166
 167
 168 #ifdef DEFINE_STRCASESTR
 169 char *strcasestr(const char *haystack, const char *needle)
 170 {
 171         size_t hay_len = strlen(haystack);
 172         size_t needle_len = strlen(needle);
 173         while (hay_len >= needle_len) {
 174                 if (strncasecmp(haystack, needle, needle_len) == 0) return const_cast<char *>(haystack);
 175
 176                 haystack++;
 177                 hay_len--;
 178         }
 179
 180         return NULL;
 181 }
 182 #endif /* DEFINE_STRCASESTR */
 183
 184 /**
 185  * Skip some of the 'garbage' in the string that we don't want to use
 186  * to sort on. This way the alphabetical sorting will work better as
 187  * we would be actually using those characters instead of some other
 188  * characters such as spaces and tildes at the begin of the name.
 189  * @param str The string to skip the initial garbage of.
 190  * @return The string with the garbage skipped.
 191  */
 192 static const char *SkipGarbage(const char *str)
 193 {
 194         while (*str != '\0' && (*str < '0' || IsInsideMM(*str, ';', '@' + 1) || IsInsideMM(*str, '[', '`' + 1) || IsInsideMM(*str, '{', '~' + 1))) str++;
 195         return str;
 196 }
 197
 198 /**
 199  * Compares two strings using case insensitive natural sort.
 200  *
 201  * @param s1 First string to compare.
 202  * @param s2 Second string to compare.
 203  * @param ignore_garbage_at_front Skip punctuation characters in the front
 204  * @return Less than zero if s1 < s2, zero if s1 == s2, greater than zero if s1 > s2.
 205  */
 206 int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
 207 {
 208         if (ignore_garbage_at_front) {
 209                 s1 = SkipGarbage(s1);
 210                 s2 = SkipGarbage(s2);
 211         }
 212 #ifdef WITH_ICU_SORT
 213         if (_current_collator != NULL) {
 214                 UErrorCode status = U_ZERO_ERROR;
 215                 int result = _current_collator->compareUTF8(s1, s2, status);
 216                 if (U_SUCCESS(status)) return result;
 217         }
 218
 219 #endif /* WITH_ICU_SORT */
 220
 221         /* Do a normal comparison if ICU is missing or if we cannot create a collator. */
 222         return strcasecmp(s1, s2);
 223 }
 224
 225 /**
 226  * Convert a given ASCII string to lowercase.
 227  * NOTE: only support ASCII characters, no UTF8 fancy. As currently
 228  * the function is only used to lowercase data-filenames if they are
 229  * not found, this is sufficient. If more, or general functionality is
 230  * needed, look to r7271 where it was removed because it was broken when
 231  * using certain locales: eg in Turkish the uppercase 'I' was converted to
 232  * '?', so just revert to the old functionality
 233  * @param str string to convert
 234  * @return String has changed.
 235  */
 236 bool strtolower(char *str)
 237 {
 238         bool changed = false;
 239         for (; *str != '\0'; str++) {
 240                 char new_str = tolower(*str);
 241                 changed |= new_str != *str;
 242                 *str = new_str;
 243         }
 244         return changed;
 245 }
 246
 247
 248 /* UTF-8 handling */
 249
 250 /**
 251  * Decode and consume the next UTF-8 encoded character.
 252  * @param c Buffer to place decoded character.
 253  * @param s Character stream to retrieve character from.
 254  * @return Number of characters in the sequence.
 255  */
 256 size_t Utf8Decode(WChar *c, const char *s)
 257 {
 258         assert(c != NULL);
 259
 260         if (!HasBit(s[0], 7)) {
 261                 /* Single byte character: 0xxxxxxx */
 262                 *c = s[0];
 263                 return 1;
 264         } else if (GB(s[0], 5, 3) == 6) {
 265                 if (IsUtf8Part(s[1])) {
 266                         /* Double byte character: 110xxxxx 10xxxxxx */
 267                         *c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
 268                         if (*c >= 0x80) return 2;
 269                 }
 270         } else if (GB(s[0], 4, 4) == 14) {
 271                 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
 272                         /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
 273                         *c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
 274                         if (*c >= 0x800) return 3;
 275                 }
 276         } else if (GB(s[0], 3, 5) == 30) {
 277                 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
 278                         /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
 279                         *c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
 280                         if (*c >= 0x10000 && *c <= 0x10FFFF) return 4;
 281                 }
 282         }
 283
 284         /* DEBUG(misc, 1, "[utf8] invalid UTF-8 sequence"); */
 285         *c = '?';
 286         return 1;
 287 }
 288
 289
 290 /**
 291  * Encode a unicode character and place it in the buffer.
 292  * @param buf Buffer to place character.
 293  * @param c   Unicode character to encode.
 294  * @return Number of characters in the encoded sequence.
 295  */
 296 size_t Utf8Encode(char *buf, WChar c)
 297 {
 298         if (c < 0x80) {
 299                 *buf = c;
 300                 return 1;
 301         } else if (c < 0x800) {
 302                 *buf++ = 0xC0 + GB(c,  6, 5);
 303                 *buf   = 0x80 + GB(c,  0, 6);
 304                 return 2;
 305         } else if (c < 0x10000) {
 306                 *buf++ = 0xE0 + GB(c, 12, 4);
 307                 *buf++ = 0x80 + GB(c,  6, 6);
 308                 *buf   = 0x80 + GB(c,  0, 6);
 309                 return 3;
 310         } else if (c < 0x110000) {
 311                 *buf++ = 0xF0 + GB(c, 18, 3);
 312                 *buf++ = 0x80 + GB(c, 12, 6);
 313                 *buf++ = 0x80 + GB(c,  6, 6);
 314                 *buf   = 0x80 + GB(c,  0, 6);
 315                 return 4;
 316         }
 317
 318         /* DEBUG(misc, 1, "[utf8] can't UTF-8 encode value 0x%X", c); */
 319         *buf = '?';
 320         return 1;
 321 }
 322
 323 /**
 324  * Properly terminate an UTF8 string to some maximum length
 325  * @param s string to check if it needs additional trimming
 326  * @param maxlen the maximum length the buffer can have.
 327  * @return the new length in bytes of the string (eg. strlen(new_string))
 328  * @note maxlen is the string length _INCLUDING_ the terminating '\0'
 329  */
 330 size_t Utf8TrimString(char *s, size_t maxlen)
 331 {
 332         size_t length = 0;
 333
 334         for (const char *ptr = strchr(s, '\0'); *s != '\0';) {
 335                 size_t len = Utf8EncodedCharLen(*s);
 336                 /* Silently ignore invalid UTF8 sequences, our only concern trimming */
 337                 if (len == 0) len = 1;
 338
 339                 /* Take care when a hard cutoff was made for the string and
 340                  * the last UTF8 sequence is invalid */
 341                 if (length + len >= maxlen || (s + len > ptr)) break;
 342                 s += len;
 343                 length += len;
 344         }
 345
 346         *s = '\0';
 347         return length;
 348 }
 349
 350 /**
 351  * Get the length of an UTF-8 encoded string in number of characters
 352  * and thus not the number of bytes that the encoded string contains.
 353  * @param s The string to get the length for.
 354  * @return The length of the string in characters.
 355  */
 356 size_t Utf8StringLength(const char *s)
 357 {
 358         size_t len = 0;
 359         const char *t = s;
 360         while (Utf8Consume(&t) != 0) len++;
 361         return len;
 362 }
 363
 364 /**
 365  * Checks whether the given string is valid, i.e. contains only
 366  * valid (printable) characters and is properly terminated.
 367  * @param str  The string to validate.
 368  * @param last The last character of the string, i.e. the string
 369  *             must be terminated here or earlier.
 370  */
 371 bool StrValid(const char *str, const char *last)
 372 {
 373         /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
 374
 375         while (str <= last && *str != '\0') {
 376                 size_t len = Utf8EncodedCharLen(*str);
 377                 /* Encoded length is 0 if the character isn't known.
 378                  * The length check is needed to prevent Utf8Decode to read
 379                  * over the terminating '\0' if that happens to be placed
 380                  * within the encoding of an UTF8 character. */
 381                 if (len == 0 || str + len > last) return false;
 382
 383                 WChar c;
 384                 len = Utf8Decode(&c, str);
 385                 if (!IsPrintable(c) || (c >= SCC_SPRITE_START && c <= SCC_SPRITE_END)) {
 386                         return false;
 387                 }
 388
 389                 str += len;
 390         }
 391
 392         return *str == '\0';
 393 }
 394
 395 /**
 396  * Scans the string for valid characters and if it finds invalid ones,
 397  * replaces them with a question mark '?' (if not ignored)
 398  * @param str the string to validate
 399  * @param last the last valid character of str
 400  * @param settings the settings for the string validation.
 401  */
 402 void str_validate(char *str, const char *last, StringValidationSettings settings)
 403 {
 404         /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
 405
 406         char *dst = str;
 407         while (str <= last && *str != '\0') {
 408                 size_t len = Utf8EncodedCharLen(*str);
 409                 /* If the character is unknown, i.e. encoded length is 0
 410                  * we assume worst case for the length check.
 411                  * The length check is needed to prevent Utf8Decode to read
 412                  * over the terminating '\0' if that happens to be placed
 413                  * within the encoding of an UTF8 character. */
 414                 if ((len == 0 && str + 4 > last) || str + len > last) break;
 415
 416                 WChar c;
 417                 len = Utf8Decode(&c, str);
 418                 /* It's possible to encode the string termination character
 419                  * into a multiple bytes. This prevents those termination
 420                  * characters to be skipped */
 421                 if (c == '\0') break;
 422
 423                 if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
 424                         /* Copy the character back. Even if dst is current the same as str
 425                          * (i.e. no characters have been changed) this is quicker than
 426                          * moving the pointers ahead by len */
 427                         do {
 428                                 *dst++ = *str++;
 429                         } while (--len != 0);
 430                 } else if ((settings & SVS_ALLOW_NEWLINE) != 0  && c == '\n') {
 431                         *dst++ = *str++;
 432                 } else {
 433                         if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\r' && str[1] == '\n') {
 434                                 str += len;
 435                                 continue;
 436                         }
 437                         /* Replace the undesirable character with a question mark */
 438                         str += len;
 439                         if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) *dst++ = '?';
 440                 }
 441         }
 442
 443         *dst = '\0';
 444 }
 445
 446 /**
 447  * Scans the string for valid characters and if it finds invalid ones,
 448  * replaces them with a question mark '?'.
 449  * @param str the string to validate
 450  */
 451 void ValidateString(const char *str)
 452 {
 453         /* We know it is '\0' terminated. */
 454         str_validate(const_cast<char *>(str), str + strlen(str) + 1);
 455 }
 456
 457 /**
 458  * Scan the string for old values of SCC_ENCODED and fix it to
 459  * it's new, static value.
 460  * @param str the string to scan
 461  * @param last the last valid character of str
 462  */
 463 void str_fix_scc_encoded(char *str, const char *last)
 464 {
 465         while (str <= last && *str != '\0') {
 466                 size_t len = Utf8EncodedCharLen(*str);
 467                 if ((len == 0 && str + 4 > last) || str + len > last) break;
 468
 469                 WChar c;
 470                 len = Utf8Decode(&c, str);
 471                 if (c == '\0') break;
 472
 473                 if (c == 0xE028 || c == 0xE02A) {
 474                         c = SCC_ENCODED;
 475                 }
 476                 str += Utf8Encode(str, c);
 477         }
 478         *str = '\0';
 479 }
 480
 481 /** Scans the string for colour codes and strips them */
 482 void str_strip_colours(char *str)
 483 {
 484         char *dst = str;
 485         WChar c;
 486         size_t len;
 487
 488         for (len = Utf8Decode(&c, str); c != '\0'; len = Utf8Decode(&c, str)) {
 489                 if (c < SCC_BLUE || c > SCC_BLACK) {
 490                         /* Copy the character back. Even if dst is current the same as str
 491                          * (i.e. no characters have been changed) this is quicker than
 492                          * moving the pointers ahead by len */
 493                         do {
 494                                 *dst++ = *str++;
 495                         } while (--len != 0);
 496                 } else {
 497                         /* Just skip (strip) the colour codes */
 498                         str += len;
 499                 }
 500         }
 501         *dst = '\0';
 502 }
 503
 504
 505 /* buffer-aware string functions */
 506
 507 /** Set this string according to a format and args. */
 508 bool stringb::fmt (const char *fmt, ...)
 509 {
 510         va_list args;
 511         va_start (args, fmt);
 512         bool r = vfmt (fmt, args);
 513         va_end (args);
 514         return r;
 515 }
 516
 517 /** Append to this string according to a format and args. */
 518 bool stringb::append_fmt (const char *fmt, ...)
 519 {
 520         va_list args;
 521         va_start (args, fmt);
 522         bool r = append_vfmt (fmt, args);
 523         va_end (args);
 524         return r;
 525 }
 526
 527 /** Append a unicode character encoded as utf-8 to the string. */
 528 bool stringb::append_utf8 (WChar c)
 529 {
 530         assert (len < capacity);
 531         size_t left = capacity - len;
 532
 533         if (c < 0x80) {
 534                 if (left <= 1) return false;
 535                 buffer[len++] = c;
 536         } else if (c < 0x800) {
 537                 if (left <= 2) return false;
 538                 buffer[len++] = 0xC0 + GB(c,  6, 5);
 539                 buffer[len++] = 0x80 + GB(c,  0, 6);
 540         } else if (c < 0x10000) {
 541                 if (left <= 3) return false;
 542                 buffer[len++] = 0xE0 + GB(c, 12, 4);
 543                 buffer[len++] = 0x80 + GB(c,  6, 6);
 544                 buffer[len++] = 0x80 + GB(c,  0, 6);
 545         } else if (c < 0x110000) {
 546                 if (left <= 4) return false;
 547                 buffer[len++] = 0xF0 + GB(c, 18, 3);
 548                 buffer[len++] = 0x80 + GB(c, 12, 6);
 549                 buffer[len++] = 0x80 + GB(c,  6, 6);
 550                 buffer[len++] = 0x80 + GB(c,  0, 6);
 551         } else {
 552                 /* DEBUG(misc, 1, "[utf8] can't UTF-8 encode value 0x%X", c); */
 553                 if (left <= 1) return false;
 554                 buffer[len++] = '?';
 555         }
 556
 557         buffer[len] = '\0';
 558         return true;
 559 }
 560
 561 /** Append the hexadecimal representation of an md5sum. */
 562 bool stringb::append_md5sum (const uint8 md5sum [16])
 563 {
 564         for (uint i = 0; i < 16; i++) {
 565                 if (!append_fmt ("%02X", md5sum[i])) return false;
 566         }
 567
 568         return true;
 569 }