src/string.h

   1 /* $Id$ */
   2
   3 /*
   4  * This file is part of OpenTTD.
   5  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
   6  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
   7  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
   8  */
   9
/** @file string.h Types and functions related to low-level strings. */
  11
  12 #ifndef STRING_H
  13 #define STRING_H
  14
  15 #include <stdarg.h>
  16
  17 /* Needed for NetBSD version (so feature) testing */
  18 #if defined(__NetBSD__) || defined(__FreeBSD__)
  19 #include <sys/param.h>
  20 #endif
  21
  22 #include "core/bitmath_func.hpp"
  23 #include "core/enum_type.hpp"
  24 #include "core/alloc_func.hpp"
  25
  26
  27 /** Allocate dynamic memory with a copy of given data, and error out on failure. */
  28 static inline void *xmemdup (const void *src, size_t size)
  29 {
  30         return memcpy (xmalloc(size), src, size);
  31 }
  32
  33 /** Allocate dynamic memory with a copy of given type data, and error out on failure. */
  34 template <typename T>
  35 static inline T *xmemdupt (const T *src, size_t size = 1)
  36 {
  37         return (T*) memcpy (xmalloct<T>(size), src, size * sizeof(T));
  38 }
  39
  40
  41 #ifdef _GNU_SOURCE
  42 #define ttd_strnlen strnlen
  43 #else
  44 /**
  45  * Get the length of a string, within a limited buffer.
  46  *
  47  * @param str The pointer to the first element of the buffer
  48  * @param maxlen The maximum size of the buffer
  49  * @return The length of the string
  50  */
  51 static inline size_t ttd_strnlen(const char *str, size_t maxlen)
  52 {
  53         const char *t;
  54         for (t = str; (size_t)(t - str) < maxlen && *t != '\0'; t++) {}
  55         return t - str;
  56 }
  57 #endif
  58
/*
 * NOTE(review): the functions below are only declared in this header; the
 * summaries are inferred from their names and signatures and should be
 * confirmed against the implementation.
 */

/** Copy the string src into dst, where size is presumably the size of the dst buffer. */
void ttd_strlcpy(char *dst, const char *src, size_t size);


/** Duplicate the string s; presumably allocates and errors out on failure like xmemdup. */
char *xstrdup (const char *s);
/** Duplicate n bytes starting at s into a new string (presumably adding a terminator -- confirm). */
char *xstrmemdup (const char *s, size_t n);
/** Duplicate at most n characters of the string s (presumably stops at a terminator -- confirm). */
char *xstrndup (const char *s, size_t n);

/** Format a string from a printf-style format and a va_list; returns a char pointer (presumably newly allocated -- confirm ownership). */
char *str_vfmt(const char *str, va_list args) WARN_FORMAT(1, 0);
/** Variadic counterpart of str_vfmt. */
char *CDECL str_fmt(const char *str, ...) WARN_FORMAT(1, 2);
  68
  69
/* strcasestr is available for _GNU_SOURCE, BSD and some Apple */
/* When none of the feature macros below indicate a system strcasestr,
 * DEFINE_STRCASESTR is defined and the function is declared here
 * (presumably the implementation file provides it in that case -- confirm). */
#if defined(_GNU_SOURCE) || (defined(__BSD_VISIBLE) && __BSD_VISIBLE) || (defined(__APPLE__) && (!defined(_POSIX_C_SOURCE) || defined(_DARWIN_C_SOURCE))) || defined(_NETBSD_SOURCE)
#       undef DEFINE_STRCASESTR
#else
#       define DEFINE_STRCASESTR
char *strcasestr(const char *haystack, const char *needle);
#endif /* strcasestr is available */

/** Compare two strings; presumably a "natural" ordering comparison (NOTE(review): inferred from the name -- confirm). */
int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front = false);

/** Convert a string to lowercase in place; the meaning of the return value is not visible here. */
bool strtolower(char *str);
  81
  82 /**
  83  * Check if a string buffer is empty.
  84  *
  85  * @param s The pointer to the first element of the buffer
  86  * @return true if the buffer starts with the terminating null-character or
  87  *         if the given pointer points to NULL else return false
  88  */
  89 static inline bool StrEmpty(const char *s)
  90 {
  91         return s == NULL || s[0] == '\0';
  92 }
  93
  94
/* UTF-8 handling */

/** Type for wide characters, i.e. unicode characters stored as a single 32-bit value rather than UTF-8 encoded. */
typedef uint32 WChar;

/** Max. length, in bytes, of a UTF-8 encoded unicode character. */
static const uint MAX_CHAR_LENGTH = 4;

/* The following are directional formatting codes used to get the LTR and RTL strings right:
 * http://www.unicode.org/unicode/reports/tr9/#Directional_Formatting_Codes */
static const WChar CHAR_TD_LRM = 0x200E; ///< The next character acts like a left-to-right character.
static const WChar CHAR_TD_RLM = 0x200F; ///< The next character acts like a right-to-left character.
static const WChar CHAR_TD_LRE = 0x202A; ///< The following text is embedded left-to-right.
static const WChar CHAR_TD_RLE = 0x202B; ///< The following text is embedded right-to-left.
static const WChar CHAR_TD_LRO = 0x202D; ///< Force the following characters to be treated as left-to-right characters.
static const WChar CHAR_TD_RLO = 0x202E; ///< Force the following characters to be treated as right-to-left characters.
static const WChar CHAR_TD_PDF = 0x202C; ///< Restore the text-direction state to before the last LRE, RLE, LRO or RLO.

/** A non-breaking space (U+00A0), as a UTF-8 encoded string literal. */
#define NBSP "\xC2\xA0"

/** A left-to-right marker (U+200E, see CHAR_TD_LRM), as a UTF-8 encoded string literal; marks the next character as left-to-right. */
#define LRM "\xE2\x80\x8E"
 118
 119 /**
 120  * Return the length of a UTF-8 encoded character.
 121  * @param c Unicode character.
 122  * @return Length of UTF-8 encoding for character.
 123  */
 124 static inline int8 Utf8CharLen(WChar c)
 125 {
 126         if (c < 0x80)       return 1;
 127         if (c < 0x800)      return 2;
 128         if (c < 0x10000)    return 3;
 129         if (c < 0x110000)   return 4;
 130
 131         /* Invalid valid, we encode as a '?' */
 132         return 1;
 133 }
 134
 135 /**
 136  * Return the length of an UTF-8 encoded value based on a single char. This
 137  * char should be the first byte of the UTF-8 encoding. If not, or encoding
 138  * is invalid, return value is 0
 139  * @param c char to query length of
 140  * @return requested size
 141  */
 142 static inline int8 Utf8EncodedCharLen(char c)
 143 {
 144         if (GB(c, 3, 5) == 0x1E) return 4;
 145         if (GB(c, 4, 4) == 0x0E) return 3;
 146         if (GB(c, 5, 3) == 0x06) return 2;
 147         if (GB(c, 7, 1) == 0x00) return 1;
 148
 149         /* Invalid UTF8 start encoding */
 150         return 0;
 151 }
 152
 153 /* Check if the given character is part of a UTF8 sequence */
 154 static inline bool IsUtf8Part(char c)
 155 {
 156         return GB(c, 6, 2) == 2;
 157 }
 158
 159 /**
 160  * Retrieve the previous UNICODE character in an UTF-8 encoded string.
 161  * @param s char pointer pointing to (the first char of) the next character
 162  * @return a pointer in 's' to the previous UNICODE character's first byte
 163  * @note The function should not be used to determine the length of the previous
 164  * encoded char because it might be an invalid/corrupt start-sequence
 165  */
 166 static inline char *Utf8PrevChar(char *s)
 167 {
 168         char *ret = s;
 169         while (IsUtf8Part(*--ret)) {}
 170         return ret;
 171 }
 172
 173 static inline const char *Utf8PrevChar(const char *s)
 174 {
 175         const char *ret = s;
 176         while (IsUtf8Part(*--ret)) {}
 177         return ret;
 178 }
 179
/* NOTE(review): only declared here; summaries inferred from names and from
 * the use in Utf8Consume -- confirm against the implementation. */
/** Decode a character from the string s into *c; the return value is used by Utf8Consume as the number of bytes to advance. */
size_t Utf8Decode(WChar *c, const char *s);
/** Encode the character c into buf (presumably returning the number of bytes written). */
size_t Utf8Encode(char *buf, WChar c);
/** Trim the UTF-8 string s with respect to maxlen (exact semantics of maxlen and the return value are not visible here). */
size_t Utf8TrimString(char *s, size_t maxlen);
 183
 184 static inline WChar Utf8Consume(const char **s)
 185 {
 186         WChar c;
 187         *s += Utf8Decode(&c, *s);
 188         return c;
 189 }
 190
/** Get the length of the string s; stringt::utf8length uses it as the length in utf8 chars (implementation not visible here). */
size_t Utf8StringLength(const char *s);
 192
 193 /**
 194  * Is the given character a text direction character.
 195  * @param c The character to test.
 196  * @return true iff the character is used to influence
 197  *         the text direction.
 198  */
 199 static inline bool IsTextDirectionChar(WChar c)
 200 {
 201         switch (c) {
 202                 case CHAR_TD_LRM:
 203                 case CHAR_TD_RLM:
 204                 case CHAR_TD_LRE:
 205                 case CHAR_TD_RLE:
 206                 case CHAR_TD_LRO:
 207                 case CHAR_TD_RLO:
 208                 case CHAR_TD_PDF:
 209                         return true;
 210
 211                 default:
 212                         return false;
 213         }
 214 }
 215
 216 static inline bool IsPrintable(WChar c)
 217 {
 218         if (c < 0x20)   return false;
 219         if (c < 0xE000) return true;
 220         if (c < 0xE200) return false;
 221         return true;
 222 }
 223
 224 /**
 225  * Check whether UNICODE character is whitespace or not, i.e. whether
 226  * this is a potential line-break character.
 227  * @param c UNICODE character to check
 228  * @return a boolean value whether 'c' is a whitespace character or not
 229  * @see http://www.fileformat.info/info/unicode/category/Zs/list.htm
 230  */
 231 static inline bool IsWhitespace(WChar c)
 232 {
 233         return c == 0x0020 /* SPACE */ || c == 0x3000; /* IDEOGRAPHIC SPACE */
 234 }
 235
/**
 * Valid filter types for IsValidChar.
 * Each value names the set of characters that is accepted.
 */
enum CharSetFilter {
	CS_ALPHANUMERAL,      ///< Both numeric and alphabetic and spaces and stuff
	CS_NUMERAL,           ///< Only numeric ones
	CS_NUMERAL_SPACE,     ///< Only numbers and spaces
	CS_ALPHA,             ///< Only alphabetic values
	CS_HEXADECIMAL,       ///< Only hexadecimal characters
};

/** Check a character against the given filter (only declared here; presumably returns true when the character is accepted -- confirm). */
bool IsValidChar(WChar key, CharSetFilter afilter);
 248
/**
 * Settings for the string validation.
 * The non-zero values are distinct single bits, and the enum is declared
 * as a bit set below.
 */
enum StringValidationSettings {
	SVS_NONE                       = 0,      ///< Allow nothing and replace nothing.
	SVS_REPLACE_WITH_QUESTION_MARK = 1 << 0, ///< Replace the unknown/bad bits with question marks.
	SVS_ALLOW_NEWLINE              = 1 << 1, ///< Allow newlines.
	SVS_ALLOW_CONTROL_CODE         = 1 << 2, ///< Allow the special control codes.
};
DECLARE_ENUM_AS_BIT_SET(StringValidationSettings)
 257
/* NOTE(review): only declared here; summaries inferred from names, signatures
 * and the use in stringt::validate -- confirm against the implementation. */
/** Check the string str, bounded by last (presumably for being valid and terminated before last). */
bool StrValid(const char *str, const char *last);
/** Validate the string str in place according to settings; stringt::validate passes the address of the terminator as last. */
void str_validate(char *str, const char *last, StringValidationSettings settings = SVS_REPLACE_WITH_QUESTION_MARK);
/** Validate a constant string (behaviour on invalid input is not visible here). */
void ValidateString(const char *str);

/** Fix up the string str, bounded by last (presumably related to encoded string control codes -- confirm). */
void str_fix_scc_encoded(char *str, const char *last);
/** Strip colours from the string str in place (presumably removes colour control codes -- confirm). */
void str_strip_colours(char *str);
 264
 265 /**
 266  * Is the given character a lead surrogate code point?
 267  * @param c The character to test.
 268  * @return True if the character is a lead surrogate code point.
 269  */
 270 static inline bool Utf16IsLeadSurrogate(uint c)
 271 {
 272         return c >= 0xD800 && c <= 0xDBFF;
 273 }
 274
 275 /**
 276  * Is the given character a lead surrogate code point?
 277  * @param c The character to test.
 278  * @return True if the character is a lead surrogate code point.
 279  */
 280 static inline bool Utf16IsTrailSurrogate(uint c)
 281 {
 282         return c >= 0xDC00 && c <= 0xDFFF;
 283 }
 284
 285 /**
 286  * Convert an UTF-16 surrogate pair to the corresponding Unicode character.
 287  * @param lead Lead surrogate code point.
 288  * @param trail Trail surrogate code point.
 289  * @return Decoded Unicode character.
 290  */
 291 static inline WChar Utf16DecodeSurrogate(uint lead, uint trail)
 292 {
 293         return 0x10000 + (((lead - 0xD800) << 10) | (trail - 0xDC00));
 294 }
 295
 296 /**
 297  * Decode an UTF-16 character.
 298  * @param c Pointer to one or two UTF-16 code points.
 299  * @return Decoded Unicode character.
 300  */
 301 static inline WChar Utf16DecodeChar(const uint16 *c)
 302 {
 303         if (Utf16IsLeadSurrogate(c[0])) {
 304                 return Utf16DecodeSurrogate(c[0], c[1]);
 305         } else {
 306                 return *c;
 307         }
 308 }
 309
 310
 311 /* buffer-aware string functions */
 312
 313 /** Copy a string, pointer version. */
 314 template <uint N>
 315 static inline void bstrcpy (char (*dest) [N], const char *src)
 316 {
 317         snprintf (&(*dest)[0], N, "%s", src);
 318 }
 319
 320 /** Copy a string, reference version. */
 321 template <uint N>
 322 static inline void bstrcpy (char (&dest) [N], const char *src)
 323 {
 324         bstrcpy (&dest, src);
 325 }
 326
 327 /** Format a string from a va_list, pointer version. */
 328 template <uint N>
 329 static inline void bstrvfmt (char (*dest) [N], const char *fmt, va_list args)
 330 {
 331         vsnprintf (&(*dest)[0], N, fmt, args);
 332 }
 333
 334 /** Format a string from a va_list, reference version. */
 335 template <uint N>
 336 static inline void bstrvfmt (char (&dest) [N], const char *fmt, va_list args)
 337 {
 338         bstrvfmt (&dest, fmt, args);
 339 }
 340
/* The bstrfmt formatter below must be a macro because there is no variadic
 * template support in MSVC. The bstrptr helpers let it accept both a pointer
 * and a reference to the destination array. */
 343
 344 /** Get the pointer and size to use for a static buffer, pointer version. */
 345 template <uint N>
 346 static inline void bstrptr (char (*dest) [N], char **buffer, uint *size)
 347 {
 348         *buffer = &(*dest)[0];
 349         *size = N;
 350 }
 351
 352 /** Get the pointer and size to use for a static buffer, reference version. */
 353 template <uint N>
 354 static inline void bstrptr (char (&dest) [N], char **buffer, uint *size)
 355 {
 356         *buffer = &(dest)[0];
 357         *size = N;
 358 }
 359
 360 /** Format a string. */
 361 #define bstrfmt(dest, ...) do {                                 \
 362         char *bstrfmt__buffer;                                  \
 363         uint  bstrfmt__size;                                    \
 364         bstrptr (dest, &bstrfmt__buffer, &bstrfmt__size);       \
 365         snprintf (bstrfmt__buffer, bstrfmt__size, __VA_ARGS__); \
 366         } while(0)
 367
 368
 369 /** Fixed buffer string template class. */
 370 template <class T>
 371 struct stringt : T {
 372         stringt (void) : T() { }
 373
 374         template <class T1>
 375         stringt (T1 t1) : T (t1) { }
 376
 377         template <class T1, class T2>
 378         stringt (T1 t1, T2 t2) : T (t1, t2) { }
 379
 380         /** Get the storage size. */
 381         size_t get__capacity (void) const
 382         {
 383                 return T::get_capacity();
 384         }
 385
 386         /** Get the storage buffer. */
 387         char *get__buffer (void)
 388         {
 389                 return T::get_buffer();
 390         }
 391
 392         /** Get the storage buffer, const version. */
 393         const char *get__buffer (void) const
 394         {
 395                 return const_cast<stringt*>(this)->get__buffer();
 396         }
 397
 398         const char *c_str() const
 399         {
 400                 return get__buffer();
 401         }
 402
 403         /** Get the current length of the string. */
 404         size_t length (void) const
 405         {
 406                 return T::len;
 407         }
 408
 409         /** Check if this string is empty. */
 410         bool empty (void) const
 411         {
 412                 return length() == 0;
 413         }
 414
 415         /** Get the current length of the string in utf8 chars. */
 416         size_t utf8length (void) const
 417         {
 418                 return Utf8StringLength (c_str());
 419         }
 420
 421         /** Check if this string is full. */
 422         bool full (void) const
 423         {
 424                 return length() == get__capacity() - 1;
 425         }
 426
 427         /** Reset the string. */
 428         void clear (void)
 429         {
 430                 T::len = 0;
 431                 get__buffer()[0] = '\0';
 432         }
 433
 434         /** Fill the string with zeroes (to avoid undefined contents). */
 435         void zerofill (void)
 436         {
 437                 T::len = 0;
 438                 memset (get__buffer(), 0, get__capacity());
 439         }
 440
 441         /** Truncate the string to a given length. */
 442         void truncate (size_t newlen)
 443         {
 444                 assert (newlen <= T::len);
 445                 T::len = newlen;
 446                 get__buffer()[T::len] = '\0';
 447         }
 448
 449         /** Set string length and provide return value. */
 450         bool set__return (uint n)
 451         {
 452                 const size_t m = get__capacity();
 453                 if (n < m) {
 454                         T::len = n;
 455                         return true;
 456                 } else {
 457                         T::len = m - 1;
 458                         return false;
 459                 }
 460         }
 461
 462         /** Copy a given string into this one. */
 463         bool copy (const char *src)
 464         {
 465                 uint n = snprintf (get__buffer(), get__capacity(), "%s", src);
 466                 return set__return (n);
 467         }
 468
 469         /** Set this string according to a format and args. */
 470         bool vfmt (const char *fmt, va_list args) WARN_FORMAT(2, 0)
 471         {
 472                 uint n = vsnprintf (get__buffer(), get__capacity(), fmt, args);
 473                 return set__return (n);
 474         }
 475
 476         /** Append a single char to the string. */
 477         bool append (char c)
 478         {
 479                 assert (T::len < get__capacity());
 480                 if (full()) return false;
 481                 char *data = get__buffer();
 482                 data[T::len++] = c;
 483                 data[T::len] = '\0';
 484                 return true;
 485         }
 486
 487         /** Update string length and provide return value when appending. */
 488         bool append__return (uint n)
 489         {
 490                 const size_t m = get__capacity();
 491                 if (n < m - T::len) {
 492                         T::len += n;
 493                         return true;
 494                 } else {
 495                         T::len = m - 1;
 496                         return false;
 497                 }
 498         }
 499
 500         /** Append a given string to this one. */
 501         bool append (const char *src)
 502         {
 503                 assert (T::len < get__capacity());
 504                 uint n = snprintf (get__buffer() + T::len,
 505                         get__capacity() - T::len, "%s", src);
 506                 return append__return (n);
 507         }
 508
 509         /** Append to this string according to a format and args. */
 510         bool append_vfmt (const char *fmt, va_list args) WARN_FORMAT(2, 0)
 511         {
 512                 assert (T::len < get__capacity());
 513                 uint n = vsnprintf (get__buffer() + T::len,
 514                         get__capacity() - T::len, fmt, args);
 515                 return append__return (n);
 516         }
 517
 518         /** Replace invalid chars in string. */
 519         void validate (StringValidationSettings settings = SVS_REPLACE_WITH_QUESTION_MARK)
 520         {
 521                 assert (T::len < get__capacity());
 522                 char *buffer = get__buffer();
 523                 str_validate (buffer, buffer + T::len, settings);
 524         }
 525
 526         /** Convert string to lowercase. */
 527         void tolower (void)
 528         {
 529                 strtolower (get__buffer());
 530         }
 531 };
 532
 533 /** Fixed buffer string base class. */
 534 struct stringb_ {
 535         size_t       len;      ///< current string length
 536         const size_t capacity; ///< allocated storage capacity
 537         char * const buffer;   ///< allocated storage buffer
 538
 539         size_t get_capacity (void) const
 540         {
 541                 return capacity;
 542         }
 543
 544         char *get_buffer (void)
 545         {
 546                 return buffer;
 547         }
 548
 549         stringb_ (size_t capacity, char *buffer)
 550                 : len(0), capacity(capacity), buffer(buffer)
 551         {
 552                 assert (capacity > 0);
 553                 buffer[0] = '\0';
 554         }
 555
 556         stringb_ (const stringb_ &) : len(0), capacity(0), buffer(NULL)
 557         {
 558                 NOT_REACHED();
 559         }
 560 };
 561
 562 /** Fixed buffer string class. */
 563 struct stringb : stringt<stringb_> {
 564         stringb (size_t capacity, char *buffer)
 565                 : stringt<stringb_> (capacity, buffer)
 566         {
 567         }
 568
 569         template <uint N>
 570         stringb (char (*buffer) [N]) : stringt<stringb_> (N, &(*buffer)[0])
 571         {
 572         }
 573
 574         template <uint N>
 575         stringb (char (&buffer) [N]) : stringt<stringb_> (N, &buffer[0])
 576         {
 577         }
 578
 579         /* Set this string according to a format and args. */
 580         bool fmt (const char *fmt, ...) WARN_FORMAT(2, 3);
 581
 582         /* Append to this string according to a format and args. */
 583         bool append_fmt (const char *fmt, ...) WARN_FORMAT(2, 3);
 584
 585         /* Append a unicode character encoded as utf-8 to the string. */
 586         bool append_utf8 (WChar c);
 587
 588         /* Append the hexadecimal representation of an md5sum. */
 589         bool append_md5sum (const uint8 md5sum [16]);
 590 };
 591
 592 /** Static string with (some) built-in bounds checking. */
 593 template <uint N>
 594 struct sstring_ : stringb {
 595         char data[N]; ///< string storage
 596
 597         sstring_ (void) : stringb (N, data)
 598         {
 599                 assert_tcompile (N > 0);
 600                 assert (data[0] == '\0'); // should have been set by stringb constructor
 601         }
 602
 603         static inline size_t get_capacity (void)
 604         {
 605                 return N;
 606         }
 607
 608         inline char *get_buffer (void)
 609         {
 610                 return data;
 611         }
 612 };
 613
 614 /** Static string with (some) built-in bounds checking. */
 615 template <uint N>
 616 struct sstring : stringt<sstring_<N> > {
 617 };
 618
 619
 620 /** Convert the md5sum to a hexadecimal string representation, pointer version. */
 621 template <uint N>
 622 static inline void md5sumToString (char (*buf) [N], const uint8 md5sum [16])
 623 {
 624         assert_tcompile (N > 2 * 16);
 625         stringb tmp (N, &(*buf)[0]);
 626         tmp.append_md5sum (md5sum);
 627 }
 628
 629 /** Convert the md5sum to a hexadecimal string representation, reference version. */
 630 template <uint N>
 631 static inline void md5sumToString (char (&buf) [N], const uint8 md5sum [16])
 632 {
 633         md5sumToString (&buf, md5sum);
 634 }
 635
 636 #endif /* STRING_H */