src/lib/string/util_string.c

   1 /* Copyright (c) 2003-2004, Roger Dingledine
   2  * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
   3  * Copyright (c) 2007-2021, The Tor Project, Inc. */
   4 /* See LICENSE for licensing information */
   5
   6 /**
   7  * \file util_string.c
   8  * \brief Non-standard string functions used throughout Tor.
   9  **/
  10
  11 #include "lib/string/util_string.h"
  12 #include "lib/string/compat_ctype.h"
  13 #include "lib/err/torerr.h"
  14 #include "lib/ctime/di_ops.h"
  15 #include "lib/defs/digest_sizes.h"
  16
  17 #include <string.h>
  18 #include <stdlib.h>
  19
  20 /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
  21  * <b>needle</b>, return a pointer to the first occurrence of the needle
  22  * within the haystack, or NULL if there is no such occurrence.
  23  *
  24  * This function is <em>not</em> timing-safe.
  25  *
  26  * Requires that <b>nlen</b> be greater than zero.
  27  */
  28 const void *
  29 tor_memmem(const void *_haystack, size_t hlen,
  30            const void *_needle, size_t nlen)
  31 {
  32 #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
  33   raw_assert(nlen);
  34   return memmem(_haystack, hlen, _needle, nlen);
  35 #else
  36   /* This isn't as fast as the GLIBC implementation, but it doesn't need to
  37    * be. */
  38   const char *p, *last_possible_start;
  39   const char *haystack = (const char*)_haystack;
  40   const char *needle = (const char*)_needle;
  41   char first;
  42   raw_assert(nlen);
  43
  44   if (nlen > hlen)
  45     return NULL;
  46
  47   p = haystack;
  48   /* Last position at which the needle could start. */
  49   last_possible_start = haystack + hlen - nlen;
  50   first = *(const char*)needle;
  51   while ((p = memchr(p, first, last_possible_start + 1 - p))) {
  52     if (fast_memeq(p, needle, nlen))
  53       return p;
  54     if (++p > last_possible_start) {
  55       /* This comparison shouldn't be necessary, since if p was previously
  56        * equal to last_possible_start, the next memchr call would be
  57        * "memchr(p, first, 0)", which will return NULL. But it clarifies the
  58        * logic. */
  59       return NULL;
  60     }
  61   }
  62   return NULL;
  63 #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
  64 }
  65
  66 const void *
  67 tor_memstr(const void *haystack, size_t hlen, const char *needle)
  68 {
  69   return tor_memmem(haystack, hlen, needle, strlen(needle));
  70 }
  71
  72 /** Return true iff the 'len' bytes at 'mem' are all zero. */
  73 int
  74 fast_mem_is_zero(const char *mem, size_t len)
  75 {
  76   static const char ZERO[] = {
  77     0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
  78   };
  79   while (len >= sizeof(ZERO)) {
  80     /* It's safe to use fast_memcmp here, since the very worst thing an
  81      * attacker could learn is how many initial bytes of a secret were zero */
  82     if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
  83       return 0;
  84     len -= sizeof(ZERO);
  85     mem += sizeof(ZERO);
  86   }
  87   /* Deal with leftover bytes. */
  88   if (len)
  89     return fast_memeq(mem, ZERO, len);
  90
  91   return 1;
  92 }
  93
  94 /** Return true iff the DIGEST_LEN bytes in digest are all zero. */
  95 int
  96 tor_digest_is_zero(const char *digest)
  97 {
  98   return safe_mem_is_zero(digest, DIGEST_LEN);
  99 }
 100
 101 /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
 102 int
 103 tor_digest256_is_zero(const char *digest)
 104 {
 105   return safe_mem_is_zero(digest, DIGEST256_LEN);
 106 }
 107
 108 /** Remove from the string <b>s</b> every character which appears in
 109  * <b>strip</b>. */
 110 void
 111 tor_strstrip(char *s, const char *strip)
 112 {
 113   char *readp = s;
 114   while (*readp) {
 115     if (strchr(strip, *readp)) {
 116       ++readp;
 117     } else {
 118       *s++ = *readp++;
 119     }
 120   }
 121   *s = '\0';
 122 }
 123
 124 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
 125  * lowercase. */
 126 void
 127 tor_strlower(char *s)
 128 {
 129   while (*s) {
 130     *s = TOR_TOLOWER(*s);
 131     ++s;
 132   }
 133 }
 134
 135 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
 136  * lowercase. */
 137 void
 138 tor_strupper(char *s)
 139 {
 140   while (*s) {
 141     *s = TOR_TOUPPER(*s);
 142     ++s;
 143   }
 144 }
 145
 146 /** Replaces <b>old</b> with <b>replacement</b> in <b>s</b> */
 147 void
 148 tor_strreplacechar(char *s, char find, char replacement)
 149 {
 150   for (s = strchr(s, find); s; s = strchr(s + 1, find)) {
 151     *s = replacement;
 152   }
 153 }
 154
 155 /** Return 1 if every character in <b>s</b> is printable, else return 0.
 156  */
 157 int
 158 tor_strisprint(const char *s)
 159 {
 160   while (*s) {
 161     if (!TOR_ISPRINT(*s))
 162       return 0;
 163     s++;
 164   }
 165   return 1;
 166 }
 167
 168 /** Return 1 if no character in <b>s</b> is uppercase, else return 0.
 169  */
 170 int
 171 tor_strisnonupper(const char *s)
 172 {
 173   while (*s) {
 174     if (TOR_ISUPPER(*s))
 175       return 0;
 176     s++;
 177   }
 178   return 1;
 179 }
 180
 181 /** Return true iff every character in <b>s</b> is whitespace space; else
 182  * return false. */
 183 int
 184 tor_strisspace(const char *s)
 185 {
 186   while (*s) {
 187     if (!TOR_ISSPACE(*s))
 188       return 0;
 189     s++;
 190   }
 191   return 1;
 192 }
 193
 194 /** As strcmp, except that either string may be NULL.  The NULL string is
 195  * considered to be before any non-NULL string. */
 196 int
 197 strcmp_opt(const char *s1, const char *s2)
 198 {
 199   if (!s1) {
 200     if (!s2)
 201       return 0;
 202     else
 203       return -1;
 204   } else if (!s2) {
 205     return 1;
 206   } else {
 207     return strcmp(s1, s2);
 208   }
 209 }
 210
 211 /** Compares the first strlen(s2) characters of s1 with s2.  Returns as for
 212  * strcmp.
 213  */
 214 int
 215 strcmpstart(const char *s1, const char *s2)
 216 {
 217   size_t n = strlen(s2);
 218   return strncmp(s1, s2, n);
 219 }
 220
 221 /** Compares the first strlen(s2) characters of s1 with s2.  Returns as for
 222  * strcasecmp.
 223  */
 224 int
 225 strcasecmpstart(const char *s1, const char *s2)
 226 {
 227   size_t n = strlen(s2);
 228   return strncasecmp(s1, s2, n);
 229 }
 230
 231 /** Compare the value of the string <b>prefix</b> with the start of the
 232  * <b>memlen</b>-byte memory chunk at <b>mem</b>.  Return as for strcmp.
 233  *
 234  * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
 235  * less than strlen(prefix).]
 236  */
 237 int
 238 fast_memcmpstart(const void *mem, size_t memlen,
 239                 const char *prefix)
 240 {
 241   size_t plen = strlen(prefix);
 242   if (memlen < plen)
 243     return -1;
 244   return fast_memcmp(mem, prefix, plen);
 245 }
 246
 247 /** Compares the last strlen(s2) characters of s1 with s2.  Returns as for
 248  * strcmp.
 249  */
 250 int
 251 strcmpend(const char *s1, const char *s2)
 252 {
 253   size_t n1 = strlen(s1), n2 = strlen(s2);
 254   if (n2>n1)
 255     return strcmp(s1,s2);
 256   else
 257     return strncmp(s1+(n1-n2), s2, n2);
 258 }
 259
 260 /** Compares the last strlen(s2) characters of s1 with s2.  Returns as for
 261  * strcasecmp.
 262  */
 263 int
 264 strcasecmpend(const char *s1, const char *s2)
 265 {
 266   size_t n1 = strlen(s1), n2 = strlen(s2);
 267   if (n2>n1) /* then they can't be the same; figure out which is bigger */
 268     return strcasecmp(s1,s2);
 269   else
 270     return strncasecmp(s1+(n1-n2), s2, n2);
 271 }
 272
 273 /** Return a pointer to the first char of s that is not whitespace and
 274  * not a comment, or to the terminating NUL if no such character exists.
 275  */
 276 const char *
 277 eat_whitespace(const char *s)
 278 {
 279   raw_assert(s);
 280
 281   while (1) {
 282     switch (*s) {
 283     case '\0':
 284     default:
 285       return s;
 286     case ' ':
 287     case '\t':
 288     case '\n':
 289     case '\r':
 290       ++s;
 291       break;
 292     case '#':
 293       ++s;
 294       while (*s && *s != '\n')
 295         ++s;
 296     }
 297   }
 298 }
 299
 300 /** Return a pointer to the first char of s that is not whitespace and
 301  * not a comment, or to the terminating NUL if no such character exists.
 302  */
 303 const char *
 304 eat_whitespace_eos(const char *s, const char *eos)
 305 {
 306   raw_assert(s);
 307   raw_assert(eos && s <= eos);
 308
 309   while (s < eos) {
 310     switch (*s) {
 311     case '\0':
 312     default:
 313       return s;
 314     case ' ':
 315     case '\t':
 316     case '\n':
 317     case '\r':
 318       ++s;
 319       break;
 320     case '#':
 321       ++s;
 322       while (s < eos && *s && *s != '\n')
 323         ++s;
 324     }
 325   }
 326   return s;
 327 }
 328
 329 /** Return a pointer to the first char of s that is not a space or a tab
 330  * or a \\r, or to the terminating NUL if no such character exists. */
 331 const char *
 332 eat_whitespace_no_nl(const char *s)
 333 {
 334   while (*s == ' ' || *s == '\t' || *s == '\r')
 335     ++s;
 336   return s;
 337 }
 338
 339 /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
 340  * found a non-whitespace character or not. */
 341 const char *
 342 eat_whitespace_eos_no_nl(const char *s, const char *eos)
 343 {
 344   while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
 345     ++s;
 346   return s;
 347 }
 348
 349 /** Return a pointer to the first char of s that is whitespace or <b>#</b>,
 350  * or to the terminating NUL if no such character exists.
 351  */
 352 const char *
 353 find_whitespace(const char *s)
 354 {
 355   /* tor_assert(s); */
 356   while (1) {
 357     switch (*s)
 358     {
 359     case '\0':
 360     case '#':
 361     case ' ':
 362     case '\r':
 363     case '\n':
 364     case '\t':
 365       return s;
 366     default:
 367       ++s;
 368     }
 369   }
 370 }
 371
 372 /** As find_whitespace, but stop at <b>eos</b> whether we have found a
 373  * whitespace or not. */
 374 const char *
 375 find_whitespace_eos(const char *s, const char *eos)
 376 {
 377   /* tor_assert(s); */
 378   while (s < eos) {
 379     switch (*s)
 380     {
 381     case '\0':
 382     case '#':
 383     case ' ':
 384     case '\r':
 385     case '\n':
 386     case '\t':
 387       return s;
 388     default:
 389       ++s;
 390     }
 391   }
 392   return s;
 393 }
 394
 395 /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
 396  * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
 397  * or immediately after a newline).  Return NULL if no such string is found.
 398  */
 399 const char *
 400 find_str_at_start_of_line(const char *haystack, const char *needle)
 401 {
 402   size_t needle_len = strlen(needle);
 403
 404   do {
 405     if (!strncmp(haystack, needle, needle_len))
 406       return haystack;
 407
 408     haystack = strchr(haystack, '\n');
 409     if (!haystack)
 410       return NULL;
 411     else
 412       ++haystack;
 413   } while (*haystack);
 414
 415   return NULL;
 416 }
 417
 418 /** Returns true if <b>string</b> could be a C identifier.
 419     A C identifier must begin with a letter or an underscore and the
 420     rest of its characters can be letters, numbers or underscores. No
 421     length limit is imposed. */
 422 int
 423 string_is_C_identifier(const char *string)
 424 {
 425   size_t iter;
 426   size_t length = strlen(string);
 427   if (!length)
 428     return 0;
 429
 430   for (iter = 0; iter < length ; iter++) {
 431     if (iter == 0) {
 432       if (!(TOR_ISALPHA(string[iter]) ||
 433             string[iter] == '_'))
 434         return 0;
 435     } else {
 436       if (!(TOR_ISALPHA(string[iter]) ||
 437             TOR_ISDIGIT(string[iter]) ||
 438             string[iter] == '_'))
 439         return 0;
 440     }
 441   }
 442
 443   return 1;
 444 }
 445
 446 /** A byte with the top <b>x</b> bits set. */
 447 #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
 448 /** A byte with the lowest <b>x</b> bits set. */
 449 #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
 450
 451 /** Given the leading byte <b>b</b>, return the total number of bytes in the
 452  * UTF-8 character. Returns 0 if it's an invalid leading byte.
 453  */
 454 static uint8_t
 455 bytes_in_char(uint8_t b)
 456 {
 457   if ((TOP_BITS(1) & b) == 0x00)
 458     return 1; // a 1-byte UTF-8 char, aka ASCII
 459   if ((TOP_BITS(3) & b) == TOP_BITS(2))
 460     return 2; // a 2-byte UTF-8 char
 461   if ((TOP_BITS(4) & b) == TOP_BITS(3))
 462     return 3; // a 3-byte UTF-8 char
 463   if ((TOP_BITS(5) & b) == TOP_BITS(4))
 464     return 4; // a 4-byte UTF-8 char
 465
 466   // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
 467   return 0;
 468 }
 469
 470 /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
 471 static bool
 472 is_continuation_byte(uint8_t b)
 473 {
 474   uint8_t top2bits = b & TOP_BITS(2);
 475   return top2bits == TOP_BITS(1);
 476 }
 477
 478 /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
 479  * character.
 480  */
 481 static bool
 482 validate_char(const uint8_t *c, uint8_t len)
 483 {
 484   if (len == 1)
 485     return true; // already validated this is an ASCII char earlier.
 486
 487   uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
 488   uint32_t codepoint = c[0] & mask;
 489
 490   mask = LOW_BITS(6); // bitmask for continuation bytes.
 491   for (uint8_t i = 1; i < len; i++) {
 492     if (!is_continuation_byte(c[i]))
 493       return false;
 494     codepoint <<= 6;
 495     codepoint |= (c[i] & mask);
 496   }
 497
 498   if (len == 2 && codepoint <= 0x7f)
 499     return false; // Invalid, overly long encoding, should have fit in 1 byte.
 500
 501   if (len == 3 && codepoint <= 0x7ff)
 502     return false; // Invalid, overly long encoding, should have fit in 2 bytes.
 503
 504   if (len == 4 && codepoint <= 0xffff)
 505     return false; // Invalid, overly long encoding, should have fit in 3 bytes.
 506
 507   if (codepoint >= 0xd800 && codepoint <= 0xdfff)
 508     return false; // Invalid, reserved for UTF-16 surrogate pairs.
 509
 510   return codepoint <= 0x10ffff; // Check if within maximum.
 511 }
 512
 513 /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
 514     valid UTF-8 string. */
 515 int
 516 string_is_utf8(const char *str, size_t len)
 517 {
 518   // If str is NULL, don't try to read it
 519   if (!str) {
 520     // We could test for this case, but the low-level logs would produce
 521     // confusing test output.
 522     // LCOV_EXCL_START
 523     if (len) {
 524       // Use the low-level logging function, so that the log module can
 525       // validate UTF-8 (if needed in future code)
 526       tor_log_err_sigsafe(
 527         "BUG: string_is_utf8() called with NULL str but non-zero len.");
 528       // Since it's a bug, we should probably reject this string
 529       return false;
 530     }
 531     // LCOV_EXCL_STOP
 532     return true;
 533   }
 534
 535   for (size_t i = 0; i < len;) {
 536     uint8_t num_bytes = bytes_in_char(str[i]);
 537     if (num_bytes == 0) // Invalid leading byte found.
 538       return false;
 539
 540     size_t next_char = i + num_bytes;
 541     if (next_char > len)
 542       return false;
 543
 544     // Validate the continuation bytes in this multi-byte character,
 545     // and advance to the next character in the string.
 546     if (!validate_char((const uint8_t*)&str[i], num_bytes))
 547       return false;
 548     i = next_char;
 549   }
 550   return true;
 551 }
 552
 553 /** As string_is_utf8(), but returns false if the string begins with a UTF-8
 554  * byte order mark (BOM).
 555  */
 556 int
 557 string_is_utf8_no_bom(const char *str, size_t len)
 558 {
 559   if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
 560                           !strcmpstart(str, "\uFFFE"))) {
 561     return false;
 562   }
 563   return string_is_utf8(str, len);
 564 }