Merge branch 'tor-gitlab/mr/583' into maint-0.4.7
[tor.git] / src / lib / string / util_string.c
blobb1c0a114396eb800dd43a4bd3643f1f63b1503ca
1 /* Copyright (c) 2003-2004, Roger Dingledine
2 * Copyright (c) 2004-2006, Roger Dingledine, Nick Mathewson.
3 * Copyright (c) 2007-2021, The Tor Project, Inc. */
4 /* See LICENSE for licensing information */
6 /**
7 * \file util_string.c
8 * \brief Non-standard string functions used throughout Tor.
9 **/
11 #include "lib/string/util_string.h"
12 #include "lib/string/compat_ctype.h"
13 #include "lib/err/torerr.h"
14 #include "lib/ctime/di_ops.h"
15 #include "lib/defs/digest_sizes.h"
17 #include <string.h>
18 #include <stdlib.h>
20 /** Given <b>hlen</b> bytes at <b>haystack</b> and <b>nlen</b> bytes at
21 * <b>needle</b>, return a pointer to the first occurrence of the needle
22 * within the haystack, or NULL if there is no such occurrence.
24 * This function is <em>not</em> timing-safe.
26 * Requires that <b>nlen</b> be greater than zero.
28 const void *
29 tor_memmem(const void *_haystack, size_t hlen,
30 const void *_needle, size_t nlen)
32 #if defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2)
33 raw_assert(nlen);
34 return memmem(_haystack, hlen, _needle, nlen);
35 #else
36 /* This isn't as fast as the GLIBC implementation, but it doesn't need to
37 * be. */
38 const char *p, *last_possible_start;
39 const char *haystack = (const char*)_haystack;
40 const char *needle = (const char*)_needle;
41 char first;
42 raw_assert(nlen);
44 if (nlen > hlen)
45 return NULL;
47 p = haystack;
48 /* Last position at which the needle could start. */
49 last_possible_start = haystack + hlen - nlen;
50 first = *(const char*)needle;
51 while ((p = memchr(p, first, last_possible_start + 1 - p))) {
52 if (fast_memeq(p, needle, nlen))
53 return p;
54 if (++p > last_possible_start) {
55 /* This comparison shouldn't be necessary, since if p was previously
56 * equal to last_possible_start, the next memchr call would be
57 * "memchr(p, first, 0)", which will return NULL. But it clarifies the
58 * logic. */
59 return NULL;
62 return NULL;
63 #endif /* defined(HAVE_MEMMEM) && (!defined(__GNUC__) || __GNUC__ >= 2) */
66 const void *
67 tor_memstr(const void *haystack, size_t hlen, const char *needle)
69 return tor_memmem(haystack, hlen, needle, strlen(needle));
72 /** Return true iff the 'len' bytes at 'mem' are all zero. */
73 int
74 fast_mem_is_zero(const char *mem, size_t len)
76 static const char ZERO[] = {
77 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
79 while (len >= sizeof(ZERO)) {
80 /* It's safe to use fast_memcmp here, since the very worst thing an
81 * attacker could learn is how many initial bytes of a secret were zero */
82 if (fast_memcmp(mem, ZERO, sizeof(ZERO)))
83 return 0;
84 len -= sizeof(ZERO);
85 mem += sizeof(ZERO);
87 /* Deal with leftover bytes. */
88 if (len)
89 return fast_memeq(mem, ZERO, len);
91 return 1;
94 /** Return true iff the DIGEST_LEN bytes in digest are all zero. */
95 int
96 tor_digest_is_zero(const char *digest)
98 return safe_mem_is_zero(digest, DIGEST_LEN);
101 /** Return true iff the DIGEST256_LEN bytes in digest are all zero. */
103 tor_digest256_is_zero(const char *digest)
105 return safe_mem_is_zero(digest, DIGEST256_LEN);
108 /** Remove from the string <b>s</b> every character which appears in
109 * <b>strip</b>. */
110 void
111 tor_strstrip(char *s, const char *strip)
113 char *readp = s;
114 while (*readp) {
115 if (strchr(strip, *readp)) {
116 ++readp;
117 } else {
118 *s++ = *readp++;
121 *s = '\0';
124 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
125 * lowercase. */
126 void
127 tor_strlower(char *s)
129 while (*s) {
130 *s = TOR_TOLOWER(*s);
131 ++s;
135 /** Convert all alphabetic characters in the nul-terminated string <b>s</b> to
136 * lowercase. */
137 void
138 tor_strupper(char *s)
140 while (*s) {
141 *s = TOR_TOUPPER(*s);
142 ++s;
146 /** Replaces <b>old</b> with <b>replacement</b> in <b>s</b> */
147 void
148 tor_strreplacechar(char *s, char find, char replacement)
150 for (s = strchr(s, find); s; s = strchr(s + 1, find)) {
151 *s = replacement;
155 /** Return 1 if every character in <b>s</b> is printable, else return 0.
158 tor_strisprint(const char *s)
160 while (*s) {
161 if (!TOR_ISPRINT(*s))
162 return 0;
163 s++;
165 return 1;
168 /** Return 1 if no character in <b>s</b> is uppercase, else return 0.
171 tor_strisnonupper(const char *s)
173 while (*s) {
174 if (TOR_ISUPPER(*s))
175 return 0;
176 s++;
178 return 1;
181 /** Return true iff every character in <b>s</b> is whitespace space; else
182 * return false. */
184 tor_strisspace(const char *s)
186 while (*s) {
187 if (!TOR_ISSPACE(*s))
188 return 0;
189 s++;
191 return 1;
194 /** As strcmp, except that either string may be NULL. The NULL string is
195 * considered to be before any non-NULL string. */
197 strcmp_opt(const char *s1, const char *s2)
199 if (!s1) {
200 if (!s2)
201 return 0;
202 else
203 return -1;
204 } else if (!s2) {
205 return 1;
206 } else {
207 return strcmp(s1, s2);
211 /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
212 * strcmp.
215 strcmpstart(const char *s1, const char *s2)
217 size_t n = strlen(s2);
218 return strncmp(s1, s2, n);
221 /** Compares the first strlen(s2) characters of s1 with s2. Returns as for
222 * strcasecmp.
225 strcasecmpstart(const char *s1, const char *s2)
227 size_t n = strlen(s2);
228 return strncasecmp(s1, s2, n);
231 /** Compare the value of the string <b>prefix</b> with the start of the
232 * <b>memlen</b>-byte memory chunk at <b>mem</b>. Return as for strcmp.
234 * [As fast_memcmp(mem, prefix, strlen(prefix)) but returns -1 if memlen is
235 * less than strlen(prefix).]
238 fast_memcmpstart(const void *mem, size_t memlen,
239 const char *prefix)
241 size_t plen = strlen(prefix);
242 if (memlen < plen)
243 return -1;
244 return fast_memcmp(mem, prefix, plen);
247 /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
248 * strcmp.
251 strcmpend(const char *s1, const char *s2)
253 size_t n1 = strlen(s1), n2 = strlen(s2);
254 if (n2>n1)
255 return strcmp(s1,s2);
256 else
257 return strncmp(s1+(n1-n2), s2, n2);
260 /** Compares the last strlen(s2) characters of s1 with s2. Returns as for
261 * strcasecmp.
264 strcasecmpend(const char *s1, const char *s2)
266 size_t n1 = strlen(s1), n2 = strlen(s2);
267 if (n2>n1) /* then they can't be the same; figure out which is bigger */
268 return strcasecmp(s1,s2);
269 else
270 return strncasecmp(s1+(n1-n2), s2, n2);
273 /** Return a pointer to the first char of s that is not whitespace and
274 * not a comment, or to the terminating NUL if no such character exists.
276 const char *
277 eat_whitespace(const char *s)
279 raw_assert(s);
281 while (1) {
282 switch (*s) {
283 case '\0':
284 default:
285 return s;
286 case ' ':
287 case '\t':
288 case '\n':
289 case '\r':
290 ++s;
291 break;
292 case '#':
293 ++s;
294 while (*s && *s != '\n')
295 ++s;
300 /** Return a pointer to the first char of s that is not whitespace and
301 * not a comment, or to the terminating NUL if no such character exists.
303 const char *
304 eat_whitespace_eos(const char *s, const char *eos)
306 raw_assert(s);
307 raw_assert(eos && s <= eos);
309 while (s < eos) {
310 switch (*s) {
311 case '\0':
312 default:
313 return s;
314 case ' ':
315 case '\t':
316 case '\n':
317 case '\r':
318 ++s;
319 break;
320 case '#':
321 ++s;
322 while (s < eos && *s && *s != '\n')
323 ++s;
326 return s;
329 /** Return a pointer to the first char of s that is not a space or a tab
330 * or a \\r, or to the terminating NUL if no such character exists. */
331 const char *
332 eat_whitespace_no_nl(const char *s)
334 while (*s == ' ' || *s == '\t' || *s == '\r')
335 ++s;
336 return s;
339 /** As eat_whitespace_no_nl, but stop at <b>eos</b> whether we have
340 * found a non-whitespace character or not. */
341 const char *
342 eat_whitespace_eos_no_nl(const char *s, const char *eos)
344 while (s < eos && (*s == ' ' || *s == '\t' || *s == '\r'))
345 ++s;
346 return s;
349 /** Return a pointer to the first char of s that is whitespace or <b>#</b>,
350 * or to the terminating NUL if no such character exists.
352 const char *
353 find_whitespace(const char *s)
355 /* tor_assert(s); */
356 while (1) {
357 switch (*s)
359 case '\0':
360 case '#':
361 case ' ':
362 case '\r':
363 case '\n':
364 case '\t':
365 return s;
366 default:
367 ++s;
372 /** As find_whitespace, but stop at <b>eos</b> whether we have found a
373 * whitespace or not. */
374 const char *
375 find_whitespace_eos(const char *s, const char *eos)
377 /* tor_assert(s); */
378 while (s < eos) {
379 switch (*s)
381 case '\0':
382 case '#':
383 case ' ':
384 case '\r':
385 case '\n':
386 case '\t':
387 return s;
388 default:
389 ++s;
392 return s;
395 /** Return the first occurrence of <b>needle</b> in <b>haystack</b> that
396 * occurs at the start of a line (that is, at the beginning of <b>haystack</b>
397 * or immediately after a newline). Return NULL if no such string is found.
399 const char *
400 find_str_at_start_of_line(const char *haystack, const char *needle)
402 size_t needle_len = strlen(needle);
404 do {
405 if (!strncmp(haystack, needle, needle_len))
406 return haystack;
408 haystack = strchr(haystack, '\n');
409 if (!haystack)
410 return NULL;
411 else
412 ++haystack;
413 } while (*haystack);
415 return NULL;
418 /** Returns true if <b>string</b> could be a C identifier.
419 A C identifier must begin with a letter or an underscore and the
420 rest of its characters can be letters, numbers or underscores. No
421 length limit is imposed. */
423 string_is_C_identifier(const char *string)
425 size_t iter;
426 size_t length = strlen(string);
427 if (!length)
428 return 0;
430 for (iter = 0; iter < length ; iter++) {
431 if (iter == 0) {
432 if (!(TOR_ISALPHA(string[iter]) ||
433 string[iter] == '_'))
434 return 0;
435 } else {
436 if (!(TOR_ISALPHA(string[iter]) ||
437 TOR_ISDIGIT(string[iter]) ||
438 string[iter] == '_'))
439 return 0;
443 return 1;
446 /** A byte with the top <b>x</b> bits set. */
447 #define TOP_BITS(x) ((uint8_t)(0xFF << (8 - (x))))
448 /** A byte with the lowest <b>x</b> bits set. */
449 #define LOW_BITS(x) ((uint8_t)(0xFF >> (8 - (x))))
451 /** Given the leading byte <b>b</b>, return the total number of bytes in the
452 * UTF-8 character. Returns 0 if it's an invalid leading byte.
454 static uint8_t
455 bytes_in_char(uint8_t b)
457 if ((TOP_BITS(1) & b) == 0x00)
458 return 1; // a 1-byte UTF-8 char, aka ASCII
459 if ((TOP_BITS(3) & b) == TOP_BITS(2))
460 return 2; // a 2-byte UTF-8 char
461 if ((TOP_BITS(4) & b) == TOP_BITS(3))
462 return 3; // a 3-byte UTF-8 char
463 if ((TOP_BITS(5) & b) == TOP_BITS(4))
464 return 4; // a 4-byte UTF-8 char
466 // Invalid: either the top 2 bits are 10, or the top 5 bits are 11111.
467 return 0;
470 /** Returns true iff <b>b</b> is a UTF-8 continuation byte. */
471 static bool
472 is_continuation_byte(uint8_t b)
474 uint8_t top2bits = b & TOP_BITS(2);
475 return top2bits == TOP_BITS(1);
478 /** Returns true iff the <b>len</b> bytes in <b>c</b> are a valid UTF-8
479 * character.
481 static bool
482 validate_char(const uint8_t *c, uint8_t len)
484 if (len == 1)
485 return true; // already validated this is an ASCII char earlier.
487 uint8_t mask = LOW_BITS(7 - len); // bitmask for the leading byte.
488 uint32_t codepoint = c[0] & mask;
490 mask = LOW_BITS(6); // bitmask for continuation bytes.
491 for (uint8_t i = 1; i < len; i++) {
492 if (!is_continuation_byte(c[i]))
493 return false;
494 codepoint <<= 6;
495 codepoint |= (c[i] & mask);
498 if (len == 2 && codepoint <= 0x7f)
499 return false; // Invalid, overly long encoding, should have fit in 1 byte.
501 if (len == 3 && codepoint <= 0x7ff)
502 return false; // Invalid, overly long encoding, should have fit in 2 bytes.
504 if (len == 4 && codepoint <= 0xffff)
505 return false; // Invalid, overly long encoding, should have fit in 3 bytes.
507 if (codepoint >= 0xd800 && codepoint <= 0xdfff)
508 return false; // Invalid, reserved for UTF-16 surrogate pairs.
510 return codepoint <= 0x10ffff; // Check if within maximum.
513 /** Returns true iff the first <b>len</b> bytes in <b>str</b> are a
514 valid UTF-8 string. */
516 string_is_utf8(const char *str, size_t len)
518 // If str is NULL, don't try to read it
519 if (!str) {
520 // We could test for this case, but the low-level logs would produce
521 // confusing test output.
522 // LCOV_EXCL_START
523 if (len) {
524 // Use the low-level logging function, so that the log module can
525 // validate UTF-8 (if needed in future code)
526 tor_log_err_sigsafe(
527 "BUG: string_is_utf8() called with NULL str but non-zero len.");
528 // Since it's a bug, we should probably reject this string
529 return false;
531 // LCOV_EXCL_STOP
532 return true;
535 for (size_t i = 0; i < len;) {
536 uint8_t num_bytes = bytes_in_char(str[i]);
537 if (num_bytes == 0) // Invalid leading byte found.
538 return false;
540 size_t next_char = i + num_bytes;
541 if (next_char > len)
542 return false;
544 // Validate the continuation bytes in this multi-byte character,
545 // and advance to the next character in the string.
546 if (!validate_char((const uint8_t*)&str[i], num_bytes))
547 return false;
548 i = next_char;
550 return true;
553 /** As string_is_utf8(), but returns false if the string begins with a UTF-8
554 * byte order mark (BOM).
557 string_is_utf8_no_bom(const char *str, size_t len)
559 if (str && len >= 3 && (!strcmpstart(str, "\uFEFF") ||
560 !strcmpstart(str, "\uFFFE"))) {
561 return false;
563 return string_is_utf8(str, len);