server: handle pcre2 now returning -1 for "no match"
[rb-79.git] / sanitize-comment.c
blob109285abef1e1f53dbbfe68ea27d9574f2e348c1
1 /*
2 * Copyright (c) 2017-2020, De Rais <derais@cock.li>
4 * Permission to use, copy, modify, and/or distribute this software for
5 * any purpose with or without fee is hereby granted, provided that the
6 * above copyright notice and this permission notice appear in all
7 * copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
10 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
11 * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
12 * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
13 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
14 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
15 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
16 * PERFORMANCE OF THIS SOFTWARE.
18 #include <errno.h>
19 #include <stdint.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <time.h>
24 #include <wchar.h>
26 #define PCRE2_CODE_UNIT_WIDTH 8
27 #include <pcre2.h>
29 #include "macros.h"
30 #include "rb79.h"
31 #include "unicode-transforms.h"
/*
 * We need a way to get codepoints out of UTF-8 strings, and if
 * wchar_t stores codepoint values that is easy. That guarantee is
 * what __STDC_ISO_10646__ announces. You can remove this check and
 * cross your fingers, since rb79 will do a quick check on startup,
 * but please find out why the C implementation doesn't define
 * __STDC_ISO_10646__ first.
 */
#ifndef __STDC_ISO_10646__
#error We really want __STDC_ISO_10646__
#endif
46 * A wordfilter consists of a pcre2 regex and a replacement string
48 struct wordfilter {
49 /* */
50 pcre2_code *code;
51 const char *replacement;
52 size_t replacement_len;
56 * A forbidden consists of a pcre2 regex only
58 struct forbidden {
59 /* */
60 pcre2_code *code;
61 int ban_duration;
62 const char *ban_reason;
65 /* These are constructed in setup_sanitize_comment() */
66 static struct wordfilter *wordfilters;
67 static size_t wordfilters_num;
68 static struct forbidden *forbiddens;
69 static size_t forbiddens_num;
71 /* Special matcher for quoting, newlines, linkifying, etc. */
72 static pcre2_code *format_replacements;
75 * Comparison function for struct translate.
77 * Preconditions:
79 * - *key_v is a wchar_t.
81 * - *tr_v is a struct translate object.
83 * Postconditions:
85 * - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
86 * than] *tr_v's starting range.
88 static int
89 match_translate(const void *key_v, const void *tr_v)
91 const wchar_t *key = key_v;
92 const struct translate *tr = tr_v;
94 if (*key < tr->from_s) {
95 return -1;
96 } else if (*key > tr->from_t) {
97 return 1;
100 return 0;
/*
 * Append str_len bytes of str onto *buf, growing *buf as needed.
 *
 * Preconditions:
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - str is readable for str_len bytes (callers pass ASCII or
 *    already-valid UTF-8 fragments).
 *
 * Postconditions (success, returns 0):
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string, terminated by '\0' at (*buf)[*idx].
 *
 *  - The contents of str have been appended to *buf (and *idx
 *    includes this).
 *
 * On failure -1 is returned and *buf, *idx and *sz are unchanged.
 */
static int
append_str(char **buf, size_t *idx, size_t *sz, const char *str, size_t str_len)
{
	const size_t slack = (size_t) 1 << 9;
	size_t needed = 0;

	/*
	 * Check for wrap-around before the sum is used for anything:
	 * a wrapped sum could otherwise look like it fits in *sz.
	 */
	if (str_len > SIZE_MAX - *idx) {
		ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
		              str_len, *idx);

		return -1;
	}

	needed = str_len + *idx;

	/* ">=" because one extra byte is required for the terminator */
	if (needed >= *sz) {
		void *newmem = 0;

		if (needed > SIZE_MAX - slack) {
			ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
			              str_len, *idx);

			return -1;
		}

		if (!(newmem = realloc(*buf, needed + slack))) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*buf = newmem;
		*sz = needed + slack;
	}

	/* The length is known exactly, so copy bytes, not a C string */
	if (str_len) {
		memcpy(*buf + *idx, str, str_len);
	}

	(*buf)[needed] = '\0';
	*idx = needed;

	return 0;
}
/*
 * Convenience wrapper around append_str() for NUL-terminated
 * strings: measures str with strlen() and forwards everything else.
 */
static int
append_const_str(char **buf, size_t *idx, size_t *len, const char *str)
{
	const size_t str_len = strlen(str);

	return append_str(buf, idx, len, str, str_len);
}
/*
 * Append the single character c onto *buf, growing *buf as needed.
 *
 * Preconditions:
 *
 *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - c is an ASCII character.
 *
 * Postconditions (success, returns 0):
 *
 *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - c has been appended to *buf (and *idx includes this), followed
 *    by a terminating '\0'.
 *
 * Returns -1 on arithmetic overflow or allocation failure.
 */
static int
append_char(char **buf, size_t *idx, size_t *len, char c)
{
	const size_t pos = *idx;

	if (pos + 1 >= *len) {
		const size_t grown = pos + 1 + (1 << 9);
		char *bigger = 0;

		if (pos + 1 < pos ||
		    grown < pos + 1) {
			ERROR_MESSAGE("overflow (*idx = %zu)", *idx);

			return -1;
		}

		bigger = realloc(*buf, grown);

		if (!bigger) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*buf = bigger;
		*len = grown;
	}

	(*buf)[pos] = c;
	(*buf)[pos + 1] = '\0';
	*idx = pos + 1;

	return 0;
}
/*
 * Append a Unicode codepoint onto *buf as a decimal HTML character
 * reference.
 *
 * Preconditions:
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - wc is a valid Unicode codepoint.
 *
 * Postconditions (success, returns 0):
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - An HTML-escaped sequence like &#123; has been appended to
 *    *buf (and *idx includes this).
 *
 * Returns -1 on formatting failure, arithmetic overflow or
 * allocation failure.
 */
static int
append_wchar_escaped(char **buf, size_t *idx, size_t *sz, wchar_t wc)
{
	const long codepoint = (long) wc;
	size_t l = 0;

	/* First pass only measures; snprintf reports failure as < 0 */
	const int measured = snprintf(0, 0, "&#%ld;", codepoint);

	if (measured < 0) {
		PERROR_MESSAGE("snprintf");

		return -1;
	}

	l = (size_t) measured;

	if (l + *idx >= *sz) {
		void *newmem = 0;
		size_t new_sz = l + *idx + (1 << 9);

		if (*idx + l < *idx ||
		    *idx + l + (1 << 9) < *idx + l) {
			ERROR_MESSAGE("overflow (*idx = %zu, l = %zu)", *idx,
			              l);

			return -1;
		}

		if (!(newmem = realloc(*buf, new_sz))) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*buf = newmem;
		*sz = new_sz;
	}

	/* At this point *idx + l + 1 <= *sz, so l + 1 bytes are writable */
	snprintf(*buf + *idx, l + 1, "&#%ld;", codepoint);
	*idx += l;

	return 0;
}
/*
 * Record (*map)[j] = k, enlarging *map first if slot j + 1 is not
 * yet available.
 *
 * Preconditions
 *
 *  - *map is memory of length *len (in elements).
 *
 * Postconditions (success, returns 0):
 *
 *  - *map is memory of length *len.
 *
 *  - (*map)[j] = k.
 *
 *  - Any element added by growing holds (size_t) -1, except [j].
 *
 * Returns -1 on arithmetic overflow or allocation failure.
 */
static int
set_position_mapping(size_t **map, size_t *len, size_t j, size_t k)
{
	const size_t wanted = j + 2;

	if (j + 1 >= *len) {
		size_t *grown = 0;
		size_t fill = 0;

		if (wanted < j ||
		    (wanted * sizeof **map) / wanted != sizeof **map) {
			ERROR_MESSAGE("overflow (j = %zu)", j);

			return -1;
		}

		grown = realloc(*map, wanted * sizeof **map);

		if (!grown) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*map = grown;

		/* Mark every freshly added slot as "no mapping" */
		fill = *len;

		while (fill < wanted) {
			grown[fill++] = (size_t) -1;
		}

		*len = wanted;
	}

	(*map)[j] = k;

	return 0;
}
/*
 * HTML-escape in to *out.
 *
 * Preconditions
 *
 *  - in is memory of at least length in_len, valid UTF-8
 *    text.
 *
 *  - *out is memory of at least length *out_len (if *out_len = 0,
 *    *out may be 0), valid UTF-8 text.
 *
 *  - Overwriting *out and *out_len shall not cause a memory leak.
 *
 *  - out, out_len, and out_idx are not 0.
 *
 * Postconditions (success, returns 0):
 *
 *  - *out is memory of at least length *out_len, valid UTF-8 text.
 *
 *  - A stretch of HTML-escaped ASCII text representing
 *    in[in_idx .. in_len) has been added to *out at the position
 *    that was *out_idx. Conversion stops early at a '\0' byte.
 *    '\r' is dropped, '\n' and printable ASCII are copied, the five
 *    HTML metacharacters become named entities, and everything else
 *    becomes a numeric character reference.
 *
 *  - *out_idx has been updated to point to the end of this stretch.
 *
 *  - *out_len has grown by the number of bytes added.
 *
 * Returns -1 on allocation failure or if in contains an invalid
 * multibyte sequence.
 */
static int
to_html(const char *in, const size_t in_len, size_t in_idx, char **out,
        size_t *out_len, size_t *out_idx)
{
	int ret = -1;
	wchar_t wc = 0;
	int mbret = 0;
	size_t out_sz = 0;
	size_t initial_out_idx = *out_idx;

	if (!*out) {
		if (!(*out = malloc(1))) {
			PERROR_MESSAGE("malloc");
			goto done;
		}

		out_sz = 1;
		*out_len = 0;
		(*out)[0] = '\0';
	}

	/*
	 * XXX: If you make this multithreaded, be sure to use
	 * mbrtowc(3) here!
	 */
	while (in_idx < in_len &&
	       in[in_idx]) {
		/* Extract next character */
		mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);

		if (mbret == -1) {
			PERROR_MESSAGE("mbtowc");

			/* Put mbtowc's internal state back to initial */
			(void) mbtowc(0, 0, 0);

			/*
			 * ret still holds 0 from the previous iteration's
			 * append; make sure the failure is reported.
			 */
			ret = -1;
			goto done;
		}

		if (wc == L'&') {
			ret = append_str(out, out_idx, &out_sz, "&amp;", 5);
		} else if (wc == L'"') {
			ret = append_str(out, out_idx, &out_sz, "&quot;", 6);
		} else if (wc == L'\'') {
			ret = append_str(out, out_idx, &out_sz, "&apos;", 6);
		} else if (wc == L'<') {
			ret = append_str(out, out_idx, &out_sz, "&lt;", 4);
		} else if (wc == L'>') {
			ret = append_str(out, out_idx, &out_sz, "&gt;", 4);
		} else if (mbret == 1 &&
		           in[in_idx] >= ' ' &&
		           in[in_idx] <= '~') {
			ret = append_char(out, out_idx, &out_sz, in[in_idx]);
		} else if (mbret == 1 &&
		           in[in_idx] == '\r') {
			/* Carriage returns are silently discarded */
			ret = 0;
		} else if (mbret == 1 &&
		           in[in_idx] == '\n') {
			ret = append_char(out, out_idx, &out_sz, in[in_idx]);
		} else {
			ret = append_wchar_escaped(out, out_idx, &out_sz, wc);
		}

		in_idx += mbret;

		if (ret < 0) {
			goto done;
		}
	}

	*out_len = *out_len + (*out_idx - initial_out_idx);
	ret = 0;
done:

	return ret;
}
405 * From in construct *out, which is a codepoint-for-codepoint
406 * translation following the rules of unicode-transforms.h. The
407 * result is that *out can be matched with normal regex, even if
408 * in contains obfuscatory Unicode bullshit.
410 * Preconditions
412 * - setup_sanitize_comment() has been invoked more recently than
413 * clean_sanitize_comment().
415 * - in is memory of at least length in_len, valid UTF-8 text.
417 * - Overwriting *out and *out_position_map shall not cause a
418 * memory leak.
420 * - out, out_len, out_position_map, and out_position_map_len are
421 * not 0.
423 * Postconditions (success):
425 * - *out is valid, UTF-8 text of length *out_len.
427 * - For every j in [0, *out_len) such that (*out)[j] starts a
428 * codepoint, in[*(position_map)[j]] is the start of the
429 * corresponding codepoint.
431 * - (*position_map)[*out_len] = in_len.
433 static int
434 to_scannable(const char *in, size_t in_len, char **out, size_t *out_len,
435 size_t **out_position_map, size_t *out_position_map_len)
437 int ret = -1;
438 wchar_t wc = 0;
439 size_t in_idx = 0;
440 size_t out_idx = 0;
441 int mbret = 0;
442 struct translate *tr = 0;
443 size_t out_sz = 0;
445 if (!*out) {
446 if (!(*out = malloc(1))) {
447 PERROR_MESSAGE("malloc");
448 goto done;
451 out_sz = 1;
452 *out_len = 0;
453 (*out)[0] = '\0';
457 * Position_map is here to make wordfiltering work. Suppose in is
459 * I think Nina Purpleton did
460 * nothing wrong
462 * and a wordfilter /Nina Purpleton/i -> "worst girl" is
463 * in effect. Then *out will be
465 * I think Nina Purpleton did nothing wrong
467 * The message should, of course, be filtered to
469 * I think worst girl did nothing
470 * wrong
472 * In order to do that, it would be necessary to have a map
473 * from in to *out on the byte level, since the wordfilter
474 * will only be run against *out.
476 * position_map[j] = k means that out[j] and in[k] mean the
477 * same thing.
479 while (in_idx < in_len) {
480 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
482 if (mbret == -1) {
483 PERROR_MESSAGE("mbtowc");
484 goto done;
487 /* We pre-suppose that the insert will go as planned */
488 if (set_position_mapping(out_position_map, out_position_map_len,
489 out_idx, in_idx) < 0) {
490 goto done;
493 if (mbret == 1 &&
494 in[in_idx] >= ' ' &&
495 in[in_idx] <= '~') {
496 if (append_str(out, &out_idx, &out_sz, in + in_idx, 1) <
497 0) {
498 goto done;
500 } else {
501 if ((tr = bsearch(&wc, translates, NUM_OF(translates),
502 sizeof *translates,
503 match_translate))) {
504 if (append_str(out, &out_idx, &out_sz, tr->to,
505 strlen(tr->to)) < 0) {
506 goto done;
508 } else {
509 if (append_str(out, &out_idx, &out_sz, in +
510 in_idx, mbret) < 0) {
511 goto done;
516 in_idx += mbret;
519 if (set_position_mapping(out_position_map, out_position_map_len,
520 out_idx, in_len) < 0) {
521 goto done;
524 (*out)[out_idx] = '\0';
525 *out_len = out_idx;
526 ret = 0;
527 done:
529 return ret;
533 * Read through raw and scannable, checking all forbidden texts in
534 * scannable. If any match is detected, set *is_forbidden to 1.
536 * Preconditions
538 * - setup_sanitize_comment() has been invoked more recently than
539 * clean_sanitize_comment().
541 * - scannable is memory of length at least scannable_len.
543 * - out_is_forbidden, out_ban_duration, out_ban_reason are not 0.
545 * Postconditions (success):
547 * - if any regex specified by the forbidden array matches scannable,
548 * then *out_is_forbidden has been set to 1, with relevant
549 * *out_ban_duration, *out_ban_reason.
551 static int
552 check_forbidden_filters(const char *scannable, const size_t scannable_len,
553 uint_fast8_t *out_is_forbidden, int *out_ban_duration,
554 const
555 char **out_ban_reason)
557 int ret = -1;
559 /* These hold the match locations from pcre2 */
560 int num_matches = 0;
561 pcre2_match_data *match_data = 0;
563 for (size_t j = 0; j < forbiddens_num; ++j) {
564 if (!(match_data = pcre2_match_data_create_from_pattern(
565 forbiddens[j].code, 0))) {
566 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
567 goto done;
570 num_matches = pcre2_match(forbiddens[j].code,
571 (PCRE2_SPTR) scannable, scannable_len,
572 0, 0, match_data, 0);
574 if (num_matches > 0) {
575 *out_is_forbidden = 1;
576 *out_ban_duration = forbiddens[j].ban_duration;
577 *out_ban_reason = forbiddens[j].ban_reason;
578 j = forbiddens_num;
581 pcre2_match_data_free(match_data);
582 match_data = 0;
585 ret = 0;
586 done:
588 return ret;
592 * Read through raw and scannable, checking all wordfilters in
593 * scannable. Where a match is detected, the corresponding postion
594 * (via position_map) in raw is replaced by the replacement specified
595 * by the matching wordfilter.
597 * Preconditions
599 * - setup_sanitize_comment() has been invoked more recently than
600 * clean_sanitize_comment().
602 * - raw is memory of length at least raw_len, valid UTF-8 text.
604 * - scannable is memory of length at least scannable_len.
606 * - For any j in [0, scannable_len), position_map[j] is a valid
607 * index into raw, or is (size_t) -1.
609 * - position_map[scannable_len] = raw_len.
611 * - For any j in [0, scannable_len) such that k = position_map[j]
612 * is not (size_t) -1, scannable[j] and raw[k] are conceptually
613 * the same for wordfiltering.
615 * - Overwriting *out shall not cause a memory leak.
617 * - out and out_len are not 0.
619 * Postconditions (success):
621 * - *out is valid, UTF-8 text of length *out_len such that all
622 * non ASCII codepoints (and '<', '>', '&', '"', ''') are
623 * HTML-escaped.
625 * - *out represents raw, except in those sections of scannable
626 * where a wordfilter matched.
628 static int
629 wordfilter_to_html(const char *raw, const size_t raw_len, const char *scannable,
630 const size_t scannable_len, size_t *position_map, char **out,
631 size_t *out_len)
633 int ret = -1;
635 /* These hold the match locations from pcre2 */
636 uint32_t *ov_counts = 0;
637 PCRE2_SIZE **ov_ps = 0;
638 int *num_matches = 0;
639 pcre2_match_data **match_data = 0;
640 size_t raw_idx = 0;
641 size_t scannable_idx = 0;
642 size_t out_idx = 0;
643 size_t best_match_pos = 0;
644 size_t best_match_idx = 0;
645 size_t l = 0;
646 size_t mbret = 0;
648 if (!(ov_counts = calloc(wordfilters_num, sizeof *ov_counts))) {
649 PERROR_MESSAGE("calloc");
650 goto done;
653 if (!(ov_ps = calloc(wordfilters_num, sizeof *ov_ps))) {
654 PERROR_MESSAGE("calloc");
655 goto done;
658 if (!(num_matches = calloc(wordfilters_num, sizeof *num_matches))) {
659 PERROR_MESSAGE("calloc");
660 goto done;
663 if (!(match_data = calloc(wordfilters_num, sizeof *match_data))) {
664 PERROR_MESSAGE("calloc");
665 goto done;
668 /* First scan, before the loop */
669 for (size_t j = 0; j < wordfilters_num; ++j) {
670 if (!(match_data[j] = pcre2_match_data_create_from_pattern(
671 wordfilters[j].code, 0))) {
672 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
673 goto done;
676 num_matches[j] = pcre2_match(wordfilters[j].code,
677 (PCRE2_SPTR) scannable,
678 scannable_len, scannable_idx, 0,
679 match_data[j], 0);
682 handle_next_match:
683 best_match_pos = (size_t) -1;
684 best_match_idx = (size_t) -1;
686 /* We've run pcre2_match() on everything. Find the soonest match */
687 for (size_t j = 0; j < wordfilters_num; ++j) {
688 if (num_matches[j] <= 0) {
689 continue;
692 ov_ps[j] = pcre2_get_ovector_pointer(match_data[j]);
694 if (ov_ps[j][0] >= scannable_idx &&
695 ov_ps[j][0] < best_match_pos) {
696 best_match_pos = ov_ps[j][0];
697 best_match_idx = j;
701 if (best_match_idx == (size_t) -1) {
702 /* No matches. Turn the rest to html boring-like */
703 ret = to_html(raw, raw_len, raw_idx, out, out_len, &out_idx);
704 goto done;
707 /* Figure out where in raw this match starts */
708 l = best_match_pos;
710 while (l != (size_t) -1 &&
711 position_map[l] == (size_t) -1) {
712 l--;
715 if (l == (size_t) -1) {
716 ERROR_MESSAGE("Impossible condition in "
717 "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
718 raw,
719 best_match_pos);
720 goto done;
724 * Now position_map[l] points to the first character in raw
725 * that should be replaced. Fill up to that point.
727 if (position_map[l] &&
728 position_map[l] > raw_idx) {
729 if (to_html(raw, position_map[l], raw_idx, out, out_len,
730 &out_idx) < 0) {
731 goto done;
735 /* Put the substituted text in */
736 if (to_html(wordfilters[best_match_idx].replacement,
737 wordfilters[best_match_idx].replacement_len, 0, out,
738 out_len,
739 &out_idx) < 0) {
740 goto done;
744 * Figure out where we should advance to in inputs. Naively,
745 * we want to set scannable_idx to ov_ps[best_match_idx][1]
746 * (the first character in scannable beyond the match).
747 * However, we have to consider the case of
749 * foo!!!bar
751 * where "foo" -> "baz" is the only transformation. Since
752 * some characters, like "!", are completely ignored by
753 * the scannable transformation, the naive method would
754 * start our scanning at the "b", skipping information.
756 * So, instead, we carefully find the last character in
757 * "foo", then jump one past it. This (unfortunately)
758 * requires a bit more manual fiddling with wide character
759 * conversions.
762 if (ov_ps[best_match_idx][1] <= scannable_idx) {
764 * This should never happen, but let's make sure
765 * we always keep advancing.
767 scannable_idx++;
768 } else {
769 scannable_idx = ov_ps[best_match_idx][1] - 1;
772 l = scannable_idx;
774 while (position_map[l] == (size_t) -1) {
775 l--;
778 raw_idx = position_map[l];
780 /* This is the "jump one past it" part */
781 scannable_idx++;
782 errno = 0;
783 mbret = mbrlen(raw + raw_idx, MB_CUR_MAX, 0);
785 switch (mbret) {
786 case (size_t) -2:
787 case (size_t) -1:
788 PERROR_MESSAGE("mbrlen");
789 goto done;
790 default:
791 raw_idx += mbret;
795 * Now re-check all our matches and figure out which ones
796 * need to be updated
798 for (size_t j = 0; j < wordfilters_num; ++j) {
799 if ((num_matches[j] <= 0) ||
800 ov_ps[j][0] >= scannable_idx) {
801 continue;
804 num_matches[j] = pcre2_match(wordfilters[j].code,
805 (PCRE2_SPTR) scannable,
806 scannable_len, scannable_idx, 0,
807 match_data[j], 0);
810 goto handle_next_match;
811 done:
813 for (size_t j = 0; j < wordfilters_num; ++j) {
814 pcre2_match_data_free(match_data[j]);
815 match_data[j] = 0;
818 free(match_data);
819 free(num_matches);
820 free(ov_counts);
821 free(ov_ps);
823 return ret;
827 * Read through in. Each time a match for format_replacements is
828 * found (something like a newline or a quote) is found, replace
829 * it with some HTML markup. The result is placed in out.
831 * Preconditions:
833 * - setup_sanitize_comment() has been invoked more recently than
834 * clean_sanitize_comment().
836 * - in is memory of length at least in_len, valid UTF-8 text.
838 * - Overwriting *out shall not cause a memory leak.
840 * - out and out_len are not 0.
842 * Postconditions (success):
844 * - *out is valid, UTF-8 text of length *out_len with sane HTML
845 * markup (and HTML escaped), suitable for outputting into an
846 * HTML file.
848 static int
849 insert_html_tags(const char *in, size_t in_len, const char *board, char **out,
850 size_t *out_len)
852 int ret = -1;
853 size_t in_idx = 0;
854 size_t match_pos = 0;
855 size_t after_match_pos = 0;
856 size_t out_idx = 0;
857 pcre2_match_data *match_data = 0;
858 int nret = 0;
859 PCRE2_UCHAR *tmp_1 = 0;
860 PCRE2_SIZE tmp_1_len = 0;
861 PCRE2_UCHAR *tmp_2 = 0;
862 PCRE2_SIZE tmp_2_len = 0;
863 PCRE2_UCHAR *tmp_3 = 0;
864 PCRE2_SIZE tmp_3_len = 0;
865 uint_fast8_t last_was_newline = 1;
866 char *link_target = 0;
867 size_t link_target_len = 0;
869 if (!(match_data = pcre2_match_data_create_from_pattern(
870 format_replacements, 0))) {
871 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
872 goto done;
875 find_next_bit:
877 if (in_idx >= in_len) {
878 goto success;
881 nret = pcre2_match(format_replacements, (PCRE2_SPTR) in, in_len, in_idx,
882 0, match_data, 0);
884 if (nret == PCRE2_ERROR_NOMATCH) {
885 ret = append_str(out, &out_idx, out_len, in + in_idx, in_len -
886 in_idx);
887 goto done;
890 if (nret < 0) {
891 PCRE2_UCHAR8 err_buf[120];
893 pcre2_get_error_message(nret, err_buf, 120);
894 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
895 " (PCRE2 %d)", (int) (in_len - in_idx), in +
896 in_idx, err_buf,
897 nret);
898 goto done;
901 pcre2_substring_free(tmp_1);
902 pcre2_substring_free(tmp_2);
903 pcre2_substring_free(tmp_3);
904 free(link_target);
905 tmp_1 = 0;
906 tmp_2 = 0;
907 tmp_3 = 0;
908 link_target = 0;
910 /* We have match, stuff everything up to it in *out */
911 match_pos = pcre2_get_ovector_pointer(match_data)[0];
912 after_match_pos = pcre2_get_ovector_pointer(match_data)[1];
914 if (match_pos > in_idx) {
915 if (append_str(out, &out_idx, out_len, in + in_idx, match_pos -
916 in_idx) < 0) {
917 goto done;
920 last_was_newline = 0;
921 in_idx = match_pos;
924 /* Figure out what type of match. */
925 if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "newline",
926 &tmp_1, &tmp_1_len)) {
927 if (last_was_newline) {
928 if (append_const_str(out, &out_idx, out_len,
929 "&nbsp;<br />") < 0) {
930 goto done;
932 } else {
933 if (append_const_str(out, &out_idx, out_len, "<br />") <
934 0) {
935 goto done;
939 last_was_newline = 1;
940 in_idx = after_match_pos;
941 goto find_next_bit;
944 last_was_newline = 0;
946 if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "quote",
947 &tmp_1, &tmp_1_len)) {
948 if (append_const_str(out, &out_idx, out_len,
949 "<span class=\"quote\">") < 0) {
950 goto done;
953 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
954 (size_t) tmp_1_len) < 0) {
955 goto done;
958 if (append_const_str(out, &out_idx, out_len, "</span>") < 0) {
959 goto done;
962 in_idx = after_match_pos;
963 goto find_next_bit;
966 if (!pcre2_substring_get_byname(match_data,
967 (PCRE2_SPTR) "intra_postlink", &tmp_1,
968 &tmp_1_len)) {
969 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "a_num",
970 &tmp_2, &tmp_2_len)) {
971 goto problem_with_match;
974 int found = 0;
976 if (db_construct_post_link(board, strlen(board), (const
977 char *) tmp_2,
978 tmp_2_len, &found, &link_target,
979 &link_target_len) < 0) {
980 goto done;
983 if (!found) {
984 if (append_str(out, &out_idx, out_len, in + match_pos,
985 after_match_pos - match_pos) < 0) {
986 goto done;
989 in_idx = after_match_pos;
990 goto find_next_bit;
993 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
994 0) {
995 goto done;
998 if (append_str(out, &out_idx, out_len, link_target,
999 link_target_len) < 0) {
1000 goto done;
1003 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
1004 goto done;
1007 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
1008 (size_t) tmp_1_len) < 0) {
1009 goto done;
1012 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
1013 goto done;
1016 in_idx = after_match_pos;
1017 goto find_next_bit;
1020 if (!pcre2_substring_get_byname(match_data,
1021 (PCRE2_SPTR) "inter_postlink", &tmp_1,
1022 &tmp_1_len)) {
1023 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "e_num",
1024 &tmp_2, &tmp_2_len)) {
1025 goto problem_with_match;
1028 if (pcre2_substring_get_byname(match_data,
1029 (PCRE2_SPTR) "e_board", &tmp_3,
1030 &tmp_3_len)) {
1031 goto problem_with_match;
1034 int found = 0;
1036 if (db_construct_post_link((const char *) tmp_3, tmp_3_len,
1037 (const char *) tmp_2, tmp_2_len,
1038 &found, &link_target,
1039 &link_target_len) < 0) {
1040 goto done;
1043 if (!found) {
1044 if (append_str(out, &out_idx, out_len, in + match_pos,
1045 after_match_pos - match_pos) < 0) {
1046 goto done;
1049 in_idx = after_match_pos;
1050 goto find_next_bit;
1053 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
1054 0) {
1055 goto done;
1058 if (append_str(out, &out_idx, out_len, link_target,
1059 link_target_len) < 0) {
1060 goto done;
1063 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
1064 goto done;
1067 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
1068 (size_t) tmp_1_len) < 0) {
1069 goto done;
1072 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
1073 goto done;
1076 in_idx = after_match_pos;
1077 goto find_next_bit;
1080 problem_with_match:
1082 /* There was some kind of match, but it went wrong. */
1083 in_idx++;
1084 goto find_next_bit;
1085 success:
1086 ret = 0;
1087 done:
1088 *out_len = out_idx;
1089 pcre2_substring_free(tmp_1);
1090 pcre2_substring_free(tmp_2);
1091 pcre2_substring_free(tmp_3);
1092 pcre2_match_data_free(match_data);
1094 return ret;
1098 * Make sure that the contents of *pc are ready for safe injection
1099 * into the board, including HTML escaping, wordfiltering, general
1100 * formatting, and adding links.
1102 * Preconditions
1104 * - setup_sanitize_comment() has been invoked more recently than
1105 * clean_sanitize_comment().
1107 * - *pc has been filled out (fields like action, board, etc. have
1108 * been populated) from the POST data.
1110 * Postconditions (success):
1112 * - The prepared_XYZ fields of *pc have been filled out, and each
1113 * is valid ASCII text, with Unicode codepoints.
1116 st_sanitize_text(struct post_cmd *pc, int *our_fault,
1117 uint_fast8_t *is_forbidden, int *ban_duration, const
1118 char **ban_reason)
1120 int ret = -1;
1121 size_t out_idx = 0;
1122 char *html_escaped_comment = 0;
1123 size_t html_escaped_comment_len = 0;
1125 /* Flush out lurking double-free bugs */
1126 free(pc->prepared.name);
1127 pc->prepared.name = 0;
1128 pc->prepared.name_len = 0;
1129 free(pc->prepared.email);
1130 pc->prepared.email = 0;
1131 pc->prepared.email_len = 0;
1132 free(pc->prepared.subject);
1133 pc->prepared.subject = 0;
1134 pc->prepared.subject_len = 0;
1135 free(pc->prepared.comment);
1136 pc->prepared.comment = 0;
1137 pc->prepared.comment_len = 0;
1138 free(pc->prepared.file_name);
1139 pc->prepared.file_name = 0;
1140 pc->prepared.file_name_len = 0;
1141 free(pc->scannable_comment);
1142 pc->scannable_comment = 0;
1143 pc->scannable_comment_len = 0;
1144 free(pc->comment_position_map);
1145 pc->comment_position_map = 0;
1146 pc->comment_position_map_len = 0;
1147 free(pc->scannable_name);
1148 pc->scannable_name = 0;
1149 pc->scannable_name_len = 0;
1150 free(pc->name_position_map);
1151 pc->name_position_map = 0;
1152 pc->name_position_map_len = 0;
1153 free(pc->scannable_email);
1154 pc->scannable_email = 0;
1155 pc->scannable_email_len = 0;
1156 free(pc->email_position_map);
1157 pc->email_position_map = 0;
1158 pc->email_position_map_len = 0;
1159 free(pc->scannable_subject);
1160 pc->scannable_subject = 0;
1161 pc->scannable_subject_len = 0;
1162 free(pc->subject_position_map);
1163 pc->subject_position_map = 0;
1164 pc->subject_position_map_len = 0;
1165 free(pc->scannable_filename);
1166 pc->scannable_filename = 0;
1167 pc->scannable_filename_len = 0;
1168 free(pc->filename_position_map);
1169 pc->filename_position_map = 0;
1170 pc->filename_position_map_len = 0;
1171 out_idx = 0;
1173 if (!pc->raw.name_len) {
1174 free(pc->raw.name);
1176 if (!(pc->raw.name = strdup("Anonymous"))) {
1177 PERROR_MESSAGE("strdup");
1178 *our_fault = 1;
1179 goto done;
1182 pc->raw.name_len = strlen(pc->raw.name);
1185 if (pc->raw.name_len) {
1186 if (to_html(pc->raw.name, pc->raw.name_len, 0,
1187 &pc->prepared.name, &pc->prepared.name_len,
1188 &out_idx) < 0) {
1189 *our_fault = 1;
1190 goto done;
1194 out_idx = 0;
1196 if (pc->raw.email_len) {
1197 if (to_html(pc->raw.email, pc->raw.email_len, 0,
1198 &pc->prepared.email, &pc->prepared.email_len,
1199 &out_idx) < 0) {
1200 *our_fault = 1;
1201 goto done;
1205 out_idx = 0;
1207 if (pc->raw.tripcode_len) {
1208 if (to_html(pc->raw.tripcode, pc->raw.tripcode_len, 0,
1209 &pc->prepared.tripcode, &pc->prepared.tripcode_len,
1210 &out_idx) <
1211 0) {
1212 *our_fault = 1;
1213 goto done;
1217 out_idx = 0;
1219 if (pc->raw.subject_len) {
1220 if (to_html(pc->raw.subject, pc->raw.subject_len, 0,
1221 &pc->prepared.subject, &pc->prepared.subject_len,
1222 &out_idx) <
1223 0) {
1224 *our_fault = 1;
1225 goto done;
1229 out_idx = 0;
1231 if (pc->raw.file_name_len) {
1232 if (to_html(pc->raw.file_name, pc->raw.file_name_len, 0,
1233 &pc->prepared.file_name,
1234 &pc->prepared.file_name_len,
1235 &out_idx) < 0) {
1236 *our_fault = 1;
1237 goto done;
1241 if (to_scannable(pc->raw.comment, pc->raw.comment_len,
1242 &pc->scannable_comment, &pc->scannable_comment_len,
1243 &pc->comment_position_map,
1244 &pc->comment_position_map_len)) {
1245 *our_fault = 1;
1246 goto done;
1249 if (to_scannable(pc->raw.name, pc->raw.name_len, &pc->scannable_name,
1250 &pc->scannable_name_len, &pc->name_position_map,
1251 &pc->name_position_map_len)) {
1252 *our_fault = 1;
1253 goto done;
1256 if (to_scannable(pc->raw.email, pc->raw.email_len, &pc->scannable_email,
1257 &pc->scannable_email_len, &pc->email_position_map,
1258 &pc->email_position_map_len)) {
1259 *our_fault = 1;
1260 goto done;
1263 if (to_scannable(pc->raw.subject, pc->raw.subject_len,
1264 &pc->scannable_subject, &pc->scannable_subject_len,
1265 &pc->subject_position_map,
1266 &pc->subject_position_map_len)) {
1267 *our_fault = 1;
1268 goto done;
1271 if (to_scannable(pc->raw.file_name, pc->raw.file_name_len,
1272 &pc->scannable_filename, &pc->scannable_filename_len,
1273 &pc->filename_position_map,
1274 &pc->filename_position_map_len)) {
1275 *our_fault = 1;
1276 goto done;
1280 * Are they a spambot?
1282 if (check_forbidden_filters(pc->scannable_comment,
1283 pc->scannable_comment_len, is_forbidden,
1284 ban_duration, ban_reason) <
1285 0) {
1286 *our_fault = 1;
1287 goto done;
1290 if (*is_forbidden) {
1291 goto done;
1294 if (check_forbidden_filters(pc->scannable_name, pc->scannable_name_len,
1295 is_forbidden, ban_duration, ban_reason) <
1296 0) {
1297 *our_fault = 1;
1298 goto done;
1301 if (*is_forbidden) {
1302 goto done;
1305 if (check_forbidden_filters(pc->scannable_email,
1306 pc->scannable_email_len, is_forbidden,
1307 ban_duration, ban_reason) < 0) {
1308 *our_fault = 1;
1309 goto done;
1312 if (*is_forbidden) {
1313 goto done;
1316 if (check_forbidden_filters(pc->scannable_subject,
1317 pc->scannable_subject_len, is_forbidden,
1318 ban_duration, ban_reason) <
1319 0) {
1320 *our_fault = 1;
1321 goto done;
1324 if (*is_forbidden) {
1325 goto done;
1328 if (check_forbidden_filters(pc->scannable_filename,
1329 pc->scannable_filename_len, is_forbidden,
1330 ban_duration, ban_reason) <
1331 0) {
1332 *our_fault = 1;
1333 goto done;
1336 if (*is_forbidden) {
1337 *our_fault = 0;
1338 goto done;
1342 * Now we do the fancy thing. Match scannable, build prepared
1343 * out of that.
1345 if (wordfilter_to_html(pc->raw.comment, pc->raw.comment_len,
1346 pc->scannable_comment, pc->scannable_comment_len,
1347 pc->comment_position_map, &html_escaped_comment,
1348 &html_escaped_comment_len) < 0) {
1349 *our_fault = 1;
1350 goto done;
1354 * Everything's in &#123; form, but now take care of >>123,
1355 * <br />, etc.
1357 if (insert_html_tags(html_escaped_comment, html_escaped_comment_len,
1358 pc->raw.board, &pc->prepared.comment,
1359 &pc->prepared.comment_len) < 0) {
1360 *our_fault = 1;
1361 goto done;
1364 ret = 0;
1365 done:
1366 free(html_escaped_comment);
1368 return ret;
1372 * Initialize any static elements needed for this file.
1374 * Preconditions:
1376 * - setup_sanitize_comment() was not invoked more recently than
1377 * clean_sanitize_comment().
1379 * Postconditions (success):
1381 * - Any other function in this file may be safely called.
1384 setup_sanitize_comment(const struct configuration *conf)
1387 * Check that the locale/libc/whatever is set up so that
1388 * UTF-8 handling can work.
1390 int ret = -1;
1391 const char *raw =
1392 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1393 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1394 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1395 "\u2468\u0294!\u0ce2!!";
1396 const char *correct_html =
1397 "&lt;script&gt;alert(1)&lt;/script&gt; , &#120081;&#120102;"
1398 "&#120107;&#120094;&#12288;&#120083;&#120114;&#120111;"
1399 "&#120109;&#120105;&#120098;&#120113;&#120108;&#120107; &amp;"
1400 " &#9320;&#660;!&#3298;!!";
1401 const char *correct_scannable =
1402 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1403 char *html = 0;
1404 size_t html_len = 0;
1405 char *scannable = 0;
1406 size_t scannable_len = 0;
1407 size_t *position_map = 0;
1408 size_t position_map_len = 0;
1409 size_t out_idx = 0;
1411 /* For pcre2_get_error_message */
1412 int err_code = 0;
1413 PCRE2_SIZE err_offset = 0;
1414 PCRE2_UCHAR8 err_buf[120];
1416 if (to_html(raw, strlen(raw), 0, &html, &html_len, &out_idx) < 0) {
1417 goto done;
1420 if (strcmp(html, correct_html)) {
1421 ERROR_MESSAGE("Was expecting html conversion to yield "
1422 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1423 "\n\n\u00ab%s\u00bb\n\n",
1424 correct_html, html);
1425 goto done;
1428 if (to_scannable(raw, strlen(raw), &scannable, &scannable_len,
1429 &position_map, &position_map_len) < 0) {
1430 goto done;
1433 if (strcmp(scannable, correct_scannable)) {
1434 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1435 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1436 "\n\n\u00ab%s\u00bb\n\n",
1437 correct_scannable, scannable);
1438 goto done;
1441 if (!(wordfilters = calloc(conf->wordfilter_inputs_num,
1442 sizeof *wordfilters))) {
1443 PERROR_MESSAGE("calloc");
1444 goto done;
1447 wordfilters_num = conf->wordfilter_inputs_num;
1449 for (size_t j = 0; j < wordfilters_num; ++j) {
1450 wordfilters[j].replacement =
1451 conf->wordfilter_inputs[j].replacement;
1452 wordfilters[j].replacement_len = strlen(
1453 conf->wordfilter_inputs[j].replacement);
1455 if ((wordfilters[j].code = pcre2_compile(
1456 (PCRE2_SPTR8) conf->wordfilter_inputs[j].pattern,
1457 PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1458 &err_offset, 0))) {
1459 continue;
1462 pcre2_get_error_message(err_code, err_buf, 120);
1463 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1464 conf->wordfilter_inputs[j].pattern, err_buf);
1465 goto done;
1468 if (!(forbiddens = calloc(conf->forbidden_inputs_num,
1469 sizeof *forbiddens))) {
1470 PERROR_MESSAGE("calloc");
1471 goto done;
1474 forbiddens_num = conf->forbidden_inputs_num;
1476 for (size_t j = 0; j < forbiddens_num; ++j) {
1477 forbiddens[j].ban_duration =
1478 conf->forbidden_inputs[j].ban_duration;
1479 forbiddens[j].ban_reason = conf->forbidden_inputs[j].ban_reason;
1481 if ((forbiddens[j].code = pcre2_compile(
1482 (PCRE2_SPTR8) conf->forbidden_inputs[j].pattern,
1483 PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1484 &err_offset, 0))) {
1485 continue;
1488 pcre2_get_error_message(err_code, err_buf, 120);
1489 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1490 conf->forbidden_inputs[j].pattern, err_buf);
1491 goto done;
1494 const char *format_match_str =
1496 /* */
1497 "(?<newline>\\n)" /* */
1498 "|(?<intra_postlink>&gt;&gt;(?<a_num>[0-9]+))" /* */
1499 "|(?<inter_postlink>&gt;&gt;&gt;/" /* */
1500 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))" /* */
1501 "|(?<quote>(?<![^\n])&gt;[^\n]*)"; /* */
1503 if (!(format_replacements = pcre2_compile(
1504 (PCRE2_SPTR8) format_match_str, PCRE2_ZERO_TERMINATED,
1505 PCRE2_UTF,
1506 &err_code, &err_offset, 0))) {
1507 pcre2_get_error_message(err_code, err_buf, 120);
1508 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1509 format_match_str, err_buf);
1510 goto done;
1513 ret = 0;
1514 done:
1515 free(html);
1516 free(scannable);
1517 free(position_map);
1519 return ret;
1523 * Clean up any memory from this file
1525 * Postconditions (success):
1527 * - Valgrind won't report any memory leaks from this file.
1529 * - setup_sanitize_comment() can be safely called again.
1532 clean_sanitize_comment(void)
1534 for (size_t j = 0; j < wordfilters_num; ++j) {
1535 pcre2_code_free(wordfilters[j].code);
1536 wordfilters[j] = (struct wordfilter) { 0 };
1539 for (size_t j = 0; j < forbiddens_num; ++j) {
1540 pcre2_code_free(forbiddens[j].code);
1541 forbiddens[j] = (struct forbidden) { 0 };
1544 pcre2_code_free(format_replacements);
1545 format_replacements = 0;
1546 free(wordfilters);
1547 wordfilters = 0;
1548 wordfilters_num = 0;
1549 free(forbiddens);
1550 forbiddens = 0;
1551 forbiddens_num = 0;
1553 return 0;