misc: bump copyright years
[rb-79.git] / sanitize-comment.c
blob6346a69304264c8823eb6c134578fce9dc420999
1 /*
2 * Copyright (c) 2017-2020, De Rais <derais@cock.li>
4 * Permission to use, copy, modify, and/or distribute this software for
5 * any purpose with or without fee is hereby granted, provided that the
6 * above copyright notice and this permission notice appear in all
7 * copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
10 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
11 * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
12 * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
13 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
14 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
15 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
16 * PERFORMANCE OF THIS SOFTWARE.
18 #include <errno.h>
19 #include <stdint.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <time.h>
24 #include <wchar.h>
26 #define PCRE2_CODE_UNIT_WIDTH 8
27 #include <pcre2.h>
29 #include "macros.h"
30 #include "rb79.h"
31 #include "unicode-transforms.h"
/*
 * We need a way to get codepoints out of UTF-8 strings, and that
 * only works through wchar_t if wchar_t stores codepoint values. An
 * implementation promises exactly that by defining
 * __STDC_ISO_10646__. You can remove this check and cross your
 * fingers, since rb79 will do a quick check on startup, but please
 * check why the C implementation doesn't define __STDC_ISO_10646__
 * first.
 */
#ifndef __STDC_ISO_10646__
#error We really want __STDC_ISO_10646__
#endif
46 * A wordfilter consists of a pcre2 regex and a replacement string
48 struct wordfilter {
49 /* */
50 pcre2_code *code;
51 const char *replacement;
52 size_t replacement_len;
55 /* These are constructed in setup_sanitize_comment() */
56 static struct wordfilter *wordfilters;
57 static size_t wordfilters_num;
59 /* Special matcher for quoting, newlines, linkifying, etc. */
60 static pcre2_code *format_replacements;
63 * Comparison function for struct translate.
65 * Preconditions:
67 * - *key_v is a wchar_t.
69 * - *tr_v is a struct translate object.
71 * Postconditions:
73 * - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
74 * than] *tr_v's starting range.
76 static int
77 match_translate(const void *key_v, const void *tr_v)
79 const wchar_t *key = key_v;
80 const struct translate *tr = tr_v;
82 if (*key < tr->from_s) {
83 return -1;
84 } else if (*key > tr->from_t) {
85 return 1;
88 return 0;
/*
 * Append str_len bytes of str to the growable string *buf.
 *
 * Preconditions:
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - str is a valid ASCII (not just UTF-8) string of length str_len.
 *
 * Postconditions (success, returns 0):
 *
 *  - *buf is memory of length *sz (both possibly updated by a
 *    realloc), and up to *idx is a valid UTF-8 string.
 *
 *  - The contents of str have been appended to *buf, followed by a
 *    terminating NUL, and *idx has advanced by str_len.
 *
 * On overflow or allocation failure, returns -1 and leaves *buf,
 * *idx and *sz untouched.
 */
static int
append_str(char **buf, size_t *idx, size_t *sz, const char *str, size_t str_len)
{
	const size_t end = str_len + *idx;

	/* Grow whenever there is no room for the bytes plus the NUL */
	if (end >= *sz) {
		const size_t grown = end + (1 << 9);
		char *bigger = 0;

		if (end < str_len ||
		    grown < end) {
			ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
			              str_len, *idx);

			return -1;
		}

		bigger = realloc(*buf, grown);

		if (!bigger) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*buf = bigger;
		*sz = grown;
	}

	char *dst = *buf + *idx;

	strncpy(dst, str, str_len);
	dst[str_len] = '\0';
	*idx = end;

	return 0;
}
/*
 * Convenience wrapper around append_str() for NUL-terminated
 * strings: the length is measured with strlen() here so callers
 * don't have to. Returns whatever append_str() returns.
 */
static int
append_const_str(char **buf, size_t *idx, size_t *len, const char *str)
{
	const size_t str_len = strlen(str);

	return append_str(buf, idx, len, str, str_len);
}
/*
 * Append the single character c to the growable string *buf.
 *
 * Preconditions:
 *
 *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - c is an ASCII character.
 *
 * Postconditions (success, returns 0):
 *
 *  - *buf is memory of length *len (both possibly updated by a
 *    realloc), and up to *idx is a valid UTF-8 string.
 *
 *  - c has been appended to *buf, followed by a terminating NUL,
 *    and *idx has advanced by one.
 *
 * On overflow or allocation failure, returns -1 and leaves *buf,
 * *idx and *len untouched.
 */
static int
append_char(char **buf, size_t *idx, size_t *len, char c)
{
	const size_t after = 1 + *idx;

	/* Grow whenever there is no room for c plus the NUL */
	if (after >= *len) {
		const size_t grown = after + (1 << 9);
		char *bigger = 0;

		if (after < *idx ||
		    grown < after) {
			ERROR_MESSAGE("overflow (*idx = %zu)", *idx);

			return -1;
		}

		bigger = realloc(*buf, grown);

		if (!bigger) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*buf = bigger;
		*len = grown;
	}

	(*buf)[*idx] = c;
	(*buf)[after] = '\0';
	*idx = after;

	return 0;
}
/*
 * Append a Unicode codepoint, as a decimal HTML character
 * reference, to the growable string *buf.
 *
 * Preconditions:
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - wc is a valid Unicode codepoint.
 *
 * Postconditions (success, returns 0):
 *
 *  - *buf is memory of length *sz (both possibly updated by a
 *    realloc), and up to *idx is a valid UTF-8 string.
 *
 *  - An HTML-escaped sequence like &#123; has been appended to
 *    *buf (and *idx includes this).
 *
 * On formatting failure, overflow or allocation failure, returns -1
 * and leaves *buf, *idx and *sz untouched.
 */
static int
append_wchar_escaped(char **buf, size_t *idx, size_t *sz, wchar_t wc)
{
	/* Dry run: how many bytes does the escape need (NUL excluded)? */
	int needed = snprintf(0, 0, "&#%ld;", (long) wc);
	size_t l = 0;

	/*
	 * A negative return must be caught before it is converted to
	 * size_t, where it would turn into an enormous length.
	 */
	if (needed < 0) {
		PERROR_MESSAGE("snprintf");

		return -1;
	}

	l = (size_t) needed;

	if (l + *idx >= *sz) {
		void *newmem = 0;
		size_t new_sz = l + *idx + (1 << 9);

		if (*idx + l < *idx ||
		    *idx + l + (1 << 9) < *idx + l) {
			ERROR_MESSAGE("overflow (*idx = %zu, l = %zu)", *idx,
			              l);

			return -1;
		}

		if (!(newmem = realloc(*buf, new_sz))) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*buf = newmem;
		*sz = new_sz;
	}

	/* The check above guarantees room for l bytes plus the NUL */
	snprintf(*buf + *idx, l + 1, "&#%ld;", (long) wc);
	*idx += l;

	return 0;
}
/*
 * Record (*map)[j] = k, enlarging *map first if j does not fit.
 *
 * Preconditions:
 *
 *  - *map is memory of length *len (counted in elements).
 *
 * Postconditions (success, returns 0):
 *
 *  - *map is memory of length *len; when growth was needed, *len is
 *    now j + 2 and every newly added slot holds (size_t) -1.
 *
 *  - (*map)[j] = k.
 *
 * On overflow or allocation failure, returns -1 and leaves *map and
 * *len untouched.
 */
static int
set_position_mapping(size_t **map, size_t *len, size_t j, size_t k)
{
	const size_t unmapped = (size_t) -1;
	const size_t wanted = j + 2;

	if (j + 1 >= *len) {
		size_t *grown = 0;
		size_t fill = 0;

		/* Both the element count and the byte count must not wrap */
		if (wanted < j ||
		    (wanted * sizeof **map) / wanted != sizeof **map) {
			ERROR_MESSAGE("overflow (j = %zu)", j);

			return -1;
		}

		grown = realloc(*map, wanted * sizeof **map);

		if (!grown) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*map = grown;

		/* Mark every fresh slot as "no corresponding position" */
		fill = *len;

		while (fill < wanted) {
			grown[fill] = unmapped;
			fill++;
		}

		*len = wanted;
	}

	(*map)[j] = k;

	return 0;
}
/*
 * HTML-escape in[in_idx .. in_len) onto the end of *out.
 *
 * Preconditions:
 *
 *  - in is memory of at least length in_len, valid UTF-8 text.
 *
 *  - *out is memory of at least length *out_len (if *out_len = 0,
 *    *out may be 0), valid UTF-8 text.
 *
 *  - Overwriting *out and *out_len shall not cause a memory leak.
 *
 *  - out, out_len, and out_idx are not 0.
 *
 * Postconditions (success, returns 0):
 *
 *  - *out is memory of at least length *out_len, valid UTF-8 text.
 *
 *  - A stretch of HTML-escaped ASCII text representing in has been
 *    added to *out at the position that was *out_idx. Conversion
 *    stops early at the first NUL byte of in. '&', '"', ''', '<'
 *    and '>' become named entities, printable ASCII and '\n' are
 *    copied, '\r' is dropped, and everything else becomes a decimal
 *    character reference.
 *
 *  - *out_idx has been updated to point to the end of this stretch.
 *
 *  - *out_len has grown by the number of bytes added.
 *
 * On failure returns a negative value; *out may hold partial output
 * and *out_len is not updated.
 */
static int
to_html(const char *in, const size_t in_len, size_t in_idx, char **out,
        size_t *out_len, size_t *out_idx)
{
	int ret = -1;
	wchar_t wc = 0;
	int mbret = 0;

	/*
	 * The real capacity of a caller-supplied *out is unknown, so
	 * out_sz starts at 0 and the first append reallocates.
	 */
	size_t out_sz = 0;
	size_t initial_out_idx = *out_idx;

	if (!*out) {
		if (!(*out = malloc(1))) {
			PERROR_MESSAGE("malloc");
			goto done;
		}

		out_sz = 1;
		*out_len = 0;
		(*out)[0] = '\0';
	}

	/*
	 * XXX: If you make this multithreaded, be sure to use
	 * mbrtowc(3) here!
	 */
	while (in_idx < in_len &&
	       in[in_idx]) {
		/* Extract next character */
		mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);

		if (mbret == -1) {
			PERROR_MESSAGE("mbtowc");

			/*
			 * ret still holds the (successful) result of the
			 * previous append; reset it so that a decoding
			 * failure is never reported as success.
			 */
			ret = -1;
			goto done;
		}

		if (wc == L'&') {
			ret = append_str(out, out_idx, &out_sz, "&amp;", 5);
		} else if (wc == L'"') {
			ret = append_str(out, out_idx, &out_sz, "&quot;", 6);
		} else if (wc == L'\'') {
			ret = append_str(out, out_idx, &out_sz, "&apos;", 6);
		} else if (wc == L'<') {
			ret = append_str(out, out_idx, &out_sz, "&lt;", 4);
		} else if (wc == L'>') {
			ret = append_str(out, out_idx, &out_sz, "&gt;", 4);
		} else if (mbret == 1 &&
		           in[in_idx] >= ' ' &&
		           in[in_idx] <= '~') {
			ret = append_char(out, out_idx, &out_sz, in[in_idx]);
		} else if (mbret == 1 &&
		           in[in_idx] == '\r') {
			/* Carriage returns are silently dropped */
			ret = 0;
		} else if (mbret == 1 &&
		           in[in_idx] == '\n') {
			ret = append_char(out, out_idx, &out_sz, in[in_idx]);
		} else {
			ret = append_wchar_escaped(out, out_idx, &out_sz, wc);
		}

		in_idx += mbret;

		if (ret < 0) {
			goto done;
		}
	}

	*out_len = *out_len + (*out_idx - initial_out_idx);
	ret = 0;
done:

	return ret;
}
393 * From in construct *out, which is a codepoint-for-codepoint
394 * translation following the rules of unicode-transforms.h. The
395 * result is that *out can be matched with normal regex, even if
396 * in contains obfuscatory Unicode bullshit.
398 * Preconditions
400 * - setup_sanitize_comment() has been invoked more recently than
401 * clean_sanitize_comment().
403 * - in is memory of at least length in_len, valid UTF-8 text.
405 * - Overwriting *out and *out_position_map shall not cause a
406 * memory leak.
408 * - out, out_len, out_position_map, and out_position_map_len are
409 * not 0.
411 * Postconditions (success):
413 * - *out is valid, UTF-8 text of length *out_len.
415 * - For every j in [0, *out_len) such that (*out)[j] starts a
416 * codepoint, in[*(position_map)[j]] is the start of the
417 * corresponding codepoint.
419 * - (*position_map)[*out_len] = in_len.
421 static int
422 to_scannable(const char *in, size_t in_len, char **out, size_t *out_len,
423 size_t **out_position_map, size_t *out_position_map_len)
425 int ret = -1;
426 wchar_t wc = 0;
427 size_t in_idx = 0;
428 size_t out_idx = 0;
429 int mbret = 0;
430 struct translate *tr = 0;
431 size_t out_sz = 0;
433 if (!*out) {
434 if (!(*out = malloc(1))) {
435 PERROR_MESSAGE("malloc");
436 goto done;
439 out_sz = 1;
440 *out_len = 0;
441 (*out)[0] = '\0';
445 * Position_map is here to make wordfiltering work. Suppose in is
447 * I think Nina Purpleton did
448 * nothing wrong
450 * and a wordfilter /Nina Purpleton/i -> "worst girl" is
451 * in effect. Then *out will be
453 * I think Nina Purpleton did nothing wrong
455 * The message should, of course, be filtered to
457 * I think worst girl did nothing
458 * wrong
460 * In order to do that, it would be necessary to have a map
461 * from in to *out on the byte level, since the wordfilter
462 * will only be run against *out.
464 * position_map[j] = k means that out[j] and in[k] mean the
465 * same thing.
467 while (in_idx < in_len) {
468 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
470 if (mbret == -1) {
471 PERROR_MESSAGE("mbtowc");
472 goto done;
475 /* We pre-suppose that the insert will go as planned */
476 if (set_position_mapping(out_position_map, out_position_map_len,
477 out_idx, in_idx) < 0) {
478 goto done;
481 if (mbret == 1 &&
482 in[in_idx] >= ' ' &&
483 in[in_idx] <= '~') {
484 if (append_str(out, &out_idx, &out_sz, in + in_idx, 1) <
485 0) {
486 goto done;
488 } else {
489 if ((tr = bsearch(&wc, translates, NUM_OF(translates),
490 sizeof *translates,
491 match_translate))) {
492 if (append_str(out, &out_idx, &out_sz, tr->to,
493 strlen(tr->to)) < 0) {
494 goto done;
496 } else {
497 if (append_str(out, &out_idx, &out_sz, in +
498 in_idx, mbret) < 0) {
499 goto done;
504 in_idx += mbret;
507 if (set_position_mapping(out_position_map, out_position_map_len,
508 out_idx, in_len) < 0) {
509 goto done;
512 (*out)[out_idx] = '\0';
513 *out_len = out_idx;
514 ret = 0;
515 done:
517 return ret;
521 * Read through raw and scannable, checking all wordfilters in
522 * scannable. Where a match is detected, the corresponding postion
523 * (via position_map) in raw is replaced by the replacement specified
524 * by the matching wordfilter.
526 * Preconditions
528 * - setup_sanitize_comment() has been invoked more recently than
529 * clean_sanitize_comment().
531 * - raw is memory of length at least raw_len, valid UTF-8 text.
533 * - scannable is memory of length at least scannable_len.
535 * - For any j in [0, scannable_len), position_map[j] is a valid
536 * index into raw, or is (size_t) -1.
538 * - position_map[scannable_len] = raw_len.
540 * - For any j in [0, scannable_len) such that k = position_map[j]
541 * is not (size_t) -1, scannable[j] and raw[k] are conceptually
542 * the same for wordfiltering.
544 * - Overwriting *out shall not cause a memory leak.
546 * - out and out_len are not 0.
548 * Postconditions (success):
550 * - *out is valid, UTF-8 text of length *out_len such that all
551 * non ASCII codepoints (and '<', '>', '&', '"', ''') are
552 * HTML-escaped.
554 * - *out represents raw, except in those sections of scannable
555 * where a wordfilter matched.
557 static int
558 wordfilter_to_html(const char *raw, const size_t raw_len, const char *scannable,
559 const size_t scannable_len, size_t *position_map, char **out,
560 size_t *out_len)
562 int ret = -1;
564 /* These hold the match locations from pcre2 */
565 uint32_t *ov_counts = 0;
566 PCRE2_SIZE **ov_ps = 0;
567 int *num_matches = 0;
568 pcre2_match_data **match_data = 0;
569 size_t raw_idx = 0;
570 size_t scannable_idx = 0;
571 size_t out_idx = 0;
572 size_t best_match_pos = 0;
573 size_t best_match_idx = 0;
574 size_t l = 0;
575 size_t mbret = 0;
577 if (!(ov_counts = calloc(wordfilters_num, sizeof *ov_counts))) {
578 PERROR_MESSAGE("calloc");
579 goto done;
582 if (!(ov_ps = calloc(wordfilters_num, sizeof *ov_ps))) {
583 PERROR_MESSAGE("calloc");
584 goto done;
587 if (!(num_matches = calloc(wordfilters_num, sizeof *num_matches))) {
588 PERROR_MESSAGE("calloc");
589 goto done;
592 if (!(match_data = calloc(wordfilters_num, sizeof *match_data))) {
593 PERROR_MESSAGE("calloc");
594 goto done;
597 /* First scan, before the loop */
598 for (size_t j = 0; j < wordfilters_num; ++j) {
599 if (!(match_data[j] = pcre2_match_data_create_from_pattern(
600 wordfilters[j].code, 0))) {
601 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
602 goto done;
605 num_matches[j] = pcre2_match(wordfilters[j].code,
606 (PCRE2_SPTR) scannable,
607 scannable_len, scannable_idx, 0,
608 match_data[j], 0);
611 handle_next_match:
612 best_match_pos = (size_t) -1;
613 best_match_idx = (size_t) -1;
615 /* We've run pcre2_match() on everything. Find the soonest match */
616 for (size_t j = 0; j < wordfilters_num; ++j) {
617 if (!num_matches[j]) {
618 continue;
621 ov_ps[j] = pcre2_get_ovector_pointer(match_data[j]);
623 if (ov_ps[j][0] >= scannable_idx &&
624 ov_ps[j][0] < best_match_pos) {
625 best_match_pos = ov_ps[j][0];
626 best_match_idx = j;
630 if (best_match_idx == (size_t) -1) {
631 /* No matches. Turn the rest to html boring-like */
632 ret = to_html(raw, raw_len, raw_idx, out, out_len, &out_idx);
633 goto done;
636 /* Figure out where in raw this match starts */
637 l = best_match_pos;
639 while (l != (size_t) -1 &&
640 position_map[l] == (size_t) -1) {
641 l--;
644 if (l == (size_t) -1) {
645 ERROR_MESSAGE("Impossible condition in "
646 "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
647 raw,
648 best_match_pos);
649 goto done;
653 * Now position_map[l] points to the first character in raw
654 * that should be replaced. Fill up to that point.
656 if (position_map[l] &&
657 position_map[l] > raw_idx) {
658 if (to_html(raw, position_map[l], raw_idx, out, out_len,
659 &out_idx) < 0) {
660 goto done;
664 /* Put the substituted text in */
665 if (to_html(wordfilters[best_match_idx].replacement,
666 wordfilters[best_match_idx].replacement_len, 0, out,
667 out_len,
668 &out_idx) < 0) {
669 goto done;
673 * Figure out where we should advance to in inputs. Naively,
674 * we want to set scannable_idx to ov_ps[best_match_idx][1]
675 * (the first character in scannable beyond the match).
676 * However, we have to consider the case of
678 * foo!!!bar
680 * where "foo" -> "baz" is the only transformation. Since
681 * some characters, like "!", are completely ignored by
682 * the scannable transformation, the naive method would
683 * start our scanning at the "b", skipping information.
685 * So, instead, we carefully find the last character in
686 * "foo", then jump one past it. This (unfortunately)
687 * requires a bit more manual fiddling with wide character
688 * conversions.
691 if (ov_ps[best_match_idx][1] <= scannable_idx) {
693 * This should never happen, but let's make sure
694 * we always keep advancing.
696 scannable_idx++;
697 } else {
698 scannable_idx = ov_ps[best_match_idx][1] - 1;
701 l = scannable_idx;
703 while (position_map[l] == (size_t) -1) {
704 l--;
707 raw_idx = position_map[l];
709 /* This is the "jump one past it" part */
710 scannable_idx++;
711 errno = 0;
712 mbret = mbrlen(raw + raw_idx, MB_CUR_MAX, 0);
714 switch (mbret) {
715 case (size_t) -2:
716 case (size_t) -1:
717 PERROR_MESSAGE("mbrlen");
718 goto done;
719 default:
720 raw_idx += mbret;
724 * Now re-check all our matches and figure out which ones
725 * need to be updated
727 for (size_t j = 0; j < wordfilters_num; ++j) {
728 if (!num_matches[j] ||
729 ov_ps[j][0] >= scannable_idx) {
730 continue;
733 num_matches[j] = pcre2_match(wordfilters[j].code,
734 (PCRE2_SPTR) scannable,
735 scannable_len, scannable_idx, 0,
736 match_data[j], 0);
739 goto handle_next_match;
740 done:
742 for (size_t j = 0; j < wordfilters_num; ++j) {
743 pcre2_match_data_free(match_data[j]);
744 match_data[j] = 0;
747 free(match_data);
748 free(num_matches);
749 free(ov_counts);
750 free(ov_ps);
752 return ret;
756 * Read through in. Each time a match for format_replacements is
757 * found (something like a newline or a quote) is found, replace
758 * it with some HTML markup. The result is placed in out.
760 * Preconditions:
762 * - setup_sanitize_comment() has been invoked more recently than
763 * clean_sanitize_comment().
765 * - in is memory of length at least in_len, valid UTF-8 text.
767 * - Overwriting *out shall not cause a memory leak.
769 * - out and out_len are not 0.
771 * Postconditions (success):
773 * - *out is valid, UTF-8 text of length *out_len with sane HTML
774 * markup (and HTML escaped), suitable for outputting into an
775 * HTML file.
777 static int
778 insert_html_tags(const char *in, size_t in_len, const char *board, char **out,
779 size_t *out_len)
781 int ret = -1;
782 size_t in_idx = 0;
783 size_t match_pos = 0;
784 size_t after_match_pos = 0;
785 size_t out_idx = 0;
786 pcre2_match_data *match_data = 0;
787 int nret = 0;
788 PCRE2_UCHAR *tmp_1 = 0;
789 PCRE2_SIZE tmp_1_len = 0;
790 PCRE2_UCHAR *tmp_2 = 0;
791 PCRE2_SIZE tmp_2_len = 0;
792 PCRE2_UCHAR *tmp_3 = 0;
793 PCRE2_SIZE tmp_3_len = 0;
794 uint_fast8_t last_was_newline = 1;
795 char *link_target = 0;
796 size_t link_target_len = 0;
798 if (!(match_data = pcre2_match_data_create_from_pattern(
799 format_replacements, 0))) {
800 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
801 goto done;
804 find_next_bit:
806 if (in_idx >= in_len) {
807 goto success;
810 nret = pcre2_match(format_replacements, (PCRE2_SPTR) in, in_len, in_idx,
811 0, match_data, 0);
813 if (nret == PCRE2_ERROR_NOMATCH) {
814 ret = append_str(out, &out_idx, out_len, in + in_idx, in_len -
815 in_idx);
816 goto done;
819 if (nret < 0) {
820 PCRE2_UCHAR8 err_buf[120];
822 pcre2_get_error_message(nret, err_buf, 120);
823 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
824 " (PCRE2 %d)", (int) (in_len - in_idx), in +
825 in_idx, err_buf,
826 nret);
827 goto done;
830 pcre2_substring_free(tmp_1);
831 pcre2_substring_free(tmp_2);
832 pcre2_substring_free(tmp_3);
833 free(link_target);
834 tmp_1 = 0;
835 tmp_2 = 0;
836 tmp_3 = 0;
837 link_target = 0;
839 /* We have match, stuff everything up to it in *out */
840 match_pos = pcre2_get_ovector_pointer(match_data)[0];
841 after_match_pos = pcre2_get_ovector_pointer(match_data)[1];
843 if (match_pos > in_idx) {
844 if (append_str(out, &out_idx, out_len, in + in_idx, match_pos -
845 in_idx) < 0) {
846 goto done;
849 last_was_newline = 0;
850 in_idx = match_pos;
853 /* Figure out what type of match. */
854 if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "newline",
855 &tmp_1, &tmp_1_len)) {
856 if (last_was_newline) {
857 if (append_const_str(out, &out_idx, out_len,
858 "&nbsp;<br />") < 0) {
859 goto done;
861 } else {
862 if (append_const_str(out, &out_idx, out_len, "<br />") <
863 0) {
864 goto done;
868 last_was_newline = 1;
869 in_idx = after_match_pos;
870 goto find_next_bit;
873 last_was_newline = 0;
875 if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "quote",
876 &tmp_1, &tmp_1_len)) {
877 if (append_const_str(out, &out_idx, out_len,
878 "<span class=\"quote\">") < 0) {
879 goto done;
882 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
883 (size_t) tmp_1_len) < 0) {
884 goto done;
887 if (append_const_str(out, &out_idx, out_len, "</span>") < 0) {
888 goto done;
891 in_idx = after_match_pos;
892 goto find_next_bit;
895 if (!pcre2_substring_get_byname(match_data,
896 (PCRE2_SPTR) "intra_postlink", &tmp_1,
897 &tmp_1_len)) {
898 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "a_num",
899 &tmp_2, &tmp_2_len)) {
900 goto problem_with_match;
903 int found = 0;
905 if (db_construct_post_link(board, strlen(board), (const
906 char *) tmp_2,
907 tmp_2_len, &found, &link_target,
908 &link_target_len) < 0) {
909 goto done;
912 if (!found) {
913 if (append_str(out, &out_idx, out_len, in + match_pos,
914 after_match_pos - match_pos) < 0) {
915 goto done;
918 in_idx = after_match_pos;
919 goto find_next_bit;
922 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
923 0) {
924 goto done;
927 if (append_str(out, &out_idx, out_len, link_target,
928 link_target_len) < 0) {
929 goto done;
932 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
933 goto done;
936 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
937 (size_t) tmp_1_len) < 0) {
938 goto done;
941 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
942 goto done;
945 in_idx = after_match_pos;
946 goto find_next_bit;
949 if (!pcre2_substring_get_byname(match_data,
950 (PCRE2_SPTR) "inter_postlink", &tmp_1,
951 &tmp_1_len)) {
952 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "e_num",
953 &tmp_2, &tmp_2_len)) {
954 goto problem_with_match;
957 if (pcre2_substring_get_byname(match_data,
958 (PCRE2_SPTR) "e_board", &tmp_3,
959 &tmp_3_len)) {
960 goto problem_with_match;
963 int found = 0;
965 if (db_construct_post_link((const char *) tmp_3, tmp_3_len,
966 (const char *) tmp_2, tmp_2_len,
967 &found, &link_target,
968 &link_target_len) < 0) {
969 goto done;
972 if (!found) {
973 if (append_str(out, &out_idx, out_len, in + match_pos,
974 after_match_pos - match_pos) < 0) {
975 goto done;
978 in_idx = after_match_pos;
979 goto find_next_bit;
982 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
983 0) {
984 goto done;
987 if (append_str(out, &out_idx, out_len, link_target,
988 link_target_len) < 0) {
989 goto done;
992 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
993 goto done;
996 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
997 (size_t) tmp_1_len) < 0) {
998 goto done;
1001 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
1002 goto done;
1005 in_idx = after_match_pos;
1006 goto find_next_bit;
1009 problem_with_match:
1011 /* There was some kind of match, but it went wrong. */
1012 in_idx++;
1013 goto find_next_bit;
1014 success:
1015 ret = 0;
1016 done:
1017 *out_len = out_idx;
1018 pcre2_substring_free(tmp_1);
1019 pcre2_substring_free(tmp_2);
1020 pcre2_substring_free(tmp_3);
1021 pcre2_match_data_free(match_data);
1023 return ret;
1027 * Make sure that the contents of *pc are ready for safe injection
1028 * into the board, including HTML escaping, wordfiltering, general
1029 * formatting, and adding links.
1031 * Preconditions
1033 * - setup_sanitize_comment() has been invoked more recently than
1034 * clean_sanitize_comment().
1036 * - *pc has been filled out (fields like action, board, etc. have
1037 * been populated) from the POST data.
1039 * Postconditions (success):
1041 * - The prepared_XYZ fields of *pc have been filled out, and each
1042 * is valid ASCII text, with Unicode codepoints.
1045 st_sanitize_text(struct post_cmd *pc, int *our_fault)
1047 int ret = -1;
1048 size_t out_idx = 0;
1049 char *html_escaped_comment = 0;
1050 size_t html_escaped_comment_len = 0;
1052 /* Flush out lurking double-free bugs */
1053 free(pc->prepared.name);
1054 pc->prepared.name = 0;
1055 pc->prepared.name_len = 0;
1056 free(pc->prepared.email);
1057 pc->prepared.email = 0;
1058 pc->prepared.email_len = 0;
1059 free(pc->prepared.subject);
1060 pc->prepared.subject = 0;
1061 pc->prepared.subject_len = 0;
1062 free(pc->prepared.comment);
1063 pc->prepared.comment = 0;
1064 pc->prepared.comment_len = 0;
1065 free(pc->prepared.file_name);
1066 pc->prepared.file_name = 0;
1067 pc->prepared.file_name_len = 0;
1068 free(pc->scannable_comment);
1069 pc->scannable_comment = 0;
1070 pc->scannable_comment_len = 0;
1071 free(pc->position_map);
1072 pc->position_map = 0;
1073 pc->position_map_len = 0;
1074 out_idx = 0;
1076 if (!pc->raw.name_len) {
1077 free(pc->raw.name);
1079 if (!(pc->raw.name = strdup("Anonymous"))) {
1080 PERROR_MESSAGE("strdup");
1081 goto done;
1084 pc->raw.name_len = strlen(pc->raw.name);
1087 if (pc->raw.name_len) {
1088 if (to_html(pc->raw.name, pc->raw.name_len, 0,
1089 &pc->prepared.name, &pc->prepared.name_len,
1090 &out_idx) < 0) {
1091 *our_fault = 1;
1092 goto done;
1096 out_idx = 0;
1098 if (pc->raw.email_len) {
1099 if (to_html(pc->raw.email, pc->raw.email_len, 0,
1100 &pc->prepared.email, &pc->prepared.email_len,
1101 &out_idx) < 0) {
1102 *our_fault = 1;
1103 goto done;
1107 out_idx = 0;
1109 if (pc->raw.tripcode_len) {
1110 if (to_html(pc->raw.tripcode, pc->raw.tripcode_len, 0,
1111 &pc->prepared.tripcode, &pc->prepared.tripcode_len,
1112 &out_idx) <
1113 0) {
1114 *our_fault = 1;
1115 goto done;
1119 out_idx = 0;
1121 if (pc->raw.subject_len) {
1122 if (to_html(pc->raw.subject, pc->raw.subject_len, 0,
1123 &pc->prepared.subject, &pc->prepared.subject_len,
1124 &out_idx) <
1125 0) {
1126 *our_fault = 1;
1127 goto done;
1131 out_idx = 0;
1133 if (pc->raw.file_name_len) {
1134 if (to_html(pc->raw.file_name, pc->raw.file_name_len, 0,
1135 &pc->prepared.file_name,
1136 &pc->prepared.file_name_len,
1137 &out_idx) < 0) {
1138 *our_fault = 1;
1139 goto done;
1143 if (to_scannable(pc->raw.comment, pc->raw.comment_len,
1144 &pc->scannable_comment, &pc->scannable_comment_len,
1145 &pc->position_map,
1146 &pc->position_map_len)) {
1147 *our_fault = 1;
1148 goto done;
1152 * Now we do the fancy thing. Match scannable, build prepared
1153 * out of that.
1155 if (wordfilter_to_html(pc->raw.comment, pc->raw.comment_len,
1156 pc->scannable_comment, pc->scannable_comment_len,
1157 pc->position_map,
1158 &html_escaped_comment,
1159 &html_escaped_comment_len) < 0) {
1160 *our_fault = 1;
1161 goto done;
1165 * Everything's in &#123; form, but now take care of >>123,
1166 * <br />, etc.
1168 if (insert_html_tags(html_escaped_comment, html_escaped_comment_len,
1169 pc->raw.board, &pc->prepared.comment,
1170 &pc->prepared.comment_len) < 0) {
1171 *our_fault = 1;
1172 goto done;
1175 ret = 0;
1176 done:
1177 free(html_escaped_comment);
1179 return ret;
1183 * Initialize any static elements needed for this file.
1185 * Preconditions:
1187 * - setup_sanitize_comment() was not invoked more recently than
1188 * clean_sanitize_comment().
1190 * Postconditions (success):
1192 * - Any other function in this file may be safely called.
1195 setup_sanitize_comment(const struct configuration *conf)
1198 * Check that the locale/libc/whatever is set up so that
1199 * UTF-8 handling can work.
1201 int ret = -1;
1202 const char *raw =
1203 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1204 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1205 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1206 "\u2468\u0294!\u0ce2!!";
1207 const char *correct_html =
1208 "&lt;script&gt;alert(1)&lt;/script&gt; , &#120081;&#120102;"
1209 "&#120107;&#120094;&#12288;&#120083;&#120114;&#120111;"
1210 "&#120109;&#120105;&#120098;&#120113;&#120108;&#120107; &amp;"
1211 " &#9320;&#660;!&#3298;!!";
1212 const char *correct_scannable =
1213 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1214 char *html = 0;
1215 size_t html_len = 0;
1216 char *scannable = 0;
1217 size_t scannable_len = 0;
1218 size_t *position_map = 0;
1219 size_t position_map_len = 0;
1220 size_t out_idx = 0;
1222 if (to_html(raw, strlen(raw), 0, &html, &html_len, &out_idx) < 0) {
1223 goto done;
1226 if (strcmp(html, correct_html)) {
1227 ERROR_MESSAGE("Was expecting html conversion to yield "
1228 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1229 "\n\n\u00ab%s\u00bb\n\n",
1230 correct_html, html);
1231 goto done;
1234 if (to_scannable(raw, strlen(raw), &scannable, &scannable_len,
1235 &position_map, &position_map_len) < 0) {
1236 goto done;
1239 if (strcmp(scannable, correct_scannable)) {
1240 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1241 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1242 "\n\n\u00ab%s\u00bb\n\n",
1243 correct_scannable, scannable);
1244 goto done;
1247 if (!(wordfilters = calloc(conf->wordfilter_inputs_num,
1248 sizeof *wordfilters))) {
1249 PERROR_MESSAGE("calloc");
1250 goto done;
1253 wordfilters_num = conf->wordfilter_inputs_num;
1254 int err_code = 0;
1255 PCRE2_SIZE err_offset = 0;
1256 PCRE2_UCHAR8 err_buf[120];
1258 for (size_t j = 0; j < wordfilters_num; ++j) {
1259 wordfilters[j].replacement =
1260 conf->wordfilter_inputs[j].replacement;
1261 wordfilters[j].replacement_len = strlen(
1262 conf->wordfilter_inputs[j].replacement);
1264 if ((wordfilters[j].code = pcre2_compile(
1265 (PCRE2_SPTR8) conf->wordfilter_inputs[j].pattern,
1266 PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1267 &err_offset, 0))) {
1268 continue;
1271 pcre2_get_error_message(err_code, err_buf, 120);
1272 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1273 conf->wordfilter_inputs[j].pattern, err_buf);
1274 goto done;
1277 const char *format_match_str =
1279 /* */
1280 "(?<newline>\\n)" /* */
1281 "|(?<intra_postlink>&gt;&gt;(?<a_num>[0-9]+))" /* */
1282 "|(?<inter_postlink>&gt;&gt;&gt;/" /* */
1283 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))" /* */
1284 "|(?<quote>(?<![^\n])&gt;[^\n]*)"; /* */
1286 if (!(format_replacements = pcre2_compile(
1287 (PCRE2_SPTR8) format_match_str, PCRE2_ZERO_TERMINATED,
1288 PCRE2_UTF,
1289 &err_code, &err_offset, 0))) {
1290 pcre2_get_error_message(err_code, err_buf, 120);
1291 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1292 format_match_str, err_buf);
1293 goto done;
1296 ret = 0;
1297 done:
1298 free(html);
1299 free(scannable);
1300 free(position_map);
1302 return ret;
1306 * Clean up any memory from this file
1308 * Postconditions (success):
1310 * - Valgrind won't report any memory leaks from this file.
1312 * - setup_sanitize_comment() can be safely called again.
1315 clean_sanitize_comment(void)
1317 for (size_t j = 0; j < wordfilters_num; ++j) {
1318 pcre2_code_free(wordfilters[j].code);
1319 wordfilters[j] = (struct wordfilter) { 0 };
1322 pcre2_code_free(format_replacements);
1323 format_replacements = 0;
1324 free(wordfilters);
1325 wordfilters = 0;
1326 wordfilters_num = 0;
1328 return 0;