files: reap file_description_prog children
[rb-79.git] / sanitize-comment.c
blobcfb2e4118a7347f3aa3d67a24b9e6ab36459debb
/*
 * Copyright (c) 2017, De Rais <derais@cock.li>
 *
 * Permission to use, copy, modify, and/or distribute this software for
 * any purpose with or without fee is hereby granted, provided that the
 * above copyright notice and this permission notice appear in all
 * copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
 * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 * PERFORMANCE OF THIS SOFTWARE.
 */
18 #include <errno.h>
19 #include <stdint.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <time.h>
24 #include <wchar.h>
26 #define PCRE2_CODE_UNIT_WIDTH 8
27 #include <pcre2.h>
29 #include "macros.h"
30 #include "rb79.h"
31 #include "unicode-transforms.h"
/*
 * We need a way to get codepoints out of UTF-8 strings, and if
 * wchar_t stores codepoint values, that would be great. That is
 * what __STDC_ISO_10646__ promises. You can remove this check and
 * cross your fingers, since rb79 will do a quick check on startup,
 * but please find out why the C implementation doesn't define
 * __STDC_ISO_10646__ first.
 */
#ifndef __STDC_ISO_10646__
#error We really want __STDC_ISO_10646__
#endif
46 * A wordfilter consists of a pcre2 regex and a replacement string
48 struct wordfilter {
49 /* */
50 pcre2_code *code;
51 const char *replacement;
52 size_t replacement_len;
55 /* These are constructed in setup_sanitize_comment() */
56 static struct wordfilter *wordfilters;
57 static size_t wordfilters_num;
59 /* Special matcher for quoting, newlines, linkifying, etc. */
60 static pcre2_code *format_replacements;
63 * Comparison function for struct translate.
65 * Preconditions:
67 * - *key_v is a wchar_t.
69 * - *tr_v is a struct translate object.
71 * Postconditions:
73 * - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
74 * than] *tr_v's starting range.
76 static int match_translate(const void *key_v, const void *tr_v)
78 const wchar_t *key = (const wchar_t *) key_v;
79 const struct translate *tr = (const struct translate *) tr_v;
81 if (*key < tr->from_s) {
82 return -1;
83 } else if (*key > tr->from_t) {
84 return 1;
87 return 0;
/*
 * Append the first str_len bytes of str to *buf.
 *
 * Preconditions:
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - str is readable for str_len bytes and those bytes form complete
 *    UTF-8 sequences.
 *
 * Postconditions (success):
 *
 *  - *buf is memory of length *sz (both possibly updated by a
 *    realloc), and up to *idx is a valid, NUL-terminated UTF-8
 *    string.
 *
 *  - The contents of str have been appended to *buf (and *idx
 *    includes this).
 *
 * Returns 0 on success, -1 if the buffer could not be grown (in
 * which case *buf, *idx and *sz are untouched).
 */
static int append_str(char **buf, size_t *idx, size_t *sz, const char *str,
                      size_t str_len)
{
	const size_t end = str_len + *idx;

	/* ">=" rather than ">" keeps one byte spare for the terminator */
	if (end >= *sz) {
		const size_t grown_sz = end + (1 << 9);
		char *grown = realloc(*buf, grown_sz);

		if (!grown) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*buf = grown;
		*sz = grown_sz;
	}

	char *dst = *buf + *idx;

	strncpy(dst, str, str_len);
	dst[str_len] = '\0';
	*idx = end;

	return 0;
}
/*
 * Convenience wrapper around append_str() for NUL-terminated strings:
 * the length is measured here with strlen() so callers need not.
 */
static int append_const_str(char **buf, size_t *idx, size_t *len, const
                            char *str)
{
	const size_t str_len = strlen(str);

	return append_str(buf, idx, len, str, str_len);
}
/*
 * Append a single character to *buf.
 *
 * Preconditions:
 *
 *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - c is an ASCII character.
 *
 * Postconditions (success):
 *
 *  - *buf is memory of length *len (both possibly updated by a
 *    realloc), and up to *idx is a valid, NUL-terminated UTF-8
 *    string.
 *
 *  - c has been appended to *buf (and *idx includes this).
 *
 * Returns 0 on success, -1 if the buffer could not be grown.
 */
static int append_char(char **buf, size_t *idx, size_t *len, char c)
{
	const size_t next = 1 + *idx;

	if (next >= *len) {
		const size_t grown_len = next + (1 << 9);
		char *grown = realloc(*buf, grown_len);

		if (!grown) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*buf = grown;
		*len = grown_len;
	}

	(*buf)[*idx] = c;
	(*buf)[next] = '\0';
	*idx = next;

	return 0;
}
/*
 * Append a Unicode codepoint to *buf as a decimal HTML character
 * reference.
 *
 * Preconditions:
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - wc is a valid Unicode codepoint.
 *
 * Postconditions (success):
 *
 *  - *buf is memory of length *sz (both possibly updated by a
 *    realloc), and up to *idx is a valid, NUL-terminated UTF-8
 *    string.
 *
 *  - An HTML-escaped sequence like &#123; has been appended to
 *    *buf (and *idx includes this).
 *
 * Returns 0 on success, -1 if formatting failed or the buffer could
 * not be grown.
 */
static int append_wchar_escaped(char **buf, size_t *idx, size_t *sz, wchar_t wc)
{
	/* Dry run: how many bytes will the escape take (sans terminator)? */
	const int measured = snprintf(0, 0, "&#%ld;", (long) wc);

	/*
	 * snprintf() reports failure with a negative value; converting
	 * that straight to size_t would turn it into an enormous length.
	 */
	if (measured < 0) {
		PERROR_MESSAGE("snprintf");

		return -1;
	}

	const size_t l = (size_t) measured;

	if (l + *idx >= *sz) {
		void *newmem = 0;
		size_t new_sz = l + *idx + (1 << 9);

		if (!(newmem = realloc(*buf, new_sz))) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		*buf = newmem;
		*sz = new_sz;
	}

	/* At this point *sz - *idx > l, so the bounded write cannot truncate */
	snprintf(*buf + *idx, *sz - *idx, "&#%ld;", (long) wc);
	*idx += l;

	return 0;
}
/*
 * Ensure that (*map)[j] = k, growing the map as appropriate.
 *
 * Preconditions
 *
 *  - *map is memory of *len elements (*map may be 0 if *len is 0).
 *
 * Postconditions (success):
 *
 *  - *map is memory of *len elements, with *len at least j + 2.
 *
 *  - Any elements added by growing hold (size_t) -1, except for
 *    index j.
 *
 *  - (*map)[j] = k.
 *
 * Returns 0 on success, -1 if the map could not be grown.
 */
static int set_position_mapping(size_t **map, size_t *len, size_t j, size_t k)
{
	const size_t wanted = j + 2;

	if (j + 1 >= *len) {
		size_t *grown = realloc(*map, wanted * sizeof **map);

		if (!grown) {
			PERROR_MESSAGE("realloc");

			return -1;
		}

		/* Mark every fresh slot as "no mapping" */
		for (size_t fresh = *len; fresh < wanted; ++fresh) {
			grown[fresh] = ((size_t) -1);
		}

		*map = grown;
		*len = wanted;
	}

	(*map)[j] = k;

	return 0;
}
/*
 * HTML-escape in to *out.
 *
 * Preconditions
 *
 *  - in is memory of at least length in_len, valid UTF-8
 *    text. Reading starts at byte offset in_idx and stops at in_len
 *    or at the first NUL byte, whichever comes first.
 *
 *  - *out is memory of at least length *out_len (if *out_len = 0,
 *    *out may be 0), valid UTF-8 text.
 *
 *  - Overwriting *out and *out_len shall not cause a memory leak.
 *
 *  - out, out_len, and out_idx are not 0.
 *
 * Postconditions (success):
 *
 *  - *out is memory of at least length *out_len, valid UTF-8 text.
 *
 *  - A stretch of HTML-escaped ASCII text representing in has been
 *    added to *out at the position that was *out_idx. '&', '"',
 *    ''', '<' and '>' become named entities, printable ASCII and
 *    '\n' are copied, '\r' is dropped, and everything else becomes
 *    a decimal reference like &#123;.
 *
 *  - *out_idx has been updated to point to the end of this stretch.
 *
 *  - *out_len has grown by the number of bytes added.
 *
 * Returns 0 on success and -1 on failure (allocation failure, or a
 * byte sequence that cannot be decoded). On failure *out_len is not
 * updated.
 */
static int to_html(const char *in, const size_t in_len, size_t in_idx,
                   char **out, size_t *out_len, size_t *out_idx)
{
	int ret = -1;

	/*
	 * Kept apart from ret on purpose: a successful append must not
	 * turn a later decoding failure into a "success" return.
	 */
	int append_ret = 0;
	wchar_t wc = 0;
	size_t mbret = 0;
	size_t out_sz = 0;
	const size_t initial_out_idx = *out_idx;

	/*
	 * A private conversion state: nothing left over from an earlier
	 * (possibly failed) conversion elsewhere can leak into this one.
	 */
	mbstate_t mbs;

	memset(&mbs, 0, sizeof mbs);

	if (!*out) {
		if (!(*out = malloc(1))) {
			PERROR_MESSAGE("malloc");
			goto done;
		}

		out_sz = 1;
		*out_len = 0;
		(*out)[0] = '\0';
	}

	while (in_idx < in_len &&
	       in[in_idx]) {
		/* Extract next character */
		mbret = mbrtowc(&wc, in + in_idx, in_len - in_idx, &mbs);

		if (mbret == (size_t) -1 ||
		    mbret == (size_t) -2) {
			if (mbret == (size_t) -2) {
				/* Truncated sequence: errno is not set for us */
				errno = EILSEQ;
			}

			PERROR_MESSAGE("mbrtowc");
			goto done;
		}

		if (!mbret) {
			/* Decoded a NUL; treat it like the end of input */
			break;
		}

		if (wc == L'&') {
			append_ret = append_str(out, out_idx, &out_sz, "&amp;",
			                        5);
		} else if (wc == L'"') {
			append_ret = append_str(out, out_idx, &out_sz, "&quot;",
			                        6);
		} else if (wc == L'\'') {
			append_ret = append_str(out, out_idx, &out_sz, "&apos;",
			                        6);
		} else if (wc == L'<') {
			append_ret = append_str(out, out_idx, &out_sz, "&lt;",
			                        4);
		} else if (wc == L'>') {
			append_ret = append_str(out, out_idx, &out_sz, "&gt;",
			                        4);
		} else if (mbret == 1 &&
		           in[in_idx] >= ' ' &&
		           in[in_idx] <= '~') {
			append_ret = append_char(out, out_idx, &out_sz,
			                         in[in_idx]);
		} else if (mbret == 1 &&
		           in[in_idx] == '\r') {
			/* Carriage returns are silently dropped */
			append_ret = 0;
		} else if (mbret == 1 &&
		           in[in_idx] == '\n') {
			append_ret = append_char(out, out_idx, &out_sz,
			                         in[in_idx]);
		} else {
			append_ret = append_wchar_escaped(out, out_idx, &out_sz,
			                                  wc);
		}

		in_idx += mbret;

		if (append_ret < 0) {
			goto done;
		}
	}

	*out_len = *out_len + (*out_idx - initial_out_idx);
	ret = 0;
done:

	return ret;
}
358 * From in construct *out, which is a codepoint-for-codepoint
359 * translation following the rules of unicode-transforms.h. The
360 * result is that *out can be matched with normal regex, even if
361 * in contains obfuscatory Unicode bullshit.
363 * Preconditions
365 * - setup_sanitize_comment() has been invoked more recently than
366 * clean_sanitize_comment().
368 * - in is memory of at least length in_len, valid UTF-8 text.
370 * - Overwriting *out and *out_position_map shall not cause a
371 * memory leak.
373 * - out, out_len, out_position_map, and out_position_map_len are
374 * not 0.
376 * Postconditions (success):
378 * - *out is valid, UTF-8 text of length *out_len.
380 * - For every j in [0, *out_len) such that (*out)[j] starts a
381 * codepoint, in[*(position_map)[j]] is the start of the
382 * corresponding codepoint.
384 * - (*position_map)[*out_len] = in_len.
386 static int to_scannable(const char *in, size_t in_len, char **out,
387 size_t *out_len, size_t **out_position_map,
388 size_t *out_position_map_len)
390 int ret = -1;
391 wchar_t wc = 0;
392 size_t in_idx = 0;
393 size_t out_idx = 0;
394 int mbret = 0;
395 struct translate *tr = 0;
396 size_t out_sz = 0;
398 if (!*out) {
399 if (!(*out = malloc(1))) {
400 PERROR_MESSAGE("malloc");
401 goto done;
404 out_sz = 1;
405 *out_len = 0;
406 (*out)[0] = '\0';
410 * Position_map is here to make wordfiltering work. Suppose in is
412 * I think Nina Purpleton did
413 * nothing wrong
415 * and a wordfilter /Nina Purpleton/i -> "worst girl" is
416 * in effect. Then *out will be
418 * I think Nina Purpleton did nothing wrong
420 * The message should, of course, be filtered to
422 * I think worst girl did nothing
423 * wrong
425 * In order to do that, it would be necessary to have a map
426 * from in to *out on the byte level, since the wordfilter
427 * will only be run against *out.
429 * position_map[j] = k means that out[j] and in[k] mean the
430 * same thing.
432 while (in_idx < in_len) {
433 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
435 if (mbret == -1) {
436 PERROR_MESSAGE("mbtowc");
437 goto done;
440 /* We pre-suppose that the insert will go as planned */
441 if (set_position_mapping(out_position_map, out_position_map_len,
442 out_idx, in_idx) < 0) {
443 goto done;
446 if (mbret == 1 &&
447 in[in_idx] >= ' ' &&
448 in[in_idx] <= '~') {
449 if (append_str(out, &out_idx, &out_sz, in + in_idx, 1) <
450 0) {
451 goto done;
453 } else {
454 if ((tr = bsearch(&wc, translates, NUM_OF(translates),
455 sizeof *translates,
456 match_translate))) {
457 if (append_str(out, &out_idx, &out_sz, tr->to,
458 strlen(tr->to)) < 0) {
459 goto done;
461 } else {
462 if (append_str(out, &out_idx, &out_sz, in +
463 in_idx, mbret) < 0) {
464 goto done;
469 in_idx += mbret;
472 if (set_position_mapping(out_position_map, out_position_map_len,
473 out_idx, in_len) < 0) {
474 goto done;
477 (*out)[out_idx] = '\0';
478 *out_len = out_idx;
479 ret = 0;
480 done:
482 return ret;
486 * Read through raw and scannable, checking all wordfilters in
487 * scannable. Where a match is detected, the corresponding postion
488 * (via position_map) in raw is replaced by the replacement specified
489 * by the matching wordfilter.
491 * Preconditions
493 * - setup_sanitize_comment() has been invoked more recently than
494 * clean_sanitize_comment().
496 * - raw is memory of length at least raw_len, valid UTF-8 text.
498 * - scannable is memory of length at least scannable_len.
500 * - For any j in [0, scannable_len), position_map[j] is a valid
501 * index into raw, or is (size_t) -1.
503 * - position_map[scannable_len] = raw_len.
505 * - For any j in [0, scannable_len) such that k = position_map[j]
506 * is not (size_t) -1, scannable[j] and raw[k] are conceptually
507 * the same for wordfiltering.
509 * - Overwriting *out shall not cause a memory leak.
511 * - out and out_len are not 0.
513 * Postconditions (success):
515 * - *out is valid, UTF-8 text of length *out_len such that all
516 * non ASCII codepoints (and '<', '>', '&', '"', ''') are
517 * HTML-escaped.
519 * - *out represents raw, except in those sections of scannable
520 * where a wordfilter matched.
522 static int wordfilter_to_html(const char *raw, const size_t raw_len, const
523 char *scannable, const size_t scannable_len,
524 size_t *position_map, char **out,
525 size_t *out_len)
527 int ret = -1;
529 /* These hold the match locations from pcre2 */
530 uint32_t *ov_counts = 0;
531 PCRE2_SIZE **ov_ps = 0;
532 int *num_matches = 0;
533 pcre2_match_data **match_data = 0;
534 size_t raw_idx = 0;
535 size_t scannable_idx = 0;
536 size_t out_idx = 0;
537 size_t best_match_pos = 0;
538 size_t best_match_idx = 0;
539 size_t l = 0;
540 size_t mbret = 0;
542 if (!(ov_counts = calloc(wordfilters_num, sizeof *ov_counts))) {
543 PERROR_MESSAGE("calloc");
544 goto done;
547 if (!(ov_ps = calloc(wordfilters_num, sizeof *ov_ps))) {
548 PERROR_MESSAGE("calloc");
549 goto done;
552 if (!(num_matches = calloc(wordfilters_num, sizeof *num_matches))) {
553 PERROR_MESSAGE("calloc");
554 goto done;
557 if (!(match_data = calloc(wordfilters_num, sizeof *match_data))) {
558 PERROR_MESSAGE("calloc");
559 goto done;
562 /* First scan, before the loop */
563 for (size_t j = 0; j < wordfilters_num; ++j) {
564 if (!(match_data[j] = pcre2_match_data_create_from_pattern(
565 wordfilters[j].code, 0))) {
566 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
567 goto done;
570 num_matches[j] = pcre2_match(wordfilters[j].code,
571 (PCRE2_SPTR) scannable,
572 scannable_len, scannable_idx, 0,
573 match_data[j], 0);
576 handle_next_match:
577 best_match_pos = (size_t) -1;
578 best_match_idx = (size_t) -1;
580 /* We've run pcre2_match() on everything. Find the soonest match */
581 for (size_t j = 0; j < wordfilters_num; ++j) {
582 if (!num_matches[j]) {
583 continue;
586 ov_ps[j] = pcre2_get_ovector_pointer(match_data[j]);
588 if (ov_ps[j][0] >= scannable_idx &&
589 ov_ps[j][0] < best_match_pos) {
590 best_match_pos = ov_ps[j][0];
591 best_match_idx = j;
595 if (best_match_idx == (size_t) -1) {
596 /* No matches. Turn the rest to html boring-like */
597 ret = to_html(raw, raw_len, raw_idx, out, out_len, &out_idx);
598 goto done;
601 /* Figure out where in raw this match starts */
602 l = best_match_pos;
604 while (l != (size_t) -1 &&
605 position_map[l] == (size_t) -1) {
606 l--;
609 if (l == (size_t) -1) {
610 ERROR_MESSAGE("Impossible condition in "
611 "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
612 raw,
613 best_match_pos);
614 goto done;
618 * Now position_map[l] points to the first character in raw
619 * that should be replaced. Fill up to that point.
621 if (position_map[l] &&
622 position_map[l] > raw_idx) {
623 if (to_html(raw, position_map[l], raw_idx, out, out_len,
624 &out_idx) < 0) {
625 goto done;
629 /* Put the substituted text in */
630 if (to_html(wordfilters[best_match_idx].replacement,
631 wordfilters[best_match_idx].replacement_len, 0, out,
632 out_len,
633 &out_idx) < 0) {
634 goto done;
638 * Figure out where we should advance to in inputs. Naively,
639 * we want to set scannable_idx to ov_ps[best_match_idx][1]
640 * (the first character in scannable beyond the match).
641 * However, we have to consider the case of
643 * foo!!!bar
645 * where "foo" -> "baz" is the only transformation. Since
646 * some characters, like "!", are completely ignored by
647 * the scannable transformation, the naive method would
648 * start our scanning at the "b", skipping information.
650 * So, instead, we carefully find the last character in
651 * "foo", then jump one past it. This (unfortunately)
652 * requires a bit more manual fiddling with wide character
653 * conversions.
656 if (ov_ps[best_match_idx][1] <= scannable_idx) {
658 * This should never happen, but let's make sure
659 * we always keep advancing.
661 scannable_idx++;
662 } else {
663 scannable_idx = ov_ps[best_match_idx][1] - 1;
666 l = scannable_idx;
668 while (position_map[l] == (size_t) -1) {
669 l--;
672 raw_idx = position_map[l];
674 /* This is the "jump one past it" part */
675 scannable_idx++;
676 errno = 0;
677 mbret = mbrlen(raw + raw_idx, MB_CUR_MAX, 0);
679 switch (mbret) {
680 case (size_t) -2:
681 case (size_t) -1:
682 PERROR_MESSAGE("mbrlen");
683 goto done;
684 default:
685 raw_idx += mbret;
689 * Now re-check all our matches and figure out which ones
690 * need to be updated
692 for (size_t j = 0; j < wordfilters_num; ++j) {
693 if (!num_matches[j] ||
694 ov_ps[j][0] >= scannable_idx) {
695 continue;
698 num_matches[j] = pcre2_match(wordfilters[j].code,
699 (PCRE2_SPTR) scannable,
700 scannable_len, scannable_idx, 0,
701 match_data[j], 0);
704 goto handle_next_match;
705 done:
707 for (size_t j = 0; j < wordfilters_num; ++j) {
708 pcre2_match_data_free(match_data[j]);
709 match_data[j] = 0;
712 free(match_data);
713 free(num_matches);
714 free(ov_counts);
715 free(ov_ps);
717 return ret;
721 * Read through in. Each time a match for format_replacements is
722 * found (something like a newline or a quote) is found, replace
723 * it with some HTML markup. The result is placed in out.
725 * Preconditions:
727 * - setup_sanitize_comment() has been invoked more recently than
728 * clean_sanitize_comment().
730 * - in is memory of length at least in_len, valid UTF-8 text.
732 * - Overwriting *out shall not cause a memory leak.
734 * - out and out_len are not 0.
736 * Postconditions (success):
738 * - *out is valid, UTF-8 text of length *out_len with sane HTML
739 * markup (and HTML escaped), suitable for outputting into an
740 * HTML file.
742 static int insert_html_tags(const char *in, size_t in_len, const char *board,
743 char **out, size_t *out_len)
745 int ret = -1;
746 size_t in_idx = 0;
747 size_t match_pos = 0;
748 size_t after_match_pos = 0;
749 size_t out_idx = 0;
750 pcre2_match_data *match_data = 0;
751 int nret = 0;
752 PCRE2_UCHAR *tmp_1 = 0;
753 PCRE2_SIZE tmp_1_len = 0;
754 PCRE2_UCHAR *tmp_2 = 0;
755 PCRE2_SIZE tmp_2_len = 0;
756 PCRE2_UCHAR *tmp_3 = 0;
757 PCRE2_SIZE tmp_3_len = 0;
758 uint_fast8_t last_was_newline = 1;
759 char *link_target = 0;
760 size_t link_target_len = 0;
762 if (!(match_data = pcre2_match_data_create_from_pattern(
763 format_replacements, 0))) {
764 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
765 goto done;
768 find_next_bit:
770 if (in_idx >= in_len) {
771 goto success;
774 nret = pcre2_match(format_replacements, (PCRE2_SPTR) in, in_len, in_idx,
775 0, match_data, 0);
777 if (nret == PCRE2_ERROR_NOMATCH) {
778 ret = append_str(out, &out_idx, out_len, in + in_idx, in_len -
779 in_idx);
780 goto done;
783 if (nret < 0) {
784 PCRE2_UCHAR8 err_buf[120];
786 pcre2_get_error_message(nret, err_buf, 120);
787 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
788 " (PCRE2 %d)", (int) (in_len - in_idx), in +
789 in_idx, err_buf,
790 nret);
791 goto done;
794 pcre2_substring_free(tmp_1);
795 pcre2_substring_free(tmp_2);
796 pcre2_substring_free(tmp_3);
797 free(link_target);
798 tmp_1 = 0;
799 tmp_2 = 0;
800 tmp_3 = 0;
801 link_target = 0;
803 /* We have match, stuff everything up to it in *out */
804 match_pos = pcre2_get_ovector_pointer(match_data)[0];
805 after_match_pos = pcre2_get_ovector_pointer(match_data)[1];
807 if (match_pos > in_idx) {
808 if (append_str(out, &out_idx, out_len, in + in_idx, match_pos -
809 in_idx) < 0) {
810 goto done;
813 last_was_newline = 0;
814 in_idx = match_pos;
817 /* Figure out what type of match. */
818 if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "newline",
819 &tmp_1, &tmp_1_len)) {
820 if (last_was_newline) {
821 if (append_const_str(out, &out_idx, out_len,
822 "&nbsp;<br />") < 0) {
823 goto done;
825 } else {
826 if (append_const_str(out, &out_idx, out_len, "<br />") <
827 0) {
828 goto done;
832 last_was_newline = 1;
833 in_idx = after_match_pos;
834 goto find_next_bit;
837 last_was_newline = 0;
839 if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "quote",
840 &tmp_1, &tmp_1_len)) {
841 if (append_const_str(out, &out_idx, out_len,
842 "<span class=\"quote\">") < 0) {
843 goto done;
846 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
847 (size_t) tmp_1_len) < 0) {
848 goto done;
851 if (append_const_str(out, &out_idx, out_len, "</span>") < 0) {
852 goto done;
855 in_idx = after_match_pos;
856 goto find_next_bit;
859 if (!pcre2_substring_get_byname(match_data,
860 (PCRE2_SPTR) "intra_postlink", &tmp_1,
861 &tmp_1_len)) {
862 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "a_num",
863 &tmp_2, &tmp_2_len)) {
864 goto problem_with_match;
867 int found = 0;
869 if (db_construct_post_link(board, strlen(board), (const
870 char *) tmp_2,
871 tmp_2_len, &found, &link_target,
872 &link_target_len) < 0) {
873 goto done;
876 if (!found) {
877 if (append_str(out, &out_idx, out_len, in + match_pos,
878 after_match_pos - match_pos) < 0) {
879 goto done;
882 in_idx = after_match_pos;
883 goto find_next_bit;
886 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
887 0) {
888 goto done;
891 if (append_str(out, &out_idx, out_len, link_target,
892 link_target_len) < 0) {
893 goto done;
896 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
897 goto done;
900 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
901 (size_t) tmp_1_len) < 0) {
902 goto done;
905 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
906 goto done;
909 in_idx = after_match_pos;
910 goto find_next_bit;
913 if (!pcre2_substring_get_byname(match_data,
914 (PCRE2_SPTR) "inter_postlink", &tmp_1,
915 &tmp_1_len)) {
916 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "e_num",
917 &tmp_2, &tmp_2_len)) {
918 goto problem_with_match;
921 if (pcre2_substring_get_byname(match_data,
922 (PCRE2_SPTR) "e_board", &tmp_3,
923 &tmp_3_len)) {
924 goto problem_with_match;
927 int found = 0;
929 if (db_construct_post_link((const char *) tmp_3, tmp_3_len,
930 (const char *) tmp_2, tmp_2_len,
931 &found, &link_target,
932 &link_target_len) < 0) {
933 goto done;
936 if (!found) {
937 if (append_str(out, &out_idx, out_len, in + match_pos,
938 after_match_pos - match_pos) < 0) {
939 goto done;
942 in_idx = after_match_pos;
943 goto find_next_bit;
946 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
947 0) {
948 goto done;
951 if (append_str(out, &out_idx, out_len, link_target,
952 link_target_len) < 0) {
953 goto done;
956 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
957 goto done;
960 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
961 (size_t) tmp_1_len) < 0) {
962 goto done;
965 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
966 goto done;
969 in_idx = after_match_pos;
970 goto find_next_bit;
973 problem_with_match:
975 /* There was some kind of match, but it went wrong. */
976 in_idx++;
977 goto find_next_bit;
978 success:
979 ret = 0;
980 done:
981 *out_len = out_idx;
982 pcre2_substring_free(tmp_1);
983 pcre2_substring_free(tmp_2);
984 pcre2_substring_free(tmp_3);
985 pcre2_match_data_free(match_data);
987 return ret;
991 * Make sure that the contents of *pc are ready for safe injection
992 * into the board, including HTML escaping, wordfiltering, general
993 * formatting, and adding links.
995 * Preconditions
997 * - setup_sanitize_comment() has been invoked more recently than
998 * clean_sanitize_comment().
1000 * - *pc has been filled out (fields like action, board, etc. have
1001 * been populated) from the POST data.
1003 * Postconditions (success):
1005 * - The prepared_XYZ fields of *pc have been filled out, and each
1006 * is valid ASCII text, with Unicode codepoints.
1008 int st_sanitize_text(struct post_cmd *pc, int *our_fault)
1010 int ret = -1;
1011 size_t out_idx = 0;
1012 char *html_escaped_comment = 0;
1013 size_t html_escaped_comment_len = 0;
1015 /* Flush out lurking double-free bugs */
1016 free(pc->prepared.name);
1017 pc->prepared.name = 0;
1018 pc->prepared.name_len = 0;
1019 free(pc->prepared.email);
1020 pc->prepared.email = 0;
1021 pc->prepared.email_len = 0;
1022 free(pc->prepared.subject);
1023 pc->prepared.subject = 0;
1024 pc->prepared.subject_len = 0;
1025 free(pc->prepared.comment);
1026 pc->prepared.comment = 0;
1027 pc->prepared.comment_len = 0;
1028 free(pc->prepared.file_name);
1029 pc->prepared.file_name = 0;
1030 pc->prepared.file_name_len = 0;
1031 free(pc->scannable_comment);
1032 pc->scannable_comment = 0;
1033 pc->scannable_comment_len = 0;
1034 free(pc->position_map);
1035 pc->position_map = 0;
1036 pc->position_map_len = 0;
1037 out_idx = 0;
1039 if (!pc->raw.name_len) {
1040 free(pc->raw.name);
1042 if (!(pc->raw.name = strdup("Anonymous"))) {
1043 PERROR_MESSAGE("strdup");
1044 goto done;
1047 pc->raw.name_len = strlen(pc->raw.name);
1050 if (pc->raw.name_len) {
1051 if (to_html(pc->raw.name, pc->raw.name_len, 0,
1052 &pc->prepared.name, &pc->prepared.name_len,
1053 &out_idx) < 0) {
1054 *our_fault = 1;
1055 goto done;
1059 out_idx = 0;
1061 if (pc->raw.email_len) {
1062 if (to_html(pc->raw.email, pc->raw.email_len, 0,
1063 &pc->prepared.email, &pc->prepared.email_len,
1064 &out_idx) < 0) {
1065 *our_fault = 1;
1066 goto done;
1070 out_idx = 0;
1072 if (pc->raw.tripcode_len) {
1073 if (to_html(pc->raw.tripcode, pc->raw.tripcode_len, 0,
1074 &pc->prepared.tripcode, &pc->prepared.tripcode_len,
1075 &out_idx) <
1076 0) {
1077 *our_fault = 1;
1078 goto done;
1082 out_idx = 0;
1084 if (pc->raw.subject_len) {
1085 if (to_html(pc->raw.subject, pc->raw.subject_len, 0,
1086 &pc->prepared.subject, &pc->prepared.subject_len,
1087 &out_idx) <
1088 0) {
1089 *our_fault = 1;
1090 goto done;
1094 out_idx = 0;
1096 if (pc->raw.file_name_len) {
1097 if (to_html(pc->raw.file_name, pc->raw.file_name_len, 0,
1098 &pc->prepared.file_name,
1099 &pc->prepared.file_name_len,
1100 &out_idx) < 0) {
1101 *our_fault = 1;
1102 goto done;
1106 if (to_scannable(pc->raw.comment, pc->raw.comment_len,
1107 &pc->scannable_comment, &pc->scannable_comment_len,
1108 &pc->position_map,
1109 &pc->position_map_len)) {
1110 *our_fault = 1;
1111 goto done;
1115 * Now we do the fancy thing. Match scannable, build prepared
1116 * out of that.
1118 if (wordfilter_to_html(pc->raw.comment, pc->raw.comment_len,
1119 pc->scannable_comment, pc->scannable_comment_len,
1120 pc->position_map,
1121 &html_escaped_comment,
1122 &html_escaped_comment_len) < 0) {
1123 *our_fault = 1;
1124 goto done;
1128 * Everything's in &#123; form, but now take care of >>123,
1129 * <br />, etc.
1131 if (insert_html_tags(html_escaped_comment, html_escaped_comment_len,
1132 pc->raw.board, &pc->prepared.comment,
1133 &pc->prepared.comment_len) < 0) {
1134 *our_fault = 1;
1135 goto done;
1138 ret = 0;
1139 done:
1140 free(html_escaped_comment);
1142 return ret;
1146 * Initialize any static elements needed for this file.
1148 * Preconditions:
1150 * - setup_sanitize_comment() was not invoked more recently than
1151 * clean_sanitize_comment().
1153 * Postconditions (success):
1155 * - Any other function in this file may be safely called.
1157 int setup_sanitize_comment(const struct configuration *conf)
1160 * Check that the locale/libc/whatever is set up so that
1161 * UTF-8 handling can work.
1163 int ret = -1;
1164 const char *raw =
1165 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1166 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1167 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1168 "\u2468\u0294!\u0ce2!!";
1169 const char *correct_html =
1170 "&lt;script&gt;alert(1)&lt;/script&gt; , &#120081;&#120102;"
1171 "&#120107;&#120094;&#12288;&#120083;&#120114;&#120111;"
1172 "&#120109;&#120105;&#120098;&#120113;&#120108;&#120107; &amp;"
1173 " &#9320;&#660;!&#3298;!!";
1174 const char *correct_scannable =
1175 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1176 char *html = 0;
1177 size_t html_len = 0;
1178 char *scannable = 0;
1179 size_t scannable_len = 0;
1180 size_t *position_map = 0;
1181 size_t position_map_len = 0;
1182 size_t out_idx = 0;
1184 if (to_html(raw, strlen(raw), 0, &html, &html_len, &out_idx) < 0) {
1185 goto done;
1188 if (strcmp(html, correct_html)) {
1189 ERROR_MESSAGE("Was expecting html conversion to yield "
1190 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1191 "\n\n\u00ab%s\u00bb\n\n",
1192 correct_html, html);
1193 goto done;
1196 if (to_scannable(raw, strlen(raw), &scannable, &scannable_len,
1197 &position_map, &position_map_len) < 0) {
1198 goto done;
1201 if (strcmp(scannable, correct_scannable)) {
1202 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1203 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1204 "\n\n\u00ab%s\u00bb\n\n",
1205 correct_scannable, scannable);
1206 goto done;
1209 if (!(wordfilters = calloc(sizeof *wordfilters,
1210 conf->wordfilter_inputs_num))) {
1211 PERROR_MESSAGE("calloc");
1212 goto done;
1215 wordfilters_num = conf->wordfilter_inputs_num;
1216 int err_code = 0;
1217 PCRE2_SIZE err_offset = 0;
1218 PCRE2_UCHAR8 err_buf[120];
1220 for (size_t j = 0; j < wordfilters_num; ++j) {
1221 wordfilters[j].replacement =
1222 conf->wordfilter_inputs[j].replacement;
1223 wordfilters[j].replacement_len = strlen(
1224 conf->wordfilter_inputs[j].replacement);
1226 if ((wordfilters[j].code = pcre2_compile(
1227 (PCRE2_SPTR8) conf->wordfilter_inputs[j].pattern,
1228 PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1229 &err_offset, 0))) {
1230 continue;
1233 pcre2_get_error_message(err_code, err_buf, 120);
1234 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1235 conf->wordfilter_inputs[j].pattern, err_buf);
1236 goto done;
1239 const char *format_match_str =
1241 /* */
1242 "(?<newline>\\n)" /* */
1243 "|(?<intra_postlink>&gt;&gt;(?<a_num>[0-9]+))" /* */
1244 "|(?<inter_postlink>&gt;&gt;&gt;/" /* */
1245 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))" /* */
1246 "|(?<quote>(?<![^\n])&gt;[^\n]*)"; /* */
1248 if (!(format_replacements = pcre2_compile(
1249 (PCRE2_SPTR8) format_match_str, PCRE2_ZERO_TERMINATED,
1250 PCRE2_UTF,
1251 &err_code, &err_offset, 0))) {
1252 pcre2_get_error_message(err_code, err_buf, 120);
1253 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1254 format_match_str, err_buf);
1255 goto done;
1258 ret = 0;
1259 done:
1260 free(html);
1261 free(scannable);
1262 free(position_map);
1264 return ret;
1268 * Clean up any memory from this file
1270 * Postconditions (success):
1272 * - Valgrind won't report any memory leaks from this file.
1274 * - setup_sanitize_comment() can be safely called again.
1276 int clean_sanitize_comment(void)
1278 for (size_t j = 0; j < wordfilters_num; ++j) {
1279 pcre2_code_free(wordfilters[j].code);
1280 wordfilters[j] = (struct wordfilter) { 0 };
1283 pcre2_code_free(format_replacements);
1284 format_replacements = 0;
1285 free(wordfilters);
1286 wordfilters = 0;
1287 wordfilters_num = 0;
1289 return 0;