config: make conv dir configurable
[rb-79.git] / sanitize-comment.c
blob4bae692dced6687d1264785b132684b50f619e2f
1 /*
2 * Copyright (c) 2017-2018, De Rais <derais@cock.li>
4 * Permission to use, copy, modify, and/or distribute this software for
5 * any purpose with or without fee is hereby granted, provided that the
6 * above copyright notice and this permission notice appear in all
7 * copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
10 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
11 * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
12 * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
13 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
14 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
15 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
16 * PERFORMANCE OF THIS SOFTWARE.
18 #include <errno.h>
19 #include <stdint.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <time.h>
24 #include <wchar.h>
26 #define PCRE2_CODE_UNIT_WIDTH 8
27 #include <pcre2.h>
29 #include "macros.h"
30 #include "rb79.h"
31 #include "unicode-transforms.h"
/*
 * We need a way to get codepoints out of UTF-8 strings and if
 * wchar_t stored codepoint values, that would be great. That's
 * __STDC_ISO_10646__, though. You can remove this check and cross
 * your fingers, since rb79 will do a quick check on startup, but
 * please check why the C implementation doesn't define
 * __STDC_ISO_10646__ first.
 */
#ifndef __STDC_ISO_10646__
#error We really want __STDC_ISO_10646__
#endif
46 * A wordfilter consists of a pcre2 regex and a replacement string
48 struct wordfilter {
49 /* */
50 pcre2_code *code;
51 const char *replacement;
52 size_t replacement_len;
55 /* These are constructed in setup_sanitize_comment() */
56 static struct wordfilter *wordfilters;
57 static size_t wordfilters_num;
59 /* Special matcher for quoting, newlines, linkifying, etc. */
60 static pcre2_code *format_replacements;
63 * Comparison function for struct translate.
65 * Preconditions:
67 * - *key_v is a wchar_t.
69 * - *tr_v is a struct translate object.
71 * Postconditions:
73 * - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
74 * than] *tr_v's starting range.
76 static int match_translate(const void *key_v, const void *tr_v)
78 const wchar_t *key = key_v;
79 const struct translate *tr = tr_v;
81 if (*key < tr->from_s) {
82 return -1;
83 } else if (*key > tr->from_t) {
84 return 1;
87 return 0;
/*
 * Add a UTF-8 sequence str onto *buf
 *
 * Preconditions:
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - str is a valid ASCII (not just UTF-8) string of length str_len.
 *
 * Postconditions (success):
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - The contents of str have been appended to *buf (and *idx
 *    includes this), followed by a terminating '\0' that is not
 *    counted in *idx.
 *
 * Returns 0 on success. On size overflow or allocation failure
 * returns -1 and leaves *buf, *idx and *sz untouched.
 */
static int append_str(char **buf, size_t *idx, size_t *sz, const char *str,
                      size_t str_len)
{
        /* Spare room added on every growth so appends rarely realloc */
        const size_t slack = (size_t) 1 << 9;

        if (str_len + *idx >= *sz) {
                void *newmem = 0;
                size_t needed = str_len + *idx;

                /* Validate the arithmetic before the sizes are used */
                if (needed < str_len ||
                    needed + slack < needed) {
                        ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
                                      str_len, *idx);

                        return -1;
                }

                if (!(newmem = realloc(*buf, needed + slack))) {
                        PERROR_MESSAGE("realloc");

                        return -1;
                }

                *buf = newmem;
                *sz = needed + slack;
        }

        /*
         * The data is length-delimited, so copy exactly str_len
         * bytes; strncpy(3) would stop at (and pad after) a NUL.
         */
        if (str_len) {
                memcpy(*buf + *idx, str, str_len);
        }

        *(*buf + *idx + str_len) = '\0';
        *idx += str_len;

        return 0;
}
/*
 * Convenience wrapper around append_str() for NUL-terminated
 * strings: the length is measured here with strlen().
 */
static int append_const_str(char **buf, size_t *idx, size_t *len, const
                            char *str)
{
        const size_t str_len = strlen(str);

        return append_str(buf, idx, len, str, str_len);
}
/*
 * Add a single character onto *buf
 *
 * Preconditions:
 *
 *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - c is an ASCII character.
 *
 * Postconditions (success):
 *
 *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - c has been appended to *buf (and *idx includes this), with a
 *    '\0' written right after it.
 *
 * Returns 0 on success, -1 on size overflow or allocation failure.
 */
static int append_char(char **buf, size_t *idx, size_t *len, char c)
{
        const size_t pos = *idx;

        /* Grow when there is no room for c plus the terminator */
        if (pos + 1 >= *len) {
                const size_t wanted = pos + 1 + (1 << 9);
                char *grown = 0;

                if (pos + 1 < pos ||
                    wanted < pos + 1) {
                        ERROR_MESSAGE("overflow (*idx = %zu)", *idx);

                        return -1;
                }

                grown = realloc(*buf, wanted);

                if (!grown) {
                        PERROR_MESSAGE("realloc");

                        return -1;
                }

                *buf = grown;
                *len = wanted;
        }

        (*buf)[pos] = c;
        (*buf)[pos + 1] = '\0';
        *idx = pos + 1;

        return 0;
}
/*
 * Add a Unicode codepoint onto *buf
 *
 * Preconditions:
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - wc is a valid Unicode codepoint.
 *
 * Postconditions (success):
 *
 *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 *    string.
 *
 *  - An HTML-escaped sequence like &#123; has been appended to
 *    *buf (and *idx includes this).
 *
 * Returns 0 on success, -1 if formatting fails, the sizes overflow,
 * or memory cannot be allocated.
 */
static int append_wchar_escaped(char **buf, size_t *idx, size_t *sz, wchar_t wc)
{
        /* First pass only measures how long the escape will be */
        int measured = snprintf(0, 0, "&#%ld;", (long) wc);
        size_t l = 0;

        /* snprintf() reports errors as a negative int; catch it here */
        if (measured < 0) {
                PERROR_MESSAGE("snprintf");

                return -1;
        }

        l = (size_t) measured;

        if (l + *idx >= *sz) {
                void *newmem = 0;
                size_t new_sz = l + *idx + (1 << 9);

                if (*idx + l < *idx ||
                    *idx + l + (1 << 9) < *idx + l) {
                        ERROR_MESSAGE("overflow (*idx = %zu, l = %zu)", *idx,
                                      l);

                        return -1;
                }

                if (!(newmem = realloc(*buf, new_sz))) {
                        PERROR_MESSAGE("realloc");

                        return -1;
                }

                *buf = newmem;
                *sz = new_sz;
        }

        /* l + *idx < *sz holds here, so the escape and its '\0' fit */
        snprintf(*buf + *idx, *sz - *idx, "&#%ld;", (long) wc);
        *idx += l;

        return 0;
}
/*
 * Ensure that (*map)[j] = k, growing the map as needed.
 *
 * Preconditions
 *
 *  - *map is memory of length *len.
 *
 * Postconditions (success):
 *
 *  - *map is memory of length *len (at least j + 2 entries).
 *
 *  - Entries created by growing hold (size_t) -1.
 *
 *  - (*map)[j] = k.
 *
 * Returns 0 on success, -1 on size overflow or allocation failure.
 */
static int set_position_mapping(size_t **map, size_t *len, size_t j, size_t k)
{
        if (j + 1 >= *len) {
                const size_t wanted = j + 2;
                size_t *grown = 0;

                if (wanted < j ||
                    wanted > SIZE_MAX / sizeof **map) {
                        ERROR_MESSAGE("overflow (j = %zu)", j);

                        return -1;
                }

                grown = realloc(*map, wanted * sizeof **map);

                if (!grown) {
                        PERROR_MESSAGE("realloc");

                        return -1;
                }

                *map = grown;

                /* Mark every fresh slot as "no mapping" */
                for (size_t fresh = *len; fresh < wanted; ++fresh) {
                        grown[fresh] = (size_t) -1;
                }

                *len = wanted;
        }

        (*map)[j] = k;

        return 0;
}
/*
 * HTML-escape in to *out.
 *
 * Preconditions
 *
 *  - in is memory of at least length in_len, valid UTF-8
 *    text.
 *
 *  - *out is memory of at least length *out_len (if *out_len = 0,
 *    *out may be 0), valid UTF-8 text.
 *
 *  - Overwriting *out and *out_len shall not cause a memory leak.
 *
 *  - out, out_len, and out_idx are not 0.
 *
 * Postconditions (success):
 *
 *  - *out is memory of at least length *out_len, valid UTF-8 text.
 *
 *  - A stretch of HTML-escaped ASCII text representing in (from
 *    in_idx up to in_len or the first '\0', whichever comes first)
 *    has been added to *out at the position that was *out_idx.
 *    '&', '"', ''', '<' and '>' become named entities, printable
 *    ASCII and '\n' are copied, '\r' is dropped, and everything
 *    else becomes a numeric escape like &#123;.
 *
 *  - *out_idx has been updated to point to the end of this stretch.
 *
 *  - *out_len has grown by the number of bytes added.
 *
 * Returns 0 on success, and -1 if in cannot be decoded or memory
 * runs out. On failure *out_len is not updated.
 */
static int to_html(const char *in, const size_t in_len, size_t in_idx,
                   char **out, size_t *out_len, size_t *out_idx)
{
        int ret = -1;
        wchar_t wc = 0;
        int mbret = 0;

        /*
         * Capacity known to this call. It stays 0 for a caller-supplied
         * *out, which makes the first append reallocate.
         */
        size_t out_sz = 0;
        size_t initial_out_idx = *out_idx;

        if (!*out) {
                if (!(*out = malloc(1))) {
                        PERROR_MESSAGE("malloc");
                        goto done;
                }

                out_sz = 1;
                *out_len = 0;
                (*out)[0] = '\0';
        }

        /*
         * XXX: If you make this multithreaded, be sure to use
         * mbrtowc(3) here!
         */
        while (in_idx < in_len &&
               in[in_idx]) {
                /* Extract next character */
                mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);

                if (mbret == -1) {
                        PERROR_MESSAGE("mbtowc");

                        /* Put the decoder back into its initial state */
                        (void) mbtowc(0, 0, 0);

                        /*
                         * ret holds the result of the previous append
                         * (0), so it must be reset or this failure
                         * would be reported as success.
                         */
                        ret = -1;
                        goto done;
                }

                if (wc == L'&') {
                        ret = append_str(out, out_idx, &out_sz, "&amp;", 5);
                } else if (wc == L'"') {
                        ret = append_str(out, out_idx, &out_sz, "&quot;", 6);
                } else if (wc == L'\'') {
                        ret = append_str(out, out_idx, &out_sz, "&apos;", 6);
                } else if (wc == L'<') {
                        ret = append_str(out, out_idx, &out_sz, "&lt;", 4);
                } else if (wc == L'>') {
                        ret = append_str(out, out_idx, &out_sz, "&gt;", 4);
                } else if (mbret == 1 &&
                           in[in_idx] >= ' ' &&
                           in[in_idx] <= '~') {
                        ret = append_char(out, out_idx, &out_sz, in[in_idx]);
                } else if (mbret == 1 &&
                           in[in_idx] == '\r') {
                        /* Carriage returns are silently dropped */
                        ret = 0;
                } else if (mbret == 1 &&
                           in[in_idx] == '\n') {
                        ret = append_char(out, out_idx, &out_sz, in[in_idx]);
                } else {
                        ret = append_wchar_escaped(out, out_idx, &out_sz, wc);
                }

                in_idx += mbret;

                if (ret < 0) {
                        goto done;
                }
        }

        *out_len = *out_len + (*out_idx - initial_out_idx);
        ret = 0;
done:

        return ret;
}
388 * From in construct *out, which is a codepoint-for-codepoint
389 * translation following the rules of unicode-transforms.h. The
390 * result is that *out can be matched with normal regex, even if
391 * in contains obfuscatory Unicode bullshit.
393 * Preconditions
395 * - setup_sanitize_comment() has been invoked more recently than
396 * clean_sanitize_comment().
398 * - in is memory of at least length in_len, valid UTF-8 text.
400 * - Overwriting *out and *out_position_map shall not cause a
401 * memory leak.
403 * - out, out_len, out_position_map, and out_position_map_len are
404 * not 0.
406 * Postconditions (success):
408 * - *out is valid, UTF-8 text of length *out_len.
410 * - For every j in [0, *out_len) such that (*out)[j] starts a
411 * codepoint, in[*(position_map)[j]] is the start of the
412 * corresponding codepoint.
414 * - (*position_map)[*out_len] = in_len.
416 static int to_scannable(const char *in, size_t in_len, char **out,
417 size_t *out_len, size_t **out_position_map,
418 size_t *out_position_map_len)
420 int ret = -1;
421 wchar_t wc = 0;
422 size_t in_idx = 0;
423 size_t out_idx = 0;
424 int mbret = 0;
425 struct translate *tr = 0;
426 size_t out_sz = 0;
428 if (!*out) {
429 if (!(*out = malloc(1))) {
430 PERROR_MESSAGE("malloc");
431 goto done;
434 out_sz = 1;
435 *out_len = 0;
436 (*out)[0] = '\0';
440 * Position_map is here to make wordfiltering work. Suppose in is
442 * I think Nina Purpleton did
443 * nothing wrong
445 * and a wordfilter /Nina Purpleton/i -> "worst girl" is
446 * in effect. Then *out will be
448 * I think Nina Purpleton did nothing wrong
450 * The message should, of course, be filtered to
452 * I think worst girl did nothing
453 * wrong
455 * In order to do that, it would be necessary to have a map
456 * from in to *out on the byte level, since the wordfilter
457 * will only be run against *out.
459 * position_map[j] = k means that out[j] and in[k] mean the
460 * same thing.
462 while (in_idx < in_len) {
463 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
465 if (mbret == -1) {
466 PERROR_MESSAGE("mbtowc");
467 goto done;
470 /* We pre-suppose that the insert will go as planned */
471 if (set_position_mapping(out_position_map, out_position_map_len,
472 out_idx, in_idx) < 0) {
473 goto done;
476 if (mbret == 1 &&
477 in[in_idx] >= ' ' &&
478 in[in_idx] <= '~') {
479 if (append_str(out, &out_idx, &out_sz, in + in_idx, 1) <
480 0) {
481 goto done;
483 } else {
484 if ((tr = bsearch(&wc, translates, NUM_OF(translates),
485 sizeof *translates,
486 match_translate))) {
487 if (append_str(out, &out_idx, &out_sz, tr->to,
488 strlen(tr->to)) < 0) {
489 goto done;
491 } else {
492 if (append_str(out, &out_idx, &out_sz, in +
493 in_idx, mbret) < 0) {
494 goto done;
499 in_idx += mbret;
502 if (set_position_mapping(out_position_map, out_position_map_len,
503 out_idx, in_len) < 0) {
504 goto done;
507 (*out)[out_idx] = '\0';
508 *out_len = out_idx;
509 ret = 0;
510 done:
512 return ret;
516 * Read through raw and scannable, checking all wordfilters in
517 * scannable. Where a match is detected, the corresponding postion
518 * (via position_map) in raw is replaced by the replacement specified
519 * by the matching wordfilter.
521 * Preconditions
523 * - setup_sanitize_comment() has been invoked more recently than
524 * clean_sanitize_comment().
526 * - raw is memory of length at least raw_len, valid UTF-8 text.
528 * - scannable is memory of length at least scannable_len.
530 * - For any j in [0, scannable_len), position_map[j] is a valid
531 * index into raw, or is (size_t) -1.
533 * - position_map[scannable_len] = raw_len.
535 * - For any j in [0, scannable_len) such that k = position_map[j]
536 * is not (size_t) -1, scannable[j] and raw[k] are conceptually
537 * the same for wordfiltering.
539 * - Overwriting *out shall not cause a memory leak.
541 * - out and out_len are not 0.
543 * Postconditions (success):
545 * - *out is valid, UTF-8 text of length *out_len such that all
546 * non ASCII codepoints (and '<', '>', '&', '"', ''') are
547 * HTML-escaped.
549 * - *out represents raw, except in those sections of scannable
550 * where a wordfilter matched.
552 static int wordfilter_to_html(const char *raw, const size_t raw_len, const
553 char *scannable, const size_t scannable_len,
554 size_t *position_map, char **out,
555 size_t *out_len)
557 int ret = -1;
559 /* These hold the match locations from pcre2 */
560 uint32_t *ov_counts = 0;
561 PCRE2_SIZE **ov_ps = 0;
562 int *num_matches = 0;
563 pcre2_match_data **match_data = 0;
564 size_t raw_idx = 0;
565 size_t scannable_idx = 0;
566 size_t out_idx = 0;
567 size_t best_match_pos = 0;
568 size_t best_match_idx = 0;
569 size_t l = 0;
570 size_t mbret = 0;
572 if (!(ov_counts = calloc(wordfilters_num, sizeof *ov_counts))) {
573 PERROR_MESSAGE("calloc");
574 goto done;
577 if (!(ov_ps = calloc(wordfilters_num, sizeof *ov_ps))) {
578 PERROR_MESSAGE("calloc");
579 goto done;
582 if (!(num_matches = calloc(wordfilters_num, sizeof *num_matches))) {
583 PERROR_MESSAGE("calloc");
584 goto done;
587 if (!(match_data = calloc(wordfilters_num, sizeof *match_data))) {
588 PERROR_MESSAGE("calloc");
589 goto done;
592 /* First scan, before the loop */
593 for (size_t j = 0; j < wordfilters_num; ++j) {
594 if (!(match_data[j] = pcre2_match_data_create_from_pattern(
595 wordfilters[j].code, 0))) {
596 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
597 goto done;
600 num_matches[j] = pcre2_match(wordfilters[j].code,
601 (PCRE2_SPTR) scannable,
602 scannable_len, scannable_idx, 0,
603 match_data[j], 0);
606 handle_next_match:
607 best_match_pos = (size_t) -1;
608 best_match_idx = (size_t) -1;
610 /* We've run pcre2_match() on everything. Find the soonest match */
611 for (size_t j = 0; j < wordfilters_num; ++j) {
612 if (!num_matches[j]) {
613 continue;
616 ov_ps[j] = pcre2_get_ovector_pointer(match_data[j]);
618 if (ov_ps[j][0] >= scannable_idx &&
619 ov_ps[j][0] < best_match_pos) {
620 best_match_pos = ov_ps[j][0];
621 best_match_idx = j;
625 if (best_match_idx == (size_t) -1) {
626 /* No matches. Turn the rest to html boring-like */
627 ret = to_html(raw, raw_len, raw_idx, out, out_len, &out_idx);
628 goto done;
631 /* Figure out where in raw this match starts */
632 l = best_match_pos;
634 while (l != (size_t) -1 &&
635 position_map[l] == (size_t) -1) {
636 l--;
639 if (l == (size_t) -1) {
640 ERROR_MESSAGE("Impossible condition in "
641 "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
642 raw,
643 best_match_pos);
644 goto done;
648 * Now position_map[l] points to the first character in raw
649 * that should be replaced. Fill up to that point.
651 if (position_map[l] &&
652 position_map[l] > raw_idx) {
653 if (to_html(raw, position_map[l], raw_idx, out, out_len,
654 &out_idx) < 0) {
655 goto done;
659 /* Put the substituted text in */
660 if (to_html(wordfilters[best_match_idx].replacement,
661 wordfilters[best_match_idx].replacement_len, 0, out,
662 out_len,
663 &out_idx) < 0) {
664 goto done;
668 * Figure out where we should advance to in inputs. Naively,
669 * we want to set scannable_idx to ov_ps[best_match_idx][1]
670 * (the first character in scannable beyond the match).
671 * However, we have to consider the case of
673 * foo!!!bar
675 * where "foo" -> "baz" is the only transformation. Since
676 * some characters, like "!", are completely ignored by
677 * the scannable transformation, the naive method would
678 * start our scanning at the "b", skipping information.
680 * So, instead, we carefully find the last character in
681 * "foo", then jump one past it. This (unfortunately)
682 * requires a bit more manual fiddling with wide character
683 * conversions.
686 if (ov_ps[best_match_idx][1] <= scannable_idx) {
688 * This should never happen, but let's make sure
689 * we always keep advancing.
691 scannable_idx++;
692 } else {
693 scannable_idx = ov_ps[best_match_idx][1] - 1;
696 l = scannable_idx;
698 while (position_map[l] == (size_t) -1) {
699 l--;
702 raw_idx = position_map[l];
704 /* This is the "jump one past it" part */
705 scannable_idx++;
706 errno = 0;
707 mbret = mbrlen(raw + raw_idx, MB_CUR_MAX, 0);
709 switch (mbret) {
710 case (size_t) -2:
711 case (size_t) -1:
712 PERROR_MESSAGE("mbrlen");
713 goto done;
714 default:
715 raw_idx += mbret;
719 * Now re-check all our matches and figure out which ones
720 * need to be updated
722 for (size_t j = 0; j < wordfilters_num; ++j) {
723 if (!num_matches[j] ||
724 ov_ps[j][0] >= scannable_idx) {
725 continue;
728 num_matches[j] = pcre2_match(wordfilters[j].code,
729 (PCRE2_SPTR) scannable,
730 scannable_len, scannable_idx, 0,
731 match_data[j], 0);
734 goto handle_next_match;
735 done:
737 for (size_t j = 0; j < wordfilters_num; ++j) {
738 pcre2_match_data_free(match_data[j]);
739 match_data[j] = 0;
742 free(match_data);
743 free(num_matches);
744 free(ov_counts);
745 free(ov_ps);
747 return ret;
751 * Read through in. Each time a match for format_replacements is
752 * found (something like a newline or a quote) is found, replace
753 * it with some HTML markup. The result is placed in out.
755 * Preconditions:
757 * - setup_sanitize_comment() has been invoked more recently than
758 * clean_sanitize_comment().
760 * - in is memory of length at least in_len, valid UTF-8 text.
762 * - Overwriting *out shall not cause a memory leak.
764 * - out and out_len are not 0.
766 * Postconditions (success):
768 * - *out is valid, UTF-8 text of length *out_len with sane HTML
769 * markup (and HTML escaped), suitable for outputting into an
770 * HTML file.
772 static int insert_html_tags(const char *in, size_t in_len, const char *board,
773 char **out, size_t *out_len)
775 int ret = -1;
776 size_t in_idx = 0;
777 size_t match_pos = 0;
778 size_t after_match_pos = 0;
779 size_t out_idx = 0;
780 pcre2_match_data *match_data = 0;
781 int nret = 0;
782 PCRE2_UCHAR *tmp_1 = 0;
783 PCRE2_SIZE tmp_1_len = 0;
784 PCRE2_UCHAR *tmp_2 = 0;
785 PCRE2_SIZE tmp_2_len = 0;
786 PCRE2_UCHAR *tmp_3 = 0;
787 PCRE2_SIZE tmp_3_len = 0;
788 uint_fast8_t last_was_newline = 1;
789 char *link_target = 0;
790 size_t link_target_len = 0;
792 if (!(match_data = pcre2_match_data_create_from_pattern(
793 format_replacements, 0))) {
794 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
795 goto done;
798 find_next_bit:
800 if (in_idx >= in_len) {
801 goto success;
804 nret = pcre2_match(format_replacements, (PCRE2_SPTR) in, in_len, in_idx,
805 0, match_data, 0);
807 if (nret == PCRE2_ERROR_NOMATCH) {
808 ret = append_str(out, &out_idx, out_len, in + in_idx, in_len -
809 in_idx);
810 goto done;
813 if (nret < 0) {
814 PCRE2_UCHAR8 err_buf[120];
816 pcre2_get_error_message(nret, err_buf, 120);
817 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
818 " (PCRE2 %d)", (int) (in_len - in_idx), in +
819 in_idx, err_buf,
820 nret);
821 goto done;
824 pcre2_substring_free(tmp_1);
825 pcre2_substring_free(tmp_2);
826 pcre2_substring_free(tmp_3);
827 free(link_target);
828 tmp_1 = 0;
829 tmp_2 = 0;
830 tmp_3 = 0;
831 link_target = 0;
833 /* We have match, stuff everything up to it in *out */
834 match_pos = pcre2_get_ovector_pointer(match_data)[0];
835 after_match_pos = pcre2_get_ovector_pointer(match_data)[1];
837 if (match_pos > in_idx) {
838 if (append_str(out, &out_idx, out_len, in + in_idx, match_pos -
839 in_idx) < 0) {
840 goto done;
843 last_was_newline = 0;
844 in_idx = match_pos;
847 /* Figure out what type of match. */
848 if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "newline",
849 &tmp_1, &tmp_1_len)) {
850 if (last_was_newline) {
851 if (append_const_str(out, &out_idx, out_len,
852 "&nbsp;<br />") < 0) {
853 goto done;
855 } else {
856 if (append_const_str(out, &out_idx, out_len, "<br />") <
857 0) {
858 goto done;
862 last_was_newline = 1;
863 in_idx = after_match_pos;
864 goto find_next_bit;
867 last_was_newline = 0;
869 if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "quote",
870 &tmp_1, &tmp_1_len)) {
871 if (append_const_str(out, &out_idx, out_len,
872 "<span class=\"quote\">") < 0) {
873 goto done;
876 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
877 (size_t) tmp_1_len) < 0) {
878 goto done;
881 if (append_const_str(out, &out_idx, out_len, "</span>") < 0) {
882 goto done;
885 in_idx = after_match_pos;
886 goto find_next_bit;
889 if (!pcre2_substring_get_byname(match_data,
890 (PCRE2_SPTR) "intra_postlink", &tmp_1,
891 &tmp_1_len)) {
892 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "a_num",
893 &tmp_2, &tmp_2_len)) {
894 goto problem_with_match;
897 int found = 0;
899 if (db_construct_post_link(board, strlen(board), (const
900 char *) tmp_2,
901 tmp_2_len, &found, &link_target,
902 &link_target_len) < 0) {
903 goto done;
906 if (!found) {
907 if (append_str(out, &out_idx, out_len, in + match_pos,
908 after_match_pos - match_pos) < 0) {
909 goto done;
912 in_idx = after_match_pos;
913 goto find_next_bit;
916 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
917 0) {
918 goto done;
921 if (append_str(out, &out_idx, out_len, link_target,
922 link_target_len) < 0) {
923 goto done;
926 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
927 goto done;
930 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
931 (size_t) tmp_1_len) < 0) {
932 goto done;
935 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
936 goto done;
939 in_idx = after_match_pos;
940 goto find_next_bit;
943 if (!pcre2_substring_get_byname(match_data,
944 (PCRE2_SPTR) "inter_postlink", &tmp_1,
945 &tmp_1_len)) {
946 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "e_num",
947 &tmp_2, &tmp_2_len)) {
948 goto problem_with_match;
951 if (pcre2_substring_get_byname(match_data,
952 (PCRE2_SPTR) "e_board", &tmp_3,
953 &tmp_3_len)) {
954 goto problem_with_match;
957 int found = 0;
959 if (db_construct_post_link((const char *) tmp_3, tmp_3_len,
960 (const char *) tmp_2, tmp_2_len,
961 &found, &link_target,
962 &link_target_len) < 0) {
963 goto done;
966 if (!found) {
967 if (append_str(out, &out_idx, out_len, in + match_pos,
968 after_match_pos - match_pos) < 0) {
969 goto done;
972 in_idx = after_match_pos;
973 goto find_next_bit;
976 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
977 0) {
978 goto done;
981 if (append_str(out, &out_idx, out_len, link_target,
982 link_target_len) < 0) {
983 goto done;
986 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
987 goto done;
990 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
991 (size_t) tmp_1_len) < 0) {
992 goto done;
995 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
996 goto done;
999 in_idx = after_match_pos;
1000 goto find_next_bit;
1003 problem_with_match:
1005 /* There was some kind of match, but it went wrong. */
1006 in_idx++;
1007 goto find_next_bit;
1008 success:
1009 ret = 0;
1010 done:
1011 *out_len = out_idx;
1012 pcre2_substring_free(tmp_1);
1013 pcre2_substring_free(tmp_2);
1014 pcre2_substring_free(tmp_3);
1015 pcre2_match_data_free(match_data);
1017 return ret;
1021 * Make sure that the contents of *pc are ready for safe injection
1022 * into the board, including HTML escaping, wordfiltering, general
1023 * formatting, and adding links.
1025 * Preconditions
1027 * - setup_sanitize_comment() has been invoked more recently than
1028 * clean_sanitize_comment().
1030 * - *pc has been filled out (fields like action, board, etc. have
1031 * been populated) from the POST data.
1033 * Postconditions (success):
1035 * - The prepared_XYZ fields of *pc have been filled out, and each
1036 * is valid ASCII text, with Unicode codepoints.
1038 int st_sanitize_text(struct post_cmd *pc, int *our_fault)
1040 int ret = -1;
1041 size_t out_idx = 0;
1042 char *html_escaped_comment = 0;
1043 size_t html_escaped_comment_len = 0;
1045 /* Flush out lurking double-free bugs */
1046 free(pc->prepared.name);
1047 pc->prepared.name = 0;
1048 pc->prepared.name_len = 0;
1049 free(pc->prepared.email);
1050 pc->prepared.email = 0;
1051 pc->prepared.email_len = 0;
1052 free(pc->prepared.subject);
1053 pc->prepared.subject = 0;
1054 pc->prepared.subject_len = 0;
1055 free(pc->prepared.comment);
1056 pc->prepared.comment = 0;
1057 pc->prepared.comment_len = 0;
1058 free(pc->prepared.file_name);
1059 pc->prepared.file_name = 0;
1060 pc->prepared.file_name_len = 0;
1061 free(pc->scannable_comment);
1062 pc->scannable_comment = 0;
1063 pc->scannable_comment_len = 0;
1064 free(pc->position_map);
1065 pc->position_map = 0;
1066 pc->position_map_len = 0;
1067 out_idx = 0;
1069 if (!pc->raw.name_len) {
1070 free(pc->raw.name);
1072 if (!(pc->raw.name = strdup("Anonymous"))) {
1073 PERROR_MESSAGE("strdup");
1074 goto done;
1077 pc->raw.name_len = strlen(pc->raw.name);
1080 if (pc->raw.name_len) {
1081 if (to_html(pc->raw.name, pc->raw.name_len, 0,
1082 &pc->prepared.name, &pc->prepared.name_len,
1083 &out_idx) < 0) {
1084 *our_fault = 1;
1085 goto done;
1089 out_idx = 0;
1091 if (pc->raw.email_len) {
1092 if (to_html(pc->raw.email, pc->raw.email_len, 0,
1093 &pc->prepared.email, &pc->prepared.email_len,
1094 &out_idx) < 0) {
1095 *our_fault = 1;
1096 goto done;
1100 out_idx = 0;
1102 if (pc->raw.tripcode_len) {
1103 if (to_html(pc->raw.tripcode, pc->raw.tripcode_len, 0,
1104 &pc->prepared.tripcode, &pc->prepared.tripcode_len,
1105 &out_idx) <
1106 0) {
1107 *our_fault = 1;
1108 goto done;
1112 out_idx = 0;
1114 if (pc->raw.subject_len) {
1115 if (to_html(pc->raw.subject, pc->raw.subject_len, 0,
1116 &pc->prepared.subject, &pc->prepared.subject_len,
1117 &out_idx) <
1118 0) {
1119 *our_fault = 1;
1120 goto done;
1124 out_idx = 0;
1126 if (pc->raw.file_name_len) {
1127 if (to_html(pc->raw.file_name, pc->raw.file_name_len, 0,
1128 &pc->prepared.file_name,
1129 &pc->prepared.file_name_len,
1130 &out_idx) < 0) {
1131 *our_fault = 1;
1132 goto done;
1136 if (to_scannable(pc->raw.comment, pc->raw.comment_len,
1137 &pc->scannable_comment, &pc->scannable_comment_len,
1138 &pc->position_map,
1139 &pc->position_map_len)) {
1140 *our_fault = 1;
1141 goto done;
1145 * Now we do the fancy thing. Match scannable, build prepared
1146 * out of that.
1148 if (wordfilter_to_html(pc->raw.comment, pc->raw.comment_len,
1149 pc->scannable_comment, pc->scannable_comment_len,
1150 pc->position_map,
1151 &html_escaped_comment,
1152 &html_escaped_comment_len) < 0) {
1153 *our_fault = 1;
1154 goto done;
1158 * Everything's in &#123; form, but now take care of >>123,
1159 * <br />, etc.
1161 if (insert_html_tags(html_escaped_comment, html_escaped_comment_len,
1162 pc->raw.board, &pc->prepared.comment,
1163 &pc->prepared.comment_len) < 0) {
1164 *our_fault = 1;
1165 goto done;
1168 ret = 0;
1169 done:
1170 free(html_escaped_comment);
1172 return ret;
1176 * Initialize any static elements needed for this file.
1178 * Preconditions:
1180 * - setup_sanitize_comment() was not invoked more recently than
1181 * clean_sanitize_comment().
1183 * Postconditions (success):
1185 * - Any other function in this file may be safely called.
1187 int setup_sanitize_comment(const struct configuration *conf)
1190 * Check that the locale/libc/whatever is set up so that
1191 * UTF-8 handling can work.
1193 int ret = -1;
1194 const char *raw =
1195 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1196 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1197 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1198 "\u2468\u0294!\u0ce2!!";
1199 const char *correct_html =
1200 "&lt;script&gt;alert(1)&lt;/script&gt; , &#120081;&#120102;"
1201 "&#120107;&#120094;&#12288;&#120083;&#120114;&#120111;"
1202 "&#120109;&#120105;&#120098;&#120113;&#120108;&#120107; &amp;"
1203 " &#9320;&#660;!&#3298;!!";
1204 const char *correct_scannable =
1205 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1206 char *html = 0;
1207 size_t html_len = 0;
1208 char *scannable = 0;
1209 size_t scannable_len = 0;
1210 size_t *position_map = 0;
1211 size_t position_map_len = 0;
1212 size_t out_idx = 0;
1214 if (to_html(raw, strlen(raw), 0, &html, &html_len, &out_idx) < 0) {
1215 goto done;
1218 if (strcmp(html, correct_html)) {
1219 ERROR_MESSAGE("Was expecting html conversion to yield "
1220 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1221 "\n\n\u00ab%s\u00bb\n\n",
1222 correct_html, html);
1223 goto done;
1226 if (to_scannable(raw, strlen(raw), &scannable, &scannable_len,
1227 &position_map, &position_map_len) < 0) {
1228 goto done;
1231 if (strcmp(scannable, correct_scannable)) {
1232 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1233 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1234 "\n\n\u00ab%s\u00bb\n\n",
1235 correct_scannable, scannable);
1236 goto done;
1239 if (!(wordfilters = calloc(conf->wordfilter_inputs_num,
1240 sizeof *wordfilters))) {
1241 PERROR_MESSAGE("calloc");
1242 goto done;
1245 wordfilters_num = conf->wordfilter_inputs_num;
1246 int err_code = 0;
1247 PCRE2_SIZE err_offset = 0;
1248 PCRE2_UCHAR8 err_buf[120];
1250 for (size_t j = 0; j < wordfilters_num; ++j) {
1251 wordfilters[j].replacement =
1252 conf->wordfilter_inputs[j].replacement;
1253 wordfilters[j].replacement_len = strlen(
1254 conf->wordfilter_inputs[j].replacement);
1256 if ((wordfilters[j].code = pcre2_compile(
1257 (PCRE2_SPTR8) conf->wordfilter_inputs[j].pattern,
1258 PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1259 &err_offset, 0))) {
1260 continue;
1263 pcre2_get_error_message(err_code, err_buf, 120);
1264 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1265 conf->wordfilter_inputs[j].pattern, err_buf);
1266 goto done;
1269 const char *format_match_str =
1271 /* */
1272 "(?<newline>\\n)" /* */
1273 "|(?<intra_postlink>&gt;&gt;(?<a_num>[0-9]+))" /* */
1274 "|(?<inter_postlink>&gt;&gt;&gt;/" /* */
1275 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))" /* */
1276 "|(?<quote>(?<![^\n])&gt;[^\n]*)"; /* */
1278 if (!(format_replacements = pcre2_compile(
1279 (PCRE2_SPTR8) format_match_str, PCRE2_ZERO_TERMINATED,
1280 PCRE2_UTF,
1281 &err_code, &err_offset, 0))) {
1282 pcre2_get_error_message(err_code, err_buf, 120);
1283 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1284 format_match_str, err_buf);
1285 goto done;
1288 ret = 0;
1289 done:
1290 free(html);
1291 free(scannable);
1292 free(position_map);
1294 return ret;
1298 * Clean up any memory from this file
1300 * Postconditions (success):
1302 * - Valgrind won't report any memory leaks from this file.
1304 * - setup_sanitize_comment() can be safely called again.
1306 int clean_sanitize_comment(void)
1308 for (size_t j = 0; j < wordfilters_num; ++j) {
1309 pcre2_code_free(wordfilters[j].code);
1310 wordfilters[j] = (struct wordfilter) { 0 };
1313 pcre2_code_free(format_replacements);
1314 format_replacements = 0;
1315 free(wordfilters);
1316 wordfilters = 0;
1317 wordfilters_num = 0;
1319 return 0;