2 * Copyright (c) 2017-2018, De Rais <derais@cock.li>
4 * Permission to use, copy, modify, and/or distribute this software for
5 * any purpose with or without fee is hereby granted, provided that the
6 * above copyright notice and this permission notice appear in all
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
10 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
11 * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
12 * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
13 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
14 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
15 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
16 * PERFORMANCE OF THIS SOFTWARE.
26 #define PCRE2_CODE_UNIT_WIDTH 8
31 #include "unicode-transforms.h"
34 * We need a way to get codepoints out of UTF-8 strings and if
35 * wchar_t stored codepoint values, that would be great. That's
36 * __STDC_ISO_10646__, though. You can remove this check and cross
37 * your fingers, since rb79 will do a quick check on startup, but
38 * please check why the C implementation doesn't define
39 * __STDC_ISO_10646__ first.
41 #ifndef __STDC_ISO_10646__
42 #error We really want __STD_ISO_10646__
46 * A wordfilter consists of a pcre2 regex and a replacement string
51 const char *replacement
;
52 size_t replacement_len
;
55 /* These are constructed in setup_sanitize_comment() */
56 static struct wordfilter
*wordfilters
;
57 static size_t wordfilters_num
;
59 /* Special matcher for quoting, newlines, linkifying, etc. */
60 static pcre2_code
*format_replacements
;
63 * Comparison function for struct translate.
67 * - *key_v is a wchar_t.
69 * - *tr_v is a struct translate object.
73 * - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
74 * than] *tr_v's starting range.
76 static int match_translate(const void *key_v
, const void *tr_v
)
78 const wchar_t *key
= key_v
;
79 const struct translate
*tr
= tr_v
;
81 if (*key
< tr
->from_s
) {
83 } else if (*key
> tr
->from_t
) {
91 * Add a UTF-8 sequence str onto *buf
95 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
98 * - str is a valid ASCII (not just UTF-8) string of length str_len.
100 * Postconditions (success):
102 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
105 * - The contents of str have been appended to *buf (and *idx
108 static int append_str(char **buf
, size_t *idx
, size_t *sz
, const char *str
,
111 if (str_len
+ *idx
>= *sz
) {
113 size_t new_sz
= str_len
+ *idx
+ (1 << 9);
115 if (str_len
+ *idx
< str_len
||
116 str_len
+ *idx
+ (1 << 9) < str_len
+ *idx
) {
117 ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
123 if (!(newmem
= realloc(*buf
, new_sz
))) {
124 PERROR_MESSAGE("realloc");
133 strncpy(*buf
+ *idx
, str
, str_len
);
134 *(*buf
+ *idx
+ str_len
) = '\0';
140 /* Dummy function for when I can't be bothered to strlen(). */
141 static int append_const_str(char **buf
, size_t *idx
, size_t *len
, const
144 return append_str(buf
, idx
, len
, str
, strlen(str
));
148 * Add a single character onto *buf
152 * - *buf is memory of length *len, and up to *idx is a valid UTF-8
155 * - c is an ASCII character.
157 * Postconditions (success):
159 * - *buf is memory of length *len, and up to *idx is a valid UTF-8
162 * - c has been appended to *buf (and *idx includes this).
164 static int append_char(char **buf
, size_t *idx
, size_t *len
, char c
)
166 if (1 + *idx
>= *len
) {
168 size_t new_len
= 1 + *idx
+ (1 << 9);
170 if (*idx
+ 1 < *idx
||
171 *idx
+ 1 + (1 << 9) < *idx
+ 1) {
172 ERROR_MESSAGE("overflow (*idx = %zu)", *idx
);
177 if (!(newmem
= realloc(*buf
, new_len
))) {
178 PERROR_MESSAGE("realloc");
188 *(*buf
+ *idx
+ 1) = '\0';
195 * Add a Unicode codepoint onto *buf
199 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
202 * - wchar_t is a valid Unicode codepoint.
204 * Postconditions (success):
206 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
209 * - An HTML-escaped sequence like &#123; has been appended to
210 * *buf (and *idx includes this).
212 static int append_wchar_escaped(char **buf
, size_t *idx
, size_t *sz
, wchar_t wc
)
214 size_t l
= snprintf(0, 0, "&#%ld;", (long) wc
);
216 if (l
+ *idx
>= *sz
) {
218 size_t new_sz
= l
+ *idx
+ (1 << 9);
220 if (*idx
+ l
< *idx
||
221 *idx
+ l
+ (1 << 9) < *idx
+ l
) {
222 ERROR_MESSAGE("overflow (*idx = %zu, l = %zu)", *idx
,
228 if (!(newmem
= realloc(*buf
, new_sz
))) {
229 PERROR_MESSAGE("realloc");
238 sprintf(*buf
+ *idx
, "&#%ld;", (long) wc
);
245 * Ensure that (*map)[j] = k, fixing up length as appropriate.
249 * - *map is memory of length len.
251 * Postconditions (success):
253 * - *map is memory of length len.
257 static int set_position_mapping(size_t **map
, size_t *len
, size_t j
, size_t k
)
263 ((j
+ 2) * sizeof **map
) / (j
+ 2) != sizeof **map
) {
264 ERROR_MESSAGE("overflow (j = %zu)", j
);
269 if (!(newmem
= realloc(*map
, (j
+ 2) * sizeof **map
))) {
270 PERROR_MESSAGE("realloc");
277 for (size_t l
= *len
; l
< j
+ 2; ++l
) {
278 (*map
)[l
] = ((size_t) -1);
290 * HTML-escape in to *out.
294 * - in is memory of at least length in_len, valid UTF-8
297 * - *out is memory of at least length *out_len (if *out_len = 0,
298 * *out may be 0), valid UTF-8 text.
300 * - Overwriting *out and *out_len shall not cause a memory leak.
302 * - out, out_len, and out_idx are not 0.
304 * Postconditions (success):
306 * - *out is memory of at least length *out_len, valid UTF-8 text.
308 * - A stretch of HTML-escaped ASCII text representing in has been
309 * added to *out at the position that was *out_idx.
311 * - *out_idx has been updated to point to the end of this stretch.
313 * - If necessary, *out_len has been updated.
315 static int to_html(const char *in
, const size_t in_len
, size_t in_idx
,
316 char **out
, size_t *out_len
, size_t *out_idx
)
322 size_t initial_out_idx
= *out_idx
;
325 if (!(*out
= malloc(1))) {
326 PERROR_MESSAGE("malloc");
336 * XXX: If you make this multithreaded, be sure to use
339 while (in_idx
< in_len
&&
341 /* Extract next character */
342 mbret
= mbtowc(&wc
, in
+ in_idx
, in_len
- in_idx
);
345 PERROR_MESSAGE("mbtowc");
350 ret
= append_str(out
, out_idx
, &out_sz
, "&", 5);
351 } else if (wc
== L
'"') {
352 ret
= append_str(out
, out_idx
, &out_sz
, """, 6);
353 } else if (wc
== L
'\'') {
354 ret
= append_str(out
, out_idx
, &out_sz
, "'", 6);
355 } else if (wc
== L
'<') {
356 ret
= append_str(out
, out_idx
, &out_sz
, "<", 4);
357 } else if (wc
== L
'>') {
358 ret
= append_str(out
, out_idx
, &out_sz
, ">", 4);
359 } else if (mbret
== 1 &&
362 ret
= append_char(out
, out_idx
, &out_sz
, in
[in_idx
]);
363 } else if (mbret
== 1 &&
364 in
[in_idx
] == '\r') {
366 } else if (mbret
== 1 &&
367 in
[in_idx
] == '\n') {
368 ret
= append_char(out
, out_idx
, &out_sz
, in
[in_idx
]);
370 ret
= append_wchar_escaped(out
, out_idx
, &out_sz
, wc
);
380 *out_len
= *out_len
+ (*out_idx
- initial_out_idx
);
388 * From in construct *out, which is a codepoint-for-codepoint
389 * translation following the rules of unicode-transforms.h. The
390 * result is that *out can be matched with normal regex, even if
391 * in contains obfuscatory Unicode bullshit.
395 * - setup_sanitize_comment() has been invoked more recently than
396 * clean_sanitize_comment().
398 * - in is memory of at least length in_len, valid UTF-8 text.
400 * - Overwriting *out and *out_position_map shall not cause a
403 * - out, out_len, out_position_map, and out_position_map_len are
406 * Postconditions (success):
408 * - *out is valid, UTF-8 text of length *out_len.
410 * - For every j in [0, *out_len) such that (*out)[j] starts a
411 * codepoint, in[*(position_map)[j]] is the start of the
412 * corresponding codepoint.
414 * - (*position_map)[*out_len] = in_len.
416 static int to_scannable(const char *in
, size_t in_len
, char **out
,
417 size_t *out_len
, size_t **out_position_map
,
418 size_t *out_position_map_len
)
425 struct translate
*tr
= 0;
429 if (!(*out
= malloc(1))) {
430 PERROR_MESSAGE("malloc");
440 * Position_map is here to make wordfiltering work. Suppose in is
442 * I think Nina Purpleton did
445 * and a wordfilter /Nina Purpleton/i -> "worst girl" is
446 * in effect. Then *out will be
448 * I think Nina Purpleton did nothing wrong
450 * The message should, of course, be filtered to
452 * I think worst girl did nothing
455 * In order to do that, it would be necessary to have a map
456 * from in to *out on the byte level, since the wordfilter
457 * will only be run against *out.
459 * position_map[j] = k means that out[j] and in[k] mean the
462 while (in_idx
< in_len
) {
463 mbret
= mbtowc(&wc
, in
+ in_idx
, in_len
- in_idx
);
466 PERROR_MESSAGE("mbtowc");
470 /* We pre-suppose that the insert will go as planned */
471 if (set_position_mapping(out_position_map
, out_position_map_len
,
472 out_idx
, in_idx
) < 0) {
479 if (append_str(out
, &out_idx
, &out_sz
, in
+ in_idx
, 1) <
484 if ((tr
= bsearch(&wc
, translates
, NUM_OF(translates
),
487 if (append_str(out
, &out_idx
, &out_sz
, tr
->to
,
488 strlen(tr
->to
)) < 0) {
492 if (append_str(out
, &out_idx
, &out_sz
, in
+
493 in_idx
, mbret
) < 0) {
502 if (set_position_mapping(out_position_map
, out_position_map_len
,
503 out_idx
, in_len
) < 0) {
507 (*out
)[out_idx
] = '\0';
516 * Read through raw and scannable, checking all wordfilters in
517 * scannable. Where a match is detected, the corresponding position
518 * (via position_map) in raw is replaced by the replacement specified
519 * by the matching wordfilter.
523 * - setup_sanitize_comment() has been invoked more recently than
524 * clean_sanitize_comment().
526 * - raw is memory of length at least raw_len, valid UTF-8 text.
528 * - scannable is memory of length at least scannable_len.
530 * - For any j in [0, scannable_len), position_map[j] is a valid
531 * index into raw, or is (size_t) -1.
533 * - position_map[scannable_len] = raw_len.
535 * - For any j in [0, scannable_len) such that k = position_map[j]
536 * is not (size_t) -1, scannable[j] and raw[k] are conceptually
537 * the same for wordfiltering.
539 * - Overwriting *out shall not cause a memory leak.
541 * - out and out_len are not 0.
543 * Postconditions (success):
545 * - *out is valid, UTF-8 text of length *out_len such that all
546 * non ASCII codepoints (and '<', '>', '&', '"', ''') are
549 * - *out represents raw, except in those sections of scannable
550 * where a wordfilter matched.
552 static int wordfilter_to_html(const char *raw
, const size_t raw_len
, const
553 char *scannable
, const size_t scannable_len
,
554 size_t *position_map
, char **out
,
559 /* These hold the match locations from pcre2 */
560 uint32_t *ov_counts
= 0;
561 PCRE2_SIZE
**ov_ps
= 0;
562 int *num_matches
= 0;
563 pcre2_match_data
**match_data
= 0;
565 size_t scannable_idx
= 0;
567 size_t best_match_pos
= 0;
568 size_t best_match_idx
= 0;
572 if (!(ov_counts
= calloc(wordfilters_num
, sizeof *ov_counts
))) {
573 PERROR_MESSAGE("calloc");
577 if (!(ov_ps
= calloc(wordfilters_num
, sizeof *ov_ps
))) {
578 PERROR_MESSAGE("calloc");
582 if (!(num_matches
= calloc(wordfilters_num
, sizeof *num_matches
))) {
583 PERROR_MESSAGE("calloc");
587 if (!(match_data
= calloc(wordfilters_num
, sizeof *match_data
))) {
588 PERROR_MESSAGE("calloc");
592 /* First scan, before the loop */
593 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
594 if (!(match_data
[j
] = pcre2_match_data_create_from_pattern(
595 wordfilters
[j
].code
, 0))) {
596 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
600 num_matches
[j
] = pcre2_match(wordfilters
[j
].code
,
601 (PCRE2_SPTR
) scannable
,
602 scannable_len
, scannable_idx
, 0,
607 best_match_pos
= (size_t) -1;
608 best_match_idx
= (size_t) -1;
610 /* We've run pcre2_match() on everything. Find the soonest match */
611 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
612 if (!num_matches
[j
]) {
616 ov_ps
[j
] = pcre2_get_ovector_pointer(match_data
[j
]);
618 if (ov_ps
[j
][0] >= scannable_idx
&&
619 ov_ps
[j
][0] < best_match_pos
) {
620 best_match_pos
= ov_ps
[j
][0];
625 if (best_match_idx
== (size_t) -1) {
626 /* No matches. Turn the rest to html boring-like */
627 ret
= to_html(raw
, raw_len
, raw_idx
, out
, out_len
, &out_idx
);
631 /* Figure out where in raw this match starts */
634 while (l
!= (size_t) -1 &&
635 position_map
[l
] == (size_t) -1) {
639 if (l
== (size_t) -1) {
640 ERROR_MESSAGE("Impossible condition in "
641 "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
648 * Now position_map[l] points to the first character in raw
649 * that should be replaced. Fill up to that point.
651 if (position_map
[l
] &&
652 position_map
[l
] > raw_idx
) {
653 if (to_html(raw
, position_map
[l
], raw_idx
, out
, out_len
,
659 /* Put the substituted text in */
660 if (to_html(wordfilters
[best_match_idx
].replacement
,
661 wordfilters
[best_match_idx
].replacement_len
, 0, out
,
668 * Figure out where we should advance to in inputs. Naively,
669 * we want to set scannable_idx to ov_ps[best_match_idx][1]
670 * (the first character in scannable beyond the match).
671 * However, we have to consider the case of
675 * where "foo" -> "baz" is the only transformation. Since
676 * some characters, like "!", are completely ignored by
677 * the scannable transformation, the naive method would
678 * start our scanning at the "b", skipping information.
680 * So, instead, we carefully find the last character in
681 * "foo", then jump one past it. This (unfortunately)
682 * requires a bit more manual fiddling with wide character
686 if (ov_ps
[best_match_idx
][1] <= scannable_idx
) {
688 * This should never happen, but let's make sure
689 * we always keep advancing.
693 scannable_idx
= ov_ps
[best_match_idx
][1] - 1;
698 while (position_map
[l
] == (size_t) -1) {
702 raw_idx
= position_map
[l
];
704 /* This is the "jump one past it" part */
707 mbret
= mbrlen(raw
+ raw_idx
, MB_CUR_MAX
, 0);
712 PERROR_MESSAGE("mbrlen");
719 * Now re-check all our matches and figure out which ones
722 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
723 if (!num_matches
[j
] ||
724 ov_ps
[j
][0] >= scannable_idx
) {
728 num_matches
[j
] = pcre2_match(wordfilters
[j
].code
,
729 (PCRE2_SPTR
) scannable
,
730 scannable_len
, scannable_idx
, 0,
734 goto handle_next_match
;
737 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
738 pcre2_match_data_free(match_data
[j
]);
751 * Read through in. Each time a match for format_replacements
752 * (something like a newline or a quote) is found, replace
753 * it with some HTML markup. The result is placed in out.
757 * - setup_sanitize_comment() has been invoked more recently than
758 * clean_sanitize_comment().
760 * - in is memory of length at least in_len, valid UTF-8 text.
762 * - Overwriting *out shall not cause a memory leak.
764 * - out and out_len are not 0.
766 * Postconditions (success):
768 * - *out is valid, UTF-8 text of length *out_len with sane HTML
769 * markup (and HTML escaped), suitable for outputting into an
772 static int insert_html_tags(const char *in
, size_t in_len
, const char *board
,
773 char **out
, size_t *out_len
)
777 size_t match_pos
= 0;
778 size_t after_match_pos
= 0;
780 pcre2_match_data
*match_data
= 0;
782 PCRE2_UCHAR
*tmp_1
= 0;
783 PCRE2_SIZE tmp_1_len
= 0;
784 PCRE2_UCHAR
*tmp_2
= 0;
785 PCRE2_SIZE tmp_2_len
= 0;
786 PCRE2_UCHAR
*tmp_3
= 0;
787 PCRE2_SIZE tmp_3_len
= 0;
788 uint_fast8_t last_was_newline
= 1;
789 char *link_target
= 0;
790 size_t link_target_len
= 0;
792 if (!(match_data
= pcre2_match_data_create_from_pattern(
793 format_replacements
, 0))) {
794 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
800 if (in_idx
>= in_len
) {
804 nret
= pcre2_match(format_replacements
, (PCRE2_SPTR
) in
, in_len
, in_idx
,
807 if (nret
== PCRE2_ERROR_NOMATCH
) {
808 ret
= append_str(out
, &out_idx
, out_len
, in
+ in_idx
, in_len
-
814 PCRE2_UCHAR8 err_buf
[120];
816 pcre2_get_error_message(nret
, err_buf
, 120);
817 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
818 " (PCRE2 %d)", (int) (in_len
- in_idx
), in
+
824 pcre2_substring_free(tmp_1
);
825 pcre2_substring_free(tmp_2
);
826 pcre2_substring_free(tmp_3
);
833 /* We have match, stuff everything up to it in *out */
834 match_pos
= pcre2_get_ovector_pointer(match_data
)[0];
835 after_match_pos
= pcre2_get_ovector_pointer(match_data
)[1];
837 if (match_pos
> in_idx
) {
838 if (append_str(out
, &out_idx
, out_len
, in
+ in_idx
, match_pos
-
843 last_was_newline
= 0;
847 /* Figure out what type of match. */
848 if (!pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "newline",
849 &tmp_1
, &tmp_1_len
)) {
850 if (last_was_newline
) {
851 if (append_const_str(out
, &out_idx
, out_len
,
852 " <br />") < 0) {
856 if (append_const_str(out
, &out_idx
, out_len
, "<br />") <
862 last_was_newline
= 1;
863 in_idx
= after_match_pos
;
867 last_was_newline
= 0;
869 if (!pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "quote",
870 &tmp_1
, &tmp_1_len
)) {
871 if (append_const_str(out
, &out_idx
, out_len
,
872 "<span class=\"quote\">") < 0) {
876 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
877 (size_t) tmp_1_len
) < 0) {
881 if (append_const_str(out
, &out_idx
, out_len
, "</span>") < 0) {
885 in_idx
= after_match_pos
;
889 if (!pcre2_substring_get_byname(match_data
,
890 (PCRE2_SPTR
) "intra_postlink", &tmp_1
,
892 if (pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "a_num",
893 &tmp_2
, &tmp_2_len
)) {
894 goto problem_with_match
;
899 if (db_construct_post_link(board
, strlen(board
), (const
901 tmp_2_len
, &found
, &link_target
,
902 &link_target_len
) < 0) {
907 if (append_str(out
, &out_idx
, out_len
, in
+ match_pos
,
908 after_match_pos
- match_pos
) < 0) {
912 in_idx
= after_match_pos
;
916 if (append_const_str(out
, &out_idx
, out_len
, "<a href=\"") <
921 if (append_str(out
, &out_idx
, out_len
, link_target
,
922 link_target_len
) < 0) {
926 if (append_const_str(out
, &out_idx
, out_len
, "\">") < 0) {
930 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
931 (size_t) tmp_1_len
) < 0) {
935 if (append_const_str(out
, &out_idx
, out_len
, "</a>") < 0) {
939 in_idx
= after_match_pos
;
943 if (!pcre2_substring_get_byname(match_data
,
944 (PCRE2_SPTR
) "inter_postlink", &tmp_1
,
946 if (pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "e_num",
947 &tmp_2
, &tmp_2_len
)) {
948 goto problem_with_match
;
951 if (pcre2_substring_get_byname(match_data
,
952 (PCRE2_SPTR
) "e_board", &tmp_3
,
954 goto problem_with_match
;
959 if (db_construct_post_link((const char *) tmp_3
, tmp_3_len
,
960 (const char *) tmp_2
, tmp_2_len
,
961 &found
, &link_target
,
962 &link_target_len
) < 0) {
967 if (append_str(out
, &out_idx
, out_len
, in
+ match_pos
,
968 after_match_pos
- match_pos
) < 0) {
972 in_idx
= after_match_pos
;
976 if (append_const_str(out
, &out_idx
, out_len
, "<a href=\"") <
981 if (append_str(out
, &out_idx
, out_len
, link_target
,
982 link_target_len
) < 0) {
986 if (append_const_str(out
, &out_idx
, out_len
, "\">") < 0) {
990 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
991 (size_t) tmp_1_len
) < 0) {
995 if (append_const_str(out
, &out_idx
, out_len
, "</a>") < 0) {
999 in_idx
= after_match_pos
;
1005 /* There was some kind of match, but it went wrong. */
1012 pcre2_substring_free(tmp_1
);
1013 pcre2_substring_free(tmp_2
);
1014 pcre2_substring_free(tmp_3
);
1015 pcre2_match_data_free(match_data
);
1021 * Make sure that the contents of *pc are ready for safe injection
1022 * into the board, including HTML escaping, wordfiltering, general
1023 * formatting, and adding links.
1027 * - setup_sanitize_comment() has been invoked more recently than
1028 * clean_sanitize_comment().
1030 * - *pc has been filled out (fields like action, board, etc. have
1031 * been populated) from the POST data.
1033 * Postconditions (success):
1035 * - The prepared_XYZ fields of *pc have been filled out, and each
1036 * is valid ASCII text, with Unicode codepoints.
1038 int st_sanitize_text(struct post_cmd
*pc
, int *our_fault
)
1042 char *html_escaped_comment
= 0;
1043 size_t html_escaped_comment_len
= 0;
1045 /* Flush out lurking double-free bugs */
1046 free(pc
->prepared
.name
);
1047 pc
->prepared
.name
= 0;
1048 pc
->prepared
.name_len
= 0;
1049 free(pc
->prepared
.email
);
1050 pc
->prepared
.email
= 0;
1051 pc
->prepared
.email_len
= 0;
1052 free(pc
->prepared
.subject
);
1053 pc
->prepared
.subject
= 0;
1054 pc
->prepared
.subject_len
= 0;
1055 free(pc
->prepared
.comment
);
1056 pc
->prepared
.comment
= 0;
1057 pc
->prepared
.comment_len
= 0;
1058 free(pc
->prepared
.file_name
);
1059 pc
->prepared
.file_name
= 0;
1060 pc
->prepared
.file_name_len
= 0;
1061 free(pc
->scannable_comment
);
1062 pc
->scannable_comment
= 0;
1063 pc
->scannable_comment_len
= 0;
1064 free(pc
->position_map
);
1065 pc
->position_map
= 0;
1066 pc
->position_map_len
= 0;
1069 if (!pc
->raw
.name_len
) {
1072 if (!(pc
->raw
.name
= strdup("Anonymous"))) {
1073 PERROR_MESSAGE("strdup");
1077 pc
->raw
.name_len
= strlen(pc
->raw
.name
);
1080 if (pc
->raw
.name_len
) {
1081 if (to_html(pc
->raw
.name
, pc
->raw
.name_len
, 0,
1082 &pc
->prepared
.name
, &pc
->prepared
.name_len
,
1091 if (pc
->raw
.email_len
) {
1092 if (to_html(pc
->raw
.email
, pc
->raw
.email_len
, 0,
1093 &pc
->prepared
.email
, &pc
->prepared
.email_len
,
1102 if (pc
->raw
.tripcode_len
) {
1103 if (to_html(pc
->raw
.tripcode
, pc
->raw
.tripcode_len
, 0,
1104 &pc
->prepared
.tripcode
, &pc
->prepared
.tripcode_len
,
1114 if (pc
->raw
.subject_len
) {
1115 if (to_html(pc
->raw
.subject
, pc
->raw
.subject_len
, 0,
1116 &pc
->prepared
.subject
, &pc
->prepared
.subject_len
,
1126 if (pc
->raw
.file_name_len
) {
1127 if (to_html(pc
->raw
.file_name
, pc
->raw
.file_name_len
, 0,
1128 &pc
->prepared
.file_name
,
1129 &pc
->prepared
.file_name_len
,
1136 if (to_scannable(pc
->raw
.comment
, pc
->raw
.comment_len
,
1137 &pc
->scannable_comment
, &pc
->scannable_comment_len
,
1139 &pc
->position_map_len
)) {
1145 * Now we do the fancy thing. Match scannable, build prepared
1148 if (wordfilter_to_html(pc
->raw
.comment
, pc
->raw
.comment_len
,
1149 pc
->scannable_comment
, pc
->scannable_comment_len
,
1151 &html_escaped_comment
,
1152 &html_escaped_comment_len
) < 0) {
1158 * Everything's in &#123; form, but now take care of >>123,
1161 if (insert_html_tags(html_escaped_comment
, html_escaped_comment_len
,
1162 pc
->raw
.board
, &pc
->prepared
.comment
,
1163 &pc
->prepared
.comment_len
) < 0) {
1170 free(html_escaped_comment
);
1176 * Initialize any static elements needed for this file.
1180 * - setup_sanitize_comment() was not invoked more recently than
1181 * clean_sanitize_comment().
1183 * Postconditions (success):
1185 * - Any other function in this file may be safely called.
1187 int setup_sanitize_comment(const struct configuration
*conf
)
1190 * Check that the locale/libc/whatever is set up so that
1191 * UTF-8 handling can work.
1195 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1196 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1197 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1198 "\u2468\u0294!\u0ce2!!";
1199 const char *correct_html
=
1200 "<script>alert(1)</script> , 𝔑𝔦"
1201 "𝔫𝔞 𝔓𝔲𝔯"
1202 "𝔭𝔩𝔢𝔱𝔬𝔫 &"
1203 " ⑨ʔ!ೢ!!";
1204 const char *correct_scannable
=
1205 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1207 size_t html_len
= 0;
1208 char *scannable
= 0;
1209 size_t scannable_len
= 0;
1210 size_t *position_map
= 0;
1211 size_t position_map_len
= 0;
1214 if (to_html(raw
, strlen(raw
), 0, &html
, &html_len
, &out_idx
) < 0) {
1218 if (strcmp(html
, correct_html
)) {
1219 ERROR_MESSAGE("Was expecting html conversion to yield "
1220 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1221 "\n\n\u00ab%s\u00bb\n\n",
1222 correct_html
, html
);
1226 if (to_scannable(raw
, strlen(raw
), &scannable
, &scannable_len
,
1227 &position_map
, &position_map_len
) < 0) {
1231 if (strcmp(scannable
, correct_scannable
)) {
1232 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1233 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1234 "\n\n\u00ab%s\u00bb\n\n",
1235 correct_scannable
, scannable
);
1239 if (!(wordfilters
= calloc(conf
->wordfilter_inputs_num
,
1240 sizeof *wordfilters
))) {
1241 PERROR_MESSAGE("calloc");
1245 wordfilters_num
= conf
->wordfilter_inputs_num
;
1247 PCRE2_SIZE err_offset
= 0;
1248 PCRE2_UCHAR8 err_buf
[120];
1250 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
1251 wordfilters
[j
].replacement
=
1252 conf
->wordfilter_inputs
[j
].replacement
;
1253 wordfilters
[j
].replacement_len
= strlen(
1254 conf
->wordfilter_inputs
[j
].replacement
);
1256 if ((wordfilters
[j
].code
= pcre2_compile(
1257 (PCRE2_SPTR8
) conf
->wordfilter_inputs
[j
].pattern
,
1258 PCRE2_ZERO_TERMINATED
, PCRE2_UTF
, &err_code
,
1263 pcre2_get_error_message(err_code
, err_buf
, 120);
1264 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1265 conf
->wordfilter_inputs
[j
].pattern
, err_buf
);
1269 const char *format_match_str
=
1272 "(?<newline>\\n)" /* */
1273 "|(?<intra_postlink>>>(?<a_num>[0-9]+))" /* */
1274 "|(?<inter_postlink>>>>/" /* */
1275 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))" /* */
1276 "|(?<quote>(?<![^\n])>[^\n]*)"; /* */
1278 if (!(format_replacements
= pcre2_compile(
1279 (PCRE2_SPTR8
) format_match_str
, PCRE2_ZERO_TERMINATED
,
1281 &err_code
, &err_offset
, 0))) {
1282 pcre2_get_error_message(err_code
, err_buf
, 120);
1283 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1284 format_match_str
, err_buf
);
1298 * Clean up any memory from this file
1300 * Postconditions (success):
1302 * - Valgrind won't report any memory leaks from this file.
1304 * - setup_sanitize_comment() can be safely called again.
1306 int clean_sanitize_comment(void)
1308 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
1309 pcre2_code_free(wordfilters
[j
].code
);
1310 wordfilters
[j
] = (struct wordfilter
) { 0 };
1313 pcre2_code_free(format_replacements
);
1314 format_replacements
= 0;
1317 wordfilters_num
= 0;