2 * Copyright (c) 2017, De Rais <derais@cock.li>
4 * Permission to use, copy, modify, and/or distribute this software for
5 * any purpose with or without fee is hereby granted, provided that the
6 * above copyright notice and this permission notice appear in all
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
10 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
11 * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
12 * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
13 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
14 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
15 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
16 * PERFORMANCE OF THIS SOFTWARE.
26 #define PCRE2_CODE_UNIT_WIDTH 8
31 #include "unicode-transforms.h"
34 * We need a way to get codepoints out of UTF-8 strings and if
35 * wchar_t stored codepoint values, that would be great. That's
36 * __STDC_ISO_10646__, though. You can remove this check and cross
37 * your fingers, since rb79 will do a quick check on startup, but
38 * please check why the C implementation doesn't define
39 * __STDC_ISO_10646__ first.
41 #ifndef __STDC_ISO_10646__
42 #error We really want __STDC_ISO_10646__
46 * A wordfilter consists of a pcre2 regex and a replacement string
51 const char *replacement
;
52 size_t replacement_len
;
55 /* These are constructed in setup_sanitize_comment() */
56 static struct wordfilter
*wordfilters
;
57 static size_t wordfilters_num
;
59 /* Special matcher for quoting, newlines, linkifying, etc. */
60 static pcre2_code
*format_replacements
;
63 * Comparison function for struct translate.
67 * - *key_v is a wchar_t.
69 * - *tr_v is a struct translate object.
73 * - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
74 * than] *tr_v's starting range.
76 static int match_translate(const void *key_v
, const void *tr_v
)
78 const wchar_t *key
= (const wchar_t *) key_v
;
79 const struct translate
*tr
= (const struct translate
*) tr_v
;
81 if (*key
< tr
->from_s
) {
83 } else if (*key
> tr
->from_t
) {
91 * Add a UTF-8 sequence str onto *buf
95 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
98 * - str is a valid ASCII (not just UTF-8) string of length str_len.
100 * Postconditions (success):
102 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
105 * - The contents of str have been appended to *buf (and *idx
108 static int append_str(char **buf
, size_t *idx
, size_t *sz
, const char *str
,
111 if (str_len
+ *idx
>= *sz
) {
113 size_t new_sz
= str_len
+ *idx
+ (1 << 9);
115 if (!(newmem
= realloc(*buf
, new_sz
))) {
116 PERROR_MESSAGE("realloc");
125 strncpy(*buf
+ *idx
, str
, str_len
);
126 *(*buf
+ *idx
+ str_len
) = '\0';
132 /* Dummy function for when I can't be bothered to strlen(). */
133 static int append_const_str(char **buf
, size_t *idx
, size_t *len
, const
136 return append_str(buf
, idx
, len
, str
, strlen(str
));
140 * Add a single character onto *buf
144 * - *buf is memory of length *len, and up to *idx is a valid UTF-8
147 * - c is an ASCII character.
149 * Postconditions (success):
151 * - *buf is memory of length *len, and up to *idx is a valid UTF-8
154 * - c has been appended to *buf (and *idx includes this).
156 static int append_char(char **buf
, size_t *idx
, size_t *len
, char c
)
158 if (1 + *idx
>= *len
) {
160 size_t new_len
= 1 + *idx
+ (1 << 9);
162 if (!(newmem
= realloc(*buf
, new_len
))) {
163 PERROR_MESSAGE("realloc");
173 *(*buf
+ *idx
+ 1) = '\0';
180 * Add a Unicode codepoint onto *buf
184 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
187 * - wchar_t is a valid Unicode codepoint.
189 * Postconditions (success):
191 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
194 * - An HTML-escaped sequence like &#123; has been appended to
195 * *buf (and *idx includes this).
197 static int append_wchar_escaped(char **buf
, size_t *idx
, size_t *sz
, wchar_t wc
)
199 size_t l
= snprintf(0, 0, "&#%ld;", (long) wc
);
201 if (l
+ *idx
>= *sz
) {
203 size_t new_sz
= l
+ *idx
+ (1 << 9);
205 if (!(newmem
= realloc(*buf
, new_sz
))) {
206 PERROR_MESSAGE("realloc");
215 sprintf(*buf
+ *idx
, "&#%ld;", (long) wc
);
222 * Ensure that (*map)[j] = k, fixing up length as appropriate.
226 * - *map is memory of length len.
228 * Postconditions (success):
230 * - *map is memory of length len.
234 static int set_position_mapping(size_t **map
, size_t *len
, size_t j
, size_t k
)
239 if (!(newmem
= realloc(*map
, (j
+ 2) * sizeof **map
))) {
240 PERROR_MESSAGE("realloc");
247 for (size_t l
= *len
; l
< j
+ 2; ++l
) {
248 (*map
)[l
] = ((size_t) -1);
260 * HTML-escape in to *out.
264 * - in is memory of at least length in_len, valid UTF-8
267 * - *out is memory of at least length *out_len (if *out_len = 0,
268 * *out may be 0), valid UTF-8 text.
270 * - Overwriting *out and *out_len shall not cause a memory leak.
272 * - out, out_len, and out_idx are not 0.
274 * Postconditions (success):
276 * - *out is memory of at least length *out_len, valid UTF-8 text.
278 * - A stretch of HTML-escaped ASCII text representing in has been
279 * added to *out at the position that was *out_idx.
281 * - *out_idx has been updated to point to the end of this stretch.
283 * - If necessary, *out_len has been updated.
285 static int to_html(const char *in
, const size_t in_len
, size_t in_idx
,
286 char **out
, size_t *out_len
, size_t *out_idx
)
292 size_t initial_out_idx
= *out_idx
;
295 if (!(*out
= malloc(1))) {
296 PERROR_MESSAGE("malloc");
306 * XXX: If you make this multithreaded, be sure to use
309 while (in_idx
< in_len
&&
311 /* Extract next character */
312 mbret
= mbtowc(&wc
, in
+ in_idx
, in_len
- in_idx
);
315 PERROR_MESSAGE("mbtowc");
320 ret
= append_str(out
, out_idx
, &out_sz
, "&", 5);
321 } else if (wc
== L
'"') {
322 ret
= append_str(out
, out_idx
, &out_sz
, """, 6);
323 } else if (wc
== L
'\'') {
324 ret
= append_str(out
, out_idx
, &out_sz
, "'", 6);
325 } else if (wc
== L
'<') {
326 ret
= append_str(out
, out_idx
, &out_sz
, "<", 4);
327 } else if (wc
== L
'>') {
328 ret
= append_str(out
, out_idx
, &out_sz
, ">", 4);
329 } else if (mbret
== 1 &&
332 ret
= append_char(out
, out_idx
, &out_sz
, in
[in_idx
]);
333 } else if (mbret
== 1 &&
334 in
[in_idx
] == '\r') {
336 } else if (mbret
== 1 &&
337 in
[in_idx
] == '\n') {
338 ret
= append_char(out
, out_idx
, &out_sz
, in
[in_idx
]);
340 ret
= append_wchar_escaped(out
, out_idx
, &out_sz
, wc
);
350 *out_len
= *out_len
+ (*out_idx
- initial_out_idx
);
358 * From in construct *out, which is a codepoint-for-codepoint
359 * translation following the rules of unicode-transforms.h. The
360 * result is that *out can be matched with normal regex, even if
361 * in contains obfuscatory Unicode bullshit.
365 * - setup_sanitize_comment() has been invoked more recently than
366 * clean_sanitize_comment().
368 * - in is memory of at least length in_len, valid UTF-8 text.
370 * - Overwriting *out and *out_position_map shall not cause a
373 * - out, out_len, out_position_map, and out_position_map_len are
376 * Postconditions (success):
378 * - *out is valid, UTF-8 text of length *out_len.
380 * - For every j in [0, *out_len) such that (*out)[j] starts a
381 * codepoint, in[*(position_map)[j]] is the start of the
382 * corresponding codepoint.
384 * - (*position_map)[*out_len] = in_len.
386 static int to_scannable(const char *in
, size_t in_len
, char **out
,
387 size_t *out_len
, size_t **out_position_map
,
388 size_t *out_position_map_len
)
395 struct translate
*tr
= 0;
399 if (!(*out
= malloc(1))) {
400 PERROR_MESSAGE("malloc");
410 * Position_map is here to make wordfiltering work. Suppose in is
412 * I think Nina Purpleton did
415 * and a wordfilter /Nina Purpleton/i -> "worst girl" is
416 * in effect. Then *out will be
418 * I think Nina Purpleton did nothing wrong
420 * The message should, of course, be filtered to
422 * I think worst girl did nothing
425 * In order to do that, it would be necessary to have a map
426 * from in to *out on the byte level, since the wordfilter
427 * will only be run against *out.
429 * position_map[j] = k means that out[j] and in[k] mean the
432 while (in_idx
< in_len
) {
433 mbret
= mbtowc(&wc
, in
+ in_idx
, in_len
- in_idx
);
436 PERROR_MESSAGE("mbtowc");
440 /* We pre-suppose that the insert will go as planned */
441 if (set_position_mapping(out_position_map
, out_position_map_len
,
442 out_idx
, in_idx
) < 0) {
449 if (append_str(out
, &out_idx
, &out_sz
, in
+ in_idx
, 1) <
454 if ((tr
= bsearch(&wc
, translates
, NUM_OF(translates
),
457 if (append_str(out
, &out_idx
, &out_sz
, tr
->to
,
458 strlen(tr
->to
)) < 0) {
462 if (append_str(out
, &out_idx
, &out_sz
, in
+
463 in_idx
, mbret
) < 0) {
472 if (set_position_mapping(out_position_map
, out_position_map_len
,
473 out_idx
, in_len
) < 0) {
477 (*out
)[out_idx
] = '\0';
486 * Read through raw and scannable, checking all wordfilters in
487 * scannable. Where a match is detected, the corresponding position
488 * (via position_map) in raw is replaced by the replacement specified
489 * by the matching wordfilter.
493 * - setup_sanitize_comment() has been invoked more recently than
494 * clean_sanitize_comment().
496 * - raw is memory of length at least raw_len, valid UTF-8 text.
498 * - scannable is memory of length at least scannable_len.
500 * - For any j in [0, scannable_len), position_map[j] is a valid
501 * index into raw, or is (size_t) -1.
503 * - position_map[scannable_len] = raw_len.
505 * - For any j in [0, scannable_len) such that k = position_map[j]
506 * is not (size_t) -1, scannable[j] and raw[k] are conceptually
507 * the same for wordfiltering.
509 * - Overwriting *out shall not cause a memory leak.
511 * - out and out_len are not 0.
513 * Postconditions (success):
515 * - *out is valid, UTF-8 text of length *out_len such that all
516 * non ASCII codepoints (and '<', '>', '&', '"', ''') are
519 * - *out represents raw, except in those sections of scannable
520 * where a wordfilter matched.
522 static int wordfilter_to_html(const char *raw
, const size_t raw_len
, const
523 char *scannable
, const size_t scannable_len
,
524 size_t *position_map
, char **out
,
529 /* These hold the match locations from pcre2 */
530 uint32_t *ov_counts
= 0;
531 PCRE2_SIZE
**ov_ps
= 0;
532 int *num_matches
= 0;
533 pcre2_match_data
**match_data
= 0;
535 size_t scannable_idx
= 0;
537 size_t best_match_pos
= 0;
538 size_t best_match_idx
= 0;
542 if (!(ov_counts
= calloc(wordfilters_num
, sizeof *ov_counts
))) {
543 PERROR_MESSAGE("calloc");
547 if (!(ov_ps
= calloc(wordfilters_num
, sizeof *ov_ps
))) {
548 PERROR_MESSAGE("calloc");
552 if (!(num_matches
= calloc(wordfilters_num
, sizeof *num_matches
))) {
553 PERROR_MESSAGE("calloc");
557 if (!(match_data
= calloc(wordfilters_num
, sizeof *match_data
))) {
558 PERROR_MESSAGE("calloc");
562 /* First scan, before the loop */
563 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
564 if (!(match_data
[j
] = pcre2_match_data_create_from_pattern(
565 wordfilters
[j
].code
, 0))) {
566 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
570 num_matches
[j
] = pcre2_match(wordfilters
[j
].code
,
571 (PCRE2_SPTR
) scannable
,
572 scannable_len
, scannable_idx
, 0,
577 best_match_pos
= (size_t) -1;
578 best_match_idx
= (size_t) -1;
580 /* We've run pcre2_match() on everything. Find the soonest match */
581 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
582 if (!num_matches
[j
]) {
586 ov_ps
[j
] = pcre2_get_ovector_pointer(match_data
[j
]);
588 if (ov_ps
[j
][0] >= scannable_idx
&&
589 ov_ps
[j
][0] < best_match_pos
) {
590 best_match_pos
= ov_ps
[j
][0];
595 if (best_match_idx
== (size_t) -1) {
596 /* No matches. Turn the rest to html boring-like */
597 ret
= to_html(raw
, raw_len
, raw_idx
, out
, out_len
, &out_idx
);
601 /* Figure out where in raw this match starts */
604 while (l
!= (size_t) -1 &&
605 position_map
[l
] == (size_t) -1) {
609 if (l
== (size_t) -1) {
610 ERROR_MESSAGE("Impossible condition in "
611 "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
618 * Now position_map[l] points to the first character in raw
619 * that should be replaced. Fill up to that point.
621 if (position_map
[l
] &&
622 position_map
[l
] > raw_idx
) {
623 if (to_html(raw
, position_map
[l
], raw_idx
, out
, out_len
,
629 /* Put the substituted text in */
630 if (to_html(wordfilters
[best_match_idx
].replacement
,
631 wordfilters
[best_match_idx
].replacement_len
, 0, out
,
638 * Figure out where we should advance to in inputs. Naively,
639 * we want to set scannable_idx to ov_ps[best_match_idx][1]
640 * (the first character in scannable beyond the match).
641 * However, we have to consider the case of
645 * where "foo" -> "baz" is the only transformation. Since
646 * some characters, like "!", are completely ignored by
647 * the scannable transformation, the naive method would
648 * start our scanning at the "b", skipping information.
650 * So, instead, we carefully find the last character in
651 * "foo", then jump one past it. This (unfortunately)
652 * requires a bit more manual fiddling with wide character
656 if (ov_ps
[best_match_idx
][1] <= scannable_idx
) {
658 * This should never happen, but let's make sure
659 * we always keep advancing.
663 scannable_idx
= ov_ps
[best_match_idx
][1] - 1;
668 while (position_map
[l
] == (size_t) -1) {
672 raw_idx
= position_map
[l
];
674 /* This is the "jump one past it" part */
677 mbret
= mbrlen(raw
+ raw_idx
, MB_CUR_MAX
, 0);
682 PERROR_MESSAGE("mbrlen");
689 * Now re-check all our matches and figure out which ones
692 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
693 if (!num_matches
[j
] ||
694 ov_ps
[j
][0] >= scannable_idx
) {
698 num_matches
[j
] = pcre2_match(wordfilters
[j
].code
,
699 (PCRE2_SPTR
) scannable
,
700 scannable_len
, scannable_idx
, 0,
704 goto handle_next_match
;
707 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
708 pcre2_match_data_free(match_data
[j
]);
721 * Read through in. Each time a match for format_replacements
722 * (something like a newline or a quote) is found, replace
723 * it with some HTML markup. The result is placed in out.
727 * - setup_sanitize_comment() has been invoked more recently than
728 * clean_sanitize_comment().
730 * - in is memory of length at least in_len, valid UTF-8 text.
732 * - Overwriting *out shall not cause a memory leak.
734 * - out and out_len are not 0.
736 * Postconditions (success):
738 * - *out is valid, UTF-8 text of length *out_len with sane HTML
739 * markup (and HTML escaped), suitable for outputting into an
742 static int insert_html_tags(const char *in
, size_t in_len
, const char *board
,
743 char **out
, size_t *out_len
)
747 size_t match_pos
= 0;
748 size_t after_match_pos
= 0;
750 pcre2_match_data
*match_data
= 0;
752 PCRE2_UCHAR
*tmp_1
= 0;
753 PCRE2_SIZE tmp_1_len
= 0;
754 PCRE2_UCHAR
*tmp_2
= 0;
755 PCRE2_SIZE tmp_2_len
= 0;
756 PCRE2_UCHAR
*tmp_3
= 0;
757 PCRE2_SIZE tmp_3_len
= 0;
758 uint_fast8_t last_was_newline
= 1;
759 char *link_target
= 0;
760 size_t link_target_len
= 0;
762 if (!(match_data
= pcre2_match_data_create_from_pattern(
763 format_replacements
, 0))) {
764 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
770 if (in_idx
>= in_len
) {
774 nret
= pcre2_match(format_replacements
, (PCRE2_SPTR
) in
, in_len
, in_idx
,
777 if (nret
== PCRE2_ERROR_NOMATCH
) {
778 ret
= append_str(out
, &out_idx
, out_len
, in
+ in_idx
, in_len
-
784 PCRE2_UCHAR8 err_buf
[120];
786 pcre2_get_error_message(nret
, err_buf
, 120);
787 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
788 " (PCRE2 %d)", (int) (in_len
- in_idx
), in
+
794 pcre2_substring_free(tmp_1
);
795 pcre2_substring_free(tmp_2
);
796 pcre2_substring_free(tmp_3
);
803 /* We have match, stuff everything up to it in *out */
804 match_pos
= pcre2_get_ovector_pointer(match_data
)[0];
805 after_match_pos
= pcre2_get_ovector_pointer(match_data
)[1];
807 if (match_pos
> in_idx
) {
808 if (append_str(out
, &out_idx
, out_len
, in
+ in_idx
, match_pos
-
813 last_was_newline
= 0;
817 /* Figure out what type of match. */
818 if (!pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "newline",
819 &tmp_1
, &tmp_1_len
)) {
820 if (last_was_newline
) {
821 if (append_const_str(out
, &out_idx
, out_len
,
822 " <br />") < 0) {
826 if (append_const_str(out
, &out_idx
, out_len
, "<br />") <
832 last_was_newline
= 1;
833 in_idx
= after_match_pos
;
837 last_was_newline
= 0;
839 if (!pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "quote",
840 &tmp_1
, &tmp_1_len
)) {
841 if (append_const_str(out
, &out_idx
, out_len
,
842 "<span class=\"quote\">") < 0) {
846 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
847 (size_t) tmp_1_len
) < 0) {
851 if (append_const_str(out
, &out_idx
, out_len
, "</span>") < 0) {
855 in_idx
= after_match_pos
;
859 if (!pcre2_substring_get_byname(match_data
,
860 (PCRE2_SPTR
) "intra_postlink", &tmp_1
,
862 if (pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "a_num",
863 &tmp_2
, &tmp_2_len
)) {
864 goto problem_with_match
;
869 if (db_construct_post_link(board
, strlen(board
), (const
871 tmp_2_len
, &found
, &link_target
,
872 &link_target_len
) < 0) {
877 if (append_str(out
, &out_idx
, out_len
, in
+ match_pos
,
878 after_match_pos
- match_pos
) < 0) {
882 in_idx
= after_match_pos
;
886 if (append_const_str(out
, &out_idx
, out_len
, "<a href=\"") <
891 if (append_str(out
, &out_idx
, out_len
, link_target
,
892 link_target_len
) < 0) {
896 if (append_const_str(out
, &out_idx
, out_len
, "\">") < 0) {
900 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
901 (size_t) tmp_1_len
) < 0) {
905 if (append_const_str(out
, &out_idx
, out_len
, "</a>") < 0) {
909 in_idx
= after_match_pos
;
913 if (!pcre2_substring_get_byname(match_data
,
914 (PCRE2_SPTR
) "inter_postlink", &tmp_1
,
916 if (pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "e_num",
917 &tmp_2
, &tmp_2_len
)) {
918 goto problem_with_match
;
921 if (pcre2_substring_get_byname(match_data
,
922 (PCRE2_SPTR
) "e_board", &tmp_3
,
924 goto problem_with_match
;
929 if (db_construct_post_link((const char *) tmp_3
, tmp_3_len
,
930 (const char *) tmp_2
, tmp_2_len
,
931 &found
, &link_target
,
932 &link_target_len
) < 0) {
937 if (append_str(out
, &out_idx
, out_len
, in
+ match_pos
,
938 after_match_pos
- match_pos
) < 0) {
942 in_idx
= after_match_pos
;
946 if (append_const_str(out
, &out_idx
, out_len
, "<a href=\"") <
951 if (append_str(out
, &out_idx
, out_len
, link_target
,
952 link_target_len
) < 0) {
956 if (append_const_str(out
, &out_idx
, out_len
, "\">") < 0) {
960 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
961 (size_t) tmp_1_len
) < 0) {
965 if (append_const_str(out
, &out_idx
, out_len
, "</a>") < 0) {
969 in_idx
= after_match_pos
;
975 /* There was some kind of match, but it went wrong. */
982 pcre2_substring_free(tmp_1
);
983 pcre2_substring_free(tmp_2
);
984 pcre2_substring_free(tmp_3
);
985 pcre2_match_data_free(match_data
);
991 * Make sure that the contents of *pc are ready for safe injection
992 * into the board, including HTML escaping, wordfiltering, general
993 * formatting, and adding links.
997 * - setup_sanitize_comment() has been invoked more recently than
998 * clean_sanitize_comment().
1000 * - *pc has been filled out (fields like action, board, etc. have
1001 * been populated) from the POST data.
1003 * Postconditions (success):
1005 * - The prepared_XYZ fields of *pc have been filled out, and each
1006 * is valid ASCII text, with Unicode codepoints.
1008 int st_sanitize_text(struct post_cmd
*pc
, int *our_fault
)
1012 char *html_escaped_comment
= 0;
1013 size_t html_escaped_comment_len
= 0;
1015 /* Flush out lurking double-free bugs */
1016 free(pc
->prepared
.name
);
1017 pc
->prepared
.name
= 0;
1018 pc
->prepared
.name_len
= 0;
1019 free(pc
->prepared
.email
);
1020 pc
->prepared
.email
= 0;
1021 pc
->prepared
.email_len
= 0;
1022 free(pc
->prepared
.subject
);
1023 pc
->prepared
.subject
= 0;
1024 pc
->prepared
.subject_len
= 0;
1025 free(pc
->prepared
.comment
);
1026 pc
->prepared
.comment
= 0;
1027 pc
->prepared
.comment_len
= 0;
1028 free(pc
->prepared
.file_name
);
1029 pc
->prepared
.file_name
= 0;
1030 pc
->prepared
.file_name_len
= 0;
1031 free(pc
->scannable_comment
);
1032 pc
->scannable_comment
= 0;
1033 pc
->scannable_comment_len
= 0;
1034 free(pc
->position_map
);
1035 pc
->position_map
= 0;
1036 pc
->position_map_len
= 0;
1039 if (!pc
->raw
.name_len
) {
1042 if (!(pc
->raw
.name
= strdup("Anonymous"))) {
1043 PERROR_MESSAGE("strdup");
1047 pc
->raw
.name_len
= strlen(pc
->raw
.name
);
1050 if (pc
->raw
.name_len
) {
1051 if (to_html(pc
->raw
.name
, pc
->raw
.name_len
, 0,
1052 &pc
->prepared
.name
, &pc
->prepared
.name_len
,
1061 if (pc
->raw
.email_len
) {
1062 if (to_html(pc
->raw
.email
, pc
->raw
.email_len
, 0,
1063 &pc
->prepared
.email
, &pc
->prepared
.email_len
,
1072 if (pc
->raw
.tripcode_len
) {
1073 if (to_html(pc
->raw
.tripcode
, pc
->raw
.tripcode_len
, 0,
1074 &pc
->prepared
.tripcode
, &pc
->prepared
.tripcode_len
,
1084 if (pc
->raw
.subject_len
) {
1085 if (to_html(pc
->raw
.subject
, pc
->raw
.subject_len
, 0,
1086 &pc
->prepared
.subject
, &pc
->prepared
.subject_len
,
1096 if (pc
->raw
.file_name_len
) {
1097 if (to_html(pc
->raw
.file_name
, pc
->raw
.file_name_len
, 0,
1098 &pc
->prepared
.file_name
,
1099 &pc
->prepared
.file_name_len
,
1106 if (to_scannable(pc
->raw
.comment
, pc
->raw
.comment_len
,
1107 &pc
->scannable_comment
, &pc
->scannable_comment_len
,
1109 &pc
->position_map_len
)) {
1115 * Now we do the fancy thing. Match scannable, build prepared
1118 if (wordfilter_to_html(pc
->raw
.comment
, pc
->raw
.comment_len
,
1119 pc
->scannable_comment
, pc
->scannable_comment_len
,
1121 &html_escaped_comment
,
1122 &html_escaped_comment_len
) < 0) {
1128 * Everything's in &#123; form, but now take care of >>123,
1131 if (insert_html_tags(html_escaped_comment
, html_escaped_comment_len
,
1132 pc
->raw
.board
, &pc
->prepared
.comment
,
1133 &pc
->prepared
.comment_len
) < 0) {
1140 free(html_escaped_comment
);
1146 * Initialize any static elements needed for this file.
1150 * - setup_sanitize_comment() was not invoked more recently than
1151 * clean_sanitize_comment().
1153 * Postconditions (success):
1155 * - Any other function in this file may be safely called.
1157 int setup_sanitize_comment(const struct configuration
*conf
)
1160 * Check that the locale/libc/whatever is set up so that
1161 * UTF-8 handling can work.
1165 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1166 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1167 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1168 "\u2468\u0294!\u0ce2!!";
1169 const char *correct_html
=
1170 "<script>alert(1)</script> , 𝔑𝔦"
1171 "𝔫𝔞 𝔓𝔲𝔯"
1172 "𝔭𝔩𝔢𝔱𝔬𝔫 &"
1173 " ⑨ʔ!ೢ!!";
1174 const char *correct_scannable
=
1175 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1177 size_t html_len
= 0;
1178 char *scannable
= 0;
1179 size_t scannable_len
= 0;
1180 size_t *position_map
= 0;
1181 size_t position_map_len
= 0;
1184 if (to_html(raw
, strlen(raw
), 0, &html
, &html_len
, &out_idx
) < 0) {
1188 if (strcmp(html
, correct_html
)) {
1189 ERROR_MESSAGE("Was expecting html conversion to yield "
1190 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1191 "\n\n\u00ab%s\u00bb\n\n",
1192 correct_html
, html
);
1196 if (to_scannable(raw
, strlen(raw
), &scannable
, &scannable_len
,
1197 &position_map
, &position_map_len
) < 0) {
1201 if (strcmp(scannable
, correct_scannable
)) {
1202 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1203 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1204 "\n\n\u00ab%s\u00bb\n\n",
1205 correct_scannable
, scannable
);
1209 if (!(wordfilters
= calloc(sizeof *wordfilters
,
1210 conf
->wordfilter_inputs_num
))) {
1211 PERROR_MESSAGE("calloc");
1215 wordfilters_num
= conf
->wordfilter_inputs_num
;
1217 PCRE2_SIZE err_offset
= 0;
1218 PCRE2_UCHAR8 err_buf
[120];
1220 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
1221 wordfilters
[j
].replacement
=
1222 conf
->wordfilter_inputs
[j
].replacement
;
1223 wordfilters
[j
].replacement_len
= strlen(
1224 conf
->wordfilter_inputs
[j
].replacement
);
1226 if ((wordfilters
[j
].code
= pcre2_compile(
1227 (PCRE2_SPTR8
) conf
->wordfilter_inputs
[j
].pattern
,
1228 PCRE2_ZERO_TERMINATED
, PCRE2_UTF
, &err_code
,
1233 pcre2_get_error_message(err_code
, err_buf
, 120);
1234 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1235 conf
->wordfilter_inputs
[j
].pattern
, err_buf
);
1239 const char *format_match_str
=
1242 "(?<newline>\\n)" /* */
1243 "|(?<intra_postlink>>>(?<a_num>[0-9]+))" /* */
1244 "|(?<inter_postlink>>>>/" /* */
1245 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))" /* */
1246 "|(?<quote>(?<![^\n])>[^\n]*)"; /* */
1248 if (!(format_replacements
= pcre2_compile(
1249 (PCRE2_SPTR8
) format_match_str
, PCRE2_ZERO_TERMINATED
,
1251 &err_code
, &err_offset
, 0))) {
1252 pcre2_get_error_message(err_code
, err_buf
, 120);
1253 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1254 format_match_str
, err_buf
);
1268 * Clean up any memory from this file
1270 * Postconditions (success):
1272 * - Valgrind won't report any memory leaks from this file.
1274 * - setup_sanitize_comment() can be safely called again.
1276 int clean_sanitize_comment(void)
1278 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
1279 pcre2_code_free(wordfilters
[j
].code
);
1280 wordfilters
[j
] = (struct wordfilter
) { 0 };
1283 pcre2_code_free(format_replacements
);
1284 format_replacements
= 0;
1287 wordfilters_num
= 0;