2 * Copyright (c) 2017-2020, De Rais <derais@cock.li>
4 * Permission to use, copy, modify, and/or distribute this software for
5 * any purpose with or without fee is hereby granted, provided that the
6 * above copyright notice and this permission notice appear in all
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
10 * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
11 * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
12 * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
13 * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
14 * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
15 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
16 * PERFORMANCE OF THIS SOFTWARE.
26 #define PCRE2_CODE_UNIT_WIDTH 8
31 #include "unicode-transforms.h"
34 * We need a way to get codepoints out of UTF-8 strings and if
35 * wchar_t stored codepoint values, that would be great. That's
36 * __STDC_ISO_10646__, though. You can remove this check and cross
37 * your fingers, since rb79 will do a quick check on startup, but
38 * please check why the C implementation doesn't define
39 * __STDC_ISO_10646__ first.
41 #ifndef __STDC_ISO_10646__
42 #error We really want __STD_ISO_10646__
46 * A wordfilter consists of a pcre2 regex and a replacement string
51 const char *replacement
;
52 size_t replacement_len
;
55 /* These are constructed in setup_sanitize_comment() */
56 static struct wordfilter
*wordfilters
;
57 static size_t wordfilters_num
;
59 /* Special matcher for quoting, newlines, linkifying, etc. */
60 static pcre2_code
*format_replacements
;
63 * Comparison function for struct translate.
67 * - *key_v is a wchar_t.
69 * - *tr_v is a struct translate object.
73 * - Returns -1 (0) [1] if *key_v is below (inside) [above] the
74 * range [from_s, from_t] of *tr_v.
77 match_translate(const void *key_v
, const void *tr_v
)
79 const wchar_t *key
= key_v
;
80 const struct translate
*tr
= tr_v
;
82 if (*key
< tr
->from_s
) {
84 } else if (*key
> tr
->from_t
) {
92 * Add a UTF-8 sequence str onto *buf
96 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
99 * - str is a valid ASCII (not just UTF-8) string of length str_len.
101 * Postconditions (success):
103 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
106 * - The contents of str have been appended to *buf (and *idx
110 append_str(char **buf
, size_t *idx
, size_t *sz
, const char *str
, size_t str_len
)
112 if (str_len
+ *idx
>= *sz
) {
114 size_t new_sz
= str_len
+ *idx
+ (1 << 9);
116 if (str_len
+ *idx
< str_len
||
117 str_len
+ *idx
+ (1 << 9) < str_len
+ *idx
) {
118 ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
124 if (!(newmem
= realloc(*buf
, new_sz
))) {
125 PERROR_MESSAGE("realloc");
134 strncpy(*buf
+ *idx
, str
, str_len
);
135 *(*buf
+ *idx
+ str_len
) = '\0';
141 /* Dummy function for when I can't be bothered to strlen(). */
143 append_const_str(char **buf
, size_t *idx
, size_t *len
, const char *str
)
145 return append_str(buf
, idx
, len
, str
, strlen(str
));
149 * Add a single character onto *buf
153 * - *buf is memory of length *len, and up to *idx is a valid UTF-8
156 * - c is an ASCII character.
158 * Postconditions (success):
160 * - *buf is memory of length *len, and up to *idx is a valid UTF-8
163 * - c has been appended to *buf (and *idx includes this).
166 append_char(char **buf
, size_t *idx
, size_t *len
, char c
)
168 if (1 + *idx
>= *len
) {
170 size_t new_len
= 1 + *idx
+ (1 << 9);
172 if (*idx
+ 1 < *idx
||
173 *idx
+ 1 + (1 << 9) < *idx
+ 1) {
174 ERROR_MESSAGE("overflow (*idx = %zu)", *idx
);
179 if (!(newmem
= realloc(*buf
, new_len
))) {
180 PERROR_MESSAGE("realloc");
190 *(*buf
+ *idx
+ 1) = '\0';
197 * Add a Unicode codepoint onto *buf
201 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
204 * - wc is a valid Unicode codepoint.
206 * Postconditions (success):
208 * - *buf is memory of length *sz, and up to *idx is a valid UTF-8
211 * - An HTML-escaped sequence like &#123; has been appended to
212 * *buf (and *idx includes this).
215 append_wchar_escaped(char **buf
, size_t *idx
, size_t *sz
, wchar_t wc
)
217 size_t l
= snprintf(0, 0, "&#%ld;", (long) wc
);
219 if (l
+ *idx
>= *sz
) {
221 size_t new_sz
= l
+ *idx
+ (1 << 9);
223 if (*idx
+ l
< *idx
||
224 *idx
+ l
+ (1 << 9) < *idx
+ l
) {
225 ERROR_MESSAGE("overflow (*idx = %zu, l = %zu)", *idx
,
231 if (!(newmem
= realloc(*buf
, new_sz
))) {
232 PERROR_MESSAGE("realloc");
241 sprintf(*buf
+ *idx
, "&#%ld;", (long) wc
);
248 * Ensure that (*map)[j] = k, fixing up length as appropriate.
252 * - *map is memory of length *len.
254 * Postconditions (success):
256 * - *map is memory of length *len.
261 set_position_mapping(size_t **map
, size_t *len
, size_t j
, size_t k
)
267 ((j
+ 2) * sizeof **map
) / (j
+ 2) != sizeof **map
) {
268 ERROR_MESSAGE("overflow (j = %zu)", j
);
273 if (!(newmem
= realloc(*map
, (j
+ 2) * sizeof **map
))) {
274 PERROR_MESSAGE("realloc");
281 for (size_t l
= *len
; l
< j
+ 2; ++l
) {
282 (*map
)[l
] = ((size_t) -1);
294 * HTML-escape in to *out.
298 * - in is memory of at least length in_len, valid UTF-8
301 * - *out is memory of at least length *out_len (if *out_len = 0,
302 * *out may be 0), valid UTF-8 text.
304 * - Overwriting *out and *out_len shall not cause a memory leak.
306 * - out, out_len, and out_idx are not 0.
308 * Postconditions (success):
310 * - *out is memory of at least length *out_len, valid UTF-8 text.
312 * - A stretch of HTML-escaped ASCII text representing in has been
313 * added to *out at the position that was *out_idx.
315 * - *out_idx has been updated to point to the end of this stretch.
317 * - If necessary, *out_len has been updated.
320 to_html(const char *in
, const size_t in_len
, size_t in_idx
, char **out
,
321 size_t *out_len
, size_t *out_idx
)
327 size_t initial_out_idx
= *out_idx
;
330 if (!(*out
= malloc(1))) {
331 PERROR_MESSAGE("malloc");
341 * XXX: If you make this multithreaded, be sure to use
344 while (in_idx
< in_len
&&
346 /* Extract next character */
347 mbret
= mbtowc(&wc
, in
+ in_idx
, in_len
- in_idx
);
350 PERROR_MESSAGE("mbtowc");
355 ret
= append_str(out
, out_idx
, &out_sz
, "&", 5);
356 } else if (wc
== L
'"') {
357 ret
= append_str(out
, out_idx
, &out_sz
, """, 6);
358 } else if (wc
== L
'\'') {
359 ret
= append_str(out
, out_idx
, &out_sz
, "'", 6);
360 } else if (wc
== L
'<') {
361 ret
= append_str(out
, out_idx
, &out_sz
, "<", 4);
362 } else if (wc
== L
'>') {
363 ret
= append_str(out
, out_idx
, &out_sz
, ">", 4);
364 } else if (mbret
== 1 &&
367 ret
= append_char(out
, out_idx
, &out_sz
, in
[in_idx
]);
368 } else if (mbret
== 1 &&
369 in
[in_idx
] == '\r') {
371 } else if (mbret
== 1 &&
372 in
[in_idx
] == '\n') {
373 ret
= append_char(out
, out_idx
, &out_sz
, in
[in_idx
]);
375 ret
= append_wchar_escaped(out
, out_idx
, &out_sz
, wc
);
385 *out_len
= *out_len
+ (*out_idx
- initial_out_idx
);
393 * From in construct *out, which is a codepoint-for-codepoint
394 * translation following the rules of unicode-transforms.h. The
395 * result is that *out can be matched with normal regex, even if
396 * in contains obfuscatory Unicode bullshit.
400 * - setup_sanitize_comment() has been invoked more recently than
401 * clean_sanitize_comment().
403 * - in is memory of at least length in_len, valid UTF-8 text.
405 * - Overwriting *out and *out_position_map shall not cause a
408 * - out, out_len, out_position_map, and out_position_map_len are
411 * Postconditions (success):
413 * - *out is valid, UTF-8 text of length *out_len.
415 * - For every j in [0, *out_len) such that (*out)[j] starts a
416 * codepoint, in[(*out_position_map)[j]] is the start of the
417 * corresponding codepoint.
419 * - (*out_position_map)[*out_len] = in_len.
422 to_scannable(const char *in
, size_t in_len
, char **out
, size_t *out_len
,
423 size_t **out_position_map
, size_t *out_position_map_len
)
430 struct translate
*tr
= 0;
434 if (!(*out
= malloc(1))) {
435 PERROR_MESSAGE("malloc");
445 * Position_map is here to make wordfiltering work. Suppose in is
447 * I think Nina Purpleton did
450 * and a wordfilter /Nina Purpleton/i -> "worst girl" is
451 * in effect. Then *out will be
453 * I think Nina Purpleton did nothing wrong
455 * The message should, of course, be filtered to
457 * I think worst girl did nothing
460 * In order to do that, it would be necessary to have a map
461 * from in to *out on the byte level, since the wordfilter
462 * will only be run against *out.
464 * position_map[j] = k means that out[j] and in[k] mean the
467 while (in_idx
< in_len
) {
468 mbret
= mbtowc(&wc
, in
+ in_idx
, in_len
- in_idx
);
471 PERROR_MESSAGE("mbtowc");
475 /* We pre-suppose that the insert will go as planned */
476 if (set_position_mapping(out_position_map
, out_position_map_len
,
477 out_idx
, in_idx
) < 0) {
484 if (append_str(out
, &out_idx
, &out_sz
, in
+ in_idx
, 1) <
489 if ((tr
= bsearch(&wc
, translates
, NUM_OF(translates
),
492 if (append_str(out
, &out_idx
, &out_sz
, tr
->to
,
493 strlen(tr
->to
)) < 0) {
497 if (append_str(out
, &out_idx
, &out_sz
, in
+
498 in_idx
, mbret
) < 0) {
507 if (set_position_mapping(out_position_map
, out_position_map_len
,
508 out_idx
, in_len
) < 0) {
512 (*out
)[out_idx
] = '\0';
521 * Read through raw and scannable, checking all wordfilters in
522 * scannable. Where a match is detected, the corresponding position
523 * (via position_map) in raw is replaced by the replacement specified
524 * by the matching wordfilter.
528 * - setup_sanitize_comment() has been invoked more recently than
529 * clean_sanitize_comment().
531 * - raw is memory of length at least raw_len, valid UTF-8 text.
533 * - scannable is memory of length at least scannable_len.
535 * - For any j in [0, scannable_len), position_map[j] is a valid
536 * index into raw, or is (size_t) -1.
538 * - position_map[scannable_len] = raw_len.
540 * - For any j in [0, scannable_len) such that k = position_map[j]
541 * is not (size_t) -1, scannable[j] and raw[k] are conceptually
542 * the same for wordfiltering.
544 * - Overwriting *out shall not cause a memory leak.
546 * - out and out_len are not 0.
548 * Postconditions (success):
550 * - *out is valid, UTF-8 text of length *out_len such that all
551 * non ASCII codepoints (and '<', '>', '&', '"', ''') are
554 * - *out represents raw, except in those sections of scannable
555 * where a wordfilter matched.
558 wordfilter_to_html(const char *raw
, const size_t raw_len
, const char *scannable
,
559 const size_t scannable_len
, size_t *position_map
, char **out
,
564 /* These hold the match locations from pcre2 */
565 uint32_t *ov_counts
= 0;
566 PCRE2_SIZE
**ov_ps
= 0;
567 int *num_matches
= 0;
568 pcre2_match_data
**match_data
= 0;
570 size_t scannable_idx
= 0;
572 size_t best_match_pos
= 0;
573 size_t best_match_idx
= 0;
577 if (!(ov_counts
= calloc(wordfilters_num
, sizeof *ov_counts
))) {
578 PERROR_MESSAGE("calloc");
582 if (!(ov_ps
= calloc(wordfilters_num
, sizeof *ov_ps
))) {
583 PERROR_MESSAGE("calloc");
587 if (!(num_matches
= calloc(wordfilters_num
, sizeof *num_matches
))) {
588 PERROR_MESSAGE("calloc");
592 if (!(match_data
= calloc(wordfilters_num
, sizeof *match_data
))) {
593 PERROR_MESSAGE("calloc");
597 /* First scan, before the loop */
598 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
599 if (!(match_data
[j
] = pcre2_match_data_create_from_pattern(
600 wordfilters
[j
].code
, 0))) {
601 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
605 num_matches
[j
] = pcre2_match(wordfilters
[j
].code
,
606 (PCRE2_SPTR
) scannable
,
607 scannable_len
, scannable_idx
, 0,
612 best_match_pos
= (size_t) -1;
613 best_match_idx
= (size_t) -1;
615 /* We've run pcre2_match() on everything. Find the soonest match */
616 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
617 if (!num_matches
[j
]) {
621 ov_ps
[j
] = pcre2_get_ovector_pointer(match_data
[j
]);
623 if (ov_ps
[j
][0] >= scannable_idx
&&
624 ov_ps
[j
][0] < best_match_pos
) {
625 best_match_pos
= ov_ps
[j
][0];
630 if (best_match_idx
== (size_t) -1) {
631 /* No matches. Turn the rest to html boring-like */
632 ret
= to_html(raw
, raw_len
, raw_idx
, out
, out_len
, &out_idx
);
636 /* Figure out where in raw this match starts */
639 while (l
!= (size_t) -1 &&
640 position_map
[l
] == (size_t) -1) {
644 if (l
== (size_t) -1) {
645 ERROR_MESSAGE("Impossible condition in "
646 "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
653 * Now position_map[l] points to the first character in raw
654 * that should be replaced. Fill up to that point.
656 if (position_map
[l
] &&
657 position_map
[l
] > raw_idx
) {
658 if (to_html(raw
, position_map
[l
], raw_idx
, out
, out_len
,
664 /* Put the substituted text in */
665 if (to_html(wordfilters
[best_match_idx
].replacement
,
666 wordfilters
[best_match_idx
].replacement_len
, 0, out
,
673 * Figure out where we should advance to in inputs. Naively,
674 * we want to set scannable_idx to ov_ps[best_match_idx][1]
675 * (the first character in scannable beyond the match).
676 * However, we have to consider the case of
680 * where "foo" -> "baz" is the only transformation. Since
681 * some characters, like "!", are completely ignored by
682 * the scannable transformation, the naive method would
683 * start our scanning at the "b", skipping information.
685 * So, instead, we carefully find the last character in
686 * "foo", then jump one past it. This (unfortunately)
687 * requires a bit more manual fiddling with wide character
691 if (ov_ps
[best_match_idx
][1] <= scannable_idx
) {
693 * This should never happen, but let's make sure
694 * we always keep advancing.
698 scannable_idx
= ov_ps
[best_match_idx
][1] - 1;
703 while (position_map
[l
] == (size_t) -1) {
707 raw_idx
= position_map
[l
];
709 /* This is the "jump one past it" part */
712 mbret
= mbrlen(raw
+ raw_idx
, MB_CUR_MAX
, 0);
717 PERROR_MESSAGE("mbrlen");
724 * Now re-check all our matches and figure out which ones
727 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
728 if (!num_matches
[j
] ||
729 ov_ps
[j
][0] >= scannable_idx
) {
733 num_matches
[j
] = pcre2_match(wordfilters
[j
].code
,
734 (PCRE2_SPTR
) scannable
,
735 scannable_len
, scannable_idx
, 0,
739 goto handle_next_match
;
742 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
743 pcre2_match_data_free(match_data
[j
]);
756 * Read through in. Each time a match for format_replacements
757 * (something like a newline or a quote) is found, replace
758 * it with some HTML markup. The result is placed in out.
762 * - setup_sanitize_comment() has been invoked more recently than
763 * clean_sanitize_comment().
765 * - in is memory of length at least in_len, valid UTF-8 text.
767 * - Overwriting *out shall not cause a memory leak.
769 * - out and out_len are not 0.
771 * Postconditions (success):
773 * - *out is valid, UTF-8 text of length *out_len with sane HTML
774 * markup (and HTML escaped), suitable for outputting into an
778 insert_html_tags(const char *in
, size_t in_len
, const char *board
, char **out
,
783 size_t match_pos
= 0;
784 size_t after_match_pos
= 0;
786 pcre2_match_data
*match_data
= 0;
788 PCRE2_UCHAR
*tmp_1
= 0;
789 PCRE2_SIZE tmp_1_len
= 0;
790 PCRE2_UCHAR
*tmp_2
= 0;
791 PCRE2_SIZE tmp_2_len
= 0;
792 PCRE2_UCHAR
*tmp_3
= 0;
793 PCRE2_SIZE tmp_3_len
= 0;
794 uint_fast8_t last_was_newline
= 1;
795 char *link_target
= 0;
796 size_t link_target_len
= 0;
798 if (!(match_data
= pcre2_match_data_create_from_pattern(
799 format_replacements
, 0))) {
800 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
806 if (in_idx
>= in_len
) {
810 nret
= pcre2_match(format_replacements
, (PCRE2_SPTR
) in
, in_len
, in_idx
,
813 if (nret
== PCRE2_ERROR_NOMATCH
) {
814 ret
= append_str(out
, &out_idx
, out_len
, in
+ in_idx
, in_len
-
820 PCRE2_UCHAR8 err_buf
[120];
822 pcre2_get_error_message(nret
, err_buf
, 120);
823 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
824 " (PCRE2 %d)", (int) (in_len
- in_idx
), in
+
830 pcre2_substring_free(tmp_1
);
831 pcre2_substring_free(tmp_2
);
832 pcre2_substring_free(tmp_3
);
839 /* We have match, stuff everything up to it in *out */
840 match_pos
= pcre2_get_ovector_pointer(match_data
)[0];
841 after_match_pos
= pcre2_get_ovector_pointer(match_data
)[1];
843 if (match_pos
> in_idx
) {
844 if (append_str(out
, &out_idx
, out_len
, in
+ in_idx
, match_pos
-
849 last_was_newline
= 0;
853 /* Figure out what type of match. */
854 if (!pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "newline",
855 &tmp_1
, &tmp_1_len
)) {
856 if (last_was_newline
) {
857 if (append_const_str(out
, &out_idx
, out_len
,
858 " <br />") < 0) {
862 if (append_const_str(out
, &out_idx
, out_len
, "<br />") <
868 last_was_newline
= 1;
869 in_idx
= after_match_pos
;
873 last_was_newline
= 0;
875 if (!pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "quote",
876 &tmp_1
, &tmp_1_len
)) {
877 if (append_const_str(out
, &out_idx
, out_len
,
878 "<span class=\"quote\">") < 0) {
882 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
883 (size_t) tmp_1_len
) < 0) {
887 if (append_const_str(out
, &out_idx
, out_len
, "</span>") < 0) {
891 in_idx
= after_match_pos
;
895 if (!pcre2_substring_get_byname(match_data
,
896 (PCRE2_SPTR
) "intra_postlink", &tmp_1
,
898 if (pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "a_num",
899 &tmp_2
, &tmp_2_len
)) {
900 goto problem_with_match
;
905 if (db_construct_post_link(board
, strlen(board
), (const
907 tmp_2_len
, &found
, &link_target
,
908 &link_target_len
) < 0) {
913 if (append_str(out
, &out_idx
, out_len
, in
+ match_pos
,
914 after_match_pos
- match_pos
) < 0) {
918 in_idx
= after_match_pos
;
922 if (append_const_str(out
, &out_idx
, out_len
, "<a href=\"") <
927 if (append_str(out
, &out_idx
, out_len
, link_target
,
928 link_target_len
) < 0) {
932 if (append_const_str(out
, &out_idx
, out_len
, "\">") < 0) {
936 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
937 (size_t) tmp_1_len
) < 0) {
941 if (append_const_str(out
, &out_idx
, out_len
, "</a>") < 0) {
945 in_idx
= after_match_pos
;
949 if (!pcre2_substring_get_byname(match_data
,
950 (PCRE2_SPTR
) "inter_postlink", &tmp_1
,
952 if (pcre2_substring_get_byname(match_data
, (PCRE2_SPTR
) "e_num",
953 &tmp_2
, &tmp_2_len
)) {
954 goto problem_with_match
;
957 if (pcre2_substring_get_byname(match_data
,
958 (PCRE2_SPTR
) "e_board", &tmp_3
,
960 goto problem_with_match
;
965 if (db_construct_post_link((const char *) tmp_3
, tmp_3_len
,
966 (const char *) tmp_2
, tmp_2_len
,
967 &found
, &link_target
,
968 &link_target_len
) < 0) {
973 if (append_str(out
, &out_idx
, out_len
, in
+ match_pos
,
974 after_match_pos
- match_pos
) < 0) {
978 in_idx
= after_match_pos
;
982 if (append_const_str(out
, &out_idx
, out_len
, "<a href=\"") <
987 if (append_str(out
, &out_idx
, out_len
, link_target
,
988 link_target_len
) < 0) {
992 if (append_const_str(out
, &out_idx
, out_len
, "\">") < 0) {
996 if (append_str(out
, &out_idx
, out_len
, (const char *) tmp_1
,
997 (size_t) tmp_1_len
) < 0) {
1001 if (append_const_str(out
, &out_idx
, out_len
, "</a>") < 0) {
1005 in_idx
= after_match_pos
;
1011 /* There was some kind of match, but it went wrong. */
1018 pcre2_substring_free(tmp_1
);
1019 pcre2_substring_free(tmp_2
);
1020 pcre2_substring_free(tmp_3
);
1021 pcre2_match_data_free(match_data
);
1027 * Make sure that the contents of *pc are ready for safe injection
1028 * into the board, including HTML escaping, wordfiltering, general
1029 * formatting, and adding links.
1033 * - setup_sanitize_comment() has been invoked more recently than
1034 * clean_sanitize_comment().
1036 * - *pc has been filled out (fields like action, board, etc. have
1037 * been populated) from the POST data.
1039 * Postconditions (success):
1041 * - The prepared.XYZ fields of *pc have been filled out, and each
1042 * is valid ASCII text, with non-ASCII codepoints HTML-escaped.
1045 st_sanitize_text(struct post_cmd
*pc
, int *our_fault
)
1049 char *html_escaped_comment
= 0;
1050 size_t html_escaped_comment_len
= 0;
1052 /* Flush out lurking double-free bugs */
1053 free(pc
->prepared
.name
);
1054 pc
->prepared
.name
= 0;
1055 pc
->prepared
.name_len
= 0;
1056 free(pc
->prepared
.email
);
1057 pc
->prepared
.email
= 0;
1058 pc
->prepared
.email_len
= 0;
1059 free(pc
->prepared
.subject
);
1060 pc
->prepared
.subject
= 0;
1061 pc
->prepared
.subject_len
= 0;
1062 free(pc
->prepared
.comment
);
1063 pc
->prepared
.comment
= 0;
1064 pc
->prepared
.comment_len
= 0;
1065 free(pc
->prepared
.file_name
);
1066 pc
->prepared
.file_name
= 0;
1067 pc
->prepared
.file_name_len
= 0;
1068 free(pc
->scannable_comment
);
1069 pc
->scannable_comment
= 0;
1070 pc
->scannable_comment_len
= 0;
1071 free(pc
->position_map
);
1072 pc
->position_map
= 0;
1073 pc
->position_map_len
= 0;
1076 if (!pc
->raw
.name_len
) {
1079 if (!(pc
->raw
.name
= strdup("Anonymous"))) {
1080 PERROR_MESSAGE("strdup");
1084 pc
->raw
.name_len
= strlen(pc
->raw
.name
);
1087 if (pc
->raw
.name_len
) {
1088 if (to_html(pc
->raw
.name
, pc
->raw
.name_len
, 0,
1089 &pc
->prepared
.name
, &pc
->prepared
.name_len
,
1098 if (pc
->raw
.email_len
) {
1099 if (to_html(pc
->raw
.email
, pc
->raw
.email_len
, 0,
1100 &pc
->prepared
.email
, &pc
->prepared
.email_len
,
1109 if (pc
->raw
.tripcode_len
) {
1110 if (to_html(pc
->raw
.tripcode
, pc
->raw
.tripcode_len
, 0,
1111 &pc
->prepared
.tripcode
, &pc
->prepared
.tripcode_len
,
1121 if (pc
->raw
.subject_len
) {
1122 if (to_html(pc
->raw
.subject
, pc
->raw
.subject_len
, 0,
1123 &pc
->prepared
.subject
, &pc
->prepared
.subject_len
,
1133 if (pc
->raw
.file_name_len
) {
1134 if (to_html(pc
->raw
.file_name
, pc
->raw
.file_name_len
, 0,
1135 &pc
->prepared
.file_name
,
1136 &pc
->prepared
.file_name_len
,
1143 if (to_scannable(pc
->raw
.comment
, pc
->raw
.comment_len
,
1144 &pc
->scannable_comment
, &pc
->scannable_comment_len
,
1146 &pc
->position_map_len
)) {
1152 * Now we do the fancy thing. Match scannable, build prepared
1155 if (wordfilter_to_html(pc
->raw
.comment
, pc
->raw
.comment_len
,
1156 pc
->scannable_comment
, pc
->scannable_comment_len
,
1158 &html_escaped_comment
,
1159 &html_escaped_comment_len
) < 0) {
1165 * Everything's in &#123; form, but now take care of >>123,
1168 if (insert_html_tags(html_escaped_comment
, html_escaped_comment_len
,
1169 pc
->raw
.board
, &pc
->prepared
.comment
,
1170 &pc
->prepared
.comment_len
) < 0) {
1177 free(html_escaped_comment
);
1183 * Initialize any static elements needed for this file.
1187 * - setup_sanitize_comment() was not invoked more recently than
1188 * clean_sanitize_comment().
1190 * Postconditions (success):
1192 * - Any other function in this file may be safely called.
1195 setup_sanitize_comment(const struct configuration
*conf
)
1198 * Check that the locale/libc/whatever is set up so that
1199 * UTF-8 handling can work.
1203 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1204 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1205 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1206 "\u2468\u0294!\u0ce2!!";
1207 const char *correct_html
=
1208 "<script>alert(1)</script> , 𝔑𝔦"
1209 "𝔫𝔞 𝔓𝔲𝔯"
1210 "𝔭𝔩𝔢𝔱𝔬𝔫 &"
1211 " ⑨ʔ!ೢ!!";
1212 const char *correct_scannable
=
1213 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1215 size_t html_len
= 0;
1216 char *scannable
= 0;
1217 size_t scannable_len
= 0;
1218 size_t *position_map
= 0;
1219 size_t position_map_len
= 0;
1222 if (to_html(raw
, strlen(raw
), 0, &html
, &html_len
, &out_idx
) < 0) {
1226 if (strcmp(html
, correct_html
)) {
1227 ERROR_MESSAGE("Was expecting html conversion to yield "
1228 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1229 "\n\n\u00ab%s\u00bb\n\n",
1230 correct_html
, html
);
1234 if (to_scannable(raw
, strlen(raw
), &scannable
, &scannable_len
,
1235 &position_map
, &position_map_len
) < 0) {
1239 if (strcmp(scannable
, correct_scannable
)) {
1240 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1241 "\n\n\u00ab%s\u00bb\n\nInstead, got "
1242 "\n\n\u00ab%s\u00bb\n\n",
1243 correct_scannable
, scannable
);
1247 if (!(wordfilters
= calloc(conf
->wordfilter_inputs_num
,
1248 sizeof *wordfilters
))) {
1249 PERROR_MESSAGE("calloc");
1253 wordfilters_num
= conf
->wordfilter_inputs_num
;
1255 PCRE2_SIZE err_offset
= 0;
1256 PCRE2_UCHAR8 err_buf
[120];
1258 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
1259 wordfilters
[j
].replacement
=
1260 conf
->wordfilter_inputs
[j
].replacement
;
1261 wordfilters
[j
].replacement_len
= strlen(
1262 conf
->wordfilter_inputs
[j
].replacement
);
1264 if ((wordfilters
[j
].code
= pcre2_compile(
1265 (PCRE2_SPTR8
) conf
->wordfilter_inputs
[j
].pattern
,
1266 PCRE2_ZERO_TERMINATED
, PCRE2_UTF
, &err_code
,
1271 pcre2_get_error_message(err_code
, err_buf
, 120);
1272 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1273 conf
->wordfilter_inputs
[j
].pattern
, err_buf
);
1277 const char *format_match_str
=
1280 "(?<newline>\\n)" /* */
1281 "|(?<intra_postlink>>>(?<a_num>[0-9]+))" /* */
1282 "|(?<inter_postlink>>>>/" /* */
1283 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))" /* */
1284 "|(?<quote>(?<![^\n])>[^\n]*)"; /* */
1286 if (!(format_replacements
= pcre2_compile(
1287 (PCRE2_SPTR8
) format_match_str
, PCRE2_ZERO_TERMINATED
,
1289 &err_code
, &err_offset
, 0))) {
1290 pcre2_get_error_message(err_code
, err_buf
, 120);
1291 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1292 format_match_str
, err_buf
);
1306 * Clean up any memory from this file
1308 * Postconditions (success):
1310 * - Valgrind won't report any memory leaks from this file.
1312 * - setup_sanitize_comment() can be safely called again.
1315 clean_sanitize_comment(void)
1317 for (size_t j
= 0; j
< wordfilters_num
; ++j
) {
1318 pcre2_code_free(wordfilters
[j
].code
);
1319 wordfilters
[j
] = (struct wordfilter
) { 0 };
1322 pcre2_code_free(format_replacements
);
1323 format_replacements
= 0;
1326 wordfilters_num
= 0;