From 0c785d80aaf98eb53182d29581fd7162da2f8c2d Mon Sep 17 00:00:00 2001
From: De Rais
Date: Thu, 28 May 2020 20:46:12 -0400
Subject: [PATCH] server: implement forbidden regex list

For the spammers that try just a little bit harder.
---
 config.def.h              |  10 ++++
 rb79-ban-ip.c             |   2 +
 rb79-delete-post.c        |   2 +
 rb79-moderate-post.c      |   2 +
 rb79-server.c             |   3 ++
 rb79-update-recent-page.c |   2 +
 rb79-view-thread.c        |   2 +
 rb79.h                    |   8 ++++
 sanitize-comment.c        | 119 ++++++++++++++++++++++++++++++++++++++++++++--
 9 files changed, 147 insertions(+), 3 deletions(-)

diff --git a/config.def.h b/config.def.h
index f00a5a8..e1b8a8a 100644
--- a/config.def.h
+++ b/config.def.h
@@ -416,3 +416,13 @@ static const struct wordfilter_input wordfilter_inputs[] = {
         { .pattern = "(?i).*?\\b(smh|fam|tbh|succ|thicc)\\b.*", .replacement =
                   "( \u0361\u00b0 \u035c\u0296 \u0361\u00b0)" },
 };
+
+/*
+ * What are some phrases that can't be posted? As with wordfilters,
+ * pattern will be compiled with pcre2, only with the UTF compatible
+ * option.
+ */
+static const struct forbidden_input forbidden_inputs[] = {
+        /* */
+        { .pattern = "Actually [Ss][Ee][Ee][Dd] Destiny was good" },
+};
diff --git a/rb79-ban-ip.c b/rb79-ban-ip.c
index 3e97023..d21f2f0 100644
--- a/rb79-ban-ip.c
+++ b/rb79-ban-ip.c
@@ -175,6 +175,8 @@ main(int argc, char **argv)
                 .challenges_num = NUM_OF(challenges), /* */
                 .wordfilter_inputs = wordfilter_inputs, /* */
                 .wordfilter_inputs_num = NUM_OF(wordfilter_inputs), /* */
+                .forbidden_inputs = forbidden_inputs, /* */
+                .forbidden_inputs_num = NUM_OF(forbidden_inputs), /* */
         };
 
         /* Interpret board */
diff --git a/rb79-delete-post.c b/rb79-delete-post.c
index d69f630..16c08eb 100644
--- a/rb79-delete-post.c
+++ b/rb79-delete-post.c
@@ -110,6 +110,8 @@ main(int argc, char **argv)
                 .challenges_num = NUM_OF(challenges), /* */
                 .wordfilter_inputs = wordfilter_inputs, /* */
                 .wordfilter_inputs_num = NUM_OF(wordfilter_inputs), /* */
+                .forbidden_inputs = forbidden_inputs, /* */
+                .forbidden_inputs_num = NUM_OF(forbidden_inputs), /* */
         };
 
         /* Interpret board */
diff --git a/rb79-moderate-post.c b/rb79-moderate-post.c
index 66f758c..baadb9a 100644
--- a/rb79-moderate-post.c
+++ b/rb79-moderate-post.c
@@ -152,6 +152,8 @@ main(int argc, char **argv)
                 .challenges_num = NUM_OF(challenges), /* */
                 .wordfilter_inputs = wordfilter_inputs, /* */
                 .wordfilter_inputs_num = NUM_OF(wordfilter_inputs), /* */
+                .forbidden_inputs = forbidden_inputs, /* */
+                .forbidden_inputs_num = NUM_OF(forbidden_inputs), /* */
         };
 
         /* Interpret board */
diff --git a/rb79-server.c b/rb79-server.c
index be6175f..df4855b 100644
--- a/rb79-server.c
+++ b/rb79-server.c
@@ -270,6 +270,7 @@ handle_op_or_reply(struct configuration *conf, FCGX_Request *r, struct
                 }
 
                 LOG("Bad text (400)");
+                LOG("Comment was \"%s\"", UBSAFES(pc->raw.comment));
                 report_bad_request(r, "Disallowed text");
                 goto done;
         }
@@ -762,6 +763,8 @@ main(void)
                 .challenges_num = NUM_OF(challenges), /* */
                 .wordfilter_inputs = wordfilter_inputs, /* */
                 .wordfilter_inputs_num = NUM_OF(wordfilter_inputs), /* */
+                .forbidden_inputs = forbidden_inputs, /* */
+                .forbidden_inputs_num = NUM_OF(forbidden_inputs), /* */
         };
 
         if (preconditions_check(&conf) < 0) {
diff --git a/rb79-update-recent-page.c b/rb79-update-recent-page.c
index 4979189..5d446a2 100644
--- a/rb79-update-recent-page.c
+++ b/rb79-update-recent-page.c
@@ -67,6 +67,8 @@ main(void)
                 .challenges_num = NUM_OF(challenges), /* */
                 .wordfilter_inputs = wordfilter_inputs, /* */
                 .wordfilter_inputs_num = NUM_OF(wordfilter_inputs), /* */
+                .forbidden_inputs = forbidden_inputs, /* */
+                .forbidden_inputs_num = NUM_OF(forbidden_inputs), /* */
         };
 
         /* Set up a minimal part of the system */
diff --git a/rb79-view-thread.c b/rb79-view-thread.c
index efd9fc8..ef941ce 100644
--- a/rb79-view-thread.c
+++ b/rb79-view-thread.c
@@ -182,6 +182,8 @@ main(int argc, char **argv)
                 .challenges_num = NUM_OF(challenges), /* */
                 .wordfilter_inputs = wordfilter_inputs, /* */
                 .wordfilter_inputs_num = NUM_OF(wordfilter_inputs), /* */
+                .forbidden_inputs = forbidden_inputs, /* */
+                .forbidden_inputs_num = NUM_OF(forbidden_inputs), /* */
         };
 
         /* Interpret board */
diff --git a/rb79.h b/rb79.h
index 5fb4aa6..a22af38 100644
--- a/rb79.h
+++ b/rb79.h
@@ -205,6 +205,12 @@ struct wordfilter_input {
         const char *replacement;
 };
 
+/* regex-backed forbidden input */
+struct forbidden_input {
+        /* */
+        const char *pattern;
+};
+
 /* See config.def.h for detailed descriptions. */
 struct configuration {
         /* */
@@ -227,6 +233,8 @@ struct configuration {
         size_t challenges_num;
         const struct wordfilter_input *wordfilter_inputs;
         size_t wordfilter_inputs_num;
+        const struct forbidden_input *forbidden_inputs;
+        size_t forbidden_inputs_num;
 };
 
 /* db_writeback_ZZZ takes a callback. */
diff --git a/sanitize-comment.c b/sanitize-comment.c
index 6346a69..78649fa 100644
--- a/sanitize-comment.c
+++ b/sanitize-comment.c
@@ -52,9 +52,19 @@ struct wordfilter {
         size_t replacement_len;
 };
 
+/*
+ * A forbidden consists of a pcre2 regex only
+ */
+struct forbidden {
+        /* */
+        pcre2_code *code;
+};
+
 /* These are constructed in setup_sanitize_comment() */
 static struct wordfilter *wordfilters;
 static size_t wordfilters_num;
+static struct forbidden *forbiddens;
+static size_t forbiddens_num;
 
 /* Special matcher for quoting, newlines, linkifying, etc. */
 static pcre2_code *format_replacements;
@@ -518,6 +528,60 @@ done:
 }
 
 /*
+ * Read through scannable, checking it against every forbidden regex.
+ * If any match is detected, set *is_forbidden to 1.
+ *
+ * Preconditions
+ *
+ * - setup_sanitize_comment() has been invoked more recently than
+ *   clean_sanitize_comment().
+ *
+ * - scannable is memory of length at least scannable_len.
+ *
+ * - is_forbidden is not 0.
+ *
+ * Postconditions (success):
+ *
+ * - if any regex specified by the forbidden array matches scannable,
+ *   then *is_forbidden has been set to 1.
+ */
+static int
+check_forbidden_filters(const char *scannable, const size_t scannable_len,
+                        uint_fast8_t *is_forbidden)
+{
+        int ret = -1;
+
+        /* These hold the match locations from pcre2 */
+        int num_matches = 0;
+        pcre2_match_data *match_data = 0;
+
+        for (size_t j = 0; j < forbiddens_num; ++j) {
+                if (!(match_data = pcre2_match_data_create_from_pattern(
+                              forbiddens[j].code, 0))) {
+                        PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
+                        goto done;
+                }
+
+                num_matches = pcre2_match(forbiddens[j].code,
+                                          (PCRE2_SPTR) scannable, scannable_len,
+                                          0, 0, match_data, 0);
+                /* Any positive return is a match (1 when no capture groups) */
+                if (num_matches > 0) {
+                        *is_forbidden = 1;
+                        j = forbiddens_num;
+                }
+
+                pcre2_match_data_free(match_data);
+                match_data = 0;
+        }
+
+        ret = 0;
+done:
+
+        return ret;
+}
+
+/*
  * Read through raw and scannable, checking all wordfilters in
  * scannable. Where a match is detected, the corresponding postion
  * (via position_map) in raw is replaced by the replacement specified
@@ -1046,6 +1110,7 @@ st_sanitize_text(struct post_cmd *pc, int *our_fault)
 {
         int ret = -1;
         size_t out_idx = 0;
+        uint_fast8_t is_forbidden = 0;
         char *html_escaped_comment = 0;
         size_t html_escaped_comment_len = 0;
 
@@ -1078,6 +1143,7 @@ st_sanitize_text(struct post_cmd *pc, int *our_fault)
 
                 if (!(pc->raw.name = strdup("Anonymous"))) {
                         PERROR_MESSAGE("strdup");
+                        *our_fault = 1;
                         goto done;
                 }
 
@@ -1149,6 +1215,21 @@ st_sanitize_text(struct post_cmd *pc, int *our_fault)
         }
 
         /*
+         * Are they a spambot?
+         */
+        if (check_forbidden_filters(pc->scannable_comment,
+                                    pc->scannable_comment_len, &is_forbidden) <
+            0) {
+                *our_fault = 1;
+                goto done;
+        }
+
+        if (is_forbidden) {
+                *our_fault = 0;
+                goto done;
+        }
+
+        /*
          * Now we do the fancy thing. Match scannable, build prepared
          * out of that.
          */
@@ -1219,6 +1300,11 @@ setup_sanitize_comment(const struct configuration *conf)
         size_t position_map_len = 0;
         size_t out_idx = 0;
 
+        /* For pcre2_get_error_message */
+        int err_code = 0;
+        PCRE2_SIZE err_offset = 0;
+        PCRE2_UCHAR8 err_buf[120];
+
         if (to_html(raw, strlen(raw), 0, &html, &html_len, &out_idx) < 0) {
                 goto done;
         }
@@ -1251,9 +1337,6 @@ setup_sanitize_comment(const struct configuration *conf)
         }
 
         wordfilters_num = conf->wordfilter_inputs_num;
-        int err_code = 0;
-        PCRE2_SIZE err_offset = 0;
-        PCRE2_UCHAR8 err_buf[120];
 
         for (size_t j = 0; j < wordfilters_num; ++j) {
                 wordfilters[j].replacement =
@@ -1274,6 +1357,28 @@ setup_sanitize_comment(const struct configuration *conf)
                 goto done;
         }
 
+        if (!(forbiddens = calloc(conf->forbidden_inputs_num,
+                                  sizeof *forbiddens))) {
+                PERROR_MESSAGE("calloc");
+                goto done;
+        }
+
+        forbiddens_num = conf->forbidden_inputs_num;
+
+        for (size_t j = 0; j < forbiddens_num; ++j) {
+                if ((forbiddens[j].code = pcre2_compile(
+                             (PCRE2_SPTR8) conf->forbidden_inputs[j].pattern,
+                             PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
+                             &err_offset, 0))) {
+                        continue;
+                }
+
+                pcre2_get_error_message(err_code, err_buf, 120);
+                ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
+                              conf->forbidden_inputs[j].pattern, err_buf);
+                goto done;
+        }
+
         const char *format_match_str =
                 /* */
 
@@ -1319,11 +1424,19 @@ clean_sanitize_comment(void)
                 wordfilters[j] = (struct wordfilter) { 0 };
         }
 
+        for (size_t j = 0; j < forbiddens_num; ++j) {
+                pcre2_code_free(forbiddens[j].code);
+                forbiddens[j] = (struct forbidden) { 0 };
+        }
+
         pcre2_code_free(format_replacements);
         format_replacements = 0;
         free(wordfilters);
         wordfilters = 0;
         wordfilters_num = 0;
+        free(forbiddens);
+        forbiddens = 0;
+        forbiddens_num = 0;
 
         return 0;
 }
-- 
2.11.4.GIT