sanitize-comment.c

   1 /*
   2  * Copyright (c) 2017-2020, De Rais <derais@cock.li>
   3  *
   4  * Permission to use, copy, modify, and/or distribute this software for
   5  * any purpose with or without fee is hereby granted, provided that the
   6  * above copyright notice and this permission notice appear in all
   7  * copies.
   8  *
   9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
  10  * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
  11  * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
  12  * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
  13  * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
  14  * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
  15  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  16  * PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #include <errno.h>
  19 #include <stdint.h>
  20 #include <stdio.h>
  21 #include <stdlib.h>
  22 #include <string.h>
  23 #include <time.h>
  24 #include <wchar.h>
  25
  26 #define PCRE2_CODE_UNIT_WIDTH 8
  27 #include <pcre2.h>
  28
  29 #include "macros.h"
  30 #include "rb79.h"
  31 #include "unicode-transforms.h"
  32
  33 /*
  34  * We need a way to get codepoints out of UTF-8 strings and if
  35  * wchar_t stored codepoint values, that would be great. That's
  36  * __STDC_ISO_10646__, though. You can remove this check and cross
  37  * your fingers, since rb79 will do a quick check on startup, but
  38  * please check why the C implementation doesn't define
  39  * __STDC_ISO_10646__ first.
  40  */
  41 #ifndef __STDC_ISO_10646__
  42 #error We really want __STD_ISO_10646__
  43 #endif
  44
  45 /*
  46  * A wordfilter consists of a pcre2 regex and a replacement string
  47  */
  48 struct wordfilter {
  49         /* */
  50         pcre2_code *code;
  51         const char *replacement;
  52         size_t replacement_len;
  53 };
  54
  55 /*
  56  * A forbidden consists of a pcre2 regex only
  57  */
  58 struct forbidden {
  59         /* */
  60         pcre2_code *code;
  61         int ban_duration;
  62         const char *ban_reason;
  63 };
  64
  65 /* These are constructed in setup_sanitize_comment() */
  66 static struct wordfilter *wordfilters;
  67 static size_t wordfilters_num;
  68 static struct forbidden *forbiddens;
  69 static size_t forbiddens_num;
  70
  71 /* Special matcher for quoting, newlines, linkifying, etc. */
  72 static pcre2_code *format_replacements;
  73
  74 /*
  75  * Comparison function for struct translate.
  76  *
  77  * Preconditions:
  78  *
  79  *  - *key_v is a wchar_t.
  80  *
  81  *  - *tr_v is a struct translate object.
  82  *
  83  * Postconditions:
  84  *
  85  *  - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
  86  *    than] *tr_v's starting range.
  87  */
  88 static int
  89 match_translate(const void *key_v, const void *tr_v)
  90 {
  91         const wchar_t *key = key_v;
  92         const struct translate *tr = tr_v;
  93
  94         if (*key < tr->from_s) {
  95                 return -1;
  96         } else if (*key > tr->from_t) {
  97                 return 1;
  98         }
  99
 100         return 0;
 101 }
 102
 103 /*
 104  * Add a UTF-8 sequence str onto *buf
 105  *
 106  * Preconditions:
 107  *
 108  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 109  *    string.
 110  *
 111  *  - str is a valid ASCII (not just UTF-8) string of length str_len.
 112  *
 113  * Postconditions (success):
 114  *
 115  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 116  *    string.
 117  *
 118  *  - The contents of str have been appended to *buf (and *idx
 119  *    includes this).
 120  */
 121 static int
 122 append_str(char **buf, size_t *idx, size_t *sz, const char *str, size_t str_len)
 123 {
 124         if (str_len + *idx >= *sz) {
 125                 void *newmem = 0;
 126                 size_t new_sz = str_len + *idx + (1 << 9);
 127
 128                 if (str_len + *idx < str_len ||
 129                     str_len + *idx + (1 << 9) < str_len + *idx) {
 130                         ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
 131                                       str_len, *idx);
 132
 133                         return -1;
 134                 }
 135
 136                 if (!(newmem = realloc(*buf, new_sz))) {
 137                         PERROR_MESSAGE("realloc");
 138
 139                         return -1;
 140                 }
 141
 142                 *buf = newmem;
 143                 *sz = new_sz;
 144         }
 145
 146         strncpy(*buf + *idx, str, str_len);
 147         *(*buf + *idx + str_len) = '\0';
 148         *idx += str_len;
 149
 150         return 0;
 151 }
 152
 153 /* Dummy function for when I can't be bothered to strlen(). */
 154 static int
 155 append_const_str(char **buf, size_t *idx, size_t *len, const char *str)
 156 {
 157         return append_str(buf, idx, len, str, strlen(str));
 158 }
 159
 160 /*
 161  * Add a single character onto *buf
 162  *
 163  * Preconditions:
 164  *
 165  *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 166  *    string.
 167  *
 168  *  - c is an ASCII character.
 169  *
 170  * Postconditions (success):
 171  *
 172  *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 173  *    string.
 174  *
 175  *  - c has been appended to *buf (and *idx includes this).
 176  */
 177 static int
 178 append_char(char **buf, size_t *idx, size_t *len, char c)
 179 {
 180         if (1 + *idx >= *len) {
 181                 void *newmem = 0;
 182                 size_t new_len = 1 + *idx + (1 << 9);
 183
 184                 if (*idx + 1 < *idx ||
 185                     *idx + 1 + (1 << 9) < *idx + 1) {
 186                         ERROR_MESSAGE("overflow (*idx = %zu)", *idx);
 187
 188                         return -1;
 189                 }
 190
 191                 if (!(newmem = realloc(*buf, new_len))) {
 192                         PERROR_MESSAGE("realloc");
 193
 194                         return -1;
 195                 }
 196
 197                 *buf = newmem;
 198                 *len = new_len;
 199         }
 200
 201         *(*buf + *idx) = c;
 202         *(*buf + *idx + 1) = '\0';
 203         *idx += 1;
 204
 205         return 0;
 206 }
 207
 208 /*
 209  * Add a Unicode codepoint onto *buf
 210  *
 211  * Preconditions:
 212  *
 213  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 214  *    string.
 215  *
 216  *  - wchar_t is a valid Unicode codepoint.
 217  *
 218  * Postconditions (success):
 219  *
 220  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 221  *    string.
 222  *
 223  *  - An HTML-escaped sequence like &#123; has been appended to
 224  *    *buf (and *idx includes this).
 225  */
 226 static int
 227 append_wchar_escaped(char **buf, size_t *idx, size_t *sz, wchar_t wc)
 228 {
 229         size_t l = snprintf(0, 0, "&#%ld;", (long) wc);
 230
 231         if (l + *idx >= *sz) {
 232                 void *newmem = 0;
 233                 size_t new_sz = l + *idx + (1 << 9);
 234
 235                 if (*idx + l < *idx ||
 236                     *idx + l + (1 << 9) < *idx + l) {
 237                         ERROR_MESSAGE("overflow (*idx = %zu, l = %zu)", *idx,
 238                                       l);
 239
 240                         return -1;
 241                 }
 242
 243                 if (!(newmem = realloc(*buf, new_sz))) {
 244                         PERROR_MESSAGE("realloc");
 245
 246                         return -1;
 247                 }
 248
 249                 *buf = newmem;
 250                 *sz = new_sz;
 251         }
 252
 253         sprintf(*buf + *idx, "&#%ld;", (long) wc);
 254         *idx += l;
 255
 256         return 0;
 257 }
 258
 259 /*
 260  * Ensure that (*map)[j] = k, fixing up length as appropriate.
 261  *
 262  * Preconditions
 263  *
 264  *  - *map is memory of length len.
 265  *
 266  * Postconditions (success):
 267  *
 268  *  - *map is memory of length len.
 269  *
 270  *  - (*map)[j] = k.
 271  */
 272 static int
 273 set_position_mapping(size_t **map, size_t *len, size_t j, size_t k)
 274 {
 275         if (j + 1 >= *len) {
 276                 void *newmem = 0;
 277
 278                 if (j + 2 < j ||
 279                     ((j + 2) * sizeof **map) / (j + 2) != sizeof **map) {
 280                         ERROR_MESSAGE("overflow (j = %zu)", j);
 281
 282                         return -1;
 283                 }
 284
 285                 if (!(newmem = realloc(*map, (j + 2) * sizeof **map))) {
 286                         PERROR_MESSAGE("realloc");
 287
 288                         return -1;
 289                 }
 290
 291                 *map = newmem;
 292
 293                 for (size_t l = *len; l < j + 2; ++l) {
 294                         (*map)[l] = ((size_t) -1);
 295                 }
 296
 297                 *len = j + 2;
 298         }
 299
 300         (*map)[j] = k;
 301
 302         return 0;
 303 }
 304
 305 /*
 306  * HTML-escape in to *out.
 307  *
 308  * Preconditions
 309  *
 310  *  - in is memory of at least length in_len, valid UTF-8
 311  *    text.
 312  *
 313  *  - *out is memory of at least length *out_len (if *out_len = 0,
 314  *    *out may be 0), valid UTF-8 text.
 315  *
 316  *  - Overwriting *out and *out_len  shall not cause a memory leak.
 317  *
 318  *  - out, out_len, and out_idx are not 0.
 319  *
 320  * Postconditions (success):
 321  *
 322  *  - *out is memory of at least length *out_len, valid UTF-8 text.
 323  *
 324  *  - A stretch of HTML-escaped ASCII text representing in has been
 325  *    added to *out at the position that was *out_idx.
 326  *
 327  *  - *out_idx has been updated to point to the end of this stretch.
 328  *
 329  *  - If necessary, *out_len has been updated.
 330  */
 331 static int
 332 to_html(const char *in, const size_t in_len, size_t in_idx, char **out,
 333         size_t *out_len, size_t *out_idx)
 334 {
 335         int ret = -1;
 336         wchar_t wc = 0;
 337         int mbret = 0;
 338         size_t out_sz = 0;
 339         size_t initial_out_idx = *out_idx;
 340
 341         if (!*out) {
 342                 if (!(*out = malloc(1))) {
 343                         PERROR_MESSAGE("malloc");
 344                         goto done;
 345                 }
 346
 347                 out_sz = 1;
 348                 *out_len = 0;
 349                 (*out)[0] = '\0';
 350         }
 351
 352         /*
 353          * XXX: If you make this multithreaded, be sure to use
 354          * mbrtowc(3) here!
 355          */
 356         while (in_idx < in_len &&
 357                in[in_idx]) {
 358                 /* Extract next character */
 359                 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
 360
 361                 if (mbret == -1) {
 362                         PERROR_MESSAGE("mbtowc");
 363                         goto done;
 364                 }
 365
 366                 if (wc == L'&') {
 367                         ret = append_str(out, out_idx, &out_sz, "&amp;", 5);
 368                 } else if (wc == L'"') {
 369                         ret = append_str(out, out_idx, &out_sz, "&quot;", 6);
 370                 } else if (wc == L'\'') {
 371                         ret = append_str(out, out_idx, &out_sz, "&apos;", 6);
 372                 } else if (wc == L'<') {
 373                         ret = append_str(out, out_idx, &out_sz, "&lt;", 4);
 374                 } else if (wc == L'>') {
 375                         ret = append_str(out, out_idx, &out_sz, "&gt;", 4);
 376                 } else if (mbret == 1 &&
 377                            in[in_idx] >= ' ' &&
 378                            in[in_idx] <= '~') {
 379                         ret = append_char(out, out_idx, &out_sz, in[in_idx]);
 380                 } else if (mbret == 1 &&
 381                            in[in_idx] == '\r') {
 382                         ret = 0;
 383                 } else if (mbret == 1 &&
 384                            in[in_idx] == '\n') {
 385                         ret = append_char(out, out_idx, &out_sz, in[in_idx]);
 386                 } else {
 387                         ret = append_wchar_escaped(out, out_idx, &out_sz, wc);
 388                 }
 389
 390                 in_idx += mbret;
 391
 392                 if (ret < 0) {
 393                         goto done;
 394                 }
 395         }
 396
 397         *out_len = *out_len + (*out_idx - initial_out_idx);
 398         ret = 0;
 399 done:
 400
 401         return ret;
 402 }
 403
 404 /*
 405  * From in construct *out, which is a codepoint-for-codepoint
 406  * translation following the rules of unicode-transforms.h. The
 407  * result is that *out can be matched with normal regex, even if
 408  * in contains obfuscatory Unicode bullshit.
 409  *
 410  * Preconditions
 411  *
 412  *  - setup_sanitize_comment() has been invoked more recently than
 413  *    clean_sanitize_comment().
 414  *
 415  *  - in is memory of at least length in_len, valid UTF-8 text.
 416  *
 417  *  - Overwriting *out and *out_position_map shall not cause a
 418  *    memory leak.
 419  *
 420  *  - out, out_len, out_position_map, and out_position_map_len are
 421  *    not 0.
 422  *
 423  * Postconditions (success):
 424  *
 425  *  - *out is valid, UTF-8 text of length *out_len.
 426  *
 427  *  - For every j in [0, *out_len) such that (*out)[j] starts a
 428  *    codepoint, in[*(position_map)[j]] is the start of the
 429  *    corresponding codepoint.
 430  *
 431  *  - (*position_map)[*out_len] = in_len.
 432  */
 433 static int
 434 to_scannable(const char *in, size_t in_len, char **out, size_t *out_len,
 435              size_t **out_position_map, size_t *out_position_map_len)
 436 {
 437         int ret = -1;
 438         wchar_t wc = 0;
 439         size_t in_idx = 0;
 440         size_t out_idx = 0;
 441         int mbret = 0;
 442         struct translate *tr = 0;
 443         size_t out_sz = 0;
 444
 445         if (!*out) {
 446                 if (!(*out = malloc(1))) {
 447                         PERROR_MESSAGE("malloc");
 448                         goto done;
 449                 }
 450
 451                 out_sz = 1;
 452                 *out_len = 0;
 453                 (*out)[0] = '\0';
 454         }
 455
 456         /*
 457          * Position_map is here to make wordfiltering work. Suppose in is
 458          *
 459          *     Ｉ  ｔｈｉｎｋ  Ｎｉｎａ  Ｐｕｒｐｌｅｔｏｎ  ｄｉｄ
 460          *     ｎｏｔｈｉｎｇ  ｗｒｏｎｇ
 461          *
 462          * and a wordfilter /Nina Purpleton/i -> "worst girl" is
 463          * in effect. Then *out will be
 464          *
 465          *      I think Nina Purpleton did nothing wrong
 466          *
 467          * The message should, of course, be filtered to
 468          *
 469          *     Ｉ  ｔｈｉｎｋ worst girl ｄｉｄ ｎｏｔｈｉｎｇ
 470          *     ｗｒｏｎｇ
 471          *
 472          * In order to do that, it would be necessary to have a map
 473          * from in to *out on the byte level, since the wordfilter
 474          * will only be run against *out.
 475          *
 476          * position_map[j] = k means that out[j] and in[k] mean the
 477          * same thing.
 478          */
 479         while (in_idx < in_len) {
 480                 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
 481
 482                 if (mbret == -1) {
 483                         PERROR_MESSAGE("mbtowc");
 484                         goto done;
 485                 }
 486
 487                 /* We pre-suppose that the insert will go as planned */
 488                 if (set_position_mapping(out_position_map, out_position_map_len,
 489                                          out_idx, in_idx) < 0) {
 490                         goto done;
 491                 }
 492
 493                 if (mbret == 1 &&
 494                     in[in_idx] >= ' ' &&
 495                     in[in_idx] <= '~') {
 496                         if (append_str(out, &out_idx, &out_sz, in + in_idx, 1) <
 497                             0) {
 498                                 goto done;
 499                         }
 500                 } else {
 501                         if ((tr = bsearch(&wc, translates, NUM_OF(translates),
 502                                           sizeof *translates,
 503                                           match_translate))) {
 504                                 if (append_str(out, &out_idx, &out_sz, tr->to,
 505                                                strlen(tr->to)) < 0) {
 506                                         goto done;
 507                                 }
 508                         } else {
 509                                 if (append_str(out, &out_idx, &out_sz, in +
 510                                                in_idx, mbret) < 0) {
 511                                         goto done;
 512                                 }
 513                         }
 514                 }
 515
 516                 in_idx += mbret;
 517         }
 518
 519         if (set_position_mapping(out_position_map, out_position_map_len,
 520                                  out_idx, in_len) < 0) {
 521                 goto done;
 522         }
 523
 524         (*out)[out_idx] = '\0';
 525         *out_len = out_idx;
 526         ret = 0;
 527 done:
 528
 529         return ret;
 530 }
 531
 532 /*
 533  * Read through raw and scannable, checking all forbidden texts in
 534  * scannable. If any match is detected, set *is_forbidden to 1.
 535  *
 536  * Preconditions
 537  *
 538  *  - setup_sanitize_comment() has been invoked more recently than
 539  *    clean_sanitize_comment().
 540  *
 541  *  - scannable is memory of length at least scannable_len.
 542  *
 543  *  - out_is_forbidden, out_ban_duration, out_ban_reason are not 0.
 544  *
 545  * Postconditions (success):
 546  *
 547  *  - if any regex specified by the forbidden array matches scannable,
 548  *    then *out_is_forbidden has been set to 1, with relevant
 549  *    *out_ban_duration, *out_ban_reason.
 550  */
 551 static int
 552 check_forbidden_filters(const char *scannable, const size_t scannable_len,
 553                         uint_fast8_t *out_is_forbidden, int *out_ban_duration,
 554                         const
 555                         char **out_ban_reason)
 556 {
 557         int ret = -1;
 558
 559         /* These hold the match locations from pcre2 */
 560         int num_matches = 0;
 561         pcre2_match_data *match_data = 0;
 562
 563         for (size_t j = 0; j < forbiddens_num; ++j) {
 564                 if (!(match_data = pcre2_match_data_create_from_pattern(
 565                               forbiddens[j].code, 0))) {
 566                         PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
 567                         goto done;
 568                 }
 569
 570                 num_matches = pcre2_match(forbiddens[j].code,
 571                                           (PCRE2_SPTR) scannable, scannable_len,
 572                                           0, 0, match_data, 0);
 573
 574                 if (num_matches > 0) {
 575                         *out_is_forbidden = 1;
 576                         *out_ban_duration = forbiddens[j].ban_duration;
 577                         *out_ban_reason = forbiddens[j].ban_reason;
 578                         j = forbiddens_num;
 579                 }
 580
 581                 pcre2_match_data_free(match_data);
 582                 match_data = 0;
 583         }
 584
 585         ret = 0;
 586 done:
 587
 588         return ret;
 589 }
 590
 591 /*
 592  * Read through raw and scannable, checking all wordfilters in
 593  * scannable. Where a match is detected, the corresponding postion
 594  * (via position_map) in raw is replaced by the replacement specified
 595  * by the matching wordfilter.
 596  *
 597  * Preconditions
 598  *
 599  *  - setup_sanitize_comment() has been invoked more recently than
 600  *    clean_sanitize_comment().
 601  *
 602  *  - raw is memory of length at least raw_len, valid UTF-8 text.
 603  *
 604  *  - scannable is memory of length at least scannable_len.
 605  *
 606  *  - For any j in [0, scannable_len), position_map[j] is a valid
 607  *    index into raw, or is (size_t) -1.
 608  *
 609  *  - position_map[scannable_len] = raw_len.
 610  *
 611  *  - For any j in [0, scannable_len) such that k = position_map[j]
 612  *    is not (size_t) -1, scannable[j] and raw[k] are conceptually
 613  *    the same for wordfiltering.
 614  *
 615  *  - Overwriting *out shall not cause a memory leak.
 616  *
 617  *  - out and out_len are not 0.
 618  *
 619  * Postconditions (success):
 620  *
 621  *  - *out is valid, UTF-8 text of length *out_len such that all
 622  *    non ASCII codepoints (and '<', '>', '&', '"', ''') are
 623  *    HTML-escaped.
 624  *
 625  *  - *out represents raw, except in those sections of scannable
 626  *    where a wordfilter matched.
 627  */
 628 static int
 629 wordfilter_to_html(const char *raw, const size_t raw_len, const char *scannable,
 630                    const size_t scannable_len, size_t *position_map, char **out,
 631                    size_t *out_len)
 632 {
 633         int ret = -1;
 634
 635         /* These hold the match locations from pcre2 */
 636         uint32_t *ov_counts = 0;
 637         PCRE2_SIZE **ov_ps = 0;
 638         int *num_matches = 0;
 639         pcre2_match_data **match_data = 0;
 640         size_t raw_idx = 0;
 641         size_t scannable_idx = 0;
 642         size_t out_idx = 0;
 643         size_t best_match_pos = 0;
 644         size_t best_match_idx = 0;
 645         size_t l = 0;
 646         size_t mbret = 0;
 647
 648         if (!(ov_counts = calloc(wordfilters_num, sizeof *ov_counts))) {
 649                 PERROR_MESSAGE("calloc");
 650                 goto done;
 651         }
 652
 653         if (!(ov_ps = calloc(wordfilters_num, sizeof *ov_ps))) {
 654                 PERROR_MESSAGE("calloc");
 655                 goto done;
 656         }
 657
 658         if (!(num_matches = calloc(wordfilters_num, sizeof *num_matches))) {
 659                 PERROR_MESSAGE("calloc");
 660                 goto done;
 661         }
 662
 663         if (!(match_data = calloc(wordfilters_num, sizeof *match_data))) {
 664                 PERROR_MESSAGE("calloc");
 665                 goto done;
 666         }
 667
 668         /* First scan, before the loop */
 669         for (size_t j = 0; j < wordfilters_num; ++j) {
 670                 if (!(match_data[j] = pcre2_match_data_create_from_pattern(
 671                               wordfilters[j].code, 0))) {
 672                         PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
 673                         goto done;
 674                 }
 675
 676                 num_matches[j] = pcre2_match(wordfilters[j].code,
 677                                              (PCRE2_SPTR) scannable,
 678                                              scannable_len, scannable_idx, 0,
 679                                              match_data[j], 0);
 680         }
 681
 682 handle_next_match:
 683         best_match_pos = (size_t) -1;
 684         best_match_idx = (size_t) -1;
 685
 686         /* We've run pcre2_match() on everything. Find the soonest match */
 687         for (size_t j = 0; j < wordfilters_num; ++j) {
 688                 if (!num_matches[j]) {
 689                         continue;
 690                 }
 691
 692                 ov_ps[j] = pcre2_get_ovector_pointer(match_data[j]);
 693
 694                 if (ov_ps[j][0] >= scannable_idx &&
 695                     ov_ps[j][0] < best_match_pos) {
 696                         best_match_pos = ov_ps[j][0];
 697                         best_match_idx = j;
 698                 }
 699         }
 700
 701         if (best_match_idx == (size_t) -1) {
 702                 /* No matches. Turn the rest to html boring-like */
 703                 ret = to_html(raw, raw_len, raw_idx, out, out_len, &out_idx);
 704                 goto done;
 705         }
 706
 707         /* Figure out where in raw this match starts */
 708         l = best_match_pos;
 709
 710         while (l != (size_t) -1 &&
 711                position_map[l] == (size_t) -1) {
 712                 l--;
 713         }
 714
 715         if (l == (size_t) -1) {
 716                 ERROR_MESSAGE("Impossible condition in "
 717                               "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
 718                               raw,
 719                               best_match_pos);
 720                 goto done;
 721         }
 722
 723         /*
 724          * Now position_map[l] points to the first character in raw
 725          * that should be replaced. Fill up to that point.
 726          */
 727         if (position_map[l] &&
 728             position_map[l] > raw_idx) {
 729                 if (to_html(raw, position_map[l], raw_idx, out, out_len,
 730                             &out_idx) < 0) {
 731                         goto done;
 732                 }
 733         }
 734
 735         /* Put the substituted text in */
 736         if (to_html(wordfilters[best_match_idx].replacement,
 737                     wordfilters[best_match_idx].replacement_len, 0, out,
 738                     out_len,
 739                     &out_idx) < 0) {
 740                 goto done;
 741         }
 742
 743         /*
 744          * Figure out where we should advance to in inputs. Naively,
 745          * we want to set scannable_idx to ov_ps[best_match_idx][1]
 746          * (the first character in scannable beyond the match).
 747          * However, we have to consider the case of
 748          *
 749          *      foo！！！bar
 750          *
 751          * where "foo" -> "baz" is the only transformation. Since
 752          * some characters, like "！", are completely ignored by
 753          * the scannable transformation, the naive method would
 754          * start our scanning at the "b", skipping information.
 755          *
 756          * So, instead, we carefully find the last character in
 757          * "foo", then jump one past it. This (unfortunately)
 758          * requires a bit more manual fiddling with wide character
 759          * conversions.
 760          *
 761          */
 762         if (ov_ps[best_match_idx][1] <= scannable_idx) {
 763                 /*
 764                  * This should never happen, but let's make sure
 765                  * we always keep advancing.
 766                  */
 767                 scannable_idx++;
 768         } else {
 769                 scannable_idx = ov_ps[best_match_idx][1] - 1;
 770         }
 771
 772         l = scannable_idx;
 773
 774         while (position_map[l] == (size_t) -1) {
 775                 l--;
 776         }
 777
 778         raw_idx = position_map[l];
 779
 780         /* This is the "jump one past it" part */
 781         scannable_idx++;
 782         errno = 0;
 783         mbret = mbrlen(raw + raw_idx, MB_CUR_MAX, 0);
 784
 785         switch (mbret) {
 786         case (size_t) -2:
 787         case (size_t) -1:
 788                 PERROR_MESSAGE("mbrlen");
 789                 goto done;
 790         default:
 791                 raw_idx += mbret;
 792         }
 793
 794         /*
 795          * Now re-check all our matches and figure out which ones
 796          * need to be updated
 797          */
 798         for (size_t j = 0; j < wordfilters_num; ++j) {
 799                 if (!num_matches[j] ||
 800                     ov_ps[j][0] >= scannable_idx) {
 801                         continue;
 802                 }
 803
 804                 num_matches[j] = pcre2_match(wordfilters[j].code,
 805                                              (PCRE2_SPTR) scannable,
 806                                              scannable_len, scannable_idx, 0,
 807                                              match_data[j], 0);
 808         }
 809
 810         goto handle_next_match;
 811 done:
 812
 813         for (size_t j = 0; j < wordfilters_num; ++j) {
 814                 pcre2_match_data_free(match_data[j]);
 815                 match_data[j] = 0;
 816         }
 817
 818         free(match_data);
 819         free(num_matches);
 820         free(ov_counts);
 821         free(ov_ps);
 822
 823         return ret;
 824 }
 825
 826 /*
 * Read through in. Each time a match for format_replacements
 * (something like a newline or a quote) is found, replace
 * it with some HTML markup. The result is placed in *out.
 830  *
 831  * Preconditions:
 832  *
 833  *  - setup_sanitize_comment() has been invoked more recently than
 834  *    clean_sanitize_comment().
 835  *
 836  *  - in is memory of length at least in_len, valid UTF-8 text.
 837  *
 838  *  - Overwriting *out shall not cause a memory leak.
 839  *
 840  *  - out and out_len are not 0.
 841  *
 842  * Postconditions (success):
 843  *
 844  *  - *out is valid, UTF-8 text of length *out_len with sane HTML
 845  *    markup (and HTML escaped), suitable for outputting into an
 846  *    HTML file.
 847  */
 848 static int
 849 insert_html_tags(const char *in, size_t in_len, const char *board, char **out,
 850                  size_t *out_len)
 851 {
 852         int ret = -1;
 853         size_t in_idx = 0;
 854         size_t match_pos = 0;
 855         size_t after_match_pos = 0;
 856         size_t out_idx = 0;
 857         pcre2_match_data *match_data = 0;
 858         int nret = 0;
 859         PCRE2_UCHAR *tmp_1 = 0;
 860         PCRE2_SIZE tmp_1_len = 0;
 861         PCRE2_UCHAR *tmp_2 = 0;
 862         PCRE2_SIZE tmp_2_len = 0;
 863         PCRE2_UCHAR *tmp_3 = 0;
 864         PCRE2_SIZE tmp_3_len = 0;
 865         uint_fast8_t last_was_newline = 1;
 866         char *link_target = 0;
 867         size_t link_target_len = 0;
 868
 869         if (!(match_data = pcre2_match_data_create_from_pattern(
 870                       format_replacements, 0))) {
 871                 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
 872                 goto done;
 873         }
 874
 875 find_next_bit:
 876
 877         if (in_idx >= in_len) {
 878                 goto success;
 879         }
 880
 881         nret = pcre2_match(format_replacements, (PCRE2_SPTR) in, in_len, in_idx,
 882                            0, match_data, 0);
 883
 884         if (nret == PCRE2_ERROR_NOMATCH) {
 885                 ret = append_str(out, &out_idx, out_len, in + in_idx, in_len -
 886                                  in_idx);
 887                 goto done;
 888         }
 889
 890         if (nret < 0) {
 891                 PCRE2_UCHAR8 err_buf[120];
 892
 893                 pcre2_get_error_message(nret, err_buf, 120);
 894                 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
 895                               " (PCRE2 %d)", (int) (in_len - in_idx), in +
 896                               in_idx, err_buf,
 897                               nret);
 898                 goto done;
 899         }
 900
 901         pcre2_substring_free(tmp_1);
 902         pcre2_substring_free(tmp_2);
 903         pcre2_substring_free(tmp_3);
 904         free(link_target);
 905         tmp_1 = 0;
 906         tmp_2 = 0;
 907         tmp_3 = 0;
 908         link_target = 0;
 909
 910         /* We have match, stuff everything up to it in *out */
 911         match_pos = pcre2_get_ovector_pointer(match_data)[0];
 912         after_match_pos = pcre2_get_ovector_pointer(match_data)[1];
 913
 914         if (match_pos > in_idx) {
 915                 if (append_str(out, &out_idx, out_len, in + in_idx, match_pos -
 916                                in_idx) < 0) {
 917                         goto done;
 918                 }
 919
 920                 last_was_newline = 0;
 921                 in_idx = match_pos;
 922         }
 923
 924         /* Figure out what type of match. */
 925         if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "newline",
 926                                         &tmp_1, &tmp_1_len)) {
 927                 if (last_was_newline) {
 928                         if (append_const_str(out, &out_idx, out_len,
 929                                              "&nbsp;<br />") < 0) {
 930                                 goto done;
 931                         }
 932                 } else {
 933                         if (append_const_str(out, &out_idx, out_len, "<br />") <
 934                             0) {
 935                                 goto done;
 936                         }
 937                 }
 938
 939                 last_was_newline = 1;
 940                 in_idx = after_match_pos;
 941                 goto find_next_bit;
 942         }
 943
 944         last_was_newline = 0;
 945
 946         if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "quote",
 947                                         &tmp_1, &tmp_1_len)) {
 948                 if (append_const_str(out, &out_idx, out_len,
 949                                      "<span class=\"quote\">") < 0) {
 950                         goto done;
 951                 }
 952
 953                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 954                                (size_t) tmp_1_len) < 0) {
 955                         goto done;
 956                 }
 957
 958                 if (append_const_str(out, &out_idx, out_len, "</span>") < 0) {
 959                         goto done;
 960                 }
 961
 962                 in_idx = after_match_pos;
 963                 goto find_next_bit;
 964         }
 965
 966         if (!pcre2_substring_get_byname(match_data,
 967                                         (PCRE2_SPTR) "intra_postlink", &tmp_1,
 968                                         &tmp_1_len)) {
 969                 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "a_num",
 970                                                &tmp_2, &tmp_2_len)) {
 971                         goto problem_with_match;
 972                 }
 973
 974                 int found = 0;
 975
 976                 if (db_construct_post_link(board, strlen(board), (const
 977                                                                   char *) tmp_2,
 978                                            tmp_2_len, &found, &link_target,
 979                                            &link_target_len) < 0) {
 980                         goto done;
 981                 }
 982
 983                 if (!found) {
 984                         if (append_str(out, &out_idx, out_len, in + match_pos,
 985                                        after_match_pos - match_pos) < 0) {
 986                                 goto done;
 987                         }
 988
 989                         in_idx = after_match_pos;
 990                         goto find_next_bit;
 991                 }
 992
 993                 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
 994                     0) {
 995                         goto done;
 996                 }
 997
 998                 if (append_str(out, &out_idx, out_len, link_target,
 999                                link_target_len) < 0) {
1000                         goto done;
1001                 }
1002
1003                 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
1004                         goto done;
1005                 }
1006
1007                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
1008                                (size_t) tmp_1_len) < 0) {
1009                         goto done;
1010                 }
1011
1012                 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
1013                         goto done;
1014                 }
1015
1016                 in_idx = after_match_pos;
1017                 goto find_next_bit;
1018         }
1019
1020         if (!pcre2_substring_get_byname(match_data,
1021                                         (PCRE2_SPTR) "inter_postlink", &tmp_1,
1022                                         &tmp_1_len)) {
1023                 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "e_num",
1024                                                &tmp_2, &tmp_2_len)) {
1025                         goto problem_with_match;
1026                 }
1027
1028                 if (pcre2_substring_get_byname(match_data,
1029                                                (PCRE2_SPTR) "e_board", &tmp_3,
1030                                                &tmp_3_len)) {
1031                         goto problem_with_match;
1032                 }
1033
1034                 int found = 0;
1035
1036                 if (db_construct_post_link((const char *) tmp_3, tmp_3_len,
1037                                            (const char *) tmp_2, tmp_2_len,
1038                                            &found, &link_target,
1039                                            &link_target_len) < 0) {
1040                         goto done;
1041                 }
1042
1043                 if (!found) {
1044                         if (append_str(out, &out_idx, out_len, in + match_pos,
1045                                        after_match_pos - match_pos) < 0) {
1046                                 goto done;
1047                         }
1048
1049                         in_idx = after_match_pos;
1050                         goto find_next_bit;
1051                 }
1052
1053                 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
1054                     0) {
1055                         goto done;
1056                 }
1057
1058                 if (append_str(out, &out_idx, out_len, link_target,
1059                                link_target_len) < 0) {
1060                         goto done;
1061                 }
1062
1063                 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
1064                         goto done;
1065                 }
1066
1067                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
1068                                (size_t) tmp_1_len) < 0) {
1069                         goto done;
1070                 }
1071
1072                 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
1073                         goto done;
1074                 }
1075
1076                 in_idx = after_match_pos;
1077                 goto find_next_bit;
1078         }
1079
1080 problem_with_match:
1081
1082         /* There was some kind of match, but it went wrong. */
1083         in_idx++;
1084         goto find_next_bit;
1085 success:
1086         ret = 0;
1087 done:
1088         *out_len = out_idx;
1089         pcre2_substring_free(tmp_1);
1090         pcre2_substring_free(tmp_2);
1091         pcre2_substring_free(tmp_3);
1092         pcre2_match_data_free(match_data);
1093
1094         return ret;
1095 }
1096
1097 /*
1098  * Make sure that the contents of *pc are ready for safe injection
1099  * into the board, including HTML escaping, wordfiltering, general
1100  * formatting, and adding links.
1101  *
1102  * Preconditions
1103  *
1104  *  - setup_sanitize_comment() has been invoked more recently than
1105  *    clean_sanitize_comment().
1106  *
1107  *  - *pc has been filled out (fields like action, board, etc. have
1108  *    been populated) from the POST data.
1109  *
1110  * Postconditions (success):
1111  *
1112  *  - The prepared_XYZ fields of *pc have been filled out, and each
1113  *    is valid ASCII text, with Unicode codepoints.
1114  */
1115 int
1116 st_sanitize_text(struct post_cmd *pc, int *our_fault,
1117                  uint_fast8_t *is_forbidden, int *ban_duration, const
1118                  char **ban_reason)
1119 {
1120         int ret = -1;
1121         size_t out_idx = 0;
1122         char *html_escaped_comment = 0;
1123         size_t html_escaped_comment_len = 0;
1124
1125         /* Flush out lurking double-free bugs */
1126         free(pc->prepared.name);
1127         pc->prepared.name = 0;
1128         pc->prepared.name_len = 0;
1129         free(pc->prepared.email);
1130         pc->prepared.email = 0;
1131         pc->prepared.email_len = 0;
1132         free(pc->prepared.subject);
1133         pc->prepared.subject = 0;
1134         pc->prepared.subject_len = 0;
1135         free(pc->prepared.comment);
1136         pc->prepared.comment = 0;
1137         pc->prepared.comment_len = 0;
1138         free(pc->prepared.file_name);
1139         pc->prepared.file_name = 0;
1140         pc->prepared.file_name_len = 0;
1141         free(pc->scannable_comment);
1142         pc->scannable_comment = 0;
1143         pc->scannable_comment_len = 0;
1144         free(pc->comment_position_map);
1145         pc->comment_position_map = 0;
1146         pc->comment_position_map_len = 0;
1147         free(pc->scannable_name);
1148         pc->scannable_name = 0;
1149         pc->scannable_name_len = 0;
1150         free(pc->name_position_map);
1151         pc->name_position_map = 0;
1152         pc->name_position_map_len = 0;
1153         free(pc->scannable_email);
1154         pc->scannable_email = 0;
1155         pc->scannable_email_len = 0;
1156         free(pc->email_position_map);
1157         pc->email_position_map = 0;
1158         pc->email_position_map_len = 0;
1159         free(pc->scannable_subject);
1160         pc->scannable_subject = 0;
1161         pc->scannable_subject_len = 0;
1162         free(pc->subject_position_map);
1163         pc->subject_position_map = 0;
1164         pc->subject_position_map_len = 0;
1165         free(pc->scannable_filename);
1166         pc->scannable_filename = 0;
1167         pc->scannable_filename_len = 0;
1168         free(pc->filename_position_map);
1169         pc->filename_position_map = 0;
1170         pc->filename_position_map_len = 0;
1171         out_idx = 0;
1172
1173         if (!pc->raw.name_len) {
1174                 free(pc->raw.name);
1175
1176                 if (!(pc->raw.name = strdup("Anonymous"))) {
1177                         PERROR_MESSAGE("strdup");
1178                         *our_fault = 1;
1179                         goto done;
1180                 }
1181
1182                 pc->raw.name_len = strlen(pc->raw.name);
1183         }
1184
1185         if (pc->raw.name_len) {
1186                 if (to_html(pc->raw.name, pc->raw.name_len, 0,
1187                             &pc->prepared.name, &pc->prepared.name_len,
1188                             &out_idx) < 0) {
1189                         *our_fault = 1;
1190                         goto done;
1191                 }
1192         }
1193
1194         out_idx = 0;
1195
1196         if (pc->raw.email_len) {
1197                 if (to_html(pc->raw.email, pc->raw.email_len, 0,
1198                             &pc->prepared.email, &pc->prepared.email_len,
1199                             &out_idx) < 0) {
1200                         *our_fault = 1;
1201                         goto done;
1202                 }
1203         }
1204
1205         out_idx = 0;
1206
1207         if (pc->raw.tripcode_len) {
1208                 if (to_html(pc->raw.tripcode, pc->raw.tripcode_len, 0,
1209                             &pc->prepared.tripcode, &pc->prepared.tripcode_len,
1210                             &out_idx) <
1211                     0) {
1212                         *our_fault = 1;
1213                         goto done;
1214                 }
1215         }
1216
1217         out_idx = 0;
1218
1219         if (pc->raw.subject_len) {
1220                 if (to_html(pc->raw.subject, pc->raw.subject_len, 0,
1221                             &pc->prepared.subject, &pc->prepared.subject_len,
1222                             &out_idx) <
1223                     0) {
1224                         *our_fault = 1;
1225                         goto done;
1226                 }
1227         }
1228
1229         out_idx = 0;
1230
1231         if (pc->raw.file_name_len) {
1232                 if (to_html(pc->raw.file_name, pc->raw.file_name_len, 0,
1233                             &pc->prepared.file_name,
1234                             &pc->prepared.file_name_len,
1235                             &out_idx) < 0) {
1236                         *our_fault = 1;
1237                         goto done;
1238                 }
1239         }
1240
1241         if (to_scannable(pc->raw.comment, pc->raw.comment_len,
1242                          &pc->scannable_comment, &pc->scannable_comment_len,
1243                          &pc->comment_position_map,
1244                          &pc->comment_position_map_len)) {
1245                 *our_fault = 1;
1246                 goto done;
1247         }
1248
1249         if (to_scannable(pc->raw.name, pc->raw.name_len, &pc->scannable_name,
1250                          &pc->scannable_name_len, &pc->name_position_map,
1251                          &pc->name_position_map_len)) {
1252                 *our_fault = 1;
1253                 goto done;
1254         }
1255
1256         if (to_scannable(pc->raw.email, pc->raw.email_len, &pc->scannable_email,
1257                          &pc->scannable_email_len, &pc->email_position_map,
1258                          &pc->email_position_map_len)) {
1259                 *our_fault = 1;
1260                 goto done;
1261         }
1262
1263         if (to_scannable(pc->raw.subject, pc->raw.subject_len,
1264                          &pc->scannable_subject, &pc->scannable_subject_len,
1265                          &pc->subject_position_map,
1266                          &pc->subject_position_map_len)) {
1267                 *our_fault = 1;
1268                 goto done;
1269         }
1270
1271         if (to_scannable(pc->raw.file_name, pc->raw.file_name_len,
1272                          &pc->scannable_filename, &pc->scannable_filename_len,
1273                          &pc->filename_position_map,
1274                          &pc->filename_position_map_len)) {
1275                 *our_fault = 1;
1276                 goto done;
1277         }
1278
1279         /*
1280          * Are they a spambot?
1281          */
1282         if (check_forbidden_filters(pc->scannable_comment,
1283                                     pc->scannable_comment_len, is_forbidden,
1284                                     ban_duration, ban_reason) <
1285             0) {
1286                 *our_fault = 1;
1287                 goto done;
1288         }
1289
1290         if (*is_forbidden) {
1291                 goto done;
1292         }
1293
1294         if (check_forbidden_filters(pc->scannable_name, pc->scannable_name_len,
1295                                     is_forbidden, ban_duration, ban_reason) <
1296             0) {
1297                 *our_fault = 1;
1298                 goto done;
1299         }
1300
1301         if (*is_forbidden) {
1302                 goto done;
1303         }
1304
1305         if (check_forbidden_filters(pc->scannable_email,
1306                                     pc->scannable_email_len, is_forbidden,
1307                                     ban_duration, ban_reason) < 0) {
1308                 *our_fault = 1;
1309                 goto done;
1310         }
1311
1312         if (*is_forbidden) {
1313                 goto done;
1314         }
1315
1316         if (check_forbidden_filters(pc->scannable_subject,
1317                                     pc->scannable_subject_len, is_forbidden,
1318                                     ban_duration, ban_reason) <
1319             0) {
1320                 *our_fault = 1;
1321                 goto done;
1322         }
1323
1324         if (*is_forbidden) {
1325                 goto done;
1326         }
1327
1328         if (check_forbidden_filters(pc->scannable_filename,
1329                                     pc->scannable_filename_len, is_forbidden,
1330                                     ban_duration, ban_reason) <
1331             0) {
1332                 *our_fault = 1;
1333                 goto done;
1334         }
1335
1336         if (*is_forbidden) {
1337                 *our_fault = 0;
1338                 goto done;
1339         }
1340
1341         /*
1342          * Now we do the fancy thing. Match scannable, build prepared
1343          * out of that.
1344          */
1345         if (wordfilter_to_html(pc->raw.comment, pc->raw.comment_len,
1346                                pc->scannable_comment, pc->scannable_comment_len,
1347                                pc->comment_position_map, &html_escaped_comment,
1348                                &html_escaped_comment_len) < 0) {
1349                 *our_fault = 1;
1350                 goto done;
1351         }
1352
1353         /*
1354          * Everything's in &#123; form, but now take care of >>123,
1355          * <br />, etc.
1356          */
1357         if (insert_html_tags(html_escaped_comment, html_escaped_comment_len,
1358                              pc->raw.board, &pc->prepared.comment,
1359                              &pc->prepared.comment_len) < 0) {
1360                 *our_fault = 1;
1361                 goto done;
1362         }
1363
1364         ret = 0;
1365 done:
1366         free(html_escaped_comment);
1367
1368         return ret;
1369 }
1370
1371 /*
1372  * Initialize any static elements needed for this file.
1373  *
1374  * Preconditions:
1375  *
1376  *  - setup_sanitize_comment() was not invoked more recently than
1377  *    clean_sanitize_comment().
1378  *
1379  * Postconditions (success):
1380  *
1381  *  - Any other function in this file may be safely called.
1382  */
1383 int
1384 setup_sanitize_comment(const struct configuration *conf)
1385 {
1386         /*
1387          * Check that the locale/libc/whatever is set up so that
1388          * UTF-8 handling can work.
1389          */
1390         int ret = -1;
1391         const char *raw =
1392                 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1393                 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1394                 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1395                 "\u2468\u0294!\u0ce2!!";
1396         const char *correct_html =
1397                 "&lt;script&gt;alert(1)&lt;/script&gt; , &#120081;&#120102;"
1398                 "&#120107;&#120094;&#12288;&#120083;&#120114;&#120111;"
1399                 "&#120109;&#120105;&#120098;&#120113;&#120108;&#120107; &amp;"
1400                 " &#9320;&#660;!&#3298;!!";
1401         const char *correct_scannable =
1402                 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1403         char *html = 0;
1404         size_t html_len = 0;
1405         char *scannable = 0;
1406         size_t scannable_len = 0;
1407         size_t *position_map = 0;
1408         size_t position_map_len = 0;
1409         size_t out_idx = 0;
1410
1411         /* For pcre2_get_error_message */
1412         int err_code = 0;
1413         PCRE2_SIZE err_offset = 0;
1414         PCRE2_UCHAR8 err_buf[120];
1415
1416         if (to_html(raw, strlen(raw), 0, &html, &html_len, &out_idx) < 0) {
1417                 goto done;
1418         }
1419
1420         if (strcmp(html, correct_html)) {
1421                 ERROR_MESSAGE("Was expecting html conversion to yield "
1422                               "\n\n\u00ab%s\u00bb\n\nInstead, got "
1423                               "\n\n\u00ab%s\u00bb\n\n",
1424                               correct_html, html);
1425                 goto done;
1426         }
1427
1428         if (to_scannable(raw, strlen(raw), &scannable, &scannable_len,
1429                          &position_map, &position_map_len) < 0) {
1430                 goto done;
1431         }
1432
1433         if (strcmp(scannable, correct_scannable)) {
1434                 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1435                               "\n\n\u00ab%s\u00bb\n\nInstead, got "
1436                               "\n\n\u00ab%s\u00bb\n\n",
1437                               correct_scannable, scannable);
1438                 goto done;
1439         }
1440
1441         if (!(wordfilters = calloc(conf->wordfilter_inputs_num,
1442                                    sizeof *wordfilters))) {
1443                 PERROR_MESSAGE("calloc");
1444                 goto done;
1445         }
1446
1447         wordfilters_num = conf->wordfilter_inputs_num;
1448
1449         for (size_t j = 0; j < wordfilters_num; ++j) {
1450                 wordfilters[j].replacement =
1451                         conf->wordfilter_inputs[j].replacement;
1452                 wordfilters[j].replacement_len = strlen(
1453                         conf->wordfilter_inputs[j].replacement);
1454
1455                 if ((wordfilters[j].code = pcre2_compile(
1456                              (PCRE2_SPTR8) conf->wordfilter_inputs[j].pattern,
1457                              PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1458                              &err_offset, 0))) {
1459                         continue;
1460                 }
1461
1462                 pcre2_get_error_message(err_code, err_buf, 120);
1463                 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1464                               conf->wordfilter_inputs[j].pattern, err_buf);
1465                 goto done;
1466         }
1467
1468         if (!(forbiddens = calloc(conf->forbidden_inputs_num,
1469                                   sizeof *forbiddens))) {
1470                 PERROR_MESSAGE("calloc");
1471                 goto done;
1472         }
1473
1474         forbiddens_num = conf->forbidden_inputs_num;
1475
1476         for (size_t j = 0; j < forbiddens_num; ++j) {
1477                 forbiddens[j].ban_duration =
1478                         conf->forbidden_inputs[j].ban_duration;
1479                 forbiddens[j].ban_reason = conf->forbidden_inputs[j].ban_reason;
1480
1481                 if ((forbiddens[j].code = pcre2_compile(
1482                              (PCRE2_SPTR8) conf->forbidden_inputs[j].pattern,
1483                              PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1484                              &err_offset, 0))) {
1485                         continue;
1486                 }
1487
1488                 pcre2_get_error_message(err_code, err_buf, 120);
1489                 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1490                               conf->forbidden_inputs[j].pattern, err_buf);
1491                 goto done;
1492         }
1493
1494         const char *format_match_str =
1495
1496                 /* */
1497                 "(?<newline>\\n)"                              /* */
1498                 "|(?<intra_postlink>&gt;&gt;(?<a_num>[0-9]+))" /* */
1499                 "|(?<inter_postlink>&gt;&gt;&gt;/"             /* */
1500                 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))"         /* */
1501                 "|(?<quote>(?<![^\n])&gt;[^\n]*)";             /* */
1502
1503         if (!(format_replacements = pcre2_compile(
1504                       (PCRE2_SPTR8) format_match_str, PCRE2_ZERO_TERMINATED,
1505                       PCRE2_UTF,
1506                       &err_code, &err_offset, 0))) {
1507                 pcre2_get_error_message(err_code, err_buf, 120);
1508                 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1509                               format_match_str, err_buf);
1510                 goto done;
1511         }
1512
1513         ret = 0;
1514 done:
1515         free(html);
1516         free(scannable);
1517         free(position_map);
1518
1519         return ret;
1520 }
1521
1522 /*
1523  * Clean up any memory from this file
1524  *
1525  * Postconditions (success):
1526  *
1527  *  - Valgrind won't report any memory leaks from this file.
1528  *
1529  *  - setup_sanitize_comment() can be safely called again.
1530  */
1531 int
1532 clean_sanitize_comment(void)
1533 {
1534         for (size_t j = 0; j < wordfilters_num; ++j) {
1535                 pcre2_code_free(wordfilters[j].code);
1536                 wordfilters[j] = (struct wordfilter) { 0 };
1537         }
1538
1539         for (size_t j = 0; j < forbiddens_num; ++j) {
1540                 pcre2_code_free(forbiddens[j].code);
1541                 forbiddens[j] = (struct forbidden) { 0 };
1542         }
1543
1544         pcre2_code_free(format_replacements);
1545         format_replacements = 0;
1546         free(wordfilters);
1547         wordfilters = 0;
1548         wordfilters_num = 0;
1549         free(forbiddens);
1550         forbiddens = 0;
1551         forbiddens_num = 0;
1552
1553         return 0;
1554 }