sanitize-comment.c

   1 /*
   2  * Copyright (c) 2017-2020, De Rais <derais@cock.li>
   3  *
   4  * Permission to use, copy, modify, and/or distribute this software for
   5  * any purpose with or without fee is hereby granted, provided that the
   6  * above copyright notice and this permission notice appear in all
   7  * copies.
   8  *
   9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
  10  * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
  11  * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
  12  * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
  13  * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
  14  * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
  15  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  16  * PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #include <errno.h>
  19 #include <stdint.h>
  20 #include <stdio.h>
  21 #include <stdlib.h>
  22 #include <string.h>
  23 #include <time.h>
  24 #include <wchar.h>
  25
  26 #define PCRE2_CODE_UNIT_WIDTH 8
  27 #include <pcre2.h>
  28
  29 #include "macros.h"
  30 #include "rb79.h"
  31 #include "unicode-transforms.h"
  32
  33 /*
  34  * We need a way to get codepoints out of UTF-8 strings and if
  35  * wchar_t stored codepoint values, that would be great. That's
  36  * __STDC_ISO_10646__, though. You can remove this check and cross
  37  * your fingers, since rb79 will do a quick check on startup, but
  38  * please check why the C implementation doesn't define
  39  * __STDC_ISO_10646__ first.
  40  */
  41 #ifndef __STDC_ISO_10646__
  42 #error We really want __STD_ISO_10646__
  43 #endif
  44
  45 /*
  46  * A wordfilter consists of a pcre2 regex and a replacement string
  47  */
  48 struct wordfilter {
  49         /* */
  50         pcre2_code *code;
  51         const char *replacement;
  52         size_t replacement_len;
  53 };
  54
  55 /* These are constructed in setup_sanitize_comment() */
  56 static struct wordfilter *wordfilters;
  57 static size_t wordfilters_num;
  58
  59 /* Special matcher for quoting, newlines, linkifying, etc. */
  60 static pcre2_code *format_replacements;
  61
  62 /*
  63  * Comparison function for struct translate.
  64  *
  65  * Preconditions:
  66  *
  67  *  - *key_v is a wchar_t.
  68  *
  69  *  - *tr_v is a struct translate object.
  70  *
  71  * Postconditions:
  72  *
  73  *  - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
  74  *    than] *tr_v's starting range.
  75  */
  76 static int
  77 match_translate(const void *key_v, const void *tr_v)
  78 {
  79         const wchar_t *key = key_v;
  80         const struct translate *tr = tr_v;
  81
  82         if (*key < tr->from_s) {
  83                 return -1;
  84         } else if (*key > tr->from_t) {
  85                 return 1;
  86         }
  87
  88         return 0;
  89 }
  90
  91 /*
  92  * Add a UTF-8 sequence str onto *buf
  93  *
  94  * Preconditions:
  95  *
  96  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
  97  *    string.
  98  *
  99  *  - str is a valid ASCII (not just UTF-8) string of length str_len.
 100  *
 101  * Postconditions (success):
 102  *
 103  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 104  *    string.
 105  *
 106  *  - The contents of str have been appended to *buf (and *idx
 107  *    includes this).
 108  */
 109 static int
 110 append_str(char **buf, size_t *idx, size_t *sz, const char *str, size_t str_len)
 111 {
 112         if (str_len + *idx >= *sz) {
 113                 void *newmem = 0;
 114                 size_t new_sz = str_len + *idx + (1 << 9);
 115
 116                 if (str_len + *idx < str_len ||
 117                     str_len + *idx + (1 << 9) < str_len + *idx) {
 118                         ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
 119                                       str_len, *idx);
 120
 121                         return -1;
 122                 }
 123
 124                 if (!(newmem = realloc(*buf, new_sz))) {
 125                         PERROR_MESSAGE("realloc");
 126
 127                         return -1;
 128                 }
 129
 130                 *buf = newmem;
 131                 *sz = new_sz;
 132         }
 133
 134         strncpy(*buf + *idx, str, str_len);
 135         *(*buf + *idx + str_len) = '\0';
 136         *idx += str_len;
 137
 138         return 0;
 139 }
 140
 141 /* Dummy function for when I can't be bothered to strlen(). */
 142 static int
 143 append_const_str(char **buf, size_t *idx, size_t *len, const char *str)
 144 {
 145         return append_str(buf, idx, len, str, strlen(str));
 146 }
 147
 148 /*
 149  * Add a single character onto *buf
 150  *
 151  * Preconditions:
 152  *
 153  *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 154  *    string.
 155  *
 156  *  - c is an ASCII character.
 157  *
 158  * Postconditions (success):
 159  *
 160  *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 161  *    string.
 162  *
 163  *  - c has been appended to *buf (and *idx includes this).
 164  */
 165 static int
 166 append_char(char **buf, size_t *idx, size_t *len, char c)
 167 {
 168         if (1 + *idx >= *len) {
 169                 void *newmem = 0;
 170                 size_t new_len = 1 + *idx + (1 << 9);
 171
 172                 if (*idx + 1 < *idx ||
 173                     *idx + 1 + (1 << 9) < *idx + 1) {
 174                         ERROR_MESSAGE("overflow (*idx = %zu)", *idx);
 175
 176                         return -1;
 177                 }
 178
 179                 if (!(newmem = realloc(*buf, new_len))) {
 180                         PERROR_MESSAGE("realloc");
 181
 182                         return -1;
 183                 }
 184
 185                 *buf = newmem;
 186                 *len = new_len;
 187         }
 188
 189         *(*buf + *idx) = c;
 190         *(*buf + *idx + 1) = '\0';
 191         *idx += 1;
 192
 193         return 0;
 194 }
 195
 196 /*
 197  * Add a Unicode codepoint onto *buf
 198  *
 199  * Preconditions:
 200  *
 201  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 202  *    string.
 203  *
 204  *  - wchar_t is a valid Unicode codepoint.
 205  *
 206  * Postconditions (success):
 207  *
 208  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 209  *    string.
 210  *
 211  *  - An HTML-escaped sequence like &#123; has been appended to
 212  *    *buf (and *idx includes this).
 213  */
 214 static int
 215 append_wchar_escaped(char **buf, size_t *idx, size_t *sz, wchar_t wc)
 216 {
 217         size_t l = snprintf(0, 0, "&#%ld;", (long) wc);
 218
 219         if (l + *idx >= *sz) {
 220                 void *newmem = 0;
 221                 size_t new_sz = l + *idx + (1 << 9);
 222
 223                 if (*idx + l < *idx ||
 224                     *idx + l + (1 << 9) < *idx + l) {
 225                         ERROR_MESSAGE("overflow (*idx = %zu, l = %zu)", *idx,
 226                                       l);
 227
 228                         return -1;
 229                 }
 230
 231                 if (!(newmem = realloc(*buf, new_sz))) {
 232                         PERROR_MESSAGE("realloc");
 233
 234                         return -1;
 235                 }
 236
 237                 *buf = newmem;
 238                 *sz = new_sz;
 239         }
 240
 241         sprintf(*buf + *idx, "&#%ld;", (long) wc);
 242         *idx += l;
 243
 244         return 0;
 245 }
 246
 247 /*
 248  * Ensure that (*map)[j] = k, fixing up length as appropriate.
 249  *
 250  * Preconditions
 251  *
 252  *  - *map is memory of length len.
 253  *
 254  * Postconditions (success):
 255  *
 256  *  - *map is memory of length len.
 257  *
 258  *  - (*map)[j] = k.
 259  */
 260 static int
 261 set_position_mapping(size_t **map, size_t *len, size_t j, size_t k)
 262 {
 263         if (j + 1 >= *len) {
 264                 void *newmem = 0;
 265
 266                 if (j + 2 < j ||
 267                     ((j + 2) * sizeof **map) / (j + 2) != sizeof **map) {
 268                         ERROR_MESSAGE("overflow (j = %zu)", j);
 269
 270                         return -1;
 271                 }
 272
 273                 if (!(newmem = realloc(*map, (j + 2) * sizeof **map))) {
 274                         PERROR_MESSAGE("realloc");
 275
 276                         return -1;
 277                 }
 278
 279                 *map = newmem;
 280
 281                 for (size_t l = *len; l < j + 2; ++l) {
 282                         (*map)[l] = ((size_t) -1);
 283                 }
 284
 285                 *len = j + 2;
 286         }
 287
 288         (*map)[j] = k;
 289
 290         return 0;
 291 }
 292
 293 /*
 294  * HTML-escape in to *out.
 295  *
 296  * Preconditions
 297  *
 298  *  - in is memory of at least length in_len, valid UTF-8
 299  *    text.
 300  *
 301  *  - *out is memory of at least length *out_len (if *out_len = 0,
 302  *    *out may be 0), valid UTF-8 text.
 303  *
 304  *  - Overwriting *out and *out_len  shall not cause a memory leak.
 305  *
 306  *  - out, out_len, and out_idx are not 0.
 307  *
 308  * Postconditions (success):
 309  *
 310  *  - *out is memory of at least length *out_len, valid UTF-8 text.
 311  *
 312  *  - A stretch of HTML-escaped ASCII text representing in has been
 313  *    added to *out at the position that was *out_idx.
 314  *
 315  *  - *out_idx has been updated to point to the end of this stretch.
 316  *
 317  *  - If necessary, *out_len has been updated.
 318  */
 319 static int
 320 to_html(const char *in, const size_t in_len, size_t in_idx, char **out,
 321         size_t *out_len, size_t *out_idx)
 322 {
 323         int ret = -1;
 324         wchar_t wc = 0;
 325         int mbret = 0;
 326         size_t out_sz = 0;
 327         size_t initial_out_idx = *out_idx;
 328
 329         if (!*out) {
 330                 if (!(*out = malloc(1))) {
 331                         PERROR_MESSAGE("malloc");
 332                         goto done;
 333                 }
 334
 335                 out_sz = 1;
 336                 *out_len = 0;
 337                 (*out)[0] = '\0';
 338         }
 339
 340         /*
 341          * XXX: If you make this multithreaded, be sure to use
 342          * mbrtowc(3) here!
 343          */
 344         while (in_idx < in_len &&
 345                in[in_idx]) {
 346                 /* Extract next character */
 347                 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
 348
 349                 if (mbret == -1) {
 350                         PERROR_MESSAGE("mbtowc");
 351                         goto done;
 352                 }
 353
 354                 if (wc == L'&') {
 355                         ret = append_str(out, out_idx, &out_sz, "&amp;", 5);
 356                 } else if (wc == L'"') {
 357                         ret = append_str(out, out_idx, &out_sz, "&quot;", 6);
 358                 } else if (wc == L'\'') {
 359                         ret = append_str(out, out_idx, &out_sz, "&apos;", 6);
 360                 } else if (wc == L'<') {
 361                         ret = append_str(out, out_idx, &out_sz, "&lt;", 4);
 362                 } else if (wc == L'>') {
 363                         ret = append_str(out, out_idx, &out_sz, "&gt;", 4);
 364                 } else if (mbret == 1 &&
 365                            in[in_idx] >= ' ' &&
 366                            in[in_idx] <= '~') {
 367                         ret = append_char(out, out_idx, &out_sz, in[in_idx]);
 368                 } else if (mbret == 1 &&
 369                            in[in_idx] == '\r') {
 370                         ret = 0;
 371                 } else if (mbret == 1 &&
 372                            in[in_idx] == '\n') {
 373                         ret = append_char(out, out_idx, &out_sz, in[in_idx]);
 374                 } else {
 375                         ret = append_wchar_escaped(out, out_idx, &out_sz, wc);
 376                 }
 377
 378                 in_idx += mbret;
 379
 380                 if (ret < 0) {
 381                         goto done;
 382                 }
 383         }
 384
 385         *out_len = *out_len + (*out_idx - initial_out_idx);
 386         ret = 0;
 387 done:
 388
 389         return ret;
 390 }
 391
 392 /*
 393  * From in construct *out, which is a codepoint-for-codepoint
 394  * translation following the rules of unicode-transforms.h. The
 395  * result is that *out can be matched with normal regex, even if
 396  * in contains obfuscatory Unicode bullshit.
 397  *
 398  * Preconditions
 399  *
 400  *  - setup_sanitize_comment() has been invoked more recently than
 401  *    clean_sanitize_comment().
 402  *
 403  *  - in is memory of at least length in_len, valid UTF-8 text.
 404  *
 405  *  - Overwriting *out and *out_position_map shall not cause a
 406  *    memory leak.
 407  *
 408  *  - out, out_len, out_position_map, and out_position_map_len are
 409  *    not 0.
 410  *
 411  * Postconditions (success):
 412  *
 413  *  - *out is valid, UTF-8 text of length *out_len.
 414  *
 415  *  - For every j in [0, *out_len) such that (*out)[j] starts a
 416  *    codepoint, in[*(position_map)[j]] is the start of the
 417  *    corresponding codepoint.
 418  *
 419  *  - (*position_map)[*out_len] = in_len.
 420  */
 421 static int
 422 to_scannable(const char *in, size_t in_len, char **out, size_t *out_len,
 423              size_t **out_position_map, size_t *out_position_map_len)
 424 {
 425         int ret = -1;
 426         wchar_t wc = 0;
 427         size_t in_idx = 0;
 428         size_t out_idx = 0;
 429         int mbret = 0;
 430         struct translate *tr = 0;
 431         size_t out_sz = 0;
 432
 433         if (!*out) {
 434                 if (!(*out = malloc(1))) {
 435                         PERROR_MESSAGE("malloc");
 436                         goto done;
 437                 }
 438
 439                 out_sz = 1;
 440                 *out_len = 0;
 441                 (*out)[0] = '\0';
 442         }
 443
 444         /*
 445          * Position_map is here to make wordfiltering work. Suppose in is
 446          *
 447          *     Ｉ  ｔｈｉｎｋ  Ｎｉｎａ  Ｐｕｒｐｌｅｔｏｎ  ｄｉｄ
 448          *     ｎｏｔｈｉｎｇ  ｗｒｏｎｇ
 449          *
 450          * and a wordfilter /Nina Purpleton/i -> "worst girl" is
 451          * in effect. Then *out will be
 452          *
 453          *      I think Nina Purpleton did nothing wrong
 454          *
 455          * The message should, of course, be filtered to
 456          *
 457          *     Ｉ  ｔｈｉｎｋ worst girl ｄｉｄ ｎｏｔｈｉｎｇ
 458          *     ｗｒｏｎｇ
 459          *
 460          * In order to do that, it would be necessary to have a map
 461          * from in to *out on the byte level, since the wordfilter
 462          * will only be run against *out.
 463          *
 464          * position_map[j] = k means that out[j] and in[k] mean the
 465          * same thing.
 466          */
 467         while (in_idx < in_len) {
 468                 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
 469
 470                 if (mbret == -1) {
 471                         PERROR_MESSAGE("mbtowc");
 472                         goto done;
 473                 }
 474
 475                 /* We pre-suppose that the insert will go as planned */
 476                 if (set_position_mapping(out_position_map, out_position_map_len,
 477                                          out_idx, in_idx) < 0) {
 478                         goto done;
 479                 }
 480
 481                 if (mbret == 1 &&
 482                     in[in_idx] >= ' ' &&
 483                     in[in_idx] <= '~') {
 484                         if (append_str(out, &out_idx, &out_sz, in + in_idx, 1) <
 485                             0) {
 486                                 goto done;
 487                         }
 488                 } else {
 489                         if ((tr = bsearch(&wc, translates, NUM_OF(translates),
 490                                           sizeof *translates,
 491                                           match_translate))) {
 492                                 if (append_str(out, &out_idx, &out_sz, tr->to,
 493                                                strlen(tr->to)) < 0) {
 494                                         goto done;
 495                                 }
 496                         } else {
 497                                 if (append_str(out, &out_idx, &out_sz, in +
 498                                                in_idx, mbret) < 0) {
 499                                         goto done;
 500                                 }
 501                         }
 502                 }
 503
 504                 in_idx += mbret;
 505         }
 506
 507         if (set_position_mapping(out_position_map, out_position_map_len,
 508                                  out_idx, in_len) < 0) {
 509                 goto done;
 510         }
 511
 512         (*out)[out_idx] = '\0';
 513         *out_len = out_idx;
 514         ret = 0;
 515 done:
 516
 517         return ret;
 518 }
 519
 520 /*
 521  * Read through raw and scannable, checking all wordfilters in
 522  * scannable. Where a match is detected, the corresponding postion
 523  * (via position_map) in raw is replaced by the replacement specified
 524  * by the matching wordfilter.
 525  *
 526  * Preconditions
 527  *
 528  *  - setup_sanitize_comment() has been invoked more recently than
 529  *    clean_sanitize_comment().
 530  *
 531  *  - raw is memory of length at least raw_len, valid UTF-8 text.
 532  *
 533  *  - scannable is memory of length at least scannable_len.
 534  *
 535  *  - For any j in [0, scannable_len), position_map[j] is a valid
 536  *    index into raw, or is (size_t) -1.
 537  *
 538  *  - position_map[scannable_len] = raw_len.
 539  *
 540  *  - For any j in [0, scannable_len) such that k = position_map[j]
 541  *    is not (size_t) -1, scannable[j] and raw[k] are conceptually
 542  *    the same for wordfiltering.
 543  *
 544  *  - Overwriting *out shall not cause a memory leak.
 545  *
 546  *  - out and out_len are not 0.
 547  *
 548  * Postconditions (success):
 549  *
 550  *  - *out is valid, UTF-8 text of length *out_len such that all
 551  *    non ASCII codepoints (and '<', '>', '&', '"', ''') are
 552  *    HTML-escaped.
 553  *
 554  *  - *out represents raw, except in those sections of scannable
 555  *    where a wordfilter matched.
 556  */
 557 static int
 558 wordfilter_to_html(const char *raw, const size_t raw_len, const char *scannable,
 559                    const size_t scannable_len, size_t *position_map, char **out,
 560                    size_t *out_len)
 561 {
 562         int ret = -1;
 563
 564         /* These hold the match locations from pcre2 */
 565         uint32_t *ov_counts = 0;
 566         PCRE2_SIZE **ov_ps = 0;
 567         int *num_matches = 0;
 568         pcre2_match_data **match_data = 0;
 569         size_t raw_idx = 0;
 570         size_t scannable_idx = 0;
 571         size_t out_idx = 0;
 572         size_t best_match_pos = 0;
 573         size_t best_match_idx = 0;
 574         size_t l = 0;
 575         size_t mbret = 0;
 576
 577         if (!(ov_counts = calloc(wordfilters_num, sizeof *ov_counts))) {
 578                 PERROR_MESSAGE("calloc");
 579                 goto done;
 580         }
 581
 582         if (!(ov_ps = calloc(wordfilters_num, sizeof *ov_ps))) {
 583                 PERROR_MESSAGE("calloc");
 584                 goto done;
 585         }
 586
 587         if (!(num_matches = calloc(wordfilters_num, sizeof *num_matches))) {
 588                 PERROR_MESSAGE("calloc");
 589                 goto done;
 590         }
 591
 592         if (!(match_data = calloc(wordfilters_num, sizeof *match_data))) {
 593                 PERROR_MESSAGE("calloc");
 594                 goto done;
 595         }
 596
 597         /* First scan, before the loop */
 598         for (size_t j = 0; j < wordfilters_num; ++j) {
 599                 if (!(match_data[j] = pcre2_match_data_create_from_pattern(
 600                               wordfilters[j].code, 0))) {
 601                         PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
 602                         goto done;
 603                 }
 604
 605                 num_matches[j] = pcre2_match(wordfilters[j].code,
 606                                              (PCRE2_SPTR) scannable,
 607                                              scannable_len, scannable_idx, 0,
 608                                              match_data[j], 0);
 609         }
 610
 611 handle_next_match:
 612         best_match_pos = (size_t) -1;
 613         best_match_idx = (size_t) -1;
 614
 615         /* We've run pcre2_match() on everything. Find the soonest match */
 616         for (size_t j = 0; j < wordfilters_num; ++j) {
 617                 if (!num_matches[j]) {
 618                         continue;
 619                 }
 620
 621                 ov_ps[j] = pcre2_get_ovector_pointer(match_data[j]);
 622
 623                 if (ov_ps[j][0] >= scannable_idx &&
 624                     ov_ps[j][0] < best_match_pos) {
 625                         best_match_pos = ov_ps[j][0];
 626                         best_match_idx = j;
 627                 }
 628         }
 629
 630         if (best_match_idx == (size_t) -1) {
 631                 /* No matches. Turn the rest to html boring-like */
 632                 ret = to_html(raw, raw_len, raw_idx, out, out_len, &out_idx);
 633                 goto done;
 634         }
 635
 636         /* Figure out where in raw this match starts */
 637         l = best_match_pos;
 638
 639         while (l != (size_t) -1 &&
 640                position_map[l] == (size_t) -1) {
 641                 l--;
 642         }
 643
 644         if (l == (size_t) -1) {
 645                 ERROR_MESSAGE("Impossible condition in "
 646                               "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
 647                               raw,
 648                               best_match_pos);
 649                 goto done;
 650         }
 651
 652         /*
 653          * Now position_map[l] points to the first character in raw
 654          * that should be replaced. Fill up to that point.
 655          */
 656         if (position_map[l] &&
 657             position_map[l] > raw_idx) {
 658                 if (to_html(raw, position_map[l], raw_idx, out, out_len,
 659                             &out_idx) < 0) {
 660                         goto done;
 661                 }
 662         }
 663
 664         /* Put the substituted text in */
 665         if (to_html(wordfilters[best_match_idx].replacement,
 666                     wordfilters[best_match_idx].replacement_len, 0, out,
 667                     out_len,
 668                     &out_idx) < 0) {
 669                 goto done;
 670         }
 671
 672         /*
 673          * Figure out where we should advance to in inputs. Naively,
 674          * we want to set scannable_idx to ov_ps[best_match_idx][1]
 675          * (the first character in scannable beyond the match).
 676          * However, we have to consider the case of
 677          *
 678          *      foo！！！bar
 679          *
 680          * where "foo" -> "baz" is the only transformation. Since
 681          * some characters, like "！", are completely ignored by
 682          * the scannable transformation, the naive method would
 683          * start our scanning at the "b", skipping information.
 684          *
 685          * So, instead, we carefully find the last character in
 686          * "foo", then jump one past it. This (unfortunately)
 687          * requires a bit more manual fiddling with wide character
 688          * conversions.
 689          *
 690          */
 691         if (ov_ps[best_match_idx][1] <= scannable_idx) {
 692                 /*
 693                  * This should never happen, but let's make sure
 694                  * we always keep advancing.
 695                  */
 696                 scannable_idx++;
 697         } else {
 698                 scannable_idx = ov_ps[best_match_idx][1] - 1;
 699         }
 700
 701         l = scannable_idx;
 702
 703         while (position_map[l] == (size_t) -1) {
 704                 l--;
 705         }
 706
 707         raw_idx = position_map[l];
 708
 709         /* This is the "jump one past it" part */
 710         scannable_idx++;
 711         errno = 0;
 712         mbret = mbrlen(raw + raw_idx, MB_CUR_MAX, 0);
 713
 714         switch (mbret) {
 715         case (size_t) -2:
 716         case (size_t) -1:
 717                 PERROR_MESSAGE("mbrlen");
 718                 goto done;
 719         default:
 720                 raw_idx += mbret;
 721         }
 722
 723         /*
 724          * Now re-check all our matches and figure out which ones
 725          * need to be updated
 726          */
 727         for (size_t j = 0; j < wordfilters_num; ++j) {
 728                 if (!num_matches[j] ||
 729                     ov_ps[j][0] >= scannable_idx) {
 730                         continue;
 731                 }
 732
 733                 num_matches[j] = pcre2_match(wordfilters[j].code,
 734                                              (PCRE2_SPTR) scannable,
 735                                              scannable_len, scannable_idx, 0,
 736                                              match_data[j], 0);
 737         }
 738
 739         goto handle_next_match;
 740 done:
 741
 742         for (size_t j = 0; j < wordfilters_num; ++j) {
 743                 pcre2_match_data_free(match_data[j]);
 744                 match_data[j] = 0;
 745         }
 746
 747         free(match_data);
 748         free(num_matches);
 749         free(ov_counts);
 750         free(ov_ps);
 751
 752         return ret;
 753 }
 754
 755 /*
  756  * Read through in. Each time a match for format_replacements
  757  * (something like a newline or a quote) is found, replace
  758  * it with some HTML markup. The result is placed in *out.
 759  *
 760  * Preconditions:
 761  *
 762  *  - setup_sanitize_comment() has been invoked more recently than
 763  *    clean_sanitize_comment().
 764  *
 765  *  - in is memory of length at least in_len, valid UTF-8 text.
 766  *
 767  *  - Overwriting *out shall not cause a memory leak.
 768  *
 769  *  - out and out_len are not 0.
 770  *
 771  * Postconditions (success):
 772  *
 773  *  - *out is valid, UTF-8 text of length *out_len with sane HTML
 774  *    markup (and HTML escaped), suitable for outputting into an
 775  *    HTML file.
 776  */
 777 static int
 778 insert_html_tags(const char *in, size_t in_len, const char *board, char **out,
 779                  size_t *out_len)
 780 {
 781         int ret = -1;
 782         size_t in_idx = 0;
 783         size_t match_pos = 0;
 784         size_t after_match_pos = 0;
 785         size_t out_idx = 0;
 786         pcre2_match_data *match_data = 0;
 787         int nret = 0;
 788         PCRE2_UCHAR *tmp_1 = 0;
 789         PCRE2_SIZE tmp_1_len = 0;
 790         PCRE2_UCHAR *tmp_2 = 0;
 791         PCRE2_SIZE tmp_2_len = 0;
 792         PCRE2_UCHAR *tmp_3 = 0;
 793         PCRE2_SIZE tmp_3_len = 0;
 794         uint_fast8_t last_was_newline = 1;
 795         char *link_target = 0;
 796         size_t link_target_len = 0;
 797
 798         if (!(match_data = pcre2_match_data_create_from_pattern(
 799                       format_replacements, 0))) {
 800                 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
 801                 goto done;
 802         }
 803
 804 find_next_bit:
 805
 806         if (in_idx >= in_len) {
 807                 goto success;
 808         }
 809
 810         nret = pcre2_match(format_replacements, (PCRE2_SPTR) in, in_len, in_idx,
 811                            0, match_data, 0);
 812
 813         if (nret == PCRE2_ERROR_NOMATCH) {
 814                 ret = append_str(out, &out_idx, out_len, in + in_idx, in_len -
 815                                  in_idx);
 816                 goto done;
 817         }
 818
 819         if (nret < 0) {
 820                 PCRE2_UCHAR8 err_buf[120];
 821
 822                 pcre2_get_error_message(nret, err_buf, 120);
 823                 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
 824                               " (PCRE2 %d)", (int) (in_len - in_idx), in +
 825                               in_idx, err_buf,
 826                               nret);
 827                 goto done;
 828         }
 829
 830         pcre2_substring_free(tmp_1);
 831         pcre2_substring_free(tmp_2);
 832         pcre2_substring_free(tmp_3);
 833         free(link_target);
 834         tmp_1 = 0;
 835         tmp_2 = 0;
 836         tmp_3 = 0;
 837         link_target = 0;
 838
 839         /* We have match, stuff everything up to it in *out */
 840         match_pos = pcre2_get_ovector_pointer(match_data)[0];
 841         after_match_pos = pcre2_get_ovector_pointer(match_data)[1];
 842
 843         if (match_pos > in_idx) {
 844                 if (append_str(out, &out_idx, out_len, in + in_idx, match_pos -
 845                                in_idx) < 0) {
 846                         goto done;
 847                 }
 848
 849                 last_was_newline = 0;
 850                 in_idx = match_pos;
 851         }
 852
 853         /* Figure out what type of match. */
 854         if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "newline",
 855                                         &tmp_1, &tmp_1_len)) {
 856                 if (last_was_newline) {
 857                         if (append_const_str(out, &out_idx, out_len,
 858                                              "&nbsp;<br />") < 0) {
 859                                 goto done;
 860                         }
 861                 } else {
 862                         if (append_const_str(out, &out_idx, out_len, "<br />") <
 863                             0) {
 864                                 goto done;
 865                         }
 866                 }
 867
 868                 last_was_newline = 1;
 869                 in_idx = after_match_pos;
 870                 goto find_next_bit;
 871         }
 872
 873         last_was_newline = 0;
 874
 875         if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "quote",
 876                                         &tmp_1, &tmp_1_len)) {
 877                 if (append_const_str(out, &out_idx, out_len,
 878                                      "<span class=\"quote\">") < 0) {
 879                         goto done;
 880                 }
 881
 882                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 883                                (size_t) tmp_1_len) < 0) {
 884                         goto done;
 885                 }
 886
 887                 if (append_const_str(out, &out_idx, out_len, "</span>") < 0) {
 888                         goto done;
 889                 }
 890
 891                 in_idx = after_match_pos;
 892                 goto find_next_bit;
 893         }
 894
 895         if (!pcre2_substring_get_byname(match_data,
 896                                         (PCRE2_SPTR) "intra_postlink", &tmp_1,
 897                                         &tmp_1_len)) {
 898                 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "a_num",
 899                                                &tmp_2, &tmp_2_len)) {
 900                         goto problem_with_match;
 901                 }
 902
 903                 int found = 0;
 904
 905                 if (db_construct_post_link(board, strlen(board), (const
 906                                                                   char *) tmp_2,
 907                                            tmp_2_len, &found, &link_target,
 908                                            &link_target_len) < 0) {
 909                         goto done;
 910                 }
 911
 912                 if (!found) {
 913                         if (append_str(out, &out_idx, out_len, in + match_pos,
 914                                        after_match_pos - match_pos) < 0) {
 915                                 goto done;
 916                         }
 917
 918                         in_idx = after_match_pos;
 919                         goto find_next_bit;
 920                 }
 921
 922                 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
 923                     0) {
 924                         goto done;
 925                 }
 926
 927                 if (append_str(out, &out_idx, out_len, link_target,
 928                                link_target_len) < 0) {
 929                         goto done;
 930                 }
 931
 932                 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
 933                         goto done;
 934                 }
 935
 936                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 937                                (size_t) tmp_1_len) < 0) {
 938                         goto done;
 939                 }
 940
 941                 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
 942                         goto done;
 943                 }
 944
 945                 in_idx = after_match_pos;
 946                 goto find_next_bit;
 947         }
 948
 949         if (!pcre2_substring_get_byname(match_data,
 950                                         (PCRE2_SPTR) "inter_postlink", &tmp_1,
 951                                         &tmp_1_len)) {
 952                 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "e_num",
 953                                                &tmp_2, &tmp_2_len)) {
 954                         goto problem_with_match;
 955                 }
 956
 957                 if (pcre2_substring_get_byname(match_data,
 958                                                (PCRE2_SPTR) "e_board", &tmp_3,
 959                                                &tmp_3_len)) {
 960                         goto problem_with_match;
 961                 }
 962
 963                 int found = 0;
 964
 965                 if (db_construct_post_link((const char *) tmp_3, tmp_3_len,
 966                                            (const char *) tmp_2, tmp_2_len,
 967                                            &found, &link_target,
 968                                            &link_target_len) < 0) {
 969                         goto done;
 970                 }
 971
 972                 if (!found) {
 973                         if (append_str(out, &out_idx, out_len, in + match_pos,
 974                                        after_match_pos - match_pos) < 0) {
 975                                 goto done;
 976                         }
 977
 978                         in_idx = after_match_pos;
 979                         goto find_next_bit;
 980                 }
 981
 982                 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
 983                     0) {
 984                         goto done;
 985                 }
 986
 987                 if (append_str(out, &out_idx, out_len, link_target,
 988                                link_target_len) < 0) {
 989                         goto done;
 990                 }
 991
 992                 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
 993                         goto done;
 994                 }
 995
 996                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 997                                (size_t) tmp_1_len) < 0) {
 998                         goto done;
 999                 }
1000
1001                 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
1002                         goto done;
1003                 }
1004
1005                 in_idx = after_match_pos;
1006                 goto find_next_bit;
1007         }
1008
1009 problem_with_match:
1010
1011         /* There was some kind of match, but it went wrong. */
1012         in_idx++;
1013         goto find_next_bit;
1014 success:
1015         ret = 0;
1016 done:
1017         *out_len = out_idx;
1018         pcre2_substring_free(tmp_1);
1019         pcre2_substring_free(tmp_2);
1020         pcre2_substring_free(tmp_3);
1021         pcre2_match_data_free(match_data);
1022
1023         return ret;
1024 }
1025
1026 /*
1027  * Make sure that the contents of *pc are ready for safe injection
1028  * into the board, including HTML escaping, wordfiltering, general
1029  * formatting, and adding links.
1030  *
1031  * Preconditions
1032  *
1033  *  - setup_sanitize_comment() has been invoked more recently than
1034  *    clean_sanitize_comment().
1035  *
1036  *  - *pc has been filled out (fields like action, board, etc. have
1037  *    been populated) from the POST data.
1038  *
1039  * Postconditions (success):
1040  *
1041  *  - The prepared_XYZ fields of *pc have been filled out, and each
1042  *    is valid ASCII text, with Unicode codepoints.
1043  */
1044 int
1045 st_sanitize_text(struct post_cmd *pc, int *our_fault)
1046 {
1047         int ret = -1;
1048         size_t out_idx = 0;
1049         char *html_escaped_comment = 0;
1050         size_t html_escaped_comment_len = 0;
1051
1052         /* Flush out lurking double-free bugs */
1053         free(pc->prepared.name);
1054         pc->prepared.name = 0;
1055         pc->prepared.name_len = 0;
1056         free(pc->prepared.email);
1057         pc->prepared.email = 0;
1058         pc->prepared.email_len = 0;
1059         free(pc->prepared.subject);
1060         pc->prepared.subject = 0;
1061         pc->prepared.subject_len = 0;
1062         free(pc->prepared.comment);
1063         pc->prepared.comment = 0;
1064         pc->prepared.comment_len = 0;
1065         free(pc->prepared.file_name);
1066         pc->prepared.file_name = 0;
1067         pc->prepared.file_name_len = 0;
1068         free(pc->scannable_comment);
1069         pc->scannable_comment = 0;
1070         pc->scannable_comment_len = 0;
1071         free(pc->position_map);
1072         pc->position_map = 0;
1073         pc->position_map_len = 0;
1074         out_idx = 0;
1075
1076         if (!pc->raw.name_len) {
1077                 free(pc->raw.name);
1078
1079                 if (!(pc->raw.name = strdup("Anonymous"))) {
1080                         PERROR_MESSAGE("strdup");
1081                         goto done;
1082                 }
1083
1084                 pc->raw.name_len = strlen(pc->raw.name);
1085         }
1086
1087         if (pc->raw.name_len) {
1088                 if (to_html(pc->raw.name, pc->raw.name_len, 0,
1089                             &pc->prepared.name, &pc->prepared.name_len,
1090                             &out_idx) < 0) {
1091                         *our_fault = 1;
1092                         goto done;
1093                 }
1094         }
1095
1096         out_idx = 0;
1097
1098         if (pc->raw.email_len) {
1099                 if (to_html(pc->raw.email, pc->raw.email_len, 0,
1100                             &pc->prepared.email, &pc->prepared.email_len,
1101                             &out_idx) < 0) {
1102                         *our_fault = 1;
1103                         goto done;
1104                 }
1105         }
1106
1107         out_idx = 0;
1108
1109         if (pc->raw.tripcode_len) {
1110                 if (to_html(pc->raw.tripcode, pc->raw.tripcode_len, 0,
1111                             &pc->prepared.tripcode, &pc->prepared.tripcode_len,
1112                             &out_idx) <
1113                     0) {
1114                         *our_fault = 1;
1115                         goto done;
1116                 }
1117         }
1118
1119         out_idx = 0;
1120
1121         if (pc->raw.subject_len) {
1122                 if (to_html(pc->raw.subject, pc->raw.subject_len, 0,
1123                             &pc->prepared.subject, &pc->prepared.subject_len,
1124                             &out_idx) <
1125                     0) {
1126                         *our_fault = 1;
1127                         goto done;
1128                 }
1129         }
1130
1131         out_idx = 0;
1132
1133         if (pc->raw.file_name_len) {
1134                 if (to_html(pc->raw.file_name, pc->raw.file_name_len, 0,
1135                             &pc->prepared.file_name,
1136                             &pc->prepared.file_name_len,
1137                             &out_idx) < 0) {
1138                         *our_fault = 1;
1139                         goto done;
1140                 }
1141         }
1142
1143         if (to_scannable(pc->raw.comment, pc->raw.comment_len,
1144                          &pc->scannable_comment, &pc->scannable_comment_len,
1145                          &pc->position_map,
1146                          &pc->position_map_len)) {
1147                 *our_fault = 1;
1148                 goto done;
1149         }
1150
1151         /*
1152          * Now we do the fancy thing. Match scannable, build prepared
1153          * out of that.
1154          */
1155         if (wordfilter_to_html(pc->raw.comment, pc->raw.comment_len,
1156                                pc->scannable_comment, pc->scannable_comment_len,
1157                                pc->position_map,
1158                                &html_escaped_comment,
1159                                &html_escaped_comment_len) < 0) {
1160                 *our_fault = 1;
1161                 goto done;
1162         }
1163
1164         /*
1165          * Everything's in &#123; form, but now take care of >>123,
1166          * <br />, etc.
1167          */
1168         if (insert_html_tags(html_escaped_comment, html_escaped_comment_len,
1169                              pc->raw.board, &pc->prepared.comment,
1170                              &pc->prepared.comment_len) < 0) {
1171                 *our_fault = 1;
1172                 goto done;
1173         }
1174
1175         ret = 0;
1176 done:
1177         free(html_escaped_comment);
1178
1179         return ret;
1180 }
1181
1182 /*
1183  * Initialize any static elements needed for this file.
1184  *
1185  * Preconditions:
1186  *
1187  *  - setup_sanitize_comment() was not invoked more recently than
1188  *    clean_sanitize_comment().
1189  *
1190  * Postconditions (success):
1191  *
1192  *  - Any other function in this file may be safely called.
1193  */
1194 int
1195 setup_sanitize_comment(const struct configuration *conf)
1196 {
1197         /*
1198          * Check that the locale/libc/whatever is set up so that
1199          * UTF-8 handling can work.
1200          */
1201         int ret = -1;
1202         const char *raw =
1203                 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1204                 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1205                 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1206                 "\u2468\u0294!\u0ce2!!";
1207         const char *correct_html =
1208                 "&lt;script&gt;alert(1)&lt;/script&gt; , &#120081;&#120102;"
1209                 "&#120107;&#120094;&#12288;&#120083;&#120114;&#120111;"
1210                 "&#120109;&#120105;&#120098;&#120113;&#120108;&#120107; &amp;"
1211                 " &#9320;&#660;!&#3298;!!";
1212         const char *correct_scannable =
1213                 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1214         char *html = 0;
1215         size_t html_len = 0;
1216         char *scannable = 0;
1217         size_t scannable_len = 0;
1218         size_t *position_map = 0;
1219         size_t position_map_len = 0;
1220         size_t out_idx = 0;
1221
1222         if (to_html(raw, strlen(raw), 0, &html, &html_len, &out_idx) < 0) {
1223                 goto done;
1224         }
1225
1226         if (strcmp(html, correct_html)) {
1227                 ERROR_MESSAGE("Was expecting html conversion to yield "
1228                               "\n\n\u00ab%s\u00bb\n\nInstead, got "
1229                               "\n\n\u00ab%s\u00bb\n\n",
1230                               correct_html, html);
1231                 goto done;
1232         }
1233
1234         if (to_scannable(raw, strlen(raw), &scannable, &scannable_len,
1235                          &position_map, &position_map_len) < 0) {
1236                 goto done;
1237         }
1238
1239         if (strcmp(scannable, correct_scannable)) {
1240                 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1241                               "\n\n\u00ab%s\u00bb\n\nInstead, got "
1242                               "\n\n\u00ab%s\u00bb\n\n",
1243                               correct_scannable, scannable);
1244                 goto done;
1245         }
1246
1247         if (!(wordfilters = calloc(conf->wordfilter_inputs_num,
1248                                    sizeof *wordfilters))) {
1249                 PERROR_MESSAGE("calloc");
1250                 goto done;
1251         }
1252
1253         wordfilters_num = conf->wordfilter_inputs_num;
1254         int err_code = 0;
1255         PCRE2_SIZE err_offset = 0;
1256         PCRE2_UCHAR8 err_buf[120];
1257
1258         for (size_t j = 0; j < wordfilters_num; ++j) {
1259                 wordfilters[j].replacement =
1260                         conf->wordfilter_inputs[j].replacement;
1261                 wordfilters[j].replacement_len = strlen(
1262                         conf->wordfilter_inputs[j].replacement);
1263
1264                 if ((wordfilters[j].code = pcre2_compile(
1265                              (PCRE2_SPTR8) conf->wordfilter_inputs[j].pattern,
1266                              PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1267                              &err_offset, 0))) {
1268                         continue;
1269                 }
1270
1271                 pcre2_get_error_message(err_code, err_buf, 120);
1272                 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1273                               conf->wordfilter_inputs[j].pattern, err_buf);
1274                 goto done;
1275         }
1276
1277         const char *format_match_str =
1278
1279                 /* */
1280                 "(?<newline>\\n)"                              /* */
1281                 "|(?<intra_postlink>&gt;&gt;(?<a_num>[0-9]+))" /* */
1282                 "|(?<inter_postlink>&gt;&gt;&gt;/"             /* */
1283                 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))"         /* */
1284                 "|(?<quote>(?<![^\n])&gt;[^\n]*)";             /* */
1285
1286         if (!(format_replacements = pcre2_compile(
1287                       (PCRE2_SPTR8) format_match_str, PCRE2_ZERO_TERMINATED,
1288                       PCRE2_UTF,
1289                       &err_code, &err_offset, 0))) {
1290                 pcre2_get_error_message(err_code, err_buf, 120);
1291                 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1292                               format_match_str, err_buf);
1293                 goto done;
1294         }
1295
1296         ret = 0;
1297 done:
1298         free(html);
1299         free(scannable);
1300         free(position_map);
1301
1302         return ret;
1303 }
1304
1305 /*
1306  * Clean up any memory from this file
1307  *
1308  * Postconditions (success):
1309  *
1310  *  - Valgrind won't report any memory leaks from this file.
1311  *
1312  *  - setup_sanitize_comment() can be safely called again.
1313  */
1314 int
1315 clean_sanitize_comment(void)
1316 {
1317         for (size_t j = 0; j < wordfilters_num; ++j) {
1318                 pcre2_code_free(wordfilters[j].code);
1319                 wordfilters[j] = (struct wordfilter) { 0 };
1320         }
1321
1322         pcre2_code_free(format_replacements);
1323         format_replacements = 0;
1324         free(wordfilters);
1325         wordfilters = 0;
1326         wordfilters_num = 0;
1327
1328         return 0;
1329 }