sanitize-comment.c

   1 /*
   2  * Copyright (c) 2017, De Rais <derais@cock.li>
   3  *
   4  * Permission to use, copy, modify, and/or distribute this software for
   5  * any purpose with or without fee is hereby granted, provided that the
   6  * above copyright notice and this permission notice appear in all
   7  * copies.
   8  *
   9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
  10  * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
  11  * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
  12  * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
  13  * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
  14  * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
  15  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  16  * PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #include <errno.h>
  19 #include <stdint.h>
  20 #include <stdio.h>
  21 #include <stdlib.h>
  22 #include <string.h>
  23 #include <time.h>
  24 #include <wchar.h>
  25
  26 #define PCRE2_CODE_UNIT_WIDTH 8
  27 #include <pcre2.h>
  28
  29 #include "macros.h"
  30 #include "rb79.h"
  31 #include "unicode-transforms.h"
  32
  33 /*
  34  * We need a way to get codepoints out of UTF-8 strings and if
  35  * wchar_t stored codepoint values, that would be great. That's
  36  * __STDC_ISO_10646__, though. You can remove this check and cross
  37  * your fingers, since rb79 will do a quick check on startup, but
  38  * please check why the C implementation doesn't define
  39  * __STDC_ISO_10646__ first.
  40  */
  41 #ifndef __STDC_ISO_10646__
  42 #error We really want __STD_ISO_10646__
  43 #endif
  44
  45 /*
  46  * A wordfilter consists of a pcre2 regex and a replacement string
  47  */
  48 struct wordfilter {
  49         /* */
  50         pcre2_code *code;
  51         const char *replacement;
  52         size_t replacement_len;
  53 };
  54
  55 /* These are constructed in setup_sanitize_comment() */
  56 static struct wordfilter *wordfilters;
  57 static size_t wordfilters_num;
  58
  59 /* Special matcher for quoting, newlines, linkifying, etc. */
  60 static pcre2_code *format_replacements;
  61
  62 /*
  63  * Comparison function for struct translate.
  64  *
  65  * Preconditions:
  66  *
  67  *  - *key_v is a wchar_t.
  68  *
  69  *  - *tr_v is a struct translate object.
  70  *
  71  * Postconditions:
  72  *
  73  *  - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
  74  *    than] *tr_v's starting range.
  75  */
  76 static int match_translate(const void *key_v, const void *tr_v)
  77 {
  78         const wchar_t *key = (const wchar_t *) key_v;
  79         const struct translate *tr = (const struct translate *) tr_v;
  80
  81         if (*key < tr->from_s) {
  82                 return -1;
  83         } else if (*key > tr->from_t) {
  84                 return 1;
  85         }
  86
  87         return 0;
  88 }
  89
  90 /*
  91  * Add a UTF-8 sequence str onto *buf
  92  *
  93  * Preconditions:
  94  *
  95  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
  96  *    string.
  97  *
  98  *  - str is a valid ASCII (not just UTF-8) string of length str_len.
  99  *
 100  * Postconditions (success):
 101  *
 102  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 103  *    string.
 104  *
 105  *  - The contents of str have been appended to *buf (and *idx
 106  *    includes this).
 107  */
 108 static int append_str(char **buf, size_t *idx, size_t *sz, const char *str,
 109                       size_t str_len)
 110 {
 111         if (str_len + *idx >= *sz) {
 112                 void *newmem = 0;
 113                 size_t new_sz = str_len + *idx + (1 << 9);
 114
 115                 if (!(newmem = realloc(*buf, new_sz))) {
 116                         PERROR_MESSAGE("realloc");
 117
 118                         return -1;
 119                 }
 120
 121                 *buf = newmem;
 122                 *sz = new_sz;
 123         }
 124
 125         strncpy(*buf + *idx, str, str_len);
 126         *(*buf + *idx + str_len) = '\0';
 127         *idx += str_len;
 128
 129         return 0;
 130 }
 131
 132 /* Dummy function for when I can't be bothered to strlen(). */
 133 static int append_const_str(char **buf, size_t *idx, size_t *len, const
 134                             char *str)
 135 {
 136         return append_str(buf, idx, len, str, strlen(str));
 137 }
 138
 139 /*
 140  * Add a single character onto *buf
 141  *
 142  * Preconditions:
 143  *
 144  *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 145  *    string.
 146  *
 147  *  - c is an ASCII character.
 148  *
 149  * Postconditions (success):
 150  *
 151  *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 152  *    string.
 153  *
 154  *  - c has been appended to *buf (and *idx includes this).
 155  */
 156 static int append_char(char **buf, size_t *idx, size_t *len, char c)
 157 {
 158         if (1 + *idx >= *len) {
 159                 void *newmem = 0;
 160                 size_t new_len = 1 + *idx + (1 << 9);
 161
 162                 if (!(newmem = realloc(*buf, new_len))) {
 163                         PERROR_MESSAGE("realloc");
 164
 165                         return -1;
 166                 }
 167
 168                 *buf = newmem;
 169                 *len = new_len;
 170         }
 171
 172         *(*buf + *idx) = c;
 173         *(*buf + *idx + 1) = '\0';
 174         *idx += 1;
 175
 176         return 0;
 177 }
 178
 179 /*
 180  * Add a Unicode codepoint onto *buf
 181  *
 182  * Preconditions:
 183  *
 184  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 185  *    string.
 186  *
 187  *  - wchar_t is a valid Unicode codepoint.
 188  *
 189  * Postconditions (success):
 190  *
 191  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 192  *    string.
 193  *
 194  *  - An HTML-escaped sequence like &#123; has been appended to
 195  *    *buf (and *idx includes this).
 196  */
 197 static int append_wchar_escaped(char **buf, size_t *idx, size_t *sz, wchar_t wc)
 198 {
 199         size_t l = snprintf(0, 0, "&#%ld;", (long) wc);
 200
 201         if (l + *idx >= *sz) {
 202                 void *newmem = 0;
 203                 size_t new_sz = l + *idx + (1 << 9);
 204
 205                 if (!(newmem = realloc(*buf, new_sz))) {
 206                         PERROR_MESSAGE("realloc");
 207
 208                         return -1;
 209                 }
 210
 211                 *buf = newmem;
 212                 *sz = new_sz;
 213         }
 214
 215         sprintf(*buf + *idx, "&#%ld;", (long) wc);
 216         *idx += l;
 217
 218         return 0;
 219 }
 220
 221 /*
 222  * Ensure that (*map)[j] = k, fixing up length as appropriate.
 223  *
 224  * Preconditions
 225  *
 226  *  - *map is memory of length len.
 227  *
 228  * Postconditions (success):
 229  *
 230  *  - *map is memory of length len.
 231  *
 232  *  - (*map)[j] = k.
 233  */
 234 static int set_position_mapping(size_t **map, size_t *len, size_t j, size_t k)
 235 {
 236         if (j + 1 >= *len) {
 237                 void *newmem = 0;
 238
 239                 if (!(newmem = realloc(*map, (j + 2) * sizeof **map))) {
 240                         PERROR_MESSAGE("realloc");
 241
 242                         return -1;
 243                 }
 244
 245                 *map = newmem;
 246
 247                 for (size_t l = *len; l < j + 2; ++l) {
 248                         (*map)[l] = ((size_t) -1);
 249                 }
 250
 251                 *len = j + 2;
 252         }
 253
 254         (*map)[j] = k;
 255
 256         return 0;
 257 }
 258
 259 /*
 260  * HTML-escape in to *out.
 261  *
 262  * Preconditions
 263  *
 264  *  - in is memory of at least length in_len, valid UTF-8
 265  *    text.
 266  *
 267  *  - *out is memory of at least length *out_len (if *out_len = 0,
 268  *    *out may be 0), valid UTF-8 text.
 269  *
 270  *  - Overwriting *out and *out_len  shall not cause a memory leak.
 271  *
 272  *  - out, out_len, and out_idx are not 0.
 273  *
 274  * Postconditions (success):
 275  *
 276  *  - *out is memory of at least length *out_len, valid UTF-8 text.
 277  *
 278  *  - A stretch of HTML-escaped ASCII text representing in has been
 279  *    added to *out at the position that was *out_idx.
 280  *
 281  *  - *out_idx has been updated to point to the end of this stretch.
 282  *
 283  *  - If necessary, *out_len has been updated.
 284  */
 285 static int to_html(const char *in, const size_t in_len, size_t in_idx,
 286                    char **out, size_t *out_len, size_t *out_idx)
 287 {
 288         int ret = -1;
 289         wchar_t wc = 0;
 290         int mbret = 0;
 291         size_t out_sz = 0;
 292         size_t initial_out_idx = *out_idx;
 293
 294         if (!*out) {
 295                 if (!(*out = malloc(1))) {
 296                         PERROR_MESSAGE("malloc");
 297                         goto done;
 298                 }
 299
 300                 out_sz = 1;
 301                 *out_len = 0;
 302                 (*out)[0] = '\0';
 303         }
 304
 305         /*
 306          * XXX: If you make this multithreaded, be sure to use
 307          * mbrtowc(3) here!
 308          */
 309         while (in_idx < in_len &&
 310                in[in_idx]) {
 311                 /* Extract next character */
 312                 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
 313
 314                 if (mbret == -1) {
 315                         PERROR_MESSAGE("mbtowc");
 316                         goto done;
 317                 }
 318
 319                 if (wc == L'&') {
 320                         ret = append_str(out, out_idx, &out_sz, "&amp;", 5);
 321                 } else if (wc == L'"') {
 322                         ret = append_str(out, out_idx, &out_sz, "&quot;", 6);
 323                 } else if (wc == L'\'') {
 324                         ret = append_str(out, out_idx, &out_sz, "&apos;", 6);
 325                 } else if (wc == L'<') {
 326                         ret = append_str(out, out_idx, &out_sz, "&lt;", 4);
 327                 } else if (wc == L'>') {
 328                         ret = append_str(out, out_idx, &out_sz, "&gt;", 4);
 329                 } else if (mbret == 1 &&
 330                            in[in_idx] >= ' ' &&
 331                            in[in_idx] <= '~') {
 332                         ret = append_char(out, out_idx, &out_sz, in[in_idx]);
 333                 } else if (mbret == 1 &&
 334                            in[in_idx] == '\r') {
 335                         ret = 0;
 336                 } else if (mbret == 1 &&
 337                            in[in_idx] == '\n') {
 338                         ret = append_char(out, out_idx, &out_sz, in[in_idx]);
 339                 } else {
 340                         ret = append_wchar_escaped(out, out_idx, &out_sz, wc);
 341                 }
 342
 343                 in_idx += mbret;
 344
 345                 if (ret < 0) {
 346                         goto done;
 347                 }
 348         }
 349
 350         *out_len = *out_len + (*out_idx - initial_out_idx);
 351         ret = 0;
 352 done:
 353
 354         return ret;
 355 }
 356
 357 /*
 358  * From in construct *out, which is a codepoint-for-codepoint
 359  * translation following the rules of unicode-transforms.h. The
 360  * result is that *out can be matched with normal regex, even if
 361  * in contains obfuscatory Unicode bullshit.
 362  *
 363  * Preconditions
 364  *
 365  *  - setup_sanitize_comment() has been invoked more recently than
 366  *    clean_sanitize_comment().
 367  *
 368  *  - in is memory of at least length in_len, valid UTF-8 text.
 369  *
 370  *  - Overwriting *out and *out_position_map shall not cause a
 371  *    memory leak.
 372  *
 373  *  - out, out_len, out_position_map, and out_position_map_len are
 374  *    not 0.
 375  *
 376  * Postconditions (success):
 377  *
 378  *  - *out is valid, UTF-8 text of length *out_len.
 379  *
 380  *  - For every j in [0, *out_len) such that (*out)[j] starts a
 381  *    codepoint, in[*(position_map)[j]] is the start of the
 382  *    corresponding codepoint.
 383  *
 384  *  - (*position_map)[*out_len] = in_len.
 385  */
 386 static int to_scannable(const char *in, size_t in_len, char **out,
 387                         size_t *out_len, size_t **out_position_map,
 388                         size_t *out_position_map_len)
 389 {
 390         int ret = -1;
 391         wchar_t wc = 0;
 392         size_t in_idx = 0;
 393         size_t out_idx = 0;
 394         int mbret = 0;
 395         struct translate *tr = 0;
 396         size_t out_sz = 0;
 397
 398         if (!*out) {
 399                 if (!(*out = malloc(1))) {
 400                         PERROR_MESSAGE("malloc");
 401                         goto done;
 402                 }
 403
 404                 out_sz = 1;
 405                 *out_len = 0;
 406                 (*out)[0] = '\0';
 407         }
 408
 409         /*
 410          * Position_map is here to make wordfiltering work. Suppose in is
 411          *
 412          *     Ｉ  ｔｈｉｎｋ  Ｎｉｎａ  Ｐｕｒｐｌｅｔｏｎ  ｄｉｄ
 413          *     ｎｏｔｈｉｎｇ  ｗｒｏｎｇ
 414          *
 415          * and a wordfilter /Nina Purpleton/i -> "worst girl" is
 416          * in effect. Then *out will be
 417          *
 418          *      I think Nina Purpleton did nothing wrong
 419          *
 420          * The message should, of course, be filtered to
 421          *
 422          *     Ｉ  ｔｈｉｎｋ worst girl ｄｉｄ ｎｏｔｈｉｎｇ
 423          *     ｗｒｏｎｇ
 424          *
 425          * In order to do that, it would be necessary to have a map
 426          * from in to *out on the byte level, since the wordfilter
 427          * will only be run against *out.
 428          *
 429          * position_map[j] = k means that out[j] and in[k] mean the
 430          * same thing.
 431          */
 432         while (in_idx < in_len) {
 433                 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
 434
 435                 if (mbret == -1) {
 436                         PERROR_MESSAGE("mbtowc");
 437                         goto done;
 438                 }
 439
 440                 /* We pre-suppose that the insert will go as planned */
 441                 if (set_position_mapping(out_position_map, out_position_map_len,
 442                                          out_idx, in_idx) < 0) {
 443                         goto done;
 444                 }
 445
 446                 if (mbret == 1 &&
 447                     in[in_idx] >= ' ' &&
 448                     in[in_idx] <= '~') {
 449                         if (append_str(out, &out_idx, &out_sz, in + in_idx, 1) <
 450                             0) {
 451                                 goto done;
 452                         }
 453                 } else {
 454                         if ((tr = bsearch(&wc, translates, NUM_OF(translates),
 455                                           sizeof *translates,
 456                                           match_translate))) {
 457                                 if (append_str(out, &out_idx, &out_sz, tr->to,
 458                                                strlen(tr->to)) < 0) {
 459                                         goto done;
 460                                 }
 461                         } else {
 462                                 if (append_str(out, &out_idx, &out_sz, in +
 463                                                in_idx, mbret) < 0) {
 464                                         goto done;
 465                                 }
 466                         }
 467                 }
 468
 469                 in_idx += mbret;
 470         }
 471
 472         if (set_position_mapping(out_position_map, out_position_map_len,
 473                                  out_idx, in_len) < 0) {
 474                 goto done;
 475         }
 476
 477         (*out)[out_idx] = '\0';
 478         *out_len = out_idx;
 479         ret = 0;
 480 done:
 481
 482         return ret;
 483 }
 484
 485 /*
 486  * Read through raw and scannable, checking all wordfilters in
 487  * scannable. Where a match is detected, the corresponding postion
 488  * (via position_map) in raw is replaced by the replacement specified
 489  * by the matching wordfilter.
 490  *
 491  * Preconditions
 492  *
 493  *  - setup_sanitize_comment() has been invoked more recently than
 494  *    clean_sanitize_comment().
 495  *
 496  *  - raw is memory of length at least raw_len, valid UTF-8 text.
 497  *
 498  *  - scannable is memory of length at least scannable_len.
 499  *
 500  *  - For any j in [0, scannable_len), position_map[j] is a valid
 501  *    index into raw, or is (size_t) -1.
 502  *
 503  *  - position_map[scannable_len] = raw_len.
 504  *
 505  *  - For any j in [0, scannable_len) such that k = position_map[j]
 506  *    is not (size_t) -1, scannable[j] and raw[k] are conceptually
 507  *    the same for wordfiltering.
 508  *
 509  *  - Overwriting *out shall not cause a memory leak.
 510  *
 511  *  - out and out_len are not 0.
 512  *
 513  * Postconditions (success):
 514  *
 515  *  - *out is valid, UTF-8 text of length *out_len such that all
 516  *    non ASCII codepoints (and '<', '>', '&', '"', ''') are
 517  *    HTML-escaped.
 518  *
 519  *  - *out represents raw, except in those sections of scannable
 520  *    where a wordfilter matched.
 521  */
 522 static int wordfilter_to_html(const char *raw, const size_t raw_len, const
 523                               char *scannable, const size_t scannable_len,
 524                               size_t *position_map, char **out,
 525                               size_t *out_len)
 526 {
 527         int ret = -1;
 528
 529         /* These hold the match locations from pcre2 */
 530         uint32_t *ov_counts = 0;
 531         PCRE2_SIZE **ov_ps = 0;
 532         int *num_matches = 0;
 533         pcre2_match_data **match_data = 0;
 534         size_t raw_idx = 0;
 535         size_t scannable_idx = 0;
 536         size_t out_idx = 0;
 537         size_t best_match_pos = 0;
 538         size_t best_match_idx = 0;
 539         size_t l = 0;
 540         size_t mbret = 0;
 541
 542         if (!(ov_counts = calloc(wordfilters_num, sizeof *ov_counts))) {
 543                 PERROR_MESSAGE("calloc");
 544                 goto done;
 545         }
 546
 547         if (!(ov_ps = calloc(wordfilters_num, sizeof *ov_ps))) {
 548                 PERROR_MESSAGE("calloc");
 549                 goto done;
 550         }
 551
 552         if (!(num_matches = calloc(wordfilters_num, sizeof *num_matches))) {
 553                 PERROR_MESSAGE("calloc");
 554                 goto done;
 555         }
 556
 557         if (!(match_data = calloc(wordfilters_num, sizeof *match_data))) {
 558                 PERROR_MESSAGE("calloc");
 559                 goto done;
 560         }
 561
 562         /* First scan, before the loop */
 563         for (size_t j = 0; j < wordfilters_num; ++j) {
 564                 if (!(match_data[j] = pcre2_match_data_create_from_pattern(
 565                               wordfilters[j].code, 0))) {
 566                         PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
 567                         goto done;
 568                 }
 569
 570                 num_matches[j] = pcre2_match(wordfilters[j].code,
 571                                              (PCRE2_SPTR) scannable,
 572                                              scannable_len, scannable_idx, 0,
 573                                              match_data[j], 0);
 574         }
 575
 576 handle_next_match:
 577         best_match_pos = (size_t) -1;
 578         best_match_idx = (size_t) -1;
 579
 580         /* We've run pcre2_match() on everything. Find the soonest match */
 581         for (size_t j = 0; j < wordfilters_num; ++j) {
 582                 if (!num_matches[j]) {
 583                         continue;
 584                 }
 585
 586                 ov_ps[j] = pcre2_get_ovector_pointer(match_data[j]);
 587
 588                 if (ov_ps[j][0] >= scannable_idx &&
 589                     ov_ps[j][0] < best_match_pos) {
 590                         best_match_pos = ov_ps[j][0];
 591                         best_match_idx = j;
 592                 }
 593         }
 594
 595         if (best_match_idx == (size_t) -1) {
 596                 /* No matches. Turn the rest to html boring-like */
 597                 ret = to_html(raw, raw_len, raw_idx, out, out_len, &out_idx);
 598                 goto done;
 599         }
 600
 601         /* Figure out where in raw this match starts */
 602         l = best_match_pos;
 603
 604         while (l != (size_t) -1 &&
 605                position_map[l] == (size_t) -1) {
 606                 l--;
 607         }
 608
 609         if (l == (size_t) -1) {
 610                 ERROR_MESSAGE("Impossible condition in "
 611                               "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
 612                               raw,
 613                               best_match_pos);
 614                 goto done;
 615         }
 616
 617         /*
 618          * Now position_map[l] points to the first character in raw
 619          * that should be replaced. Fill up to that point.
 620          */
 621         if (position_map[l] &&
 622             position_map[l] > raw_idx) {
 623                 if (to_html(raw, position_map[l], raw_idx, out, out_len,
 624                             &out_idx) < 0) {
 625                         goto done;
 626                 }
 627         }
 628
 629         /* Put the substituted text in */
 630         if (to_html(wordfilters[best_match_idx].replacement,
 631                     wordfilters[best_match_idx].replacement_len, 0, out,
 632                     out_len,
 633                     &out_idx) < 0) {
 634                 goto done;
 635         }
 636
 637         /*
 638          * Figure out where we should advance to in inputs. Naively,
 639          * we want to set scannable_idx to ov_ps[best_match_idx][1]
 640          * (the first character in scannable beyond the match).
 641          * However, we have to consider the case of
 642          *
 643          *      foo！！！bar
 644          *
 645          * where "foo" -> "baz" is the only transformation. Since
 646          * some characters, like "！", are completely ignored by
 647          * the scannable transformation, the naive method would
 648          * start our scanning at the "b", skipping information.
 649          *
 650          * So, instead, we carefully find the last character in
 651          * "foo", then jump one past it. This (unfortunately)
 652          * requires a bit more manual fiddling with wide character
 653          * conversions.
 654          *
 655          */
 656         if (ov_ps[best_match_idx][1] <= scannable_idx) {
 657                 /*
 658                  * This should never happen, but let's make sure
 659                  * we always keep advancing.
 660                  */
 661                 scannable_idx++;
 662         } else {
 663                 scannable_idx = ov_ps[best_match_idx][1] - 1;
 664         }
 665
 666         l = scannable_idx;
 667
 668         while (position_map[l] == (size_t) -1) {
 669                 l--;
 670         }
 671
 672         raw_idx = position_map[l];
 673
 674         /* This is the "jump one past it" part */
 675         scannable_idx++;
 676         errno = 0;
 677         mbret = mbrlen(raw + raw_idx, MB_CUR_MAX, 0);
 678
 679         switch (mbret) {
 680         case (size_t) -2:
 681         case (size_t) -1:
 682                 PERROR_MESSAGE("mbrlen");
 683                 goto done;
 684         default:
 685                 raw_idx += mbret;
 686         }
 687
 688         /*
 689          * Now re-check all our matches and figure out which ones
 690          * need to be updated
 691          */
 692         for (size_t j = 0; j < wordfilters_num; ++j) {
 693                 if (!num_matches[j] ||
 694                     ov_ps[j][0] >= scannable_idx) {
 695                         continue;
 696                 }
 697
 698                 num_matches[j] = pcre2_match(wordfilters[j].code,
 699                                              (PCRE2_SPTR) scannable,
 700                                              scannable_len, scannable_idx, 0,
 701                                              match_data[j], 0);
 702         }
 703
 704         goto handle_next_match;
 705 done:
 706
 707         for (size_t j = 0; j < wordfilters_num; ++j) {
 708                 pcre2_match_data_free(match_data[j]);
 709                 match_data[j] = 0;
 710         }
 711
 712         free(match_data);
 713         free(num_matches);
 714         free(ov_counts);
 715         free(ov_ps);
 716
 717         return ret;
 718 }
 719
 720 /*
  721  * Read through in. Each time a match for format_replacements
  722  * (something like a newline or a quote) is found, replace it
  723  * with some HTML markup. The result is placed in *out.
 724  *
 725  * Preconditions:
 726  *
 727  *  - setup_sanitize_comment() has been invoked more recently than
 728  *    clean_sanitize_comment().
 729  *
 730  *  - in is memory of length at least in_len, valid UTF-8 text.
 731  *
 732  *  - Overwriting *out shall not cause a memory leak.
 733  *
 734  *  - out and out_len are not 0.
 735  *
 736  * Postconditions (success):
 737  *
 738  *  - *out is valid, UTF-8 text of length *out_len with sane HTML
 739  *    markup (and HTML escaped), suitable for outputting into an
 740  *    HTML file.
 741  */
 742 static int insert_html_tags(const char *in, size_t in_len, const char *board,
 743                             char **out, size_t *out_len)
 744 {
 745         int ret = -1;
 746         size_t in_idx = 0;
 747         size_t match_pos = 0;
 748         size_t after_match_pos = 0;
 749         size_t out_idx = 0;
 750         pcre2_match_data *match_data = 0;
 751         int nret = 0;
 752         PCRE2_UCHAR *tmp_1 = 0;
 753         PCRE2_SIZE tmp_1_len = 0;
 754         PCRE2_UCHAR *tmp_2 = 0;
 755         PCRE2_SIZE tmp_2_len = 0;
 756         PCRE2_UCHAR *tmp_3 = 0;
 757         PCRE2_SIZE tmp_3_len = 0;
 758         uint_fast8_t last_was_newline = 1;
 759         char *link_target = 0;
 760         size_t link_target_len = 0;
 761
 762         if (!(match_data = pcre2_match_data_create_from_pattern(
 763                       format_replacements, 0))) {
 764                 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
 765                 goto done;
 766         }
 767
 768 find_next_bit:
 769
 770         if (in_idx >= in_len) {
 771                 goto success;
 772         }
 773
 774         nret = pcre2_match(format_replacements, (PCRE2_SPTR) in, in_len, in_idx,
 775                            0, match_data, 0);
 776
 777         if (nret == PCRE2_ERROR_NOMATCH) {
 778                 ret = append_str(out, &out_idx, out_len, in + in_idx, in_len -
 779                                  in_idx);
 780                 goto done;
 781         }
 782
 783         if (nret < 0) {
 784                 PCRE2_UCHAR8 err_buf[120];
 785
 786                 pcre2_get_error_message(nret, err_buf, 120);
 787                 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
 788                               " (PCRE2 %d)", (int) (in_len - in_idx), in +
 789                               in_idx, err_buf,
 790                               nret);
 791                 goto done;
 792         }
 793
 794         pcre2_substring_free(tmp_1);
 795         pcre2_substring_free(tmp_2);
 796         pcre2_substring_free(tmp_3);
 797         free(link_target);
 798         tmp_1 = 0;
 799         tmp_2 = 0;
 800         tmp_3 = 0;
 801         link_target = 0;
 802
 803         /* We have match, stuff everything up to it in *out */
 804         match_pos = pcre2_get_ovector_pointer(match_data)[0];
 805         after_match_pos = pcre2_get_ovector_pointer(match_data)[1];
 806
 807         if (match_pos > in_idx) {
 808                 if (append_str(out, &out_idx, out_len, in + in_idx, match_pos -
 809                                in_idx) < 0) {
 810                         goto done;
 811                 }
 812
 813                 last_was_newline = 0;
 814                 in_idx = match_pos;
 815         }
 816
 817         /* Figure out what type of match. */
 818         if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "newline",
 819                                         &tmp_1, &tmp_1_len)) {
 820                 if (last_was_newline) {
 821                         if (append_const_str(out, &out_idx, out_len,
 822                                              "&nbsp;<br />") < 0) {
 823                                 goto done;
 824                         }
 825                 } else {
 826                         if (append_const_str(out, &out_idx, out_len, "<br />") <
 827                             0) {
 828                                 goto done;
 829                         }
 830                 }
 831
 832                 last_was_newline = 1;
 833                 in_idx = after_match_pos;
 834                 goto find_next_bit;
 835         }
 836
 837         last_was_newline = 0;
 838
 839         if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "quote",
 840                                         &tmp_1, &tmp_1_len)) {
 841                 if (append_const_str(out, &out_idx, out_len,
 842                                      "<span class=\"quote\">") < 0) {
 843                         goto done;
 844                 }
 845
 846                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 847                                (size_t) tmp_1_len) < 0) {
 848                         goto done;
 849                 }
 850
 851                 if (append_const_str(out, &out_idx, out_len, "</span>") < 0) {
 852                         goto done;
 853                 }
 854
 855                 in_idx = after_match_pos;
 856                 goto find_next_bit;
 857         }
 858
 859         if (!pcre2_substring_get_byname(match_data,
 860                                         (PCRE2_SPTR) "intra_postlink", &tmp_1,
 861                                         &tmp_1_len)) {
 862                 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "a_num",
 863                                                &tmp_2, &tmp_2_len)) {
 864                         goto problem_with_match;
 865                 }
 866
 867                 int found = 0;
 868
 869                 if (db_construct_post_link(board, strlen(board), (const
 870                                                                   char *) tmp_2,
 871                                            tmp_2_len, &found, &link_target,
 872                                            &link_target_len) < 0) {
 873                         goto done;
 874                 }
 875
 876                 if (!found) {
 877                         if (append_str(out, &out_idx, out_len, in + match_pos,
 878                                        after_match_pos - match_pos) < 0) {
 879                                 goto done;
 880                         }
 881
 882                         in_idx = after_match_pos;
 883                         goto find_next_bit;
 884                 }
 885
 886                 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
 887                     0) {
 888                         goto done;
 889                 }
 890
 891                 if (append_str(out, &out_idx, out_len, link_target,
 892                                link_target_len) < 0) {
 893                         goto done;
 894                 }
 895
 896                 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
 897                         goto done;
 898                 }
 899
 900                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 901                                (size_t) tmp_1_len) < 0) {
 902                         goto done;
 903                 }
 904
 905                 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
 906                         goto done;
 907                 }
 908
 909                 in_idx = after_match_pos;
 910                 goto find_next_bit;
 911         }
 912
 913         if (!pcre2_substring_get_byname(match_data,
 914                                         (PCRE2_SPTR) "inter_postlink", &tmp_1,
 915                                         &tmp_1_len)) {
 916                 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "e_num",
 917                                                &tmp_2, &tmp_2_len)) {
 918                         goto problem_with_match;
 919                 }
 920
 921                 if (pcre2_substring_get_byname(match_data,
 922                                                (PCRE2_SPTR) "e_board", &tmp_3,
 923                                                &tmp_3_len)) {
 924                         goto problem_with_match;
 925                 }
 926
 927                 int found = 0;
 928
 929                 if (db_construct_post_link((const char *) tmp_3, tmp_3_len,
 930                                            (const char *) tmp_2, tmp_2_len,
 931                                            &found, &link_target,
 932                                            &link_target_len) < 0) {
 933                         goto done;
 934                 }
 935
 936                 if (!found) {
 937                         if (append_str(out, &out_idx, out_len, in + match_pos,
 938                                        after_match_pos - match_pos) < 0) {
 939                                 goto done;
 940                         }
 941
 942                         in_idx = after_match_pos;
 943                         goto find_next_bit;
 944                 }
 945
 946                 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
 947                     0) {
 948                         goto done;
 949                 }
 950
 951                 if (append_str(out, &out_idx, out_len, link_target,
 952                                link_target_len) < 0) {
 953                         goto done;
 954                 }
 955
 956                 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
 957                         goto done;
 958                 }
 959
 960                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 961                                (size_t) tmp_1_len) < 0) {
 962                         goto done;
 963                 }
 964
 965                 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
 966                         goto done;
 967                 }
 968
 969                 in_idx = after_match_pos;
 970                 goto find_next_bit;
 971         }
 972
 973 problem_with_match:
 974
 975         /* There was some kind of match, but it went wrong. */
 976         in_idx++;
 977         goto find_next_bit;
 978 success:
 979         ret = 0;
 980 done:
 981         *out_len = out_idx;
 982         pcre2_substring_free(tmp_1);
 983         pcre2_substring_free(tmp_2);
 984         pcre2_substring_free(tmp_3);
 985         pcre2_match_data_free(match_data);
 986
 987         return ret;
 988 }
 989
 990 /*
 991  * Make sure that the contents of *pc are ready for safe injection
 992  * into the board, including HTML escaping, wordfiltering, general
 993  * formatting, and adding links.
 994  *
 995  * Preconditions
 996  *
 997  *  - setup_sanitize_comment() has been invoked more recently than
 998  *    clean_sanitize_comment().
 999  *
1000  *  - *pc has been filled out (fields like action, board, etc. have
1001  *    been populated) from the POST data.
1002  *
1003  * Postconditions (success):
1004  *
1005  *  - The prepared_XYZ fields of *pc have been filled out, and each
1006  *    is valid ASCII text, with Unicode codepoints.
1007  */
1008 int st_sanitize_text(struct post_cmd *pc, int *our_fault)
1009 {
1010         int ret = -1;
1011         size_t out_idx = 0;
1012         char *html_escaped_comment = 0;
1013         size_t html_escaped_comment_len = 0;
1014
1015         /* Flush out lurking double-free bugs */
1016         free(pc->prepared.name);
1017         pc->prepared.name = 0;
1018         pc->prepared.name_len = 0;
1019         free(pc->prepared.email);
1020         pc->prepared.email = 0;
1021         pc->prepared.email_len = 0;
1022         free(pc->prepared.subject);
1023         pc->prepared.subject = 0;
1024         pc->prepared.subject_len = 0;
1025         free(pc->prepared.comment);
1026         pc->prepared.comment = 0;
1027         pc->prepared.comment_len = 0;
1028         free(pc->prepared.file_name);
1029         pc->prepared.file_name = 0;
1030         pc->prepared.file_name_len = 0;
1031         free(pc->scannable_comment);
1032         pc->scannable_comment = 0;
1033         pc->scannable_comment_len = 0;
1034         free(pc->position_map);
1035         pc->position_map = 0;
1036         pc->position_map_len = 0;
1037         out_idx = 0;
1038
1039         if (!pc->raw.name_len) {
1040                 free(pc->raw.name);
1041
1042                 if (!(pc->raw.name = strdup("Anonymous"))) {
1043                         PERROR_MESSAGE("strdup");
1044                         goto done;
1045                 }
1046
1047                 pc->raw.name_len = strlen(pc->raw.name);
1048         }
1049
1050         if (pc->raw.name_len) {
1051                 if (to_html(pc->raw.name, pc->raw.name_len, 0,
1052                             &pc->prepared.name, &pc->prepared.name_len,
1053                             &out_idx) < 0) {
1054                         *our_fault = 1;
1055                         goto done;
1056                 }
1057         }
1058
1059         out_idx = 0;
1060
1061         if (pc->raw.email_len) {
1062                 if (to_html(pc->raw.email, pc->raw.email_len, 0,
1063                             &pc->prepared.email, &pc->prepared.email_len,
1064                             &out_idx) < 0) {
1065                         *our_fault = 1;
1066                         goto done;
1067                 }
1068         }
1069
1070         out_idx = 0;
1071
1072         if (pc->raw.tripcode_len) {
1073                 if (to_html(pc->raw.tripcode, pc->raw.tripcode_len, 0,
1074                             &pc->prepared.tripcode, &pc->prepared.tripcode_len,
1075                             &out_idx) <
1076                     0) {
1077                         *our_fault = 1;
1078                         goto done;
1079                 }
1080         }
1081
1082         out_idx = 0;
1083
1084         if (pc->raw.subject_len) {
1085                 if (to_html(pc->raw.subject, pc->raw.subject_len, 0,
1086                             &pc->prepared.subject, &pc->prepared.subject_len,
1087                             &out_idx) <
1088                     0) {
1089                         *our_fault = 1;
1090                         goto done;
1091                 }
1092         }
1093
1094         out_idx = 0;
1095
1096         if (pc->raw.file_name_len) {
1097                 if (to_html(pc->raw.file_name, pc->raw.file_name_len, 0,
1098                             &pc->prepared.file_name,
1099                             &pc->prepared.file_name_len,
1100                             &out_idx) < 0) {
1101                         *our_fault = 1;
1102                         goto done;
1103                 }
1104         }
1105
1106         if (to_scannable(pc->raw.comment, pc->raw.comment_len,
1107                          &pc->scannable_comment, &pc->scannable_comment_len,
1108                          &pc->position_map,
1109                          &pc->position_map_len)) {
1110                 *our_fault = 1;
1111                 goto done;
1112         }
1113
1114         /*
1115          * Now we do the fancy thing. Match scannable, build prepared
1116          * out of that.
1117          */
1118         if (wordfilter_to_html(pc->raw.comment, pc->raw.comment_len,
1119                                pc->scannable_comment, pc->scannable_comment_len,
1120                                pc->position_map,
1121                                &html_escaped_comment,
1122                                &html_escaped_comment_len) < 0) {
1123                 *our_fault = 1;
1124                 goto done;
1125         }
1126
1127         /*
1128          * Everything's in &#123; form, but now take care of >>123,
1129          * <br />, etc.
1130          */
1131         if (insert_html_tags(html_escaped_comment, html_escaped_comment_len,
1132                              pc->raw.board, &pc->prepared.comment,
1133                              &pc->prepared.comment_len) < 0) {
1134                 *our_fault = 1;
1135                 goto done;
1136         }
1137
1138         ret = 0;
1139 done:
1140         free(html_escaped_comment);
1141
1142         return ret;
1143 }
1144
1145 /*
1146  * Initialize any static elements needed for this file.
1147  *
1148  * Preconditions:
1149  *
1150  *  - setup_sanitize_comment() was not invoked more recently than
1151  *    clean_sanitize_comment().
1152  *
1153  * Postconditions (success):
1154  *
1155  *  - Any other function in this file may be safely called.
1156  */
1157 int setup_sanitize_comment(const struct configuration *conf)
1158 {
1159         /*
1160          * Check that the locale/libc/whatever is set up so that
1161          * UTF-8 handling can work.
1162          */
1163         int ret = -1;
1164         const char *raw =
1165                 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1166                 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1167                 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1168                 "\u2468\u0294!\u0ce2!!";
1169         const char *correct_html =
1170                 "&lt;script&gt;alert(1)&lt;/script&gt; , &#120081;&#120102;"
1171                 "&#120107;&#120094;&#12288;&#120083;&#120114;&#120111;"
1172                 "&#120109;&#120105;&#120098;&#120113;&#120108;&#120107; &amp;"
1173                 " &#9320;&#660;!&#3298;!!";
1174         const char *correct_scannable =
1175                 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1176         char *html = 0;
1177         size_t html_len = 0;
1178         char *scannable = 0;
1179         size_t scannable_len = 0;
1180         size_t *position_map = 0;
1181         size_t position_map_len = 0;
1182         size_t out_idx = 0;
1183
1184         if (to_html(raw, strlen(raw), 0, &html, &html_len, &out_idx) < 0) {
1185                 goto done;
1186         }
1187
1188         if (strcmp(html, correct_html)) {
1189                 ERROR_MESSAGE("Was expecting html conversion to yield "
1190                               "\n\n\u00ab%s\u00bb\n\nInstead, got "
1191                               "\n\n\u00ab%s\u00bb\n\n",
1192                               correct_html, html);
1193                 goto done;
1194         }
1195
1196         if (to_scannable(raw, strlen(raw), &scannable, &scannable_len,
1197                          &position_map, &position_map_len) < 0) {
1198                 goto done;
1199         }
1200
1201         if (strcmp(scannable, correct_scannable)) {
1202                 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1203                               "\n\n\u00ab%s\u00bb\n\nInstead, got "
1204                               "\n\n\u00ab%s\u00bb\n\n",
1205                               correct_scannable, scannable);
1206                 goto done;
1207         }
1208
1209         if (!(wordfilters = calloc(sizeof *wordfilters,
1210                                    conf->wordfilter_inputs_num))) {
1211                 PERROR_MESSAGE("calloc");
1212                 goto done;
1213         }
1214
1215         wordfilters_num = conf->wordfilter_inputs_num;
1216         int err_code = 0;
1217         PCRE2_SIZE err_offset = 0;
1218         PCRE2_UCHAR8 err_buf[120];
1219
1220         for (size_t j = 0; j < wordfilters_num; ++j) {
1221                 wordfilters[j].replacement =
1222                         conf->wordfilter_inputs[j].replacement;
1223                 wordfilters[j].replacement_len = strlen(
1224                         conf->wordfilter_inputs[j].replacement);
1225
1226                 if ((wordfilters[j].code = pcre2_compile(
1227                              (PCRE2_SPTR8) conf->wordfilter_inputs[j].pattern,
1228                              PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1229                              &err_offset, 0))) {
1230                         continue;
1231                 }
1232
1233                 pcre2_get_error_message(err_code, err_buf, 120);
1234                 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1235                               conf->wordfilter_inputs[j].pattern, err_buf);
1236                 goto done;
1237         }
1238
1239         const char *format_match_str =
1240
1241                 /* */
1242                 "(?<newline>\\n)"                              /* */
1243                 "|(?<intra_postlink>&gt;&gt;(?<a_num>[0-9]+))" /* */
1244                 "|(?<inter_postlink>&gt;&gt;&gt;/"             /* */
1245                 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))"         /* */
1246                 "|(?<quote>(?<![^\n])&gt;[^\n]*)";             /* */
1247
1248         if (!(format_replacements = pcre2_compile(
1249                       (PCRE2_SPTR8) format_match_str, PCRE2_ZERO_TERMINATED,
1250                       PCRE2_UTF,
1251                       &err_code, &err_offset, 0))) {
1252                 pcre2_get_error_message(err_code, err_buf, 120);
1253                 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1254                               format_match_str, err_buf);
1255                 goto done;
1256         }
1257
1258         ret = 0;
1259 done:
1260         free(html);
1261         free(scannable);
1262         free(position_map);
1263
1264         return ret;
1265 }
1266
1267 /*
1268  * Clean up any memory from this file
1269  *
1270  * Postconditions (success):
1271  *
1272  *  - Valgrind won't report any memory leaks from this file.
1273  *
1274  *  - setup_sanitize_comment() can be safely called again.
1275  */
1276 int clean_sanitize_comment(void)
1277 {
1278         for (size_t j = 0; j < wordfilters_num; ++j) {
1279                 pcre2_code_free(wordfilters[j].code);
1280                 wordfilters[j] = (struct wordfilter) { 0 };
1281         }
1282
1283         pcre2_code_free(format_replacements);
1284         format_replacements = 0;
1285         free(wordfilters);
1286         wordfilters = 0;
1287         wordfilters_num = 0;
1288
1289         return 0;
1290 }