sanitize-comment.c

   1 /*
   2  * Copyright (c) 2017-2018, De Rais <derais@cock.li>
   3  *
   4  * Permission to use, copy, modify, and/or distribute this software for
   5  * any purpose with or without fee is hereby granted, provided that the
   6  * above copyright notice and this permission notice appear in all
   7  * copies.
   8  *
   9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
  10  * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
  11  * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
  12  * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
  13  * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
  14  * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
  15  * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
  16  * PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #include <errno.h>
  19 #include <stdint.h>
  20 #include <stdio.h>
  21 #include <stdlib.h>
  22 #include <string.h>
  23 #include <time.h>
  24 #include <wchar.h>
  25
  26 #define PCRE2_CODE_UNIT_WIDTH 8
  27 #include <pcre2.h>
  28
  29 #include "macros.h"
  30 #include "rb79.h"
  31 #include "unicode-transforms.h"
  32
  33 /*
  34  * We need a way to get codepoints out of UTF-8 strings and if
  35  * wchar_t stored codepoint values, that would be great. That's
  36  * __STDC_ISO_10646__, though. You can remove this check and cross
  37  * your fingers, since rb79 will do a quick check on startup, but
  38  * please check why the C implementation doesn't define
  39  * __STDC_ISO_10646__ first.
  40  */
  41 #ifndef __STDC_ISO_10646__
  42 #error We really want __STD_ISO_10646__
  43 #endif
  44
/*
 * A wordfilter consists of a pcre2 regex and a replacement string
 */
struct wordfilter {
        /* Compiled pcre2 pattern; it is matched against scannable text */
        pcre2_code *code;

        /* Text emitted (HTML-escaped via to_html()) in place of a match */
        const char *replacement;

        /* Length of replacement, handed to to_html() as its input length */
        size_t replacement_len;
};
  54
/*
 * The wordfilter table and the number of entries in it. These are
 * constructed in setup_sanitize_comment()
 */
static struct wordfilter *wordfilters;
static size_t wordfilters_num;

/* Special matcher for quoting, newlines, linkifying, etc. */
static pcre2_code *format_replacements;
  61
  62 /*
  63  * Comparison function for struct translate.
  64  *
  65  * Preconditions:
  66  *
  67  *  - *key_v is a wchar_t.
  68  *
  69  *  - *tr_v is a struct translate object.
  70  *
  71  * Postconditions:
  72  *
  73  *  - Returns -1 (0) [1] if *key_v is less than (equal to) [greater
  74  *    than] *tr_v's starting range.
  75  */
  76 static int match_translate(const void *key_v, const void *tr_v)
  77 {
  78         const wchar_t *key = key_v;
  79         const struct translate *tr = tr_v;
  80
  81         if (*key < tr->from_s) {
  82                 return -1;
  83         } else if (*key > tr->from_t) {
  84                 return 1;
  85         }
  86
  87         return 0;
  88 }
  89
  90 /*
  91  * Add a UTF-8 sequence str onto *buf
  92  *
  93  * Preconditions:
  94  *
  95  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
  96  *    string.
  97  *
  98  *  - str is a valid ASCII (not just UTF-8) string of length str_len.
  99  *
 100  * Postconditions (success):
 101  *
 102  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 103  *    string.
 104  *
 105  *  - The contents of str have been appended to *buf (and *idx
 106  *    includes this).
 107  */
 108 static int append_str(char **buf, size_t *idx, size_t *sz, const char *str,
 109                       size_t str_len)
 110 {
 111         if (str_len + *idx >= *sz) {
 112                 void *newmem = 0;
 113                 size_t new_sz = str_len + *idx + (1 << 9);
 114
 115                 if (str_len + *idx < str_len ||
 116                     str_len + *idx + (1 << 9) < str_len + *idx) {
 117                         ERROR_MESSAGE("overflow (str_len = %zu, *idx = %zu)",
 118                                       str_len, *idx);
 119
 120                         return -1;
 121                 }
 122
 123                 if (!(newmem = realloc(*buf, new_sz))) {
 124                         PERROR_MESSAGE("realloc");
 125
 126                         return -1;
 127                 }
 128
 129                 *buf = newmem;
 130                 *sz = new_sz;
 131         }
 132
 133         strncpy(*buf + *idx, str, str_len);
 134         *(*buf + *idx + str_len) = '\0';
 135         *idx += str_len;
 136
 137         return 0;
 138 }
 139
 140 /* Dummy function for when I can't be bothered to strlen(). */
 141 static int append_const_str(char **buf, size_t *idx, size_t *len, const
 142                             char *str)
 143 {
 144         return append_str(buf, idx, len, str, strlen(str));
 145 }
 146
 147 /*
 148  * Add a single character onto *buf
 149  *
 150  * Preconditions:
 151  *
 152  *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 153  *    string.
 154  *
 155  *  - c is an ASCII character.
 156  *
 157  * Postconditions (success):
 158  *
 159  *  - *buf is memory of length *len, and up to *idx is a valid UTF-8
 160  *    string.
 161  *
 162  *  - c has been appended to *buf (and *idx includes this).
 163  */
 164 static int append_char(char **buf, size_t *idx, size_t *len, char c)
 165 {
 166         if (1 + *idx >= *len) {
 167                 void *newmem = 0;
 168                 size_t new_len = 1 + *idx + (1 << 9);
 169
 170                 if (*idx + 1 < *idx ||
 171                     *idx + 1 + (1 << 9) < *idx + 1) {
 172                         ERROR_MESSAGE("overflow (*idx = %zu)", *idx);
 173
 174                         return -1;
 175                 }
 176
 177                 if (!(newmem = realloc(*buf, new_len))) {
 178                         PERROR_MESSAGE("realloc");
 179
 180                         return -1;
 181                 }
 182
 183                 *buf = newmem;
 184                 *len = new_len;
 185         }
 186
 187         *(*buf + *idx) = c;
 188         *(*buf + *idx + 1) = '\0';
 189         *idx += 1;
 190
 191         return 0;
 192 }
 193
 194 /*
 195  * Add a Unicode codepoint onto *buf
 196  *
 197  * Preconditions:
 198  *
 199  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 200  *    string.
 201  *
 202  *  - wchar_t is a valid Unicode codepoint.
 203  *
 204  * Postconditions (success):
 205  *
 206  *  - *buf is memory of length *sz, and up to *idx is a valid UTF-8
 207  *    string.
 208  *
 209  *  - An HTML-escaped sequence like &#123; has been appended to
 210  *    *buf (and *idx includes this).
 211  */
 212 static int append_wchar_escaped(char **buf, size_t *idx, size_t *sz, wchar_t wc)
 213 {
 214         size_t l = snprintf(0, 0, "&#%ld;", (long) wc);
 215
 216         if (l + *idx >= *sz) {
 217                 void *newmem = 0;
 218                 size_t new_sz = l + *idx + (1 << 9);
 219
 220                 if (*idx + l < *idx ||
 221                     *idx + l + (1 << 9) < *idx + l) {
 222                         ERROR_MESSAGE("overflow (*idx = %zu, l = %zu)", *idx,
 223                                       l);
 224
 225                         return -1;
 226                 }
 227
 228                 if (!(newmem = realloc(*buf, new_sz))) {
 229                         PERROR_MESSAGE("realloc");
 230
 231                         return -1;
 232                 }
 233
 234                 *buf = newmem;
 235                 *sz = new_sz;
 236         }
 237
 238         sprintf(*buf + *idx, "&#%ld;", (long) wc);
 239         *idx += l;
 240
 241         return 0;
 242 }
 243
 244 /*
 245  * Ensure that (*map)[j] = k, fixing up length as appropriate.
 246  *
 247  * Preconditions
 248  *
 249  *  - *map is memory of length len.
 250  *
 251  * Postconditions (success):
 252  *
 253  *  - *map is memory of length len.
 254  *
 255  *  - (*map)[j] = k.
 256  */
 257 static int set_position_mapping(size_t **map, size_t *len, size_t j, size_t k)
 258 {
 259         if (j + 1 >= *len) {
 260                 void *newmem = 0;
 261
 262                 if (j + 2 < j ||
 263                     ((j + 2) * sizeof **map) / (j + 2) != sizeof **map) {
 264                         ERROR_MESSAGE("overflow (j = %zu)", j);
 265
 266                         return -1;
 267                 }
 268
 269                 if (!(newmem = realloc(*map, (j + 2) * sizeof **map))) {
 270                         PERROR_MESSAGE("realloc");
 271
 272                         return -1;
 273                 }
 274
 275                 *map = newmem;
 276
 277                 for (size_t l = *len; l < j + 2; ++l) {
 278                         (*map)[l] = ((size_t) -1);
 279                 }
 280
 281                 *len = j + 2;
 282         }
 283
 284         (*map)[j] = k;
 285
 286         return 0;
 287 }
 288
 289 /*
 290  * HTML-escape in to *out.
 291  *
 292  * Preconditions
 293  *
 294  *  - in is memory of at least length in_len, valid UTF-8
 295  *    text.
 296  *
 297  *  - *out is memory of at least length *out_len (if *out_len = 0,
 298  *    *out may be 0), valid UTF-8 text.
 299  *
 300  *  - Overwriting *out and *out_len  shall not cause a memory leak.
 301  *
 302  *  - out, out_len, and out_idx are not 0.
 303  *
 304  * Postconditions (success):
 305  *
 306  *  - *out is memory of at least length *out_len, valid UTF-8 text.
 307  *
 308  *  - A stretch of HTML-escaped ASCII text representing in has been
 309  *    added to *out at the position that was *out_idx.
 310  *
 311  *  - *out_idx has been updated to point to the end of this stretch.
 312  *
 313  *  - If necessary, *out_len has been updated.
 314  */
 315 static int to_html(const char *in, const size_t in_len, size_t in_idx,
 316                    char **out, size_t *out_len, size_t *out_idx)
 317 {
 318         int ret = -1;
 319         wchar_t wc = 0;
 320         int mbret = 0;
 321         size_t out_sz = 0;
 322         size_t initial_out_idx = *out_idx;
 323
 324         if (!*out) {
 325                 if (!(*out = malloc(1))) {
 326                         PERROR_MESSAGE("malloc");
 327                         goto done;
 328                 }
 329
 330                 out_sz = 1;
 331                 *out_len = 0;
 332                 (*out)[0] = '\0';
 333         }
 334
 335         /*
 336          * XXX: If you make this multithreaded, be sure to use
 337          * mbrtowc(3) here!
 338          */
 339         while (in_idx < in_len &&
 340                in[in_idx]) {
 341                 /* Extract next character */
 342                 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
 343
 344                 if (mbret == -1) {
 345                         PERROR_MESSAGE("mbtowc");
 346                         goto done;
 347                 }
 348
 349                 if (wc == L'&') {
 350                         ret = append_str(out, out_idx, &out_sz, "&amp;", 5);
 351                 } else if (wc == L'"') {
 352                         ret = append_str(out, out_idx, &out_sz, "&quot;", 6);
 353                 } else if (wc == L'\'') {
 354                         ret = append_str(out, out_idx, &out_sz, "&apos;", 6);
 355                 } else if (wc == L'<') {
 356                         ret = append_str(out, out_idx, &out_sz, "&lt;", 4);
 357                 } else if (wc == L'>') {
 358                         ret = append_str(out, out_idx, &out_sz, "&gt;", 4);
 359                 } else if (mbret == 1 &&
 360                            in[in_idx] >= ' ' &&
 361                            in[in_idx] <= '~') {
 362                         ret = append_char(out, out_idx, &out_sz, in[in_idx]);
 363                 } else if (mbret == 1 &&
 364                            in[in_idx] == '\r') {
 365                         ret = 0;
 366                 } else if (mbret == 1 &&
 367                            in[in_idx] == '\n') {
 368                         ret = append_char(out, out_idx, &out_sz, in[in_idx]);
 369                 } else {
 370                         ret = append_wchar_escaped(out, out_idx, &out_sz, wc);
 371                 }
 372
 373                 in_idx += mbret;
 374
 375                 if (ret < 0) {
 376                         goto done;
 377                 }
 378         }
 379
 380         *out_len = *out_len + (*out_idx - initial_out_idx);
 381         ret = 0;
 382 done:
 383
 384         return ret;
 385 }
 386
 387 /*
 388  * From in construct *out, which is a codepoint-for-codepoint
 389  * translation following the rules of unicode-transforms.h. The
 390  * result is that *out can be matched with normal regex, even if
 391  * in contains obfuscatory Unicode bullshit.
 392  *
 393  * Preconditions
 394  *
 395  *  - setup_sanitize_comment() has been invoked more recently than
 396  *    clean_sanitize_comment().
 397  *
 398  *  - in is memory of at least length in_len, valid UTF-8 text.
 399  *
 400  *  - Overwriting *out and *out_position_map shall not cause a
 401  *    memory leak.
 402  *
 403  *  - out, out_len, out_position_map, and out_position_map_len are
 404  *    not 0.
 405  *
 406  * Postconditions (success):
 407  *
 408  *  - *out is valid, UTF-8 text of length *out_len.
 409  *
 410  *  - For every j in [0, *out_len) such that (*out)[j] starts a
 411  *    codepoint, in[*(position_map)[j]] is the start of the
 412  *    corresponding codepoint.
 413  *
 414  *  - (*position_map)[*out_len] = in_len.
 415  */
 416 static int to_scannable(const char *in, size_t in_len, char **out,
 417                         size_t *out_len, size_t **out_position_map,
 418                         size_t *out_position_map_len)
 419 {
 420         int ret = -1;
 421         wchar_t wc = 0;
 422         size_t in_idx = 0;
 423         size_t out_idx = 0;
 424         int mbret = 0;
 425         struct translate *tr = 0;
 426         size_t out_sz = 0;
 427
 428         if (!*out) {
 429                 if (!(*out = malloc(1))) {
 430                         PERROR_MESSAGE("malloc");
 431                         goto done;
 432                 }
 433
 434                 out_sz = 1;
 435                 *out_len = 0;
 436                 (*out)[0] = '\0';
 437         }
 438
 439         /*
 440          * Position_map is here to make wordfiltering work. Suppose in is
 441          *
 442          *     Ｉ  ｔｈｉｎｋ  Ｎｉｎａ  Ｐｕｒｐｌｅｔｏｎ  ｄｉｄ
 443          *     ｎｏｔｈｉｎｇ  ｗｒｏｎｇ
 444          *
 445          * and a wordfilter /Nina Purpleton/i -> "worst girl" is
 446          * in effect. Then *out will be
 447          *
 448          *      I think Nina Purpleton did nothing wrong
 449          *
 450          * The message should, of course, be filtered to
 451          *
 452          *     Ｉ  ｔｈｉｎｋ worst girl ｄｉｄ ｎｏｔｈｉｎｇ
 453          *     ｗｒｏｎｇ
 454          *
 455          * In order to do that, it would be necessary to have a map
 456          * from in to *out on the byte level, since the wordfilter
 457          * will only be run against *out.
 458          *
 459          * position_map[j] = k means that out[j] and in[k] mean the
 460          * same thing.
 461          */
 462         while (in_idx < in_len) {
 463                 mbret = mbtowc(&wc, in + in_idx, in_len - in_idx);
 464
 465                 if (mbret == -1) {
 466                         PERROR_MESSAGE("mbtowc");
 467                         goto done;
 468                 }
 469
 470                 /* We pre-suppose that the insert will go as planned */
 471                 if (set_position_mapping(out_position_map, out_position_map_len,
 472                                          out_idx, in_idx) < 0) {
 473                         goto done;
 474                 }
 475
 476                 if (mbret == 1 &&
 477                     in[in_idx] >= ' ' &&
 478                     in[in_idx] <= '~') {
 479                         if (append_str(out, &out_idx, &out_sz, in + in_idx, 1) <
 480                             0) {
 481                                 goto done;
 482                         }
 483                 } else {
 484                         if ((tr = bsearch(&wc, translates, NUM_OF(translates),
 485                                           sizeof *translates,
 486                                           match_translate))) {
 487                                 if (append_str(out, &out_idx, &out_sz, tr->to,
 488                                                strlen(tr->to)) < 0) {
 489                                         goto done;
 490                                 }
 491                         } else {
 492                                 if (append_str(out, &out_idx, &out_sz, in +
 493                                                in_idx, mbret) < 0) {
 494                                         goto done;
 495                                 }
 496                         }
 497                 }
 498
 499                 in_idx += mbret;
 500         }
 501
 502         if (set_position_mapping(out_position_map, out_position_map_len,
 503                                  out_idx, in_len) < 0) {
 504                 goto done;
 505         }
 506
 507         (*out)[out_idx] = '\0';
 508         *out_len = out_idx;
 509         ret = 0;
 510 done:
 511
 512         return ret;
 513 }
 514
 515 /*
 516  * Read through raw and scannable, checking all wordfilters in
 517  * scannable. Where a match is detected, the corresponding postion
 518  * (via position_map) in raw is replaced by the replacement specified
 519  * by the matching wordfilter.
 520  *
 521  * Preconditions
 522  *
 523  *  - setup_sanitize_comment() has been invoked more recently than
 524  *    clean_sanitize_comment().
 525  *
 526  *  - raw is memory of length at least raw_len, valid UTF-8 text.
 527  *
 528  *  - scannable is memory of length at least scannable_len.
 529  *
 530  *  - For any j in [0, scannable_len), position_map[j] is a valid
 531  *    index into raw, or is (size_t) -1.
 532  *
 533  *  - position_map[scannable_len] = raw_len.
 534  *
 535  *  - For any j in [0, scannable_len) such that k = position_map[j]
 536  *    is not (size_t) -1, scannable[j] and raw[k] are conceptually
 537  *    the same for wordfiltering.
 538  *
 539  *  - Overwriting *out shall not cause a memory leak.
 540  *
 541  *  - out and out_len are not 0.
 542  *
 543  * Postconditions (success):
 544  *
 545  *  - *out is valid, UTF-8 text of length *out_len such that all
 546  *    non ASCII codepoints (and '<', '>', '&', '"', ''') are
 547  *    HTML-escaped.
 548  *
 549  *  - *out represents raw, except in those sections of scannable
 550  *    where a wordfilter matched.
 551  */
 552 static int wordfilter_to_html(const char *raw, const size_t raw_len, const
 553                               char *scannable, const size_t scannable_len,
 554                               size_t *position_map, char **out,
 555                               size_t *out_len)
 556 {
 557         int ret = -1;
 558
 559         /* These hold the match locations from pcre2 */
 560         uint32_t *ov_counts = 0;
 561         PCRE2_SIZE **ov_ps = 0;
 562         int *num_matches = 0;
 563         pcre2_match_data **match_data = 0;
 564         size_t raw_idx = 0;
 565         size_t scannable_idx = 0;
 566         size_t out_idx = 0;
 567         size_t best_match_pos = 0;
 568         size_t best_match_idx = 0;
 569         size_t l = 0;
 570         size_t mbret = 0;
 571
 572         if (!(ov_counts = calloc(wordfilters_num, sizeof *ov_counts))) {
 573                 PERROR_MESSAGE("calloc");
 574                 goto done;
 575         }
 576
 577         if (!(ov_ps = calloc(wordfilters_num, sizeof *ov_ps))) {
 578                 PERROR_MESSAGE("calloc");
 579                 goto done;
 580         }
 581
 582         if (!(num_matches = calloc(wordfilters_num, sizeof *num_matches))) {
 583                 PERROR_MESSAGE("calloc");
 584                 goto done;
 585         }
 586
 587         if (!(match_data = calloc(wordfilters_num, sizeof *match_data))) {
 588                 PERROR_MESSAGE("calloc");
 589                 goto done;
 590         }
 591
 592         /* First scan, before the loop */
 593         for (size_t j = 0; j < wordfilters_num; ++j) {
 594                 if (!(match_data[j] = pcre2_match_data_create_from_pattern(
 595                               wordfilters[j].code, 0))) {
 596                         PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
 597                         goto done;
 598                 }
 599
 600                 num_matches[j] = pcre2_match(wordfilters[j].code,
 601                                              (PCRE2_SPTR) scannable,
 602                                              scannable_len, scannable_idx, 0,
 603                                              match_data[j], 0);
 604         }
 605
 606 handle_next_match:
 607         best_match_pos = (size_t) -1;
 608         best_match_idx = (size_t) -1;
 609
 610         /* We've run pcre2_match() on everything. Find the soonest match */
 611         for (size_t j = 0; j < wordfilters_num; ++j) {
 612                 if (!num_matches[j]) {
 613                         continue;
 614                 }
 615
 616                 ov_ps[j] = pcre2_get_ovector_pointer(match_data[j]);
 617
 618                 if (ov_ps[j][0] >= scannable_idx &&
 619                     ov_ps[j][0] < best_match_pos) {
 620                         best_match_pos = ov_ps[j][0];
 621                         best_match_idx = j;
 622                 }
 623         }
 624
 625         if (best_match_idx == (size_t) -1) {
 626                 /* No matches. Turn the rest to html boring-like */
 627                 ret = to_html(raw, raw_len, raw_idx, out, out_len, &out_idx);
 628                 goto done;
 629         }
 630
 631         /* Figure out where in raw this match starts */
 632         l = best_match_pos;
 633
 634         while (l != (size_t) -1 &&
 635                position_map[l] == (size_t) -1) {
 636                 l--;
 637         }
 638
 639         if (l == (size_t) -1) {
 640                 ERROR_MESSAGE("Impossible condition in "
 641                               "wordfilter_to_html: raw=\"%s\", best_match_pos = %zu",
 642                               raw,
 643                               best_match_pos);
 644                 goto done;
 645         }
 646
 647         /*
 648          * Now position_map[l] points to the first character in raw
 649          * that should be replaced. Fill up to that point.
 650          */
 651         if (position_map[l] &&
 652             position_map[l] > raw_idx) {
 653                 if (to_html(raw, position_map[l], raw_idx, out, out_len,
 654                             &out_idx) < 0) {
 655                         goto done;
 656                 }
 657         }
 658
 659         /* Put the substituted text in */
 660         if (to_html(wordfilters[best_match_idx].replacement,
 661                     wordfilters[best_match_idx].replacement_len, 0, out,
 662                     out_len,
 663                     &out_idx) < 0) {
 664                 goto done;
 665         }
 666
 667         /*
 668          * Figure out where we should advance to in inputs. Naively,
 669          * we want to set scannable_idx to ov_ps[best_match_idx][1]
 670          * (the first character in scannable beyond the match).
 671          * However, we have to consider the case of
 672          *
 673          *      foo！！！bar
 674          *
 675          * where "foo" -> "baz" is the only transformation. Since
 676          * some characters, like "！", are completely ignored by
 677          * the scannable transformation, the naive method would
 678          * start our scanning at the "b", skipping information.
 679          *
 680          * So, instead, we carefully find the last character in
 681          * "foo", then jump one past it. This (unfortunately)
 682          * requires a bit more manual fiddling with wide character
 683          * conversions.
 684          *
 685          */
 686         if (ov_ps[best_match_idx][1] <= scannable_idx) {
 687                 /*
 688                  * This should never happen, but let's make sure
 689                  * we always keep advancing.
 690                  */
 691                 scannable_idx++;
 692         } else {
 693                 scannable_idx = ov_ps[best_match_idx][1] - 1;
 694         }
 695
 696         l = scannable_idx;
 697
 698         while (position_map[l] == (size_t) -1) {
 699                 l--;
 700         }
 701
 702         raw_idx = position_map[l];
 703
 704         /* This is the "jump one past it" part */
 705         scannable_idx++;
 706         errno = 0;
 707         mbret = mbrlen(raw + raw_idx, MB_CUR_MAX, 0);
 708
 709         switch (mbret) {
 710         case (size_t) -2:
 711         case (size_t) -1:
 712                 PERROR_MESSAGE("mbrlen");
 713                 goto done;
 714         default:
 715                 raw_idx += mbret;
 716         }
 717
 718         /*
 719          * Now re-check all our matches and figure out which ones
 720          * need to be updated
 721          */
 722         for (size_t j = 0; j < wordfilters_num; ++j) {
 723                 if (!num_matches[j] ||
 724                     ov_ps[j][0] >= scannable_idx) {
 725                         continue;
 726                 }
 727
 728                 num_matches[j] = pcre2_match(wordfilters[j].code,
 729                                              (PCRE2_SPTR) scannable,
 730                                              scannable_len, scannable_idx, 0,
 731                                              match_data[j], 0);
 732         }
 733
 734         goto handle_next_match;
 735 done:
 736
 737         for (size_t j = 0; j < wordfilters_num; ++j) {
 738                 pcre2_match_data_free(match_data[j]);
 739                 match_data[j] = 0;
 740         }
 741
 742         free(match_data);
 743         free(num_matches);
 744         free(ov_counts);
 745         free(ov_ps);
 746
 747         return ret;
 748 }
 749
 750 /*
 * Read through in. Each time a match for format_replacements
 * (something like a newline or a quote) is found, replace it
 * with some HTML markup. The result is placed in *out.
 754  *
 755  * Preconditions:
 756  *
 757  *  - setup_sanitize_comment() has been invoked more recently than
 758  *    clean_sanitize_comment().
 759  *
 760  *  - in is memory of length at least in_len, valid UTF-8 text.
 761  *
 762  *  - Overwriting *out shall not cause a memory leak.
 763  *
 764  *  - out and out_len are not 0.
 765  *
 766  * Postconditions (success):
 767  *
 768  *  - *out is valid, UTF-8 text of length *out_len with sane HTML
 769  *    markup (and HTML escaped), suitable for outputting into an
 770  *    HTML file.
 771  */
 772 static int insert_html_tags(const char *in, size_t in_len, const char *board,
 773                             char **out, size_t *out_len)
 774 {
 775         int ret = -1;
 776         size_t in_idx = 0;
 777         size_t match_pos = 0;
 778         size_t after_match_pos = 0;
 779         size_t out_idx = 0;
 780         pcre2_match_data *match_data = 0;
 781         int nret = 0;
 782         PCRE2_UCHAR *tmp_1 = 0;
 783         PCRE2_SIZE tmp_1_len = 0;
 784         PCRE2_UCHAR *tmp_2 = 0;
 785         PCRE2_SIZE tmp_2_len = 0;
 786         PCRE2_UCHAR *tmp_3 = 0;
 787         PCRE2_SIZE tmp_3_len = 0;
 788         uint_fast8_t last_was_newline = 1;
 789         char *link_target = 0;
 790         size_t link_target_len = 0;
 791
 792         if (!(match_data = pcre2_match_data_create_from_pattern(
 793                       format_replacements, 0))) {
 794                 PERROR_MESSAGE("pcre2_match_data_create_from_pattern");
 795                 goto done;
 796         }
 797
 798 find_next_bit:
 799
 800         if (in_idx >= in_len) {
 801                 goto success;
 802         }
 803
 804         nret = pcre2_match(format_replacements, (PCRE2_SPTR) in, in_len, in_idx,
 805                            0, match_data, 0);
 806
 807         if (nret == PCRE2_ERROR_NOMATCH) {
 808                 ret = append_str(out, &out_idx, out_len, in + in_idx, in_len -
 809                                  in_idx);
 810                 goto done;
 811         }
 812
 813         if (nret < 0) {
 814                 PCRE2_UCHAR8 err_buf[120];
 815
 816                 pcre2_get_error_message(nret, err_buf, 120);
 817                 ERROR_MESSAGE("pcre2_match: error while matching \"%.*s\": %s"
 818                               " (PCRE2 %d)", (int) (in_len - in_idx), in +
 819                               in_idx, err_buf,
 820                               nret);
 821                 goto done;
 822         }
 823
 824         pcre2_substring_free(tmp_1);
 825         pcre2_substring_free(tmp_2);
 826         pcre2_substring_free(tmp_3);
 827         free(link_target);
 828         tmp_1 = 0;
 829         tmp_2 = 0;
 830         tmp_3 = 0;
 831         link_target = 0;
 832
 833         /* We have match, stuff everything up to it in *out */
 834         match_pos = pcre2_get_ovector_pointer(match_data)[0];
 835         after_match_pos = pcre2_get_ovector_pointer(match_data)[1];
 836
 837         if (match_pos > in_idx) {
 838                 if (append_str(out, &out_idx, out_len, in + in_idx, match_pos -
 839                                in_idx) < 0) {
 840                         goto done;
 841                 }
 842
 843                 last_was_newline = 0;
 844                 in_idx = match_pos;
 845         }
 846
 847         /* Figure out what type of match. */
 848         if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "newline",
 849                                         &tmp_1, &tmp_1_len)) {
 850                 if (last_was_newline) {
 851                         if (append_const_str(out, &out_idx, out_len,
 852                                              "&nbsp;<br />") < 0) {
 853                                 goto done;
 854                         }
 855                 } else {
 856                         if (append_const_str(out, &out_idx, out_len, "<br />") <
 857                             0) {
 858                                 goto done;
 859                         }
 860                 }
 861
 862                 last_was_newline = 1;
 863                 in_idx = after_match_pos;
 864                 goto find_next_bit;
 865         }
 866
 867         last_was_newline = 0;
 868
 869         if (!pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "quote",
 870                                         &tmp_1, &tmp_1_len)) {
 871                 if (append_const_str(out, &out_idx, out_len,
 872                                      "<span class=\"quote\">") < 0) {
 873                         goto done;
 874                 }
 875
 876                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 877                                (size_t) tmp_1_len) < 0) {
 878                         goto done;
 879                 }
 880
 881                 if (append_const_str(out, &out_idx, out_len, "</span>") < 0) {
 882                         goto done;
 883                 }
 884
 885                 in_idx = after_match_pos;
 886                 goto find_next_bit;
 887         }
 888
 889         if (!pcre2_substring_get_byname(match_data,
 890                                         (PCRE2_SPTR) "intra_postlink", &tmp_1,
 891                                         &tmp_1_len)) {
 892                 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "a_num",
 893                                                &tmp_2, &tmp_2_len)) {
 894                         goto problem_with_match;
 895                 }
 896
 897                 int found = 0;
 898
 899                 if (db_construct_post_link(board, strlen(board), (const
 900                                                                   char *) tmp_2,
 901                                            tmp_2_len, &found, &link_target,
 902                                            &link_target_len) < 0) {
 903                         goto done;
 904                 }
 905
 906                 if (!found) {
 907                         if (append_str(out, &out_idx, out_len, in + match_pos,
 908                                        after_match_pos - match_pos) < 0) {
 909                                 goto done;
 910                         }
 911
 912                         in_idx = after_match_pos;
 913                         goto find_next_bit;
 914                 }
 915
 916                 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
 917                     0) {
 918                         goto done;
 919                 }
 920
 921                 if (append_str(out, &out_idx, out_len, link_target,
 922                                link_target_len) < 0) {
 923                         goto done;
 924                 }
 925
 926                 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
 927                         goto done;
 928                 }
 929
 930                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 931                                (size_t) tmp_1_len) < 0) {
 932                         goto done;
 933                 }
 934
 935                 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
 936                         goto done;
 937                 }
 938
 939                 in_idx = after_match_pos;
 940                 goto find_next_bit;
 941         }
 942
 943         if (!pcre2_substring_get_byname(match_data,
 944                                         (PCRE2_SPTR) "inter_postlink", &tmp_1,
 945                                         &tmp_1_len)) {
 946                 if (pcre2_substring_get_byname(match_data, (PCRE2_SPTR) "e_num",
 947                                                &tmp_2, &tmp_2_len)) {
 948                         goto problem_with_match;
 949                 }
 950
 951                 if (pcre2_substring_get_byname(match_data,
 952                                                (PCRE2_SPTR) "e_board", &tmp_3,
 953                                                &tmp_3_len)) {
 954                         goto problem_with_match;
 955                 }
 956
 957                 int found = 0;
 958
 959                 if (db_construct_post_link((const char *) tmp_3, tmp_3_len,
 960                                            (const char *) tmp_2, tmp_2_len,
 961                                            &found, &link_target,
 962                                            &link_target_len) < 0) {
 963                         goto done;
 964                 }
 965
 966                 if (!found) {
 967                         if (append_str(out, &out_idx, out_len, in + match_pos,
 968                                        after_match_pos - match_pos) < 0) {
 969                                 goto done;
 970                         }
 971
 972                         in_idx = after_match_pos;
 973                         goto find_next_bit;
 974                 }
 975
 976                 if (append_const_str(out, &out_idx, out_len, "<a href=\"") <
 977                     0) {
 978                         goto done;
 979                 }
 980
 981                 if (append_str(out, &out_idx, out_len, link_target,
 982                                link_target_len) < 0) {
 983                         goto done;
 984                 }
 985
 986                 if (append_const_str(out, &out_idx, out_len, "\">") < 0) {
 987                         goto done;
 988                 }
 989
 990                 if (append_str(out, &out_idx, out_len, (const char *) tmp_1,
 991                                (size_t) tmp_1_len) < 0) {
 992                         goto done;
 993                 }
 994
 995                 if (append_const_str(out, &out_idx, out_len, "</a>") < 0) {
 996                         goto done;
 997                 }
 998
 999                 in_idx = after_match_pos;
1000                 goto find_next_bit;
1001         }
1002
1003 problem_with_match:
1004
1005         /* There was some kind of match, but it went wrong. */
1006         in_idx++;
1007         goto find_next_bit;
1008 success:
1009         ret = 0;
1010 done:
1011         *out_len = out_idx;
1012         pcre2_substring_free(tmp_1);
1013         pcre2_substring_free(tmp_2);
1014         pcre2_substring_free(tmp_3);
1015         pcre2_match_data_free(match_data);
1016
1017         return ret;
1018 }
1019
1020 /*
1021  * Make sure that the contents of *pc are ready for safe injection
1022  * into the board, including HTML escaping, wordfiltering, general
1023  * formatting, and adding links.
1024  *
1025  * Preconditions
1026  *
1027  *  - setup_sanitize_comment() has been invoked more recently than
1028  *    clean_sanitize_comment().
1029  *
1030  *  - *pc has been filled out (fields like action, board, etc. have
1031  *    been populated) from the POST data.
1032  *
1033  * Postconditions (success):
1034  *
1035  *  - The prepared_XYZ fields of *pc have been filled out, and each
1036  *    is valid ASCII text, with Unicode codepoints.
1037  */
1038 int st_sanitize_text(struct post_cmd *pc, int *our_fault)
1039 {
1040         int ret = -1;
1041         size_t out_idx = 0;
1042         char *html_escaped_comment = 0;
1043         size_t html_escaped_comment_len = 0;
1044
1045         /* Flush out lurking double-free bugs */
1046         free(pc->prepared.name);
1047         pc->prepared.name = 0;
1048         pc->prepared.name_len = 0;
1049         free(pc->prepared.email);
1050         pc->prepared.email = 0;
1051         pc->prepared.email_len = 0;
1052         free(pc->prepared.subject);
1053         pc->prepared.subject = 0;
1054         pc->prepared.subject_len = 0;
1055         free(pc->prepared.comment);
1056         pc->prepared.comment = 0;
1057         pc->prepared.comment_len = 0;
1058         free(pc->prepared.file_name);
1059         pc->prepared.file_name = 0;
1060         pc->prepared.file_name_len = 0;
1061         free(pc->scannable_comment);
1062         pc->scannable_comment = 0;
1063         pc->scannable_comment_len = 0;
1064         free(pc->position_map);
1065         pc->position_map = 0;
1066         pc->position_map_len = 0;
1067         out_idx = 0;
1068
1069         if (!pc->raw.name_len) {
1070                 free(pc->raw.name);
1071
1072                 if (!(pc->raw.name = strdup("Anonymous"))) {
1073                         PERROR_MESSAGE("strdup");
1074                         goto done;
1075                 }
1076
1077                 pc->raw.name_len = strlen(pc->raw.name);
1078         }
1079
1080         if (pc->raw.name_len) {
1081                 if (to_html(pc->raw.name, pc->raw.name_len, 0,
1082                             &pc->prepared.name, &pc->prepared.name_len,
1083                             &out_idx) < 0) {
1084                         *our_fault = 1;
1085                         goto done;
1086                 }
1087         }
1088
1089         out_idx = 0;
1090
1091         if (pc->raw.email_len) {
1092                 if (to_html(pc->raw.email, pc->raw.email_len, 0,
1093                             &pc->prepared.email, &pc->prepared.email_len,
1094                             &out_idx) < 0) {
1095                         *our_fault = 1;
1096                         goto done;
1097                 }
1098         }
1099
1100         out_idx = 0;
1101
1102         if (pc->raw.tripcode_len) {
1103                 if (to_html(pc->raw.tripcode, pc->raw.tripcode_len, 0,
1104                             &pc->prepared.tripcode, &pc->prepared.tripcode_len,
1105                             &out_idx) <
1106                     0) {
1107                         *our_fault = 1;
1108                         goto done;
1109                 }
1110         }
1111
1112         out_idx = 0;
1113
1114         if (pc->raw.subject_len) {
1115                 if (to_html(pc->raw.subject, pc->raw.subject_len, 0,
1116                             &pc->prepared.subject, &pc->prepared.subject_len,
1117                             &out_idx) <
1118                     0) {
1119                         *our_fault = 1;
1120                         goto done;
1121                 }
1122         }
1123
1124         out_idx = 0;
1125
1126         if (pc->raw.file_name_len) {
1127                 if (to_html(pc->raw.file_name, pc->raw.file_name_len, 0,
1128                             &pc->prepared.file_name,
1129                             &pc->prepared.file_name_len,
1130                             &out_idx) < 0) {
1131                         *our_fault = 1;
1132                         goto done;
1133                 }
1134         }
1135
1136         if (to_scannable(pc->raw.comment, pc->raw.comment_len,
1137                          &pc->scannable_comment, &pc->scannable_comment_len,
1138                          &pc->position_map,
1139                          &pc->position_map_len)) {
1140                 *our_fault = 1;
1141                 goto done;
1142         }
1143
1144         /*
1145          * Now we do the fancy thing. Match scannable, build prepared
1146          * out of that.
1147          */
1148         if (wordfilter_to_html(pc->raw.comment, pc->raw.comment_len,
1149                                pc->scannable_comment, pc->scannable_comment_len,
1150                                pc->position_map,
1151                                &html_escaped_comment,
1152                                &html_escaped_comment_len) < 0) {
1153                 *our_fault = 1;
1154                 goto done;
1155         }
1156
1157         /*
1158          * Everything's in &#123; form, but now take care of >>123,
1159          * <br />, etc.
1160          */
1161         if (insert_html_tags(html_escaped_comment, html_escaped_comment_len,
1162                              pc->raw.board, &pc->prepared.comment,
1163                              &pc->prepared.comment_len) < 0) {
1164                 *our_fault = 1;
1165                 goto done;
1166         }
1167
1168         ret = 0;
1169 done:
1170         free(html_escaped_comment);
1171
1172         return ret;
1173 }
1174
1175 /*
1176  * Initialize any static elements needed for this file.
1177  *
1178  * Preconditions:
1179  *
1180  *  - setup_sanitize_comment() was not invoked more recently than
1181  *    clean_sanitize_comment().
1182  *
1183  * Postconditions (success):
1184  *
1185  *  - Any other function in this file may be safely called.
1186  */
1187 int setup_sanitize_comment(const struct configuration *conf)
1188 {
1189         /*
1190          * Check that the locale/libc/whatever is set up so that
1191          * UTF-8 handling can work.
1192          */
1193         int ret = -1;
1194         const char *raw =
1195                 "<script>alert(1)</script> , \U0001d511\U0001d526\U0001d52b"
1196                 "\U0001d51e\u3000\U0001d513\U0001d532\U0001d52f\U0001d52d"
1197                 "\U0001d529\U0001d522\U0001d531\U0001d52c\U0001d52b & "
1198                 "\u2468\u0294!\u0ce2!!";
1199         const char *correct_html =
1200                 "&lt;script&gt;alert(1)&lt;/script&gt; , &#120081;&#120102;"
1201                 "&#120107;&#120094;&#12288;&#120083;&#120114;&#120111;"
1202                 "&#120109;&#120105;&#120098;&#120113;&#120108;&#120107; &amp;"
1203                 " &#9320;&#660;!&#3298;!!";
1204         const char *correct_scannable =
1205                 "<script>alert(1)</script> , Nina Purpleton & 9!!!";
1206         char *html = 0;
1207         size_t html_len = 0;
1208         char *scannable = 0;
1209         size_t scannable_len = 0;
1210         size_t *position_map = 0;
1211         size_t position_map_len = 0;
1212         size_t out_idx = 0;
1213
1214         if (to_html(raw, strlen(raw), 0, &html, &html_len, &out_idx) < 0) {
1215                 goto done;
1216         }
1217
1218         if (strcmp(html, correct_html)) {
1219                 ERROR_MESSAGE("Was expecting html conversion to yield "
1220                               "\n\n\u00ab%s\u00bb\n\nInstead, got "
1221                               "\n\n\u00ab%s\u00bb\n\n",
1222                               correct_html, html);
1223                 goto done;
1224         }
1225
1226         if (to_scannable(raw, strlen(raw), &scannable, &scannable_len,
1227                          &position_map, &position_map_len) < 0) {
1228                 goto done;
1229         }
1230
1231         if (strcmp(scannable, correct_scannable)) {
1232                 ERROR_MESSAGE("Was expecting scannable conversion to yield "
1233                               "\n\n\u00ab%s\u00bb\n\nInstead, got "
1234                               "\n\n\u00ab%s\u00bb\n\n",
1235                               correct_scannable, scannable);
1236                 goto done;
1237         }
1238
1239         if (!(wordfilters = calloc(conf->wordfilter_inputs_num,
1240                                    sizeof *wordfilters))) {
1241                 PERROR_MESSAGE("calloc");
1242                 goto done;
1243         }
1244
1245         wordfilters_num = conf->wordfilter_inputs_num;
1246         int err_code = 0;
1247         PCRE2_SIZE err_offset = 0;
1248         PCRE2_UCHAR8 err_buf[120];
1249
1250         for (size_t j = 0; j < wordfilters_num; ++j) {
1251                 wordfilters[j].replacement =
1252                         conf->wordfilter_inputs[j].replacement;
1253                 wordfilters[j].replacement_len = strlen(
1254                         conf->wordfilter_inputs[j].replacement);
1255
1256                 if ((wordfilters[j].code = pcre2_compile(
1257                              (PCRE2_SPTR8) conf->wordfilter_inputs[j].pattern,
1258                              PCRE2_ZERO_TERMINATED, PCRE2_UTF, &err_code,
1259                              &err_offset, 0))) {
1260                         continue;
1261                 }
1262
1263                 pcre2_get_error_message(err_code, err_buf, 120);
1264                 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1265                               conf->wordfilter_inputs[j].pattern, err_buf);
1266                 goto done;
1267         }
1268
1269         const char *format_match_str =
1270
1271                 /* */
1272                 "(?<newline>\\n)"                              /* */
1273                 "|(?<intra_postlink>&gt;&gt;(?<a_num>[0-9]+))" /* */
1274                 "|(?<inter_postlink>&gt;&gt;&gt;/"             /* */
1275                 "(?<e_board>[^ /]+)/(?<e_num>[0-9]+))"         /* */
1276                 "|(?<quote>(?<![^\n])&gt;[^\n]*)";             /* */
1277
1278         if (!(format_replacements = pcre2_compile(
1279                       (PCRE2_SPTR8) format_match_str, PCRE2_ZERO_TERMINATED,
1280                       PCRE2_UTF,
1281                       &err_code, &err_offset, 0))) {
1282                 pcre2_get_error_message(err_code, err_buf, 120);
1283                 ERROR_MESSAGE("pcre2_compile: error with pattern \"%s\": %s",
1284                               format_match_str, err_buf);
1285                 goto done;
1286         }
1287
1288         ret = 0;
1289 done:
1290         free(html);
1291         free(scannable);
1292         free(position_map);
1293
1294         return ret;
1295 }
1296
1297 /*
1298  * Clean up any memory from this file
1299  *
1300  * Postconditions (success):
1301  *
1302  *  - Valgrind won't report any memory leaks from this file.
1303  *
1304  *  - setup_sanitize_comment() can be safely called again.
1305  */
1306 int clean_sanitize_comment(void)
1307 {
1308         for (size_t j = 0; j < wordfilters_num; ++j) {
1309                 pcre2_code_free(wordfilters[j].code);
1310                 wordfilters[j] = (struct wordfilter) { 0 };
1311         }
1312
1313         pcre2_code_free(format_replacements);
1314         format_replacements = 0;
1315         free(wordfilters);
1316         wordfilters = 0;
1317         wordfilters_num = 0;
1318
1319         return 0;
1320 }