contrib/mdocml/mandoc.c

   1 /*      $Id: mandoc.c,v 1.119 2021/08/10 12:55:03 schwarze Exp $ */
   2 /*
   3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
   4  * Copyright (c) 2011-2015, 2017-2021 Ingo Schwarze <schwarze@openbsd.org>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #include "config.h"
  19
  20 #include <sys/types.h>
  21
  22 #include <assert.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <stdlib.h>
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <time.h>
  30
  31 #include "mandoc_aux.h"
  32 #include "mandoc.h"
  33 #include "roff.h"
  34 #include "libmandoc.h"
  35 #include "roff_int.h"
  36
  37 static  int      a2time(time_t *, const char *, const char *);
  38 static  char    *time2a(time_t);
  39
  40
  41 enum mandoc_esc
  42 mandoc_font(const char *cp, int sz)
  43 {
  44         switch (sz) {
  45         case 0:
  46                 return ESCAPE_FONTPREV;
  47         case 1:
  48                 switch (cp[0]) {
  49                 case 'B':
  50                 case '3':
  51                         return ESCAPE_FONTBOLD;
  52                 case 'I':
  53                 case '2':
  54                         return ESCAPE_FONTITALIC;
  55                 case 'P':
  56                         return ESCAPE_FONTPREV;
  57                 case 'R':
  58                 case '1':
  59                         return ESCAPE_FONTROMAN;
  60                 case '4':
  61                         return ESCAPE_FONTBI;
  62                 default:
  63                         return ESCAPE_ERROR;
  64                 }
  65         case 2:
  66                 switch (cp[0]) {
  67                 case 'B':
  68                         switch (cp[1]) {
  69                         case 'I':
  70                                 return ESCAPE_FONTBI;
  71                         default:
  72                                 return ESCAPE_ERROR;
  73                         }
  74                 case 'C':
  75                         switch (cp[1]) {
  76                         case 'B':
  77                                 return ESCAPE_FONTCB;
  78                         case 'I':
  79                                 return ESCAPE_FONTCI;
  80                         case 'R':
  81                         case 'W':
  82                                 return ESCAPE_FONTCR;
  83                         default:
  84                                 return ESCAPE_ERROR;
  85                         }
  86                 default:
  87                         return ESCAPE_ERROR;
  88                 }
  89         default:
  90                 return ESCAPE_ERROR;
  91         }
  92 }
  93
  94 enum mandoc_esc
  95 mandoc_escape(const char **end, const char **start, int *sz)
  96 {
             /*
              * Parse one escape sequence and classify it.
              *
              * On entry, *end points to the character following the
              * backslash that introduced the escape.  When a sequence
              * is parsed successfully, *end points just past it.
              * *start and *sz receive the position and the length of
              * the argument or name substring of the escape; either
              * of these two pointers may be NULL if the caller is not
              * interested.  ESCAPE_ERROR is returned for malformed or
              * unterminated sequences; in that case, the position of
              * *end depends on where parsing stopped.
              */
  97         const char      *local_start;
  98         int              local_sz, c, i;
  99         char             term;
 100         enum mandoc_esc  gly;
 101
 102         /*
 103          * When the caller doesn't provide return storage,
 104          * use local storage.
 105          */
 106
 107         if (NULL == start)
 108                 start = &local_start;
 109         if (NULL == sz)
 110                 sz = &local_sz;
 111
 112         /*
 113          * Treat "\E" just like "\";
 114          * it only makes a difference in copy mode.
 115          */
 116
 117         if (**end == 'E')
 118                 ++*end;
 119
 120         /*
 121          * Beyond the backslash, at least one input character
 122          * is part of the escape sequence.  With one exception
 123          * (see below), that character won't be returned.
 124          */
 125
 126         gly = ESCAPE_ERROR;
 127         *start = ++*end;
 128         *sz = 0;
 129         term = '\0';
 130
             /*
              * (*start)[-1] is the escape name character that was
              * just consumed.  Each case either returns right away
              * or sets up gly plus exactly one of *sz (fixed-length
              * argument) and term (delimited argument).
              */
 131         switch ((*start)[-1]) {
 132         /*
 133          * First the glyphs.  There are several different forms of
 134          * these, but each eventually returns a substring of the glyph
 135          * name.
 136          */
 137         case '(':
 138                 gly = ESCAPE_SPECIAL;
 139                 *sz = 2;
 140                 break;
 141         case '[':
                     /*
                      * A blank directly after the opening bracket
                      * is rejected; consume it as well.
                      */
 142                 if (**start == ' ') {
 143                         ++*end;
 144                         return ESCAPE_ERROR;
 145                 }
 146                 gly = ESCAPE_SPECIAL;
 147                 term = ']';
 148                 break;
 149         case 'C':
                     /* The glyph name must be enclosed in apostrophes. */
 150                 if ('\'' != **start)
 151                         return ESCAPE_ERROR;
 152                 *start = ++*end;
 153                 gly = ESCAPE_SPECIAL;
 154                 term = '\'';
 155                 break;
 156
 157         /*
 158          * Escapes taking no arguments at all.
 159          */
 160         case '!':
 161         case '?':
 162                 return ESCAPE_UNSUPP;
 163         case '%':
 164         case '&':
 165         case ')':
 166         case ',':
 167         case '/':
 168         case '^':
 169         case 'a':
 170         case 'd':
 171         case 'r':
 172         case 't':
 173         case 'u':
 174         case '{':
 175         case '|':
 176         case '}':
 177                 return ESCAPE_IGNORE;
 178         case 'c':
 179                 return ESCAPE_NOSPACE;
 180         case 'p':
 181                 return ESCAPE_BREAK;
 182
 183         /*
 184          * The \z escape is supposed to output the following
 185          * character without advancing the cursor position.
 186          * Since we are mostly dealing with terminal mode,
 187          * let us just skip the next character.
 188          */
 189         case 'z':
 190                 return ESCAPE_SKIPCHAR;
 191
 192         /*
 193          * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
 194          * 'X' is the trigger.  These have opaque sub-strings.
 195          */
 196         case 'F':
 197         case 'f':
 198         case 'g':
 199         case 'k':
 200         case 'M':
 201         case 'm':
 202         case 'n':
 203         case 'O':
 204         case 'V':
 205         case 'Y':
 206         case '*':
                     /*
                      * Classify by trigger first: only \f and \*
                      * get a category of their own, all the others
                      * are ignored unless marked otherwise below.
                      */
 207                 switch ((*start)[-1]) {
 208                 case 'f':
 209                         gly = ESCAPE_FONT;
 210                         break;
 211                 case '*':
 212                         gly = ESCAPE_DEVICE;
 213                         break;
 214                 default:
 215                         gly = ESCAPE_IGNORE;
 216                         break;
 217                 }
                     /* Then select the argument form. */
 218                 switch (**start) {
 219                 case '(':
                             /*
                              * \O( is flagged as an error, but the
                              * two-character argument is still
                              * consumed further down.
                              */
 220                         if ((*start)[-1] == 'O')
 221                                 gly = ESCAPE_ERROR;
 222                         *start = ++*end;
 223                         *sz = 2;
 224                         break;
 225                 case '[':
                             /*
                              * Here, (*start)[1] is the first
                              * character inside the brackets.
                              */
 226                         if ((*start)[-1] == 'O')
 227                                 gly = (*start)[1] == '5' ?
 228                                     ESCAPE_UNSUPP : ESCAPE_ERROR;
 229                         *start = ++*end;
 230                         term = ']';
 231                         break;
 232                 default:
                             /*
                              * One-character argument.  For \O, the
                              * digits 1 to 4 keep ESCAPE_IGNORE.
                              */
 233                         if ((*start)[-1] == 'O') {
 234                                 switch (**start) {
 235                                 case '0':
 236                                         gly = ESCAPE_UNSUPP;
 237                                         break;
 238                                 case '1':
 239                                 case '2':
 240                                 case '3':
 241                                 case '4':
 242                                         break;
 243                                 default:
 244                                         gly = ESCAPE_ERROR;
 245                                         break;
 246                                 }
 247                         }
 248                         *sz = 1;
 249                         break;
 250                 }
 251                 break;
 252
 253         /*
 254          * These escapes are of the form \X'Y', where 'X' is the trigger
 255          * and 'Y' is any string.  These have opaque sub-strings.
 256          * The \B and \w escapes are handled in roff.c, roff_res().
 257          */
 258         case 'A':
 259         case 'b':
 260         case 'D':
 261         case 'R':
 262         case 'X':
 263         case 'Z':
 264                 gly = ESCAPE_IGNORE;
 265                 /* FALLTHROUGH */
 266         case 'o':
 267                 if (**start == '\0')
 268                         return ESCAPE_ERROR;
                     /*
                      * gly is still ESCAPE_ERROR only if this point
                      * was reached through "case 'o'" directly.
                      */
 269                 if (gly == ESCAPE_ERROR)
 270                         gly = ESCAPE_OVERSTRIKE;
                     /*
                      * Whatever character follows the escape name
                      * serves as the argument delimiter.
                      */
 271                 term = **start;
 272                 *start = ++*end;
 273                 break;
 274
 275         /*
 276          * These escapes are of the form \X'N', where 'X' is the trigger
 277          * and 'N' resolves to a numerical expression.
 278          */
 279         case 'h':
 280         case 'H':
 281         case 'L':
 282         case 'l':
 283         case 'S':
 284         case 'v':
 285         case 'x':
                     /*
                      * Reject characters listed here as delimiters.
                      * Note that strchr() also matches the
                      * terminating NUL of the list, so the end of
                      * the input is rejected here, too, but without
                      * advancing beyond it.
                      */
 286                 if (strchr(" %&()*+-./0123456789:<=>", **start)) {
 287                         if ('\0' != **start)
 288                                 ++*end;
 289                         return ESCAPE_ERROR;
 290                 }
 291                 switch ((*start)[-1]) {
 292                 case 'h':
 293                         gly = ESCAPE_HORIZ;
 294                         break;
 295                 case 'l':
 296                         gly = ESCAPE_HLINE;
 297                         break;
 298                 default:
 299                         gly = ESCAPE_IGNORE;
 300                         break;
 301                 }
 302                 term = **start;
 303                 *start = ++*end;
 304                 break;
 305
 306         /*
 307          * Special handling for the numbered character escape.
 308          * XXX Do any other escapes need similar handling?
 309          */
 310         case 'N':
 311                 if ('\0' == **start)
 312                         return ESCAPE_ERROR;
 313                 (*end)++;
                     /*
                      * A digit in the delimiter position: consume
                      * just that one character and ignore the escape.
                      */
 314                 if (isdigit((unsigned char)**start)) {
 315                         *sz = 1;
 316                         return ESCAPE_IGNORE;
 317                 }
                     /*
                      * Skip the opening delimiter, collect the run
                      * of digits, and skip one more character unless
                      * the input ends; that character is not compared
                      * to the opening delimiter.
                      */
 318                 (*start)++;
 319                 while (isdigit((unsigned char)**end))
 320                         (*end)++;
 321                 *sz = *end - *start;
 322                 if ('\0' != **end)
 323                         (*end)++;
 324                 return ESCAPE_NUMBERED;
 325
 326         /*
 327          * Sizes get a special category of their own.
 328          */
 329         case 's':
 330                 gly = ESCAPE_IGNORE;
 331
 332                 /* See +/- counts as a sign. */
 333                 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
 334                         *start = ++*end;
 335
 336                 switch (**end) {
 337                 case '(':
 338                         *start = ++*end;
 339                         *sz = 2;
 340                         break;
 341                 case '[':
 342                         *start = ++*end;
 343                         term = ']';
 344                         break;
 345                 case '\'':
 346                         *start = ++*end;
 347                         term = '\'';
 348                         break;
 349                 case '3':
 350                 case '2':
 351                 case '1':
                             /*
                              * Take two digits only if the first one
                              * directly follows the 's', that is, if
                              * no sign was consumed above.
                              */
 352                         *sz = (*end)[-1] == 's' &&
 353                             isdigit((unsigned char)(*end)[1]) ? 2 : 1;
 354                         break;
 355                 default:
 356                         *sz = 1;
 357                         break;
 358                 }
 359
 360                 break;
 361
 362         /*
 363          * Several special characters can be encoded as
 364          * one-byte escape sequences without using \[].
 365          */
 366         case ' ':
 367         case '\'':
 368         case '-':
 369         case '.':
 370         case '0':
 371         case ':':
 372         case '_':
 373         case '`':
 374         case 'e':
 375         case '~':
 376                 gly = ESCAPE_SPECIAL;
 377                 /* FALLTHROUGH */
 378         default:
 379                 if (gly == ESCAPE_ERROR)
 380                         gly = ESCAPE_UNDEF;
                     /*
                      * Step back such that the escape name character
                      * itself becomes the one-byte result.
                      */
 381                 *start = --*end;
 382                 *sz = 1;
 383                 break;
 384         }
 385
 386         /*
 387          * Read up to the terminating character,
 388          * paying attention to nested escapes.
 389          */
 390
 391         if ('\0' != term) {
 392                 while (**end != term) {
 393                         switch (**end) {
 394                         case '\0':
 395                                 return ESCAPE_ERROR;
 396                         case '\\':
                                     /*
                                      * Skip the backslash, then let a
                                      * recursive call skip the rest
                                      * of the nested escape.
                                      */
 397                                 (*end)++;
 398                                 if (ESCAPE_ERROR ==
 399                                     mandoc_escape(end, NULL, NULL))
 400                                         return ESCAPE_ERROR;
 401                                 break;
 402                         default:
 403                                 (*end)++;
 404                                 break;
 405                         }
 406                 }
                     /*
                      * The length excludes the terminator,
                      * which is skipped by the post-increment.
                      */
 407                 *sz = (*end)++ - *start;
 408
 409                 /*
 410                  * The file chars.c only provides one common list
 411                  * of character names, but \[-] == \- is the only
 412                  * one of the characters with one-byte names that
 413                  * allows enclosing the name in brackets.
 414                  */
 415                 if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
 416                         return ESCAPE_ERROR;
 417         } else {
                     /*
                      * Fixed-length argument: make sure it fits
                      * into the remaining input before skipping it.
                      */
 418                 assert(*sz > 0);
 419                 if ((size_t)*sz > strlen(*start))
 420                         return ESCAPE_ERROR;
 421                 *end += *sz;
 422         }
 423
 424         /* Run post-processors. */
 425
 426         switch (gly) {
 427         case ESCAPE_FONT:
 428                 gly = mandoc_font(*start, *sz);
 429                 break;
 430         case ESCAPE_SPECIAL:
 431                 if (**start == 'c') {
                             /*
                              * Names of the form "char" followed by
                              * two or three decimal digits become
                              * numbered characters; anything else
                              * starting with 'c' stays a special
                              * character name.
                              */
 432                         if (*sz < 6 || *sz > 7 ||
 433                             strncmp(*start, "char", 4) != 0 ||
 434                             (int)strspn(*start + 4, "0123456789") + 4 < *sz)
 435                                 break;
 436                         c = 0;
 437                         for (i = 4; i < *sz; i++)
 438                                 c = 10 * c + ((*start)[i] - '0');
                             /* Only 0x21-0x7e and 0xa0-0xff qualify. */
 439                         if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
 440                                 break;
                             /* Return the digits only. */
 441                         *start += 4;
 442                         *sz -= 4;
 443                         gly = ESCAPE_NUMBERED;
 444                         break;
 445                 }
 446
 447                 /*
 448                  * Unicode escapes are defined in groff as \[u0000]
 449                  * to \[u10FFFF], where the contained value must be
 450                  * a valid Unicode codepoint.  Here, however, only
 451                  * check the length and range.
 452                  */
 453                 if (**start != 'u' || *sz < 5 || *sz > 7)
 454                         break;
 455                 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
 456                         break;
 457                 if (*sz == 6 && (*start)[1] == '0')
 458                         break;
 459                 if (*sz == 5 && (*start)[1] == 'D' &&
 460                     strchr("89ABCDEF", (*start)[2]) != NULL)
 461                         break;
 462                 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
 463                     + 1 == *sz)
 464                         gly = ESCAPE_UNICODE;
 465                 break;
 466         case ESCAPE_DEVICE:
                     /*
                      * NOTE(review): this asserts that \*(.T is the
                      * only \* sequence ever reaching this function;
                      * presumably the roff parser expands all other
                      * strings beforehand - confirm against callers.
                      */
 467                 assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T');
 468                 break;
 469         default:
 470                 break;
 471         }
 472
 473         return gly;
 474 }
 475
 /*
  * Parse the string p according to the strptime(3) format fmt and
  * store the resulting calendar time in *t.
  * Return 1 on success.  Return 0, leaving *t untouched, if the
  * string does not match the format, if anything trails the match,
  * if the time cannot be represented by mktime(3), or if
  * strptime(3) is not available (HAVE_STRPTIME is false).
  */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
        struct tm        tm;
        time_t           res;
        char            *pp;

        memset(&tm, 0, sizeof(tm));

        pp = NULL;
#if HAVE_STRPTIME
        pp = strptime(p, fmt, &tm);
#endif
        /* Require a match covering the complete input string. */
        if (pp == NULL || *pp != '\0')
                return 0;

        /*
         * The value (time_t)-1 is both the error return of mktime(3)
         * and a valid point in time.  Since mktime(3) fills in
         * tm_wday on success, use an impossible weekday as a
         * sentinel to tell the two apart.  Without this check,
         * a failure would be passed on as a date at the end of 1969.
         */
        tm.tm_wday = -1;
        res = mktime(&tm);
        if (res == (time_t)-1 && tm.tm_wday == -1)
                return 0;

        *t = res;
        return 1;
}
 495
 /*
  * Format the time t in the style "Month D, YYYY", using the
  * full month name and a day number without padding.
  * The result is a string obtained from mandoc_malloc(); if the
  * time cannot be converted or formatted, the result is an empty
  * string obtained from mandoc_strdup() instead.
  */
static char *
time2a(time_t t)
{
        struct tm       *when;
        char            *out, *cur;
        size_t           monthlen;
        int              daylen;

        when = localtime(&t);
        if (when == NULL)
                return mandoc_strdup("");

        /*
         * Buffer layout:
         * at most 9 bytes of month name plus a blank,
         * at most 2 digits for the day plus a comma and a blank,
         * 4 digits for the year, and the terminating NUL.
         */
        out = mandoc_malloc(10 + 4 + 4 + 1);
        cur = out;

        monthlen = strftime(cur, 10 + 1, "%B ", when);
        if (monthlen != 0) {
                cur += (int)monthlen;

                /*
                 * The day is printed with plain "%d" to avoid
                 * padding, which is why the date is assembled
                 * piecewise rather than with one strftime(3) call;
                 * the per-field size limits also keep each piece
                 * within its share of the buffer.
                 */
                daylen = snprintf(cur, 4 + 1, "%d, ", when->tm_mday);
                if (daylen >= 0 && daylen <= 4) {
                        cur += daylen;
                        if (strftime(cur, 4 + 1, "%Y", when) != 0)
                                return out;
                }
        }

        /* Some field did not fit: discard the partial result. */
        free(out);
        return mandoc_strdup("");
}
 544
 545 char *
 546 mandoc_normdate(struct roff_node *nch, struct roff_node *nbl)
 547 {
 548         char            *cp;
 549         time_t           t;
 550
 551         /* No date specified. */
 552
 553         if (nch == NULL) {
 554                 if (nbl == NULL)
 555                         mandoc_msg(MANDOCERR_DATE_MISSING, 0, 0, NULL);
 556                 else
 557                         mandoc_msg(MANDOCERR_DATE_MISSING, nbl->line,
 558                             nbl->pos, "%s", roff_name[nbl->tok]);
 559                 return mandoc_strdup("");
 560         }
 561         if (*nch->string == '\0') {
 562                 mandoc_msg(MANDOCERR_DATE_MISSING, nch->line,
 563                     nch->pos, "%s", roff_name[nbl->tok]);
 564                 return mandoc_strdup("");
 565         }
 566         if (strcmp(nch->string, "$" "Mdocdate$") == 0)
 567                 return time2a(time(NULL));
 568
 569         /* Valid mdoc(7) date format. */
 570
 571         if (a2time(&t, "$" "Mdocdate: %b %d %Y $", nch->string) ||
 572             a2time(&t, "%b %d, %Y", nch->string)) {
 573                 cp = time2a(t);
 574                 if (t > time(NULL) + 86400)
 575                         mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line,
 576                             nch->pos, "%s %s", roff_name[nbl->tok], cp);
 577                 else if (*nch->string != '$' &&
 578                     strcmp(nch->string, cp) != 0)
 579                         mandoc_msg(MANDOCERR_DATE_NORM, nch->line,
 580                             nch->pos, "%s %s", roff_name[nbl->tok], cp);
 581                 return cp;
 582         }
 583
 584         /* In man(7), do not warn about the legacy format. */
 585
 586         if (a2time(&t, "%Y-%m-%d", nch->string) == 0)
 587                 mandoc_msg(MANDOCERR_DATE_BAD, nch->line, nch->pos,
 588                     "%s %s", roff_name[nbl->tok], nch->string);
 589         else if (t > time(NULL) + 86400)
 590                 mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line, nch->pos,
 591                     "%s %s", roff_name[nbl->tok], nch->string);
 592         else if (nbl->tok == MDOC_Dd)
 593                 mandoc_msg(MANDOCERR_DATE_LEGACY, nch->line, nch->pos,
 594                     "Dd %s", nch->string);
 595
 596         /* Use any non-mdoc(7) date verbatim. */
 597
 598         return mandoc_strdup(nch->string);
 599 }
 600
 /*
  * Decide whether the sz bytes starting at p end a sentence.
  * Scanning backwards from the end, closing delimiters (double
  * quote, apostrophe, closing bracket or parenthesis) and
  * end-of-sentence punctuation (period, exclamation or question
  * mark) are skipped.  The result is 1 if punctuation was found
  * and either no closing delimiter follows it or the first other
  * character before it is alphanumeric; otherwise, it is 0.
  * An empty string never ends a sentence.
  */
int
mandoc_eos(const char *p, size_t sz)
{
        int              enclosed, found;
        unsigned char    c;

        /*
         * End-of-sentence recognition must include situations where
         * some symbols, such as `)', allow prior EOS punctuation to
         * propagate outward.
         *
         * Walk backwards by index rather than by pointer, such that
         * no pointer to before the start of the buffer is ever
         * formed and sizes beyond INT_MAX are not truncated.
         */

        enclosed = found = 0;
        while (sz > 0) {
                c = (unsigned char)p[--sz];
                switch (c) {
                case '\"':
                case '\'':
                case ']':
                case ')':
                        /* Only delimiters after the punctuation count. */
                        if (found == 0)
                                enclosed = 1;
                        break;
                case '.':
                case '!':
                case '?':
                        found = 1;
                        break;
                default:
                        return found && (!enclosed || isalnum(c));
                }
        }

        /* Nothing but punctuation and closing delimiters. */
        return found && !enclosed;
}
 639
 /*
  * Convert the first sz bytes of p to an int, using strtol(3)
  * with the given base.
  * Return -1 if sz exceeds 31, if the input is empty, or if
  * strtol(3) does not consume the complete input.  Results outside
  * the range of int are clamped to INT_MIN or INT_MAX.
  * Negative numbers are returned as such, so a return value of -1
  * can also result from valid input.
  */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
        char             numbuf[32];
        char            *endp;
        long             val;

        /* One byte of the buffer is needed for the terminating NUL. */
        if (sz >= sizeof(numbuf))
                return -1;

        /* The input need not be terminated, so work on a copy. */
        memcpy(numbuf, p, sz);
        numbuf[sz] = '\0';

        errno = 0;
        val = strtol(numbuf, &endp, base);

        if (numbuf[0] == '\0' || *endp != '\0')
                return -1;

        if (val > INT_MAX)
                return INT_MAX;
        if (val < INT_MIN)
                return INT_MIN;
        return (int)val;
}