3.6 branching and setup.
[dragonfly.git] / contrib / mdocml / mandoc.c
blobdf510226c42d32dd6c16377727e45956eb82f26c
1 /* $Id: mandoc.c,v 1.68 2013/08/08 20:07:47 schwarze Exp $ */
2 /*
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
22 #include <sys/types.h>
24 #include <assert.h>
25 #include <ctype.h>
26 #include <errno.h>
27 #include <limits.h>
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 #include <time.h>
33 #include "mandoc.h"
34 #include "libmandoc.h"
36 #define DATESIZE 32
38 static int a2time(time_t *, const char *, const char *);
39 static char *time2a(time_t);
42 enum mandoc_esc
43 mandoc_escape(const char **end, const char **start, int *sz)
45 const char *local_start;
46 int local_sz;
47 char term;
48 enum mandoc_esc gly;
51 * When the caller doesn't provide return storage,
52 * use local storage.
55 if (NULL == start)
56 start = &local_start;
57 if (NULL == sz)
58 sz = &local_sz;
61 * Beyond the backslash, at least one input character
62 * is part of the escape sequence. With one exception
63 * (see below), that character won't be returned.
66 gly = ESCAPE_ERROR;
67 *start = ++*end;
68 *sz = 0;
69 term = '\0';
71 switch ((*start)[-1]) {
73 * First the glyphs. There are several different forms of
74 * these, but each eventually returns a substring of the glyph
75 * name.
77 case ('('):
78 gly = ESCAPE_SPECIAL;
79 *sz = 2;
80 break;
81 case ('['):
82 gly = ESCAPE_SPECIAL;
84 * Unicode escapes are defined in groff as \[uXXXX] to
85 * \[u10FFFF], where the contained value must be a valid
86 * Unicode codepoint. Here, however, only check whether
87 * it's not a zero-width escape.
89 if ('u' == (*start)[0] && ']' != (*start)[1])
90 gly = ESCAPE_UNICODE;
91 term = ']';
92 break;
93 case ('C'):
94 if ('\'' != **start)
95 return(ESCAPE_ERROR);
96 gly = ESCAPE_SPECIAL;
97 *start = ++*end;
98 term = '\'';
99 break;
102 * The \z escape is supposed to output the following
103 * character without advancing the cursor position.
104 * Since we are mostly dealing with terminal mode,
105 * let us just skip the next character.
107 case ('z'):
108 return(ESCAPE_SKIPCHAR);
111 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
112 * 'X' is the trigger. These have opaque sub-strings.
114 case ('F'):
115 /* FALLTHROUGH */
116 case ('g'):
117 /* FALLTHROUGH */
118 case ('k'):
119 /* FALLTHROUGH */
120 case ('M'):
121 /* FALLTHROUGH */
122 case ('m'):
123 /* FALLTHROUGH */
124 case ('n'):
125 /* FALLTHROUGH */
126 case ('V'):
127 /* FALLTHROUGH */
128 case ('Y'):
129 gly = ESCAPE_IGNORE;
130 /* FALLTHROUGH */
131 case ('f'):
132 if (ESCAPE_ERROR == gly)
133 gly = ESCAPE_FONT;
134 switch (**start) {
135 case ('('):
136 *start = ++*end;
137 *sz = 2;
138 break;
139 case ('['):
140 *start = ++*end;
141 term = ']';
142 break;
143 default:
144 *sz = 1;
145 break;
147 break;
150 * These escapes are of the form \X'Y', where 'X' is the trigger
151 * and 'Y' is any string. These have opaque sub-strings.
153 case ('A'):
154 /* FALLTHROUGH */
155 case ('b'):
156 /* FALLTHROUGH */
157 case ('D'):
158 /* FALLTHROUGH */
159 case ('o'):
160 /* FALLTHROUGH */
161 case ('R'):
162 /* FALLTHROUGH */
163 case ('X'):
164 /* FALLTHROUGH */
165 case ('Z'):
166 if ('\'' != **start)
167 return(ESCAPE_ERROR);
168 gly = ESCAPE_IGNORE;
169 *start = ++*end;
170 term = '\'';
171 break;
174 * These escapes are of the form \X'N', where 'X' is the trigger
175 * and 'N' resolves to a numerical expression.
177 case ('B'):
178 /* FALLTHROUGH */
179 case ('h'):
180 /* FALLTHROUGH */
181 case ('H'):
182 /* FALLTHROUGH */
183 case ('L'):
184 /* FALLTHROUGH */
185 case ('l'):
186 gly = ESCAPE_NUMBERED;
187 /* FALLTHROUGH */
188 case ('S'):
189 /* FALLTHROUGH */
190 case ('v'):
191 /* FALLTHROUGH */
192 case ('w'):
193 /* FALLTHROUGH */
194 case ('x'):
195 if ('\'' != **start)
196 return(ESCAPE_ERROR);
197 if (ESCAPE_ERROR == gly)
198 gly = ESCAPE_IGNORE;
199 *start = ++*end;
200 term = '\'';
201 break;
204 * Special handling for the numbered character escape.
205 * XXX Do any other escapes need similar handling?
207 case ('N'):
208 if ('\0' == **start)
209 return(ESCAPE_ERROR);
210 (*end)++;
211 if (isdigit((unsigned char)**start)) {
212 *sz = 1;
213 return(ESCAPE_IGNORE);
215 (*start)++;
216 while (isdigit((unsigned char)**end))
217 (*end)++;
218 *sz = *end - *start;
219 if ('\0' != **end)
220 (*end)++;
221 return(ESCAPE_NUMBERED);
224 * Sizes get a special category of their own.
226 case ('s'):
227 gly = ESCAPE_IGNORE;
229 /* See +/- counts as a sign. */
230 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
231 (*end)++;
233 switch (**end) {
234 case ('('):
235 *start = ++*end;
236 *sz = 2;
237 break;
238 case ('['):
239 *start = ++*end;
240 term = ']';
241 break;
242 case ('\''):
243 *start = ++*end;
244 term = '\'';
245 break;
246 default:
247 *sz = 1;
248 break;
251 break;
254 * Anything else is assumed to be a glyph.
255 * In this case, pass back the character after the backslash.
257 default:
258 gly = ESCAPE_SPECIAL;
259 *start = --*end;
260 *sz = 1;
261 break;
264 assert(ESCAPE_ERROR != gly);
267 * Read up to the terminating character,
268 * paying attention to nested escapes.
271 if ('\0' != term) {
272 while (**end != term) {
273 switch (**end) {
274 case ('\0'):
275 return(ESCAPE_ERROR);
276 case ('\\'):
277 (*end)++;
278 if (ESCAPE_ERROR ==
279 mandoc_escape(end, NULL, NULL))
280 return(ESCAPE_ERROR);
281 break;
282 default:
283 (*end)++;
284 break;
287 *sz = (*end)++ - *start;
288 } else {
289 assert(*sz > 0);
290 if ((size_t)*sz > strlen(*start))
291 return(ESCAPE_ERROR);
292 *end += *sz;
295 /* Run post-processors. */
297 switch (gly) {
298 case (ESCAPE_FONT):
299 if (2 == *sz) {
300 if ('C' == **start) {
302 * Treat constant-width font modes
303 * just like regular font modes.
305 (*start)++;
306 (*sz)--;
307 } else {
308 if ('B' == (*start)[0] && 'I' == (*start)[1])
309 gly = ESCAPE_FONTBI;
310 break;
312 } else if (1 != *sz)
313 break;
315 switch (**start) {
316 case ('3'):
317 /* FALLTHROUGH */
318 case ('B'):
319 gly = ESCAPE_FONTBOLD;
320 break;
321 case ('2'):
322 /* FALLTHROUGH */
323 case ('I'):
324 gly = ESCAPE_FONTITALIC;
325 break;
326 case ('P'):
327 gly = ESCAPE_FONTPREV;
328 break;
329 case ('1'):
330 /* FALLTHROUGH */
331 case ('R'):
332 gly = ESCAPE_FONTROMAN;
333 break;
335 break;
336 case (ESCAPE_SPECIAL):
337 if (1 == *sz && 'c' == **start)
338 gly = ESCAPE_NOSPACE;
339 break;
340 default:
341 break;
344 return(gly);
347 void *
348 mandoc_calloc(size_t num, size_t size)
350 void *ptr;
352 ptr = calloc(num, size);
353 if (NULL == ptr) {
354 perror(NULL);
355 exit((int)MANDOCLEVEL_SYSERR);
358 return(ptr);
362 void *
363 mandoc_malloc(size_t size)
365 void *ptr;
367 ptr = malloc(size);
368 if (NULL == ptr) {
369 perror(NULL);
370 exit((int)MANDOCLEVEL_SYSERR);
373 return(ptr);
377 void *
378 mandoc_realloc(void *ptr, size_t size)
381 ptr = realloc(ptr, size);
382 if (NULL == ptr) {
383 perror(NULL);
384 exit((int)MANDOCLEVEL_SYSERR);
387 return(ptr);
/*
 * Copy exactly sz bytes starting at ptr into a freshly allocated
 * buffer and NUL-terminate the copy.
 * Unlike strndup(3), this does not stop at an embedded NUL byte,
 * so ptr must reference at least sz readable bytes.
 * The result comes from mandoc_malloc() and is owned by the caller.
 */
char *
mandoc_strndup(const char *ptr, size_t sz)
{
	char	*p;

	p = mandoc_malloc(sz + 1);
	memcpy(p, ptr, sz);
	/* Index with the size_t itself; narrowing it to int may wrap. */
	p[sz] = '\0';
	return p;
}
401 char *
402 mandoc_strdup(const char *ptr)
404 char *p;
406 p = strdup(ptr);
407 if (NULL == p) {
408 perror(NULL);
409 exit((int)MANDOCLEVEL_SYSERR);
412 return(p);
416 * Parse a quoted or unquoted roff-style request or macro argument.
417 * Return a pointer to the parsed argument, which is either the original
418 * pointer or advanced by one byte in case the argument is quoted.
419 * Null-terminate the argument in place.
420 * Collapse pairs of quotes inside quoted arguments.
421 * Advance the argument pointer to the next argument,
422 * or to the null byte terminating the argument line.
424 char *
425 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
427 char *start, *cp;
428 int quoted, pairs, white;
430 /* Quoting can only start with a new word. */
431 start = *cpp;
432 quoted = 0;
433 if ('"' == *start) {
434 quoted = 1;
435 start++;
438 pairs = 0;
439 white = 0;
440 for (cp = start; '\0' != *cp; cp++) {
443 * Move the following text left
444 * after quoted quotes and after "\\" and "\t".
446 if (pairs)
447 cp[-pairs] = cp[0];
449 if ('\\' == cp[0]) {
451 * In copy mode, translate double to single
452 * backslashes and backslash-t to literal tabs.
454 switch (cp[1]) {
455 case ('t'):
456 cp[0] = '\t';
457 /* FALLTHROUGH */
458 case ('\\'):
459 pairs++;
460 cp++;
461 break;
462 case (' '):
463 /* Skip escaped blanks. */
464 if (0 == quoted)
465 cp++;
466 break;
467 default:
468 break;
470 } else if (0 == quoted) {
471 if (' ' == cp[0]) {
472 /* Unescaped blanks end unquoted args. */
473 white = 1;
474 break;
476 } else if ('"' == cp[0]) {
477 if ('"' == cp[1]) {
478 /* Quoted quotes collapse. */
479 pairs++;
480 cp++;
481 } else {
482 /* Unquoted quotes end quoted args. */
483 quoted = 2;
484 break;
489 /* Quoted argument without a closing quote. */
490 if (1 == quoted)
491 mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
493 /* Null-terminate this argument and move to the next one. */
494 if (pairs)
495 cp[-pairs] = '\0';
496 if ('\0' != *cp) {
497 *cp++ = '\0';
498 while (' ' == *cp)
499 cp++;
501 *pos += (int)(cp - start) + (quoted ? 1 : 0);
502 *cpp = cp;
504 if ('\0' == *cp && (white || ' ' == cp[-1]))
505 mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
507 return(start);
/*
 * Try to parse the complete string p according to the strptime(3)
 * format fmt.  On success, store the mktime(3) result in *t and
 * return 1.  Return 0, leaving *t alone, when the string does not
 * match, when trailing characters remain, or when the build lacks
 * strptime() (HAVE_STRPTIME undefined).
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = NULL;
#ifdef HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif
	/* Reject both parse failures and unconsumed trailing input. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
/*
 * Format the time t in local time as "Month day, year",
 * for example "August 8, 2013".
 * Returns a buffer obtained from mandoc_malloc() that the caller
 * must free, or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) may fail; never hand NULL to strftime(3). */
	tm = localtime(&t);
	if (tm == NULL)
		return NULL;

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0)
		goto fail;
	p += ssz;

	/*
	 * snprintf(3) returns the length it wanted to write, so a
	 * value above 4 means the day field was truncated.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (strftime(p, 4 + 1, "%Y", tm) == 0)
		goto fail;
	return buf;

fail:
	free(buf);
	return NULL;
}
565 char *
566 mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
568 char *out;
569 time_t t;
571 if (NULL == in || '\0' == *in ||
572 0 == strcmp(in, "$" "Mdocdate$")) {
573 mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
574 time(&t);
576 else if (a2time(&t, "%Y-%m-%d", in))
577 t = 0;
578 else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
579 !a2time(&t, "%b %d, %Y", in)) {
580 mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
581 t = 0;
583 out = t ? time2a(t) : NULL;
584 return(out ? out : mandoc_strdup(in));
/*
 * Decide whether the sz bytes at p end a sentence.
 *
 * The text is scanned backwards.  Closing delimiters (", ', ], ))
 * seen before any sentence punctuation mark the punctuation as
 * enclosed; '.', '!' and '?' count as sentence punctuation.
 * The scan stops at the first other character: the result is then
 * true if punctuation was found and either it is not enclosed or
 * that character is alphanumeric.  If the text consists of nothing
 * but delimiters and punctuation, it ends a sentence only if
 * punctuation was found and it is not enclosed.
 *
 * A non-zero enclosed argument makes the punctuation count as
 * enclosed from the start.  Returns 1 or 0.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t	 i;
	int	 found;

	if (sz == 0)
		return 0;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 *
	 * Walk backwards by index: a pointer stepping to p - 1 would
	 * leave the array, which is undefined behaviour.
	 */

	found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '"':
		case '\'':
		case ']':
		case ')':
			if (found == 0)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1]));
		}
	}

	return found && !enclosed;
}
/*
 * Convert the first sz bytes at p to an integer using strtol(3)
 * with the given base.
 * Return -1 if sz exceeds 31 bytes, if the text is empty, or if it
 * is not completely consumed by strtol().  Otherwise return the
 * value, clamped to the range INT_MIN to INT_MAX.  Note that a
 * valid negative number is returned as is, so -1 is ambiguous.
 * errno is reset to 0 as a side effect.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 digits[32];
	char	*stop;
	long	 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(digits))
		return -1;

	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	if (digits[0] == '\0' || *stop != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;

	return (int)val;
}