/* $Id: read.c,v 1.79 2014/08/06 15:09:05 schwarze Exp $ */
/*
 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org>
 * Copyright (c) 2010, 2012 Joerg Sonnenberger <joerg@netbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
24 # include <sys/stat.h>
25 # include <sys/mman.h>
40 #include "mandoc_aux.h"
41 #include "libmandoc.h"
46 #define REPARSE_LIMIT 1000
/*
 * A block of bytes together with its size.
 */
struct buf {
	char	*buf;	/* binary input buffer */
	size_t	 sz;	/* size of binary buffer */
};
54 enum mandoclevel file_status
; /* status of current parse */
55 enum mandoclevel wlevel
; /* ignore messages below this */
56 int line
; /* line number in the file */
57 int options
; /* parser options */
58 struct man
*pman
; /* persistent man parser */
59 struct mdoc
*pmdoc
; /* persistent mdoc parser */
60 struct man
*man
; /* man parser */
61 struct mdoc
*mdoc
; /* mdoc parser */
62 struct roff
*roff
; /* roff parser (!NULL) */
63 char *sodest
; /* filename pointed to by .so */
64 int reparse_count
; /* finite interp. stack */
65 mandocmsg mmsg
; /* warning/error message handler */
67 struct buf
*secondary
;
68 const char *defos
; /* default operating system */
71 static void resize_buf(struct buf
*, size_t);
72 static void mparse_buf_r(struct mparse
*, struct buf
, int);
73 static void pset(const char *, int, struct mparse
*);
74 static int read_whole_file(struct mparse
*, const char *, int,
76 static void mparse_end(struct mparse
*);
77 static void mparse_parse_buffer(struct mparse
*, struct buf
,
80 static const enum mandocerr mandoclimits
[MANDOCLEVEL_MAX
] = {
90 static const char * const mandocerrs
[MANDOCERR_MAX
] = {
95 /* related to the prologue */
96 "missing manual title, using UNTITLED",
97 "missing manual title, using \"\"",
98 "lower case character in document title",
99 "missing manual section, using \"\"",
100 "unknown manual section",
101 "unknown manual volume or arch",
102 "missing date, using today's date",
103 "cannot parse date, using it verbatim",
104 "missing Os macro, using \"\"",
105 "duplicate prologue macro",
106 "late prologue macro",
107 "skipping late title macro",
108 "prologue macros out of order",
110 /* related to document structure */
111 ".so is fragile, better use ln(1)",
113 "content before first section header",
114 "first section is not \"NAME\"",
115 "bad NAME section contents",
116 "sections out of conventional order",
117 "duplicate section title",
118 "unexpected section",
120 /* related to macros and nesting */
122 "skipping paragraph macro",
123 "moving paragraph macro out of list",
124 "skipping no-space macro",
125 "blocks badly nested",
126 "nested displays are not portable",
127 "moving content out of list",
128 ".Vt block has child macro",
129 "fill mode already enabled, skipping",
130 "fill mode already disabled, skipping",
133 /* related to missing macro arguments */
134 "skipping empty request",
135 "conditional request controls empty scope",
136 "skipping empty macro",
137 "empty argument, using 0n",
138 "argument count wrong",
139 "missing display type, using -ragged",
140 "list type is not the first argument",
141 "missing -width in -tag list, using 8n",
142 "missing utility name, using \"\"",
143 "empty head in list item",
145 "missing font type, using \\fR",
146 "unknown font type, using \\fR",
147 "missing -std argument, adding it",
149 /* related to bad macro arguments */
150 "unterminated quoted argument",
151 "duplicate argument",
152 "skipping duplicate argument",
153 "skipping duplicate display type",
154 "skipping duplicate list type",
155 "skipping -width argument",
156 "unknown AT&T UNIX version",
157 "invalid content in Rs block",
158 "invalid Boolean argument",
159 "unknown font, skipping request",
161 /* related to plain text */
162 "blank line in fill mode, using .sp",
163 "tab in filled text",
164 "whitespace at end of input line",
166 "invalid escape sequence",
167 "undefined string, using \"\"",
171 /* related to equations */
172 "unexpected equation scope closure",
173 "equation scope open on exit",
174 "overlapping equation scopes",
175 "unexpected end of equation",
176 "equation syntax error",
178 /* related to tables */
182 "no table layout cells specified",
183 "no table data cells specified",
184 "ignore data in cell",
185 "data block still open",
186 "ignoring extra data cells",
188 /* related to document structure and macros */
189 "input stack limit exceeded, infinite loop?",
190 "skipping bad character",
191 "skipping unknown macro",
192 "skipping item outside list",
193 "skipping column outside column list",
194 "skipping end of block that is not open",
195 "inserting missing end of block",
196 "appending missing end of block",
198 /* related to request and macro arguments */
199 "escaped character not allowed in a name",
200 "argument count wrong",
201 "missing list type, using -item",
202 "missing manual name, using \"\"",
203 "uname(3) system call failed, using UNKNOWN",
204 "unknown standard specifier",
205 "skipping request without numeric argument",
206 "skipping all arguments",
207 "skipping excess arguments",
209 "generic fatal error",
212 "NOT IMPLEMENTED: Bd -file",
213 "NOT IMPLEMENTED: .so with absolute path or \"..\"",
214 ".so request failed",
222 static const char * const mandoclevels
[MANDOCLEVEL_MAX
] = {
234 resize_buf(struct buf
*buf
, size_t initial
)
237 buf
->sz
= buf
->sz
> initial
/2 ? 2 * buf
->sz
: initial
;
238 buf
->buf
= mandoc_realloc(buf
->buf
, buf
->sz
);
242 pset(const char *buf
, int pos
, struct mparse
*curp
)
247 * Try to intuit which kind of manual parser should be used. If
248 * passed in by command-line (-man, -mdoc), then use that
249 * explicitly. If passed as -mandoc, then try to guess from the
250 * line: either skip dot-lines, use -mdoc when finding `.Dt', or
251 * default to -man, which is more lenient.
253 * Separate out pmdoc/pman from mdoc/man: the first persists
254 * through all parsers, while the latter is used per-parse.
257 if ('.' == buf
[0] || '\'' == buf
[0]) {
258 for (i
= 1; buf
[i
]; i
++)
259 if (' ' != buf
[i
] && '\t' != buf
[i
])
265 if (MPARSE_MDOC
& curp
->options
) {
266 curp
->mdoc
= curp
->pmdoc
;
268 } else if (MPARSE_MAN
& curp
->options
) {
269 curp
->man
= curp
->pman
;
273 if (pos
>= 3 && 0 == memcmp(buf
, ".Dd", 3)) {
274 if (NULL
== curp
->pmdoc
)
275 curp
->pmdoc
= mdoc_alloc(
276 curp
->roff
, curp
, curp
->defos
,
277 MPARSE_QUICK
& curp
->options
? 1 : 0);
279 curp
->mdoc
= curp
->pmdoc
;
283 if (NULL
== curp
->pman
)
284 curp
->pman
= man_alloc(curp
->roff
, curp
,
285 MPARSE_QUICK
& curp
->options
? 1 : 0);
287 curp
->man
= curp
->pman
;
291 * Main parse routine for an opened file. This is called for each
292 * opened file and simply loops around the full input file, possibly
293 * nesting (i.e., with `so').
296 mparse_buf_r(struct mparse
*curp
, struct buf blk
, int start
)
298 const struct tbl_span
*span
;
302 int pos
; /* byte number in the ln buffer */
303 int lnn
; /* line number in the real file */
306 memset(&ln
, 0, sizeof(struct buf
));
311 for (i
= 0; i
< (int)blk
.sz
; ) {
312 if (0 == pos
&& '\0' == blk
.buf
[i
])
317 curp
->reparse_count
= 0;
320 while (i
< (int)blk
.sz
&& (start
|| '\0' != blk
.buf
[i
])) {
323 * When finding an unescaped newline character,
324 * leave the character loop to process the line.
325 * Skip a preceding carriage return, if any.
328 if ('\r' == blk
.buf
[i
] && i
+ 1 < (int)blk
.sz
&&
329 '\n' == blk
.buf
[i
+ 1])
331 if ('\n' == blk
.buf
[i
]) {
338 * Make sure we have space for at least
339 * one backslash and one other character
340 * and the trailing NUL byte.
343 if (pos
+ 2 >= (int)ln
.sz
)
344 resize_buf(&ln
, 256);
347 * Warn about bogus characters. If you're using
348 * non-ASCII encoding, you're screwing your
349 * readers. Since I'd rather this not happen,
350 * I'll be helpful and replace these characters
351 * with "?", so we don't display gibberish.
352 * Note to manual writers: use special characters.
355 c
= (unsigned char) blk
.buf
[i
];
357 if ( ! (isascii(c
) &&
358 (isgraph(c
) || isblank(c
)))) {
359 mandoc_vmsg(MANDOCERR_BADCHAR
, curp
,
360 curp
->line
, pos
, "0x%x", c
);
366 /* Trailing backslash = a plain char. */
368 if ('\\' != blk
.buf
[i
] || i
+ 1 == (int)blk
.sz
) {
369 ln
.buf
[pos
++] = blk
.buf
[i
++];
374 * Found escape and at least one other character.
375 * When it's a newline character, skip it.
376 * When there is a carriage return in between,
377 * skip that one as well.
380 if ('\r' == blk
.buf
[i
+ 1] && i
+ 2 < (int)blk
.sz
&&
381 '\n' == blk
.buf
[i
+ 2])
383 if ('\n' == blk
.buf
[i
+ 1]) {
389 if ('"' == blk
.buf
[i
+ 1] || '#' == blk
.buf
[i
+ 1]) {
391 /* Comment, skip to end of line */
392 for (; i
< (int)blk
.sz
; ++i
) {
393 if ('\n' == blk
.buf
[i
]) {
400 /* Backout trailing whitespaces */
401 for (; pos
> 0; --pos
) {
402 if (ln
.buf
[pos
- 1] != ' ')
404 if (pos
> 2 && ln
.buf
[pos
- 2] == '\\')
410 /* Catch escaped bogus characters. */
412 c
= (unsigned char) blk
.buf
[i
+1];
414 if ( ! (isascii(c
) &&
415 (isgraph(c
) || isblank(c
)))) {
416 mandoc_vmsg(MANDOCERR_BADCHAR
, curp
,
417 curp
->line
, pos
, "0x%x", c
);
423 /* Some other escape sequence, copy & cont. */
425 ln
.buf
[pos
++] = blk
.buf
[i
++];
426 ln
.buf
[pos
++] = blk
.buf
[i
++];
429 if (pos
>= (int)ln
.sz
)
430 resize_buf(&ln
, 256);
435 * A significant amount of complexity is contained by
436 * the roff preprocessor. It's line-oriented but can be
437 * expressed on one line, so we need at times to
438 * readjust our starting point and re-run it. The roff
439 * preprocessor can also readjust the buffers with new
440 * data, so we pass them in wholesale.
446 * Maintain a lookaside buffer of all parsed lines. We
447 * only do this if mparse_keep() has been invoked (the
448 * buffer may be accessed with mparse_getkeep()).
451 if (curp
->secondary
) {
452 curp
->secondary
->buf
= mandoc_realloc(
453 curp
->secondary
->buf
,
454 curp
->secondary
->sz
+ pos
+ 2);
455 memcpy(curp
->secondary
->buf
+
458 curp
->secondary
->sz
+= pos
;
460 [curp
->secondary
->sz
] = '\n';
461 curp
->secondary
->sz
++;
463 [curp
->secondary
->sz
] = '\0';
466 rr
= roff_parseln(curp
->roff
, curp
->line
,
467 &ln
.buf
, &ln
.sz
, of
, &of
);
471 if (REPARSE_LIMIT
>= ++curp
->reparse_count
)
472 mparse_buf_r(curp
, ln
, 0);
474 mandoc_msg(MANDOCERR_ROFFLOOP
, curp
,
475 curp
->line
, pos
, NULL
);
479 pos
= (int)strlen(ln
.buf
);
487 assert(MANDOCLEVEL_FATAL
<= curp
->file_status
);
490 if (0 == (MPARSE_SO
& curp
->options
) &&
491 (i
>= (int)blk
.sz
|| '\0' == blk
.buf
[i
])) {
492 curp
->sodest
= mandoc_strdup(ln
.buf
+ of
);
497 * We remove `so' clauses from our lookaside
498 * buffer because we're going to descend into
499 * the file recursively.
502 curp
->secondary
->sz
-= pos
+ 1;
503 mparse_readfd(curp
, -1, ln
.buf
+ of
);
504 if (MANDOCLEVEL_FATAL
<= curp
->file_status
) {
505 mandoc_vmsg(MANDOCERR_SO_FAIL
,
506 curp
, curp
->line
, pos
,
507 ".so %s", ln
.buf
+ of
);
517 * If we encounter errors in the recursive parse, make
518 * sure we don't continue parsing.
521 if (MANDOCLEVEL_FATAL
<= curp
->file_status
)
525 * If input parsers have not been allocated, do so now.
526 * We keep these instanced between parsers, but set them
527 * locally per parse routine since we can use different
528 * parsers with each one.
531 if ( ! (curp
->man
|| curp
->mdoc
))
532 pset(ln
.buf
+ of
, pos
- of
, curp
);
535 * Lastly, push down into the parsers themselves. One
536 * of these will have already been set in the pset()
538 * If libroff returns ROFF_TBL, then add it to the
539 * currently open parse. Since we only get here if
540 * there does exist data (see tbl_data.c), we're
541 * guaranteed that something's been allocated.
542 * Do the same for ROFF_EQN.
548 while (NULL
!= (span
= roff_span(curp
->roff
))) {
550 man_addspan(curp
->man
, span
) :
551 mdoc_addspan(curp
->mdoc
, span
);
555 else if (ROFF_EQN
== rr
)
557 mdoc_addeqn(curp
->mdoc
,
558 roff_eqn(curp
->roff
)) :
559 man_addeqn(curp
->man
,
560 roff_eqn(curp
->roff
));
561 else if (curp
->man
|| curp
->mdoc
)
563 man_parseln(curp
->man
,
564 curp
->line
, ln
.buf
, of
) :
565 mdoc_parseln(curp
->mdoc
,
566 curp
->line
, ln
.buf
, of
);
569 assert(MANDOCLEVEL_FATAL
<= curp
->file_status
);
574 /* Temporary buffers typically are not full. */
576 if (0 == start
&& '\0' == blk
.buf
[i
])
579 /* Start the next input line. */
588 read_whole_file(struct mparse
*curp
, const char *file
, int fd
,
589 struct buf
*fb
, int *with_mmap
)
596 if (-1 == fstat(fd
, &st
)) {
597 curp
->file_status
= MANDOCLEVEL_SYSERR
;
599 (*curp
->mmsg
)(MANDOCERR_SYSSTAT
, curp
->file_status
,
600 file
, 0, 0, strerror(errno
));
605 * If we're a regular file, try just reading in the whole entry
606 * via mmap(). This is faster than reading it into blocks, and
607 * since each file is only a few bytes to begin with, I'm not
608 * concerned that this is going to tank any machines.
611 if (S_ISREG(st
.st_mode
)) {
612 if (st
.st_size
>= (1U << 31)) {
613 curp
->file_status
= MANDOCLEVEL_FATAL
;
615 (*curp
->mmsg
)(MANDOCERR_TOOLARGE
,
616 curp
->file_status
, file
, 0, 0, NULL
);
620 fb
->sz
= (size_t)st
.st_size
;
621 fb
->buf
= mmap(NULL
, fb
->sz
, PROT_READ
, MAP_SHARED
, fd
, 0);
622 if (fb
->buf
!= MAP_FAILED
)
628 * If this isn't a regular file (like, say, stdin), then we must
629 * go the old way and just read things in bit by bit.
638 if (fb
->sz
== (1U << 31)) {
639 curp
->file_status
= MANDOCLEVEL_FATAL
;
641 (*curp
->mmsg
)(MANDOCERR_TOOLARGE
,
646 resize_buf(fb
, 65536);
648 ssz
= read(fd
, fb
->buf
+ (int)off
, fb
->sz
- off
);
654 curp
->file_status
= MANDOCLEVEL_SYSERR
;
656 (*curp
->mmsg
)(MANDOCERR_SYSREAD
,
657 curp
->file_status
, file
, 0, 0,
670 mparse_end(struct mparse
*curp
)
673 if (MANDOCLEVEL_FATAL
<= curp
->file_status
)
676 if (curp
->mdoc
== NULL
&&
678 curp
->sodest
== NULL
) {
679 if (curp
->options
& MPARSE_MDOC
)
680 curp
->mdoc
= curp
->pmdoc
;
682 if (curp
->pman
== NULL
)
683 curp
->pman
= man_alloc(curp
->roff
, curp
,
684 curp
->options
& MPARSE_QUICK
? 1 : 0);
685 curp
->man
= curp
->pman
;
689 if (curp
->mdoc
&& ! mdoc_endparse(curp
->mdoc
)) {
690 assert(MANDOCLEVEL_FATAL
<= curp
->file_status
);
694 if (curp
->man
&& ! man_endparse(curp
->man
)) {
695 assert(MANDOCLEVEL_FATAL
<= curp
->file_status
);
699 roff_endparse(curp
->roff
);
703 mparse_parse_buffer(struct mparse
*curp
, struct buf blk
, const char *file
)
706 static int recursion_depth
;
708 if (64 < recursion_depth
) {
709 mandoc_msg(MANDOCERR_ROFFLOOP
, curp
, curp
->line
, 0, NULL
);
713 /* Line number is per-file. */
719 mparse_buf_r(curp
, blk
, 1);
721 if (0 == --recursion_depth
&& MANDOCLEVEL_FATAL
> curp
->file_status
)
728 mparse_readmem(struct mparse
*curp
, const void *buf
, size_t len
,
733 blk
.buf
= UNCONST(buf
);
736 mparse_parse_buffer(curp
, blk
, file
);
737 return(curp
->file_status
);
741 mparse_readfd(struct mparse
*curp
, int fd
, const char *file
)
746 if (-1 == fd
&& -1 == (fd
= open(file
, O_RDONLY
, 0))) {
747 curp
->file_status
= MANDOCLEVEL_SYSERR
;
749 (*curp
->mmsg
)(MANDOCERR_SYSOPEN
,
751 file
, 0, 0, strerror(errno
));
756 * Run for each opened file; may be called more than once for
757 * each full parse sequence if the opened file is nested (i.e.,
758 * from `so'). Simply sucks in the whole file and moves into
759 * the parse phase for the file.
762 if ( ! read_whole_file(curp
, file
, fd
, &blk
, &with_mmap
))
765 mparse_parse_buffer(curp
, blk
, file
);
769 munmap(blk
.buf
, blk
.sz
);
774 if (STDIN_FILENO
!= fd
&& -1 == close(fd
))
777 return(curp
->file_status
);
781 mparse_alloc(int options
, enum mandoclevel wlevel
,
782 mandocmsg mmsg
, const char *defos
)
786 assert(wlevel
<= MANDOCLEVEL_FATAL
);
788 curp
= mandoc_calloc(1, sizeof(struct mparse
));
790 curp
->options
= options
;
791 curp
->wlevel
= wlevel
;
795 curp
->roff
= roff_alloc(curp
, options
);
796 if (curp
->options
& MPARSE_MDOC
)
797 curp
->pmdoc
= mdoc_alloc(
798 curp
->roff
, curp
, curp
->defos
,
799 curp
->options
& MPARSE_QUICK
? 1 : 0);
800 if (curp
->options
& MPARSE_MAN
)
801 curp
->pman
= man_alloc(curp
->roff
, curp
,
802 curp
->options
& MPARSE_QUICK
? 1 : 0);
808 mparse_reset(struct mparse
*curp
)
811 roff_reset(curp
->roff
);
814 mdoc_reset(curp
->mdoc
);
816 man_reset(curp
->man
);
818 curp
->secondary
->sz
= 0;
820 curp
->file_status
= MANDOCLEVEL_OK
;
829 mparse_free(struct mparse
*curp
)
833 mdoc_free(curp
->pmdoc
);
835 man_free(curp
->pman
);
837 roff_free(curp
->roff
);
839 free(curp
->secondary
->buf
);
841 free(curp
->secondary
);
847 mparse_result(struct mparse
*curp
,
848 struct mdoc
**mdoc
, struct man
**man
, char **sodest
)
851 if (sodest
&& NULL
!= (*sodest
= curp
->sodest
)) {
863 mandoc_vmsg(enum mandocerr t
, struct mparse
*m
,
864 int ln
, int pos
, const char *fmt
, ...)
870 (void)vsnprintf(buf
, sizeof(buf
), fmt
, ap
);
873 mandoc_msg(t
, m
, ln
, pos
, buf
);
877 mandoc_msg(enum mandocerr er
, struct mparse
*m
,
878 int ln
, int col
, const char *msg
)
880 enum mandoclevel level
;
882 level
= MANDOCLEVEL_FATAL
;
883 while (er
< mandoclimits
[level
])
886 if (level
< m
->wlevel
)
890 (*m
->mmsg
)(er
, level
, m
->file
, ln
, col
, msg
);
892 if (m
->file_status
< level
)
893 m
->file_status
= level
;
897 mparse_strerror(enum mandocerr er
)
900 return(mandocerrs
[er
]);
904 mparse_strlevel(enum mandoclevel lvl
)
906 return(mandoclevels
[lvl
]);
910 mparse_keep(struct mparse
*p
)
913 assert(NULL
== p
->secondary
);
914 p
->secondary
= mandoc_calloc(1, sizeof(struct buf
));
918 mparse_getkeep(const struct mparse
*p
)
921 assert(p
->secondary
);
922 return(p
->secondary
->sz
? p
->secondary
->buf
: NULL
);