genman.c: fixed warnings
[k8jam.git] / src / genman.c
blob9dee1387f001f0d08ddb22b3d48b5d3427478da8
/*
 * ttman - text to man converter
 *
 * Copyright 2006 Timo Hirvonen <tihirvon@gmail.com>
 *
 * This file is licensed under the GPLv2.
 *
 * changes by Ketmar // Invisible Vector
 */
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
// report a violated internal invariant, naming the enclosing function,
// and terminate through die(); __func__ is the standard C99 spelling, so
// the macro no longer depends on the GNU-only __FUNCTION__
#define BUG()  die("BUG in %s\n", __func__)

// function attributes understood by GNU-compatible compilers;
// they expand to nothing everywhere else
#if defined(__GNUC__)
# define __NORETURN  __attribute__((__noreturn__))
# define __PRINTF(fmt,nvar)  __attribute__((format(printf, fmt, nvar)))
#else
# define __NORETURN
# define __PRINTF(fmt,nvar)
#endif
// one lexical unit of the input; tokens are chained into a doubly-linked list
typedef struct token {
  struct token *next;
  struct token *prev;
  enum {
    // inline elements
    TOK_TEXT,   // max one line w/o \n
    TOK_NL,     // \n
    TOK_ITALIC, // `
    TOK_BOLD,   // *
    TOK_INDENT, // \t
    // keywords (@...); everything >= TOK_H1 is treated as a keyword
    TOK_H1,
    TOK_H2,
    TOK_LI,
    TOK_BR,
    TOK_PRE,
    TOK_ENDPRE, // must be after TOK_PRE (the closer is looked up as opener+1)
    TOK_RAW,
    TOK_ENDRAW, // must be after TOK_RAW (the closer is looked up as opener+1)
    TOK_TITLE,  // WRITE 2 2001-12-13 "Linux 2.0.32" "Linux Programmer's Manual"
  } type;
  // input line the token was created on
  size_t line;
  // token text; not NUL-terminated
  const char *text;
  // length of text
  size_t len;
} Token;
62 static const char *filename;
63 static char tmp_file[1024];
64 static FILE *outfile;
65 static int cur_line = 1;
66 static Token head = { &head, &head, TOK_TEXT, 0, NULL, 0 };
68 static int bold = 0; // bool
69 static int italic = 0; // bool
70 static size_t indent = 0;
73 static void reset (void) {
74 cur_line = 1;
75 head.next = head.prev = &head;
76 head.type = TOK_TEXT;
77 head.line = 0;
78 head.text = NULL;
79 head.len = 0;
80 bold = italic = 0;
81 indent = 0;
// string literal together with its length (terminating NUL excluded)
#define CONST_STR(str)  { str, sizeof(str)-1 }

// token names, indexed by token type; the order must match the type enum
static const struct {
  const char *str;
  size_t len;
} token_names[] = {
  // inline elements
  CONST_STR("text"),
  CONST_STR("nl"),
  CONST_STR("italic"),
  CONST_STR("bold"),
  CONST_STR("indent"),
  // keywords
  CONST_STR("h1"),
  CONST_STR("h2"),
  CONST_STR("li"),
  CONST_STR("br"),
  CONST_STR("pre"),
  CONST_STR("endpre"),
  CONST_STR("raw"),
  CONST_STR("endraw"),
  CONST_STR("title")
};

// number of entries in token_names
#define NR_TOKEN_NAMES  (sizeof(token_names)/sizeof(token_names[0]))
109 static __NORETURN void quit (void) {
110 if (tmp_file[0]) unlink(tmp_file);
111 exit(1);
115 static __NORETURN __PRINTF(1, 2) void die (const char *format, ...) {
116 va_list ap;
117 fprintf(stderr, "GENMAN: ");
118 va_start(ap, format);
119 vfprintf(stderr, format, ap);
120 va_end(ap);
121 quit();
125 static __NORETURN __PRINTF(2, 3) void syntax (int line, const char *format, ...) {
126 va_list ap;
127 fprintf(stderr, "%s:%d: error: ", filename, line);
128 va_start(ap, format);
129 vfprintf(stderr, format, ap);
130 va_end(ap);
131 quit();
135 static inline const char *keyword_name (int type) {
136 if (type < TOK_H1 || type > TOK_TITLE) die("BUG: no keyword name for type %d\n", type);
137 return token_names[type].str;
// malloc() that never returns NULL: an allocation failure terminates
// the program through die().
static void *xmalloc (size_t size) {
  void *ret = malloc(size);
  // %zu is the portable conversion for size_t; no per-architecture #ifdef needed
  if (!ret) die("OOM when allocating %zu bytes\n", size);
  return ret;
}
// Return a freshly allocated, NUL-terminated copy of the first `len`
// bytes of `str`. The caller owns the result.
static char *memdup (const char *str, int len) {
  char *copy = xmalloc(len+1);
  if (len > 0) {
    memcpy(copy, str, len);
  }
  copy[len] = '\0';
  return copy;
}
159 static Token *new_token (int type) {
160 Token *tok = xmalloc(sizeof(Token));
161 tok->prev = NULL;
162 tok->next = NULL;
163 tok->type = type;
164 tok->line = cur_line;
165 return tok;
169 static void free_token (Token *tok) {
170 Token *prev = tok->prev, *next = tok->next;
171 if (tok == &head) BUG();
172 prev->next = next;
173 next->prev = prev;
174 free(tok);
178 static void emit_token (Token *tok) {
179 tok->prev = head.prev;
180 tok->next = &head;
181 head.prev->next = tok;
182 head.prev = tok;
186 static void emit (int type) {
187 Token *tok = new_token(type);
188 tok->len = 0;
189 tok->text = NULL;
190 emit_token(tok);
194 static int emit_keyword (const char *buf, size_t size) {
195 size_t len;
196 for (len = 0; len < size; ++len) if (!isalnum((unsigned char)buf[len])) break;
197 if (!len) syntax(cur_line, "keyword expected\n");
198 for (size_t i = TOK_H1; i < NR_TOKEN_NAMES; ++i) {
199 if (len != token_names[i].len) continue;
200 if (!strncmp(buf, token_names[i].str, len)) {
201 emit(i);
202 return len;
205 syntax(cur_line, "invalid keyword '@%s'\n", memdup(buf, len));
209 static int emit_text (const char *buf, size_t size) {
210 Token *tok;
211 size_t i;
212 for (i = 0; i < size; ++i) {
213 int c = (unsigned char)buf[i];
214 if (c == '@' || c == '`' || c == '*' || c == '\n' || c == '\\' || c == '\t') break;
216 tok = new_token(TOK_TEXT);
217 tok->text = buf;
218 tok->len = i;
219 emit_token(tok);
220 return i;
224 static void tokenize (const char *buf, size_t size) {
225 size_t pos = 0;
226 while (pos < size) {
227 Token *tok;
228 int ch;
229 ch = (unsigned char)buf[pos++];
230 switch (ch) {
231 case '@':
232 pos += emit_keyword(buf+pos, size-pos);
233 break;
234 case '`':
235 emit(TOK_ITALIC);
236 break;
237 case '*':
238 emit(TOK_BOLD);
239 break;
240 case '\n':
241 emit(TOK_NL);
242 ++cur_line;
243 break;
244 case '\t':
245 emit(TOK_INDENT);
246 break;
247 case ' ': // this can be space or indent
248 if (pos == 1 || pos >= size || buf[pos-2] != '\n' || buf[pos] != ' ') goto normal_text;
249 // indent
250 --pos; // first space
251 while (pos < size) {
252 if (pos+1 >= size) syntax(cur_line, "invalid indentation\n");
253 if (buf[pos] != ' ') break; // done
254 if (buf[pos+1] != ' ') syntax(cur_line, "invalid indentation\n");
255 emit(TOK_INDENT);
256 pos += 2;
258 break;
259 case '\\':
260 tok = new_token(TOK_TEXT);
261 tok->text = buf+pos;
262 tok->len = 1;
263 ++pos;
264 if (pos == size || buf[pos] == '\n') {
265 // just one '\\'
266 --tok->text;
268 if (tok->text[0] == '\\') {
269 tok->text = "\\\\";
270 tok->len = 2;
272 emit_token(tok);
273 break;
274 default:
275 normal_text:
276 --pos;
277 pos += emit_text(buf+pos, size-pos);
278 break;
284 static int is_empty_line (const Token *tok) {
285 while (tok != &head) {
286 switch (tok->type) {
287 case TOK_TEXT:
288 for (size_t i = 0; i < tok->len; ++i) if (tok->text[i] != ' ') return 0;
289 break;
290 case TOK_INDENT:
291 break;
292 case TOK_NL:
293 return 1;
294 default:
295 return 0;
297 tok = tok->next;
299 return 1;
303 static Token *remove_line (Token *tok) {
304 while (tok != &head) {
305 Token *next = tok->next;
306 int type = tok->type;
307 free_token(tok);
308 tok = next;
309 if (type == TOK_NL) break;
311 return tok;
315 static Token *skip_after (Token *tok, int type) {
316 Token *save = tok;
317 while (tok != &head) {
318 if ((int)tok->type == type) {
319 tok = tok->next;
320 if (tok->type != TOK_NL) syntax(tok->line, "newline expected after @%s\n", keyword_name(type));
321 return tok->next;
323 if (tok->type >= TOK_H1) syntax(tok->line, "keywords not allowed betweed @%s and @%s\n", keyword_name(type-1), keyword_name(type));
324 tok = tok->next;
326 syntax(save->prev->line, "missing @%s\n", keyword_name(type));
330 static Token *get_next_line (Token *tok) {
331 while (tok != &head) {
332 int type = tok->type;
333 tok = tok->next;
334 if (type == TOK_NL) break;
336 return tok;
340 static Token *get_indent (Token *tok, size_t *ip) {
341 size_t i = 0;
342 for (i = 0; tok != &head && tok->type == TOK_INDENT; ++i) tok = tok->next;
343 *ip = i;
344 return tok;
348 // line must be non-empty
349 static Token *check_line (Token *tok, size_t *ip) {
350 Token *start;
351 int tok_type;
352 start = tok = get_indent(tok, ip);
353 tok_type = tok->type;
354 switch (tok_type) {
355 case TOK_TEXT:
356 case TOK_BOLD:
357 case TOK_ITALIC:
358 case TOK_BR:
359 tok = tok->next;
360 while (tok != &head) {
361 switch (tok->type) {
362 case TOK_TEXT:
363 case TOK_BOLD:
364 case TOK_ITALIC:
365 case TOK_BR:
366 case TOK_INDENT:
367 break;
368 case TOK_NL:
369 return start;
370 default:
371 syntax(tok->line, "@%s not allowed inside paragraph\n", keyword_name(tok->type));
373 tok = tok->next;
375 break;
376 case TOK_H1:
377 case TOK_H2:
378 case TOK_TITLE:
379 if (*ip) goto indentation;
380 // check arguments
381 tok = tok->next;
382 while (tok != &head) {
383 switch (tok->type) {
384 case TOK_TEXT:
385 case TOK_INDENT:
386 break;
387 case TOK_NL:
388 return start;
389 default:
390 syntax(tok->line, "@%s can contain only text\n", keyword_name(tok_type));
392 tok = tok->next;
394 break;
395 case TOK_LI:
396 // check arguments
397 tok = tok->next;
398 while (tok != &head) {
399 switch (tok->type) {
400 case TOK_TEXT:
401 case TOK_BOLD:
402 case TOK_ITALIC:
403 case TOK_INDENT:
404 break;
405 case TOK_NL:
406 return start;
407 default:
408 syntax(tok->line, "@%s not allowed inside @li\n",
409 keyword_name(tok->type));
411 tok = tok->next;
413 break;
414 case TOK_PRE:
415 // checked later
416 break;
417 case TOK_RAW:
418 if (*ip) goto indentation;
419 // checked later
420 break;
421 case TOK_ENDPRE:
422 case TOK_ENDRAW:
423 syntax(tok->line, "@%s not expected\n", keyword_name(tok->type));
424 break;
425 case TOK_NL:
426 case TOK_INDENT:
427 BUG();
428 break;
430 return start;
431 indentation:
432 syntax(tok->line, "indentation before @%s\n", keyword_name(tok->type));
436 static void insert_nl_before (Token *next) {
437 Token *prev = next->prev;
438 Token *new = new_token(TOK_NL);
439 new->prev = prev;
440 new->next = next;
441 prev->next = new;
442 next->prev = new;
446 static void normalize (void) {
447 Token *tok = head.next;
449 * >= 0 if previous line was text (== amount of indent)
450 * -1 if previous block was @pre (amount of indent doesn't matter)
451 * -2 otherwise (@h1 etc., indent was 0)
453 long prev_indent = -2;
454 while (tok != &head) {
455 Token *start;
456 size_t i;
457 int new_para = 0; // bool
458 // remove empty lines
459 while (is_empty_line(tok)) {
460 tok = remove_line(tok);
461 new_para = 1;
462 if (tok == &head) return;
464 // skips indent
465 start = tok;
466 tok = check_line(tok, &i);
467 switch (tok->type) {
468 case TOK_TEXT:
469 case TOK_ITALIC:
470 case TOK_BOLD:
471 case TOK_BR:
472 // normal text
473 if (new_para && prev_indent >= -1) {
474 // previous line/block was text or @pre
475 // and there was a empty line after it
476 insert_nl_before(start);
478 if (!new_para && prev_indent == (long)i) {
479 // join with previous line
480 Token *nl = start->prev;
481 if (nl->type != TOK_NL) BUG();
482 if ((nl->prev != &head && nl->prev->type == TOK_BR) || tok->type == TOK_BR) {
483 // don't convert \n after/before @br to ' '
484 free_token(nl);
485 } else {
486 // convert "\n" to " "
487 nl->type = TOK_TEXT;
488 nl->text = " ";
489 nl->len = 1;
491 // remove indent
492 while (start->type == TOK_INDENT) {
493 Token *next = start->next;
494 free_token(start);
495 start = next;
498 prev_indent = (long)i;
499 tok = get_next_line(tok);
500 break;
501 case TOK_PRE:
502 case TOK_RAW:
503 // these can be directly after normal text
504 // but not joined with the previous line
505 if (new_para && prev_indent >= -1) {
506 // previous line/block was text or @pre
507 // and there was a empty line after it
508 insert_nl_before(start);
510 tok = skip_after(tok->next, tok->type+1);
511 prev_indent = -1;
512 break;
513 case TOK_H1:
514 case TOK_H2:
515 case TOK_LI:
516 case TOK_TITLE:
517 // remove white space after H1, H2, L1 and TITLE
518 tok = tok->next;
519 while (tok != &head) {
520 int type = tok->type;
521 Token *next;
522 if (type == TOK_TEXT) {
523 while (tok->len && *tok->text == ' ') {
524 ++(tok->text);
525 --(tok->len);
527 if (tok->len) break;
529 if (type != TOK_INDENT) break;
530 // empty TOK_TEXT or TOK_INDENT
531 next = tok->next;
532 free_token(tok);
533 tok = next;
535 // not normal text. can't be joined
536 prev_indent = -2;
537 tok = get_next_line(tok);
538 break;
539 case TOK_NL:
540 case TOK_INDENT:
541 case TOK_ENDPRE:
542 case TOK_ENDRAW:
543 BUG();
544 break;
550 #define output(...) fprintf(outfile, __VA_ARGS__)
552 static void output_buf (const char *buf, int len) {
553 fwrite(buf, 1, len, outfile);
557 static void output_text (Token *tok) {
558 char buf[1024];
559 const char *str = tok->text;
560 int len = tok->len;
561 int pos = 0;
562 while (len) {
563 int c = *str++;
564 if (pos >= (int)sizeof(buf)-1) {
565 output_buf(buf, pos);
566 pos = 0;
568 if (c == '-') buf[pos++] = '\\';
569 buf[pos++] = c;
570 --len;
572 if (pos) output_buf(buf, pos);
576 static Token *output_pre (Token *tok) {
577 int bol = 1;
578 if (tok->type != TOK_NL) syntax(tok->line, "newline expected after @pre\n");
579 output(".nf\n");
580 tok = tok->next;
581 while (tok != &head) {
582 if (bol) {
583 size_t i;
584 tok = get_indent(tok, &i);
585 if (i != indent && tok->type != TOK_NL) syntax(tok->line, "indent changed in @pre\n");
587 switch (tok->type) {
588 case TOK_TEXT:
589 if (bol && tok->len && tok->text[0] == '.') output("\\&");
590 output_text(tok);
591 break;
592 case TOK_NL:
593 output("\n");
594 bol = 1;
595 tok = tok->next;
596 continue;
597 case TOK_ITALIC:
598 output("`");
599 break;
600 case TOK_BOLD:
601 output("*");
602 break;
603 case TOK_INDENT:
604 // FIXME: warn
605 output(" ");
606 break;
607 case TOK_ENDPRE:
608 output(".fi\n");
609 tok = tok->next;
610 if (tok != &head && tok->type == TOK_NL) tok = tok->next;
611 return tok;
612 default:
613 BUG();
614 break;
616 bol = 0;
617 tok = tok->next;
619 return tok;
623 static Token *output_raw (Token *tok) {
624 if (tok->type != TOK_NL) syntax(tok->line, "newline expected after @raw\n");
625 tok = tok->next;
626 while (tok != &head) {
627 switch (tok->type) {
628 case TOK_TEXT:
629 if (tok->len == 2 && !strncmp(tok->text, "\\\\", 2)) {
630 /* ugly special case
631 * "\\" (\) was converted to "\\\\" (\\) because
632 * nroff does escaping too.
634 output("\\");
635 } else {
636 output_buf(tok->text, tok->len);
638 break;
639 case TOK_NL:
640 output("\n");
641 break;
642 case TOK_ITALIC:
643 output("`");
644 break;
645 case TOK_BOLD:
646 output("*");
647 break;
648 case TOK_INDENT:
649 output("\t");
650 break;
651 case TOK_ENDRAW:
652 tok = tok->next;
653 if (tok != &head && tok->type == TOK_NL) tok = tok->next;
654 return tok;
655 default:
656 BUG();
657 break;
659 tok = tok->next;
661 return tok;
665 static Token *output_para (Token *tok) {
666 int bol = 1; // bool
667 while (tok != &head) {
668 switch (tok->type) {
669 case TOK_TEXT:
670 output_text(tok);
671 break;
672 case TOK_ITALIC:
673 italic ^= 1;
674 if (italic) {
675 output("\\fI");
676 } else {
677 output("\\fR");
679 break;
680 case TOK_BOLD:
681 bold ^= 1;
682 if (bold) {
683 output("\\fB");
684 } else {
685 output("\\fR");
687 break;
688 case TOK_BR:
689 if (bol) {
690 output(".br\n");
691 } else {
692 output("\n.br\n");
694 bol = 1;
695 tok = tok->next;
696 continue;
697 case TOK_NL:
698 output("\n");
699 return tok->next;
700 case TOK_INDENT:
701 output(" ");
702 break;
703 default:
704 BUG();
705 break;
707 bol = 0;
708 tok = tok->next;
710 return tok;
714 static Token *title (Token *tok, const char *cmd) {
715 output("%s", cmd);
716 return output_para(tok->next);
720 static Token *dump_one (Token *tok) {
721 size_t i;
722 tok = get_indent(tok, &i);
723 if (tok->type != TOK_RAW) {
724 while (indent < i) {
725 output(".RS\n");
726 ++indent;
728 while (indent > i) {
729 output(".RE\n");
730 --indent;
733 switch (tok->type) {
734 case TOK_TEXT:
735 case TOK_ITALIC:
736 case TOK_BOLD:
737 case TOK_BR:
738 if (tok->type == TOK_TEXT && tok->len && tok->text[0] == '.') output("\\&");
739 tok = output_para(tok);
740 break;
741 case TOK_H1:
742 tok = title(tok, ".SH ");
743 break;
744 case TOK_H2:
745 tok = title(tok, ".SS ");
746 break;
747 case TOK_LI:
748 tok = title(tok, ".TP\n");
749 break;
750 case TOK_PRE:
751 tok = output_pre(tok->next);
752 break;
753 case TOK_RAW:
754 tok = output_raw(tok->next);
755 break;
756 case TOK_TITLE:
757 tok = title(tok, ".TH ");
758 // must be after .TH
759 // no hyphenation, adjust left
760 output(".nh\n.ad l\n");
761 break;
762 case TOK_NL:
763 output("\n");
764 tok = tok->next;
765 break;
766 case TOK_ENDPRE:
767 case TOK_ENDRAW:
768 case TOK_INDENT:
769 BUG();
770 break;
772 return tok;
776 static void dump (void) {
777 Token *tok = head.next;
778 while (tok != &head) tok = dump_one(tok);
782 static void process (void) {
783 struct stat s;
784 const char *buf;
785 int fd = open(filename, O_RDONLY);
786 if (fd == -1) die("opening `%s' for reading: %s\n", filename, strerror(errno));
787 fstat(fd, &s);
788 if (s.st_size) {
789 buf = mmap(NULL, s.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
790 if (buf == MAP_FAILED) die("mmap: %s\n", strerror(errno));
791 tokenize(buf, s.st_size);
792 normalize();
794 dump();
798 void generate_man (const char *infname, const char *outfname) {
799 int fd;
801 reset();
802 filename = infname;
804 snprintf(tmp_file, sizeof(tmp_file), "%s.XXXXXX", outfname);
805 fd = mkstemp(tmp_file);
806 if (fd < 0) die("creating %s: %s\n", tmp_file, strerror(errno));
808 outfile = fdopen(fd, "w");
809 if (!outfile) die("opening %s: %s\n", tmp_file, strerror(errno));
811 process();
813 if (rename(tmp_file, outfname)) die("renaming %s to %s: %s\n", tmp_file, outfname, strerror(errno));