/*
 * 3057 Remove sgml util from cmd/man
 * [unleashed.git] / usr / src / cmd / localedef / scanner.c
 * blob e1f0e6da20f0e40f07d180260e9ad2eb3a7626a6
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * This file contains the "scanner", which tokenizes the input files
 * for localedef for processing by the higher level grammar processor.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <widec.h>
#include <sys/types.h>
#include <assert.h>
#include "localedef.h"
#include "parser.tab.h"
32 int com_char = '#';
33 int esc_char = '\\';
34 int mb_cur_min = 1;
35 int mb_cur_max = 1;
36 int lineno = 1;
37 int warnings = 0;
38 static int nextline;
39 static FILE *input = stdin;
40 static const char *filename = "<stdin>";
41 static int instring = 0;
42 static int escaped = 0;
45 * Token space ... grows on demand.
47 static char *token = NULL;
48 static int tokidx;
49 static int toksz = 0;
50 static int hadtok = 0;
53 * Wide string space ... grows on demand.
55 static wchar_t *widestr = NULL;
56 static int wideidx = 0;
57 static int widesz = 0;
60 * The last keyword seen. This is useful to trigger the special lexer rules
61 * for "copy" and also collating symbols and elements.
63 int last_kw = 0;
64 static int category = T_END;
66 static struct token {
67 int id;
68 const char *name;
69 } keywords[] = {
70 { T_COM_CHAR, "comment_char" },
71 { T_ESC_CHAR, "escape_char" },
72 { T_END, "END" },
73 { T_COPY, "copy" },
74 { T_MESSAGES, "LC_MESSAGES" },
75 { T_YESSTR, "yesstr" },
76 { T_YESEXPR, "yesexpr" },
77 { T_NOSTR, "nostr" },
78 { T_NOEXPR, "noexpr" },
79 { T_MONETARY, "LC_MONETARY" },
80 { T_INT_CURR_SYMBOL, "int_curr_symbol" },
81 { T_CURRENCY_SYMBOL, "currency_symbol" },
82 { T_MON_DECIMAL_POINT, "mon_decimal_point" },
83 { T_MON_THOUSANDS_SEP, "mon_thousands_sep" },
84 { T_POSITIVE_SIGN, "positive_sign" },
85 { T_NEGATIVE_SIGN, "negative_sign" },
86 { T_MON_GROUPING, "mon_grouping" },
87 { T_INT_FRAC_DIGITS, "int_frac_digits" },
88 { T_FRAC_DIGITS, "frac_digits" },
89 { T_P_CS_PRECEDES, "p_cs_precedes" },
90 { T_P_SEP_BY_SPACE, "p_sep_by_space" },
91 { T_N_CS_PRECEDES, "n_cs_precedes" },
92 { T_N_SEP_BY_SPACE, "n_sep_by_space" },
93 { T_P_SIGN_POSN, "p_sign_posn" },
94 { T_N_SIGN_POSN, "n_sign_posn" },
95 { T_INT_P_CS_PRECEDES, "int_p_cs_precedes" },
96 { T_INT_N_CS_PRECEDES, "int_n_cs_precedes" },
97 { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" },
98 { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" },
99 { T_INT_P_SIGN_POSN, "int_p_sign_posn" },
100 { T_INT_N_SIGN_POSN, "int_n_sign_posn" },
101 { T_COLLATE, "LC_COLLATE" },
102 { T_COLLATING_SYMBOL, "collating-symbol" },
103 { T_COLLATING_ELEMENT, "collating-element" },
104 { T_FROM, "from" },
105 { T_ORDER_START, "order_start" },
106 { T_ORDER_END, "order_end" },
107 { T_FORWARD, "forward" },
108 { T_BACKWARD, "backward" },
109 { T_POSITION, "position" },
110 { T_IGNORE, "IGNORE" },
111 { T_UNDEFINED, "UNDEFINED" },
112 { T_NUMERIC, "LC_NUMERIC" },
113 { T_DECIMAL_POINT, "decimal_point" },
114 { T_THOUSANDS_SEP, "thousands_sep" },
115 { T_GROUPING, "grouping" },
116 { T_TIME, "LC_TIME" },
117 { T_ABDAY, "abday" },
118 { T_DAY, "day" },
119 { T_ABMON, "abmon" },
120 { T_MON, "mon" },
121 { T_D_T_FMT, "d_t_fmt" },
122 { T_D_FMT, "d_fmt" },
123 { T_T_FMT, "t_fmt" },
124 { T_AM_PM, "am_pm" },
125 { T_T_FMT_AMPM, "t_fmt_ampm" },
126 { T_ERA, "era" },
127 { T_ERA_D_FMT, "era_d_fmt" },
128 { T_ERA_T_FMT, "era_t_fmt" },
129 { T_ERA_D_T_FMT, "era_d_t_fmt" },
130 { T_ALT_DIGITS, "alt_digits" },
131 { T_CTYPE, "LC_CTYPE" },
132 { T_ISUPPER, "upper" },
133 { T_ISLOWER, "lower" },
134 { T_ISALPHA, "alpha" },
135 { T_ISDIGIT, "digit" },
136 { T_ISPUNCT, "punct" },
137 { T_ISXDIGIT, "xdigit" },
138 { T_ISSPACE, "space" },
139 { T_ISPRINT, "print" },
140 { T_ISGRAPH, "graph" },
141 { T_ISBLANK, "blank" },
142 { T_ISCNTRL, "cntrl" },
144 * These entries are local additions, and not specified by
145 * TOG. Note that they are not guaranteed to be accurate for
146 * all locales, and so applications should not depend on them.
148 { T_ISSPECIAL, "special" },
149 { T_ISENGLISH, "english" },
150 { T_ISPHONOGRAM, "phonogram" },
151 { T_ISIDEOGRAM, "ideogram" },
152 { T_ISNUMBER, "number" },
154 * We have to support this in the grammar, but it would be a
155 * syntax error to define a character as one of these without
156 * also defining it as an alpha or digit. We ignore it in our
157 * parsing.
159 { T_ISALNUM, "alnum" },
160 { T_TOUPPER, "toupper" },
161 { T_TOLOWER, "tolower" },
164 * These are keywords used in the charmap file. Note that
165 * Solaris orginally used angle brackets to wrap some of them,
166 * but we removed that to simplify our parser. The first of these
167 * items are "global items."
169 { T_CHARMAP, "CHARMAP" },
170 { T_WIDTH, "WIDTH" },
171 { T_WIDTH_DEFAULT, "WIDTH_DEFAULT" },
173 { -1, NULL },
177 * These special words are only used in a charmap file, enclosed in <>.
179 static struct token symwords[] = {
180 { T_COM_CHAR, "comment_char" },
181 { T_ESC_CHAR, "escape_char" },
182 { T_CODE_SET, "code_set_name" },
183 { T_MB_CUR_MAX, "mb_cur_max" },
184 { T_MB_CUR_MIN, "mb_cur_min" },
185 { -1, NULL },
188 static int categories[] = {
189 T_CHARMAP,
190 T_CTYPE,
191 T_COLLATE,
192 T_MESSAGES,
193 T_MONETARY,
194 T_NUMERIC,
195 T_TIME,
199 void
200 reset_scanner(const char *fname)
202 if (fname == NULL) {
203 filename = "<stdin>";
204 input = stdin;
205 } else {
206 if (input != stdin)
207 (void) fclose(input);
208 if ((input = fopen(fname, "r")) == NULL) {
209 perror("fopen");
210 exit(4);
212 filename = fname;
214 com_char = '#';
215 esc_char = '\\';
216 instring = 0;
217 escaped = 0;
218 lineno = 1;
219 nextline = 1;
220 tokidx = 0;
221 wideidx = 0;
/*
 * Numeric value of one hexadecimal digit character.  The argument must
 * already have passed isxdigit().  It is evaluated more than once, so it
 * must not have side effects; it is parenthesized so that an expression
 * argument such as (c | 0x20) is converted as a whole.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
/* True when x is one of the octal digit characters '0' through '7'. */
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
228 static int
229 scanc(void)
231 int c;
233 c = getc(input);
234 lineno = nextline;
235 if (c == '\n') {
236 nextline++;
238 return (c);
241 static void
242 unscanc(int c)
244 if (c == '\n') {
245 nextline--;
247 if (ungetc(c, input) < 0) {
248 yyerror(_("ungetc failed"));
/*
 * Scan exactly two hexadecimal digits and return the value they encode.
 * Anything else is reported through yyerror().
 */
static int
scan_hex_byte(void)
{
	int hi, lo;

	hi = scanc();
	if (!isxdigit(hi)) {
		yyerror(_("malformed hex digit"));
		return (0);
	}

	lo = scanc();
	if (!isxdigit(lo)) {
		yyerror(_("malformed hex digit"));
		return (0);
	}

	return ((hex(hi) << 4) | hex(lo));
}
/*
 * Scan two or three decimal digits and return the value they encode.
 * The first two digits are mandatory; a third is consumed only if it is
 * a digit, otherwise it is pushed back.
 *
 * NOTE(review): the result is not range checked here, so three digits can
 * produce a value larger than a byte (up to 999).
 */
static int
scan_dec_byte(void)
{
	int d1, d2, d3;
	int val;

	d1 = scanc();
	if (!isdigit(d1)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}

	d2 = scanc();
	if (!isdigit(d2)) {
		yyerror(_("malformed decimal digit"));
		return (0);
	}
	val = ((d1 - '0') * 10) + (d2 - '0');

	d3 = scanc();
	if (isdigit(d3))
		val = (val * 10) + (d3 - '0');
	else
		unscanc(d3);

	return (val);
}
/*
 * Scan two or three octal digits and return the value they encode.
 * The first two digits are mandatory; a third is consumed only if it is
 * an octal digit, otherwise it is pushed back.
 *
 * NOTE(review): the result is not range checked here, so three digits can
 * produce a value larger than a byte (up to 0777).
 */
static int
scan_oct_byte(void)
{
	int o1, o2, o3;
	int val;

	o1 = scanc();
	if (!isodigit(o1)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}

	o2 = scanc();
	if (!isodigit(o2)) {
		yyerror(_("malformed octal digit"));
		return (0);
	}
	val = ((o1 - '0') * 8) + (o2 - '0');

	o3 = scanc();
	if (isodigit(o3))
		val = (val * 8) + (o3 - '0');
	else
		unscanc(o3);

	return (val);
}
332 void
333 add_tok(int c)
335 if ((tokidx + 1) >= toksz) {
336 toksz += 64;
337 if ((token = realloc(token, toksz)) == NULL) {
338 yyerror(_("out of memory"));
339 tokidx = 0;
340 toksz = 0;
341 return;
345 token[tokidx++] = (char)c;
346 token[tokidx] = 0;
348 void
349 add_wcs(wchar_t c)
351 if ((wideidx + 1) >= widesz) {
352 widesz += 64;
353 widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
354 if (widestr == NULL) {
355 yyerror(_("out of memory"));
356 wideidx = 0;
357 widesz = 0;
358 return;
362 widestr[wideidx++] = c;
363 widestr[wideidx] = 0;
366 wchar_t *
367 get_wcs(void)
369 wchar_t *ws = widestr;
370 wideidx = 0;
371 widestr = NULL;
372 widesz = 0;
373 if (ws == NULL) {
374 if ((ws = wsdup(L"")) == NULL) {
375 yyerror(_("out of memory"));
378 return (ws);
381 static int
382 get_byte(void)
384 int c;
386 if ((c = scanc()) != esc_char) {
387 unscanc(c);
388 return (EOF);
390 c = scanc();
392 switch (c) {
393 case 'd':
394 case 'D':
395 return (scan_dec_byte());
396 case 'x':
397 case 'X':
398 return (scan_hex_byte());
399 case '0':
400 case '1':
401 case '2':
402 case '3':
403 case '4':
404 case '5':
405 case '6':
406 case '7':
407 /* put the character back so we can get it */
408 unscanc(c);
409 return (scan_oct_byte());
410 default:
411 unscanc(c);
412 unscanc(esc_char);
413 return (EOF);
/*
 * Translate the character that followed the escape character.
 *
 * The letters n, r, t, f, v, b and a map to the corresponding C control
 * characters; every other character stands for itself.
 */
int
get_escaped(int c)
{
	switch (c) {
	case 'n':
		return ('\n');
	case 'r':
		return ('\r');
	case 't':
		return ('\t');
	case 'f':
		return ('\f');
	case 'v':
		return ('\v');
	case 'b':
		return ('\b');
	case 'a':
		return ('\a');
	default:
		return (c);
	}
}
441 get_wide(void)
443 static char mbs[MB_LEN_MAX + 1] = "";
444 static int mbi = 0;
445 int c;
446 wchar_t wc;
448 if (mb_cur_max >= sizeof (mbs)) {
449 yyerror(_("max multibyte character size too big"));
450 mbi = 0;
451 return (T_NULL);
453 for (;;) {
454 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
456 * end of the byte sequence reached, but no
457 * valid wide decoding. fatal error.
459 mbi = 0;
460 yyerror(_("not a valid character encoding"));
461 return (T_NULL);
463 mbs[mbi++] = c;
464 mbs[mbi] = 0;
466 /* does it decode? */
467 if (to_wide(&wc, mbs) >= 0) {
468 break;
472 mbi = 0;
473 if (category != T_CHARMAP) {
474 if (check_charmap(wc) < 0) {
475 yyerror(_("no symbolic name for character"));
476 return (T_NULL);
480 yylval.wc = wc;
481 return (T_CHAR);
485 get_symbol(void)
487 int c;
489 while ((c = scanc()) != EOF) {
490 if (escaped) {
491 escaped = 0;
492 if (c == '\n')
493 continue;
494 add_tok(get_escaped(c));
495 continue;
497 if (c == esc_char) {
498 escaped = 1;
499 continue;
501 if (c == '\n') { /* well that's strange! */
502 yyerror(_("unterminated symbolic name"));
503 continue;
505 if (c == '>') { /* end of symbol */
508 * This restarts the token from the beginning
509 * the next time we scan a character. (This
510 * token is complete.)
513 if (token == NULL) {
514 yyerror(_("missing symbolic name"));
515 return (T_NULL);
517 tokidx = 0;
520 * A few symbols are handled as keywords outside
521 * of the normal categories.
523 if (category == T_END) {
524 int i;
525 for (i = 0; symwords[i].name != 0; i++) {
526 if (strcmp(token, symwords[i].name) ==
527 0) {
528 last_kw = symwords[i].id;
529 return (last_kw);
534 * Contextual rule: Only literal characters are
535 * permitted in CHARMAP. Anywhere else the symbolic
536 * forms are fine.
538 if ((category != T_CHARMAP) &&
539 (lookup_charmap(token, &yylval.wc)) != -1) {
540 return (T_CHAR);
542 if ((yylval.collsym = lookup_collsym(token)) != NULL) {
543 return (T_COLLSYM);
545 if ((yylval.collelem = lookup_collelem(token)) !=
546 NULL) {
547 return (T_COLLELEM);
549 /* its an undefined symbol */
550 yylval.token = strdup(token);
551 token = NULL;
552 toksz = 0;
553 tokidx = 0;
554 return (T_SYMBOL);
556 add_tok(c);
559 yyerror(_("unterminated symbolic name"));
560 return (EOF);
564 get_category(void)
566 return (category);
569 static int
570 consume_token(void)
572 int len = tokidx;
573 int i;
575 tokidx = 0;
576 if (token == NULL)
577 return (T_NULL);
580 * this one is special, because we don't want it to alter the
581 * last_kw field.
583 if (strcmp(token, "...") == 0) {
584 return (T_ELLIPSIS);
587 /* search for reserved words first */
588 for (i = 0; keywords[i].name; i++) {
589 int j;
590 if (strcmp(keywords[i].name, token) != 0) {
591 continue;
594 last_kw = keywords[i].id;
596 /* clear the top level category if we're done with it */
597 if (last_kw == T_END) {
598 category = T_END;
601 /* set the top level category if we're changing */
602 for (j = 0; categories[j]; j++) {
603 if (categories[j] != last_kw)
604 continue;
605 category = last_kw;
608 return (keywords[i].id);
611 /* maybe its a numeric constant? */
612 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
613 char *eptr;
614 yylval.num = strtol(token, &eptr, 10);
615 if (*eptr != 0)
616 yyerror(_("malformed number"));
617 return (T_NUMBER);
621 * A single lone character is treated as a character literal.
622 * To avoid duplication of effort, we stick in the charmap.
624 if (len == 1) {
625 yylval.wc = token[0];
626 return (T_CHAR);
629 /* anything else is treated as a symbolic name */
630 yylval.token = strdup(token);
631 token = NULL;
632 toksz = 0;
633 tokidx = 0;
634 return (T_NAME);
/*
 * Discard input up to and including the next newline.  Reaching the end
 * of the input first is reported through errf().
 */
void
scan_to_eol(void)
{
	int c;

	for (;;) {
		c = scanc();
		if (c == '\n')
			break;
		if (c == EOF) {
			/* end of file without newline! */
			errf(_("missing newline"));
			return;
		}
	}
	assert(c == '\n');
}
652 yylex(void)
654 int c;
656 while ((c = scanc()) != EOF) {
658 /* special handling for quoted string */
659 if (instring) {
660 if (escaped) {
661 escaped = 0;
663 /* if newline, just eat and forget it */
664 if (c == '\n')
665 continue;
667 if (strchr("xXd01234567", c)) {
668 unscanc(c);
669 unscanc(esc_char);
670 return (get_wide());
672 yylval.wc = get_escaped(c);
673 return (T_CHAR);
675 if (c == esc_char) {
676 escaped = 1;
677 continue;
679 switch (c) {
680 case '<':
681 return (get_symbol());
682 case '>':
683 /* oops! should generate syntax error */
684 return (T_GT);
685 case '"':
686 instring = 0;
687 return (T_QUOTE);
688 default:
689 yylval.wc = c;
690 return (T_CHAR);
694 /* escaped characters first */
695 if (escaped) {
696 escaped = 0;
697 if (c == '\n') {
698 /* eat the newline */
699 continue;
701 hadtok = 1;
702 if (tokidx) {
703 /* an escape mid-token is nonsense */
704 return (T_NULL);
707 /* numeric escapes are treated as wide characters */
708 if (strchr("xXd01234567", c)) {
709 unscanc(c);
710 unscanc(esc_char);
711 return (get_wide());
714 add_tok(get_escaped(c));
715 continue;
718 /* if it is the escape charter itself note it */
719 if (c == esc_char) {
720 escaped = 1;
721 continue;
724 /* remove from the comment char to end of line */
725 if (c == com_char) {
726 while (c != '\n') {
727 if ((c = scanc()) == EOF) {
728 /* end of file without newline! */
729 return (EOF);
732 assert(c == '\n');
733 if (!hadtok) {
735 * If there were no tokens on this line,
736 * then just pretend it didn't exist at all.
738 continue;
740 hadtok = 0;
741 return (T_NL);
744 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
746 * These are all token delimiters. If there
747 * is a token already in progress, we need to
748 * process it.
750 unscanc(c);
751 return (consume_token());
754 switch (c) {
755 case '\n':
756 if (!hadtok) {
758 * If the line was completely devoid of tokens,
759 * then just ignore it.
761 continue;
763 /* we're starting a new line, reset the token state */
764 hadtok = 0;
765 return (T_NL);
766 case ',':
767 hadtok = 1;
768 return (T_COMMA);
769 case ';':
770 hadtok = 1;
771 return (T_SEMI);
772 case '(':
773 hadtok = 1;
774 return (T_LPAREN);
775 case ')':
776 hadtok = 1;
777 return (T_RPAREN);
778 case '>':
779 hadtok = 1;
780 return (T_GT);
781 case '<':
782 /* symbol start! */
783 hadtok = 1;
784 return (get_symbol());
785 case ' ':
786 case '\t':
787 /* whitespace, just ignore it */
788 continue;
789 case '"':
790 hadtok = 1;
791 instring = 1;
792 return (T_QUOTE);
793 default:
794 hadtok = 1;
795 add_tok(c);
796 continue;
799 return (EOF);
802 void
803 yyerror(const char *msg)
805 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
806 filename, lineno, msg);
807 exit(4);
810 void
811 errf(const char *fmt, ...)
813 char *msg;
815 va_list va;
816 va_start(va, fmt);
817 (void) vasprintf(&msg, fmt, va);
818 va_end(va);
820 (void) fprintf(stderr, _("%s: %d: error: %s\n"),
821 filename, lineno, msg);
822 free(msg);
823 exit(4);
826 void
827 warn(const char *fmt, ...)
829 char *msg;
831 va_list va;
832 va_start(va, fmt);
833 (void) vasprintf(&msg, fmt, va);
834 va_end(va);
836 (void) fprintf(stderr, _("%s: %d: warning: %s\n"),
837 filename, lineno, msg);
838 free(msg);
839 warnings++;
840 if (!warnok)
841 exit(4);