libc/nls: Sync with FreeBSD.
[dragonfly.git] / usr.bin / localedef / scanner.c
blob1e24e5c52d571f41a5af2f2bcf023dbaf7011e7b
1 /*
2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
3 * Copyright 2015 John Marino <draco@marino.st>
5 * This source code is derived from the illumos localedef command, and
6 * provided under BSD-style license terms by Nexenta Systems, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
32 * This file contains the "scanner", which tokenizes the input files
33 * for localedef for processing by the higher level grammar processor.
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <ctype.h>
39 #include <limits.h>
40 #include <string.h>
41 #include <wchar.h>
42 #include <sys/types.h>
43 #include <assert.h>
44 #include "localedef.h"
45 #include "parser.h"
/*
 * Scanner state shared with the rest of localedef.
 */
/* Character that starts a comment; reset to '#' by reset_scanner(). */
int			com_char = '#';
/* Escape character; reset to '\\' by reset_scanner(). */
int			esc_char = '\\';
/* Not referenced by the scanner itself; presumably set by the charmap parser. */
int			mb_cur_min = 1;
/* Upper bound on the number of bytes get_wide() collects for one character. */
int			mb_cur_max = 1;
/* Line number of the most recently scanned character; used in diagnostics. */
int			lineno = 1;
/* Number of warnings issued through warn(). */
int			warnings = 0;
/* Non-zero when characters are read from stdin instead of `input'. */
int			is_stdin = 1;
/* Current input file; only used while is_stdin is zero. */
FILE			*input;
/* Line number of the next character; scanc() copies it into lineno. */
static int		nextline;
//static FILE		*input = stdin;
/* Name printed in front of diagnostics. */
static const char	*filename = "<stdin>";
/* Non-zero while yylex() is inside a double-quoted string. */
static int		instring = 0;
/* Non-zero when the previous character was the escape character. */
static int		escaped = 0;

/*
 * Token space ... grows on demand.
 */
static char *token = NULL;
static int tokidx;
static int toksz = 0;
/* Non-zero once the current line has produced at least one token. */
static int hadtok = 0;

/*
 * Wide string space ... grows on demand.
 */
static wchar_t *widestr = NULL;
static int wideidx = 0;
static int widesz = 0;

/*
 * The last keyword seen.  This is useful to trigger the special lexer rules
 * for "copy" and also collating symbols and elements.
 */
int	last_kw = 0;
/* Top-level category currently being scanned, or T_END outside of one. */
static int	category = T_END;
/*
 * Reserved words recognised by consume_token().  Each entry pairs a token
 * id with its spelling; the table ends with a NULL name.
 */
static struct token {
	int id;
	const char *name;
} keywords[] = {
	{ T_COM_CHAR,		"comment_char" },
	{ T_ESC_CHAR,		"escape_char" },
	{ T_END,		"END" },
	{ T_COPY,		"copy" },
	{ T_MESSAGES,		"LC_MESSAGES" },
	{ T_YESSTR,		"yesstr" },
	{ T_YESEXPR,		"yesexpr" },
	{ T_NOSTR,		"nostr" },
	{ T_NOEXPR,		"noexpr" },
	{ T_MONETARY,		"LC_MONETARY" },
	{ T_INT_CURR_SYMBOL,	"int_curr_symbol" },
	{ T_CURRENCY_SYMBOL,	"currency_symbol" },
	{ T_MON_DECIMAL_POINT,	"mon_decimal_point" },
	{ T_MON_THOUSANDS_SEP,	"mon_thousands_sep" },
	{ T_POSITIVE_SIGN,	"positive_sign" },
	{ T_NEGATIVE_SIGN,	"negative_sign" },
	{ T_MON_GROUPING,	"mon_grouping" },
	{ T_INT_FRAC_DIGITS,	"int_frac_digits" },
	{ T_FRAC_DIGITS,	"frac_digits" },
	{ T_P_CS_PRECEDES,	"p_cs_precedes" },
	{ T_P_SEP_BY_SPACE,	"p_sep_by_space" },
	{ T_N_CS_PRECEDES,	"n_cs_precedes" },
	{ T_N_SEP_BY_SPACE,	"n_sep_by_space" },
	{ T_P_SIGN_POSN,	"p_sign_posn" },
	{ T_N_SIGN_POSN,	"n_sign_posn" },
	{ T_INT_P_CS_PRECEDES,	"int_p_cs_precedes" },
	{ T_INT_N_CS_PRECEDES,	"int_n_cs_precedes" },
	{ T_INT_P_SEP_BY_SPACE,	"int_p_sep_by_space" },
	{ T_INT_N_SEP_BY_SPACE,	"int_n_sep_by_space" },
	{ T_INT_P_SIGN_POSN,	"int_p_sign_posn" },
	{ T_INT_N_SIGN_POSN,	"int_n_sign_posn" },
	{ T_COLLATE,		"LC_COLLATE" },
	{ T_COLLATING_SYMBOL,	"collating-symbol" },
	{ T_COLLATING_ELEMENT,	"collating-element" },
	{ T_FROM,		"from" },
	{ T_ORDER_START,	"order_start" },
	{ T_ORDER_END,		"order_end" },
	{ T_FORWARD,		"forward" },
	{ T_BACKWARD,		"backward" },
	{ T_POSITION,		"position" },
	{ T_IGNORE,		"IGNORE" },
	{ T_UNDEFINED,		"UNDEFINED" },
	{ T_NUMERIC,		"LC_NUMERIC" },
	{ T_DECIMAL_POINT,	"decimal_point" },
	{ T_THOUSANDS_SEP,	"thousands_sep" },
	{ T_GROUPING,		"grouping" },
	{ T_TIME,		"LC_TIME" },
	{ T_ABDAY,		"abday" },
	{ T_DAY,		"day" },
	{ T_ABMON,		"abmon" },
	{ T_MON,		"mon" },
	{ T_D_T_FMT,		"d_t_fmt" },
	{ T_D_FMT,		"d_fmt" },
	{ T_T_FMT,		"t_fmt" },
	{ T_AM_PM,		"am_pm" },
	{ T_T_FMT_AMPM,		"t_fmt_ampm" },
	{ T_ERA,		"era" },
	{ T_ERA_D_FMT,		"era_d_fmt" },
	{ T_ERA_T_FMT,		"era_t_fmt" },
	{ T_ERA_D_T_FMT,	"era_d_t_fmt" },
	{ T_ALT_DIGITS,		"alt_digits" },
	{ T_CTYPE,		"LC_CTYPE" },
	{ T_ISUPPER,		"upper" },
	{ T_ISLOWER,		"lower" },
	{ T_ISALPHA,		"alpha" },
	{ T_ISDIGIT,		"digit" },
	{ T_ISPUNCT,		"punct" },
	{ T_ISXDIGIT,		"xdigit" },
	{ T_ISSPACE,		"space" },
	{ T_ISPRINT,		"print" },
	{ T_ISGRAPH,		"graph" },
	{ T_ISBLANK,		"blank" },
	{ T_ISCNTRL,		"cntrl" },
	/*
	 * These entries are local additions, and not specified by
	 * TOG.  Note that they are not guaranteed to be accurate for
	 * all locales, and so applications should not depend on them.
	 */
	{ T_ISSPECIAL,		"special" },
	{ T_ISENGLISH,		"english" },
	{ T_ISPHONOGRAM,	"phonogram" },
	{ T_ISIDEOGRAM,		"ideogram" },
	{ T_ISNUMBER,		"number" },
	/*
	 * We have to support this in the grammar, but it would be a
	 * syntax error to define a character as one of these without
	 * also defining it as an alpha or digit.  We ignore it in our
	 * parsing.
	 */
	{ T_ISALNUM,		"alnum" },
	{ T_TOUPPER,		"toupper" },
	{ T_TOLOWER,		"tolower" },

	/*
	 * These are keywords used in the charmap file.  Note that
	 * Solaris originally used angle brackets to wrap some of them,
	 * but we removed that to simplify our parser.  The first of these
	 * items are "global items."
	 */
	{ T_CHARMAP,		"CHARMAP" },
	{ T_WIDTH,		"WIDTH" },

	{ -1, NULL },
};
/*
 * These special words are only used in a charmap file, enclosed in <>.
 * get_symbol() consults this table only while no category is open
 * (category == T_END).  The table ends with a NULL name.
 */
static struct token symwords[] = {
	{ T_COM_CHAR,		"comment_char" },
	{ T_ESC_CHAR,		"escape_char" },
	{ T_CODE_SET,		"code_set_name" },
	{ T_MB_CUR_MAX,		"mb_cur_max" },
	{ T_MB_CUR_MIN,		"mb_cur_min" },
	{ -1, NULL },
};
/*
 * Keywords that open a top-level category.  consume_token() walks this
 * list until it reaches the terminating 0.
 */
static int categories[] = {
	T_CHARMAP,
	T_CTYPE,
	T_COLLATE,
	T_MESSAGES,
	T_MONETARY,
	T_NUMERIC,
	T_TIME,
	T_WIDTH,
	0
};
216 void
217 reset_scanner(const char *fname)
219 if (fname == NULL) {
220 filename = "<stdin>";
221 is_stdin = 1;
222 } else {
223 if (!is_stdin)
224 (void) fclose(input);
225 if ((input = fopen(fname, "r")) == NULL) {
226 perror("fopen");
227 exit(4);
228 } else {
229 is_stdin = 0;
231 filename = fname;
233 com_char = '#';
234 esc_char = '\\';
235 instring = 0;
236 escaped = 0;
237 lineno = 1;
238 nextline = 1;
239 tokidx = 0;
240 wideidx = 0;
/*
 * hex(x): numeric value (0-15) of the hexadecimal digit character x.
 * The result is only meaningful when isxdigit(x) holds.
 * isodigit(x): non-zero when x is one of the characters '0'..'7'.
 *
 * Both macros evaluate x more than once, so x must be free of side
 * effects.  Every use of x is parenthesised so that compound argument
 * expressions keep their meaning.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	    ((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
247 static int
248 scanc(void)
250 int c;
252 if (is_stdin)
253 c = getc(stdin);
254 else
255 c = getc(input);
256 lineno = nextline;
257 if (c == '\n') {
258 nextline++;
260 return (c);
263 static void
264 unscanc(int c)
266 if (c == '\n') {
267 nextline--;
269 if (ungetc(c, is_stdin ? stdin : input) < 0) {
270 yyerror("ungetc failed");
/*
 * Read exactly two hexadecimal digits and return the value they encode.
 * A non-hex character is reported through yyerror(); 0 is returned if
 * yyerror() comes back.
 */
static int
scan_hex_byte(void)
{
	int	value = 0;
	int	i;

	for (i = 0; i < 2; i++) {
		int ch = scanc();

		if (!isxdigit(ch)) {
			yyerror("malformed hex digit");
			return (0);
		}
		value = (value << 4) | hex(ch);
	}
	return (value);
}
/*
 * Read a decimal byte constant of two or three digits and return its
 * value.
 *
 * Fewer than two digits is reported as "malformed decimal digit".  A
 * value above UCHAR_MAX cannot be a byte and is reported as well, rather
 * than being silently truncated by the caller.  The first non-digit after
 * two digits is pushed back, unless it is EOF, which cannot be pushed
 * back.
 */
static int
scan_dec_byte(void)
{
	int	value = 0;
	int	ndigits;

	for (ndigits = 0; ndigits < 3; ndigits++) {
		int ch = scanc();

		if (!isdigit(ch)) {
			if (ndigits < 2) {
				yyerror("malformed decimal digit");
				return (0);
			}
			/* the optional third digit is absent */
			if (ch != EOF)
				unscanc(ch);
			break;
		}
		value = value * 10 + (ch - '0');
	}

	if (value > UCHAR_MAX) {
		yyerror("decimal byte value out of range");
		return (0);
	}
	return (value);
}
/*
 * Read an octal byte constant of two or three digits and return its
 * value.
 *
 * Fewer than two digits is reported as "malformed octal digit".  A value
 * above UCHAR_MAX (for example 777) cannot be a byte and is reported as
 * well, rather than being silently truncated by the caller.  The first
 * non-octal character after two digits is pushed back, unless it is EOF,
 * which cannot be pushed back.
 */
static int
scan_oct_byte(void)
{
	int	value = 0;
	int	ndigits;

	for (ndigits = 0; ndigits < 3; ndigits++) {
		int ch = scanc();

		if (!isodigit(ch)) {
			if (ndigits < 2) {
				yyerror("malformed octal digit");
				return (0);
			}
			/* the optional third digit is absent */
			if (ch != EOF)
				unscanc(ch);
			break;
		}
		value = value * 8 + (ch - '0');
	}

	if (value > UCHAR_MAX) {
		yyerror("octal byte value out of range");
		return (0);
	}
	return (value);
}
354 void
355 add_tok(int c)
357 if ((tokidx + 1) >= toksz) {
358 toksz += 64;
359 if ((token = realloc(token, toksz)) == NULL) {
360 yyerror("out of memory");
361 tokidx = 0;
362 toksz = 0;
363 return;
367 token[tokidx++] = (char)c;
368 token[tokidx] = 0;
370 void
371 add_wcs(wchar_t c)
373 if ((wideidx + 1) >= widesz) {
374 widesz += 64;
375 widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
376 if (widestr == NULL) {
377 yyerror("out of memory");
378 wideidx = 0;
379 widesz = 0;
380 return;
384 widestr[wideidx++] = c;
385 widestr[wideidx] = 0;
388 wchar_t *
389 get_wcs(void)
391 wchar_t *ws = widestr;
392 wideidx = 0;
393 widestr = NULL;
394 widesz = 0;
395 if (ws == NULL) {
396 if ((ws = wcsdup(L"")) == NULL) {
397 yyerror("out of memory");
400 return (ws);
403 static int
404 get_byte(void)
406 int c;
408 if ((c = scanc()) != esc_char) {
409 unscanc(c);
410 return (EOF);
412 c = scanc();
414 switch (c) {
415 case 'd':
416 case 'D':
417 return (scan_dec_byte());
418 case 'x':
419 case 'X':
420 return (scan_hex_byte());
421 case '0':
422 case '1':
423 case '2':
424 case '3':
425 case '4':
426 case '5':
427 case '6':
428 case '7':
429 /* put the character back so we can get it */
430 unscanc(c);
431 return (scan_oct_byte());
432 default:
433 unscanc(c);
434 unscanc(esc_char);
435 return (EOF);
/*
 * Translate the character that follows the escape character.
 *
 * The letters n, r, t, f, v, b and a map to the corresponding control
 * characters; any other character is returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		char	letter;
		char	value;
	} escapes[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' },
	};
	size_t	i;

	for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++) {
		if (c == escapes[i].letter)
			return (escapes[i].value);
	}
	return (c);
}
463 get_wide(void)
465 static char mbs[MB_LEN_MAX + 1] = "";
466 static int mbi = 0;
467 int c;
468 wchar_t wc;
470 if (mb_cur_max >= (int)sizeof (mbs)) {
471 yyerror("max multibyte character size too big");
472 mbi = 0;
473 return (T_NULL);
475 for (;;) {
476 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
478 * end of the byte sequence reached, but no
479 * valid wide decoding. fatal error.
481 mbi = 0;
482 yyerror("not a valid character encoding");
483 return (T_NULL);
485 mbs[mbi++] = c;
486 mbs[mbi] = 0;
488 /* does it decode? */
489 if (to_wide(&wc, mbs) >= 0) {
490 break;
494 mbi = 0;
495 if ((category != T_CHARMAP) && (category != T_WIDTH)) {
496 if (check_charmap(wc) < 0) {
497 yyerror("no symbolic name for character");
498 return (T_NULL);
502 yylval.wc = wc;
503 return (T_CHAR);
507 get_symbol(void)
509 int c;
511 while ((c = scanc()) != EOF) {
512 if (escaped) {
513 escaped = 0;
514 if (c == '\n')
515 continue;
516 add_tok(get_escaped(c));
517 continue;
519 if (c == esc_char) {
520 escaped = 1;
521 continue;
523 if (c == '\n') { /* well that's strange! */
524 yyerror("unterminated symbolic name");
525 continue;
527 if (c == '>') { /* end of symbol */
530 * This restarts the token from the beginning
531 * the next time we scan a character. (This
532 * token is complete.)
535 if (token == NULL) {
536 yyerror("missing symbolic name");
537 return (T_NULL);
539 tokidx = 0;
542 * A few symbols are handled as keywords outside
543 * of the normal categories.
545 if (category == T_END) {
546 int i;
547 for (i = 0; symwords[i].name != 0; i++) {
548 if (strcmp(token, symwords[i].name) ==
549 0) {
550 last_kw = symwords[i].id;
551 return (last_kw);
556 * Contextual rule: Only literal characters are
557 * permitted in CHARMAP. Anywhere else the symbolic
558 * forms are fine.
560 if ((category != T_CHARMAP) &&
561 (lookup_charmap(token, &yylval.wc)) != -1) {
562 return (T_CHAR);
564 if ((yylval.collsym = lookup_collsym(token)) != NULL) {
565 return (T_COLLSYM);
567 if ((yylval.collelem = lookup_collelem(token)) !=
568 NULL) {
569 return (T_COLLELEM);
571 /* its an undefined symbol */
572 yylval.token = strdup(token);
573 token = NULL;
574 toksz = 0;
575 tokidx = 0;
576 return (T_SYMBOL);
578 add_tok(c);
581 yyerror("unterminated symbolic name");
582 return (EOF);
586 get_category(void)
588 return (category);
591 static int
592 consume_token(void)
594 int len = tokidx;
595 int i;
597 tokidx = 0;
598 if (token == NULL)
599 return (T_NULL);
602 * this one is special, because we don't want it to alter the
603 * last_kw field.
605 if (strcmp(token, "...") == 0) {
606 return (T_ELLIPSIS);
609 /* search for reserved words first */
610 for (i = 0; keywords[i].name; i++) {
611 int j;
612 if (strcmp(keywords[i].name, token) != 0) {
613 continue;
616 last_kw = keywords[i].id;
618 /* clear the top level category if we're done with it */
619 if (last_kw == T_END) {
620 category = T_END;
623 /* set the top level category if we're changing */
624 for (j = 0; categories[j]; j++) {
625 if (categories[j] != last_kw)
626 continue;
627 category = last_kw;
630 return (keywords[i].id);
633 /* maybe its a numeric constant? */
634 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
635 char *eptr;
636 yylval.num = strtol(token, &eptr, 10);
637 if (*eptr != 0)
638 yyerror("malformed number");
639 return (T_NUMBER);
643 * A single lone character is treated as a character literal.
644 * To avoid duplication of effort, we stick in the charmap.
646 if (len == 1) {
647 yylval.wc = token[0];
648 return (T_CHAR);
651 /* anything else is treated as a symbolic name */
652 yylval.token = strdup(token);
653 token = NULL;
654 toksz = 0;
655 tokidx = 0;
656 return (T_NAME);
/*
 * Discard input up to and including the next newline.
 * Reaching end of file first is reported through errf().
 */
void
scan_to_eol(void)
{
	int	ch;

	for (;;) {
		ch = scanc();
		if (ch == '\n')
			break;
		if (ch == EOF) {
			/* end of file without newline! */
			errf("missing newline");
			return;
		}
	}
	assert(ch == '\n');
}
674 yylex(void)
676 int c;
678 while ((c = scanc()) != EOF) {
680 /* special handling for quoted string */
681 if (instring) {
682 if (escaped) {
683 escaped = 0;
685 /* if newline, just eat and forget it */
686 if (c == '\n')
687 continue;
689 if (strchr("xXd01234567", c)) {
690 unscanc(c);
691 unscanc(esc_char);
692 return (get_wide());
694 yylval.wc = get_escaped(c);
695 return (T_CHAR);
697 if (c == esc_char) {
698 escaped = 1;
699 continue;
701 switch (c) {
702 case '<':
703 return (get_symbol());
704 case '>':
705 /* oops! should generate syntax error */
706 return (T_GT);
707 case '"':
708 instring = 0;
709 return (T_QUOTE);
710 default:
711 yylval.wc = c;
712 return (T_CHAR);
716 /* escaped characters first */
717 if (escaped) {
718 escaped = 0;
719 if (c == '\n') {
720 /* eat the newline */
721 continue;
723 hadtok = 1;
724 if (tokidx) {
725 /* an escape mid-token is nonsense */
726 return (T_NULL);
729 /* numeric escapes are treated as wide characters */
730 if (strchr("xXd01234567", c)) {
731 unscanc(c);
732 unscanc(esc_char);
733 return (get_wide());
736 add_tok(get_escaped(c));
737 continue;
740 /* if it is the escape charter itself note it */
741 if (c == esc_char) {
742 escaped = 1;
743 continue;
746 /* remove from the comment char to end of line */
747 if (c == com_char) {
748 while (c != '\n') {
749 if ((c = scanc()) == EOF) {
750 /* end of file without newline! */
751 return (EOF);
754 assert(c == '\n');
755 if (!hadtok) {
757 * If there were no tokens on this line,
758 * then just pretend it didn't exist at all.
760 continue;
762 hadtok = 0;
763 return (T_NL);
766 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
768 * These are all token delimiters. If there
769 * is a token already in progress, we need to
770 * process it.
772 unscanc(c);
773 return (consume_token());
776 switch (c) {
777 case '\n':
778 if (!hadtok) {
780 * If the line was completely devoid of tokens,
781 * then just ignore it.
783 continue;
785 /* we're starting a new line, reset the token state */
786 hadtok = 0;
787 return (T_NL);
788 case ',':
789 hadtok = 1;
790 return (T_COMMA);
791 case ';':
792 hadtok = 1;
793 return (T_SEMI);
794 case '(':
795 hadtok = 1;
796 return (T_LPAREN);
797 case ')':
798 hadtok = 1;
799 return (T_RPAREN);
800 case '>':
801 hadtok = 1;
802 return (T_GT);
803 case '<':
804 /* symbol start! */
805 hadtok = 1;
806 return (get_symbol());
807 case ' ':
808 case '\t':
809 /* whitespace, just ignore it */
810 continue;
811 case '"':
812 hadtok = 1;
813 instring = 1;
814 return (T_QUOTE);
815 default:
816 hadtok = 1;
817 add_tok(c);
818 continue;
821 return (EOF);
824 void
825 yyerror(const char *msg)
827 (void) fprintf(stderr, "%s: %d: error: %s\n",
828 filename, lineno, msg);
829 exit(4);
832 __printflike(1, 2) void
833 errf(const char *fmt, ...)
835 char *msg;
837 va_list va;
838 va_start(va, fmt);
839 (void) vasprintf(&msg, fmt, va);
840 va_end(va);
842 (void) fprintf(stderr, "%s: %d: error: %s\n",
843 filename, lineno, msg);
844 free(msg);
845 exit(4);
848 __printflike(1, 2) void
849 warn(const char *fmt, ...)
851 char *msg;
853 va_list va;
854 va_start(va, fmt);
855 (void) vasprintf(&msg, fmt, va);
856 va_end(va);
858 (void) fprintf(stderr, "%s: %d: warning: %s\n",
859 filename, lineno, msg);
860 free(msg);
861 warnings++;
862 if (!warnok)
863 exit(4);