indent(1): Use NULL instead of zero for pointers.
[freebsd-src.git] / usr.bin / localedef / scanner.c
blobfa6208996663d69211c024538f80cc8333347153
1 /*
2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
3 * Copyright 2015 John Marino <draco@marino.st>
5 * This source code is derived from the illumos localedef command, and
6 * provided under BSD-style license terms by Nexenta Systems, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
32 * This file contains the "scanner", which tokenizes the input files
33 * for localedef for processing by the higher level grammar processor.
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <ctype.h>
41 #include <limits.h>
42 #include <string.h>
43 #include <wchar.h>
44 #include <sys/types.h>
45 #include <assert.h>
46 #include "localedef.h"
47 #include "parser.h"
49 int com_char = '#';
50 int esc_char = '\\';
51 int mb_cur_min = 1;
52 int mb_cur_max = 1;
53 int lineno = 1;
54 int warnings = 0;
55 int is_stdin = 1;
56 FILE *input;
57 static int nextline;
58 //static FILE *input = stdin;
59 static const char *filename = "<stdin>";
60 static int instring = 0;
61 static int escaped = 0;
64 * Token space ... grows on demand.
66 static char *token = NULL;
67 static int tokidx;
68 static int toksz = 0;
69 static int hadtok = 0;
72 * Wide string space ... grows on demand.
74 static wchar_t *widestr = NULL;
75 static int wideidx = 0;
76 static int widesz = 0;
79 * The last keyword seen. This is useful to trigger the special lexer rules
80 * for "copy" and also collating symbols and elements.
82 int last_kw = 0;
83 static int category = T_END;
85 static struct token {
86 int id;
87 const char *name;
88 } keywords[] = {
89 { T_COM_CHAR, "comment_char" },
90 { T_ESC_CHAR, "escape_char" },
91 { T_END, "END" },
92 { T_COPY, "copy" },
93 { T_MESSAGES, "LC_MESSAGES" },
94 { T_YESSTR, "yesstr" },
95 { T_YESEXPR, "yesexpr" },
96 { T_NOSTR, "nostr" },
97 { T_NOEXPR, "noexpr" },
98 { T_MONETARY, "LC_MONETARY" },
99 { T_INT_CURR_SYMBOL, "int_curr_symbol" },
100 { T_CURRENCY_SYMBOL, "currency_symbol" },
101 { T_MON_DECIMAL_POINT, "mon_decimal_point" },
102 { T_MON_THOUSANDS_SEP, "mon_thousands_sep" },
103 { T_POSITIVE_SIGN, "positive_sign" },
104 { T_NEGATIVE_SIGN, "negative_sign" },
105 { T_MON_GROUPING, "mon_grouping" },
106 { T_INT_FRAC_DIGITS, "int_frac_digits" },
107 { T_FRAC_DIGITS, "frac_digits" },
108 { T_P_CS_PRECEDES, "p_cs_precedes" },
109 { T_P_SEP_BY_SPACE, "p_sep_by_space" },
110 { T_N_CS_PRECEDES, "n_cs_precedes" },
111 { T_N_SEP_BY_SPACE, "n_sep_by_space" },
112 { T_P_SIGN_POSN, "p_sign_posn" },
113 { T_N_SIGN_POSN, "n_sign_posn" },
114 { T_INT_P_CS_PRECEDES, "int_p_cs_precedes" },
115 { T_INT_N_CS_PRECEDES, "int_n_cs_precedes" },
116 { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" },
117 { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" },
118 { T_INT_P_SIGN_POSN, "int_p_sign_posn" },
119 { T_INT_N_SIGN_POSN, "int_n_sign_posn" },
120 { T_COLLATE, "LC_COLLATE" },
121 { T_COLLATING_SYMBOL, "collating-symbol" },
122 { T_COLLATING_ELEMENT, "collating-element" },
123 { T_FROM, "from" },
124 { T_ORDER_START, "order_start" },
125 { T_ORDER_END, "order_end" },
126 { T_FORWARD, "forward" },
127 { T_BACKWARD, "backward" },
128 { T_POSITION, "position" },
129 { T_IGNORE, "IGNORE" },
130 { T_UNDEFINED, "UNDEFINED" },
131 { T_NUMERIC, "LC_NUMERIC" },
132 { T_DECIMAL_POINT, "decimal_point" },
133 { T_THOUSANDS_SEP, "thousands_sep" },
134 { T_GROUPING, "grouping" },
135 { T_TIME, "LC_TIME" },
136 { T_ABDAY, "abday" },
137 { T_DAY, "day" },
138 { T_ABMON, "abmon" },
139 { T_MON, "mon" },
140 { T_D_T_FMT, "d_t_fmt" },
141 { T_D_FMT, "d_fmt" },
142 { T_T_FMT, "t_fmt" },
143 { T_AM_PM, "am_pm" },
144 { T_T_FMT_AMPM, "t_fmt_ampm" },
145 { T_ERA, "era" },
146 { T_ERA_D_FMT, "era_d_fmt" },
147 { T_ERA_T_FMT, "era_t_fmt" },
148 { T_ERA_D_T_FMT, "era_d_t_fmt" },
149 { T_ALT_DIGITS, "alt_digits" },
150 { T_CTYPE, "LC_CTYPE" },
151 { T_ISUPPER, "upper" },
152 { T_ISLOWER, "lower" },
153 { T_ISALPHA, "alpha" },
154 { T_ISDIGIT, "digit" },
155 { T_ISPUNCT, "punct" },
156 { T_ISXDIGIT, "xdigit" },
157 { T_ISSPACE, "space" },
158 { T_ISPRINT, "print" },
159 { T_ISGRAPH, "graph" },
160 { T_ISBLANK, "blank" },
161 { T_ISCNTRL, "cntrl" },
163 * These entries are local additions, and not specified by
164 * TOG. Note that they are not guaranteed to be accurate for
165 * all locales, and so applications should not depend on them.
167 { T_ISSPECIAL, "special" },
168 { T_ISENGLISH, "english" },
169 { T_ISPHONOGRAM, "phonogram" },
170 { T_ISIDEOGRAM, "ideogram" },
171 { T_ISNUMBER, "number" },
173 * We have to support this in the grammar, but it would be a
174 * syntax error to define a character as one of these without
175 * also defining it as an alpha or digit. We ignore it in our
176 * parsing.
178 { T_ISALNUM, "alnum" },
179 { T_TOUPPER, "toupper" },
180 { T_TOLOWER, "tolower" },
183 * These are keywords used in the charmap file. Note that
184 * Solaris originally used angle brackets to wrap some of them,
185 * but we removed that to simplify our parser. The first of these
186 * items are "global items."
188 { T_CHARMAP, "CHARMAP" },
189 { T_WIDTH, "WIDTH" },
191 { -1, NULL },
195 * These special words are only used in a charmap file, enclosed in <>.
197 static struct token symwords[] = {
198 { T_COM_CHAR, "comment_char" },
199 { T_ESC_CHAR, "escape_char" },
200 { T_CODE_SET, "code_set_name" },
201 { T_MB_CUR_MAX, "mb_cur_max" },
202 { T_MB_CUR_MIN, "mb_cur_min" },
203 { -1, NULL },
206 static int categories[] = {
207 T_CHARMAP,
208 T_CTYPE,
209 T_COLLATE,
210 T_MESSAGES,
211 T_MONETARY,
212 T_NUMERIC,
213 T_TIME,
214 T_WIDTH,
218 void
219 reset_scanner(const char *fname)
221 if (fname == NULL) {
222 filename = "<stdin>";
223 is_stdin = 1;
224 } else {
225 if (!is_stdin)
226 (void) fclose(input);
227 if ((input = fopen(fname, "r")) == NULL) {
228 perror("fopen");
229 exit(4);
230 } else {
231 is_stdin = 0;
233 filename = fname;
235 com_char = '#';
236 esc_char = '\\';
237 instring = 0;
238 escaped = 0;
239 lineno = 1;
240 nextline = 1;
241 tokidx = 0;
242 wideidx = 0;
/*
 * hex(x): numeric value of the hexadecimal digit character x.  The result
 * is only meaningful when isxdigit(x) is true.
 * isodigit(x): nonzero when x is an octal digit character.
 *
 * Every use of the argument is parenthesized so that compound expressions
 * expand correctly.  Both macros still evaluate x more than once; do not
 * pass an argument with side effects.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
249 static int
250 scanc(void)
252 int c;
254 if (is_stdin)
255 c = getc(stdin);
256 else
257 c = getc(input);
258 lineno = nextline;
259 if (c == '\n') {
260 nextline++;
262 return (c);
265 static void
266 unscanc(int c)
268 if (c == '\n') {
269 nextline--;
271 if (ungetc(c, is_stdin ? stdin : input) < 0) {
272 yyerror("ungetc failed");
/*
 * Read exactly two hexadecimal digits from the input and return the value
 * they encode.  A non-hex character is reported through yyerror() and the
 * function returns 0.
 */
static int
scan_hex_byte(void)
{
	int digit;
	int value = 0;
	int i;

	for (i = 0; i < 2; i++) {
		digit = scanc();
		if (!isxdigit(digit)) {
			yyerror("malformed hex digit");
			return (0);
		}
		value = (value << 4) | hex(digit);
	}
	return (value);
}
/*
 * Read a decimal byte constant: two mandatory decimal digits followed by
 * an optional third.  A character after the second digit that is not a
 * digit is pushed back.
 *
 * Returns the decoded value.  Malformed input, or a value that does not
 * fit in one byte (greater than UCHAR_MAX), is reported through yyerror()
 * and the function returns 0.
 */
static int
scan_dec_byte(void)
{
	int digit;
	int value;

	digit = scanc();
	if (!isdigit(digit)) {
		yyerror("malformed decimal digit");
		return (0);
	}
	value = digit - '0';

	digit = scanc();
	if (!isdigit(digit)) {
		yyerror("malformed decimal digit");
		return (0);
	}
	value = value * 10 + (digit - '0');

	/* The third digit is optional. */
	digit = scanc();
	if (isdigit(digit)) {
		value = value * 10 + (digit - '0');
	} else {
		unscanc(digit);
	}

	/* Three digits can encode up to 999; the caller stores one byte. */
	if (value > UCHAR_MAX) {
		yyerror("decimal byte value out of range");
		return (0);
	}
	return (value);
}
/*
 * Read an octal byte constant: two mandatory octal digits followed by an
 * optional third.  A character after the second digit that is not an
 * octal digit is pushed back.
 *
 * Returns the decoded value.  Malformed input, or a value that does not
 * fit in one byte (greater than UCHAR_MAX), is reported through yyerror()
 * and the function returns 0.
 */
static int
scan_oct_byte(void)
{
	int digit;
	int value;

	digit = scanc();
	if (!isodigit(digit)) {
		yyerror("malformed octal digit");
		return (0);
	}
	value = digit - '0';

	digit = scanc();
	if (!isodigit(digit)) {
		yyerror("malformed octal digit");
		return (0);
	}
	value = value * 8 + (digit - '0');

	/* The third digit is optional. */
	digit = scanc();
	if (isodigit(digit)) {
		value = value * 8 + (digit - '0');
	} else {
		unscanc(digit);
	}

	/* Three octal digits can encode up to 0777; the caller stores one byte. */
	if (value > UCHAR_MAX) {
		yyerror("octal byte value out of range");
		return (0);
	}
	return (value);
}
356 void
357 add_tok(int c)
359 if ((tokidx + 1) >= toksz) {
360 toksz += 64;
361 if ((token = realloc(token, toksz)) == NULL) {
362 yyerror("out of memory");
363 tokidx = 0;
364 toksz = 0;
365 return;
369 token[tokidx++] = (char)c;
370 token[tokidx] = 0;
372 void
373 add_wcs(wchar_t c)
375 if ((wideidx + 1) >= widesz) {
376 widesz += 64;
377 widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
378 if (widestr == NULL) {
379 yyerror("out of memory");
380 wideidx = 0;
381 widesz = 0;
382 return;
386 widestr[wideidx++] = c;
387 widestr[wideidx] = 0;
390 wchar_t *
391 get_wcs(void)
393 wchar_t *ws = widestr;
394 wideidx = 0;
395 widestr = NULL;
396 widesz = 0;
397 if (ws == NULL) {
398 if ((ws = wcsdup(L"")) == NULL) {
399 yyerror("out of memory");
402 return (ws);
405 static int
406 get_byte(void)
408 int c;
410 if ((c = scanc()) != esc_char) {
411 unscanc(c);
412 return (EOF);
414 c = scanc();
416 switch (c) {
417 case 'd':
418 case 'D':
419 return (scan_dec_byte());
420 case 'x':
421 case 'X':
422 return (scan_hex_byte());
423 case '0':
424 case '1':
425 case '2':
426 case '3':
427 case '4':
428 case '5':
429 case '6':
430 case '7':
431 /* put the character back so we can get it */
432 unscanc(c);
433 return (scan_oct_byte());
434 default:
435 unscanc(c);
436 unscanc(esc_char);
437 return (EOF);
/*
 * Translate the character that follows an escape character.
 *
 * c: the character after the escape.
 *
 * Returns the control character for the letters n, r, t, f, v, b and a;
 * any other character is returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		char	letter;
		char	value;
	} escapes[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' },
	};
	size_t i;

	for (i = 0; i < sizeof (escapes) / sizeof (escapes[0]); i++) {
		if (c == escapes[i].letter)
			return (escapes[i].value);
	}
	return (c);
}
465 get_wide(void)
467 static char mbs[MB_LEN_MAX + 1] = "";
468 static int mbi = 0;
469 int c;
470 wchar_t wc;
472 if (mb_cur_max >= (int)sizeof (mbs)) {
473 yyerror("max multibyte character size too big");
474 mbi = 0;
475 return (T_NULL);
477 for (;;) {
478 if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
480 * end of the byte sequence reached, but no
481 * valid wide decoding. fatal error.
483 mbi = 0;
484 yyerror("not a valid character encoding");
485 return (T_NULL);
487 mbs[mbi++] = c;
488 mbs[mbi] = 0;
490 /* does it decode? */
491 if (to_wide(&wc, mbs) >= 0) {
492 break;
496 mbi = 0;
497 if ((category != T_CHARMAP) && (category != T_WIDTH)) {
498 if (check_charmap(wc) < 0) {
499 yyerror("no symbolic name for character");
500 return (T_NULL);
504 yylval.wc = wc;
505 return (T_CHAR);
509 get_symbol(void)
511 int c;
513 while ((c = scanc()) != EOF) {
514 if (escaped) {
515 escaped = 0;
516 if (c == '\n')
517 continue;
518 add_tok(get_escaped(c));
519 continue;
521 if (c == esc_char) {
522 escaped = 1;
523 continue;
525 if (c == '\n') { /* well that's strange! */
526 yyerror("unterminated symbolic name");
527 continue;
529 if (c == '>') { /* end of symbol */
532 * This restarts the token from the beginning
533 * the next time we scan a character. (This
534 * token is complete.)
537 if (token == NULL) {
538 yyerror("missing symbolic name");
539 return (T_NULL);
541 tokidx = 0;
544 * A few symbols are handled as keywords outside
545 * of the normal categories.
547 if (category == T_END) {
548 int i;
549 for (i = 0; symwords[i].name != 0; i++) {
550 if (strcmp(token, symwords[i].name) ==
551 0) {
552 last_kw = symwords[i].id;
553 return (last_kw);
558 * Contextual rule: Only literal characters are
559 * permitted in CHARMAP. Anywhere else the symbolic
560 * forms are fine.
562 if ((category != T_CHARMAP) &&
563 (lookup_charmap(token, &yylval.wc)) != -1) {
564 return (T_CHAR);
566 if ((yylval.collsym = lookup_collsym(token)) != NULL) {
567 return (T_COLLSYM);
569 if ((yylval.collelem = lookup_collelem(token)) !=
570 NULL) {
571 return (T_COLLELEM);
573 /* its an undefined symbol */
574 yylval.token = strdup(token);
575 token = NULL;
576 toksz = 0;
577 tokidx = 0;
578 return (T_SYMBOL);
580 add_tok(c);
583 yyerror("unterminated symbolic name");
584 return (EOF);
588 get_category(void)
590 return (category);
593 static int
594 consume_token(void)
596 int len = tokidx;
597 int i;
599 tokidx = 0;
600 if (token == NULL)
601 return (T_NULL);
604 * this one is special, because we don't want it to alter the
605 * last_kw field.
607 if (strcmp(token, "...") == 0) {
608 return (T_ELLIPSIS);
611 /* search for reserved words first */
612 for (i = 0; keywords[i].name; i++) {
613 int j;
614 if (strcmp(keywords[i].name, token) != 0) {
615 continue;
618 last_kw = keywords[i].id;
620 /* clear the top level category if we're done with it */
621 if (last_kw == T_END) {
622 category = T_END;
625 /* set the top level category if we're changing */
626 for (j = 0; categories[j]; j++) {
627 if (categories[j] != last_kw)
628 continue;
629 category = last_kw;
632 return (keywords[i].id);
635 /* maybe its a numeric constant? */
636 if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
637 char *eptr;
638 yylval.num = strtol(token, &eptr, 10);
639 if (*eptr != 0)
640 yyerror("malformed number");
641 return (T_NUMBER);
645 * A single lone character is treated as a character literal.
646 * To avoid duplication of effort, we stick in the charmap.
648 if (len == 1) {
649 yylval.wc = token[0];
650 return (T_CHAR);
653 /* anything else is treated as a symbolic name */
654 yylval.token = strdup(token);
655 token = NULL;
656 toksz = 0;
657 tokidx = 0;
658 return (T_NAME);
/*
 * Discard input up to and including the next newline.  Reaching the end
 * of the input first is reported through errf().
 */
void
scan_to_eol(void)
{
	int ch;

	for (;;) {
		ch = scanc();
		if (ch == '\n')
			break;
		if (ch == EOF) {
			/* end of file without newline! */
			errf("missing newline");
			return;
		}
	}
	assert(ch == '\n');
}
676 yylex(void)
678 int c;
680 while ((c = scanc()) != EOF) {
682 /* special handling for quoted string */
683 if (instring) {
684 if (escaped) {
685 escaped = 0;
687 /* if newline, just eat and forget it */
688 if (c == '\n')
689 continue;
691 if (strchr("xXd01234567", c)) {
692 unscanc(c);
693 unscanc(esc_char);
694 return (get_wide());
696 yylval.wc = get_escaped(c);
697 return (T_CHAR);
699 if (c == esc_char) {
700 escaped = 1;
701 continue;
703 switch (c) {
704 case '<':
705 return (get_symbol());
706 case '>':
707 /* oops! should generate syntax error */
708 return (T_GT);
709 case '"':
710 instring = 0;
711 return (T_QUOTE);
712 default:
713 yylval.wc = c;
714 return (T_CHAR);
718 /* escaped characters first */
719 if (escaped) {
720 escaped = 0;
721 if (c == '\n') {
722 /* eat the newline */
723 continue;
725 hadtok = 1;
726 if (tokidx) {
727 /* an escape mid-token is nonsense */
728 return (T_NULL);
731 /* numeric escapes are treated as wide characters */
732 if (strchr("xXd01234567", c)) {
733 unscanc(c);
734 unscanc(esc_char);
735 return (get_wide());
738 add_tok(get_escaped(c));
739 continue;
742 /* if it is the escape charter itself note it */
743 if (c == esc_char) {
744 escaped = 1;
745 continue;
748 /* remove from the comment char to end of line */
749 if (c == com_char) {
750 while (c != '\n') {
751 if ((c = scanc()) == EOF) {
752 /* end of file without newline! */
753 return (EOF);
756 assert(c == '\n');
757 if (!hadtok) {
759 * If there were no tokens on this line,
760 * then just pretend it didn't exist at all.
762 continue;
764 hadtok = 0;
765 return (T_NL);
768 if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
770 * These are all token delimiters. If there
771 * is a token already in progress, we need to
772 * process it.
774 unscanc(c);
775 return (consume_token());
778 switch (c) {
779 case '\n':
780 if (!hadtok) {
782 * If the line was completely devoid of tokens,
783 * then just ignore it.
785 continue;
787 /* we're starting a new line, reset the token state */
788 hadtok = 0;
789 return (T_NL);
790 case ',':
791 hadtok = 1;
792 return (T_COMMA);
793 case ';':
794 hadtok = 1;
795 return (T_SEMI);
796 case '(':
797 hadtok = 1;
798 return (T_LPAREN);
799 case ')':
800 hadtok = 1;
801 return (T_RPAREN);
802 case '>':
803 hadtok = 1;
804 return (T_GT);
805 case '<':
806 /* symbol start! */
807 hadtok = 1;
808 return (get_symbol());
809 case ' ':
810 case '\t':
811 /* whitespace, just ignore it */
812 continue;
813 case '"':
814 hadtok = 1;
815 instring = 1;
816 return (T_QUOTE);
817 default:
818 hadtok = 1;
819 add_tok(c);
820 continue;
823 return (EOF);
826 void
827 yyerror(const char *msg)
829 (void) fprintf(stderr, "%s: %d: error: %s\n",
830 filename, lineno, msg);
831 exit(4);
834 void
835 errf(const char *fmt, ...)
837 char *msg;
839 va_list va;
840 va_start(va, fmt);
841 (void) vasprintf(&msg, fmt, va);
842 va_end(va);
844 (void) fprintf(stderr, "%s: %d: error: %s\n",
845 filename, lineno, msg);
846 free(msg);
847 exit(4);
850 void
851 warn(const char *fmt, ...)
853 char *msg;
855 va_list va;
856 va_start(va, fmt);
857 (void) vasprintf(&msg, fmt, va);
858 va_end(va);
860 (void) fprintf(stderr, "%s: %d: warning: %s\n",
861 filename, lineno, msg);
862 free(msg);
863 warnings++;
864 if (!warnok)
865 exit(4);