2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
14 * Copyright 2013 DEY Storage Systems, Inc.
18 * This file contains the "scanner", which tokenizes the input files
19 * for localedef for processing by the higher level grammar processor.
28 #include <sys/types.h>
30 #include "localedef.h"
41 static const char *filename
;
42 static int instring
= 0;
43 static int escaped
= 0;
46 * Token space ... grows on demand.
48 static char *token
= NULL
;
51 static int hadtok
= 0;
54 * Wide string space ... grows on demand.
56 static wchar_t *widestr
= NULL
;
57 static int wideidx
= 0;
58 static int widesz
= 0;
61 * The last keyword seen. This is useful to trigger the special lexer rules
62 * for "copy" and also collating symbols and elements.
65 static int category
= T_END
;
71 { T_COM_CHAR
, "comment_char" },
72 { T_ESC_CHAR
, "escape_char" },
75 { T_MESSAGES
, "LC_MESSAGES" },
76 { T_YESSTR
, "yesstr" },
77 { T_YESEXPR
, "yesexpr" },
79 { T_NOEXPR
, "noexpr" },
80 { T_MONETARY
, "LC_MONETARY" },
81 { T_INT_CURR_SYMBOL
, "int_curr_symbol" },
82 { T_CURRENCY_SYMBOL
, "currency_symbol" },
83 { T_MON_DECIMAL_POINT
, "mon_decimal_point" },
84 { T_MON_THOUSANDS_SEP
, "mon_thousands_sep" },
85 { T_POSITIVE_SIGN
, "positive_sign" },
86 { T_NEGATIVE_SIGN
, "negative_sign" },
87 { T_MON_GROUPING
, "mon_grouping" },
88 { T_INT_FRAC_DIGITS
, "int_frac_digits" },
89 { T_FRAC_DIGITS
, "frac_digits" },
90 { T_P_CS_PRECEDES
, "p_cs_precedes" },
91 { T_P_SEP_BY_SPACE
, "p_sep_by_space" },
92 { T_N_CS_PRECEDES
, "n_cs_precedes" },
93 { T_N_SEP_BY_SPACE
, "n_sep_by_space" },
94 { T_P_SIGN_POSN
, "p_sign_posn" },
95 { T_N_SIGN_POSN
, "n_sign_posn" },
96 { T_INT_P_CS_PRECEDES
, "int_p_cs_precedes" },
97 { T_INT_N_CS_PRECEDES
, "int_n_cs_precedes" },
98 { T_INT_P_SEP_BY_SPACE
, "int_p_sep_by_space" },
99 { T_INT_N_SEP_BY_SPACE
, "int_n_sep_by_space" },
100 { T_INT_P_SIGN_POSN
, "int_p_sign_posn" },
101 { T_INT_N_SIGN_POSN
, "int_n_sign_posn" },
102 { T_COLLATE
, "LC_COLLATE" },
103 { T_COLLATING_SYMBOL
, "collating-symbol" },
104 { T_COLLATING_ELEMENT
, "collating-element" },
106 { T_ORDER_START
, "order_start" },
107 { T_ORDER_END
, "order_end" },
108 { T_FORWARD
, "forward" },
109 { T_BACKWARD
, "backward" },
110 { T_POSITION
, "position" },
111 { T_IGNORE
, "IGNORE" },
112 { T_UNDEFINED
, "UNDEFINED" },
113 { T_NUMERIC
, "LC_NUMERIC" },
114 { T_DECIMAL_POINT
, "decimal_point" },
115 { T_THOUSANDS_SEP
, "thousands_sep" },
116 { T_GROUPING
, "grouping" },
117 { T_TIME
, "LC_TIME" },
118 { T_ABDAY
, "abday" },
120 { T_ABMON
, "abmon" },
122 { T_D_T_FMT
, "d_t_fmt" },
123 { T_D_FMT
, "d_fmt" },
124 { T_T_FMT
, "t_fmt" },
125 { T_AM_PM
, "am_pm" },
126 { T_T_FMT_AMPM
, "t_fmt_ampm" },
128 { T_ERA_D_FMT
, "era_d_fmt" },
129 { T_ERA_T_FMT
, "era_t_fmt" },
130 { T_ERA_D_T_FMT
, "era_d_t_fmt" },
131 { T_ALT_DIGITS
, "alt_digits" },
132 { T_CTYPE
, "LC_CTYPE" },
133 { T_ISUPPER
, "upper" },
134 { T_ISLOWER
, "lower" },
135 { T_ISALPHA
, "alpha" },
136 { T_ISDIGIT
, "digit" },
137 { T_ISPUNCT
, "punct" },
138 { T_ISXDIGIT
, "xdigit" },
139 { T_ISSPACE
, "space" },
140 { T_ISPRINT
, "print" },
141 { T_ISGRAPH
, "graph" },
142 { T_ISBLANK
, "blank" },
143 { T_ISCNTRL
, "cntrl" },
145 * These entries are local additions, and not specified by
146 * TOG. Note that they are not guaranteed to be accurate for
147 * all locales, and so applications should not depend on them.
149 { T_ISSPECIAL
, "special" },
150 { T_ISENGLISH
, "english" },
151 { T_ISPHONOGRAM
, "phonogram" },
152 { T_ISIDEOGRAM
, "ideogram" },
153 { T_ISNUMBER
, "number" },
155 * We have to support this in the grammar, but it would be a
156 * syntax error to define a character as one of these without
157 * also defining it as an alpha or digit. We ignore it in our
160 { T_ISALNUM
, "alnum" },
161 { T_TOUPPER
, "toupper" },
162 { T_TOLOWER
, "tolower" },
165 * These are keywords used in the charmap file. Note that
166 * Solaris originally used angle brackets to wrap some of them,
167 * but we removed that to simplify our parser. The first of these
168 * items are "global items."
170 { T_CHARMAP
, "CHARMAP" },
171 { T_WIDTH
, "WIDTH" },
177 * These special words are only used in a charmap file, enclosed in <>.
179 static struct token symwords
[] = {
180 { T_COM_CHAR
, "comment_char" },
181 { T_ESC_CHAR
, "escape_char" },
182 { T_CODE_SET
, "code_set_name" },
183 { T_MB_CUR_MAX
, "mb_cur_max" },
184 { T_MB_CUR_MIN
, "mb_cur_min" },
188 static int categories
[] = {
201 reset_scanner(const char *fname
)
204 filename
= "<stdin>";
208 (void) fclose(input
);
209 if ((input
= fopen(fname
, "r")) == NULL
) {
226 (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10))
/*
 * True (non-zero) when x is an octal digit, i.e. in the range '0'..'7'.
 * Each use of the argument is parenthesized so that compound expressions
 * (conditionals, bitwise operations, ...) expand with the intended
 * precedence.  Note that x is evaluated twice, so do not pass an
 * expression with side effects.
 */
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
248 if (ungetc(c
, input
) < 0) {
249 yyerror(_("ungetc failed"));
261 yyerror(_("malformed hex digit"));
266 yyerror(_("malformed hex digit"));
269 v
= ((hex(c1
) << 4) | hex(c2
));
281 yyerror(_("malformed decimal digit"));
287 yyerror(_("malformed decimal digit"));
312 yyerror(_("malformed octal digit"));
318 yyerror(_("malformed octal digit"));
336 if ((tokidx
+ 1) >= toksz
) {
338 if ((token
= realloc(token
, toksz
)) == NULL
) {
339 yyerror(_("out of memory"));
346 token
[tokidx
++] = (char)c
;
352 if ((wideidx
+ 1) >= widesz
) {
354 widestr
= realloc(widestr
, (widesz
* sizeof (wchar_t)));
355 if (widestr
== NULL
) {
356 yyerror(_("out of memory"));
363 widestr
[wideidx
++] = c
;
364 widestr
[wideidx
] = 0;
370 wchar_t *ws
= widestr
;
375 if ((ws
= wsdup(L
"")) == NULL
) {
376 yyerror(_("out of memory"));
387 if ((c
= scanc()) != esc_char
) {
396 return (scan_dec_byte());
399 return (scan_hex_byte());
408 /* put the character back so we can get it */
410 return (scan_oct_byte());
444 static char mbs
[MB_LEN_MAX
+ 1] = "";
449 if (mb_cur_max
>= sizeof (mbs
)) {
450 yyerror(_("max multibyte character size too big"));
455 if ((mbi
== mb_cur_max
) || ((c
= get_byte()) == EOF
)) {
457 * end of the byte sequence reached, but no
458 * valid wide decoding. fatal error.
461 yyerror(_("not a valid character encoding"));
467 /* does it decode? */
468 if (to_wide(&wc
, mbs
) >= 0) {
474 if ((category
!= T_CHARMAP
) && (category
!= T_WIDTH
)) {
475 if (check_charmap(wc
) < 0) {
476 yyerror(_("no symbolic name for character"));
490 while ((c
= scanc()) != EOF
) {
495 add_tok(get_escaped(c
));
502 if (c
== '\n') { /* well that's strange! */
503 yyerror(_("unterminated symbolic name"));
506 if (c
== '>') { /* end of symbol */
509 * This restarts the token from the beginning
510 * the next time we scan a character. (This
511 * token is complete.)
515 yyerror(_("missing symbolic name"));
521 * A few symbols are handled as keywords outside
522 * of the normal categories.
524 if (category
== T_END
) {
526 for (i
= 0; symwords
[i
].name
!= 0; i
++) {
527 if (strcmp(token
, symwords
[i
].name
) ==
529 last_kw
= symwords
[i
].id
;
535 * Contextual rule: Only literal characters are
536 * permitted in CHARMAP. Anywhere else the symbolic
539 if ((category
!= T_CHARMAP
) &&
540 (lookup_charmap(token
, &yylval
.wc
)) != -1) {
543 if ((yylval
.collsym
= lookup_collsym(token
)) != NULL
) {
546 if ((yylval
.collelem
= lookup_collelem(token
)) !=
550 /* it's an undefined symbol */
551 yylval
.token
= strdup(token
);
560 yyerror(_("unterminated symbolic name"));
581 * this one is special, because we don't want it to alter the
584 if (strcmp(token
, "...") == 0) {
588 /* search for reserved words first */
589 for (i
= 0; keywords
[i
].name
; i
++) {
591 if (strcmp(keywords
[i
].name
, token
) != 0) {
595 last_kw
= keywords
[i
].id
;
597 /* clear the top level category if we're done with it */
598 if (last_kw
== T_END
) {
602 /* set the top level category if we're changing */
603 for (j
= 0; categories
[j
]; j
++) {
604 if (categories
[j
] != last_kw
)
609 return (keywords
[i
].id
);
612 /* maybe it's a numeric constant? */
613 if (isdigit(*token
) || (*token
== '-' && isdigit(token
[1]))) {
615 yylval
.num
= strtol(token
, &eptr
, 10);
617 yyerror(_("malformed number"));
622 * A single lone character is treated as a character literal.
623 * To avoid duplication of effort, we stick in the charmap.
626 yylval
.wc
= token
[0];
630 /* anything else is treated as a symbolic name */
631 yylval
.token
= strdup(token
);
642 while ((c
= scanc()) != '\n') {
644 /* end of file without newline! */
645 errf(_("missing newline"));
658 filename
= "<stdin>";
662 while ((c
= scanc()) != EOF
) {
664 /* special handling for quoted string */
669 /* if newline, just eat and forget it */
673 if (strchr("xXd01234567", c
)) {
678 yylval
.wc
= get_escaped(c
);
687 return (get_symbol());
689 /* oops! should generate syntax error */
700 /* escaped characters first */
704 /* eat the newline */
709 /* an escape mid-token is nonsense */
713 /* numeric escapes are treated as wide characters */
714 if (strchr("xXd01234567", c
)) {
720 add_tok(get_escaped(c
));
724 /* if it is the escape character itself, note it */
730 /* remove from the comment char to end of line */
733 if ((c
= scanc()) == EOF
) {
734 /* end of file without newline! */
741 * If there were no tokens on this line,
742 * then just pretend it didn't exist at all.
750 if (strchr(" \t\n;()<>,\"", c
) && (tokidx
!= 0)) {
752 * These are all token delimiters. If there
753 * is a token already in progress, we need to
757 return (consume_token());
764 * If the line was completely devoid of tokens,
765 * then just ignore it.
769 /* we're starting a new line, reset the token state */
790 return (get_symbol());
793 /* whitespace, just ignore it */
809 yyerror(const char *msg
)
811 (void) fprintf(stderr
, _("%s: %d: error: %s\n"),
812 filename
, lineno
, msg
);
817 errf(const char *fmt
, ...)
823 (void) vasprintf(&msg
, fmt
, va
);
826 (void) fprintf(stderr
, _("%s: %d: error: %s\n"),
827 filename
, lineno
, msg
);
833 warn(const char *fmt
, ...)
839 (void) vasprintf(&msg
, fmt
, va
);
842 (void) fprintf(stderr
, _("%s: %d: warning: %s\n"),
843 filename
, lineno
, msg
);