treewide: Replace <name>_cnt by n_<name>s and <name>_cap by allocated_<name>.
[pspp.git] / src / data / identifier.c
blobd9d9b2a6444c25a8f76fec3df66089e1306e6b40
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/*
   This file is concerned with the definition of the PSPP syntax, NOT the
   action of scanning or parsing code.
*/
22 #include <config.h>
24 #include "data/identifier.h"
26 #include <string.h>
27 #include <unistr.h>
28 #include <unictype.h>
30 #include "libpspp/assertion.h"
31 #include "libpspp/cast.h"
33 #include "gl/c-ctype.h"
35 #include "gettext.h"
36 #define _(msgid) gettext (msgid)
38 /* Tokens. */
40 /* Returns TYPE as a string, e.g. "ID" for T_ID. */
41 const char *
42 token_type_to_name (enum token_type type)
44 switch (type)
46 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
47 TOKEN_TYPES
48 #undef TOKEN_TYPE
49 default:
50 return "unknown token type";
54 /* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
55 as a statically allocated constant string. This function returns NULL for
56 tokens that don't have any fixed string representation, such as identifier
57 and number tokens. */
58 const char *
59 token_type_to_string (enum token_type token)
61 switch (token)
63 case T_ID:
64 case T_POS_NUM:
65 case T_NEG_NUM:
66 case T_STRING:
67 case T_MACRO_ID:
68 case T_MACRO_PUNCT:
69 case T_STOP:
70 return NULL;
72 case T_ENDCMD:
73 return ".";
75 case T_PLUS:
76 return "+";
78 case T_DASH:
79 return "-";
81 case T_ASTERISK:
82 return "*";
84 case T_SLASH:
85 return "/";
87 case T_EQUALS:
88 return "=";
90 case T_LPAREN:
91 return "(";
93 case T_RPAREN:
94 return ")";
96 case T_LBRACK:
97 return "[";
99 case T_RBRACK:
100 return "]";
102 case T_LCURLY:
103 return "{";
105 case T_RCURLY:
106 return "}";
108 case T_COMMA:
109 return ",";
111 case T_SEMICOLON:
112 return ";";
114 case T_COLON:
115 return ":";
117 case T_AND:
118 return "AND";
120 case T_OR:
121 return "OR";
123 case T_NOT:
124 return "NOT";
126 case T_EQ:
127 return "EQ";
129 case T_GE:
130 return ">=";
132 case T_GT:
133 return ">";
135 case T_LE:
136 return "<=";
138 case T_LT:
139 return "<";
141 case T_NE:
142 return "~=";
144 case T_ALL:
145 return "ALL";
147 case T_BY:
148 return "BY";
150 case T_TO:
151 return "TO";
153 case T_WITH:
154 return "WITH";
156 case T_EXP:
157 return "**";
160 NOT_REACHED ();
163 /* Recognizing identifiers. */
/* Returns true if C is an ASCII character that may begin an identifier: a
   letter (as judged by c_isalpha()) or one of '@', '#', '$'. */
static bool
is_ascii_id1 (unsigned char c)
{
  switch (c)
    {
    case '@':
    case '#':
    case '$':
      return true;

    default:
      return c_isalpha (c);
    }
}
/* Returns true if C is an ASCII character that may appear in an identifier
   after the first position: anything is_ascii_id1() accepts, plus the decimal
   digits, '.', and '_'.

   The digit test is an explicit range check rather than isdigit(), so that it
   is ASCII-only in the same way as the c_isalpha() test in is_ascii_id1() and
   does not depend on <ctype.h>, which is not among this file's own #include
   lines.  (C guarantees that '0' through '9' are contiguous and ascending.) */
static bool
is_ascii_idn (unsigned char c)
{
  return (is_ascii_id1 (c)
          || (c >= '0' && c <= '9')
          || c == '.'
          || c == '_');
}
/* Returns true if byte C may begin an identifier, that is, if is_ascii_id1()
   accepts it or it is a non-ASCII byte (value 128 or greater).

   (PSPP is transitioning to using Unicode internally for syntax, so please
   use lex_uc_is_id1() instead, if possible.) */
bool
lex_is_id1 (char c)
{
  /* Work with the unsigned value so that the comparison does not depend on
     whether plain char is signed. */
  unsigned char byte = c;

  if (byte >= 128)
    return true;

  return is_ascii_id1 (byte);
}
/* Returns true if byte C may appear in an identifier somewhere after the first
   byte, that is, if is_ascii_idn() accepts it or it is a non-ASCII byte (value
   128 or greater).

   (PSPP is transitioning to using Unicode internally for syntax, so please
   use lex_uc_is_idn() instead, if possible.) */
bool
lex_is_idn (char c)
{
  /* Work with the unsigned value so that the comparison does not depend on
     whether plain char is signed. */
  unsigned char byte = c;

  if (byte >= 128)
    return true;

  return is_ascii_idn (byte);
}
198 /* Returns true if Unicode code point UC may be the first character in an
199 identifier in the current locale. */
200 bool
201 lex_uc_is_id1 (ucs4_t uc)
203 return (uc < 0x80
204 ? is_ascii_id1 (uc)
205 : (uc_is_general_category_withtable (uc,
206 UC_CATEGORY_MASK_L |
207 UC_CATEGORY_MASK_M |
208 UC_CATEGORY_MASK_S)
209 && uc != 0xfffc && uc != 0xfffd));
212 /* Returns true if Unicode code point UC may be a character in an identifier
213 other than the first. */
214 bool
215 lex_uc_is_idn (ucs4_t uc)
217 return (uc < 0x80
218 ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
219 : (uc_is_general_category_withtable (uc,
220 UC_CATEGORY_MASK_L |
221 UC_CATEGORY_MASK_M |
222 UC_CATEGORY_MASK_S |
223 UC_CATEGORY_MASK_N)
224 && uc != 0xfffc && uc != 0xfffd));
227 /* Returns true if Unicode code point UC is a space that separates tokens. */
228 bool
229 lex_uc_is_space (ucs4_t uc)
231 /* These are all of the Unicode characters in category Zs, Zl, or Zp. */
232 return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
233 || (uc >= 0x80
234 && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
235 || (uc >= 0x2000 && uc <= 0x200a)
236 || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
237 || uc == 0x205f || uc == 0x3000)));
241 /* Returns the length of the longest prefix of STRING that forms
242 a valid identifier. Returns zero if STRING does not begin
243 with a valid identifier. */
244 size_t
245 lex_id_get_length (struct substring string)
247 const uint8_t *s = CHAR_CAST (const uint8_t *, string.string);
248 size_t len = string.length;
249 size_t ofs;
250 int mblen;
252 for (ofs = 0; ofs < string.length; ofs += mblen)
254 ucs4_t uc;
256 mblen = u8_mbtouc (&uc, s + ofs, len - ofs);
257 if (!(ofs == 0 ? lex_uc_is_id1 (uc) : lex_uc_is_idn (uc)))
258 break;
261 return ofs;
264 /* Comparing identifiers. */
266 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
268 Keywords match if one of the following is true: KEYWORD and
269 TOKEN are identical, or TOKEN is at least 3 characters long
270 and those characters are identical to KEYWORD. (Letters that
271 differ only in case are considered identical.)
273 KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
274 bool
275 lex_id_match (struct substring keyword, struct substring token)
277 return lex_id_match_n (keyword, token, 3);
280 /* Returns true if TOKEN is a case-insensitive match for at least
281 the first N characters of KEYWORD.
283 KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
284 bool
285 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
287 size_t token_len = ss_length (token);
288 size_t keyword_len = ss_length (keyword);
290 if (token_len >= n && token_len < keyword_len)
291 return ss_equals_case (ss_head (keyword, token_len), token);
292 else
293 return ss_equals_case (keyword, token);
296 /* Table of keywords. */
297 struct keyword
299 int token;
300 const struct substring identifier;
303 static const struct keyword keywords[] =
305 { T_AND, SS_LITERAL_INITIALIZER ("AND") },
306 { T_OR, SS_LITERAL_INITIALIZER ("OR") },
307 { T_NOT, SS_LITERAL_INITIALIZER ("NOT") },
308 { T_EQ, SS_LITERAL_INITIALIZER ("EQ") },
309 { T_GE, SS_LITERAL_INITIALIZER ("GE") },
310 { T_GT, SS_LITERAL_INITIALIZER ("GT") },
311 { T_LE, SS_LITERAL_INITIALIZER ("LE") },
312 { T_LT, SS_LITERAL_INITIALIZER ("LT") },
313 { T_NE, SS_LITERAL_INITIALIZER ("NE") },
314 { T_ALL, SS_LITERAL_INITIALIZER ("ALL") },
315 { T_BY, SS_LITERAL_INITIALIZER ("BY") },
316 { T_TO, SS_LITERAL_INITIALIZER ("TO") },
317 { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
319 static const size_t n_keywords = sizeof keywords / sizeof *keywords;
321 /* Returns true if TOKEN is representable as a keyword. */
322 bool
323 lex_is_keyword (enum token_type token)
325 const struct keyword *kw;
326 for (kw = keywords; kw < &keywords[n_keywords]; kw++)
327 if (kw->token == token)
328 return true;
329 return false;
332 /* Returns the proper token type, either T_ID or a reserved
333 keyword enum, for ID. */
335 lex_id_to_token (struct substring id)
337 if (ss_length (id) >= 2 && ss_length (id) <= 4)
339 const struct keyword *kw;
340 for (kw = keywords; kw < &keywords[n_keywords]; kw++)
341 if (ss_equals_case (kw->identifier, id))
342 return kw->token;
345 return T_ID;