pc+-file-reader: Fix memory leak.
[pspp.git] / src / data / identifier.c
blobdb20010464cab1e3f3a1f42cc1e7a4c012bf8c1b
1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
/*
   This file is concerned with the definition of the PSPP syntax, NOT the
   action of scanning or parsing code.
*/
22 #include <config.h>
24 #include "data/identifier.h"
26 #include <string.h>
27 #include <unistr.h>
28 #include <unictype.h>
30 #include "libpspp/assertion.h"
31 #include "libpspp/cast.h"
33 #include "gl/c-ctype.h"
35 #include "gettext.h"
36 #define _(msgid) gettext (msgid)
38 /* Tokens. */
40 /* Returns TYPE as a string, e.g. "ID" for T_ID. */
41 const char *
42 token_type_to_name (enum token_type type)
44 switch (type)
46 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
47 TOKEN_TYPES
48 #undef TOKEN_TYPE
49 case TOKEN_N_TYPES:
50 default:
51 return "unknown token type";
55 /* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
56 as a statically allocated constant string. This function returns NULL for
57 tokens that don't have any fixed string representation, such as identifier
58 and number tokens. */
59 const char *
60 token_type_to_string (enum token_type token)
62 switch (token)
64 case T_ID:
65 case T_POS_NUM:
66 case T_NEG_NUM:
67 case T_STRING:
68 case T_STOP:
69 return NULL;
71 case T_ENDCMD:
72 return ".";
74 case T_PLUS:
75 return "+";
77 case T_DASH:
78 return "-";
80 case T_ASTERISK:
81 return "*";
83 case T_SLASH:
84 return "/";
86 case T_EQUALS:
87 return "=";
89 case T_LPAREN:
90 return "(";
92 case T_RPAREN:
93 return ")";
95 case T_LBRACK:
96 return "[";
98 case T_RBRACK:
99 return "]";
101 case T_COMMA:
102 return ",";
104 case T_AND:
105 return "AND";
107 case T_OR:
108 return "OR";
110 case T_NOT:
111 return "NOT";
113 case T_EQ:
114 return "EQ";
116 case T_GE:
117 return ">=";
119 case T_GT:
120 return ">";
122 case T_LE:
123 return "<=";
125 case T_LT:
126 return "<";
128 case T_NE:
129 return "~=";
131 case T_ALL:
132 return "ALL";
134 case T_BY:
135 return "BY";
137 case T_TO:
138 return "TO";
140 case T_WITH:
141 return "WITH";
143 case T_EXP:
144 return "**";
146 case TOKEN_N_TYPES:
147 NOT_REACHED ();
150 NOT_REACHED ();
153 /* Recognizing identifiers. */
/* Returns true if C is an ASCII byte that may begin an identifier: a letter
   according to c_isalpha (), or one of '@', '#', or '$'. */
static bool
is_ascii_id1 (unsigned char c)
{
  switch (c)
    {
    case '@':
    case '#':
    case '$':
      return true;

    default:
      return c_isalpha (c);
    }
}
/* Returns true if C is an ASCII byte that may appear in an identifier after
   the first position: anything accepted by is_ascii_id1 (), a decimal digit,
   '.', or '_'.

   Digits are tested with c_isdigit () from "gl/c-ctype.h" rather than
   isdigit (), so that the result never depends on the current locale and so
   that this file does not rely on <ctype.h>, which it does not include. */
static bool
is_ascii_idn (unsigned char c)
{
  return is_ascii_id1 (c) || c_isdigit (c) || c == '.' || c == '_';
}
/* Returns true if byte C may be the first byte of an identifier.

   Every byte with value 128 or greater is accepted; bytes below 128 are
   accepted only if is_ascii_id1 () accepts them.

   (PSPP is transitioning to using Unicode internally for syntax, so please
   use lex_uc_is_id1() instead, if possible.) */
bool
lex_is_id1 (char c)
{
  /* Work with the unsigned value so that the comparison against 128 behaves
     the same whether or not plain char is signed. */
  unsigned char byte = c;

  return byte >= 128 || is_ascii_id1 (byte);
}
/* Returns true if byte C may appear in an identifier at any position other
   than the first.

   Every byte with value 128 or greater is accepted; bytes below 128 are
   accepted only if is_ascii_idn () accepts them.

   (PSPP is transitioning to using Unicode internally for syntax, so please
   use lex_uc_is_idn() instead, if possible.) */
bool
lex_is_idn (char c)
{
  /* Work with the unsigned value so that the comparison against 128 behaves
     the same whether or not plain char is signed. */
  unsigned char byte = c;

  return byte >= 128 || is_ascii_idn (byte);
}
188 /* Returns true if Unicode code point UC may be the first character in an
189 identifier in the current locale. */
190 bool
191 lex_uc_is_id1 (ucs4_t uc)
193 return (uc < 0x80
194 ? is_ascii_id1 (uc)
195 : (uc_is_general_category_withtable (uc,
196 UC_CATEGORY_MASK_L |
197 UC_CATEGORY_MASK_M |
198 UC_CATEGORY_MASK_S)
199 && uc != 0xfffc && uc != 0xfffd));
202 /* Returns true if Unicode code point UC may be a character in an identifier
203 other than the first. */
204 bool
205 lex_uc_is_idn (ucs4_t uc)
207 return (uc < 0x80
208 ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
209 : (uc_is_general_category_withtable (uc,
210 UC_CATEGORY_MASK_L |
211 UC_CATEGORY_MASK_M |
212 UC_CATEGORY_MASK_S |
213 UC_CATEGORY_MASK_N)
214 && uc != 0xfffc && uc != 0xfffd));
217 /* Returns true if Unicode code point UC is a space that separates tokens. */
218 bool
219 lex_uc_is_space (ucs4_t uc)
221 /* These are all of the Unicode characters in category Zs, Zl, or Zp. */
222 return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
223 || (uc >= 0x80
224 && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
225 || (uc >= 0x2000 && uc <= 0x200a)
226 || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
227 || uc == 0x205f || uc == 0x3000)));
231 /* Returns the length of the longest prefix of STRING that forms
232 a valid identifier. Returns zero if STRING does not begin
233 with a valid identifier. */
234 size_t
235 lex_id_get_length (struct substring string)
237 const uint8_t *s = CHAR_CAST (const uint8_t *, string.string);
238 size_t len = string.length;
239 size_t ofs;
240 int mblen;
242 for (ofs = 0; ofs < string.length; ofs += mblen)
244 ucs4_t uc;
246 mblen = u8_mbtouc (&uc, s + ofs, len - ofs);
247 if (!(ofs == 0 ? lex_uc_is_id1 (uc) : lex_uc_is_idn (uc)))
248 break;
251 return ofs;
254 /* Comparing identifiers. */
256 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
258 Keywords match if one of the following is true: KEYWORD and
259 TOKEN are identical, or TOKEN is at least 3 characters long
260 and those characters are identical to KEYWORD. (Letters that
261 differ only in case are considered identical.)
263 KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
264 bool
265 lex_id_match (struct substring keyword, struct substring token)
267 return lex_id_match_n (keyword, token, 3);
270 /* Returns true if TOKEN is a case-insensitive match for at least
271 the first N characters of KEYWORD.
273 KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
274 bool
275 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
277 size_t token_len = ss_length (token);
278 size_t keyword_len = ss_length (keyword);
280 if (token_len >= n && token_len < keyword_len)
281 return ss_equals_case (ss_head (keyword, token_len), token);
282 else
283 return ss_equals_case (keyword, token);
286 /* Table of keywords. */
287 struct keyword
289 int token;
290 const struct substring identifier;
293 static const struct keyword keywords[] =
295 { T_AND, SS_LITERAL_INITIALIZER ("AND") },
296 { T_OR, SS_LITERAL_INITIALIZER ("OR") },
297 { T_NOT, SS_LITERAL_INITIALIZER ("NOT") },
298 { T_EQ, SS_LITERAL_INITIALIZER ("EQ") },
299 { T_GE, SS_LITERAL_INITIALIZER ("GE") },
300 { T_GT, SS_LITERAL_INITIALIZER ("GT") },
301 { T_LE, SS_LITERAL_INITIALIZER ("LE") },
302 { T_LT, SS_LITERAL_INITIALIZER ("LT") },
303 { T_NE, SS_LITERAL_INITIALIZER ("NE") },
304 { T_ALL, SS_LITERAL_INITIALIZER ("ALL") },
305 { T_BY, SS_LITERAL_INITIALIZER ("BY") },
306 { T_TO, SS_LITERAL_INITIALIZER ("TO") },
307 { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
309 static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
311 /* Returns true if TOKEN is representable as a keyword. */
312 bool
313 lex_is_keyword (enum token_type token)
315 const struct keyword *kw;
316 for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
317 if (kw->token == token)
318 return true;
319 return false;
322 /* Returns the proper token type, either T_ID or a reserved
323 keyword enum, for ID. */
325 lex_id_to_token (struct substring id)
327 if (ss_length (id) >= 2 && ss_length (id) <= 4)
329 const struct keyword *kw;
330 for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
331 if (ss_equals_case (kw->identifier, id))
332 return kw->token;
335 return T_ID;