src/data/identifier.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  /*
     This file is concerned with the definition of the PSPP syntax, NOT the
     action of scanning/parsing code.
  */
  21
  22 #include <config.h>
  23
  24 #include "data/identifier.h"
  25
  26 #include <string.h>
  27 #include <unistr.h>
  28 #include <unictype.h>
  29
  30 #include "libpspp/assertion.h"
  31 #include "libpspp/cast.h"
  32
  33 #include "gl/c-ctype.h"
  34
  35 #include "gettext.h"
  36 #define _(msgid) gettext (msgid)
  37
  38 /* Tokens. */
  39
  40 /* Returns TYPE as a string, e.g. "ID" for T_ID. */
  41 const char *
  42 token_type_to_name (enum token_type type)
  43 {
  44   switch (type)
  45     {
  46 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
  47       TOKEN_TYPES
  48 #undef TOKEN_TYPE
  49     default:
  50       return "unknown token type";
  51     }
  52 }
  53
  54 /* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
  55    as a statically allocated constant string.  This function returns NULL for
  56    tokens that don't have any fixed string representation, such as identifier
  57    and number tokens. */
  58 const char *
  59 token_type_to_string (enum token_type token)
  60 {
  61   switch (token)
  62     {
  63     case T_ID:
  64     case T_POS_NUM:
  65     case T_NEG_NUM:
  66     case T_STRING:
  67     case T_MACRO_ID:
  68     case T_MACRO_PUNCT:
  69     case T_STOP:
  70       return NULL;
  71
  72     case T_ENDCMD:
  73       return ".";
  74
  75     case T_PLUS:
  76       return "+";
  77
  78     case T_DASH:
  79       return "-";
  80
  81     case T_ASTERISK:
  82       return "*";
  83
  84     case T_SLASH:
  85       return "/";
  86
  87     case T_EQUALS:
  88       return "=";
  89
  90     case T_LPAREN:
  91       return "(";
  92
  93     case T_RPAREN:
  94       return ")";
  95
  96     case T_LBRACK:
  97       return "[";
  98
  99     case T_RBRACK:
 100       return "]";
 101
 102     case T_LCURLY:
 103       return "{";
 104
 105     case T_RCURLY:
 106       return "}";
 107
 108     case T_COMMA:
 109       return ",";
 110
 111     case T_SEMICOLON:
 112       return ";";
 113
 114     case T_COLON:
 115       return ":";
 116
 117     case T_AND:
 118       return "AND";
 119
 120     case T_OR:
 121       return "OR";
 122
 123     case T_NOT:
 124       return "NOT";
 125
 126     case T_EQ:
 127       return "EQ";
 128
 129     case T_GE:
 130       return ">=";
 131
 132     case T_GT:
 133       return ">";
 134
 135     case T_LE:
 136       return "<=";
 137
 138     case T_LT:
 139       return "<";
 140
 141     case T_NE:
 142       return "~=";
 143
 144     case T_ALL:
 145       return "ALL";
 146
 147     case T_BY:
 148       return "BY";
 149
 150     case T_TO:
 151       return "TO";
 152
 153     case T_WITH:
 154       return "WITH";
 155
 156     case T_EXP:
 157       return "**";
 158     }
 159
 160   NOT_REACHED ();
 161 }
 162
 163 /* Recognizing identifiers. */
 164
 /* Returns true if C is an ASCII byte that may begin an identifier: an ASCII
    letter, '@', '#', or '$'. */
 static bool
 is_ascii_id1 (unsigned char c)
 {
   switch (c)
     {
     case '@':
     case '#':
     case '$':
       return true;

     default:
       return c_isalpha (c);
     }
 }
 170
 /* Returns true if C is an ASCII byte that may appear in an identifier after
    the first byte: any byte accepted by is_ascii_id1 (), a decimal digit, '.',
    or '_'. */
 static bool
 is_ascii_idn (unsigned char c)
 {
   /* Use c_isdigit () from "gl/c-ctype.h", which this file already includes,
      rather than isdigit (): this file does not include <ctype.h>, and the
      c_*() functions are what is_ascii_id1 () uses for letters. */
   return is_ascii_id1 (c) || c_isdigit (c) || c == '.' || c == '_';
 }
 176
 /* Returns true if byte C may be the first byte in an identifier.  That is the
    case for every byte whose value, taken as an unsigned char, is 128 or
    greater, and for the ASCII bytes that is_ascii_id1 () accepts.

    (PSPP is transitioning to using Unicode internally for syntax, so please
    use lex_uc_is_id1() instead, if possible.) */
 bool
 lex_is_id1 (char c)
 {
   unsigned char byte = c;

   return byte >= 128 || is_ascii_id1 (byte);
 }
 187
 /* Returns true if byte C may appear in an identifier somewhere other than as
    its first byte.  That is the case for every byte whose value, taken as an
    unsigned char, is 128 or greater, and for the ASCII bytes that
    is_ascii_idn () accepts.

    (PSPP is transitioning to using Unicode internally for syntax, so please
    use lex_uc_is_idn() instead, if possible.) */
 bool
 lex_is_idn (char c)
 {
   unsigned char byte = c;

   return byte >= 128 || is_ascii_idn (byte);
 }
 197
 198 /* Returns true if Unicode code point UC may be the first character in an
 199    identifier in the current locale. */
 200 bool
 201 lex_uc_is_id1 (ucs4_t uc)
 202 {
 203   return (uc < 0x80
 204           ? is_ascii_id1 (uc)
 205           : (uc_is_general_category_withtable (uc,
 206                                                UC_CATEGORY_MASK_L |
 207                                                UC_CATEGORY_MASK_M |
 208                                                UC_CATEGORY_MASK_S)
 209              && uc != 0xfffc && uc != 0xfffd));
 210 }
 211
 212 /* Returns true if Unicode code point UC may be a character in an identifier
 213    other than the first. */
 214 bool
 215 lex_uc_is_idn (ucs4_t uc)
 216 {
 217   return (uc < 0x80
 218           ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
 219           : (uc_is_general_category_withtable (uc,
 220                                                UC_CATEGORY_MASK_L |
 221                                                UC_CATEGORY_MASK_M |
 222                                                UC_CATEGORY_MASK_S |
 223                                                UC_CATEGORY_MASK_N)
 224              && uc != 0xfffc && uc != 0xfffd));
 225 }
 226
 227 /* Returns true if Unicode code point UC is a space that separates tokens. */
 228 bool
 229 lex_uc_is_space (ucs4_t uc)
 230 {
 231   /* These are all of the Unicode characters in category Zs, Zl, or Zp.  */
 232   return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
 233           || (uc >= 0x80
 234               && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
 235                   || (uc >= 0x2000 && uc <= 0x200a)
 236                   || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
 237                   || uc == 0x205f || uc == 0x3000)));
 238 }
 239
 240
 241 /* Returns the length of the longest prefix of STRING that forms
 242    a valid identifier.  Returns zero if STRING does not begin
 243    with a valid identifier.  */
 244 size_t
 245 lex_id_get_length (struct substring string)
 246 {
 247   const uint8_t *s = CHAR_CAST (const uint8_t *, string.string);
 248   size_t len = string.length;
 249   size_t ofs;
 250   int mblen;
 251
 252   for (ofs = 0; ofs < string.length; ofs += mblen)
 253     {
 254       ucs4_t uc;
 255
 256       mblen = u8_mbtouc (&uc, s + ofs, len - ofs);
 257       if (!(ofs == 0 ? lex_uc_is_id1 (uc) : lex_uc_is_idn (uc)))
 258         break;
 259     }
 260
 261   return ofs;
 262 }
 263 \f
 264 /* Comparing identifiers. */
 265
 266 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
 267
 268    Keywords match if one of the following is true: KEYWORD and
 269    TOKEN are identical, or TOKEN is at least 3 characters long
 270    and those characters are identical to KEYWORD.  (Letters that
 271    differ only in case are considered identical.)
 272
 273    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 274 bool
 275 lex_id_match (struct substring keyword, struct substring token)
 276 {
 277   return lex_id_match_n (keyword, token, 3);
 278 }
 279
 280 /* Returns true if TOKEN is a case-insensitive match for at least
 281    the first N characters of KEYWORD.
 282
 283    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 284 bool
 285 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
 286 {
 287   size_t token_len = ss_length (token);
 288   size_t keyword_len = ss_length (keyword);
 289
 290   if (token_len >= n && token_len < keyword_len)
 291     return ss_equals_case (ss_head (keyword, token_len), token);
 292   else
 293     return ss_equals_case (keyword, token);
 294 }
 295 \f
 296 /* Table of keywords. */
 297 struct keyword
 298   {
 299     int token;
 300     const struct substring identifier;
 301   };
 302
 303 static const struct keyword keywords[] =
 304   {
 305     { T_AND,  SS_LITERAL_INITIALIZER ("AND") },
 306     { T_OR,   SS_LITERAL_INITIALIZER ("OR") },
 307     { T_NOT,  SS_LITERAL_INITIALIZER ("NOT") },
 308     { T_EQ,   SS_LITERAL_INITIALIZER ("EQ") },
 309     { T_GE,   SS_LITERAL_INITIALIZER ("GE") },
 310     { T_GT,   SS_LITERAL_INITIALIZER ("GT") },
 311     { T_LE,   SS_LITERAL_INITIALIZER ("LE") },
 312     { T_LT,   SS_LITERAL_INITIALIZER ("LT") },
 313     { T_NE,   SS_LITERAL_INITIALIZER ("NE") },
 314     { T_ALL,  SS_LITERAL_INITIALIZER ("ALL") },
 315     { T_BY,   SS_LITERAL_INITIALIZER ("BY") },
 316     { T_TO,   SS_LITERAL_INITIALIZER ("TO") },
 317     { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
 318   };
 319 static const size_t n_keywords = sizeof keywords / sizeof *keywords;
 320
 321 /* Returns true if TOKEN is representable as a keyword. */
 322 bool
 323 lex_is_keyword (enum token_type token)
 324 {
 325   const struct keyword *kw;
 326   for (kw = keywords; kw < &keywords[n_keywords]; kw++)
 327     if (kw->token == token)
 328       return true;
 329   return false;
 330 }
 331
 332 /* Returns the proper token type, either T_ID or a reserved
 333    keyword enum, for ID. */
 334 int
 335 lex_id_to_token (struct substring id)
 336 {
 337   if (ss_length (id) >= 2 && ss_length (id) <= 4)
 338     {
 339       const struct keyword *kw;
 340       for (kw = keywords; kw < &keywords[n_keywords]; kw++)
 341         if (ss_equals_case (kw->identifier, id))
 342           return kw->token;
 343     }
 344
 345   return T_ID;
 346 }