src/data/identifier.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 1997-9, 2000, 2005, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /*
  18    This file is concerned with the definition of the PSPP syntax, NOT the
  19    action of scanning/parsing code.
  20 */
  21
  22 #include <config.h>
  23
  24 #include "data/identifier.h"
  25
  26 #include <string.h>
  27 #include <unistr.h>
  28 #include <unictype.h>
  29
  30 #include "libpspp/assertion.h"
  31 #include "libpspp/cast.h"
  32
  33 #include "gl/c-ctype.h"
  34
  35 #include "gettext.h"
  36 #define _(msgid) gettext (msgid)
  37
  38 /* Tokens. */
  39
  40 /* Returns TYPE as a string, e.g. "ID" for T_ID. */
  41 const char *
  42 token_type_to_name (enum token_type type)
  43 {
  44   switch (type)
  45     {
  46 #define TOKEN_TYPE(TYPE) case T_##TYPE: return #TYPE;
  47       TOKEN_TYPES
  48 #undef TOKEN_TYPE
  49     case TOKEN_N_TYPES:
  50     default:
  51       return "unknown token type";
  52     }
  53 }
  54
  55 /* Returns an ASCII string that yields TOKEN if it appeared in a syntax file,
  56    as a statically allocated constant string.  This function returns NULL for
  57    tokens that don't have any fixed string representation, such as identifier
  58    and number tokens. */
  59 const char *
  60 token_type_to_string (enum token_type token)
  61 {
  62   switch (token)
  63     {
  64     case T_ID:
  65     case T_POS_NUM:
  66     case T_NEG_NUM:
  67     case T_STRING:
  68     case T_STOP:
  69       return NULL;
  70
  71     case T_ENDCMD:
  72       return ".";
  73
  74     case T_PLUS:
  75       return "+";
  76
  77     case T_DASH:
  78       return "-";
  79
  80     case T_ASTERISK:
  81       return "*";
  82
  83     case T_SLASH:
  84       return "/";
  85
  86     case T_EQUALS:
  87       return "=";
  88
  89     case T_LPAREN:
  90       return "(";
  91
  92     case T_RPAREN:
  93       return ")";
  94
  95     case T_LBRACK:
  96       return "[";
  97
  98     case T_RBRACK:
  99       return "]";
 100
 101     case T_COMMA:
 102       return ",";
 103
 104     case T_AND:
 105       return "AND";
 106
 107     case T_OR:
 108       return "OR";
 109
 110     case T_NOT:
 111       return "NOT";
 112
 113     case T_EQ:
 114       return "EQ";
 115
 116     case T_GE:
 117       return ">=";
 118
 119     case T_GT:
 120       return ">";
 121
 122     case T_LE:
 123       return "<=";
 124
 125     case T_LT:
 126       return "<";
 127
 128     case T_NE:
 129       return "~=";
 130
 131     case T_ALL:
 132       return "ALL";
 133
 134     case T_BY:
 135       return "BY";
 136
 137     case T_TO:
 138       return "TO";
 139
 140     case T_WITH:
 141       return "WITH";
 142
 143     case T_EXP:
 144       return "**";
 145
 146     case TOKEN_N_TYPES:
 147       NOT_REACHED ();
 148     }
 149
 150   NOT_REACHED ();
 151 }
 152
 153 /* Recognizing identifiers. */
 154
 155 static bool
 156 is_ascii_id1 (unsigned char c)
 157 {
 158   return c_isalpha (c) || c == '@' || c == '#' || c == '$';
 159 }
 160
 161 static bool
 162 is_ascii_idn (unsigned char c)
 163 {
 164   return is_ascii_id1 (c) || isdigit (c) || c == '.' || c == '_';
 165 }
 166
 167 /* Returns true if C may be the first byte in an identifier in the current
 168    locale.
 169
 170    (PSPP is transitioning to using Unicode internally for syntax, so please
 171    use lex_uc_is_id1() instead, if possible.) */
 172 bool
 173 lex_is_id1 (char c)
 174 {
 175   return is_ascii_id1 (c) || (unsigned char) c >= 128;
 176 }
 177
 178 /* Returns true if C may be a byte in an identifier other than the first.
 179
 180    (PSPP is transitioning to using Unicode internally for syntax, so please
 181    use lex_uc_is_idn() instead, if possible.) */
 182 bool
 183 lex_is_idn (char c)
 184 {
 185   return is_ascii_idn (c) || (unsigned char) c >= 128;
 186 }
 187
 188 /* Returns true if Unicode code point UC may be the first character in an
 189    identifier in the current locale. */
 190 bool
 191 lex_uc_is_id1 (ucs4_t uc)
 192 {
 193   return (uc < 0x80
 194           ? is_ascii_id1 (uc)
 195           : (uc_is_general_category_withtable (uc,
 196                                                UC_CATEGORY_MASK_L |
 197                                                UC_CATEGORY_MASK_M |
 198                                                UC_CATEGORY_MASK_S)
 199              && uc != 0xfffc && uc != 0xfffd));
 200 }
 201
 202 /* Returns true if Unicode code point UC may be a character in an identifier
 203    other than the first. */
 204 bool
 205 lex_uc_is_idn (ucs4_t uc)
 206 {
 207   return (uc < 0x80
 208           ? is_ascii_id1 (uc) || isdigit (uc) || uc == '.' || uc == '_'
 209           : (uc_is_general_category_withtable (uc,
 210                                                UC_CATEGORY_MASK_L |
 211                                                UC_CATEGORY_MASK_M |
 212                                                UC_CATEGORY_MASK_S |
 213                                                UC_CATEGORY_MASK_N)
 214              && uc != 0xfffc && uc != 0xfffd));
 215 }
 216
 217 /* Returns true if Unicode code point UC is a space that separates tokens. */
 218 bool
 219 lex_uc_is_space (ucs4_t uc)
 220 {
 221   /* These are all of the Unicode characters in category Zs, Zl, or Zp.  */
 222   return (uc == ' ' || (uc <= 0x000d && uc >= 0x0009)
 223           || (uc >= 0x80
 224               && (uc == 0xa0 || uc == 0x85 || uc == 0x1680 || uc == 0x180e
 225                   || (uc >= 0x2000 && uc <= 0x200a)
 226                   || uc == 0x2028 || uc == 0x2029 || uc == 0x202f
 227                   || uc == 0x205f || uc == 0x3000)));
 228 }
 229
 230
 231 /* Returns the length of the longest prefix of STRING that forms
 232    a valid identifier.  Returns zero if STRING does not begin
 233    with a valid identifier.  */
 234 size_t
 235 lex_id_get_length (struct substring string)
 236 {
 237   const uint8_t *s = CHAR_CAST (const uint8_t *, string.string);
 238   size_t len = string.length;
 239   size_t ofs;
 240   int mblen;
 241
 242   for (ofs = 0; ofs < string.length; ofs += mblen)
 243     {
 244       ucs4_t uc;
 245
 246       mblen = u8_mbtouc (&uc, s + ofs, len - ofs);
 247       if (!(ofs == 0 ? lex_uc_is_id1 (uc) : lex_uc_is_idn (uc)))
 248         break;
 249     }
 250
 251   return ofs;
 252 }
 253 \f
 254 /* Comparing identifiers. */
 255
 256 /* Returns true if TOKEN is a case-insensitive match for KEYWORD.
 257
 258    Keywords match if one of the following is true: KEYWORD and
 259    TOKEN are identical, or TOKEN is at least 3 characters long
 260    and those characters are identical to KEYWORD.  (Letters that
 261    differ only in case are considered identical.)
 262
 263    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 264 bool
 265 lex_id_match (struct substring keyword, struct substring token)
 266 {
 267   return lex_id_match_n (keyword, token, 3);
 268 }
 269
 270 /* Returns true if TOKEN is a case-insensitive match for at least
 271    the first N characters of KEYWORD.
 272
 273    KEYWORD must be ASCII, but TOKEN may be ASCII or UTF-8. */
 274 bool
 275 lex_id_match_n (struct substring keyword, struct substring token, size_t n)
 276 {
 277   size_t token_len = ss_length (token);
 278   size_t keyword_len = ss_length (keyword);
 279
 280   if (token_len >= n && token_len < keyword_len)
 281     return ss_equals_case (ss_head (keyword, token_len), token);
 282   else
 283     return ss_equals_case (keyword, token);
 284 }
 285 \f
 286 /* Table of keywords. */
 287 struct keyword
 288   {
 289     int token;
 290     const struct substring identifier;
 291   };
 292
 293 static const struct keyword keywords[] =
 294   {
 295     { T_AND,  SS_LITERAL_INITIALIZER ("AND") },
 296     { T_OR,   SS_LITERAL_INITIALIZER ("OR") },
 297     { T_NOT,  SS_LITERAL_INITIALIZER ("NOT") },
 298     { T_EQ,   SS_LITERAL_INITIALIZER ("EQ") },
 299     { T_GE,   SS_LITERAL_INITIALIZER ("GE") },
 300     { T_GT,   SS_LITERAL_INITIALIZER ("GT") },
 301     { T_LE,   SS_LITERAL_INITIALIZER ("LE") },
 302     { T_LT,   SS_LITERAL_INITIALIZER ("LT") },
 303     { T_NE,   SS_LITERAL_INITIALIZER ("NE") },
 304     { T_ALL,  SS_LITERAL_INITIALIZER ("ALL") },
 305     { T_BY,   SS_LITERAL_INITIALIZER ("BY") },
 306     { T_TO,   SS_LITERAL_INITIALIZER ("TO") },
 307     { T_WITH, SS_LITERAL_INITIALIZER ("WITH") },
 308   };
 309 static const size_t keyword_cnt = sizeof keywords / sizeof *keywords;
 310
 311 /* Returns true if TOKEN is representable as a keyword. */
 312 bool
 313 lex_is_keyword (enum token_type token)
 314 {
 315   const struct keyword *kw;
 316   for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 317     if (kw->token == token)
 318       return true;
 319   return false;
 320 }
 321
 322 /* Returns the proper token type, either T_ID or a reserved
 323    keyword enum, for ID. */
 324 int
 325 lex_id_to_token (struct substring id)
 326 {
 327   if (ss_length (id) >= 2 && ss_length (id) <= 4)
 328     {
 329       const struct keyword *kw;
 330       for (kw = keywords; kw < &keywords[keyword_cnt]; kw++)
 331         if (ss_equals_case (kw->identifier, id))
 332           return kw->token;
 333     }
 334
 335   return T_ID;
 336 }