lib/sql_lexer.mll

   1
   2 {
   3   open Printf
   4   open Lexing
   5   open ExtLib
   6   open Sql_parser
   7   module T = Sql.Type
   8
   9 let error _ callerID =
  10   prerr_endline (sprintf "Lexer error : %s" callerID);
  11 (*      update_pos buf;*)
  12         raise Parsing.Parse_error
  13
  14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  15
  16 let advance_line_pos pos =
  17   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  18
  19 let advance_line lexbuf =
  20   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  21
  22 let keywords =
  23   let k = ref [
  24    "as",AS;
  25    "on",ON;
  26    "to",TO;
  27    "conflict",CONFLICT;
  28    "using",USING;
  29    "natural",NATURAL;
  30    "join",JOIN;
  31    "between",BETWEEN;
  32    "and",AND;
  33    "escape",ESCAPE;
  34    "not",NOT;
  35    "null",NULL;
  36    "unique",UNIQUE;
  37    "primary",PRIMARY;
  38    "key",KEY;
  39    "default",DEFAULT;
  40    "precision",PRECISION;
  41    "varying",VARYING;
  42    "charset",CHARSET;
  43    "collate",COLLATE;
  44    "national",NATIONAL;
  45    "ascii",ASCII;
  46    "unicode",UNICODE;
  47    "distinct",DISTINCT;
  48    "character",CHARACTER;
  49    "binary",BINARY;
  50    "bytea",BINARY;
  51    "all",ALL;
  52    "any",ANY;
  53    "some",SOME;
  54    "order",ORDER;
  55    "by",BY;
  56    "limit",LIMIT;
  57    "desc",DESC;
  58    "asc",ASC;
  59    "offset",OFFSET;
  60    "select",SELECT;
  61    "create",CREATE;
  62    "table",TABLE;
  63    "view",VIEW;
  64    "insert",INSERT;
  65    "replace",REPLACE;
  66    "update",UPDATE;
  67    "delete",DELETE;
  68    "from",FROM;
  69    "or",OR;
  70    "into",INTO;
  71    "values",VALUES;
  72    "where",WHERE;
  73    "set",SET;
  74    "in",IN;
  75    "group",GROUP;
  76    "having",HAVING;
  77    "union",UNION;
  78    "except",EXCEPT;
  79    "intersect",INTERSECT;
  80    "cross",CROSS;
  81    "temporary",TEMPORARY;
  82    "if",IF;
  83    "exists",EXISTS;
  84    "foreign",FOREIGN;
  85    "global",GLOBAL;
  86    "local",LOCAL;
  87    "value",VALUE;
  88    "references",REFERENCES;
  89    "check",CHECK;
  90    "date",DATE;
  91    "time",TIME;
  92    "timestamp",TIMESTAMP;
  93    "alter",ALTER;
  94    "rename",RENAME;
  95    "add",ADD;
  96    "cascade",CASCADE;
  97    "restrict",RESTRICT;
  98    "drop",DROP;
  99    "constraint",CONSTRAINT;
 100    "after",AFTER;
 101    "index",INDEX;
 102    "fulltext",FULLTEXT;
 103    "unsigned",UNSIGNED;
 104    "first",FIRST;
 105    "column",COLUMN;
 106    "like", LIKE;
 107    "case", CASE;
 108    "when", WHEN;
 109    "then", THEN;
 110    "else", ELSE;
 111    "end", END;
 112    "change", CHANGE;
 113    "modify", MODIFY;
 114    "delayed", DELAYED;
 115    "enum", ENUM;
 116    "for", FOR;
 117    "share", SHARE;
 118    "mode", MODE;
 119    "mod", MOD;
 120    "div", DIV;
 121    "lock", LOCK;
 122    "of", OF;
 123    "with", WITH;
 124    "nowait", NOWAIT;
 125    "action", ACTION;
 126    "no", NO;
 127    "is", IS;
 128    "interval", INTERVAL;
 129    "microsecond", MICROSECOND;
 130    "second", SECOND;
 131    "minute", MINUTE;
 132    "hour", HOUR;
 133    "day", DAY;
 134    "week", WEEK;
 135    "month", MONTH;
 136    "quarter", QUARTER;
 137    "year", YEAR;
 138    "second_microsecond", SECOND_MICROSECOND;
 139    "minute_microsecond", MINUTE_MICROSECOND;
 140    "minute_second", MINUTE_SECOND;
 141    "hour_microsecond", HOUR_MICROSECOND;
 142    "hour_second", HOUR_SECOND;
 143    "hour_minute", HOUR_MINUTE;
 144    "day_microsecond", DAY_MICROSECOND;
 145    "day_second", DAY_SECOND;
 146    "day_minute", DAY_MINUTE;
 147    "day_hour", DAY_HOUR;
 148    "year_month", YEAR_MONTH;
 149    "false", FALSE;
 150    "true", TRUE;
 151    "duplicate", DUPLICATE;
 152    "function", FUNCTION;
 153    "procedure", PROCEDURE;
 154    "returns", RETURNS;
 155    "begin", BEGIN;
 156    "comment", COMMENT;
 157    "language", LANGUAGE;
 158    "substring", SUBSTRING;
 159    "substr", SUBSTRING;
 160   ] in (* more *)
 161   let all token l = k := !k @ List.map (fun x -> x,token) l in
 162   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 163   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 164   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 165   all JOIN_TYPE1 ["left";"right";"full"];
 166   all JOIN_TYPE2 ["inner";"outer"];
 167   all LIKE_OP ["glob";"regexp";"match"];
 168   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 169 (* standard built-in types
 170       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 171       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 172       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 173       FLOAT, REAL, DOUBLE PRECISION,
 174       BOOLEAN,
 175       DATE, TIME, TIMESTAMP, INTERVAL
 176     *)
 177   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 178   all T_DECIMAL ["numeric";"decimal";"dec";"fixed"];
 179   all T_INTEGER ["number"]; (* oracle *)
 180   all T_BOOLEAN ["bool";"boolean"];
 181   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 182   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 183   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 184   all T_TEXT ["varchar2"]; (* oracle *)
 185   all T_DATETIME ["datetime"];
 186   all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
 187   !k
 188
 189 (*
 190   Q: Why not convert all input to lowercase before lexing?
 191   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 192 *)
 193
 194 module Keywords = Map.Make(String)
 195
 196 let keywords =
 197   let add map (k,v) =
 198     let k = String.lowercase k in
 199     if Keywords.mem k map then
 200       failwith (sprintf "Lexeme %s is already associated with keyword." k)
 201     else
 202       Keywords.add k v map
 203   in
 204   List.fold_left add Keywords.empty keywords
 205
 206 (* FIXME case sensitivity??! *)
 207
 208 let get_ident str =
 209   let str = String.lowercase str in
 210   try Keywords.find str keywords with Not_found -> IDENT str
 211
 212 let ident str = IDENT (String.lowercase str)
 213
 214 let as_literal ch s =
 215   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 216   sprintf "%c%s%c" ch s ch
 217 }
 218
 219 let digit = ['0'-'9']
 220 let alpha = ['a'-'z' 'A'-'Z']
 221 let ident = (alpha) (alpha | digit | '_' )*
 222 let wsp = [' ' '\r' '\t']
 223 let cmnt = "--" | "//" | "#"
 224
 225 (* extract separate statements *)
 226 rule ruleStatement = parse
 227   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 228   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 229   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 230   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 231   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 232   | "$" (ident? as tag) "$" { let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag) }
 233   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 234   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 235   | ';' { `Semicolon }
 236   | [^ ';'] as c { `Char c }
 237   | eof { `Eof }
 238 and
 239 (* extract tail of the input *)
 240 ruleTail acc = parse
 241   | eof { acc }
 242   | _* as str { ruleTail (acc ^ str) lexbuf }
 243 and
 244 ruleMain = parse
 245   | wsp   { ruleMain lexbuf }
 246   (* update line number *)
 247   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 248
 249   | '('         { LPAREN }
 250   | ')'         { RPAREN }
 251   | ','   { COMMA }
 252   | '.'   { DOT }
 253   | '{'   { LCURLY (lexeme_start lexbuf) }
 254   | '}'   { RCURLY (lexeme_start lexbuf) }
 255
 256   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 257   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 258
 259   | "*" { ASTERISK }
 260   | "=" { EQUAL }
 261   | "!" { EXCL }
 262   | "~" { TILDE }
 263   | "||" { CONCAT_OP }
 264   | "+" { PLUS }
 265   | "-" { MINUS }
 266
 267   | "/" | "%" { NUM_DIV_OP }
 268   | "<<" | ">>" { NUM_BIT_SHIFT }
 269   | "|" { NUM_BIT_OR }
 270   | "&" { NUM_BIT_AND }
 271   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 272   | "<>" | "!=" | "==" { NUM_EQ_OP }
 273   | "<=>" { NOT_DISTINCT_OP }
 274
 275   | "?" { PARAM { label=None; pos = pos lexbuf } }
 276   | [':' '@'] (ident as str) { PARAM { label = Some str; pos = pos lexbuf } }
 277
 278   | '"' { ident (ruleInQuotes "" lexbuf) }
 279   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 280   (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
 281   | "$" (ident? as tag) "$" { TEXT (ruleInDollarQuotes tag "" lexbuf) }
 282   | "`" { ident (ruleInBackQuotes "" lexbuf) }
 283   | "[" { ident (ruleInBrackets "" lexbuf) }
 284   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 285
 286   | ident as str { if !Parser_state.mode = Ident then IDENT str (* no keywords, preserve case *) else get_ident str }
 287   | digit+ as str { INTEGER (int_of_string str) }
 288   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 289   | eof         { EOF }
 290   | _   { error lexbuf "ruleMain" }
 291 and
 292 (* FIXME factor out all that ruleIn* rules *)
 293 ruleInQuotes acc = parse
 294   | '"'         { acc }
 295   | eof         { error lexbuf "no terminating quote" }
 296   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 297   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 298   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 299   | _           { error lexbuf "ruleInQuotes" }
 300 and
 301 ruleInBrackets acc = parse
 302   | ']'         { acc }
 303   | eof         { error lexbuf "no terminating bracket" }
 304   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 305 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 306   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 307   | _           { error lexbuf "ruleInBrackets" }
 308 and
 309 ruleInSingleQuotes acc = parse
 310   | '\''              { acc }
 311   | eof         { error lexbuf "no terminating single quote" }
 312   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 313   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 314   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 315   | _           { error lexbuf "ruleInSingleQuotes" }
 316 and
 317 ruleInBackQuotes acc = parse
 318   | '`'         { acc }
 319   | eof         { error lexbuf "no terminating back quote" }
 320   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 321   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 322   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 323   | _           { error lexbuf "ruleInBackQuotes" }
 324 and
 325 ruleInDollarQuotes tag acc = parse
 326   | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
 327   | eof         { error lexbuf "no terminating dollar quote" }
 328   | '\n'        { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
 329   (* match one char at a time to make sure delimiter matches longer *)
 330   | [^'\n']     { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
 331   | _           { error lexbuf "ruleInDollarQuotes" }
 332 and
 333 ruleComment acc = parse
 334   | '\n'        { advance_line lexbuf; acc }
 335   | eof         { acc }
 336   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 337   | _           { error lexbuf "ruleComment"; }
 338 and
 339 ruleCommentMulti acc = parse
 340   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 341   | "*/"        { acc }
 342   | "*"
 343   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 344   | _           { error lexbuf "ruleCommentMulti" }
 345
 346 {
 347
 348   let parse_rule lexbuf =
 349     let token = ruleMain lexbuf in
 350     match !Parser_state.mode with
 351     | Normal -> token
 352     | Ignore ->
 353 (*         eprintf "ignored: %s\n" (lexeme lexbuf); *)
 354       if token = EOF then token else IGNORED
 355     | Ident -> token
 356
 357 }
 358