lib/sql_lexer.mll

   1
   2 {
   3   open Printf
   4   open Lexing
   5   open ExtLib
   6   open Sql_parser
   7   module T = Sql.Type
   8
   9 let error _ callerID =
  10   prerr_endline (sprintf "Lexer error : %s" callerID);
  11 (*      update_pos buf;*)
  12         raise Parsing.Parse_error
  13
  14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  15
  16 let advance_line_pos pos =
  17   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  18
  19 let advance_line lexbuf =
  20   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  21
  22 let keep_lexeme_start lexbuf f =
  23   let start_p = lexeme_start_p lexbuf in
  24   let x = f () in
  25   lexbuf.lex_start_p <- start_p;
  26   x
  27
  28 let keywords =
  29   let k = ref [
  30    "action", ACTION;
  31    "add",ADD;
  32    "after",AFTER;
  33    "all",ALL;
  34    "alter",ALTER;
  35    "and",AND;
  36    "any",ANY;
  37    "as",AS;
  38    "asc",ASC;
  39    "ascii",ASCII;
  40    "begin", BEGIN;
  41    "between",BETWEEN;
  42    "binary",BINARY;
  43    "by",BY;
  44    "bytea",BINARY;
  45    "cascade",CASCADE;
  46    "case", CASE;
  47    "cast", CAST;
  48    "change", CHANGE;
  49    "character",CHARACTER;
  50    "charset",CHARSET;
  51    "check",CHECK;
  52    "collate",COLLATE;
  53    "column",COLUMN;
  54    "comment", COMMENT;
  55    "conflict",CONFLICT;
  56    "constraint",CONSTRAINT;
  57    "convert", CONVERT;
  58    "create",CREATE;
  59    "cross",CROSS;
  60    "current", CURRENT;
  61    "date",DATE;
  62    "day", DAY;
  63    "day_hour", DAY_HOUR;
  64    "day_microsecond", DAY_MICROSECOND;
  65    "day_minute", DAY_MINUTE;
  66    "day_second", DAY_SECOND;
  67    "default",DEFAULT;
  68    "delayed", DELAYED;
  69    "delete",DELETE;
  70    "desc",DESC;
  71    "distinct",DISTINCT;
  72    "div", DIV;
  73    "drop",DROP;
  74    "duplicate", DUPLICATE;
  75    "else", ELSE;
  76    "end", END;
  77    "enum", ENUM;
  78    "escape",ESCAPE;
  79    "except",EXCEPT;
  80    "exists",EXISTS;
  81    "false", FALSE;
  82    "first",FIRST;
  83    "first_value",FIRST_VALUE;
  84    "following", FOLLOWING;
  85    "for", FOR;
  86    "foreign",FOREIGN;
  87    "from",FROM;
  88    "fulltext",FULLTEXT;
  89    "function", FUNCTION;
  90    "global",GLOBAL;
  91    "group",GROUP;
  92    "having",HAVING;
  93    "hour", HOUR;
  94    "hour_microsecond", HOUR_MICROSECOND;
  95    "hour_minute", HOUR_MINUTE;
  96    "hour_second", HOUR_SECOND;
  97    "if",IF;
  98    "in",IN;
  99    "index",INDEX;
 100    "insert",INSERT;
 101    "intersect",INTERSECT;
 102    "interval", INTERVAL;
 103    "into",INTO;
 104    "is", IS;
 105    "join",JOIN;
 106    "straight_join",STRAIGHT_JOIN;
 107    "key",KEY;
 108    "lag", LAG;
 109    "language", LANGUAGE;
 110    "last_value", LAST_VALUE;
 111    "lead", LEAD;
 112    "like", LIKE;
 113    "limit",LIMIT;
 114    "local",LOCAL;
 115    "lock", LOCK;
 116    "microsecond", MICROSECOND;
 117    "minute", MINUTE;
 118    "minute_microsecond", MINUTE_MICROSECOND;
 119    "minute_second", MINUTE_SECOND;
 120    "mod", MOD;
 121    "mode", MODE;
 122    "modify", MODIFY;
 123    "month", MONTH;
 124    "national",NATIONAL;
 125    "natural",NATURAL;
 126    "no", NO;
 127    "not",NOT;
 128    "nowait", NOWAIT;
 129    "null",NULL;
 130    "of", OF;
 131    "offset",OFFSET;
 132    "on",ON;
 133    "or",OR;
 134    "order",ORDER;
 135    "over",OVER;
 136    "partition",PARTITION;
 137    "preceding", PRECEDING;
 138    "precision",PRECISION;
 139    "primary",PRIMARY;
 140    "procedure", PROCEDURE;
 141    "quarter", QUARTER;
 142    "range", RANGE;
 143    "references",REFERENCES;
 144    "rename",RENAME;
 145    "replace",REPLACE;
 146    "restrict",RESTRICT;
 147    "returns", RETURNS;
 148    "row", ROW;
 149    "rows", ROWS;
 150    "second", SECOND;
 151    "second_microsecond", SECOND_MICROSECOND;
 152    "select",SELECT;
 153    "set",SET;
 154    "share", SHARE;
 155    "some",SOME;
 156    "spatial", SPATIAL;
 157    "statement", STATEMENT;
 158    "substr", SUBSTRING;
 159    "substring", SUBSTRING;
 160    "table",TABLE;
 161    "temporary",TEMPORARY;
 162    "then", THEN;
 163    "time",TIME;
 164    "timestamp",TIMESTAMP;
 165    "to",TO;
 166    "true", TRUE;
 167    "unbounded", UNBOUNDED;
 168    "unicode",UNICODE;
 169    "union",UNION;
 170    "unique",UNIQUE;
 171    "unsigned",UNSIGNED;
 172    "update",UPDATE;
 173    "using",USING;
 174    "values",VALUES;
 175    "varying",VARYING;
 176    "view",VIEW;
 177    "week", WEEK;
 178    "when", WHEN;
 179    "where",WHERE;
 180    "with", WITH;
 181    "year", YEAR;
 182    "year_month", YEAR_MONTH;
 183    "generated", GENERATED;
 184    "always", ALWAYS;
 185    "virtual", VIRTUAL;
 186    "stored", STORED;
 187    "left", LEFT;
 188    "right", RIGHT;
 189    "full", FULL;
 190    "inner", INNER;
 191    "outer", OUTER;
 192    "instant", INSTANT;
 193    "inplace", INPLACE;
 194    "algorithm", ALGORITHM;
 195    "copy", COPY;
 196   ] in (* more *)
 197   let all token l = k := !k @ List.map (fun x -> x,token) l in
 198   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 199   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 200   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 201   all LIKE_OP ["glob";"regexp";"match"];
 202   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 203 (* standard built-in types
 204       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 205       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 206       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 207       FLOAT, REAL, DOUBLE PRECISION,
 208       BOOLEAN,
 209       DATE, TIME, TIMESTAMP, INTERVAL
 210     *)
 211   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 212   all T_DECIMAL ["numeric";"decimal";"dec";"fixed"];
 213   all T_INTEGER ["number"]; (* oracle *)
 214   all T_BOOLEAN ["bool";"boolean"];
 215   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 216   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 217   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 218   all T_TEXT ["varchar2"]; (* oracle *)
 219   all T_DATETIME ["datetime"];
 220   all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
 221   !k
 222
 223 (*
 224   Q: Why not convert all input to lowercase before lexing?
 225   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 226 *)
 227
 228 module Keywords = Map.Make(String)
 229
 230 let keywords =
 231   let add map (k,v) =
 232     let k = String.lowercase_ascii k in
 233     if Keywords.mem k map then
 234       failwith (sprintf "Lexeme %s is already associated with keyword." k)
 235     else
 236       Keywords.add k v map
 237   in
 238   List.fold_left add Keywords.empty keywords
 239
 240 (* FIXME case sensitivity??! *)
 241
 242 let get_ident str =
 243   let str = String.lowercase_ascii str in
 244   try Keywords.find str keywords with Not_found -> IDENT str
 245
 246 let ident str = IDENT (String.lowercase_ascii str)
 247
 248 let as_literal ch s =
 249   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 250   sprintf "%c%s%c" ch s ch
 251 }
 252
 253 let digit = ['0'-'9']
 254 let alpha = ['a'-'z' 'A'-'Z']
 255 let ident = (alpha) (alpha | digit | '_' )*
 256 let wsp = [' ' '\r' '\t']
 257 let cmnt = "--" | "//" | "#"
 258
 259 (* extract separate statements *)
 260 rule ruleStatement = parse
 261   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 262   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 263   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 264   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 265   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 266   | "$" (ident? as tag) "$" {
 267     keep_lexeme_start lexbuf (fun () -> let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag))
 268   }
 269   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 270   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 271   | ';' { `Semicolon }
 272   | [^ ';'] as c { `Char c }
 273   | eof { `Eof }
 274 and
 275 (* extract tail of the input *)
 276 ruleTail acc = parse
 277   | eof { acc }
 278   | _* as str { ruleTail (acc ^ str) lexbuf }
 279 and
 280 ruleMain = parse
 281   | wsp   { ruleMain lexbuf }
 282   (* update line number *)
 283   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 284
 285   | '('                { LPAREN }
 286   | ')'                { RPAREN }
 287   | ','   { COMMA }
 288   | '.'   { DOT }
 289   | '{'   { LCURLY (lexeme_start lexbuf) }
 290   | '}'   { RCURLY (lexeme_start lexbuf) }
 291
 292   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 293   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 294
 295   | "*" { ASTERISK }
 296   | "=" { EQUAL }
 297   | "!" { EXCL }
 298   | "~" { TILDE }
 299   | "||" { CONCAT_OP }
 300   | "+" { PLUS }
 301   | "-" { MINUS }
 302
 303   | "/" | "%" { NUM_DIV_OP }
 304   | "<<" | ">>" { NUM_BIT_SHIFT }
 305   | "|" { NUM_BIT_OR }
 306   | "&" { NUM_BIT_AND }
 307   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 308   | "<>" | "!=" | "==" { NUM_EQ_OP }
 309   | "<=>" { NOT_DISTINCT_OP }
 310
 311   | "?" { PARAM { label=None; pos = pos lexbuf } }
 312   | [':' '@'] (ident as str) { PARAM { label = Some str; pos = pos lexbuf } }
 313   | "::" { DOUBLECOLON }
 314
 315   | '"' { keep_lexeme_start lexbuf (fun () -> ident (ruleInQuotes "" lexbuf)) }
 316   | "'" { keep_lexeme_start lexbuf (fun () -> TEXT (ruleInSingleQuotes "" lexbuf)) }
 317   (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
 318   | "$" (ident? as tag) "$" { keep_lexeme_start lexbuf (fun () -> TEXT (ruleInDollarQuotes tag "" lexbuf)) }
 319   | "`" { keep_lexeme_start lexbuf (fun () -> ident (ruleInBackQuotes "" lexbuf)) }
 320   | "[" { keep_lexeme_start lexbuf (fun () -> ident (ruleInBrackets "" lexbuf)) }
 321   | ['x' 'X'] "'" { keep_lexeme_start lexbuf (fun () -> BLOB (ruleInSingleQuotes "" lexbuf)) }
 322
 323   | ident as str { if !Parser_state.mode = Ident then IDENT str (* no keywords, preserve case *) else get_ident str }
 324   | digit+ as str { INTEGER (int_of_string str) }
 325   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 326   | eof         { EOF }
 327   | _   { error lexbuf "ruleMain" }
 328 and
 329 (* FIXME factor out all that ruleIn* rules *)
 330 ruleInQuotes acc = parse
 331   | '"'         { acc }
 332   | eof         { error lexbuf "no terminating quote" }
 333   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 334   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 335   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 336   | _           { error lexbuf "ruleInQuotes" }
 337 and
 338 ruleInBrackets acc = parse
 339   | ']'         { acc }
 340   | eof         { error lexbuf "no terminating bracket" }
 341   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 342 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 343   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 344   | _           { error lexbuf "ruleInBrackets" }
 345 and
 346 ruleInSingleQuotes acc = parse
 347   | '\''              { acc }
 348   | eof         { error lexbuf "no terminating single quote" }
 349   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 350   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 351   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 352   | _           { error lexbuf "ruleInSingleQuotes" }
 353 and
 354 ruleInBackQuotes acc = parse
 355   | '`'         { acc }
 356   | eof         { error lexbuf "no terminating back quote" }
 357   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 358   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 359   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 360   | _           { error lexbuf "ruleInBackQuotes" }
 361 and
 362 ruleInDollarQuotes tag acc = parse
 363   | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
 364   | eof         { error lexbuf "no terminating dollar quote" }
 365   | '\n'        { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
 366   (* match one char at a time to make sure delimiter matches longer *)
 367   | [^'\n']     { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
 368   | _           { error lexbuf "ruleInDollarQuotes" }
 369 and
 370 ruleComment acc = parse
 371   | '\n'        { advance_line lexbuf; acc }
 372   | eof         { acc }
 373   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 374   | _           { error lexbuf "ruleComment"; }
 375 and
 376 ruleCommentMulti acc = parse
 377   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 378   | "*/"        { acc }
 379   | "*"
 380   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 381   | _           { error lexbuf "ruleCommentMulti" }
 382
 383 {
 384
 385   let parse_rule lexbuf =
 386     let token = ruleMain lexbuf in
 387     match !Parser_state.mode with
 388     | Normal -> token
 389     | Ignore ->
 390 (*         eprintf "ignored: %s\n" (lexeme lexbuf); *)
 391       if token = EOF then token else IGNORED
 392     | Ident -> token
 393
 394 }
 395