lib/sql_lexer.mll

   1
   2 {
   3   open Printf
   4   open Lexing
   5   open ExtLib
   6   open Sql_parser
   7   module T = Sql.Type
   8
   9 let error _ callerID =
  10   prerr_endline (sprintf "Lexer error : %s" callerID);
  11 (*      update_pos buf;*)
  12         raise Parsing.Parse_error
  13
  14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  15
  16 let advance_line_pos pos =
  17   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  18
  19 let advance_line lexbuf =
  20   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  21
  22 let keep_lexeme_start lexbuf f =
  23   let start_p = lexeme_start_p lexbuf in
  24   let x = f () in
  25   lexbuf.lex_start_p <- start_p;
  26   x
  27
  28 let keywords =
  29   let k = ref [
  30    "action", ACTION;
  31    "add",ADD;
  32    "after",AFTER;
  33    "all",ALL;
  34    "alter",ALTER;
  35    "and",AND;
  36    "any",ANY;
  37    "as",AS;
  38    "asc",ASC;
  39    "ascii",ASCII;
  40    "begin", BEGIN;
  41    "between",BETWEEN;
  42    "binary",BINARY;
  43    "by",BY;
  44    "bytea",BINARY;
  45    "cascade",CASCADE;
  46    "case", CASE;
  47    "cast", CAST;
  48    "change", CHANGE;
  49    "character",CHARACTER;
  50    "charset",CHARSET;
  51    "check",CHECK;
  52    "collate",COLLATE;
  53    "column",COLUMN;
  54    "comment", COMMENT;
  55    "conflict",CONFLICT;
  56    "constraint",CONSTRAINT;
  57    "convert", CONVERT;
  58    "create",CREATE;
  59    "cross",CROSS;
  60    "current", CURRENT;
  61    "date",DATE;
  62    "day", DAY;
  63    "day_hour", DAY_HOUR;
  64    "day_microsecond", DAY_MICROSECOND;
  65    "day_minute", DAY_MINUTE;
  66    "day_second", DAY_SECOND;
  67    "default",DEFAULT;
  68    "delayed", DELAYED;
  69    "delete",DELETE;
  70    "desc",DESC;
  71    "distinct",DISTINCT;
  72    "div", DIV;
  73    "drop",DROP;
  74    "duplicate", DUPLICATE;
  75    "else", ELSE;
  76    "end", END;
  77    "enum", ENUM;
  78    "escape",ESCAPE;
  79    "except",EXCEPT;
  80    "exists",EXISTS;
  81    "false", FALSE;
  82    "first",FIRST;
  83    "first_value",FIRST_VALUE;
  84    "following", FOLLOWING;
  85    "for", FOR;
  86    "foreign",FOREIGN;
  87    "from",FROM;
  88    "fulltext",FULLTEXT;
  89    "function", FUNCTION;
  90    "global",GLOBAL;
  91    "group",GROUP;
  92    "having",HAVING;
  93    "hour", HOUR;
  94    "hour_microsecond", HOUR_MICROSECOND;
  95    "hour_minute", HOUR_MINUTE;
  96    "hour_second", HOUR_SECOND;
  97    "if",IF;
  98    "in",IN;
  99    "index",INDEX;
 100    "insert",INSERT;
 101    "intersect",INTERSECT;
 102    "interval", INTERVAL;
 103    "into",INTO;
 104    "is", IS;
 105    "join",JOIN;
 106    "key",KEY;
 107    "lag", LAG;
 108    "language", LANGUAGE;
 109    "last_value", LAST_VALUE;
 110    "lead", LEAD;
 111    "like", LIKE;
 112    "limit",LIMIT;
 113    "local",LOCAL;
 114    "lock", LOCK;
 115    "microsecond", MICROSECOND;
 116    "minute", MINUTE;
 117    "minute_microsecond", MINUTE_MICROSECOND;
 118    "minute_second", MINUTE_SECOND;
 119    "mod", MOD;
 120    "mode", MODE;
 121    "modify", MODIFY;
 122    "month", MONTH;
 123    "national",NATIONAL;
 124    "natural",NATURAL;
 125    "no", NO;
 126    "not",NOT;
 127    "nowait", NOWAIT;
 128    "null",NULL;
 129    "of", OF;
 130    "offset",OFFSET;
 131    "on",ON;
 132    "or",OR;
 133    "order",ORDER;
 134    "over",OVER;
 135    "partition",PARTITION;
 136    "preceding", PRECEDING;
 137    "precision",PRECISION;
 138    "primary",PRIMARY;
 139    "procedure", PROCEDURE;
 140    "quarter", QUARTER;
 141    "range", RANGE;
 142    "references",REFERENCES;
 143    "rename",RENAME;
 144    "replace",REPLACE;
 145    "restrict",RESTRICT;
 146    "returns", RETURNS;
 147    "row", ROW;
 148    "rows", ROWS;
 149    "second", SECOND;
 150    "second_microsecond", SECOND_MICROSECOND;
 151    "select",SELECT;
 152    "set",SET;
 153    "share", SHARE;
 154    "some",SOME;
 155    "spatial", SPATIAL;
 156    "substr", SUBSTRING;
 157    "substring", SUBSTRING;
 158    "table",TABLE;
 159    "temporary",TEMPORARY;
 160    "then", THEN;
 161    "time",TIME;
 162    "timestamp",TIMESTAMP;
 163    "to",TO;
 164    "true", TRUE;
 165    "unbounded", UNBOUNDED;
 166    "unicode",UNICODE;
 167    "union",UNION;
 168    "unique",UNIQUE;
 169    "unsigned",UNSIGNED;
 170    "update",UPDATE;
 171    "using",USING;
 172    "values",VALUES;
 173    "varying",VARYING;
 174    "view",VIEW;
 175    "week", WEEK;
 176    "when", WHEN;
 177    "where",WHERE;
 178    "with", WITH;
 179    "year", YEAR;
 180    "year_month", YEAR_MONTH;
 181    "generated", GENERATED;
 182    "always", ALWAYS;
 183    "virtual", VIRTUAL;
 184    "stored", STORED;
 185   ] in (* more *)
 186   let all token l = k := !k @ List.map (fun x -> x,token) l in
 187   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 188   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 189   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 190   all JOIN_TYPE1 ["left";"right";"full"];
 191   all JOIN_TYPE2 ["inner";"outer"];
 192   all LIKE_OP ["glob";"regexp";"match"];
 193   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 194 (* standard built-in types
 195       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 196       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 197       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 198       FLOAT, REAL, DOUBLE PRECISION,
 199       BOOLEAN,
 200       DATE, TIME, TIMESTAMP, INTERVAL
 201     *)
 202   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 203   all T_DECIMAL ["numeric";"decimal";"dec";"fixed"];
 204   all T_INTEGER ["number"]; (* oracle *)
 205   all T_BOOLEAN ["bool";"boolean"];
 206   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 207   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 208   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 209   all T_TEXT ["varchar2"]; (* oracle *)
 210   all T_DATETIME ["datetime"];
 211   all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
 212   !k
 213
 214 (*
 215   Q: Why not convert all input to lowercase before lexing?
 216   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 217 *)
 218
 219 module Keywords = Map.Make(String)
 220
 221 let keywords =
 222   let add map (k,v) =
 223     let k = String.lowercase_ascii k in
 224     if Keywords.mem k map then
 225       failwith (sprintf "Lexeme %s is already associated with keyword." k)
 226     else
 227       Keywords.add k v map
 228   in
 229   List.fold_left add Keywords.empty keywords
 230
 231 (* FIXME case sensitivity??! *)
 232
 233 let get_ident str =
 234   let str = String.lowercase_ascii str in
 235   try Keywords.find str keywords with Not_found -> IDENT str
 236
 237 let ident str = IDENT (String.lowercase_ascii str)
 238
 239 let as_literal ch s =
 240   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 241   sprintf "%c%s%c" ch s ch
 242 }
 243
 244 let digit = ['0'-'9']
 245 let alpha = ['a'-'z' 'A'-'Z']
 246 let ident = (alpha) (alpha | digit | '_' )*
 247 let wsp = [' ' '\r' '\t']
 248 let cmnt = "--" | "//" | "#"
 249
 250 (* extract separate statements *)
 251 rule ruleStatement = parse
 252   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 253   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 254   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 255   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 256   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 257   | "$" (ident? as tag) "$" {
 258     keep_lexeme_start lexbuf (fun () -> let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag))
 259   }
 260   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 261   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 262   | ';' { `Semicolon }
 263   | [^ ';'] as c { `Char c }
 264   | eof { `Eof }
 265 and
 266 (* extract tail of the input *)
 267 ruleTail acc = parse
 268   | eof { acc }
 269   | _* as str { ruleTail (acc ^ str) lexbuf }
 270 and
 271 ruleMain = parse
 272   | wsp   { ruleMain lexbuf }
 273   (* update line number *)
 274   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 275
 276   | '('                { LPAREN }
 277   | ')'                { RPAREN }
 278   | ','   { COMMA }
 279   | '.'   { DOT }
 280   | '{'   { LCURLY (lexeme_start lexbuf) }
 281   | '}'   { RCURLY (lexeme_start lexbuf) }
 282
 283   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 284   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 285
 286   | "*" { ASTERISK }
 287   | "=" { EQUAL }
 288   | "!" { EXCL }
 289   | "~" { TILDE }
 290   | "||" { CONCAT_OP }
 291   | "+" { PLUS }
 292   | "-" { MINUS }
 293
 294   | "/" | "%" { NUM_DIV_OP }
 295   | "<<" | ">>" { NUM_BIT_SHIFT }
 296   | "|" { NUM_BIT_OR }
 297   | "&" { NUM_BIT_AND }
 298   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 299   | "<>" | "!=" | "==" { NUM_EQ_OP }
 300   | "<=>" { NOT_DISTINCT_OP }
 301
 302   | "?" { PARAM { label=None; pos = pos lexbuf } }
 303   | [':' '@'] (ident as str) { PARAM { label = Some str; pos = pos lexbuf } }
 304
 305   | '"' { keep_lexeme_start lexbuf (fun () -> ident (ruleInQuotes "" lexbuf)) }
 306   | "'" { keep_lexeme_start lexbuf (fun () -> TEXT (ruleInSingleQuotes "" lexbuf)) }
 307   (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
 308   | "$" (ident? as tag) "$" { keep_lexeme_start lexbuf (fun () -> TEXT (ruleInDollarQuotes tag "" lexbuf)) }
 309   | "`" { keep_lexeme_start lexbuf (fun () -> ident (ruleInBackQuotes "" lexbuf)) }
 310   | "[" { keep_lexeme_start lexbuf (fun () -> ident (ruleInBrackets "" lexbuf)) }
 311   | ['x' 'X'] "'" { keep_lexeme_start lexbuf (fun () -> BLOB (ruleInSingleQuotes "" lexbuf)) }
 312
 313   | ident as str { if !Parser_state.mode = Ident then IDENT str (* no keywords, preserve case *) else get_ident str }
 314   | digit+ as str { INTEGER (int_of_string str) }
 315   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 316   | eof         { EOF }
 317   | _   { error lexbuf "ruleMain" }
 318 and
 319 (* FIXME factor out all that ruleIn* rules *)
 320 ruleInQuotes acc = parse
 321   | '"'         { acc }
 322   | eof         { error lexbuf "no terminating quote" }
 323   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 324   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 325   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 326   | _           { error lexbuf "ruleInQuotes" }
 327 and
 328 ruleInBrackets acc = parse
 329   | ']'         { acc }
 330   | eof         { error lexbuf "no terminating bracket" }
 331   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 332 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 333   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 334   | _           { error lexbuf "ruleInBrackets" }
 335 and
 336 ruleInSingleQuotes acc = parse
 337   | '\''              { acc }
 338   | eof         { error lexbuf "no terminating single quote" }
 339   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 340   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 341   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 342   | _           { error lexbuf "ruleInSingleQuotes" }
 343 and
 344 ruleInBackQuotes acc = parse
 345   | '`'         { acc }
 346   | eof         { error lexbuf "no terminating back quote" }
 347   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 348   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 349   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 350   | _           { error lexbuf "ruleInBackQuotes" }
 351 and
 352 ruleInDollarQuotes tag acc = parse
 353   | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
 354   | eof         { error lexbuf "no terminating dollar quote" }
 355   | '\n'        { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
 356   (* match one char at a time to make sure delimiter matches longer *)
 357   | [^'\n']     { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
 358   | _           { error lexbuf "ruleInDollarQuotes" }
 359 and
 360 ruleComment acc = parse
 361   | '\n'        { advance_line lexbuf; acc }
 362   | eof         { acc }
 363   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 364   | _           { error lexbuf "ruleComment"; }
 365 and
 366 ruleCommentMulti acc = parse
 367   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 368   | "*/"        { acc }
 369   | "*"
 370   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 371   | _           { error lexbuf "ruleCommentMulti" }
 372
 373 {
 374
 375   let parse_rule lexbuf =
 376     let token = ruleMain lexbuf in
 377     match !Parser_state.mode with
 378     | Normal -> token
 379     | Ignore ->
 380 (*         eprintf "ignored: %s\n" (lexeme lexbuf); *)
 381       if token = EOF then token else IGNORED
 382     | Ident -> token
 383
 384 }
 385