lib/sql_lexer.mll

   1
   2 {
   3   open Printf
   4   open Lexing
   5   open ExtLib
   6   open Sql_parser
   7   module T = Sql.Type
   8
   9 let error _ callerID =
  10   prerr_endline (sprintf "Lexer error : %s" callerID);
  11 (*      update_pos buf;*)
  12         raise Parsing.Parse_error
  13
  14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  15
  16 let advance_line_pos pos =
  17   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  18
  19 let advance_line lexbuf =
  20   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  21
  22 let keywords =
  23   let k = ref [
  24    "as",AS;
  25    "on",ON;
  26    "conflict",CONFLICT;
  27    "using",USING;
  28    "natural",NATURAL;
  29    "join",JOIN;
  30    "between",BETWEEN;
  31    "and",AND;
  32    "escape",ESCAPE;
  33    "not",NOT;
  34    "null",NULL;
  35    "unique",UNIQUE;
  36    "primary",PRIMARY;
  37    "key",KEY;
  38    "default",DEFAULT;
  39    "precision",PRECISION;
  40    "varying",VARYING;
  41    "charset",CHARSET;
  42    "collate",COLLATE;
  43    "national",NATIONAL;
  44    "ascii",ASCII;
  45    "unicode",UNICODE;
  46    "distinct",DISTINCT;
  47    "character",CHARACTER;
  48    "binary",BINARY;
  49    "bytea",BINARY;
  50    "all",ALL;
  51    "any",ANY;
  52    "some",SOME;
  53    "order",ORDER;
  54    "by",BY;
  55    "limit",LIMIT;
  56    "desc",DESC;
  57    "asc",ASC;
  58    "offset",OFFSET;
  59    "select",SELECT;
  60    "create",CREATE;
  61    "table",TABLE;
  62    "view",VIEW;
  63    "insert",INSERT;
  64    "replace",REPLACE;
  65    "update",UPDATE;
  66    "delete",DELETE;
  67    "from",FROM;
  68    "or",OR;
  69    "into",INTO;
  70    "values",VALUES;
  71    "where",WHERE;
  72    "set",SET;
  73    "in",IN;
  74    "group",GROUP;
  75    "having",HAVING;
  76    "union",UNION;
  77    "except",EXCEPT;
  78    "intersect",INTERSECT;
  79    "cross",CROSS;
  80    "temporary",TEMPORARY;
  81    "if",IF;
  82    "exists",EXISTS;
  83    "foreign",FOREIGN;
  84    "global",GLOBAL;
  85    "local",LOCAL;
  86    "value",VALUE;
  87    "references",REFERENCES;
  88    "check",CHECK;
  89    "date",DATE;
  90    "time",TIME;
  91    "timestamp",TIMESTAMP;
  92    "alter",ALTER;
  93    "add",ADD;
  94    "cascade",CASCADE;
  95    "restrict",RESTRICT;
  96    "drop",DROP;
  97    "constraint",CONSTRAINT;
  98    "after",AFTER;
  99    "index",INDEX;
 100    "fulltext",FULLTEXT;
 101    "unsigned",UNSIGNED;
 102    "first",FIRST;
 103    "column",COLUMN;
 104    "like", LIKE;
 105    "case", CASE;
 106    "when", WHEN;
 107    "then", THEN;
 108    "else", ELSE;
 109    "end", END;
 110    "change", CHANGE;
 111    "modify", MODIFY;
 112    "delayed", DELAYED;
 113    "enum", ENUM;
 114    "for", FOR;
 115    "share", SHARE;
 116    "mode", MODE;
 117    "lock", LOCK;
 118    "of", OF;
 119    "with", WITH;
 120    "nowait", NOWAIT;
 121    "action", ACTION;
 122    "no", NO;
 123    "is", IS;
 124    "interval", INTERVAL;
 125    "microsecond", MICROSECOND;
 126    "second", SECOND;
 127    "minute", MINUTE;
 128    "hour", HOUR;
 129    "day", DAY;
 130    "week", WEEK;
 131    "month", MONTH;
 132    "quarter", QUARTER;
 133    "year", YEAR;
 134    "second_microsecond", SECOND_MICROSECOND;
 135    "minute_microsecond", MINUTE_MICROSECOND;
 136    "minute_second", MINUTE_SECOND;
 137    "hour_microsecond", HOUR_MICROSECOND;
 138    "hour_second", HOUR_SECOND;
 139    "hour_minute", HOUR_MINUTE;
 140    "day_microsecond", DAY_MICROSECOND;
 141    "day_second", DAY_SECOND;
 142    "day_minute", DAY_MINUTE;
 143    "day_hour", DAY_HOUR;
 144    "year_month", YEAR_MONTH;
 145    "false", FALSE;
 146    "true", TRUE;
 147    "duplicate", DUPLICATE;
 148    "function", FUNCTION;
 149    "procedure", PROCEDURE;
 150    "returns", RETURNS;
 151    "begin", BEGIN;
 152    "comment", COMMENT;
 153    "language", LANGUAGE;
 154   ] in (* more *)
 155   let all token l = k := !k @ List.map (fun x -> x,token) l in
 156   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 157   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 158   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 159   all JOIN_TYPE1 ["left";"right";"full"];
 160   all JOIN_TYPE2 ["inner";"outer"];
 161   all LIKE_OP ["glob";"regexp";"match"];
 162   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 163 (* standard built-in types
 164       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 165       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 166       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 167       FLOAT, REAL, DOUBLE PRECISION,
 168       BOOLEAN,
 169       DATE, TIME, TIMESTAMP, INTERVAL
 170     *)
 171   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 172   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 173   all T_INTEGER ["number"]; (* oracle *)
 174   all T_BOOLEAN ["bool";"boolean"];
 175   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 176   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 177   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 178   all T_TEXT ["varchar2"]; (* oracle *)
 179   all T_DATETIME ["datetime"];
 180   all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
 181   !k
 182
 183 (*
 184   Q: Why not convert all input to lowercase before lexing?
 185   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 186 *)
 187
 188 module Keywords = Map.Make(String)
 189
 190 let keywords =
 191   let add map (k,v) =
 192     let k = String.lowercase k in
 193     if Keywords.mem k map then
 194       failwith (sprintf "Lexeme %s is already associated with keyword." k)
 195     else
 196       Keywords.add k v map
 197   in
 198   List.fold_left add Keywords.empty keywords
 199
 200 (* FIXME case sensitivity??! *)
 201
 202 let get_ident str =
 203   let str = String.lowercase str in
 204   try Keywords.find str keywords with Not_found -> IDENT str
 205
 206 let ident str = IDENT (String.lowercase str)
 207
 208 let as_literal ch s =
 209   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 210   sprintf "%c%s%c" ch s ch
 211 }
 212
 213 let digit = ['0'-'9']
 214 let alpha = ['a'-'z' 'A'-'Z']
 215 let ident = (alpha) (alpha | digit | '_' )*
 216 let wsp = [' ' '\r' '\t']
 217 let cmnt = "--" | "//" | "#"
 218
 219 (* extract separate statements *)
 220 rule ruleStatement = parse
 221   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 222   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 223   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 224   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 225   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 226   | "$" (ident? as tag) "$" { let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag) }
 227   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 228   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 229   | ';' { `Semicolon }
 230   | [^ ';'] as c { `Char c }
 231   | eof { `Eof }
 232 and
 233 (* extract tail of the input *)
 234 ruleTail acc = parse
 235   | eof { acc }
 236   | _* as str { ruleTail (acc ^ str) lexbuf }
 237 and
 238 ruleMain = parse
 239   | wsp   { ruleMain lexbuf }
 240   (* update line number *)
 241   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 242
 243   | '('         { LPAREN }
 244   | ')'         { RPAREN }
 245   | ','   { COMMA }
 246   | '.'   { DOT }
 247
 248   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 249   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 250
 251   | "*" { ASTERISK }
 252   | "=" { EQUAL }
 253   | "!" { EXCL }
 254   | "~" { TILDE }
 255   | "||" { CONCAT_OP }
 256   | "+" { PLUS }
 257   | "-" { MINUS }
 258
 259   | "/" | "%" { NUM_DIV_OP }
 260   | "<<" | ">>" | "|" | "&" { NUM_BIT_OP }
 261   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 262   | "<>" | "!=" | "==" { NUM_EQ_OP }
 263   | "<=>" { NOT_DISTINCT_OP }
 264
 265   | "?" { PARAM (None,pos lexbuf) }
 266   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 267
 268   | '"' { ident (ruleInQuotes "" lexbuf) }
 269   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 270   (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
 271   | "$" (ident? as tag) "$" { TEXT (ruleInDollarQuotes tag "" lexbuf) }
 272   | "`" { ident (ruleInBackQuotes "" lexbuf) }
 273   | "[" { ident (ruleInBrackets "" lexbuf) }
 274   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 275
 276   | ident as str { get_ident str }
 277   | digit+ as str { INTEGER (int_of_string str) }
 278   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 279   | eof         { EOF }
 280   | _   { error lexbuf "ruleMain" }
 281 and
 282 (* FIXME factor out all that ruleIn* rules *)
 283 ruleInQuotes acc = parse
 284   | '"'         { acc }
 285   | eof         { error lexbuf "no terminating quote" }
 286   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 287   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 288   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 289   | _           { error lexbuf "ruleInQuotes" }
 290 and
 291 ruleInBrackets acc = parse
 292   | ']'         { acc }
 293   | eof         { error lexbuf "no terminating bracket" }
 294   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 295 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 296   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 297   | _           { error lexbuf "ruleInBrackets" }
 298 and
 299 ruleInSingleQuotes acc = parse
 300   | '\''              { acc }
 301   | eof         { error lexbuf "no terminating single quote" }
 302   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 303   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 304   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 305   | _           { error lexbuf "ruleInSingleQuotes" }
 306 and
 307 ruleInBackQuotes acc = parse
 308   | '`'         { acc }
 309   | eof         { error lexbuf "no terminating back quote" }
 310   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 311   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 312   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 313   | _           { error lexbuf "ruleInBackQuotes" }
 314 and
 315 ruleInDollarQuotes tag acc = parse
 316   | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
 317   | eof         { error lexbuf "no terminating dollar quote" }
 318   | '\n'        { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
 319   (* match one char at a time to make sure delimiter matches longer *)
 320   | [^'\n']     { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
 321   | _           { error lexbuf "ruleInDollarQuotes" }
 322 and
 323 ruleComment acc = parse
 324   | '\n'        { advance_line lexbuf; acc }
 325   | eof         { acc }
 326   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 327   | _           { error lexbuf "ruleComment"; }
 328 and
 329 ruleCommentMulti acc = parse
 330   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 331   | "*/"        { acc }
 332   | "*"
 333   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 334   | _           { error lexbuf "ruleCommentMulti" }
 335
 336 {
 337
 338   let parse_rule lexbuf =
 339     let module P = Parser_state in
 340     let token = ruleMain lexbuf in
 341     match !P.mode with
 342     | P.Normal -> token
 343     | P.Ignore ->
 344 (*         eprintf "ignored: %s\n" (lexeme lexbuf); *)
 345       if (token = EOF) then token else IGNORED
 346
 347 }
 348