lib/sql_lexer.mll

   1
   2 {
   3   open Printf
   4   open Lexing
   5   open ExtLib
   6   open Sql_parser
   7   module T = Sql.Type
   8
   9 let error _ callerID =
  10   prerr_endline (sprintf "Lexer error : %s" callerID);
  11 (*      update_pos buf;*)
  12         raise Parsing.Parse_error
  13
  14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  15
  16 let advance_line_pos pos =
  17   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  18
  19 let advance_line lexbuf =
  20   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  21
  22 let keywords =
  23   let k = ref [
  24    "as",AS;
  25    "on",ON;
  26    "conflict",CONFLICT;
  27    "using",USING;
  28    "natural",NATURAL;
  29    "join",JOIN;
  30    "between",BETWEEN;
  31    "and",AND;
  32    "escape",ESCAPE;
  33    "not",NOT;
  34    "null",NULL;
  35    "unique",UNIQUE;
  36    "primary",PRIMARY;
  37    "key",KEY;
  38    "default",DEFAULT;
  39    "precision",PRECISION;
  40    "varying",VARYING;
  41    "charset",CHARSET;
  42    "collate",COLLATE;
  43    "national",NATIONAL;
  44    "ascii",ASCII;
  45    "unicode",UNICODE;
  46    "distinct",DISTINCT;
  47    "character",CHARACTER;
  48    "binary",BINARY;
  49    "bytea",BINARY;
  50    "all",ALL;
  51    "any",ANY;
  52    "some",SOME;
  53    "order",ORDER;
  54    "by",BY;
  55    "limit",LIMIT;
  56    "desc",DESC;
  57    "asc",ASC;
  58    "offset",OFFSET;
  59    "select",SELECT;
  60    "create",CREATE;
  61    "table",TABLE;
  62    "view",VIEW;
  63    "insert",INSERT;
  64    "replace",REPLACE;
  65    "update",UPDATE;
  66    "delete",DELETE;
  67    "from",FROM;
  68    "or",OR;
  69    "into",INTO;
  70    "values",VALUES;
  71    "where",WHERE;
  72    "set",SET;
  73    "in",IN;
  74    "group",GROUP;
  75    "having",HAVING;
  76    "union",UNION;
  77    "except",EXCEPT;
  78    "intersect",INTERSECT;
  79    "cross",CROSS;
  80    "temporary",TEMPORARY;
  81    "if",IF;
  82    "exists",EXISTS;
  83    "foreign",FOREIGN;
  84    "global",GLOBAL;
  85    "local",LOCAL;
  86    "value",VALUE;
  87    "references",REFERENCES;
  88    "check",CHECK;
  89    "date",DATE;
  90    "time",TIME;
  91    "timestamp",TIMESTAMP;
  92    "alter",ALTER;
  93    "add",ADD;
  94    "cascade",CASCADE;
  95    "restrict",RESTRICT;
  96    "drop",DROP;
  97    "constraint",CONSTRAINT;
  98    "after",AFTER;
  99    "index",INDEX;
 100    "fulltext",FULLTEXT;
 101    "unsigned",UNSIGNED;
 102    "first",FIRST;
 103    "column",COLUMN;
 104    "like", LIKE;
 105    "case", CASE;
 106    "when", WHEN;
 107    "then", THEN;
 108    "else", ELSE;
 109    "end", END;
 110    "change", CHANGE;
 111    "modify", MODIFY;
 112    "delayed", DELAYED;
 113    "enum", ENUM;
 114    "for", FOR;
 115    "share", SHARE;
 116    "mode", MODE;
 117    "lock", LOCK;
 118    "of", OF;
 119    "with", WITH;
 120    "nowait", NOWAIT;
 121    "action", ACTION;
 122    "no", NO;
 123    "is", IS;
 124    "interval", INTERVAL;
 125    "microsecond", MICROSECOND;
 126    "second", SECOND;
 127    "minute", MINUTE;
 128    "hour", HOUR;
 129    "day", DAY;
 130    "week", WEEK;
 131    "month", MONTH;
 132    "quarter", QUARTER;
 133    "year", YEAR;
 134    "second_microsecond", SECOND_MICROSECOND;
 135    "minute_microsecond", MINUTE_MICROSECOND;
 136    "minute_second", MINUTE_SECOND;
 137    "hour_microsecond", HOUR_MICROSECOND;
 138    "hour_second", HOUR_SECOND;
 139    "hour_minute", HOUR_MINUTE;
 140    "day_microsecond", DAY_MICROSECOND;
 141    "day_second", DAY_SECOND;
 142    "day_minute", DAY_MINUTE;
 143    "day_hour", DAY_HOUR;
 144    "year_month", YEAR_MONTH;
 145    "false", FALSE;
 146    "true", TRUE;
 147    "duplicate", DUPLICATE;
 148   ] in (* more *)
 149   let all token l = k := !k @ List.map (fun x -> x,token) l in
 150   let func x l = all (FUNCTION x) l in
 151   func T.Agg ["max";"min";"sum"];
 152   func T.(Group (Int,true)) ["count"];
 153   func T.(Group (Float,false)) ["avg"];
 154   func T.(fixed Text [Text;Text]) ["strftime"];
 155   func T.(fixed Text [Text]) ["lower";"upper"];
 156   func T.(Ret Text) ["concat"];
 157   func T.(Ret Any) ["coalesce"];
 158   func T.(Ret Int) ["length"; "random";"unix_timestamp";"least";"greatest"];
 159   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 160   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 161   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 162   all JOIN_TYPE1 ["left";"right";"full"];
 163   all JOIN_TYPE2 ["inner";"outer"];
 164   all LIKE_OP ["glob";"regexp";"match"];
 165   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 166 (* standard built-in types
 167       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 168       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 169       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 170       FLOAT, REAL, DOUBLE PRECISION,
 171       BOOLEAN,
 172       DATE, TIME, TIMESTAMP, INTERVAL
 173     *)
 174   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 175   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 176   all T_INTEGER ["number"]; (* oracle *)
 177   all T_BOOLEAN ["bool";"boolean"];
 178   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 179   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 180   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 181   all T_TEXT ["varchar2"]; (* oracle *)
 182   all T_DATETIME ["datetime"];
 183   all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
 184   !k
 185
 186 (*
 187   Q: Why not convert all input to lowercase before lexing?
 188   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 189 *)
 190
 191 module Keywords = Map.Make(String)
 192
 193 let keywords =
 194   let add map (k,v) =
 195     let k = String.lowercase k in
 196     if Keywords.mem k map then
 197       failwith (sprintf "Lexeme %s is already associated with keyword." k)
 198     else
 199       Keywords.add k v map
 200   in
 201   List.fold_left add Keywords.empty keywords
 202
 203 (* FIXME case sensitivity??! *)
 204
 205 let get_ident str =
 206   let str = String.lowercase str in
 207   try Keywords.find str keywords with Not_found -> IDENT str
 208
 209 let ident str = IDENT (String.lowercase str)
 210
 211 let as_literal ch s =
 212   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 213   sprintf "%c%s%c" ch s ch
 214 }
 215
 216 let digit = ['0'-'9']
 217 let alpha = ['a'-'z' 'A'-'Z']
 218 let ident = (alpha) (alpha | digit | '_' )*
 219 let wsp = [' ' '\r' '\t']
 220 let cmnt = "--" | "//" | "#"
 221
 222 (* extract separate statements *)
 223 rule ruleStatement = parse
 224   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 225   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 226   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 227   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 228   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 229   | "$" (ident? as tag) "$" { let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag) }
 230   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 231   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 232   | ';' { `Semicolon }
 233   | [^ ';'] as c { `Char c }
 234   | eof { `Eof }
 235 and
 236 (* extract tail of the input *)
 237 ruleTail acc = parse
 238   | eof { acc }
 239   | _* as str { ruleTail (acc ^ str) lexbuf }
 240 and
 241 ruleMain = parse
 242   | wsp   { ruleMain lexbuf }
 243   (* update line number *)
 244   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 245
 246   | '('         { LPAREN }
 247   | ')'         { RPAREN }
 248   | ','   { COMMA }
 249   | '.'   { DOT }
 250
 251   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 252   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 253
 254   | "*" { ASTERISK }
 255   | "=" { EQUAL }
 256   | "!" { EXCL }
 257   | "~" { TILDE }
 258   | "||" { CONCAT_OP }
 259   | "+" { PLUS }
 260   | "-" { MINUS }
 261
 262   | "/" | "%" { NUM_DIV_OP }
 263   | "<<" | ">>" | "|" | "&" { NUM_BIT_OP }
 264   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 265   | "<>" | "!=" | "==" { NUM_EQ_OP }
 266
 267   | "?" { PARAM (None,pos lexbuf) }
 268   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 269
 270   | '"' { ident (ruleInQuotes "" lexbuf) }
 271   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 272   (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
 273   | "$" (ident? as tag) "$" { TEXT (ruleInDollarQuotes tag "" lexbuf) }
 274   | "`" { ident (ruleInBackQuotes "" lexbuf) }
 275   | "[" { ident (ruleInBrackets "" lexbuf) }
 276   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 277
 278   | ident as str { get_ident str }
 279   | digit+ as str { INTEGER (int_of_string str) }
 280   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 281   | eof         { EOF }
 282   | _   { error lexbuf "ruleMain" }
 283 and
 284 (* FIXME factor out all that ruleIn* rules *)
 285 ruleInQuotes acc = parse
 286   | '"'         { acc }
 287   | eof         { error lexbuf "no terminating quote" }
 288   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 289   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 290   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 291   | _           { error lexbuf "ruleInQuotes" }
 292 and
 293 ruleInBrackets acc = parse
 294   | ']'         { acc }
 295   | eof         { error lexbuf "no terminating bracket" }
 296   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 297 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 298   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 299   | _           { error lexbuf "ruleInBrackets" }
 300 and
 301 ruleInSingleQuotes acc = parse
 302   | '\''              { acc }
 303   | eof         { error lexbuf "no terminating single quote" }
 304   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 305   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 306   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 307   | _           { error lexbuf "ruleInSingleQuotes" }
 308 and
 309 ruleInBackQuotes acc = parse
 310   | '`'         { acc }
 311   | eof         { error lexbuf "no terminating back quote" }
 312   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 313   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 314   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 315   | _           { error lexbuf "ruleInBackQuotes" }
 316 and
 317 ruleInDollarQuotes tag acc = parse
 318   | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
 319   | eof         { error lexbuf "no terminating dollar quote" }
 320   | '\n'        { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
 321   (* match one char at a time to make sure delimiter matches longer *)
 322   | [^'\n']     { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
 323   | _           { error lexbuf "ruleInDollarQuotes" }
 324 and
 325 ruleComment acc = parse
 326   | '\n'        { advance_line lexbuf; acc }
 327   | eof         { acc }
 328   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 329   | _           { error lexbuf "ruleComment"; }
 330 and
 331 ruleCommentMulti acc = parse
 332   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 333   | "*/"        { acc }
 334   | "*"
 335   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 336   | _           { error lexbuf "ruleCommentMulti" }
 337
 338 {
 339
 340   let parse_rule lexbuf =
 341     let module P = Parser_state in
 342     let token = ruleMain lexbuf in
 343     match !P.mode with
 344     | P.Normal -> token
 345     | P.Ignore ->
 346 (*         eprintf "ignored: %s\n" (lexeme lexbuf); *)
 347       if (token = EOF) then token else IGNORED
 348
 349 }
 350