lib/sql_lexer.mll

   1
   2 {
   3   open Printf
   4   open Lexing
   5   open ExtLib
   6   open Sql_parser
   7   module T = Sql.Type
   8
   9 let error _ callerID =
  10   prerr_endline (sprintf "Lexer error : %s" callerID);
  11 (*      update_pos buf;*)
  12         raise Parsing.Parse_error
  13
  14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  15
  16 let advance_line_pos pos =
  17   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  18
  19 let advance_line lexbuf =
  20   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  21
  22 let keywords =
  23   let k = ref [
  24    "as",AS;
  25    "on",ON;
  26    "conflict",CONFLICT;
  27    "using",USING;
  28    "natural",NATURAL;
  29    "join",JOIN;
  30    "between",BETWEEN;
  31    "and",AND;
  32    "escape",ESCAPE;
  33    "not",NOT;
  34    "null",NULL;
  35    "unique",UNIQUE;
  36    "primary",PRIMARY;
  37    "key",KEY;
  38    "default",DEFAULT;
  39    "precision",PRECISION;
  40    "varying",VARYING;
  41    "charset",CHARSET;
  42    "collate",COLLATE;
  43    "national",NATIONAL;
  44    "ascii",ASCII;
  45    "unicode",UNICODE;
  46    "distinct",DISTINCT;
  47    "character",CHARACTER;
  48    "binary",BINARY;
  49    "bytea",BINARY;
  50    "all",ALL;
  51    "any",ANY;
  52    "some",SOME;
  53    "order",ORDER;
  54    "by",BY;
  55    "limit",LIMIT;
  56    "desc",DESC;
  57    "asc",ASC;
  58    "offset",OFFSET;
  59    "select",SELECT;
  60    "create",CREATE;
  61    "table",TABLE;
  62    "view",VIEW;
  63    "insert",INSERT;
  64    "replace",REPLACE;
  65    "update",UPDATE;
  66    "delete",DELETE;
  67    "from",FROM;
  68    "or",OR;
  69    "into",INTO;
  70    "values",VALUES;
  71    "where",WHERE;
  72    "set",SET;
  73    "in",IN;
  74    "group",GROUP;
  75    "having",HAVING;
  76    "union",UNION;
  77    "except",EXCEPT;
  78    "intersect",INTERSECT;
  79    "cross",CROSS;
  80    "temporary",TEMPORARY;
  81    "if",IF;
  82    "exists",EXISTS;
  83    "foreign",FOREIGN;
  84    "global",GLOBAL;
  85    "local",LOCAL;
  86    "value",VALUE;
  87    "references",REFERENCES;
  88    "check",CHECK;
  89    "date",DATE;
  90    "time",TIME;
  91    "timestamp",TIMESTAMP;
  92    "alter",ALTER;
  93    "add",ADD;
  94    "cascade",CASCADE;
  95    "restrict",RESTRICT;
  96    "drop",DROP;
  97    "constraint",CONSTRAINT;
  98    "after",AFTER;
  99    "index",INDEX;
 100    "fulltext",FULLTEXT;
 101    "unsigned",UNSIGNED;
 102    "first",FIRST;
 103    "column",COLUMN;
 104    "like", LIKE;
 105    "case", CASE;
 106    "when", WHEN;
 107    "then", THEN;
 108    "else", ELSE;
 109    "end", END;
 110    "change", CHANGE;
 111    "modify", MODIFY;
 112    "delayed", DELAYED;
 113    "enum", ENUM;
 114    "for", FOR;
 115    "share", SHARE;
 116    "mode", MODE;
 117    "lock", LOCK;
 118    "of", OF;
 119    "with", WITH;
 120    "nowait", NOWAIT;
 121    "action", ACTION;
 122    "no", NO;
 123    "is", IS;
 124    "interval", INTERVAL;
 125    "microsecond", MICROSECOND;
 126    "second", SECOND;
 127    "minute", MINUTE;
 128    "hour", HOUR;
 129    "day", DAY;
 130    "week", WEEK;
 131    "month", MONTH;
 132    "quarter", QUARTER;
 133    "year", YEAR;
 134    "second_microsecond", SECOND_MICROSECOND;
 135    "minute_microsecond", MINUTE_MICROSECOND;
 136    "minute_second", MINUTE_SECOND;
 137    "hour_microsecond", HOUR_MICROSECOND;
 138    "hour_second", HOUR_SECOND;
 139    "hour_minute", HOUR_MINUTE;
 140    "day_microsecond", DAY_MICROSECOND;
 141    "day_second", DAY_SECOND;
 142    "day_minute", DAY_MINUTE;
 143    "day_hour", DAY_HOUR;
 144    "year_month", YEAR_MONTH;
 145    "false", FALSE;
 146    "true", TRUE;
 147    "duplicate", DUPLICATE;
 148   ] in (* more *)
 149   let all token l = k := !k @ List.map (fun x -> x,token) l in
 150   let func x l = all (FUNCTION x) l in
 151   func T.Agg ["max";"min";"sum"];
 152   func T.(Group (Int,true)) ["count"];
 153   func T.(Group (Float,false)) ["avg"];
 154   func T.(fixed Text [Text;Text]) ["strftime"];
 155   func T.(fixed Text [Text]) ["lower";"upper"];
 156   func T.(Ret Text) ["concat"];
 157   func T.(Ret Any) ["coalesce"];
 158   func T.(Ret Int) ["length"; "random";"unix_timestamp";"least";"greatest"];
 159   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 160   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 161   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 162   all JOIN_TYPE1 ["left";"right";"full"];
 163   all JOIN_TYPE2 ["inner";"outer"];
 164   all LIKE_OP ["glob";"regexp";"match"];
 165   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 166 (* standard built-in types
 167       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 168       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 169       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 170       FLOAT, REAL, DOUBLE PRECISION,
 171       BOOLEAN,
 172       DATE, TIME, TIMESTAMP, INTERVAL
 173     *)
 174   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 175   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 176   all T_INTEGER ["number"]; (* oracle *)
 177   all T_BOOLEAN ["bool";"boolean"];
 178   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 179   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 180   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 181   all T_TEXT ["varchar2"]; (* oracle *)
 182   all T_DATETIME ["datetime"];
 183   !k
 184
 185 (*
 186   Q: Why not convert all input to lowercase before lexing?
 187   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 188 *)
 189
 190 module Keywords = Map.Make(String)
 191
 192 let keywords =
 193   let add map (k,v) =
 194     let k = String.lowercase k in
 195     if Keywords.mem k map then
 196       failwith (sprintf "Lexeme %s is already associated with keyword." k)
 197     else
 198       Keywords.add k v map
 199   in
 200   List.fold_left add Keywords.empty keywords
 201
 202 (* FIXME case sensitivity??! *)
 203
 204 let get_ident str =
 205   let str = String.lowercase str in
 206   try Keywords.find str keywords with Not_found -> IDENT str
 207
 208 let ident str = IDENT (String.lowercase str)
 209
 210 let as_literal ch s =
 211   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 212   sprintf "%c%s%c" ch s ch
 213 }
 214
 215 let digit = ['0'-'9']
 216 let alpha = ['a'-'z' 'A'-'Z']
 217 let ident = (alpha) (alpha | digit | '_' )*
 218 let wsp = [' ' '\r' '\t']
 219 let cmnt = "--" | "//" | "#"
 220
 221 (* extract separate statements *)
 222 rule ruleStatement = parse
 223   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 224   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 225   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 226   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 227   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 228   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 229   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 230   | ';' { `Semicolon }
 231   | [^ ';'] as c { `Char c }
 232   | eof { `Eof }
 233 and
 234 (* extract tail of the input *)
 235 ruleTail acc = parse
 236   | eof { acc }
 237   | _* as str { ruleTail (acc ^ str) lexbuf }
 238 and
 239 ruleMain = parse
 240   | wsp   { ruleMain lexbuf }
 241   (* update line number *)
 242   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 243
 244   | '('         { LPAREN }
 245   | ')'         { RPAREN }
 246   | ','   { COMMA }
 247   | '.'   { DOT }
 248
 249   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 250   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 251
 252   | "*" { ASTERISK }
 253   | "=" { EQUAL }
 254   | "!" { EXCL }
 255   | "~" { TILDE }
 256   | "||" { CONCAT_OP }
 257   | "+" { PLUS }
 258   | "-" { MINUS }
 259
 260   | "/" | "%" { NUM_DIV_OP }
 261   | "<<" | ">>" | "|" | "&" { NUM_BIT_OP }
 262   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 263   | "<>" | "!=" | "==" { NUM_EQ_OP }
 264
 265   | "?" { PARAM (None,pos lexbuf) }
 266   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 267
 268   | '"' { ident (ruleInQuotes "" lexbuf) }
 269   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 270   | "`" { ident (ruleInBackQuotes "" lexbuf) }
 271   | "[" { ident (ruleInBrackets "" lexbuf) }
 272   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 273
 274   | ident as str { get_ident str }
 275   | digit+ as str { INTEGER (int_of_string str) }
 276   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 277   | eof         { EOF }
 278   | _   { error lexbuf "ruleMain" }
 279 and
 280 (* FIXME factor out all that ruleIn* rules *)
 281 ruleInQuotes acc = parse
 282   | '"'         { acc }
 283   | eof         { error lexbuf "no terminating quote" }
 284   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 285   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 286   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 287   | _           { error lexbuf "ruleInQuotes" }
 288 and
 289 ruleInBrackets acc = parse
 290   | ']'         { acc }
 291   | eof         { error lexbuf "no terminating bracket" }
 292   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 293 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 294   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 295   | _           { error lexbuf "ruleInBrackets" }
 296 and
 297 ruleInSingleQuotes acc = parse
 298   | '\''              { acc }
 299   | eof         { error lexbuf "no terminating single quote" }
 300   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 301   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 302   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 303   | _           { error lexbuf "ruleInSingleQuotes" }
 304 and
 305 ruleInBackQuotes acc = parse
 306   | '`'         { acc }
 307   | eof         { error lexbuf "no terminating back quote" }
 308   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 309   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 310   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 311   | _           { error lexbuf "ruleInBackQuotes" }
 312 and
 313 ruleComment acc = parse
 314   | '\n'        { advance_line lexbuf; acc }
 315   | eof         { acc }
 316   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 317   | _           { error lexbuf "ruleComment"; }
 318 and
 319 ruleCommentMulti acc = parse
 320   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 321   | "*/"        { acc }
 322   | "*"
 323   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 324   | _           { error lexbuf "ruleCommentMulti" }
 325
 326 {
 327
 328   let parse_rule lexbuf =
 329     let module P = Parser_state in
 330     let token = ruleMain lexbuf in
 331     match !P.mode with
 332     | P.Normal -> token
 333     | P.Ignore ->
 334 (*         eprintf "ignored: %s\n" (lexeme lexbuf); *)
 335       if (token = EOF) then token else IGNORED
 336
 337 }
 338