lib/sql_lexer.mll

   1
   2 {
   3   open Printf
   4   open Lexing
   5   open ExtLib
   6   open Sql_parser
   7   module T = Sql.Type
   8
   9 let error _ callerID =
  10   prerr_endline (sprintf "Lexer error : %s" callerID);
  11 (*      update_pos buf;*)
  12         raise Parsing.Parse_error
  13
  14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  15
  16 let advance_line_pos pos =
  17   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  18
  19 let advance_line lexbuf =
  20   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  21
  22 let keywords =
  23   let k = ref [
  24    "action", ACTION;
  25    "add",ADD;
  26    "after",AFTER;
  27    "all",ALL;
  28    "alter",ALTER;
  29    "and",AND;
  30    "any",ANY;
  31    "as",AS;
  32    "asc",ASC;
  33    "ascii",ASCII;
  34    "begin", BEGIN;
  35    "between",BETWEEN;
  36    "binary",BINARY;
  37    "by",BY;
  38    "bytea",BINARY;
  39    "cascade",CASCADE;
  40    "case", CASE;
  41    "change", CHANGE;
  42    "character",CHARACTER;
  43    "charset",CHARSET;
  44    "check",CHECK;
  45    "collate",COLLATE;
  46    "column",COLUMN;
  47    "comment", COMMENT;
  48    "conflict",CONFLICT;
  49    "constraint",CONSTRAINT;
  50    "convert", CONVERT;
  51    "create",CREATE;
  52    "cross",CROSS;
  53    "date",DATE;
  54    "day", DAY;
  55    "day_hour", DAY_HOUR;
  56    "day_microsecond", DAY_MICROSECOND;
  57    "day_minute", DAY_MINUTE;
  58    "day_second", DAY_SECOND;
  59    "default",DEFAULT;
  60    "delayed", DELAYED;
  61    "delete",DELETE;
  62    "desc",DESC;
  63    "distinct",DISTINCT;
  64    "div", DIV;
  65    "drop",DROP;
  66    "duplicate", DUPLICATE;
  67    "else", ELSE;
  68    "end", END;
  69    "enum", ENUM;
  70    "escape",ESCAPE;
  71    "except",EXCEPT;
  72    "exists",EXISTS;
  73    "false", FALSE;
  74    "first",FIRST;
  75    "for", FOR;
  76    "foreign",FOREIGN;
  77    "from",FROM;
  78    "fulltext",FULLTEXT;
  79    "function", FUNCTION;
  80    "global",GLOBAL;
  81    "group",GROUP;
  82    "having",HAVING;
  83    "hour", HOUR;
  84    "hour_microsecond", HOUR_MICROSECOND;
  85    "hour_minute", HOUR_MINUTE;
  86    "hour_second", HOUR_SECOND;
  87    "if",IF;
  88    "in",IN;
  89    "index",INDEX;
  90    "insert",INSERT;
  91    "intersect",INTERSECT;
  92    "interval", INTERVAL;
  93    "into",INTO;
  94    "is", IS;
  95    "join",JOIN;
  96    "key",KEY;
  97    "language", LANGUAGE;
  98    "like", LIKE;
  99    "limit",LIMIT;
 100    "local",LOCAL;
 101    "lock", LOCK;
 102    "microsecond", MICROSECOND;
 103    "minute", MINUTE;
 104    "minute_microsecond", MINUTE_MICROSECOND;
 105    "minute_second", MINUTE_SECOND;
 106    "mod", MOD;
 107    "mode", MODE;
 108    "modify", MODIFY;
 109    "month", MONTH;
 110    "national",NATIONAL;
 111    "natural",NATURAL;
 112    "no", NO;
 113    "not",NOT;
 114    "nowait", NOWAIT;
 115    "null",NULL;
 116    "of", OF;
 117    "offset",OFFSET;
 118    "on",ON;
 119    "or",OR;
 120    "order",ORDER;
 121    "precision",PRECISION;
 122    "primary",PRIMARY;
 123    "procedure", PROCEDURE;
 124    "quarter", QUARTER;
 125    "references",REFERENCES;
 126    "rename",RENAME;
 127    "replace",REPLACE;
 128    "restrict",RESTRICT;
 129    "returns", RETURNS;
 130    "second", SECOND;
 131    "second_microsecond", SECOND_MICROSECOND;
 132    "select",SELECT;
 133    "set",SET;
 134    "share", SHARE;
 135    "some",SOME;
 136    "spatial", SPATIAL;
 137    "substr", SUBSTRING;
 138    "substring", SUBSTRING;
 139    "table",TABLE;
 140    "temporary",TEMPORARY;
 141    "then", THEN;
 142    "time",TIME;
 143    "timestamp",TIMESTAMP;
 144    "to",TO;
 145    "true", TRUE;
 146    "unicode",UNICODE;
 147    "union",UNION;
 148    "unique",UNIQUE;
 149    "unsigned",UNSIGNED;
 150    "update",UPDATE;
 151    "using",USING;
 152    "values",VALUES;
 153    "varying",VARYING;
 154    "view",VIEW;
 155    "week", WEEK;
 156    "when", WHEN;
 157    "where",WHERE;
 158    "with", WITH;
 159    "year", YEAR;
 160    "year_month", YEAR_MONTH;
 161   ] in (* more *)
 162   let all token l = k := !k @ List.map (fun x -> x,token) l in
 163   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 164   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 165   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 166   all JOIN_TYPE1 ["left";"right";"full"];
 167   all JOIN_TYPE2 ["inner";"outer"];
 168   all LIKE_OP ["glob";"regexp";"match"];
 169   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 170 (* standard built-in types
 171       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 172       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 173       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 174       FLOAT, REAL, DOUBLE PRECISION,
 175       BOOLEAN,
 176       DATE, TIME, TIMESTAMP, INTERVAL
 177     *)
 178   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 179   all T_DECIMAL ["numeric";"decimal";"dec";"fixed"];
 180   all T_INTEGER ["number"]; (* oracle *)
 181   all T_BOOLEAN ["bool";"boolean"];
 182   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 183   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 184   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 185   all T_TEXT ["varchar2"]; (* oracle *)
 186   all T_DATETIME ["datetime"];
 187   all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
 188   !k
 189
 190 (*
 191   Q: Why not convert all input to lowercase before lexing?
 192   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 193 *)
 194
 195 module Keywords = Map.Make(String)
 196
 197 let keywords =
 198   let add map (k,v) =
 199     let k = String.lowercase k in
 200     if Keywords.mem k map then
 201       failwith (sprintf "Lexeme %s is already associated with keyword." k)
 202     else
 203       Keywords.add k v map
 204   in
 205   List.fold_left add Keywords.empty keywords
 206
 207 (* FIXME case sensitivity??! *)
 208
 209 let get_ident str =
 210   let str = String.lowercase str in
 211   try Keywords.find str keywords with Not_found -> IDENT str
 212
 213 let ident str = IDENT (String.lowercase str)
 214
 215 let as_literal ch s =
 216   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 217   sprintf "%c%s%c" ch s ch
 218 }
 219
 220 let digit = ['0'-'9']
 221 let alpha = ['a'-'z' 'A'-'Z']
 222 let ident = (alpha) (alpha | digit | '_' )*
 223 let wsp = [' ' '\r' '\t']
 224 let cmnt = "--" | "//" | "#"
 225
 226 (* extract separate statements *)
 227 rule ruleStatement = parse
 228   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 229   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 230   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 231   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 232   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 233   | "$" (ident? as tag) "$" { let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag) }
 234   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 235   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 236   | ';' { `Semicolon }
 237   | [^ ';'] as c { `Char c }
 238   | eof { `Eof }
 239 and
 240 (* extract tail of the input *)
 241 ruleTail acc = parse
 242   | eof { acc }
 243   | _* as str { ruleTail (acc ^ str) lexbuf }
 244 and
 245 ruleMain = parse
 246   | wsp   { ruleMain lexbuf }
 247   (* update line number *)
 248   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 249
 250   | '('         { LPAREN }
 251   | ')'         { RPAREN }
 252   | ','   { COMMA }
 253   | '.'   { DOT }
 254   | '{'   { LCURLY (lexeme_start lexbuf) }
 255   | '}'   { RCURLY (lexeme_start lexbuf) }
 256
 257   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 258   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 259
 260   | "*" { ASTERISK }
 261   | "=" { EQUAL }
 262   | "!" { EXCL }
 263   | "~" { TILDE }
 264   | "||" { CONCAT_OP }
 265   | "+" { PLUS }
 266   | "-" { MINUS }
 267
 268   | "/" | "%" { NUM_DIV_OP }
 269   | "<<" | ">>" { NUM_BIT_SHIFT }
 270   | "|" { NUM_BIT_OR }
 271   | "&" { NUM_BIT_AND }
 272   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 273   | "<>" | "!=" | "==" { NUM_EQ_OP }
 274   | "<=>" { NOT_DISTINCT_OP }
 275
 276   | "?" { PARAM { label=None; pos = pos lexbuf } }
 277   | [':' '@'] (ident as str) { PARAM { label = Some str; pos = pos lexbuf } }
 278
 279   | '"' { ident (ruleInQuotes "" lexbuf) }
 280   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 281   (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
 282   | "$" (ident? as tag) "$" { TEXT (ruleInDollarQuotes tag "" lexbuf) }
 283   | "`" { ident (ruleInBackQuotes "" lexbuf) }
 284   | "[" { ident (ruleInBrackets "" lexbuf) }
 285   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 286
 287   | ident as str { if !Parser_state.mode = Ident then IDENT str (* no keywords, preserve case *) else get_ident str }
 288   | digit+ as str { INTEGER (int_of_string str) }
 289   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 290   | eof         { EOF }
 291   | _   { error lexbuf "ruleMain" }
 292 and
 293 (* FIXME factor out all that ruleIn* rules *)
 294 ruleInQuotes acc = parse
 295   | '"'         { acc }
 296   | eof         { error lexbuf "no terminating quote" }
 297   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 298   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 299   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 300   | _           { error lexbuf "ruleInQuotes" }
 301 and
 302 ruleInBrackets acc = parse
 303   | ']'         { acc }
 304   | eof         { error lexbuf "no terminating bracket" }
 305   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 306 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 307   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 308   | _           { error lexbuf "ruleInBrackets" }
 309 and
 310 ruleInSingleQuotes acc = parse
 311   | '\''              { acc }
 312   | eof         { error lexbuf "no terminating single quote" }
 313   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 314   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 315   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 316   | _           { error lexbuf "ruleInSingleQuotes" }
 317 and
 318 ruleInBackQuotes acc = parse
 319   | '`'         { acc }
 320   | eof         { error lexbuf "no terminating back quote" }
 321   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 322   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 323   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 324   | _           { error lexbuf "ruleInBackQuotes" }
 325 and
 326 ruleInDollarQuotes tag acc = parse
 327   | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
 328   | eof         { error lexbuf "no terminating dollar quote" }
 329   | '\n'        { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
 330   (* match one char at a time to make sure delimiter matches longer *)
 331   | [^'\n']     { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
 332   | _           { error lexbuf "ruleInDollarQuotes" }
 333 and
 334 ruleComment acc = parse
 335   | '\n'        { advance_line lexbuf; acc }
 336   | eof         { acc }
 337   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 338   | _           { error lexbuf "ruleComment"; }
 339 and
 340 ruleCommentMulti acc = parse
 341   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 342   | "*/"        { acc }
 343   | "*"
 344   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 345   | _           { error lexbuf "ruleCommentMulti" }
 346
 347 {
 348
 349   let parse_rule lexbuf =
 350     let token = ruleMain lexbuf in
 351     match !Parser_state.mode with
 352     | Normal -> token
 353     | Ignore ->
 354 (*         eprintf "ignored: %s\n" (lexeme lexbuf); *)
 355       if token = EOF then token else IGNORED
 356     | Ident -> token
 357
 358 }
 359