sql_lexer.mll

   1
   2 {
   3   open Sql_parser
   4   open Lexing
   5   module T = Sql.Type
   6
   7 let error buf callerID =
   8   Error.report "Lexer error : %s" callerID;
   9 (*      update_pos buf;*)
  10         raise Parsing.Parse_error
  11
  12 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  13
  14 let advance_line_pos pos =
  15   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  16
  17 let advance_line lexbuf =
  18   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  19
  20 (* use Map or Hashtbl ? *)
  21 let keywords =
  22   let k = ref [
  23    "as",AS;
  24    "on",ON;
  25    "conflict",CONFLICT;
  26    "using",USING;
  27    "natural",NATURAL;
  28    "join",JOIN;
  29    "isnull",TEST_NULL;
  30    "notnull",TEST_NULL;
  31    "between",BETWEEN;
  32    "and",AND;
  33    "escape",ESCAPE;
  34    "not",NOT;
  35    "null",NULL;
  36    "unique",UNIQUE;
  37    "primary",PRIMARY;
  38    "key",KEY;
  39    "default",DEFAULT;
  40    "precision",PRECISION;
  41    "varying",VARYING;
  42    "charset",CHARSET;
  43    "collate",COLLATE;
  44    "national",NATIONAL;
  45    "ascii",ASCII;
  46    "unicode",UNICODE;
  47    "distinct",DISTINCT;
  48    "character",CHARACTER;
  49    "binary",BINARY;
  50    "all",ALL;
  51    "any",ANY;
  52    "some",SOME;
  53    "order",ORDER;
  54    "by",BY;
  55    "limit",LIMIT;
  56    "desc",DESC;
  57    "asc",ASC;
  58    "offset",OFFSET;
  59    "select",SELECT;
  60    "create",CREATE;
  61    "table",TABLE;
  62    "view",VIEW;
  63    "insert",INSERT;
  64    "replace",REPLACE;
  65    "update",UPDATE;
  66    "delete",DELETE;
  67    "from",FROM;
  68    "or",OR;
  69    "into",INTO;
  70    "values",VALUES;
  71    "where",WHERE;
  72    "from",FROM;
  73    "set",SET;
  74    "in",IN;
  75    "group",GROUP;
  76    "having",HAVING;
  77    "union",UNION;
  78    "except",EXCEPT;
  79    "intersect",INTERSECT;
  80    "cross",CROSS;
  81    "temporary",TEMPORARY;
  82    "if",IF;
  83    "exists",EXISTS;
  84    "foreign",FOREIGN;
  85    "global",GLOBAL;
  86    "local",LOCAL;
  87    "value",VALUE;
  88    "references",REFERENCES;
  89    "check",CHECK;
  90    "date",DATE;
  91    "time",TIME;
  92    "timestamp",TIMESTAMP;
  93    "alter",ALTER;
  94    "add",ADD;
  95    "cascade",CASCADE;
  96    "restrict",RESTRICT;
  97    "drop",DROP;
  98    "constraint",CONSTRAINT;
  99    "collate",COLLATE;
 100    "after",AFTER;
 101    "index",INDEX;
 102    "fulltext",FULLTEXT;
 103    "unsigned",UNSIGNED;
 104    "first",FIRST;
 105    "column",COLUMN;
 106    "like", LIKE;
 107   ] in (* more *)
 108   let all token l = k := !k @ List.map (fun x -> x,token) l in
 109   all (FUNCTION T.Int) ["max"; "min"; "length"; "random";"count";"sum";"avg"];
 110   all (FUNCTION T.Text) ["concat";"lower";"upper"];
 111   all (FUNCTION T.Any) ["coalesce"];
 112   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";"unix_timestamp"];
 113   all CONFLICT_ALGO ["ignore"; "replace"; "abort"; "fail"; "rollback"];
 114   all JOIN_TYPE1 ["left";"right";"full"];
 115   all JOIN_TYPE2 ["inner";"outer"];
 116   all LIKE_OP ["glob";"regexp";"match"];
 117   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 118 (* standard built-in types
 119       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 120       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 121       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 122       FLOAT, REAL, DOUBLE PRECISION,
 123       BOOLEAN,
 124       DATE, TIME, TIMESTAMP, INTERVAL
 125     *)
 126   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 127   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 128   all T_INTEGER ["number"]; (* oracle *)
 129   all T_BOOLEAN ["bool";"boolean"];
 130   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 131   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 132   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 133   all T_TEXT ["varchar2"]; (* oracle *)
 134   all T_DATETIME ["datetime";"year";];
 135   !k
 136
 137 (*
 138   Q: Why not convert all input to lowercase before lexing?
 139   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 140 *)
 141
 142 let keywords = List.map (fun (k,v) -> (String.lowercase k, v)) keywords
 143
 144 let get_ident str =
 145   let str = String.lowercase str in
 146   try List.assoc str keywords with Not_found -> IDENT str
 147 }
 148
 149 let digit = ['0'-'9']
 150 let alpha = ['a'-'z' 'A'-'Z']
 151 let ident = (alpha) (alpha | digit | '_' )*
 152 let wsp = [' ' '\r' '\t']
 153 let cmnt = "--" | "//" | "#"
 154
 155 rule ruleStatement props = parse
 156   | ['\n' ' ' '\r' '\t']+ { ruleStatement props lexbuf }
 157 (* fixme strings *)
 158   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n'
 159       {
 160         ruleStatement (Props.set props n v) lexbuf
 161       }
 162   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n'
 163       {
 164         ruleStatement (Props.set props "name" name) lexbuf
 165       }
 166   | cmnt { ignore (ruleComment "" lexbuf); ruleStatement props lexbuf }
 167   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleStatement props lexbuf }
 168   | alpha [^ ';']+ as stmt ';' { Some (stmt,props) }
 169   | _ { None }
 170 and
 171 ruleMain = parse
 172   | wsp   { ruleMain lexbuf }
 173   (* update line number *)
 174   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 175
 176   | '('         { LPAREN }
 177   | ')'         { RPAREN }
 178   | ','   { COMMA }
 179   | '.'   { DOT }
 180
 181   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 182   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 183
 184   | "*" { ASTERISK }
 185   | "=" { EQUAL }
 186   | "!" { EXCL }
 187   | "~" { TILDE }
 188   | "||" { CONCAT_OP }
 189   | "+" { PLUS }
 190   | "-" { MINUS }
 191
 192   | "/" | "%" | "|" | "&" { NUM_BINARY_OP }
 193   | ">" | ">=" | "<=" | "<" | "<>" { COMPARISON_OP }
 194
 195   | "?" { PARAM (None,pos lexbuf) }
 196   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 197
 198   | '"' { IDENT (ruleInQuotes "" lexbuf) }
 199   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 200   | "`" { IDENT (ruleInBackQuotes "" lexbuf) }
 201   | "[" { IDENT (ruleInBrackets "" lexbuf) }
 202   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 203
 204   | ident as str { get_ident str }
 205   | digit+ as str { INTEGER (int_of_string str) }
 206   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 207   | eof         { EOF }
 208   | _           { error lexbuf "ruleMain" }
 209 and
 210 (* FIXME factor out all that ruleIn* rules *)
 211 ruleInQuotes acc = parse
 212   | '"'         { acc }
 213   | eof         { error lexbuf "no terminating quote" }
 214   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 215   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf }
 216   | [^'"' '\n']+  { ruleInQuotes (acc ^ lexeme lexbuf) lexbuf }
 217   | _           { error lexbuf "ruleInQuotes" }
 218 and
 219 ruleInBrackets acc = parse
 220   | ']'         { acc }
 221   | eof         { error lexbuf "no terminating bracket" }
 222   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 223 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 224   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 225   | _           { error lexbuf "ruleInBrackets" }
 226 and
 227 ruleInSingleQuotes acc = parse
 228   | '\''              { acc }
 229   | eof         { error lexbuf "no terminating single quote" }
 230   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 231   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 232   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 233   | _           { error lexbuf "ruleInSingleQuotes" }
 234 and
 235 ruleInBackQuotes acc = parse
 236   | '`'         { acc }
 237   | eof         { error lexbuf "no terminating back quote" }
 238   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 239   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 240   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 241   | _           { error lexbuf "ruleInBackQuotes" }
 242 and
 243 ruleComment acc = parse
 244   | '\n'        { advance_line lexbuf; acc }
 245   | eof         { acc }
 246   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 247   | _           { error lexbuf "ruleComment"; }
 248 and
 249 ruleCommentMulti acc = parse
 250   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 251   | "*/"        { acc }
 252   | [^'\n']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 253   | _           { error lexbuf "ruleCommentMulti" }
 254
 255 {
 256
 257   let parse_rule lexbuf =
 258     let module P = Parser_state in
 259     let token = ruleMain lexbuf in
 260     match !P.mode with
 261     | P.Normal -> token
 262     | P.Ignore ->
 263 (*         Printf.eprintf "ignored: %s\n" (lexeme lexbuf); *)
 264       if (token = EOF) then token else IGNORED
 265
 266 }
 267