sql_lexer.mll

   1
   2 {
   3   open Sql_parser
   4   open Lexing
   5   module T = Sql.Type
   6
   7 let error buf callerID =
   8   Error.report "Lexer error : %s" callerID;
   9 (*      update_pos buf;*)
  10         raise Parsing.Parse_error
  11
  12 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  13
  14 let advance_line_pos pos =
  15   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  16
  17 let advance_line lexbuf =
  18   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  19
  20 (* use Map or Hashtbl ? *)
  21 let keywords =
  22   let k = ref [
  23    "as",AS;
  24    "on",ON;
  25    "conflict",CONFLICT;
  26    "using",USING;
  27    "natural",NATURAL;
  28    "join",JOIN;
  29    "isnull",TEST_NULL;
  30    "notnull",TEST_NULL;
  31    "between",BETWEEN;
  32    "and",AND;
  33    "escape",ESCAPE;
  34    "not",NOT;
  35    "null",NULL;
  36    "unique",UNIQUE;
  37    "primary",PRIMARY;
  38    "key",KEY;
  39    "default",DEFAULT;
  40    "precision",PRECISION;
  41    "varying",VARYING;
  42    "charset",CHARSET;
  43    "collate",COLLATE;
  44    "national",NATIONAL;
  45    "ascii",ASCII;
  46    "unicode",UNICODE;
  47    "distinct",DISTINCT;
  48    "character",CHARACTER;
  49    "binary",BINARY;
  50    "all",ALL;
  51    "any",ANY;
  52    "some",SOME;
  53    "order",ORDER;
  54    "by",BY;
  55    "limit",LIMIT;
  56    "desc",DESC;
  57    "asc",ASC;
  58    "offset",OFFSET;
  59    "select",SELECT;
  60    "create",CREATE;
  61    "table",TABLE;
  62    "view",VIEW;
  63    "insert",INSERT;
  64    "replace",REPLACE;
  65    "update",UPDATE;
  66    "delete",DELETE;
  67    "from",FROM;
  68    "or",OR;
  69    "into",INTO;
  70    "values",VALUES;
  71    "where",WHERE;
  72    "from",FROM;
  73    "set",SET;
  74    "in",IN;
  75    "group",GROUP;
  76    "having",HAVING;
  77    "union",UNION;
  78    "except",EXCEPT;
  79    "intersect",INTERSECT;
  80    "cross",CROSS;
  81    "temporary",TEMPORARY;
  82    "if",IF;
  83    "exists",EXISTS;
  84    "foreign",FOREIGN;
  85    "global",GLOBAL;
  86    "local",LOCAL;
  87    "value",VALUE;
  88    "references",REFERENCES;
  89    "check",CHECK;
  90    "date",DATE;
  91    "time",TIME;
  92    "timestamp",TIMESTAMP;
  93    "alter",ALTER;
  94    "add",ADD;
  95    "cascade",CASCADE;
  96    "restrict",RESTRICT;
  97    "drop",DROP;
  98    "constraint",CONSTRAINT;
  99    "collate",COLLATE;
 100    "after",AFTER;
 101    "index",INDEX;
 102    "fulltext",FULLTEXT;
 103    "unsigned",UNSIGNED;
 104    "first",FIRST;
 105    "column",COLUMN;
 106    "like", LIKE;
 107   ] in (* more *)
 108   let all token l = k := !k @ List.map (fun x -> x,token) l in
 109   all (FUNCTION T.Int) ["max"; "min"; "length"; "random";"count";"sum";"avg"];
 110   all (FUNCTION T.Text) ["concat";"lower";"upper"];
 111   all (FUNCTION T.Any) ["coalesce"];
 112   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";"unix_timestamp"];
 113   all CONFLICT_ALGO ["ignore"; "replace"; "abort"; "fail"; "rollback"];
 114   all JOIN_TYPE1 ["left";"right";"full"];
 115   all JOIN_TYPE2 ["inner";"outer"];
 116   all LIKE_OP ["glob";"regexp";"match"];
 117   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 118 (* standard built-in types
 119       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 120       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 121       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 122       FLOAT, REAL, DOUBLE PRECISION,
 123       BOOLEAN,
 124       DATE, TIME, TIMESTAMP, INTERVAL
 125     *)
 126   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 127   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 128   all T_INTEGER ["number"]; (* oracle *)
 129   all T_BOOLEAN ["bool";"boolean"];
 130   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 131   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 132   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 133   all T_TEXT ["varchar2"]; (* oracle *)
 134   all T_DATETIME ["datetime";"year";];
 135   !k
 136
 137 (*
 138   Q: Why not convert all input to lowercase before lexing?
 139   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 140 *)
 141
 142 let keywords = List.map (fun (k,v) -> (String.lowercase k, v)) keywords
 143
 144 let get_ident str =
 145   let str = String.lowercase str in
 146   try List.assoc str keywords with Not_found -> IDENT str
 147 }
 148
 149 let digit = ['0'-'9']
 150 let alpha = ['a'-'z' 'A'-'Z']
 151 let ident = (alpha) (alpha | digit | '_' )*
 152 let wsp = [' ' '\r' '\t']
 153 let cmnt = "--" | "//" | "#"
 154
 155 rule ruleStatement props = parse
 156   | ['\n' ' ' '\r' '\t']+ { ruleStatement props lexbuf }
 157 (* fixme strings *)
 158   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n'
 159       {
 160         ruleStatement (Props.set props n v) lexbuf
 161       }
 162   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n'
 163       {
 164         ruleStatement (Props.set props "name" name) lexbuf
 165       }
 166   | cmnt { ignore (ruleComment "" lexbuf); ruleStatement props lexbuf }
 167   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleStatement props lexbuf }
 168   | alpha [^ ';']+ as stmt ';' { Some (stmt,props) }
 169   | _ { None }
 170 and
 171 ruleTail acc = parse
 172 | eof { acc }
 173 | _* as str { ruleTail (acc ^ str) lexbuf }
 174 and
 175 ruleMain = parse
 176   | wsp   { ruleMain lexbuf }
 177   (* update line number *)
 178   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 179
 180   | '('         { LPAREN }
 181   | ')'         { RPAREN }
 182   | ','   { COMMA }
 183   | '.'   { DOT }
 184
 185   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 186   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 187
 188   | "*" { ASTERISK }
 189   | "=" { EQUAL }
 190   | "!" { EXCL }
 191   | "~" { TILDE }
 192   | "||" { CONCAT_OP }
 193   | "+" { PLUS }
 194   | "-" { MINUS }
 195
 196   | "/" | "%" | "|" | "&" { NUM_BINARY_OP }
 197   | ">" | ">=" | "<=" | "<" | "<>" { COMPARISON_OP }
 198
 199   | "?" { PARAM (None,pos lexbuf) }
 200   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 201
 202   | '"' { IDENT (ruleInQuotes "" lexbuf) }
 203   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 204   | "`" { IDENT (ruleInBackQuotes "" lexbuf) }
 205   | "[" { IDENT (ruleInBrackets "" lexbuf) }
 206   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 207
 208   | ident as str { get_ident str }
 209   | digit+ as str { INTEGER (int_of_string str) }
 210   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 211   | eof         { EOF }
 212   | _           { error lexbuf "ruleMain" }
 213 and
 214 (* FIXME factor out all that ruleIn* rules *)
 215 ruleInQuotes acc = parse
 216   | '"'         { acc }
 217   | eof         { error lexbuf "no terminating quote" }
 218   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 219   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf }
 220   | [^'"' '\n']+  { ruleInQuotes (acc ^ lexeme lexbuf) lexbuf }
 221   | _           { error lexbuf "ruleInQuotes" }
 222 and
 223 ruleInBrackets acc = parse
 224   | ']'         { acc }
 225   | eof         { error lexbuf "no terminating bracket" }
 226   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 227 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 228   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 229   | _           { error lexbuf "ruleInBrackets" }
 230 and
 231 ruleInSingleQuotes acc = parse
 232   | '\''              { acc }
 233   | eof         { error lexbuf "no terminating single quote" }
 234   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 235   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 236   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 237   | _           { error lexbuf "ruleInSingleQuotes" }
 238 and
 239 ruleInBackQuotes acc = parse
 240   | '`'         { acc }
 241   | eof         { error lexbuf "no terminating back quote" }
 242   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 243   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 244   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 245   | _           { error lexbuf "ruleInBackQuotes" }
 246 and
 247 ruleComment acc = parse
 248   | '\n'        { advance_line lexbuf; acc }
 249   | eof         { acc }
 250   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 251   | _           { error lexbuf "ruleComment"; }
 252 and
 253 ruleCommentMulti acc = parse
 254   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 255   | "*/"        { acc }
 256   | [^'\n']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 257   | _           { error lexbuf "ruleCommentMulti" }
 258
 259 {
 260
 261   let parse_rule lexbuf =
 262     let module P = Parser_state in
 263     let token = ruleMain lexbuf in
 264     match !P.mode with
 265     | P.Normal -> token
 266     | P.Ignore ->
 267 (*         Printf.eprintf "ignored: %s\n" (lexeme lexbuf); *)
 268       if (token = EOF) then token else IGNORED
 269
 270 }
 271