sql_lexer.mll

   1
   2 {
   3   open Sql_parser
   4   open Lexing
   5   module T = Sql.Type
   6
   7 let error buf callerID =
   8   Error.report "Lexer error : %s" callerID;
   9 (*      update_pos buf;*)
  10         raise Parsing.Parse_error
  11
  12 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  13
  14 let advance_line_pos pos =
  15   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  16
  17 let advance_line lexbuf =
  18   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  19
  20 (* use Map or Hashtbl ? *)
  21 let keywords =
  22   let k = ref [
  23    "as",AS;
  24    "on",ON;
  25    "conflict",CONFLICT;
  26    "using",USING;
  27    "natural",NATURAL;
  28    "join",JOIN;
  29    "isnull",TEST_NULL;
  30    "notnull",TEST_NULL;
  31    "between",BETWEEN;
  32    "and",AND;
  33    "escape",ESCAPE;
  34    "not",NOT;
  35    "null",NULL;
  36    "unique",UNIQUE;
  37    "primary",PRIMARY;
  38    "key",KEY;
  39    "default",DEFAULT;
  40    "precision",PRECISION;
  41    "varying",VARYING;
  42    "charset",CHARSET;
  43    "collate",COLLATE;
  44    "national",NATIONAL;
  45    "ascii",ASCII;
  46    "unicode",UNICODE;
  47    "distinct",DISTINCT;
  48    "character",CHARACTER;
  49    "binary",BINARY;
  50    "all",ALL;
  51    "any",ANY;
  52    "some",SOME;
  53    "order",ORDER;
  54    "by",BY;
  55    "limit",LIMIT;
  56    "desc",DESC;
  57    "asc",ASC;
  58    "offset",OFFSET;
  59    "select",SELECT;
  60    "create",CREATE;
  61    "table",TABLE;
  62    "view",VIEW;
  63    "insert",INSERT;
  64    "replace",REPLACE;
  65    "update",UPDATE;
  66    "delete",DELETE;
  67    "from",FROM;
  68    "or",OR;
  69    "into",INTO;
  70    "values",VALUES;
  71    "where",WHERE;
  72    "from",FROM;
  73    "set",SET;
  74    "in",IN;
  75    "group",GROUP;
  76    "having",HAVING;
  77    "union",UNION;
  78    "except",EXCEPT;
  79    "intersect",INTERSECT;
  80    "cross",CROSS;
  81    "temporary",TEMPORARY;
  82    "if",IF;
  83    "exists",EXISTS;
  84    "foreign",FOREIGN;
  85    "global",GLOBAL;
  86    "local",LOCAL;
  87    "value",VALUE;
  88    "references",REFERENCES;
  89    "check",CHECK;
  90    "date",DATE;
  91    "time",TIME;
  92    "timestamp",TIMESTAMP;
  93    "alter",ALTER;
  94    "add",ADD;
  95    "cascade",CASCADE;
  96    "restrict",RESTRICT;
  97    "drop",DROP;
  98    "constraint",CONSTRAINT;
  99    "collate",COLLATE;
 100    "after",AFTER;
 101    "index",INDEX;
 102    "fulltext",FULLTEXT;
 103    "unsigned",UNSIGNED;
 104    "first",FIRST;
 105    "column",COLUMN;
 106   ] in (* more *)
 107   let all token l = k := !k @ List.map (fun x -> x,token) l in
 108   all (FUNCTION (Some T.Int)) ["max"; "min"; "length"; "random";"count";"sum";"avg"];
 109   all (FUNCTION (Some T.Text)) ["concat";"lower";"upper"];
 110   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";"unix_timestamp"];
 111   all CONFLICT_ALGO ["ignore"; "replace"; "abort"; "fail"; "rollback"];
 112   all JOIN_TYPE1 ["left";"right";"full"];
 113   all JOIN_TYPE2 ["inner";"outer"];
 114   all LIKE_OP ["like";"glob";"regexp";"match"];
 115   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 116 (* standard built-in types
 117       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 118       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 119       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 120       FLOAT, REAL, DOUBLE PRECISION,
 121       BOOLEAN,
 122       DATE, TIME, TIMESTAMP, INTERVAL
 123     *)
 124   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 125   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 126   all T_BOOLEAN ["bool";"boolean"];
 127   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 128   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 129   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 130   all T_DATETIME ["datetime";"year";];
 131   !k
 132
 133 (*
 134   Q: Why not convert all input to lowercase before lexing?
 135   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 136 *)
 137
 138 let keywords = List.map (fun (k,v) -> (String.lowercase k, v)) keywords
 139
 140 let get_ident str =
 141   let str = String.lowercase str in
 142   try List.assoc str keywords with Not_found -> IDENT str
 143 }
 144
 145 let digit = ['0'-'9']
 146 let alpha = ['a'-'z' 'A'-'Z']
 147 let ident = (alpha) (alpha | digit | '_' )*
 148 let wsp = [' ' '\r' '\t']
 149 let cmnt = "--" | "//" | "#"
 150
 151 rule ruleStatement props = parse
 152   | ['\n' ' ' '\r' '\t']+ { ruleStatement props lexbuf }
 153 (* fixme strings *)
 154   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n'
 155       {
 156         ruleStatement (Props.set props n v) lexbuf
 157       }
 158   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n'
 159       {
 160         ruleStatement (Props.set props "name" name) lexbuf
 161       }
 162   | cmnt { ignore (ruleComment "" lexbuf); ruleStatement props lexbuf }
 163   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleStatement props lexbuf }
 164   | alpha [^ ';']+ as stmt ';' { Some (stmt,props) }
 165   | _ { None }
 166 and
 167 ruleMain = parse
 168   | wsp   { ruleMain lexbuf }
 169   (* update line number *)
 170   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 171
 172   | '('         { LPAREN }
 173   | ')'         { RPAREN }
 174   | ','   { COMMA }
 175   | '.'   { DOT }
 176
 177   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 178   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 179
 180   | "*" { ASTERISK }
 181   | "=" { EQUAL }
 182   | "!" { EXCL }
 183   | "~" { TILDE }
 184   | "||" { CONCAT_OP }
 185   | "+" { PLUS }
 186   | "-" { MINUS }
 187
 188   | "/" | "%" | "|" | "&" { NUM_BINARY_OP }
 189   | ">" | ">=" | "<=" | "<" | "<>" { COMPARISON_OP }
 190
 191   | "?" { PARAM (None,pos lexbuf) }
 192   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 193
 194   | '"' { IDENT (ruleInQuotes "" lexbuf) }
 195   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 196   | "`" { IDENT (ruleInBackQuotes "" lexbuf) }
 197   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 198
 199   | ident as str { get_ident str }
 200   | digit+ as str { INTEGER (int_of_string str) }
 201   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 202   | eof         { EOF }
 203   | _           { error lexbuf "ruleMain" }
 204 and
 205 ruleInQuotes acc = parse
 206   | '"'         { acc }
 207   | eof         { error lexbuf "no terminating quote" }
 208   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 209   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf }
 210   | [^'"' '\n']+  { ruleInQuotes (acc ^ lexeme lexbuf) lexbuf }
 211   | _           { error lexbuf "ruleInQuotes" }
 212 and
 213 ruleInSingleQuotes acc = parse
 214   | '\''              { acc }
 215   | eof         { error lexbuf "no terminating single quote" }
 216   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 217   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 218   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 219   | _           { error lexbuf "ruleInSingleQuotes" }
 220 and
 221 ruleInBackQuotes acc = parse
 222   | '`'         { acc }
 223   | eof         { error lexbuf "no terminating back quote" }
 224   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 225   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 226   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 227   | _           { error lexbuf "ruleInBackQuotes" }
 228 and
 229 ruleComment acc = parse
 230   | '\n'        { advance_line lexbuf; acc }
 231   | eof         { acc }
 232   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 233   | _           { error lexbuf "ruleComment"; }
 234 and
 235 ruleCommentMulti acc = parse
 236   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 237   | "*/"        { acc }
 238   | [^'\n']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 239   | _           { error lexbuf "ruleCommentMulti" }
 240
 241 {
 242
 243   let parse_rule lexbuf =
 244     let module P = Parser_state in
 245     let token = ruleMain lexbuf in
 246     match !P.mode with
 247     | P.Normal -> token
 248     | P.Ignore ->
 249 (*         Printf.eprintf "ignored: %s\n" (lexeme lexbuf); *)
 250       if (token = EOF) then token else IGNORED
 251
 252 }
 253