src/sql_lexer.mll

   1
   2 {
   3   open Sql_parser
   4   open Lexing
   5   open ExtLib
   6   module T = Sql.Type
   7
   8 let error _ callerID =
   9   Error.report "Lexer error : %s" callerID;
  10 (*      update_pos buf;*)
  11         raise Parsing.Parse_error
  12
  13 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  14
  15 let advance_line_pos pos =
  16   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  17
  18 let advance_line lexbuf =
  19   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  20
  21 let keywords =
  22   let k = ref [
  23    "as",AS;
  24    "on",ON;
  25    "conflict",CONFLICT;
  26    "using",USING;
  27    "natural",NATURAL;
  28    "join",JOIN;
  29    "isnull",TEST_NULL;
  30    "notnull",TEST_NULL;
  31    "between",BETWEEN;
  32    "and",AND;
  33    "escape",ESCAPE;
  34    "not",NOT;
  35    "null",NULL;
  36    "unique",UNIQUE;
  37    "primary",PRIMARY;
  38    "key",KEY;
  39    "default",DEFAULT;
  40    "precision",PRECISION;
  41    "varying",VARYING;
  42    "charset",CHARSET;
  43    "collate",COLLATE;
  44    "national",NATIONAL;
  45    "ascii",ASCII;
  46    "unicode",UNICODE;
  47    "distinct",DISTINCT;
  48    "character",CHARACTER;
  49    "binary",BINARY;
  50    "all",ALL;
  51    "any",ANY;
  52    "some",SOME;
  53    "order",ORDER;
  54    "by",BY;
  55    "limit",LIMIT;
  56    "desc",DESC;
  57    "asc",ASC;
  58    "offset",OFFSET;
  59    "select",SELECT;
  60    "create",CREATE;
  61    "table",TABLE;
  62    "view",VIEW;
  63    "insert",INSERT;
  64    "replace",REPLACE;
  65    "update",UPDATE;
  66    "delete",DELETE;
  67    "from",FROM;
  68    "or",OR;
  69    "into",INTO;
  70    "values",VALUES;
  71    "where",WHERE;
  72    "set",SET;
  73    "in",IN;
  74    "group",GROUP;
  75    "having",HAVING;
  76    "union",UNION;
  77    "except",EXCEPT;
  78    "intersect",INTERSECT;
  79    "cross",CROSS;
  80    "temporary",TEMPORARY;
  81    "if",IF;
  82    "exists",EXISTS;
  83    "foreign",FOREIGN;
  84    "global",GLOBAL;
  85    "local",LOCAL;
  86    "value",VALUE;
  87    "references",REFERENCES;
  88    "check",CHECK;
  89    "date",DATE;
  90    "time",TIME;
  91    "timestamp",TIMESTAMP;
  92    "alter",ALTER;
  93    "add",ADD;
  94    "cascade",CASCADE;
  95    "restrict",RESTRICT;
  96    "drop",DROP;
  97    "constraint",CONSTRAINT;
  98    "after",AFTER;
  99    "index",INDEX;
 100    "fulltext",FULLTEXT;
 101    "unsigned",UNSIGNED;
 102    "first",FIRST;
 103    "column",COLUMN;
 104    "like", LIKE;
 105    "case", CASE;
 106    "when", WHEN;
 107    "then", THEN;
 108    "else", ELSE;
 109    "end", END;
 110    "change", CHANGE;
 111   ] in (* more *)
 112   let all token l = k := !k @ List.map (fun x -> x,token) l in
 113   all (FUNCTION (T.Int,true)) ["max"; "min"; "count";"sum";"avg"];
 114   all (FUNCTION (T.Int,false)) ["length"; "random";"unix_timestamp"];
 115   all (FUNCTION (T.Int,false)) ["least"; "greatest"];
 116   all (FUNCTION (T.Text,false)) ["concat";"lower";"upper"];
 117   all (FUNCTION (T.Any,false)) ["coalesce"];
 118   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now"];
 119   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 120   all JOIN_TYPE1 ["left";"right";"full"];
 121   all JOIN_TYPE2 ["inner";"outer"];
 122   all LIKE_OP ["glob";"regexp";"match"];
 123   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 124 (* standard built-in types
 125       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 126       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 127       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 128       FLOAT, REAL, DOUBLE PRECISION,
 129       BOOLEAN,
 130       DATE, TIME, TIMESTAMP, INTERVAL
 131     *)
 132   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 133   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 134   all T_INTEGER ["number"]; (* oracle *)
 135   all T_BOOLEAN ["bool";"boolean"];
 136   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 137   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 138   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 139   all T_TEXT ["varchar2"]; (* oracle *)
 140   all T_DATETIME ["datetime";"year";];
 141   !k
 142
 143 (*
 144   Q: Why not convert all input to lowercase before lexing?
 145   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 146 *)
 147
 148 module Keywords = Map.Make(String)
 149
 150 let keywords =
 151   let add map (k,v) =
 152     let k = String.lowercase k in
 153     if Keywords.mem k map then
 154       failwith (Printf.sprintf "Lexeme %s is already associated with keyword." k)
 155     else
 156       Keywords.add k v map
 157   in
 158   List.fold_left add Keywords.empty keywords
 159
 160 let get_ident str =
 161   let str = String.lowercase str in
 162   try Keywords.find str keywords with Not_found -> IDENT str
 163
 164 let as_literal ch s =
 165   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 166   Printf.sprintf "%c%s%c" ch s ch
 167 }
 168
 169 let digit = ['0'-'9']
 170 let alpha = ['a'-'z' 'A'-'Z']
 171 let ident = (alpha) (alpha | digit | '_' )*
 172 let wsp = [' ' '\r' '\t']
 173 let cmnt = "--" | "//" | "#"
 174
 175 (* extract separate statements *)
 176 rule ruleStatement = parse
 177   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 178   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 179   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 180   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 181   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 182   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 183   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 184   | ';' { `Semicolon }
 185   | [^ ';'] as c { `Char c }
 186   | eof { `Eof }
 187 and
 188 (* extract tail of the input *)
 189 ruleTail acc = parse
 190   | eof { acc }
 191   | _* as str { ruleTail (acc ^ str) lexbuf }
 192 and
 193 ruleMain = parse
 194   | wsp   { ruleMain lexbuf }
 195   (* update line number *)
 196   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 197
 198   | '('         { LPAREN }
 199   | ')'         { RPAREN }
 200   | ','   { COMMA }
 201   | '.'   { DOT }
 202
 203   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 204   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 205
 206   | "*" { ASTERISK }
 207   | "=" { EQUAL }
 208   | "!" { EXCL }
 209   | "~" { TILDE }
 210   | "||" { CONCAT_OP }
 211   | "+" { PLUS }
 212   | "-" { MINUS }
 213
 214   | "/" | "%" { NUM_DIV_OP }
 215   | "<<" | ">>" | "|" | "&" { NUM_BIT_OP }
 216   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 217   | "<>" | "!=" | "==" { NUM_EQ_OP }
 218
 219   | "?" { PARAM (None,pos lexbuf) }
 220   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 221
 222   | '"' { IDENT (ruleInQuotes "" lexbuf) }
 223   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 224   | "`" { IDENT (ruleInBackQuotes "" lexbuf) }
 225   | "[" { IDENT (ruleInBrackets "" lexbuf) }
 226   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 227
 228   | ident as str { get_ident str }
 229   | digit+ as str { INTEGER (int_of_string str) }
 230   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 231   | eof         { EOF }
 232   | _   { error lexbuf "ruleMain" }
 233 and
 234 (* FIXME factor out all that ruleIn* rules *)
 235 ruleInQuotes acc = parse
 236   | '"'         { acc }
 237   | eof         { error lexbuf "no terminating quote" }
 238   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 239   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 240   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 241   | _           { error lexbuf "ruleInQuotes" }
 242 and
 243 ruleInBrackets acc = parse
 244   | ']'         { acc }
 245   | eof         { error lexbuf "no terminating bracket" }
 246   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 247 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 248   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 249   | _           { error lexbuf "ruleInBrackets" }
 250 and
 251 ruleInSingleQuotes acc = parse
 252   | '\''              { acc }
 253   | eof         { error lexbuf "no terminating single quote" }
 254   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 255   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 256   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 257   | _           { error lexbuf "ruleInSingleQuotes" }
 258 and
 259 ruleInBackQuotes acc = parse
 260   | '`'         { acc }
 261   | eof         { error lexbuf "no terminating back quote" }
 262   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 263   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 264   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 265   | _           { error lexbuf "ruleInBackQuotes" }
 266 and
 267 ruleComment acc = parse
 268   | '\n'        { advance_line lexbuf; acc }
 269   | eof         { acc }
 270   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 271   | _           { error lexbuf "ruleComment"; }
 272 and
 273 ruleCommentMulti acc = parse
 274   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 275   | "*/"        { acc }
 276   | "*"
 277   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 278   | _           { error lexbuf "ruleCommentMulti" }
 279
 280 {
 281
 282   let parse_rule lexbuf =
 283     let module P = Parser_state in
 284     let token = ruleMain lexbuf in
 285     match !P.mode with
 286     | P.Normal -> token
 287     | P.Ignore ->
 288 (*         Printf.eprintf "ignored: %s\n" (lexeme lexbuf); *)
 289       if (token = EOF) then token else IGNORED
 290
 291 }
 292