src/sql_lexer.mll

   1
   2 {
   3   open Sql_parser
   4   open Lexing
   5   open ExtLib
   6   module T = Sql.Type
   7
   8 let error _ callerID =
   9   Error.log "Lexer error : %s" callerID;
  10 (*      update_pos buf;*)
  11         raise Parsing.Parse_error
  12
  13 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  14
  15 let advance_line_pos pos =
  16   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  17
  18 let advance_line lexbuf =
  19   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  20
  21 let keywords =
  22   let k = ref [
  23    "as",AS;
  24    "on",ON;
  25    "conflict",CONFLICT;
  26    "using",USING;
  27    "natural",NATURAL;
  28    "join",JOIN;
  29    "isnull",TEST_NULL;
  30    "notnull",TEST_NULL;
  31    "between",BETWEEN;
  32    "and",AND;
  33    "escape",ESCAPE;
  34    "not",NOT;
  35    "null",NULL;
  36    "unique",UNIQUE;
  37    "primary",PRIMARY;
  38    "key",KEY;
  39    "default",DEFAULT;
  40    "precision",PRECISION;
  41    "varying",VARYING;
  42    "charset",CHARSET;
  43    "collate",COLLATE;
  44    "national",NATIONAL;
  45    "ascii",ASCII;
  46    "unicode",UNICODE;
  47    "distinct",DISTINCT;
  48    "character",CHARACTER;
  49    "binary",BINARY;
  50    "all",ALL;
  51    "any",ANY;
  52    "some",SOME;
  53    "order",ORDER;
  54    "by",BY;
  55    "limit",LIMIT;
  56    "desc",DESC;
  57    "asc",ASC;
  58    "offset",OFFSET;
  59    "select",SELECT;
  60    "create",CREATE;
  61    "table",TABLE;
  62    "view",VIEW;
  63    "insert",INSERT;
  64    "replace",REPLACE;
  65    "update",UPDATE;
  66    "delete",DELETE;
  67    "from",FROM;
  68    "or",OR;
  69    "into",INTO;
  70    "values",VALUES;
  71    "where",WHERE;
  72    "set",SET;
  73    "in",IN;
  74    "group",GROUP;
  75    "having",HAVING;
  76    "union",UNION;
  77    "except",EXCEPT;
  78    "intersect",INTERSECT;
  79    "cross",CROSS;
  80    "temporary",TEMPORARY;
  81    "if",IF;
  82    "exists",EXISTS;
  83    "foreign",FOREIGN;
  84    "global",GLOBAL;
  85    "local",LOCAL;
  86    "value",VALUE;
  87    "references",REFERENCES;
  88    "check",CHECK;
  89    "date",DATE;
  90    "time",TIME;
  91    "timestamp",TIMESTAMP;
  92    "alter",ALTER;
  93    "add",ADD;
  94    "cascade",CASCADE;
  95    "restrict",RESTRICT;
  96    "drop",DROP;
  97    "constraint",CONSTRAINT;
  98    "after",AFTER;
  99    "index",INDEX;
 100    "fulltext",FULLTEXT;
 101    "unsigned",UNSIGNED;
 102    "first",FIRST;
 103    "column",COLUMN;
 104    "like", LIKE;
 105    "case", CASE;
 106    "when", WHEN;
 107    "then", THEN;
 108    "else", ELSE;
 109    "end", END;
 110    "change", CHANGE;
 111    "modify", MODIFY;
 112    "delayed", DELAYED;
 113    "enum", ENUM;
 114   ] in (* more *)
 115   let all token l = k := !k @ List.map (fun x -> x,token) l in
 116   all (FUNCTION (T.Int,true)) ["max"; "min"; "count";"sum";"avg"];
 117   all (FUNCTION (T.Int,false)) ["length"; "random";"unix_timestamp"];
 118   all (FUNCTION (T.Int,false)) ["least"; "greatest"];
 119   all (FUNCTION (T.Text,false)) ["concat";"lower";"upper"];
 120   all (FUNCTION (T.Any,false)) ["coalesce"];
 121   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 122   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 123   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 124   all JOIN_TYPE1 ["left";"right";"full"];
 125   all JOIN_TYPE2 ["inner";"outer"];
 126   all LIKE_OP ["glob";"regexp";"match"];
 127   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 128 (* standard built-in types
 129       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 130       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 131       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 132       FLOAT, REAL, DOUBLE PRECISION,
 133       BOOLEAN,
 134       DATE, TIME, TIMESTAMP, INTERVAL
 135     *)
 136   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 137   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 138   all T_INTEGER ["number"]; (* oracle *)
 139   all T_BOOLEAN ["bool";"boolean"];
 140   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 141   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 142   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 143   all T_TEXT ["varchar2"]; (* oracle *)
 144   all T_DATETIME ["datetime";"year";];
 145   !k
 146
 147 (*
 148   Q: Why not convert all input to lowercase before lexing?
 149   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 150 *)
 151
 152 module Keywords = Map.Make(String)
 153
 154 let keywords =
 155   let add map (k,v) =
 156     let k = String.lowercase k in
 157     if Keywords.mem k map then
 158       failwith (Printf.sprintf "Lexeme %s is already associated with keyword." k)
 159     else
 160       Keywords.add k v map
 161   in
 162   List.fold_left add Keywords.empty keywords
 163
 164 (* FIXME case sensitivity??! *)
 165
 166 let get_ident str =
 167   let str = String.lowercase str in
 168   try Keywords.find str keywords with Not_found -> IDENT str
 169
 170 let ident str = IDENT (String.lowercase str)
 171
 172 let as_literal ch s =
 173   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 174   Printf.sprintf "%c%s%c" ch s ch
 175 }
 176
 177 let digit = ['0'-'9']
 178 let alpha = ['a'-'z' 'A'-'Z']
 179 let ident = (alpha) (alpha | digit | '_' )*
 180 let wsp = [' ' '\r' '\t']
 181 let cmnt = "--" | "//" | "#"
 182
 183 (* extract separate statements *)
 184 rule ruleStatement = parse
 185   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 186   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 187   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 188   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 189   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 190   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 191   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 192   | ';' { `Semicolon }
 193   | [^ ';'] as c { `Char c }
 194   | eof { `Eof }
 195 and
 196 (* extract tail of the input *)
 197 ruleTail acc = parse
 198   | eof { acc }
 199   | _* as str { ruleTail (acc ^ str) lexbuf }
 200 and
 201 ruleMain = parse
 202   | wsp   { ruleMain lexbuf }
 203   (* update line number *)
 204   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 205
 206   | '('         { LPAREN }
 207   | ')'         { RPAREN }
 208   | ','   { COMMA }
 209   | '.'   { DOT }
 210
 211   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 212   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 213
 214   | "*" { ASTERISK }
 215   | "=" { EQUAL }
 216   | "!" { EXCL }
 217   | "~" { TILDE }
 218   | "||" { CONCAT_OP }
 219   | "+" { PLUS }
 220   | "-" { MINUS }
 221
 222   | "/" | "%" { NUM_DIV_OP }
 223   | "<<" | ">>" | "|" | "&" { NUM_BIT_OP }
 224   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 225   | "<>" | "!=" | "==" { NUM_EQ_OP }
 226
 227   | "?" { PARAM (None,pos lexbuf) }
 228   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 229
 230   | '"' { ident (ruleInQuotes "" lexbuf) }
 231   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 232   | "`" { ident (ruleInBackQuotes "" lexbuf) }
 233   | "[" { ident (ruleInBrackets "" lexbuf) }
 234   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 235
 236   | ident as str { get_ident str }
 237   | digit+ as str { INTEGER (int_of_string str) }
 238   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 239   | eof         { EOF }
 240   | _   { error lexbuf "ruleMain" }
 241 and
 242 (* FIXME factor out all that ruleIn* rules *)
 243 ruleInQuotes acc = parse
 244   | '"'         { acc }
 245   | eof         { error lexbuf "no terminating quote" }
 246   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 247   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 248   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 249   | _           { error lexbuf "ruleInQuotes" }
 250 and
 251 ruleInBrackets acc = parse
 252   | ']'         { acc }
 253   | eof         { error lexbuf "no terminating bracket" }
 254   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 255 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 256   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 257   | _           { error lexbuf "ruleInBrackets" }
 258 and
 259 ruleInSingleQuotes acc = parse
 260   | '\''              { acc }
 261   | eof         { error lexbuf "no terminating single quote" }
 262   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 263   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 264   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 265   | _           { error lexbuf "ruleInSingleQuotes" }
 266 and
 267 ruleInBackQuotes acc = parse
 268   | '`'         { acc }
 269   | eof         { error lexbuf "no terminating back quote" }
 270   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 271   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 272   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 273   | _           { error lexbuf "ruleInBackQuotes" }
 274 and
 275 ruleComment acc = parse
 276   | '\n'        { advance_line lexbuf; acc }
 277   | eof         { acc }
 278   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 279   | _           { error lexbuf "ruleComment"; }
 280 and
 281 ruleCommentMulti acc = parse
 282   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 283   | "*/"        { acc }
 284   | "*"
 285   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 286   | _           { error lexbuf "ruleCommentMulti" }
 287
 288 {
 289
 290   let parse_rule lexbuf =
 291     let module P = Parser_state in
 292     let token = ruleMain lexbuf in
 293     match !P.mode with
 294     | P.Normal -> token
 295     | P.Ignore ->
 296 (*         Printf.eprintf "ignored: %s\n" (lexeme lexbuf); *)
 297       if (token = EOF) then token else IGNORED
 298
 299 }
 300