lib/sql_lexer.mll

   1
   2 {
   3   open Printf
   4   open Lexing
   5   open ExtLib
   6   open Sql_parser
   7   module T = Sql.Type
   8
   9 let error _ callerID =
  10   prerr_endline (sprintf "Lexer error : %s" callerID);
  11 (*      update_pos buf;*)
  12         raise Parsing.Parse_error
  13
  14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  15
  16 let advance_line_pos pos =
  17   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  18
  19 let advance_line lexbuf =
  20   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  21
  22 let keywords =
  23   let k = ref [
  24    "as",AS;
  25    "on",ON;
  26    "conflict",CONFLICT;
  27    "using",USING;
  28    "natural",NATURAL;
  29    "join",JOIN;
  30    "isnull",TEST_NULL;
  31    "notnull",TEST_NULL;
  32    "between",BETWEEN;
  33    "and",AND;
  34    "escape",ESCAPE;
  35    "not",NOT;
  36    "null",NULL;
  37    "unique",UNIQUE;
  38    "primary",PRIMARY;
  39    "key",KEY;
  40    "default",DEFAULT;
  41    "precision",PRECISION;
  42    "varying",VARYING;
  43    "charset",CHARSET;
  44    "collate",COLLATE;
  45    "national",NATIONAL;
  46    "ascii",ASCII;
  47    "unicode",UNICODE;
  48    "distinct",DISTINCT;
  49    "character",CHARACTER;
  50    "binary",BINARY;
  51    "bytea",BINARY;
  52    "all",ALL;
  53    "any",ANY;
  54    "some",SOME;
  55    "order",ORDER;
  56    "by",BY;
  57    "limit",LIMIT;
  58    "desc",DESC;
  59    "asc",ASC;
  60    "offset",OFFSET;
  61    "select",SELECT;
  62    "create",CREATE;
  63    "table",TABLE;
  64    "view",VIEW;
  65    "insert",INSERT;
  66    "replace",REPLACE;
  67    "update",UPDATE;
  68    "delete",DELETE;
  69    "from",FROM;
  70    "or",OR;
  71    "into",INTO;
  72    "values",VALUES;
  73    "where",WHERE;
  74    "set",SET;
  75    "in",IN;
  76    "group",GROUP;
  77    "having",HAVING;
  78    "union",UNION;
  79    "except",EXCEPT;
  80    "intersect",INTERSECT;
  81    "cross",CROSS;
  82    "temporary",TEMPORARY;
  83    "if",IF;
  84    "exists",EXISTS;
  85    "foreign",FOREIGN;
  86    "global",GLOBAL;
  87    "local",LOCAL;
  88    "value",VALUE;
  89    "references",REFERENCES;
  90    "check",CHECK;
  91    "date",DATE;
  92    "time",TIME;
  93    "timestamp",TIMESTAMP;
  94    "alter",ALTER;
  95    "add",ADD;
  96    "cascade",CASCADE;
  97    "restrict",RESTRICT;
  98    "drop",DROP;
  99    "constraint",CONSTRAINT;
 100    "after",AFTER;
 101    "index",INDEX;
 102    "fulltext",FULLTEXT;
 103    "unsigned",UNSIGNED;
 104    "first",FIRST;
 105    "column",COLUMN;
 106    "like", LIKE;
 107    "case", CASE;
 108    "when", WHEN;
 109    "then", THEN;
 110    "else", ELSE;
 111    "end", END;
 112    "change", CHANGE;
 113    "modify", MODIFY;
 114    "delayed", DELAYED;
 115    "enum", ENUM;
 116   ] in (* more *)
 117   let all token l = k := !k @ List.map (fun x -> x,token) l in
 118   all (FUNCTION (T.Int,true)) ["max"; "min"; "count";"sum";"avg"];
 119   all (FUNCTION (T.Int,false)) ["length"; "random";"unix_timestamp"];
 120   all (FUNCTION (T.Int,false)) ["least"; "greatest"];
 121   all (FUNCTION (T.Text,false)) ["concat";"lower";"upper"];
 122   all (FUNCTION (T.Any,false)) ["coalesce"];
 123   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 124   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 125   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 126   all JOIN_TYPE1 ["left";"right";"full"];
 127   all JOIN_TYPE2 ["inner";"outer"];
 128   all LIKE_OP ["glob";"regexp";"match"];
 129   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 130 (* standard built-in types
 131       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 132       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 133       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 134       FLOAT, REAL, DOUBLE PRECISION,
 135       BOOLEAN,
 136       DATE, TIME, TIMESTAMP, INTERVAL
 137     *)
 138   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 139   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 140   all T_INTEGER ["number"]; (* oracle *)
 141   all T_BOOLEAN ["bool";"boolean"];
 142   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 143   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 144   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 145   all T_TEXT ["varchar2"]; (* oracle *)
 146   all T_DATETIME ["datetime";"year";];
 147   !k
 148
 149 (*
 150   Q: Why not convert all input to lowercase before lexing?
 151   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 152 *)
 153
 154 module Keywords = Map.Make(String)
 155
 156 let keywords =
 157   let add map (k,v) =
 158     let k = String.lowercase k in
 159     if Keywords.mem k map then
 160       failwith (sprintf "Lexeme %s is already associated with keyword." k)
 161     else
 162       Keywords.add k v map
 163   in
 164   List.fold_left add Keywords.empty keywords
 165
 166 (* FIXME case sensitivity??! *)
 167
 168 let get_ident str =
 169   let str = String.lowercase str in
 170   try Keywords.find str keywords with Not_found -> IDENT str
 171
 172 let ident str = IDENT (String.lowercase str)
 173
 174 let as_literal ch s =
 175   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 176   sprintf "%c%s%c" ch s ch
 177 }
 178
 179 let digit = ['0'-'9']
 180 let alpha = ['a'-'z' 'A'-'Z']
 181 let ident = (alpha) (alpha | digit | '_' )*
 182 let wsp = [' ' '\r' '\t']
 183 let cmnt = "--" | "//" | "#"
 184
 185 (* extract separate statements *)
 186 rule ruleStatement = parse
 187   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 188   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 189   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 190   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 191   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 192   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 193   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 194   | ';' { `Semicolon }
 195   | [^ ';'] as c { `Char c }
 196   | eof { `Eof }
 197 and
 198 (* extract tail of the input *)
 199 ruleTail acc = parse
 200   | eof { acc }
 201   | _* as str { ruleTail (acc ^ str) lexbuf }
 202 and
 203 ruleMain = parse
 204   | wsp   { ruleMain lexbuf }
 205   (* update line number *)
 206   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 207
 208   | '('         { LPAREN }
 209   | ')'         { RPAREN }
 210   | ','   { COMMA }
 211   | '.'   { DOT }
 212
 213   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 214   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 215
 216   | "*" { ASTERISK }
 217   | "=" { EQUAL }
 218   | "!" { EXCL }
 219   | "~" { TILDE }
 220   | "||" { CONCAT_OP }
 221   | "+" { PLUS }
 222   | "-" { MINUS }
 223
 224   | "/" | "%" { NUM_DIV_OP }
 225   | "<<" | ">>" | "|" | "&" { NUM_BIT_OP }
 226   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 227   | "<>" | "!=" | "==" { NUM_EQ_OP }
 228
 229   | "?" { PARAM (None,pos lexbuf) }
 230   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 231
 232   | '"' { ident (ruleInQuotes "" lexbuf) }
 233   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 234   | "`" { ident (ruleInBackQuotes "" lexbuf) }
 235   | "[" { ident (ruleInBrackets "" lexbuf) }
 236   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 237
 238   | ident as str { get_ident str }
 239   | digit+ as str { INTEGER (int_of_string str) }
 240   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 241   | eof         { EOF }
 242   | _   { error lexbuf "ruleMain" }
 243 and
 244 (* FIXME factor out all that ruleIn* rules *)
 245 ruleInQuotes acc = parse
 246   | '"'         { acc }
 247   | eof         { error lexbuf "no terminating quote" }
 248   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 249   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 250   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 251   | _           { error lexbuf "ruleInQuotes" }
 252 and
 253 ruleInBrackets acc = parse
 254   | ']'         { acc }
 255   | eof         { error lexbuf "no terminating bracket" }
 256   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 257 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 258   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 259   | _           { error lexbuf "ruleInBrackets" }
 260 and
 261 ruleInSingleQuotes acc = parse
 262   | '\''              { acc }
 263   | eof         { error lexbuf "no terminating single quote" }
 264   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 265   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 266   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 267   | _           { error lexbuf "ruleInSingleQuotes" }
 268 and
 269 ruleInBackQuotes acc = parse
 270   | '`'         { acc }
 271   | eof         { error lexbuf "no terminating back quote" }
 272   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 273   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 274   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 275   | _           { error lexbuf "ruleInBackQuotes" }
 276 and
 277 ruleComment acc = parse
 278   | '\n'        { advance_line lexbuf; acc }
 279   | eof         { acc }
 280   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 281   | _           { error lexbuf "ruleComment"; }
 282 and
 283 ruleCommentMulti acc = parse
 284   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 285   | "*/"        { acc }
 286   | "*"
 287   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 288   | _           { error lexbuf "ruleCommentMulti" }
 289
 290 {
 291
 292   let parse_rule lexbuf =
 293     let module P = Parser_state in
 294     let token = ruleMain lexbuf in
 295     match !P.mode with
 296     | P.Normal -> token
 297     | P.Ignore ->
 298 (*         eprintf "ignored: %s\n" (lexeme lexbuf); *)
 299       if (token = EOF) then token else IGNORED
 300
 301 }
 302