src/sql_lexer.mll

   1
   2 {
   3   open Sql_parser
   4   open Lexing
   5   open ExtLib
   6   module T = Sql.Type
   7
   8 let error _ callerID =
   9   Error.log "Lexer error : %s" callerID;
  10 (*      update_pos buf;*)
  11         raise Parsing.Parse_error
  12
  13 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
  14
  15 let advance_line_pos pos =
  16   { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
  17
  18 let advance_line lexbuf =
  19   lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
  20
  21 let keywords =
  22   let k = ref [
  23    "as",AS;
  24    "on",ON;
  25    "conflict",CONFLICT;
  26    "using",USING;
  27    "natural",NATURAL;
  28    "join",JOIN;
  29    "isnull",TEST_NULL;
  30    "notnull",TEST_NULL;
  31    "between",BETWEEN;
  32    "and",AND;
  33    "escape",ESCAPE;
  34    "not",NOT;
  35    "null",NULL;
  36    "unique",UNIQUE;
  37    "primary",PRIMARY;
  38    "key",KEY;
  39    "default",DEFAULT;
  40    "precision",PRECISION;
  41    "varying",VARYING;
  42    "charset",CHARSET;
  43    "collate",COLLATE;
  44    "national",NATIONAL;
  45    "ascii",ASCII;
  46    "unicode",UNICODE;
  47    "distinct",DISTINCT;
  48    "character",CHARACTER;
  49    "binary",BINARY;
  50    "bytea",BINARY;
  51    "all",ALL;
  52    "any",ANY;
  53    "some",SOME;
  54    "order",ORDER;
  55    "by",BY;
  56    "limit",LIMIT;
  57    "desc",DESC;
  58    "asc",ASC;
  59    "offset",OFFSET;
  60    "select",SELECT;
  61    "create",CREATE;
  62    "table",TABLE;
  63    "view",VIEW;
  64    "insert",INSERT;
  65    "replace",REPLACE;
  66    "update",UPDATE;
  67    "delete",DELETE;
  68    "from",FROM;
  69    "or",OR;
  70    "into",INTO;
  71    "values",VALUES;
  72    "where",WHERE;
  73    "set",SET;
  74    "in",IN;
  75    "group",GROUP;
  76    "having",HAVING;
  77    "union",UNION;
  78    "except",EXCEPT;
  79    "intersect",INTERSECT;
  80    "cross",CROSS;
  81    "temporary",TEMPORARY;
  82    "if",IF;
  83    "exists",EXISTS;
  84    "foreign",FOREIGN;
  85    "global",GLOBAL;
  86    "local",LOCAL;
  87    "value",VALUE;
  88    "references",REFERENCES;
  89    "check",CHECK;
  90    "date",DATE;
  91    "time",TIME;
  92    "timestamp",TIMESTAMP;
  93    "alter",ALTER;
  94    "add",ADD;
  95    "cascade",CASCADE;
  96    "restrict",RESTRICT;
  97    "drop",DROP;
  98    "constraint",CONSTRAINT;
  99    "after",AFTER;
 100    "index",INDEX;
 101    "fulltext",FULLTEXT;
 102    "unsigned",UNSIGNED;
 103    "first",FIRST;
 104    "column",COLUMN;
 105    "like", LIKE;
 106    "case", CASE;
 107    "when", WHEN;
 108    "then", THEN;
 109    "else", ELSE;
 110    "end", END;
 111    "change", CHANGE;
 112    "modify", MODIFY;
 113    "delayed", DELAYED;
 114    "enum", ENUM;
 115   ] in (* more *)
 116   let all token l = k := !k @ List.map (fun x -> x,token) l in
 117   all (FUNCTION (T.Int,true)) ["max"; "min"; "count";"sum";"avg"];
 118   all (FUNCTION (T.Int,false)) ["length"; "random";"unix_timestamp"];
 119   all (FUNCTION (T.Int,false)) ["least"; "greatest"];
 120   all (FUNCTION (T.Text,false)) ["concat";"lower";"upper"];
 121   all (FUNCTION (T.Any,false)) ["coalesce"];
 122   all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
 123   all DATETIME_FUNC ["getdate"]; (* mssql? *)
 124   all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
 125   all JOIN_TYPE1 ["left";"right";"full"];
 126   all JOIN_TYPE2 ["inner";"outer"];
 127   all LIKE_OP ["glob";"regexp";"match"];
 128   all AUTOINCREMENT ["autoincrement";"auto_increment"];
 129 (* standard built-in types
 130       CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
 131       BINARY, BINARY VARYING, BINARY LARGE OBJECT,
 132       NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
 133       FLOAT, REAL, DOUBLE PRECISION,
 134       BOOLEAN,
 135       DATE, TIME, TIMESTAMP, INTERVAL
 136     *)
 137   all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
 138   all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
 139   all T_INTEGER ["number"]; (* oracle *)
 140   all T_BOOLEAN ["bool";"boolean"];
 141   all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
 142   all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
 143   all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
 144   all T_TEXT ["varchar2"]; (* oracle *)
 145   all T_DATETIME ["datetime";"year";];
 146   !k
 147
 148 (*
 149   Q: Why not convert all input to lowercase before lexing?
 150   A: Sometimes SQL is case-sensitive, also string contents should be preserved
 151 *)
 152
 153 module Keywords = Map.Make(String)
 154
 155 let keywords =
 156   let add map (k,v) =
 157     let k = String.lowercase k in
 158     if Keywords.mem k map then
 159       failwith (Printf.sprintf "Lexeme %s is already associated with keyword." k)
 160     else
 161       Keywords.add k v map
 162   in
 163   List.fold_left add Keywords.empty keywords
 164
 165 (* FIXME case sensitivity??! *)
 166
 167 let get_ident str =
 168   let str = String.lowercase str in
 169   try Keywords.find str keywords with Not_found -> IDENT str
 170
 171 let ident str = IDENT (String.lowercase str)
 172
 173 let as_literal ch s =
 174   let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
 175   Printf.sprintf "%c%s%c" ch s ch
 176 }
 177
 178 let digit = ['0'-'9']
 179 let alpha = ['a'-'z' 'A'-'Z']
 180 let ident = (alpha) (alpha | digit | '_' )*
 181 let wsp = [' ' '\r' '\t']
 182 let cmnt = "--" | "//" | "#"
 183
 184 (* extract separate statements *)
 185 rule ruleStatement = parse
 186   | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
 187   | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
 188   | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
 189   | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
 190   | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
 191   | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
 192   | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
 193   | ';' { `Semicolon }
 194   | [^ ';'] as c { `Char c }
 195   | eof { `Eof }
 196 and
 197 (* extract tail of the input *)
 198 ruleTail acc = parse
 199   | eof { acc }
 200   | _* as str { ruleTail (acc ^ str) lexbuf }
 201 and
 202 ruleMain = parse
 203   | wsp   { ruleMain lexbuf }
 204   (* update line number *)
 205   | '\n'  { advance_line lexbuf; ruleMain lexbuf}
 206
 207   | '('         { LPAREN }
 208   | ')'         { RPAREN }
 209   | ','   { COMMA }
 210   | '.'   { DOT }
 211
 212   | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
 213   | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
 214
 215   | "*" { ASTERISK }
 216   | "=" { EQUAL }
 217   | "!" { EXCL }
 218   | "~" { TILDE }
 219   | "||" { CONCAT_OP }
 220   | "+" { PLUS }
 221   | "-" { MINUS }
 222
 223   | "/" | "%" { NUM_DIV_OP }
 224   | "<<" | ">>" | "|" | "&" { NUM_BIT_OP }
 225   | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
 226   | "<>" | "!=" | "==" { NUM_EQ_OP }
 227
 228   | "?" { PARAM (None,pos lexbuf) }
 229   | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
 230
 231   | '"' { ident (ruleInQuotes "" lexbuf) }
 232   | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
 233   | "`" { ident (ruleInBackQuotes "" lexbuf) }
 234   | "[" { ident (ruleInBrackets "" lexbuf) }
 235   | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
 236
 237   | ident as str { get_ident str }
 238   | digit+ as str { INTEGER (int_of_string str) }
 239   | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
 240   | eof         { EOF }
 241   | _   { error lexbuf "ruleMain" }
 242 and
 243 (* FIXME factor out all that ruleIn* rules *)
 244 ruleInQuotes acc = parse
 245   | '"'         { acc }
 246   | eof         { error lexbuf "no terminating quote" }
 247   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
 248   | "\"\""      { ruleInQuotes (acc^"\"") lexbuf }
 249   | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
 250   | _           { error lexbuf "ruleInQuotes" }
 251 and
 252 ruleInBrackets acc = parse
 253   | ']'         { acc }
 254   | eof         { error lexbuf "no terminating bracket" }
 255   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
 256 (*   | "\"\""      { ruleInQuotes (acc ^ "\"") lexbuf } *)
 257   | [^']' '\n']+  { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
 258   | _           { error lexbuf "ruleInBrackets" }
 259 and
 260 ruleInSingleQuotes acc = parse
 261   | '\''              { acc }
 262   | eof         { error lexbuf "no terminating single quote" }
 263   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
 264   | "''"        { ruleInSingleQuotes (acc ^ "'") lexbuf }
 265   | [^'\'' '\n']+  { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
 266   | _           { error lexbuf "ruleInSingleQuotes" }
 267 and
 268 ruleInBackQuotes acc = parse
 269   | '`'         { acc }
 270   | eof         { error lexbuf "no terminating back quote" }
 271   | '\n'        { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
 272   | "``"        { ruleInBackQuotes (acc ^ "`") lexbuf }
 273   | [^'`' '\n']+  { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
 274   | _           { error lexbuf "ruleInBackQuotes" }
 275 and
 276 ruleComment acc = parse
 277   | '\n'        { advance_line lexbuf; acc }
 278   | eof         { acc }
 279   | [^'\n']+    { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
 280   | _           { error lexbuf "ruleComment"; }
 281 and
 282 ruleCommentMulti acc = parse
 283   | '\n'        { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
 284   | "*/"        { acc }
 285   | "*"
 286   | [^'\n' '*']+    { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
 287   | _           { error lexbuf "ruleCommentMulti" }
 288
 289 {
 290
 291   let parse_rule lexbuf =
 292     let module P = Parser_state in
 293     let token = ruleMain lexbuf in
 294     match !P.mode with
 295     | P.Normal -> token
 296     | P.Ignore ->
 297 (*         Printf.eprintf "ignored: %s\n" (lexeme lexbuf); *)
 298       if (token = EOF) then token else IGNORED
 299
 300 }
 301