10 prerr_endline (sprintf "Lexer error : %s" callerID);
12 raise Parsing.Parse_error
14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
16 let advance_line_pos pos =
17 { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
19 let advance_line lexbuf =
20 lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
39 "precision",PRECISION;
47 "character",CHARACTER;
78 "intersect",INTERSECT;
80 "temporary",TEMPORARY;
87 "references",REFERENCES;
91 "timestamp",TIMESTAMP;
97 "constraint",CONSTRAINT;
124 "interval", INTERVAL;
125 "microsecond", MICROSECOND;
134 "second_microsecond", SECOND_MICROSECOND;
135 "minute_microsecond", MINUTE_MICROSECOND;
136 "minute_second", MINUTE_SECOND;
137 "hour_microsecond", HOUR_MICROSECOND;
138 "hour_second", HOUR_SECOND;
139 "hour_minute", HOUR_MINUTE;
140 "day_microsecond", DAY_MICROSECOND;
141 "day_second", DAY_SECOND;
142 "day_minute", DAY_MINUTE;
143 "day_hour", DAY_HOUR;
144 "year_month", YEAR_MONTH;
147 "duplicate", DUPLICATE;
148 "function", FUNCTION;
149 "procedure", PROCEDURE;
153 "language", LANGUAGE;
155 let all token l = k := !k @ List.map (fun x -> x,token) l in
156 all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
157 all DATETIME_FUNC ["getdate"]; (* mssql? *)
158 all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
159 all JOIN_TYPE1 ["left";"right";"full"];
160 all JOIN_TYPE2 ["inner";"outer"];
161 all LIKE_OP ["glob";"regexp";"match"];
162 all AUTOINCREMENT ["autoincrement";"auto_increment"];
163 (* standard built-in types
164 CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
165 BINARY, BINARY VARYING, BINARY LARGE OBJECT,
166 NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
167 FLOAT, REAL, DOUBLE PRECISION,
169 DATE, TIME, TIMESTAMP, INTERVAL
171 all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
172 all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
173 all T_INTEGER ["number"]; (* oracle *)
174 all T_BOOLEAN ["bool";"boolean"];
175 all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
176 all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
177 all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
178 all T_TEXT ["varchar2"]; (* oracle *)
179 all T_DATETIME ["datetime"];
180 all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
184 Q: Why not convert all input to lowercase before lexing?
185 A: Sometimes SQL is case-sensitive, also string contents should be preserved
188 module Keywords = Map.Make(String)
192 let k = String.lowercase k in
193 if Keywords.mem k map then
194 failwith (sprintf "Lexeme %s is already associated with keyword." k)
198 List.fold_left add Keywords.empty keywords
200 (* FIXME case sensitivity??! *)
203 let str = String.lowercase str in
204 try Keywords.find str keywords with Not_found -> IDENT str
206 let ident str = IDENT (String.lowercase str)
208 let as_literal ch s =
209 let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
210 sprintf "%c%s%c" ch s ch
213 let digit = ['0'-'9']
214 let alpha = ['a'-'z' 'A'-'Z']
215 let ident = (alpha) (alpha | digit | '_' )*
216 let wsp = [' ' '\r' '\t']
217 let cmnt = "--" | "//" | "#"
219 (* extract separate statements *)
220 rule ruleStatement = parse
221 | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
222 | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
223 | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
224 | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
225 | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
226 | "$" (ident? as tag) "$" { let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag) }
227 | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
228 | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
230 | [^ ';'] as c { `Char c }
233 (* extract tail of the input *)
236 | _* as str { ruleTail (acc ^ str) lexbuf }
239 | wsp { ruleMain lexbuf }
240 (* update line number *)
241 | '\n' { advance_line lexbuf; ruleMain lexbuf}
248 | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
249 | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
259 | "/" | "%" { NUM_DIV_OP }
260 | "<<" | ">>" | "|" | "&" { NUM_BIT_OP }
261 | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
262 | "<>" | "!=" | "==" { NUM_EQ_OP }
263 | "<=>" { NOT_DISTINCT_OP }
265 | "?" { PARAM (None,pos lexbuf) }
266 | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
268 | '"' { ident (ruleInQuotes "" lexbuf) }
269 | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
270 (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
271 | "$" (ident? as tag) "$" { TEXT (ruleInDollarQuotes tag "" lexbuf) }
272 | "`" { ident (ruleInBackQuotes "" lexbuf) }
273 | "[" { ident (ruleInBrackets "" lexbuf) }
274 | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
276 | ident as str { get_ident str }
277 | digit+ as str { INTEGER (int_of_string str) }
278 | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
280 | _ { error lexbuf "ruleMain" }
282 (* FIXME factor out all that ruleIn* rules *)
283 ruleInQuotes acc = parse
285 | eof { error lexbuf "no terminating quote" }
286 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
287 | "\"\"" { ruleInQuotes (acc^"\"") lexbuf }
288 | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
289 | _ { error lexbuf "ruleInQuotes" }
291 ruleInBrackets acc = parse
293 | eof { error lexbuf "no terminating bracket" }
294 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
295 (* | "\"\"" { ruleInQuotes (acc ^ "\"") lexbuf } *)
296 | [^']' '\n']+ { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
297 | _ { error lexbuf "ruleInBrackets" }
299 ruleInSingleQuotes acc = parse
301 | eof { error lexbuf "no terminating single quote" }
302 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
303 | "''" { ruleInSingleQuotes (acc ^ "'") lexbuf }
304 | [^'\'' '\n']+ { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
305 | _ { error lexbuf "ruleInSingleQuotes" }
307 ruleInBackQuotes acc = parse
309 | eof { error lexbuf "no terminating back quote" }
310 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
311 | "``" { ruleInBackQuotes (acc ^ "`") lexbuf }
312 | [^'`' '\n']+ { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
313 | _ { error lexbuf "ruleInBackQuotes" }
315 ruleInDollarQuotes tag acc = parse
316 | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
317 | eof { error lexbuf "no terminating dollar quote" }
318 | '\n' { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
319 (* match one char at a time to make sure delimiter matches longer *)
320 | [^'\n'] { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
321 | _ { error lexbuf "ruleInDollarQuotes" }
323 ruleComment acc = parse
324 | '\n' { advance_line lexbuf; acc }
326 | [^'\n']+ { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
327 | _ { error lexbuf "ruleComment"; }
329 ruleCommentMulti acc = parse
330 | '\n' { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
333 | [^'\n' '*']+ { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
334 | _ { error lexbuf "ruleCommentMulti" }
338 let parse_rule lexbuf =
339 let module P = Parser_state in
340 let token = ruleMain lexbuf in
344 (* eprintf "ignored: %s\n" (lexeme lexbuf); *)
345 if (token = EOF) then token else IGNORED