let error _lexbuf callerID =
  prerr_endline (sprintf "Lexer error: %s" callerID);
  raise Parsing.Parse_error
let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
let advance_line_pos pos =
  { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }

let advance_line lexbuf =
  lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
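(* Mirrors Lexing.new_line: pos_bol becomes the offset of the first character
   of the new line, so a lexeme's column can be recovered later as
   pos_cnum - pos_bol. It must run for every '\n' consumed, including those
   inside strings and comments, or reported line numbers drift. *)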
let keep_lexeme_start lexbuf f =
  let start_p = lexeme_start_p lexbuf in
  let tok = f () in
  lexbuf.lex_start_p <- start_p;
  tok
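(* The ruleIn* helper rules below consume input and move lex_start_p past the
   opening delimiter; wrapping such a call in keep_lexeme_start makes the
   resulting token span the whole literal, starting at the opening quote,
   which keeps reported token positions accurate. *)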
49 "character",CHARACTER;
56 "constraint",CONSTRAINT;
64 "day_microsecond", DAY_MICROSECOND;
65 "day_minute", DAY_MINUTE;
66 "day_second", DAY_SECOND;
74 "duplicate", DUPLICATE;
83 "first_value",FIRST_VALUE;
84 "following", FOLLOWING;
94 "hour_microsecond", HOUR_MICROSECOND;
95 "hour_minute", HOUR_MINUTE;
96 "hour_second", HOUR_SECOND;
101 "intersect",INTERSECT;
102 "interval", INTERVAL;
106 "straight_join",STRAIGHT_JOIN;
109 "language", LANGUAGE;
110 "last_value", LAST_VALUE;
116 "microsecond", MICROSECOND;
118 "minute_microsecond", MINUTE_MICROSECOND;
119 "minute_second", MINUTE_SECOND;
136 "partition",PARTITION;
137 "preceding", PRECEDING;
138 "precision",PRECISION;
140 "procedure", PROCEDURE;
143 "references",REFERENCES;
151 "second_microsecond", SECOND_MICROSECOND;
157 "statement", STATEMENT;
159 "substring", SUBSTRING;
161 "temporary",TEMPORARY;
164 "timestamp",TIMESTAMP;
167 "unbounded", UNBOUNDED;
182 "year_month", YEAR_MONTH;
183 "generated", GENERATED;
194 "algorithm", ALGORITHM;
  let all token l = k := !k @ List.map (fun x -> x, token) l in
  all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now"];
  all DATETIME_FUNC ["getdate"]; (* mssql? *)
  all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
  all LIKE_OP ["glob";"regexp";"match"];
  all AUTOINCREMENT ["autoincrement";"auto_increment"];
  (* standard built-in types
     CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
     BINARY, BINARY VARYING, BINARY LARGE OBJECT,
     NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
     FLOAT, REAL, DOUBLE PRECISION,
     DATE, TIME, TIMESTAMP, INTERVAL
  *)
  all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
  all T_DECIMAL ["numeric";"decimal";"dec";"fixed"];
  all T_INTEGER ["number"]; (* oracle *)
  all T_BOOLEAN ["bool";"boolean"];
  all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
  all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
  all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
  all T_TEXT ["varchar2"]; (* oracle *)
  all T_DATETIME ["datetime"];
  all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
(*
  Q: Why not convert all input to lowercase before lexing?
  A: SQL is sometimes case-sensitive, and string contents must be preserved as written.
*)
module Keywords = Map.Make(String)
let keywords =
  let add map (k, v) =
    let k = String.lowercase_ascii k in
    if Keywords.mem k map then
      failwith (sprintf "Lexeme %s is already associated with a keyword." k)
    else
      Keywords.add k v map
  in
  List.fold_left add Keywords.empty keywords
(* FIXME case sensitivity??! *)
let get_ident str =
  let str = String.lowercase_ascii str in
  try Keywords.find str keywords with Not_found -> IDENT str
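(* Keyword lookup is case-insensitive: get_ident "Partition" yields the
   PARTITION token, while an unknown word such as get_ident "Users" falls
   through to IDENT "users" (note the lowercasing). *)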
let ident str = IDENT (String.lowercase_ascii str)
let as_literal ch s =
  let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
  sprintf "%c%s%c" ch s ch
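(* Re-quotes a string by doubling the quote character inside the body,
   e.g. as_literal '\'' "it's" = "'it''s'". *)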
let digit = ['0'-'9']
let alpha = ['a'-'z' 'A'-'Z']
let ident = (alpha) (alpha | digit | '_' )*
let wsp = [' ' '\r' '\t']
let cmnt = "--" | "//" | "#"
(* extract separate statements *)
rule ruleStatement = parse
  | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
  | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
  | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
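  (* Annotation examples: "-- [sqlgg] name=create_user" and "-- @create_user"
     both yield `Prop ("name","create_user"). *)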
264 | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
265 | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
266 | "$" (ident? as tag) "$" {
267 keep_lexeme_start lexbuf (fun () -> let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag))
269 | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
270 | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
272 | [^ ';'] as c { `Char c }
(* extract tail of the input *)
and ruleTail acc = parse
  | eof { acc }
  | _* as str { ruleTail (acc ^ str) lexbuf }
and ruleMain = parse
  | wsp { ruleMain lexbuf }
  (* update line number *)
  | '\n' { advance_line lexbuf; ruleMain lexbuf }
  | '{' { LCURLY (lexeme_start lexbuf) }
  | '}' { RCURLY (lexeme_start lexbuf) }
  | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
  | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
303 | "/" | "%" { NUM_DIV_OP }
304 | "<<" | ">>" { NUM_BIT_SHIFT }
306 | "&" { NUM_BIT_AND }
307 | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
308 | "<>" | "!=" | "==" { NUM_EQ_OP }
309 | "<=>" { NOT_DISTINCT_OP }
311 | "?" { PARAM { label=None; pos = pos lexbuf } }
312 | [':' '@'] (ident as str) { PARAM { label = Some str; pos = pos lexbuf } }
313 | "::" { DOUBLECOLON }
315 | '"' { keep_lexeme_start lexbuf (fun () -> ident (ruleInQuotes "" lexbuf)) }
316 | "'" { keep_lexeme_start lexbuf (fun () -> TEXT (ruleInSingleQuotes "" lexbuf)) }
317 (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
318 | "$" (ident? as tag) "$" { keep_lexeme_start lexbuf (fun () -> TEXT (ruleInDollarQuotes tag "" lexbuf)) }
319 | "`" { keep_lexeme_start lexbuf (fun () -> ident (ruleInBackQuotes "" lexbuf)) }
320 | "[" { keep_lexeme_start lexbuf (fun () -> ident (ruleInBrackets "" lexbuf)) }
321 | ['x' 'X'] "'" { keep_lexeme_start lexbuf (fun () -> BLOB (ruleInSingleQuotes "" lexbuf)) }
  | ident as str { if !Parser_state.mode = Ident then IDENT str (* no keywords, preserve case *) else get_ident str }
  | digit+ as str { INTEGER (int_of_string str) }
  | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
  | _ { error lexbuf "ruleMain" }
(* FIXME factor out all these ruleIn* rules *)
and ruleInQuotes acc = parse
  | '"' { acc }
  | eof { error lexbuf "no terminating quote" }
  | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
  | "\"\"" { ruleInQuotes (acc ^ "\"") lexbuf }
  | [^'"' '\n']+ as s { ruleInQuotes (acc ^ s) lexbuf }
  | _ { error lexbuf "ruleInQuotes" }
and ruleInBrackets acc = parse
  | ']' { acc }
  | eof { error lexbuf "no terminating bracket" }
  | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
(*   | "\"\"" { ruleInQuotes (acc ^ "\"") lexbuf } *)
  | [^']' '\n']+ { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
  | _ { error lexbuf "ruleInBrackets" }
and ruleInSingleQuotes acc = parse
  | '\'' { acc }
  | eof { error lexbuf "no terminating single quote" }
  | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
  | "''" { ruleInSingleQuotes (acc ^ "'") lexbuf }
  | [^'\'' '\n']+ { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
  | _ { error lexbuf "ruleInSingleQuotes" }
and ruleInBackQuotes acc = parse
  | '`' { acc }
  | eof { error lexbuf "no terminating back quote" }
  | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
  | "``" { ruleInBackQuotes (acc ^ "`") lexbuf }
  | [^'`' '\n']+ { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
  | _ { error lexbuf "ruleInBackQuotes" }
and ruleInDollarQuotes tag acc = parse
  | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
  | eof { error lexbuf "no terminating dollar quote" }
  | '\n' { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
  (* consume one char at a time so that the longer delimiter pattern always wins *)
  | [^'\n'] { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
  | _ { error lexbuf "ruleInDollarQuotes" }
and ruleComment acc = parse
  | '\n' { advance_line lexbuf; acc }
  | eof { acc }
  | [^'\n']+ { ruleComment (acc ^ lexeme lexbuf) lexbuf }
  | _ { error lexbuf "ruleComment" }
and ruleCommentMulti acc = parse
  | '\n' { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
  | "*/" { acc }
  | "*"
  | [^'\n' '*']+ { ruleCommentMulti (acc ^ lexeme lexbuf) lexbuf }
  | _ { error lexbuf "ruleCommentMulti" }
let parse_rule lexbuf =
  let token = ruleMain lexbuf in
  match !Parser_state.mode with
  | Normal | Ident -> token
  | Ignore ->
    (* eprintf "ignored: %s\n" (lexeme lexbuf); *)
    if token = EOF then token else IGNORED
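(* parse_rule is the token-supplier function handed to the generated parser.
   A minimal driver sketch (the entry-point name Sql_parser.input is an
   assumption; check the %start declaration in the .mly):

     let parse sql = Sql_parser.input parse_rule (Lexing.from_string sql)
*)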