10 prerr_endline (sprintf "Lexer error : %s" callerID);
12 raise Parsing.Parse_error
14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
16 let advance_line_pos pos =
17 { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
19 let advance_line lexbuf =
20 lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
22 let keep_lexeme_start lexbuf f =
23 let start_p = lexeme_start_p lexbuf in
25 lexbuf.lex_start_p <- start_p;
49 "character",CHARACTER;
56 "constraint",CONSTRAINT;
64 "day_microsecond", DAY_MICROSECOND;
65 "day_minute", DAY_MINUTE;
66 "day_second", DAY_SECOND;
74 "duplicate", DUPLICATE;
83 "first_value",FIRST_VALUE;
84 "following", FOLLOWING;
94 "hour_microsecond", HOUR_MICROSECOND;
95 "hour_minute", HOUR_MINUTE;
96 "hour_second", HOUR_SECOND;
101 "intersect",INTERSECT;
102 "interval", INTERVAL;
108 "language", LANGUAGE;
109 "last_value", LAST_VALUE;
115 "microsecond", MICROSECOND;
117 "minute_microsecond", MINUTE_MICROSECOND;
118 "minute_second", MINUTE_SECOND;
135 "partition",PARTITION;
136 "preceding", PRECEDING;
137 "precision",PRECISION;
139 "procedure", PROCEDURE;
142 "references",REFERENCES;
150 "second_microsecond", SECOND_MICROSECOND;
157 "substring", SUBSTRING;
159 "temporary",TEMPORARY;
162 "timestamp",TIMESTAMP;
165 "unbounded", UNBOUNDED;
180 "year_month", YEAR_MONTH;
181 "generated", GENERATED;
186 let all token l = k := !k @ List.map (fun x -> x,token) l in
187 all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
188 all DATETIME_FUNC ["getdate"]; (* mssql? *)
189 all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
190 all JOIN_TYPE1 ["left";"right";"full"];
191 all JOIN_TYPE2 ["inner";"outer"];
192 all LIKE_OP ["glob";"regexp";"match"];
193 all AUTOINCREMENT ["autoincrement";"auto_increment"];
194 (* standard built-in types
195 CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
196 BINARY, BINARY VARYING, BINARY LARGE OBJECT,
197 NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
198 FLOAT, REAL, DOUBLE PRECISION,
200 DATE, TIME, TIMESTAMP, INTERVAL
202 all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
203 all T_DECIMAL ["numeric";"decimal";"dec";"fixed"];
204 all T_INTEGER ["number"]; (* oracle *)
205 all T_BOOLEAN ["bool";"boolean"];
206 all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
207 all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
208 all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
209 all T_TEXT ["varchar2"]; (* oracle *)
210 all T_DATETIME ["datetime"];
211 all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
215 Q: Why not convert all input to lowercase before lexing?
216 A: Sometimes SQL is case-sensitive, also string contents should be preserved
219 module Keywords = Map.Make(String)
223 let k = String.lowercase_ascii k in
224 if Keywords.mem k map then
225 failwith (sprintf "Lexeme %s is already associated with keyword." k)
229 List.fold_left add Keywords.empty keywords
231 (* FIXME case sensitivity??! *)
234 let str = String.lowercase_ascii str in
235 try Keywords.find str keywords with Not_found -> IDENT str
237 let ident str = IDENT (String.lowercase_ascii str)
239 let as_literal ch s =
240 let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
241 sprintf "%c%s%c" ch s ch
244 let digit = ['0'-'9']
245 let alpha = ['a'-'z' 'A'-'Z']
246 let ident = (alpha) (alpha | digit | '_' )*
247 let wsp = [' ' '\r' '\t']
248 let cmnt = "--" | "//" | "#"
250 (* extract separate statements *)
251 rule ruleStatement = parse
252 | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
253 | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
254 | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
255 | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
256 | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
257 | "$" (ident? as tag) "$" {
258 keep_lexeme_start lexbuf (fun () -> let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag))
260 | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
261 | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
263 | [^ ';'] as c { `Char c }
266 (* extract tail of the input *)
269 | _* as str { ruleTail (acc ^ str) lexbuf }
272 | wsp { ruleMain lexbuf }
273 (* update line number *)
274 | '\n' { advance_line lexbuf; ruleMain lexbuf}
280 | '{' { LCURLY (lexeme_start lexbuf) }
281 | '}' { RCURLY (lexeme_start lexbuf) }
283 | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
284 | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
294 | "/" | "%" { NUM_DIV_OP }
295 | "<<" | ">>" { NUM_BIT_SHIFT }
297 | "&" { NUM_BIT_AND }
298 | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
299 | "<>" | "!=" | "==" { NUM_EQ_OP }
300 | "<=>" { NOT_DISTINCT_OP }
302 | "?" { PARAM { label=None; pos = pos lexbuf } }
303 | [':' '@'] (ident as str) { PARAM { label = Some str; pos = pos lexbuf } }
305 | '"' { keep_lexeme_start lexbuf (fun () -> ident (ruleInQuotes "" lexbuf)) }
306 | "'" { keep_lexeme_start lexbuf (fun () -> TEXT (ruleInSingleQuotes "" lexbuf)) }
307 (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
308 | "$" (ident? as tag) "$" { keep_lexeme_start lexbuf (fun () -> TEXT (ruleInDollarQuotes tag "" lexbuf)) }
309 | "`" { keep_lexeme_start lexbuf (fun () -> ident (ruleInBackQuotes "" lexbuf)) }
310 | "[" { keep_lexeme_start lexbuf (fun () -> ident (ruleInBrackets "" lexbuf)) }
311 | ['x' 'X'] "'" { keep_lexeme_start lexbuf (fun () -> BLOB (ruleInSingleQuotes "" lexbuf)) }
313 | ident as str { if !Parser_state.mode = Ident then IDENT str (* no keywords, preserve case *) else get_ident str }
314 | digit+ as str { INTEGER (int_of_string str) }
315 | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
317 | _ { error lexbuf "ruleMain" }
319 (* FIXME factor out all that ruleIn* rules *)
320 ruleInQuotes acc = parse
322 | eof { error lexbuf "no terminating quote" }
323 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
324 | "\"\"" { ruleInQuotes (acc^"\"") lexbuf }
325 | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
326 | _ { error lexbuf "ruleInQuotes" }
328 ruleInBrackets acc = parse
330 | eof { error lexbuf "no terminating bracket" }
331 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
332 (* | "\"\"" { ruleInQuotes (acc ^ "\"") lexbuf } *)
333 | [^']' '\n']+ { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
334 | _ { error lexbuf "ruleInBrackets" }
336 ruleInSingleQuotes acc = parse
338 | eof { error lexbuf "no terminating single quote" }
339 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
340 | "''" { ruleInSingleQuotes (acc ^ "'") lexbuf }
341 | [^'\'' '\n']+ { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
342 | _ { error lexbuf "ruleInSingleQuotes" }
344 ruleInBackQuotes acc = parse
346 | eof { error lexbuf "no terminating back quote" }
347 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
348 | "``" { ruleInBackQuotes (acc ^ "`") lexbuf }
349 | [^'`' '\n']+ { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
350 | _ { error lexbuf "ruleInBackQuotes" }
352 ruleInDollarQuotes tag acc = parse
353 | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
354 | eof { error lexbuf "no terminating dollar quote" }
355 | '\n' { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
356 (* match one char at a time to make sure delimiter matches longer *)
357 | [^'\n'] { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
358 | _ { error lexbuf "ruleInDollarQuotes" }
360 ruleComment acc = parse
361 | '\n' { advance_line lexbuf; acc }
363 | [^'\n']+ { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
364 | _ { error lexbuf "ruleComment"; }
366 ruleCommentMulti acc = parse
367 | '\n' { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
370 | [^'\n' '*']+ { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
371 | _ { error lexbuf "ruleCommentMulti" }
375 let parse_rule lexbuf =
376 let token = ruleMain lexbuf in
377 match !Parser_state.mode with
380 (* eprintf "ignored: %s\n" (lexeme lexbuf); *)
381 if token = EOF then token else IGNORED