10 prerr_endline (sprintf "Lexer error : %s" callerID);
12 raise Parsing.Parse_error
14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
16 let advance_line_pos pos =
17 { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
19 let advance_line lexbuf =
20 lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
40 "precision",PRECISION;
48 "character",CHARACTER;
79 "intersect",INTERSECT;
81 "temporary",TEMPORARY;
88 "references",REFERENCES;
92 "timestamp",TIMESTAMP;
99 "constraint",CONSTRAINT;
128 "interval", INTERVAL;
129 "microsecond", MICROSECOND;
138 "second_microsecond", SECOND_MICROSECOND;
139 "minute_microsecond", MINUTE_MICROSECOND;
140 "minute_second", MINUTE_SECOND;
141 "hour_microsecond", HOUR_MICROSECOND;
142 "hour_second", HOUR_SECOND;
143 "hour_minute", HOUR_MINUTE;
144 "day_microsecond", DAY_MICROSECOND;
145 "day_second", DAY_SECOND;
146 "day_minute", DAY_MINUTE;
147 "day_hour", DAY_HOUR;
148 "year_month", YEAR_MONTH;
151 "duplicate", DUPLICATE;
152 "function", FUNCTION;
153 "procedure", PROCEDURE;
157 "language", LANGUAGE;
158 "substring", SUBSTRING;
161 let all token l = k := !k @ List.map (fun x -> x,token) l in
162 all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
163 all DATETIME_FUNC ["getdate"]; (* mssql? *)
164 all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
165 all JOIN_TYPE1 ["left";"right";"full"];
166 all JOIN_TYPE2 ["inner";"outer"];
167 all LIKE_OP ["glob";"regexp";"match"];
168 all AUTOINCREMENT ["autoincrement";"auto_increment"];
169 (* standard built-in types
170 CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
171 BINARY, BINARY VARYING, BINARY LARGE OBJECT,
172 NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
173 FLOAT, REAL, DOUBLE PRECISION,
175 DATE, TIME, TIMESTAMP, INTERVAL
177 all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
178 all T_INTEGER ["numeric";"decimal";"dec";"fixed"];
179 all T_INTEGER ["number"]; (* oracle *)
180 all T_BOOLEAN ["bool";"boolean"];
181 all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
182 all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
183 all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
184 all T_TEXT ["varchar2"]; (* oracle *)
185 all T_DATETIME ["datetime"];
186 all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
190 Q: Why not convert all input to lowercase before lexing?
191 A: Sometimes SQL is case-sensitive, also string contents should be preserved
194 module Keywords = Map.Make(String)
198 let k = String.lowercase k in
199 if Keywords.mem k map then
200 failwith (sprintf "Lexeme %s is already associated with keyword." k)
204 List.fold_left add Keywords.empty keywords
206 (* FIXME case sensitivity??! *)
209 let str = String.lowercase str in
210 try Keywords.find str keywords with Not_found -> IDENT str
212 let ident str = IDENT (String.lowercase str)
214 let as_literal ch s =
215 let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
216 sprintf "%c%s%c" ch s ch
219 let digit = ['0'-'9']
220 let alpha = ['a'-'z' 'A'-'Z']
221 let ident = (alpha) (alpha | digit | '_' )*
222 let wsp = [' ' '\r' '\t']
223 let cmnt = "--" | "//" | "#"
225 (* extract separate statements *)
226 rule ruleStatement = parse
227 | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
228 | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
229 | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
230 | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
231 | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
232 | "$" (ident? as tag) "$" { let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag) }
233 | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
234 | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
236 | [^ ';'] as c { `Char c }
239 (* extract tail of the input *)
242 | _* as str { ruleTail (acc ^ str) lexbuf }
245 | wsp { ruleMain lexbuf }
246 (* update line number *)
247 | '\n' { advance_line lexbuf; ruleMain lexbuf}
253 | '{' { LCURLY (lexeme_start lexbuf) }
254 | '}' { RCURLY (lexeme_start lexbuf) }
256 | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
257 | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
267 | "/" | "%" { NUM_DIV_OP }
268 | "<<" | ">>" { NUM_BIT_SHIFT }
270 | "&" { NUM_BIT_AND }
271 | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
272 | "<>" | "!=" | "==" { NUM_EQ_OP }
273 | "<=>" { NOT_DISTINCT_OP }
275 | "?" { PARAM (None,pos lexbuf) }
276 | [':' '@'] (ident as str) { PARAM (Some str,pos lexbuf) }
278 | '"' { ident (ruleInQuotes "" lexbuf) }
279 | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
280 (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
281 | "$" (ident? as tag) "$" { TEXT (ruleInDollarQuotes tag "" lexbuf) }
282 | "`" { ident (ruleInBackQuotes "" lexbuf) }
283 | "[" { ident (ruleInBrackets "" lexbuf) }
284 | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
286 | ident as str { get_ident str }
287 | digit+ as str { INTEGER (int_of_string str) }
288 | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
290 | _ { error lexbuf "ruleMain" }
292 (* FIXME factor out all that ruleIn* rules *)
293 ruleInQuotes acc = parse
295 | eof { error lexbuf "no terminating quote" }
296 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
297 | "\"\"" { ruleInQuotes (acc^"\"") lexbuf }
298 | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
299 | _ { error lexbuf "ruleInQuotes" }
301 ruleInBrackets acc = parse
303 | eof { error lexbuf "no terminating bracket" }
304 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
305 (* | "\"\"" { ruleInQuotes (acc ^ "\"") lexbuf } *)
306 | [^']' '\n']+ { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
307 | _ { error lexbuf "ruleInBrackets" }
309 ruleInSingleQuotes acc = parse
311 | eof { error lexbuf "no terminating single quote" }
312 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
313 | "''" { ruleInSingleQuotes (acc ^ "'") lexbuf }
314 | [^'\'' '\n']+ { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
315 | _ { error lexbuf "ruleInSingleQuotes" }
317 ruleInBackQuotes acc = parse
319 | eof { error lexbuf "no terminating back quote" }
320 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
321 | "``" { ruleInBackQuotes (acc ^ "`") lexbuf }
322 | [^'`' '\n']+ { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
323 | _ { error lexbuf "ruleInBackQuotes" }
325 ruleInDollarQuotes tag acc = parse
326 | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
327 | eof { error lexbuf "no terminating dollar quote" }
328 | '\n' { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
329 (* match one char at a time to make sure delimiter matches longer *)
330 | [^'\n'] { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
331 | _ { error lexbuf "ruleInDollarQuotes" }
333 ruleComment acc = parse
334 | '\n' { advance_line lexbuf; acc }
336 | [^'\n']+ { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
337 | _ { error lexbuf "ruleComment"; }
339 ruleCommentMulti acc = parse
340 | '\n' { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
343 | [^'\n' '*']+ { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
344 | _ { error lexbuf "ruleCommentMulti" }
348 let parse_rule lexbuf =
349 let module P = Parser_state in
350 let token = ruleMain lexbuf in
354 (* eprintf "ignored: %s\n" (lexeme lexbuf); *)
355 if (token = EOF) then token else IGNORED