10 prerr_endline (sprintf "Lexer error : %s" callerID);
12 raise Parsing.Parse_error
14 let pos lexbuf = (lexeme_start lexbuf, lexeme_end lexbuf)
16 let advance_line_pos pos =
17 { pos with pos_lnum = pos.pos_lnum + 1; pos_bol = pos.pos_cnum; }
19 let advance_line lexbuf =
20 lexbuf.lex_curr_p <- advance_line_pos lexbuf.lex_curr_p
42 "character",CHARACTER;
49 "constraint",CONSTRAINT;
56 "day_microsecond", DAY_MICROSECOND;
57 "day_minute", DAY_MINUTE;
58 "day_second", DAY_SECOND;
66 "duplicate", DUPLICATE;
84 "hour_microsecond", HOUR_MICROSECOND;
85 "hour_minute", HOUR_MINUTE;
86 "hour_second", HOUR_SECOND;
91 "intersect",INTERSECT;
102 "microsecond", MICROSECOND;
104 "minute_microsecond", MINUTE_MICROSECOND;
105 "minute_second", MINUTE_SECOND;
121 "precision",PRECISION;
123 "procedure", PROCEDURE;
125 "references",REFERENCES;
131 "second_microsecond", SECOND_MICROSECOND;
138 "substring", SUBSTRING;
140 "temporary",TEMPORARY;
143 "timestamp",TIMESTAMP;
160 "year_month", YEAR_MONTH;
162 let all token l = k := !k @ List.map (fun x -> x,token) l in
163 all DATETIME_FUNC ["current_date";"current_timestamp";"current_time";"localtime";"localtimestamp";"now";];
164 all DATETIME_FUNC ["getdate"]; (* mssql? *)
165 all CONFLICT_ALGO ["ignore"; "abort"; "fail"; "rollback"];
166 all JOIN_TYPE1 ["left";"right";"full"];
167 all JOIN_TYPE2 ["inner";"outer"];
168 all LIKE_OP ["glob";"regexp";"match"];
169 all AUTOINCREMENT ["autoincrement";"auto_increment"];
170 (* standard built-in types
171 CHARACTER, CHARACTER VARYING, CHARACTER LARGE OBJECT,
172 BINARY, BINARY VARYING, BINARY LARGE OBJECT,
173 NUMERIC, DECIMAL, INTEGER, SMALLINT, BIGINT,
174 FLOAT, REAL, DOUBLE PRECISION,
176 DATE, TIME, TIMESTAMP, INTERVAL
178 all T_INTEGER ["integer";"int";"smallint";"bigint";"tinyint";"mediumint";"middleint";"serial";"identity"];
179 all T_DECIMAL ["numeric";"decimal";"dec";"fixed"];
180 all T_INTEGER ["number"]; (* oracle *)
181 all T_BOOLEAN ["bool";"boolean"];
182 all T_FLOAT ["float";"real";"double";"float4";"float8";"int1";"int2";"int3";"int4";"int8"];
183 all T_BLOB ["blob";"varbinary";"tinyblob";"mediumblob";"longblob"];
184 all T_TEXT ["text";"char";"varchar";"tinytext";"mediumtext";"longtext"];
185 all T_TEXT ["varchar2"]; (* oracle *)
186 all T_DATETIME ["datetime"];
187 all T_UUID ["uuid"]; (* http://www.postgresql.org/docs/9.4/static/datatype-uuid.html *)
191 Q: Why not convert all input to lowercase before lexing?
192 A: Sometimes SQL is case-sensitive, also string contents should be preserved
195 module Keywords = Map.Make(String)
199 let k = String.lowercase k in
200 if Keywords.mem k map then
201 failwith (sprintf "Lexeme %s is already associated with keyword." k)
205 List.fold_left add Keywords.empty keywords
207 (* FIXME case sensitivity??! *)
210 let str = String.lowercase str in
211 try Keywords.find str keywords with Not_found -> IDENT str
213 let ident str = IDENT (String.lowercase str)
215 let as_literal ch s =
216 let s = String.replace_chars (fun x -> String.make (if x = ch then 2 else 1) x) s in
217 sprintf "%c%s%c" ch s ch
220 let digit = ['0'-'9']
221 let alpha = ['a'-'z' 'A'-'Z']
222 let ident = (alpha) (alpha | digit | '_' )*
223 let wsp = [' ' '\r' '\t']
224 let cmnt = "--" | "//" | "#"
226 (* extract separate statements *)
227 rule ruleStatement = parse
228 | ['\n' ' ' '\r' '\t']+ as tok { `Space tok }
229 | cmnt wsp* "[sqlgg]" wsp+ (ident+ as n) wsp* "=" wsp* ([^'\n']* as v) '\n' { `Prop (n,v) }
230 | cmnt wsp* "@" (ident+ as name) [^'\n']* '\n' { `Prop ("name",name) }
231 | '"' { let s = ruleInQuotes "" lexbuf in `Token (as_literal '"' s) }
232 | "'" { let s = ruleInSingleQuotes "" lexbuf in `Token (as_literal '\'' s) }
233 | "$" (ident? as tag) "$" { let s = ruleInDollarQuotes tag "" lexbuf in `Token (sprintf "$%s$%s$%s$" tag s tag) }
234 | cmnt as s { `Comment (s ^ ruleComment "" lexbuf) }
235 | "/*" { `Comment ("/*" ^ ruleCommentMulti "" lexbuf ^ "*/") }
237 | [^ ';'] as c { `Char c }
240 (* extract tail of the input *)
243 | _* as str { ruleTail (acc ^ str) lexbuf }
246 | wsp { ruleMain lexbuf }
247 (* update line number *)
248 | '\n' { advance_line lexbuf; ruleMain lexbuf}
254 | '{' { LCURLY (lexeme_start lexbuf) }
255 | '}' { RCURLY (lexeme_start lexbuf) }
257 | cmnt { ignore (ruleComment "" lexbuf); ruleMain lexbuf }
258 | "/*" { ignore (ruleCommentMulti "" lexbuf); ruleMain lexbuf }
268 | "/" | "%" { NUM_DIV_OP }
269 | "<<" | ">>" { NUM_BIT_SHIFT }
271 | "&" { NUM_BIT_AND }
272 | ">" | ">=" | "<=" | "<" { NUM_CMP_OP }
273 | "<>" | "!=" | "==" { NUM_EQ_OP }
274 | "<=>" { NOT_DISTINCT_OP }
276 | "?" { PARAM { label=None; pos = pos lexbuf } }
277 | [':' '@'] (ident as str) { PARAM { label = Some str; pos = pos lexbuf } }
279 | '"' { ident (ruleInQuotes "" lexbuf) }
280 | "'" { TEXT (ruleInSingleQuotes "" lexbuf) }
281 (* http://www.postgresql.org/docs/current/interactive/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING *)
282 | "$" (ident? as tag) "$" { TEXT (ruleInDollarQuotes tag "" lexbuf) }
283 | "`" { ident (ruleInBackQuotes "" lexbuf) }
284 | "[" { ident (ruleInBrackets "" lexbuf) }
285 | ['x' 'X'] "'" { BLOB (ruleInSingleQuotes "" lexbuf) }
287 | ident as str { if !Parser_state.mode = Ident then IDENT str (* no keywords, preserve case *) else get_ident str }
288 | digit+ as str { INTEGER (int_of_string str) }
289 | digit+ '.' digit+ as str { FLOAT (float_of_string str) }
291 | _ { error lexbuf "ruleMain" }
293 (* FIXME factor out all that ruleIn* rules *)
294 ruleInQuotes acc = parse
296 | eof { error lexbuf "no terminating quote" }
297 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating quote" }
298 | "\"\"" { ruleInQuotes (acc^"\"") lexbuf }
299 | [^'"' '\n']+ as s { ruleInQuotes (acc^s) lexbuf }
300 | _ { error lexbuf "ruleInQuotes" }
302 ruleInBrackets acc = parse
304 | eof { error lexbuf "no terminating bracket" }
305 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating bracket" }
306 (* | "\"\"" { ruleInQuotes (acc ^ "\"") lexbuf } *)
307 | [^']' '\n']+ { ruleInBrackets (acc ^ lexeme lexbuf) lexbuf }
308 | _ { error lexbuf "ruleInBrackets" }
310 ruleInSingleQuotes acc = parse
312 | eof { error lexbuf "no terminating single quote" }
313 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating single quote" }
314 | "''" { ruleInSingleQuotes (acc ^ "'") lexbuf }
315 | [^'\'' '\n']+ { ruleInSingleQuotes (acc ^ lexeme lexbuf) lexbuf }
316 | _ { error lexbuf "ruleInSingleQuotes" }
318 ruleInBackQuotes acc = parse
320 | eof { error lexbuf "no terminating back quote" }
321 | '\n' { advance_line lexbuf; error lexbuf "EOL before terminating back quote" }
322 | "``" { ruleInBackQuotes (acc ^ "`") lexbuf }
323 | [^'`' '\n']+ { ruleInBackQuotes (acc ^ lexeme lexbuf) lexbuf }
324 | _ { error lexbuf "ruleInBackQuotes" }
326 ruleInDollarQuotes tag acc = parse
327 | "$" (ident? as tag_) "$" { if tag_ = tag then acc else ruleInDollarQuotes tag (acc ^ sprintf "$%s$" tag_) lexbuf }
328 | eof { error lexbuf "no terminating dollar quote" }
329 | '\n' { advance_line lexbuf; ruleInDollarQuotes tag (acc ^ "\n") lexbuf }
330 (* match one char at a time to make sure delimiter matches longer *)
331 | [^'\n'] { ruleInDollarQuotes tag (acc ^ lexeme lexbuf) lexbuf }
332 | _ { error lexbuf "ruleInDollarQuotes" }
334 ruleComment acc = parse
335 | '\n' { advance_line lexbuf; acc }
337 | [^'\n']+ { let s = lexeme lexbuf in ruleComment (acc ^ s) lexbuf; }
338 | _ { error lexbuf "ruleComment"; }
340 ruleCommentMulti acc = parse
341 | '\n' { advance_line lexbuf; ruleCommentMulti (acc ^ "\n") lexbuf }
344 | [^'\n' '*']+ { let s = lexeme lexbuf in ruleCommentMulti (acc ^ s) lexbuf }
345 | _ { error lexbuf "ruleCommentMulti" }
349 let parse_rule lexbuf =
350 let token = ruleMain lexbuf in
351 match !Parser_state.mode with
354 (* eprintf "ignored: %s\n" (lexeme lexbuf); *)
355 if token = EOF then token else IGNORED