Robustness
[llpp.git] / parser.ml
blob8053719afebc41eb15ab94940db0654244046964
1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
3 let dolog fmt = Printf.kprintf prerr_endline fmt;;
4 let r_comment_terminator = Str.regexp "-->";;
5 let r_CDATA_terminator = Str.regexp "\\]\\]>";;
6 let r_q_terminator = Str.regexp "\\?>";;
8 let iswhite = function
9 | '\r' | '\n' | '\t' | ' ' -> true
10 | _ -> false
13 let isname = function
14 | '.' | '-' | '_' | ':' -> true
15 | c -> (c >= '0' && c <= '9')
16 || (c >= 'a' && c <= 'z')
17 || (c >= 'A' && c <= 'Z')
20 exception Parse_error of string * string * int
22 let error msg s pos =
23 raise (Parse_error (msg, s, pos))
26 let enent s pos len =
27 let b = Buffer.create len in
28 let rec loop i =
29 if i - pos = len
30 then Buffer.contents b
31 else (
32 begin match s.[i] with
33 | '<' -> Buffer.add_string b "&lt;"
34 | '>' -> Buffer.add_string b "&gt;"
35 | '\'' -> Buffer.add_string b "&apos;"
36 | '"' -> Buffer.add_string b "&quot;"
37 | '&' -> Buffer.add_string b "&amp;"
38 | c ->
39 let code = Char.code c in
40 if code < 32 || code > 127
41 then (
42 Buffer.add_string b "&#";
43 Buffer.add_string b (string_of_int code);
44 Buffer.add_char b ';';
46 else Buffer.add_char b c
47 end;
48 loop (i+1)
51 loop pos
54 let unent b s pos len =
55 let rec loop i =
56 if i = pos + len
57 then ()
58 else
59 let amppos =
60 try
61 String.index_from s i '&'
62 with Not_found -> -1
64 if amppos = -1 || amppos >= pos + len
65 then (
66 Buffer.add_substring b s i (pos + len - i)
68 else (
69 Buffer.add_substring b s i (amppos - i);
70 if amppos = i + len then failwith "lonely amp";
72 let semipos =
73 try
74 let semipos = String.index_from s (amppos+1) ';' in
75 if semipos >= pos + len then raise Not_found;
76 semipos
77 with Not_found -> failwith "amp not followed by semi colon"
80 let subslen = semipos-amppos-1 in
81 if subslen = 0 then failwith "empty amp";
83 let subs = String.sub s (amppos+1) subslen in
85 if subs.[0] = '#'
86 then (
87 if subslen = 1 then failwith "empty amp followed by hash";
88 let code =
89 if subs.[1] = 'x'
90 then (
91 subs.[0] <- '0';
92 int_of_string subs
94 else (
95 int_of_string (String.sub subs 1 (subslen-1))
98 let c = Char.unsafe_chr code in
99 Buffer.add_char b c
101 else (
102 match subs with
103 | "lt" -> Buffer.add_char b '<'
104 | "gt" -> Buffer.add_char b '>'
105 | "amp" -> Buffer.add_char b '&'
106 | "apos" -> Buffer.add_char b '\''
107 | "quot" -> Buffer.add_char b '"'
108 | _ -> failwith ("unknown amp " ^ String.escaped subs)
110 loop (semipos+1)
113 loop pos
116 let subs s pos =
117 let len = String.length s in
118 let left = len - pos in
119 if left < 0
120 then
121 Printf.sprintf "(pos=%d len=%d left=%d)"
122 pos len left
123 else
124 let len = min left 10 in
125 let s = String.sub s pos len in
129 let ts = function
130 | `text -> "text"
131 | `lt -> "lt"
132 | `close -> "close"
133 | `exclam -> "exclam"
134 | `question -> "question"
135 | `doctype -> "doctype"
136 | `comment -> "comment"
137 | `tag -> "tag"
140 type attr = string * string
141 and attrs = attr list
142 and vp =
143 | Vdata
144 | Vcdata
145 | Vopen of string * attrs * bool
146 | Vclose of string
147 | Vend
148 and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
151 let parse v s =
152 let slen = String.length s in
154 let find_substr pos subs r =
155 let pos =
157 Str.search_forward r s pos
158 with Not_found ->
159 error ("couldn't find substring " ^ subs) s pos
164 let begins_with pos prefix =
165 let prefixlen = String.length prefix in
166 if String.length s - pos >= prefixlen
167 then
168 let rec cmp i =
169 i = prefixlen || (s.[pos+i] = prefix.[i]) && cmp (i+1)
171 cmp 0
172 else
173 false
176 let find_non_white pos =
177 let rec forward i =
178 if i >= slen then error "couldn't find non white space character" s pos;
179 if iswhite s.[i] then forward (i+1) else i in
180 forward pos
183 let getname pos =
184 let non_name_pos =
185 let rec find_non_name i =
186 if i >= slen then error "couldn't find non name character" s pos;
187 if isname s.[i] then find_non_name (i+1) else i
189 find_non_name pos
191 non_name_pos, String.sub s pos (non_name_pos - pos)
194 let rec collect v pos t =
195 if pos >= slen && t != `text
196 then error ("not enough data for " ^ ts t) s pos;
198 match t with
199 | `text ->
200 let ltpos =
202 String.index_from s pos '<'
203 with Not_found ->
204 let rec trailsbywhite i =
205 if pos+i = String.length s
206 then -1
207 else (
208 if not (iswhite s.[pos+i])
209 then error "garbage at the end" s pos
210 else trailsbywhite (i+1)
213 trailsbywhite 0
215 if ltpos = -1
216 then
217 v.f v Vend pos slen, slen
218 else
219 let start_of_text_pos = find_non_white pos in
220 let end_of_text_pos =
221 if start_of_text_pos < ltpos
222 then
223 let rec find i =
224 if i = start_of_text_pos || not (iswhite s.[i])
225 then i+1
226 else find (i-1)
228 find (ltpos-1)
229 else start_of_text_pos
231 let v =
232 if start_of_text_pos != end_of_text_pos
233 then v.f v Vdata start_of_text_pos end_of_text_pos
234 else v
236 collect v (ltpos+1) `lt
238 | `lt ->
239 let pos, t =
240 match s.[pos] with
241 | '/' -> (pos+1), `close
242 | '!' -> (pos+1), `exclam
243 | '?' -> (pos+1), `question
244 | c when isname c -> pos, `tag
245 | _ -> error "invalid data after <" s pos
247 collect v pos t
249 | `close ->
250 let tag_name_pos = find_non_white pos in
251 let tag_name_end_pos, close_tag_name = getname tag_name_pos in
252 let close_tag_pos = find_non_white tag_name_end_pos in
253 if s.[close_tag_pos] != '>'
254 then error "missing >" s pos;
255 let pos' = close_tag_pos + 1 in
256 let v = v.f v (Vclose close_tag_name) pos pos' in
257 collect v pos' `text
259 | `doctype ->
260 let close_tag_pos =
262 String.index_from s pos '>'
263 with Not_found ->
264 error "doctype is not terminated" s pos
266 collect v (close_tag_pos+1) `text
268 | `comment ->
269 let pos =
271 find_substr pos "-->" r_comment_terminator
272 with Not_found ->
273 error "comment is not terminated" s pos
275 collect v (pos+3) `text
277 | `exclam ->
278 if begins_with pos "[CDATA["
279 then
280 let cdata_start = pos+7 in
281 let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
282 let v = v.f v Vcdata cdata_start cdata_end in
283 collect v (cdata_end+3) `text
284 else (
285 if begins_with pos "DOCTYPE"
286 then
287 collect v (pos+7) `doctype
288 else (
289 if begins_with pos "--"
290 then collect v (pos+2) `comment
291 else error "unknown shit after exclamation mark" s pos
295 | `question ->
296 let pos = find_substr pos "?>" r_q_terminator in
297 collect v (pos+2) `text
299 | `tag ->
300 let pos', name = getname pos in
301 let attrs, pos', closed = collect_attributes pos' in
302 let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
303 collect v pos' `text
305 and collect_attributes pos =
306 let rec f accu pos =
307 let nameval pos =
308 let pos, name = getname pos in
309 let pos = find_non_white pos in
310 if s.[pos] = '='
311 then
312 let qpos = pos+1 in
313 if qpos = slen
314 then error "not enough data for attribute" s pos;
316 let qc = s.[qpos] in
317 if not (qc = '\'' || qc = '"')
318 then error "assignment is not followed by a quote" s pos;
320 let closing_q_pos =
321 let rec find i =
322 if i = slen
323 then error "not enough data for attribute value" s pos;
325 if s.[i] = qc then i else find (i+1)
327 find (qpos+1)
330 let vallen = closing_q_pos - (qpos+1) in
331 let val' = String.sub s (qpos+1) vallen in
332 (name, val'), closing_q_pos+1
334 else error "attribute name not followed by '='" s pos
337 let pos = find_non_white pos in
338 if s.[pos] = '>'
339 then
340 accu, pos+1, false
341 else (
342 if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
343 then
344 accu, pos+2, true
345 else (
346 if isname s.[pos]
347 then (
348 let nameval, pos = nameval pos in
349 let accu = nameval :: accu in
350 f accu pos
352 else error "malformed attribute list" s pos;
356 f [] pos
358 let _, _ = collect v 0 `text in
359 v.accu;