Unbreak unoconv
[llpp.git] / parser.ml
blobc6831525062bc0fe3471ac00b57a958b08d8940a
1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
3 let r_comment_terminator = Str.regexp "-->";;
4 let r_CDATA_terminator = Str.regexp "\\]\\]>";;
5 let r_q_terminator = Str.regexp "\\?>";;
7 let iswhite = function
8 | '\r' | '\n' | '\t' | ' ' -> true
9 | _ -> false
12 let isname = function
13 | '.' | '-' | '_' | ':' -> true
14 | c -> (c >= '0' && c <= '9')
15 || (c >= 'a' && c <= 'z')
16 || (c >= 'A' && c <= 'Z')
19 exception Parse_error of string * string * int;;
21 let parse_error msg s pos =
22 raise (Parse_error (msg, s, pos))
25 let enent s pos len =
26 let b = Buffer.create len in
27 let rec loop i =
28 if i - pos = len
29 then Buffer.contents b
30 else (
31 begin match s.[i] with
32 | '<' -> Buffer.add_string b "&lt;"
33 | '>' -> Buffer.add_string b "&gt;"
34 | '\'' -> Buffer.add_string b "&apos;"
35 | '\"' -> Buffer.add_string b "&quot;"
36 | '&' -> Buffer.add_string b "&amp;"
37 | c ->
38 let code = Char.code c in
39 if code < 32 || code > 127
40 then (
41 Buffer.add_string b "&#";
42 Buffer.add_string b (string_of_int code);
43 Buffer.add_char b ';';
45 else Buffer.add_char b c
46 end;
47 loop (i+1)
50 loop pos
53 let unent b s pos len =
54 let rec loop i =
55 if i = pos + len
56 then ()
57 else
58 let amppos =
59 try
60 String.index_from s i '&'
61 with Not_found -> -1
63 if amppos = -1 || amppos >= pos + len
64 then (
65 Buffer.add_substring b s i (pos + len - i)
67 else (
68 Buffer.add_substring b s i (amppos - i);
69 if amppos = i + len then failwith "lonely amp";
71 let semipos =
72 try
73 let semipos = String.index_from s (amppos+1) ';' in
74 if semipos >= pos + len then raise Not_found;
75 semipos
76 with Not_found -> failwith "amp not followed by semicolon"
79 let subslen = semipos-amppos-1 in
80 if subslen = 0 then failwith "empty amp";
82 let subs = String.sub s (amppos+1) subslen in
84 if subs.[0] = '#'
85 then (
86 if subslen = 1 then failwith "empty amp followed by hash";
87 let code =
88 if subs.[1] = 'x'
89 then (
90 Scanf.sscanf subs "#x%x" (fun n -> n)
92 else (
93 int_of_string (String.sub subs 1 (subslen-1))
96 let c = Char.unsafe_chr code in
97 Buffer.add_char b c
99 else (
100 match subs with
101 | "lt" -> Buffer.add_char b '<'
102 | "gt" -> Buffer.add_char b '>'
103 | "amp" -> Buffer.add_char b '&'
104 | "apos" -> Buffer.add_char b '\''
105 | "quot" -> Buffer.add_char b '\"'
106 | _ -> failwith ("unknown amp " ^ String.escaped subs)
108 loop (semipos+1)
111 loop pos
114 let subs s pos =
115 let len = String.length s in
116 let left = len - pos in
117 if left < 0
118 then
119 Printf.sprintf "(pos=%d len=%d left=%d)"
120 pos len left
121 else
122 let len = min left 10 in
123 let s = String.sub s pos len in
127 let ts = function
128 | `text -> "text"
129 | `lt -> "lt"
130 | `close -> "close"
131 | `exclam -> "exclam"
132 | `question -> "question"
133 | `doctype -> "doctype"
134 | `comment -> "comment"
135 | `tag -> "tag"
138 type attr = string * string
139 and attrs = attr list
140 and vp =
141 | Vdata
142 | Vcdata
143 | Vopen of string * attrs * bool
144 | Vclose of string
145 | Vend
146 and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
149 let parse v s =
150 let slen = String.length s in
152 let find_substr pos subs r =
153 let pos =
155 Str.search_forward r s pos
156 with Not_found ->
157 parse_error ("cannot find substring " ^ subs) s pos
162 let begins_with pos prefix =
163 let prefixlen = String.length prefix in
164 if String.length s - pos >= prefixlen
165 then
166 let rec cmp i =
167 i = prefixlen || (s.[pos+i] = prefix.[i]) && cmp (i+1)
169 cmp 0
170 else
171 false
174 let find_non_white pos =
175 let rec forward i =
176 if i >= slen
177 then parse_error "cannot find non white space character" s pos;
178 if iswhite s.[i] then forward (i+1) else i in
179 forward pos
182 let getname pos =
183 let non_name_pos =
184 let rec find_non_name i =
185 if i >= slen then parse_error "cannot find non name character" s pos;
186 if isname s.[i] then find_non_name (i+1) else i
188 find_non_name pos
190 non_name_pos, String.sub s pos (non_name_pos - pos)
193 let rec collect v pos t =
194 if pos >= slen && t != `text
195 then parse_error ("not enough data for " ^ ts t) s pos;
197 match t with
198 | `text ->
199 let ltpos =
201 String.index_from s pos '<'
202 with Not_found ->
203 let rec trailsbywhite i =
204 if pos+i = String.length s
205 then -1
206 else (
207 if not (iswhite s.[pos+i])
208 then parse_error "garbage at the end" s pos
209 else trailsbywhite (i+1)
212 trailsbywhite 0
214 if ltpos = -1
215 then
216 v.f v Vend pos slen, slen
217 else
218 let start_of_text_pos = find_non_white pos in
219 let end_of_text_pos =
220 if start_of_text_pos < ltpos
221 then
222 let rec find i =
223 if i = start_of_text_pos || not (iswhite s.[i])
224 then i+1
225 else find (i-1)
227 find (ltpos-1)
228 else start_of_text_pos
230 let v =
231 if start_of_text_pos != end_of_text_pos
232 then v.f v Vdata start_of_text_pos end_of_text_pos
233 else v
235 collect v (ltpos+1) `lt
237 | `lt ->
238 let pos, t =
239 match s.[pos] with
240 | '/' -> (pos+1), `close
241 | '!' -> (pos+1), `exclam
242 | '?' -> (pos+1), `question
243 | c when isname c -> pos, `tag
244 | _ -> parse_error "invalid data after <" s pos
246 collect v pos t
248 | `close ->
249 let tag_name_pos = find_non_white pos in
250 let tag_name_end_pos, close_tag_name = getname tag_name_pos in
251 let close_tag_pos = find_non_white tag_name_end_pos in
252 if s.[close_tag_pos] != '>'
253 then parse_error "missing >" s pos;
254 let pos' = close_tag_pos + 1 in
255 let v = v.f v (Vclose close_tag_name) pos pos' in
256 collect v pos' `text
258 | `doctype ->
259 let close_tag_pos =
261 String.index_from s pos '>'
262 with Not_found ->
263 parse_error "doctype is not terminated" s pos
265 collect v (close_tag_pos+1) `text
267 | `comment ->
268 let pos =
270 find_substr pos "-->" r_comment_terminator
271 with Not_found ->
272 parse_error "comment is not terminated" s pos
274 collect v (pos+3) `text
276 | `exclam ->
277 if begins_with pos "[CDATA["
278 then
279 let cdata_start = pos+7 in
280 let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
281 let v = v.f v Vcdata cdata_start cdata_end in
282 collect v (cdata_end+3) `text
283 else (
284 if begins_with pos "DOCTYPE"
285 then
286 collect v (pos+7) `doctype
287 else (
288 if begins_with pos "--"
289 then collect v (pos+2) `comment
290 else parse_error "unknown shit after exclamation mark" s pos
294 | `question ->
295 let pos = find_substr pos "?>" r_q_terminator in
296 collect v (pos+2) `text
298 | `tag ->
299 let pos', name = getname pos in
300 let attrs, pos', closed = collect_attributes pos' in
301 let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
302 collect v pos' `text
304 and collect_attributes pos =
305 let rec f accu pos =
306 let nameval pos =
307 let pos, name = getname pos in
308 let pos = find_non_white pos in
309 if s.[pos] = '='
310 then
311 let qpos = pos+1 in
312 if qpos = slen
313 then parse_error "not enough data for attribute" s pos;
315 let qc = s.[qpos] in
316 if not (qc = '\'' || qc = '\"')
317 then parse_error "assignment is not followed by a quote" s pos;
319 let closing_q_pos =
320 let rec find i =
321 if i = slen
322 then parse_error "not enough data for attribute value" s pos;
324 if s.[i] = qc then i else find (i+1)
326 find (qpos+1)
329 let vallen = closing_q_pos - (qpos+1) in
330 let val' = String.sub s (qpos+1) vallen in
331 (name, val'), closing_q_pos+1
333 else parse_error "attribute name not followed by '='" s pos
336 let pos = find_non_white pos in
337 if s.[pos] = '>'
338 then
339 accu, pos+1, false
340 else (
341 if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
342 then
343 accu, pos+2, true
344 else (
345 if isname s.[pos]
346 then (
347 let nameval, pos = nameval pos in
348 let accu = nameval :: accu in
349 f accu pos
351 else parse_error "malformed attribute list" s pos;
355 f [] pos
357 let _, _ = collect v 0 `text in
358 v.accu;