Consistency
[llpp.git] / parser.ml
blob903a9d8ef10be3d58bcc285ab0107fcdc87e55fc
1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
3 let r_comment_terminator = Str.regexp "-->";;
4 let r_CDATA_terminator = Str.regexp "\\]\\]>";;
5 let r_q_terminator = Str.regexp "\\?>";;
7 let iswhite = function
8 | '\r' | '\n' | '\t' | ' ' -> true
9 | _ -> false
12 let isname = function
13 | '.' | '-' | '_' | ':' -> true
14 | c -> (c >= '0' && c <= '9')
15 || (c >= 'a' && c <= 'z')
16 || (c >= 'A' && c <= 'Z')
19 exception Parse_error of string * string * int;;
21 let parse_error msg s pos =
22 raise (Parse_error (msg, s, pos))
25 let enent s pos len =
26 let b = Buffer.create len in
27 let rec loop i =
28 if i - pos = len
29 then Buffer.contents b
30 else (
31 begin match s.[i] with
32 | '<' -> Buffer.add_string b "&lt;"
33 | '>' -> Buffer.add_string b "&gt;"
34 | '\'' -> Buffer.add_string b "&apos;"
35 | '\"' -> Buffer.add_string b "&quot;"
36 | '&' -> Buffer.add_string b "&amp;"
37 | c ->
38 let code = Char.code c in
39 if code < 32 || code > 127
40 then (
41 Buffer.add_string b "&#";
42 Buffer.add_string b (string_of_int code);
43 Buffer.add_char b ';';
45 else Buffer.add_char b c
46 end;
47 loop (i+1)
50 loop pos
53 let unent b s pos len =
54 let rec loop i =
55 if i != pos + len
56 then
57 let amppos =
58 try
59 String.index_from s i '&'
60 with Not_found -> -1
62 if amppos = -1 || amppos >= pos + len
63 then (
64 Buffer.add_substring b s i (pos + len - i)
66 else (
67 Buffer.add_substring b s i (amppos - i);
68 if amppos = i + len then Utils.error "lonely amp";
70 let semipos =
71 try
72 let semipos = String.index_from s (amppos+1) ';' in
73 if semipos >= pos + len then raise Not_found;
74 semipos
75 with Not_found ->
76 Utils.error "amp not followed by semicolon at %d" amppos
79 let subslen = semipos-amppos-1 in
80 if subslen = 0 then Utils.error "empty amp at %d" amppos;
82 let subs = String.sub s (amppos+1) subslen in
84 if subs.[0] = '#'
85 then (
86 if subslen = 1
87 then Utils.error "empty amp followed by hash at %d" amppos;
88 let code =
89 if subs.[1] = 'x'
90 then Scanf.sscanf subs "#x%x" (fun n -> n)
91 else int_of_string (String.sub subs 1 (subslen-1))
93 let c = Char.unsafe_chr code in
94 Buffer.add_char b c
96 else (
97 match subs with
98 | "lt" -> Buffer.add_char b '<'
99 | "gt" -> Buffer.add_char b '>'
100 | "amp" -> Buffer.add_char b '&'
101 | "apos" -> Buffer.add_char b '\''
102 | "quot" -> Buffer.add_char b '\"'
103 | _ -> Utils.error "unknown amp %S" subs
105 loop (semipos+1)
108 loop pos
111 let subs s pos =
112 let len = String.length s in
113 let left = len - pos in
114 if left < 0
115 then
116 Printf.sprintf "(pos=%d len=%d left=%d)"
117 pos len left
118 else
119 let len = min left 10 in
120 let s = String.sub s pos len in
124 let ts = function
125 | `text -> "text"
126 | `lt -> "lt"
127 | `close -> "close"
128 | `exclam -> "exclam"
129 | `question -> "question"
130 | `doctype -> "doctype"
131 | `comment -> "comment"
132 | `tag -> "tag"
135 type attr = string * string
136 and attrs = attr list
137 and vp =
138 | Vdata
139 | Vcdata
140 | Vopen of string * attrs * bool
141 | Vclose of string
142 | Vend
143 and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
146 let parse v s =
147 let slen = String.length s in
149 let find_substr pos subs r =
150 let pos =
152 Str.search_forward r s pos
153 with Not_found ->
154 parse_error ("cannot find substring " ^ subs) s pos
158 let begins_with pos prefix = Utils.substratis s pos prefix in
159 let find_non_white pos =
160 let rec forward i =
161 if i >= slen
162 then parse_error "cannot find non white space character" s pos;
163 if iswhite s.[i] then forward (i+1) else i in
164 forward pos
167 let getname pos =
168 let non_name_pos =
169 let rec find_non_name i =
170 if i >= slen then parse_error "cannot find non name character" s pos;
171 if isname s.[i] then find_non_name (i+1) else i
173 find_non_name pos
175 non_name_pos, String.sub s pos (non_name_pos - pos)
178 let rec collect v pos t =
179 if pos >= slen && t != `text
180 then parse_error ("not enough data for " ^ ts t) s pos;
182 match t with
183 | `text ->
184 let ltpos =
186 String.index_from s pos '<'
187 with Not_found ->
188 let rec trailsbywhite i =
189 if pos+i = String.length s
190 then -1
191 else (
192 if not (iswhite s.[pos+i])
193 then parse_error "garbage at the end" s pos
194 else trailsbywhite (i+1)
197 trailsbywhite 0
199 if ltpos = -1
200 then
201 v.f v Vend pos slen, slen
202 else
203 let start_of_text_pos = find_non_white pos in
204 let end_of_text_pos =
205 if start_of_text_pos < ltpos
206 then
207 let rec find i =
208 if i = start_of_text_pos || not (iswhite s.[i])
209 then i+1
210 else find (i-1)
212 find (ltpos-1)
213 else start_of_text_pos
215 let v =
216 if start_of_text_pos != end_of_text_pos
217 then v.f v Vdata start_of_text_pos end_of_text_pos
218 else v
220 collect v (ltpos+1) `lt
222 | `lt ->
223 let pos, t =
224 match s.[pos] with
225 | '/' -> (pos+1), `close
226 | '!' -> (pos+1), `exclam
227 | '?' -> (pos+1), `question
228 | c when isname c -> pos, `tag
229 | _ -> parse_error "invalid data after <" s pos
231 collect v pos t
233 | `close ->
234 let tag_name_pos = find_non_white pos in
235 let tag_name_end_pos, close_tag_name = getname tag_name_pos in
236 let close_tag_pos = find_non_white tag_name_end_pos in
237 if s.[close_tag_pos] != '>'
238 then parse_error "missing >" s pos;
239 let pos' = close_tag_pos + 1 in
240 let v = v.f v (Vclose close_tag_name) pos pos' in
241 collect v pos' `text
243 | `doctype ->
244 let close_tag_pos =
246 String.index_from s pos '>'
247 with Not_found ->
248 parse_error "doctype is not terminated" s pos
250 collect v (close_tag_pos+1) `text
252 | `comment ->
253 let pos =
255 find_substr pos "-->" r_comment_terminator
256 with Not_found ->
257 parse_error "comment is not terminated" s pos
259 collect v (pos+3) `text
261 | `exclam ->
262 if begins_with pos "[CDATA["
263 then
264 let cdata_start = pos+7 in
265 let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
266 let v = v.f v Vcdata cdata_start cdata_end in
267 collect v (cdata_end+3) `text
268 else (
269 if begins_with pos "DOCTYPE"
270 then
271 collect v (pos+7) `doctype
272 else (
273 if begins_with pos "--"
274 then collect v (pos+2) `comment
275 else parse_error "unknown shit after exclamation mark" s pos
279 | `question ->
280 let pos = find_substr pos "?>" r_q_terminator in
281 collect v (pos+2) `text
283 | `tag ->
284 let pos', name = getname pos in
285 let attrs, pos', closed = collect_attributes pos' in
286 let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
287 collect v pos' `text
289 and collect_attributes pos =
290 let rec f accu pos =
291 let nameval pos =
292 let pos, name = getname pos in
293 let pos = find_non_white pos in
294 if s.[pos] = '='
295 then
296 let qpos = pos+1 in
297 if qpos = slen
298 then parse_error "not enough data for attribute" s pos;
300 let qc = s.[qpos] in
301 if not (qc = '\'' || qc = '\"')
302 then parse_error "assignment is not followed by a quote" s pos;
304 let closing_q_pos =
305 let rec find i =
306 if i = slen
307 then parse_error "not enough data for attribute value" s pos;
309 if s.[i] = qc then i else find (i+1)
311 find (qpos+1)
314 let vallen = closing_q_pos - (qpos+1) in
315 let val' = String.sub s (qpos+1) vallen in
316 (name, val'), closing_q_pos+1
318 else parse_error "attribute name not followed by '='" s pos
321 let pos = find_non_white pos in
322 if s.[pos] = '>'
323 then
324 accu, pos+1, false
325 else (
326 if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
327 then
328 accu, pos+2, true
329 else (
330 if isname s.[pos]
331 then (
332 let nameval, pos = nameval pos in
333 let accu = nameval :: accu in
334 f accu pos
336 else parse_error "malformed attribute list" s pos;
340 f [] pos
342 let _, _ = collect v 0 `text in
343 v.accu;