Move timing out of build.bash
[llpp.git] / parser.ml
blob87fbbf40254e64b86253664426e2c6920bee6e8a
1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
3 let iswhite = function
4 | '\r' | '\n' | '\t' | ' ' -> true
5 | _ -> false
7 let isname = function
8 | '.' | '-' | '_' | ':' -> true
9 | c -> (c >= '0' && c <= '9')
10 || (c >= 'a' && c <= 'z')
11 || (c >= 'A' && c <= 'Z')
13 exception Parse_error of string * string * int
15 let parse_error msg s pos = raise (Parse_error (msg, s, pos))
17 let enent s pos len =
18 let b = Buffer.create len in
19 let rec loop i =
20 if i - pos = len
21 then Buffer.contents b
22 else (
23 begin match s.[i] with
24 | '<' -> Buffer.add_string b "&lt;"
25 | '>' -> Buffer.add_string b "&gt;"
26 | '\'' -> Buffer.add_string b "&apos;"
27 | '\"' -> Buffer.add_string b "&quot;"
28 | '&' -> Buffer.add_string b "&amp;"
29 | c ->
30 let code = Char.code c in
31 if code < 32 || code > 127
32 then (
33 Buffer.add_string b "&#";
34 Buffer.add_string b (string_of_int code);
35 Buffer.add_char b ';';
37 else Buffer.add_char b c
38 end;
39 loop (i+1)
42 loop pos
44 let unent b s pos len =
45 let rec loop i =
46 if i != pos + len
47 then
48 let amppos =
49 try
50 String.index_from s i '&'
51 with Not_found -> -1
53 if amppos = -1 || amppos >= pos + len
54 then Buffer.add_substring b s i (pos + len - i)
55 else (
56 Buffer.add_substring b s i (amppos - i);
57 if amppos = i + len then Utils.error "lonely amp";
59 let semipos =
60 try
61 let semipos = String.index_from s (amppos+1) ';' in
62 if semipos >= pos + len then raise Not_found;
63 semipos
64 with Not_found ->
65 Utils.error "amp not followed by semicolon at %d" amppos
68 let subslen = semipos-amppos-1 in
69 if subslen = 0 then Utils.error "empty amp at %d" amppos;
71 let subs = String.sub s (amppos+1) subslen in
73 if subs.[0] = '#'
74 then (
75 if subslen = 1
76 then Utils.error "empty amp followed by hash at %d" amppos;
77 let code =
78 if subs.[1] = 'x'
79 then Scanf.sscanf subs "#x%x" (fun n -> n)
80 else int_of_string (String.sub subs 1 (subslen-1))
82 let c = Char.unsafe_chr code in
83 Buffer.add_char b c
85 else (
86 match subs with
87 | "lt" -> Buffer.add_char b '<'
88 | "gt" -> Buffer.add_char b '>'
89 | "amp" -> Buffer.add_char b '&'
90 | "apos" -> Buffer.add_char b '\''
91 | "quot" -> Buffer.add_char b '\"'
92 | _ -> Utils.error "unknown amp %S" subs
94 loop (semipos+1)
97 loop pos
99 let subs s pos =
100 let len = String.length s in
101 let left = len - pos in
102 if left < 0
103 then Printf.sprintf "(pos=%d len=%d left=%d)" pos len left
104 else
105 let len = min left 10 in
106 String.sub s pos len
108 let ts = function
109 | `text -> "text"
110 | `lt -> "lt"
111 | `close -> "close"
112 | `exclam -> "exclam"
113 | `question -> "question"
114 | `doctype -> "doctype"
115 | `comment -> "comment"
116 | `tag -> "tag"
118 type attr = string * string
119 and attrs = attr list
120 and vp =
121 | Vdata
122 | Vcdata
123 | Vopen of string * attrs * bool
124 | Vclose of string
125 | Vend
126 and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
128 let parse v s =
129 let r_comment_terminator = Str.regexp "-->"
130 and r_CDATA_terminator = Str.regexp "\\]\\]>"
131 and r_q_terminator = Str.regexp "\\?>" in
133 let slen = String.length s in
135 let find_substr pos subs r =
136 let pos =
138 Str.search_forward r s pos
139 with Not_found ->
140 parse_error ("cannot find substring " ^ subs) s pos
144 let begins_with pos prefix = Utils.substratis s pos prefix in
145 let find_non_white pos =
146 let rec forward i =
147 if i >= slen
148 then parse_error "cannot find non white space character" s pos;
149 if iswhite s.[i] then forward (i+1) else i in
150 forward pos
153 let getname pos =
154 let non_name_pos =
155 let rec find_non_name i =
156 if i >= slen then parse_error "cannot find non name character" s pos;
157 if isname s.[i] then find_non_name (i+1) else i
159 find_non_name pos
161 non_name_pos, String.sub s pos (non_name_pos - pos)
164 let rec collect v pos t =
165 if pos >= slen && t != `text
166 then parse_error ("not enough data for " ^ ts t) s pos;
168 match t with
169 | `text ->
170 let ltpos =
172 String.index_from s pos '<'
173 with Not_found ->
174 let rec trailsbywhite i =
175 if pos+i = String.length s
176 then -1
177 else (
178 if not (iswhite s.[pos+i])
179 then parse_error "garbage at the end" s pos
180 else trailsbywhite (i+1)
183 trailsbywhite 0
185 if ltpos = -1
186 then v.f v Vend pos slen, slen
187 else
188 let start_of_text_pos = find_non_white pos in
189 let end_of_text_pos =
190 if start_of_text_pos < ltpos
191 then
192 let rec find i =
193 if i = start_of_text_pos || not (iswhite s.[i])
194 then i+1
195 else find (i-1)
197 find (ltpos-1)
198 else start_of_text_pos
200 let v =
201 if start_of_text_pos != end_of_text_pos
202 then v.f v Vdata start_of_text_pos end_of_text_pos
203 else v
205 collect v (ltpos+1) `lt
207 | `lt ->
208 let pos, t =
209 match s.[pos] with
210 | '/' -> (pos+1), `close
211 | '!' -> (pos+1), `exclam
212 | '?' -> (pos+1), `question
213 | c when isname c -> pos, `tag
214 | _ -> parse_error "invalid data after <" s pos
216 collect v pos t
218 | `close ->
219 let tag_name_pos = find_non_white pos in
220 let tag_name_end_pos, close_tag_name = getname tag_name_pos in
221 let close_tag_pos = find_non_white tag_name_end_pos in
222 if s.[close_tag_pos] != '>'
223 then parse_error "missing >" s pos;
224 let pos' = close_tag_pos + 1 in
225 let v = v.f v (Vclose close_tag_name) pos pos' in
226 collect v pos' `text
228 | `doctype ->
229 let close_tag_pos =
231 String.index_from s pos '>'
232 with Not_found ->
233 parse_error "doctype is not terminated" s pos
235 collect v (close_tag_pos+1) `text
237 | `comment ->
238 let pos =
240 find_substr pos "-->" r_comment_terminator
241 with Not_found ->
242 parse_error "comment is not terminated" s pos
244 collect v (pos+3) `text
246 | `exclam ->
247 if begins_with pos "[CDATA["
248 then
249 let cdata_start = pos+7 in
250 let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
251 let v = v.f v Vcdata cdata_start cdata_end in
252 collect v (cdata_end+3) `text
253 else (
254 if begins_with pos "DOCTYPE"
255 then
256 collect v (pos+7) `doctype
257 else (
258 if begins_with pos "--"
259 then collect v (pos+2) `comment
260 else parse_error "unknown shit after exclamation mark" s pos
264 | `question ->
265 let pos = find_substr pos "?>" r_q_terminator in
266 collect v (pos+2) `text
268 | `tag ->
269 let pos', name = getname pos in
270 let attrs, pos', closed = collect_attributes pos' in
271 let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
272 collect v pos' `text
274 and collect_attributes pos =
275 let rec f accu pos =
276 let nameval pos =
277 let pos, name = getname pos in
278 let pos = find_non_white pos in
279 if s.[pos] = '='
280 then
281 let qpos = pos+1 in
282 if qpos = slen
283 then parse_error "not enough data for attribute" s pos;
285 let qc = s.[qpos] in
286 if not (qc = '\'' || qc = '\"')
287 then parse_error "assignment is not followed by a quote" s pos;
289 let closing_q_pos =
290 let rec find i =
291 if i = slen
292 then parse_error "not enough data for attribute value" s pos;
294 if s.[i] = qc then i else find (i+1)
296 find (qpos+1)
299 let vallen = closing_q_pos - (qpos+1) in
300 let val' = String.sub s (qpos+1) vallen in
301 (name, val'), closing_q_pos+1
303 else parse_error "attribute name not followed by '='" s pos
306 let pos = find_non_white pos in
307 if s.[pos] = '>'
308 then accu, pos+1, false
309 else (
310 if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
311 then accu, pos+2, true
312 else (
313 if isname s.[pos]
314 then (
315 let nameval, pos = nameval pos in
316 let accu = nameval :: accu in
317 f accu pos
319 else parse_error "malformed attribute list" s pos;
323 f [] pos
325 let _, _ = collect v 0 `text in
326 v.accu