Cosmetics
[llpp.git] / parser.ml
blobce5197cc0090586f4ac342f2d8d6410d23681a5e
1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
3 let iswhite = function
4 | '\r' | '\n' | '\t' | ' ' -> true
5 | _ -> false
6 ;;
8 let isname = function
9 | '.' | '-' | '_' | ':' -> true
10 | c -> (c >= '0' && c <= '9')
11 || (c >= 'a' && c <= 'z')
12 || (c >= 'A' && c <= 'Z')
15 exception Parse_error of string * string * int;;
17 let parse_error msg s pos =
18 raise (Parse_error (msg, s, pos))
21 let enent s pos len =
22 let b = Buffer.create len in
23 let rec loop i =
24 if i - pos = len
25 then Buffer.contents b
26 else (
27 begin match s.[i] with
28 | '<' -> Buffer.add_string b "&lt;"
29 | '>' -> Buffer.add_string b "&gt;"
30 | '\'' -> Buffer.add_string b "&apos;"
31 | '\"' -> Buffer.add_string b "&quot;"
32 | '&' -> Buffer.add_string b "&amp;"
33 | c ->
34 let code = Char.code c in
35 if code < 32 || code > 127
36 then (
37 Buffer.add_string b "&#";
38 Buffer.add_string b (string_of_int code);
39 Buffer.add_char b ';';
41 else Buffer.add_char b c
42 end;
43 loop (i+1)
46 loop pos
49 let unent b s pos len =
50 let rec loop i =
51 if i != pos + len
52 then
53 let amppos =
54 try
55 String.index_from s i '&'
56 with Not_found -> -1
58 if amppos = -1 || amppos >= pos + len
59 then Buffer.add_substring b s i (pos + len - i)
60 else (
61 Buffer.add_substring b s i (amppos - i);
62 if amppos = i + len then Utils.error "lonely amp";
64 let semipos =
65 try
66 let semipos = String.index_from s (amppos+1) ';' in
67 if semipos >= pos + len then raise Not_found;
68 semipos
69 with Not_found ->
70 Utils.error "amp not followed by semicolon at %d" amppos
73 let subslen = semipos-amppos-1 in
74 if subslen = 0 then Utils.error "empty amp at %d" amppos;
76 let subs = String.sub s (amppos+1) subslen in
78 if subs.[0] = '#'
79 then (
80 if subslen = 1
81 then Utils.error "empty amp followed by hash at %d" amppos;
82 let code =
83 if subs.[1] = 'x'
84 then Scanf.sscanf subs "#x%x" (fun n -> n)
85 else int_of_string (String.sub subs 1 (subslen-1))
87 let c = Char.unsafe_chr code in
88 Buffer.add_char b c
90 else (
91 match subs with
92 | "lt" -> Buffer.add_char b '<'
93 | "gt" -> Buffer.add_char b '>'
94 | "amp" -> Buffer.add_char b '&'
95 | "apos" -> Buffer.add_char b '\''
96 | "quot" -> Buffer.add_char b '\"'
97 | _ -> Utils.error "unknown amp %S" subs
99 loop (semipos+1)
102 loop pos
105 let subs s pos =
106 let len = String.length s in
107 let left = len - pos in
108 if left < 0
109 then Printf.sprintf "(pos=%d len=%d left=%d)" pos len left
110 else
111 let len = min left 10 in
112 String.sub s pos len
115 let ts = function
116 | `text -> "text"
117 | `lt -> "lt"
118 | `close -> "close"
119 | `exclam -> "exclam"
120 | `question -> "question"
121 | `doctype -> "doctype"
122 | `comment -> "comment"
123 | `tag -> "tag"
126 type attr = string * string
127 and attrs = attr list
128 and vp =
129 | Vdata
130 | Vcdata
131 | Vopen of string * attrs * bool
132 | Vclose of string
133 | Vend
134 and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
137 let parse v s =
138 let r_comment_terminator = Str.regexp "-->"
139 and r_CDATA_terminator = Str.regexp "\\]\\]>"
140 and r_q_terminator = Str.regexp "\\?>" in
142 let slen = String.length s in
144 let find_substr pos subs r =
145 let pos =
147 Str.search_forward r s pos
148 with Not_found ->
149 parse_error ("cannot find substring " ^ subs) s pos
153 let begins_with pos prefix = Utils.substratis s pos prefix in
154 let find_non_white pos =
155 let rec forward i =
156 if i >= slen
157 then parse_error "cannot find non white space character" s pos;
158 if iswhite s.[i] then forward (i+1) else i in
159 forward pos
162 let getname pos =
163 let non_name_pos =
164 let rec find_non_name i =
165 if i >= slen then parse_error "cannot find non name character" s pos;
166 if isname s.[i] then find_non_name (i+1) else i
168 find_non_name pos
170 non_name_pos, String.sub s pos (non_name_pos - pos)
173 let rec collect v pos t =
174 if pos >= slen && t != `text
175 then parse_error ("not enough data for " ^ ts t) s pos;
177 match t with
178 | `text ->
179 let ltpos =
181 String.index_from s pos '<'
182 with Not_found ->
183 let rec trailsbywhite i =
184 if pos+i = String.length s
185 then -1
186 else (
187 if not (iswhite s.[pos+i])
188 then parse_error "garbage at the end" s pos
189 else trailsbywhite (i+1)
192 trailsbywhite 0
194 if ltpos = -1
195 then
196 v.f v Vend pos slen, slen
197 else
198 let start_of_text_pos = find_non_white pos in
199 let end_of_text_pos =
200 if start_of_text_pos < ltpos
201 then
202 let rec find i =
203 if i = start_of_text_pos || not (iswhite s.[i])
204 then i+1
205 else find (i-1)
207 find (ltpos-1)
208 else start_of_text_pos
210 let v =
211 if start_of_text_pos != end_of_text_pos
212 then v.f v Vdata start_of_text_pos end_of_text_pos
213 else v
215 collect v (ltpos+1) `lt
217 | `lt ->
218 let pos, t =
219 match s.[pos] with
220 | '/' -> (pos+1), `close
221 | '!' -> (pos+1), `exclam
222 | '?' -> (pos+1), `question
223 | c when isname c -> pos, `tag
224 | _ -> parse_error "invalid data after <" s pos
226 collect v pos t
228 | `close ->
229 let tag_name_pos = find_non_white pos in
230 let tag_name_end_pos, close_tag_name = getname tag_name_pos in
231 let close_tag_pos = find_non_white tag_name_end_pos in
232 if s.[close_tag_pos] != '>'
233 then parse_error "missing >" s pos;
234 let pos' = close_tag_pos + 1 in
235 let v = v.f v (Vclose close_tag_name) pos pos' in
236 collect v pos' `text
238 | `doctype ->
239 let close_tag_pos =
241 String.index_from s pos '>'
242 with Not_found ->
243 parse_error "doctype is not terminated" s pos
245 collect v (close_tag_pos+1) `text
247 | `comment ->
248 let pos =
250 find_substr pos "-->" r_comment_terminator
251 with Not_found ->
252 parse_error "comment is not terminated" s pos
254 collect v (pos+3) `text
256 | `exclam ->
257 if begins_with pos "[CDATA["
258 then
259 let cdata_start = pos+7 in
260 let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
261 let v = v.f v Vcdata cdata_start cdata_end in
262 collect v (cdata_end+3) `text
263 else (
264 if begins_with pos "DOCTYPE"
265 then
266 collect v (pos+7) `doctype
267 else (
268 if begins_with pos "--"
269 then collect v (pos+2) `comment
270 else parse_error "unknown shit after exclamation mark" s pos
274 | `question ->
275 let pos = find_substr pos "?>" r_q_terminator in
276 collect v (pos+2) `text
278 | `tag ->
279 let pos', name = getname pos in
280 let attrs, pos', closed = collect_attributes pos' in
281 let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
282 collect v pos' `text
284 and collect_attributes pos =
285 let rec f accu pos =
286 let nameval pos =
287 let pos, name = getname pos in
288 let pos = find_non_white pos in
289 if s.[pos] = '='
290 then
291 let qpos = pos+1 in
292 if qpos = slen
293 then parse_error "not enough data for attribute" s pos;
295 let qc = s.[qpos] in
296 if not (qc = '\'' || qc = '\"')
297 then parse_error "assignment is not followed by a quote" s pos;
299 let closing_q_pos =
300 let rec find i =
301 if i = slen
302 then parse_error "not enough data for attribute value" s pos;
304 if s.[i] = qc then i else find (i+1)
306 find (qpos+1)
309 let vallen = closing_q_pos - (qpos+1) in
310 let val' = String.sub s (qpos+1) vallen in
311 (name, val'), closing_q_pos+1
313 else parse_error "attribute name not followed by '='" s pos
316 let pos = find_non_white pos in
317 if s.[pos] = '>'
318 then
319 accu, pos+1, false
320 else (
321 if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
322 then
323 accu, pos+2, true
324 else (
325 if isname s.[pos]
326 then (
327 let nameval, pos = nameval pos in
328 let accu = nameval :: accu in
329 f accu pos
331 else parse_error "malformed attribute list" s pos;
335 f [] pos
337 let _, _ = collect v 0 `text in
338 v.accu;