Simplify
[llpp.git] / parser.ml
blobd7042e91c72b67820ef564a6979945956ac1cc4b
1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
3 let iswhite = function
4 | '\r' | '\n' | '\t' | ' ' -> true
5 | _ -> false
6 ;;
8 let isname = function
9 | '.' | '-' | '_' | ':' -> true
10 | c -> (c >= '0' && c <= '9')
11 || (c >= 'a' && c <= 'z')
12 || (c >= 'A' && c <= 'Z')
15 exception Parse_error of string * string * int;;
17 let parse_error msg s pos = raise (Parse_error (msg, s, pos));;
19 let enent s pos len =
20 let b = Buffer.create len in
21 let rec loop i =
22 if i - pos = len
23 then Buffer.contents b
24 else (
25 begin match s.[i] with
26 | '<' -> Buffer.add_string b "&lt;"
27 | '>' -> Buffer.add_string b "&gt;"
28 | '\'' -> Buffer.add_string b "&apos;"
29 | '\"' -> Buffer.add_string b "&quot;"
30 | '&' -> Buffer.add_string b "&amp;"
31 | c ->
32 let code = Char.code c in
33 if code < 32 || code > 127
34 then (
35 Buffer.add_string b "&#";
36 Buffer.add_string b (string_of_int code);
37 Buffer.add_char b ';';
39 else Buffer.add_char b c
40 end;
41 loop (i+1)
44 loop pos
47 let unent b s pos len =
48 let rec loop i =
49 if i != pos + len
50 then
51 let amppos =
52 try
53 String.index_from s i '&'
54 with Not_found -> -1
56 if amppos = -1 || amppos >= pos + len
57 then Buffer.add_substring b s i (pos + len - i)
58 else (
59 Buffer.add_substring b s i (amppos - i);
60 if amppos = i + len then Utils.error "lonely amp";
62 let semipos =
63 try
64 let semipos = String.index_from s (amppos+1) ';' in
65 if semipos >= pos + len then raise Not_found;
66 semipos
67 with Not_found ->
68 Utils.error "amp not followed by semicolon at %d" amppos
71 let subslen = semipos-amppos-1 in
72 if subslen = 0 then Utils.error "empty amp at %d" amppos;
74 let subs = String.sub s (amppos+1) subslen in
76 if subs.[0] = '#'
77 then (
78 if subslen = 1
79 then Utils.error "empty amp followed by hash at %d" amppos;
80 let code =
81 if subs.[1] = 'x'
82 then Scanf.sscanf subs "#x%x" (fun n -> n)
83 else int_of_string (String.sub subs 1 (subslen-1))
85 let c = Char.unsafe_chr code in
86 Buffer.add_char b c
88 else (
89 match subs with
90 | "lt" -> Buffer.add_char b '<'
91 | "gt" -> Buffer.add_char b '>'
92 | "amp" -> Buffer.add_char b '&'
93 | "apos" -> Buffer.add_char b '\''
94 | "quot" -> Buffer.add_char b '\"'
95 | _ -> Utils.error "unknown amp %S" subs
97 loop (semipos+1)
100 loop pos
103 let subs s pos =
104 let len = String.length s in
105 let left = len - pos in
106 if left < 0
107 then Printf.sprintf "(pos=%d len=%d left=%d)" pos len left
108 else
109 let len = min left 10 in
110 String.sub s pos len
113 let ts = function
114 | `text -> "text"
115 | `lt -> "lt"
116 | `close -> "close"
117 | `exclam -> "exclam"
118 | `question -> "question"
119 | `doctype -> "doctype"
120 | `comment -> "comment"
121 | `tag -> "tag"
124 type attr = string * string
125 and attrs = attr list
126 and vp =
127 | Vdata
128 | Vcdata
129 | Vopen of string * attrs * bool
130 | Vclose of string
131 | Vend
132 and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
135 let parse v s =
136 let r_comment_terminator = Str.regexp "-->"
137 and r_CDATA_terminator = Str.regexp "\\]\\]>"
138 and r_q_terminator = Str.regexp "\\?>" in
140 let slen = String.length s in
142 let find_substr pos subs r =
143 let pos =
145 Str.search_forward r s pos
146 with Not_found ->
147 parse_error ("cannot find substring " ^ subs) s pos
151 let begins_with pos prefix = Utils.substratis s pos prefix in
152 let find_non_white pos =
153 let rec forward i =
154 if i >= slen
155 then parse_error "cannot find non white space character" s pos;
156 if iswhite s.[i] then forward (i+1) else i in
157 forward pos
160 let getname pos =
161 let non_name_pos =
162 let rec find_non_name i =
163 if i >= slen then parse_error "cannot find non name character" s pos;
164 if isname s.[i] then find_non_name (i+1) else i
166 find_non_name pos
168 non_name_pos, String.sub s pos (non_name_pos - pos)
171 let rec collect v pos t =
172 if pos >= slen && t != `text
173 then parse_error ("not enough data for " ^ ts t) s pos;
175 match t with
176 | `text ->
177 let ltpos =
179 String.index_from s pos '<'
180 with Not_found ->
181 let rec trailsbywhite i =
182 if pos+i = String.length s
183 then -1
184 else (
185 if not (iswhite s.[pos+i])
186 then parse_error "garbage at the end" s pos
187 else trailsbywhite (i+1)
190 trailsbywhite 0
192 if ltpos = -1
193 then v.f v Vend pos slen, slen
194 else
195 let start_of_text_pos = find_non_white pos in
196 let end_of_text_pos =
197 if start_of_text_pos < ltpos
198 then
199 let rec find i =
200 if i = start_of_text_pos || not (iswhite s.[i])
201 then i+1
202 else find (i-1)
204 find (ltpos-1)
205 else start_of_text_pos
207 let v =
208 if start_of_text_pos != end_of_text_pos
209 then v.f v Vdata start_of_text_pos end_of_text_pos
210 else v
212 collect v (ltpos+1) `lt
214 | `lt ->
215 let pos, t =
216 match s.[pos] with
217 | '/' -> (pos+1), `close
218 | '!' -> (pos+1), `exclam
219 | '?' -> (pos+1), `question
220 | c when isname c -> pos, `tag
221 | _ -> parse_error "invalid data after <" s pos
223 collect v pos t
225 | `close ->
226 let tag_name_pos = find_non_white pos in
227 let tag_name_end_pos, close_tag_name = getname tag_name_pos in
228 let close_tag_pos = find_non_white tag_name_end_pos in
229 if s.[close_tag_pos] != '>'
230 then parse_error "missing >" s pos;
231 let pos' = close_tag_pos + 1 in
232 let v = v.f v (Vclose close_tag_name) pos pos' in
233 collect v pos' `text
235 | `doctype ->
236 let close_tag_pos =
238 String.index_from s pos '>'
239 with Not_found ->
240 parse_error "doctype is not terminated" s pos
242 collect v (close_tag_pos+1) `text
244 | `comment ->
245 let pos =
247 find_substr pos "-->" r_comment_terminator
248 with Not_found ->
249 parse_error "comment is not terminated" s pos
251 collect v (pos+3) `text
253 | `exclam ->
254 if begins_with pos "[CDATA["
255 then
256 let cdata_start = pos+7 in
257 let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
258 let v = v.f v Vcdata cdata_start cdata_end in
259 collect v (cdata_end+3) `text
260 else (
261 if begins_with pos "DOCTYPE"
262 then
263 collect v (pos+7) `doctype
264 else (
265 if begins_with pos "--"
266 then collect v (pos+2) `comment
267 else parse_error "unknown shit after exclamation mark" s pos
271 | `question ->
272 let pos = find_substr pos "?>" r_q_terminator in
273 collect v (pos+2) `text
275 | `tag ->
276 let pos', name = getname pos in
277 let attrs, pos', closed = collect_attributes pos' in
278 let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
279 collect v pos' `text
281 and collect_attributes pos =
282 let rec f accu pos =
283 let nameval pos =
284 let pos, name = getname pos in
285 let pos = find_non_white pos in
286 if s.[pos] = '='
287 then
288 let qpos = pos+1 in
289 if qpos = slen
290 then parse_error "not enough data for attribute" s pos;
292 let qc = s.[qpos] in
293 if not (qc = '\'' || qc = '\"')
294 then parse_error "assignment is not followed by a quote" s pos;
296 let closing_q_pos =
297 let rec find i =
298 if i = slen
299 then parse_error "not enough data for attribute value" s pos;
301 if s.[i] = qc then i else find (i+1)
303 find (qpos+1)
306 let vallen = closing_q_pos - (qpos+1) in
307 let val' = String.sub s (qpos+1) vallen in
308 (name, val'), closing_q_pos+1
310 else parse_error "attribute name not followed by '='" s pos
313 let pos = find_non_white pos in
314 if s.[pos] = '>'
315 then accu, pos+1, false
316 else (
317 if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
318 then accu, pos+2, true
319 else (
320 if isname s.[pos]
321 then (
322 let nameval, pos = nameval pos in
323 let accu = nameval :: accu in
324 f accu pos
326 else parse_error "malformed attribute list" s pos;
330 f [] pos
332 let _, _ = collect v 0 `text in
333 v.accu;