1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
3 let dolog fmt
= Printf.kprintf prerr_endline fmt
;;
4 let r_comment_terminator = Str.regexp
"-->";;
5 let r_CDATA_terminator = Str.regexp
"\\]\\]>";;
6 let r_q_terminator = Str.regexp
"\\?>";;
9 | '
\r'
| '
\n'
| '
\t'
| ' '
-> true
14 | '
.'
| '
-'
| '_'
| '
:'
-> true
15 | c
-> (c
>= '
0'
&& c
<= '
9'
)
16 || (c
>= 'a'
&& c
<= 'z'
)
17 || (c
>= 'A'
&& c
<= 'Z'
)
20 exception Parse_error
of string * string * int
23 raise
(Parse_error
(msg
, s
, pos
))
27 let b = Buffer.create len
in
30 then Buffer.contents
b
32 begin match s
.[i
] with
33 | '
<'
-> Buffer.add_string
b "<"
34 | '
>'
-> Buffer.add_string
b ">"
35 | '
\''
-> Buffer.add_string
b "'"
36 | '
"' -> Buffer.add_string b ""
;"
37 | '&' -> Buffer.add_string b "&
;"
39 let code = Char.code c in
40 if code < 32 || code = 127
42 Buffer.add_string b "&#
";
43 Buffer.add_string b (string_of_int code);
44 Buffer.add_char b ';';
46 else Buffer.add_char b c
54 let unent b s pos len =
61 String.index_from s i '&'
64 if amppos = -1 || amppos >= pos + len
66 Buffer.add_substring b s i (pos + len - i)
69 Buffer.add_substring b s i (amppos - i);
70 if amppos = i + len then failwith "lonely amp
";
74 let semipos = String.index_from s (amppos+1) ';' in
75 if semipos >= pos + len then raise Not_found;
77 with Not_found -> failwith "amp not followed by semi colon
"
80 let subslen = semipos-amppos-1 in
81 if subslen = 0 then failwith "empty amp
";
83 let subs = String.sub s (amppos+1) subslen in
87 if subslen = 1 then failwith "empty amp followed by hash
";
95 int_of_string (String.sub subs 1 (subslen-1))
98 let c = Char.unsafe_chr code in
103 | "lt
" -> Buffer.add_char b '<'
104 | "gt
" -> Buffer.add_char b '>'
105 | "amp
" -> Buffer.add_char b '&'
106 | "apos
" -> Buffer.add_char b '\''
107 | "quot
" -> Buffer.add_char b '"'
108 | _
-> failwith
("unknown amp " ^
String.escaped
subs)
117 let len = String.length s
in
118 let left = len - pos
in
121 Printf.sprintf
"(pos=%d len=%d left=%d)"
124 let len = min
left 10 in
125 let s = String.sub
s pos
len in
133 | `exclam
-> "exclam"
134 | `question
-> "question"
135 | `doctype
-> "doctype"
136 | `comment
-> "comment"
140 type attr
= string * string
141 and attrs
= attr list
145 | Vopen
of string * attrs
* bool
148 and 'a v
= { f
: 'a v
-> vp
-> int -> int -> 'a v
; accu
: 'a
}
152 let slen = String.length
s in
154 let find_substr pos
subs r
=
157 Str.search_forward r
s pos
159 error ("couldn't find substring " ^
subs) s pos
164 let begins_with pos prefix
=
165 let prefixlen = String.length prefix
in
166 if String.length
s - pos >= prefixlen
169 i
= prefixlen || (s.[pos+i
] = prefix
.[i
]) && cmp (i
+1)
176 let find_non_white pos =
178 if i
>= slen then error "couldn't find non white space character" s pos;
179 if iswhite s.[i
] then forward (i
+1) else i
in
185 let rec find_non_name i
=
186 if i
>= slen then error "couldn't find non name character" s pos;
187 if isname s.[i
] then find_non_name (i
+1) else i
191 non_name_pos, String.sub
s pos (non_name_pos - pos)
194 let rec collect v
pos t
=
195 if pos >= slen && t
!= `text
196 then error ("not enough data for " ^
ts t
) s pos;
202 String.index_from
s pos '
<'
204 let rec trailsbywhite i
=
205 if pos+i
= String.length
s
208 if not
(iswhite s.[pos+i
])
209 then error "garbage at the end" s pos
210 else trailsbywhite (i
+1)
217 v
.f v Vend
pos slen, slen
219 let start_of_text_pos = find_non_white pos in
220 let end_of_text_pos =
221 if start_of_text_pos < ltpos
224 if i
= start_of_text_pos || not
(iswhite s.[i
])
229 else start_of_text_pos
232 if start_of_text_pos != end_of_text_pos
233 then v.f
v Vdata
start_of_text_pos end_of_text_pos
236 collect v (ltpos+1) `lt
241 | '
/'
-> (pos+1), `close
242 | '
!'
-> (pos+1), `exclam
243 | '?'
-> (pos+1), `question
244 | c when isname c -> pos, `tag
245 | _
-> error "invalid data after <" s pos
250 let tag_name_pos = find_non_white pos in
251 let tag_name_end_pos, close_tag_name
= getname tag_name_pos in
252 let close_tag_pos = find_non_white tag_name_end_pos in
253 if s.[close_tag_pos] != '
>'
254 then error "missing >" s pos;
255 let pos'
= close_tag_pos + 1 in
256 let v = v.f
v (Vclose close_tag_name
) pos pos'
in
262 String.index_from
s pos '
>'
264 error "doctype is not terminated" s pos
266 collect v (close_tag_pos+1) `text
271 find_substr pos "-->" r_comment_terminator
273 error "comment is not terminated" s pos
275 collect v (pos+3) `text
278 if begins_with pos "[CDATA["
280 let cdata_start = pos+7 in
281 let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
282 let v = v.f
v Vcdata
cdata_start cdata_end in
283 collect v (cdata_end+3) `text
285 if begins_with pos "DOCTYPE"
287 collect v (pos+7) `doctype
289 if begins_with pos "--"
290 then collect v (pos+2) `comment
291 else error "unknown shit after exclamation mark" s pos
296 let pos = find_substr pos "?>" r_q_terminator in
297 collect v (pos+2) `text
300 let pos'
, name
= getname pos in
301 let attrs, pos'
, closed
= collect_attributes
pos'
in
302 let v = v.f
v (Vopen
(name
, attrs, closed
)) pos pos'
in
305 and collect_attributes
pos =
308 let pos, name
= getname pos in
309 let pos = find_non_white pos in
314 then error "not enough data for attribute" s pos;
317 if not
(qc = '
\''
|| qc = '
"')
318 then error "assignment is not followed by a quote
" s pos;
323 then error "not enough data
for attribute
value" s pos;
325 if s.[i] = qc then i else find (i+1)
330 let vallen = closing_q_pos - (qpos+1) in
331 let val' = String.sub s (qpos+1) vallen in
332 (name, val'), closing_q_pos+1
334 else error "attribute name not followed by '
='
" s pos
337 let pos = find_non_white pos in
342 if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
348 let nameval, pos = nameval pos in
349 let accu = nameval :: accu in
352 else error "malformed attribute list
" s pos;
358 let _, _ = collect v 0 `text in