1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
3 let r_comment_terminator = Str.regexp
"-->";;
4 let r_CDATA_terminator = Str.regexp
"\\]\\]>";;
5 let r_q_terminator = Str.regexp
"\\?>";;
8 | '
\r'
| '
\n'
| '
\t'
| ' '
-> true
13 | '
.'
| '
-'
| '_'
| '
:'
-> true
14 | c
-> (c
>= '
0'
&& c
<= '
9'
)
15 || (c
>= 'a'
&& c
<= 'z'
)
16 || (c
>= 'A'
&& c
<= 'Z'
)
19 exception Parse_error
of string * string * int
21 let parse_error msg s pos
=
22 raise
(Parse_error
(msg
, s
, pos
))
26 let b = Buffer.create len
in
29 then Buffer.contents
b
31 begin match s
.[i
] with
32 | '
<'
-> Buffer.add_string
b "<"
33 | '
>'
-> Buffer.add_string
b ">"
34 | '
\''
-> Buffer.add_string
b "'"
35 | '
\"'
-> Buffer.add_string
b """
36 | '
&'
-> Buffer.add_string
b "&"
38 let code = Char.code c
in
39 if code < 32 || code > 127
41 Buffer.add_string
b "&#";
42 Buffer.add_string
b (string_of_int
code);
43 Buffer.add_char
b '
;'
;
45 else Buffer.add_char
b c
53 let unent b s pos len
=
60 String.index_from s i '
&'
63 if amppos = -1 || amppos >= pos
+ len
65 Buffer.add_substring
b s i
(pos
+ len
- i
)
68 Buffer.add_substring
b s i
(amppos - i
);
69 if amppos = i
+ len
then failwith
"lonely amp";
73 let semipos = String.index_from s
(amppos+1) '
;'
in
74 if semipos >= pos
+ len
then raise Not_found
;
76 with Not_found
-> failwith
"amp not followed by semicolon"
79 let subslen = semipos-amppos-1 in
80 if subslen = 0 then failwith
"empty amp";
82 let subs = String.sub s
(amppos+1) subslen in
86 if subslen = 1 then failwith
"empty amp followed by hash";
90 Scanf.sscanf
subs "#x%x" (fun n
-> n
)
93 int_of_string
(String.sub
subs 1 (subslen-1))
96 let c = Char.unsafe_chr
code in
101 | "lt" -> Buffer.add_char
b '
<'
102 | "gt" -> Buffer.add_char
b '
>'
103 | "amp" -> Buffer.add_char
b '
&'
104 | "apos" -> Buffer.add_char
b '
\''
105 | "quot" -> Buffer.add_char
b '
\"'
106 | _
-> failwith
("unknown amp " ^
String.escaped
subs)
115 let len = String.length s
in
116 let left = len - pos
in
119 Printf.sprintf
"(pos=%d len=%d left=%d)"
122 let len = min
left 10 in
123 let s = String.sub
s pos
len in
131 | `exclam
-> "exclam"
132 | `question
-> "question"
133 | `doctype
-> "doctype"
134 | `comment
-> "comment"
138 type attr
= string * string
139 and attrs
= attr list
143 | Vopen
of string * attrs
* bool
146 and 'a v
= { f
: 'a v
-> vp
-> int -> int -> 'a v
; accu
: 'a
}
150 let slen = String.length
s in
152 let find_substr pos
subs r
=
155 Str.search_forward r
s pos
157 parse_error ("couldn't find substring " ^
subs) s pos
162 let begins_with pos prefix
=
163 let prefixlen = String.length prefix
in
164 if String.length
s - pos >= prefixlen
167 i
= prefixlen || (s.[pos+i
] = prefix
.[i
]) && cmp (i
+1)
174 let find_non_white pos =
177 then parse_error "couldn't find non white space character" s pos;
178 if iswhite s.[i
] then forward (i
+1) else i
in
184 let rec find_non_name i
=
185 if i
>= slen then parse_error "couldn't find non name character" s pos;
186 if isname s.[i
] then find_non_name (i
+1) else i
190 non_name_pos, String.sub
s pos (non_name_pos - pos)
193 let rec collect v
pos t
=
194 if pos >= slen && t
!= `text
195 then parse_error ("not enough data for " ^
ts t
) s pos;
201 String.index_from
s pos '
<'
203 let rec trailsbywhite i
=
204 if pos+i
= String.length
s
207 if not
(iswhite s.[pos+i
])
208 then parse_error "garbage at the end" s pos
209 else trailsbywhite (i
+1)
216 v
.f v Vend
pos slen, slen
218 let start_of_text_pos = find_non_white pos in
219 let end_of_text_pos =
220 if start_of_text_pos < ltpos
223 if i
= start_of_text_pos || not
(iswhite s.[i
])
228 else start_of_text_pos
231 if start_of_text_pos != end_of_text_pos
232 then v.f
v Vdata
start_of_text_pos end_of_text_pos
235 collect v (ltpos+1) `lt
240 | '
/'
-> (pos+1), `close
241 | '
!'
-> (pos+1), `exclam
242 | '?'
-> (pos+1), `question
243 | c when isname c -> pos, `tag
244 | _
-> parse_error "invalid data after <" s pos
249 let tag_name_pos = find_non_white pos in
250 let tag_name_end_pos, close_tag_name
= getname tag_name_pos in
251 let close_tag_pos = find_non_white tag_name_end_pos in
252 if s.[close_tag_pos] != '
>'
253 then parse_error "missing >" s pos;
254 let pos'
= close_tag_pos + 1 in
255 let v = v.f
v (Vclose close_tag_name
) pos pos'
in
261 String.index_from
s pos '
>'
263 parse_error "doctype is not terminated" s pos
265 collect v (close_tag_pos+1) `text
270 find_substr pos "-->" r_comment_terminator
272 parse_error "comment is not terminated" s pos
274 collect v (pos+3) `text
277 if begins_with pos "[CDATA["
279 let cdata_start = pos+7 in
280 let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
281 let v = v.f
v Vcdata
cdata_start cdata_end in
282 collect v (cdata_end+3) `text
284 if begins_with pos "DOCTYPE"
286 collect v (pos+7) `doctype
288 if begins_with pos "--"
289 then collect v (pos+2) `comment
290 else parse_error "unknown shit after exclamation mark" s pos
295 let pos = find_substr pos "?>" r_q_terminator in
296 collect v (pos+2) `text
299 let pos'
, name
= getname pos in
300 let attrs, pos'
, closed
= collect_attributes
pos'
in
301 let v = v.f
v (Vopen
(name
, attrs, closed
)) pos pos'
in
304 and collect_attributes
pos =
307 let pos, name
= getname pos in
308 let pos = find_non_white pos in
313 then parse_error "not enough data for attribute" s pos;
316 if not
(qc = '
\''
|| qc = '
\"'
)
317 then parse_error "assignment is not followed by a quote" s pos;
322 then parse_error "not enough data for attribute value" s pos;
324 if s.[i
] = qc then i
else find (i
+1)
329 let vallen = closing_q_pos - (qpos+1) in
330 let val'
= String.sub
s (qpos+1) vallen in
331 (name
, val'
), closing_q_pos+1
333 else parse_error "attribute name not followed by '='" s pos
336 let pos = find_non_white pos in
341 if slen - pos > 2 && s.[pos] = '
/'
&& s.[pos+1] = '
>'
347 let nameval, pos = nameval pos in
348 let accu = nameval :: accu in
351 else parse_error "malformed attribute list" s pos;
357 let _, _ = collect v 0 `text
in