1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
4 | '
\r'
| '
\n'
| '
\t'
| ' '
-> true
8 | '
.'
| '
-'
| '_'
| '
:'
-> true
9 | c
-> (c
>= '
0'
&& c
<= '
9'
)
10 || (c
>= 'a'
&& c
<= 'z'
)
11 || (c
>= 'A'
&& c
<= 'Z'
)
13 exception Parse_error
of string * string * int
15 let parse_error msg s pos
= raise
(Parse_error
(msg
, s
, pos
))
18 let b = Buffer.create len
in
21 then Buffer.contents
b
23 begin match s
.[i
] with
24 | '
<'
-> Buffer.add_string
b "<"
25 | '
>'
-> Buffer.add_string
b ">"
26 | '
\''
-> Buffer.add_string
b "'"
27 | '
\"'
-> Buffer.add_string
b """
28 | '
&'
-> Buffer.add_string
b "&"
30 let code = Char.code c
in
31 if code < 32 || code > 127
33 Buffer.add_string
b "&#";
34 Buffer.add_string
b (string_of_int
code);
35 Buffer.add_char
b '
;'
;
37 else Buffer.add_char
b c
44 let unent b s pos len
=
50 String.index_from s i '
&'
53 if amppos = -1 || amppos >= pos
+ len
54 then Buffer.add_substring
b s i
(pos
+ len
- i
)
56 Buffer.add_substring
b s i
(amppos - i
);
57 if amppos = i
+ len
then Utils.error
"lonely amp";
61 let semipos = String.index_from s
(amppos+1) '
;'
in
62 if semipos >= pos
+ len
then raise Not_found
;
65 Utils.error
"amp not followed by semicolon at %d" amppos
68 let subslen = semipos-amppos-1 in
69 if subslen = 0 then Utils.error
"empty amp at %d" amppos;
71 let subs = String.sub s
(amppos+1) subslen in
76 then Utils.error
"empty amp followed by hash at %d" amppos;
79 then Scanf.sscanf
subs "#x%x" (fun n
-> n
)
80 else int_of_string
(String.sub
subs 1 (subslen-1))
82 let c = Char.unsafe_chr
code in
87 | "lt" -> Buffer.add_char
b '
<'
88 | "gt" -> Buffer.add_char
b '
>'
89 | "amp" -> Buffer.add_char
b '
&'
90 | "apos" -> Buffer.add_char
b '
\''
91 | "quot" -> Buffer.add_char
b '
\"'
92 | _
-> Utils.error
"unknown amp %S" subs
100 let len = String.length s
in
101 let left = len - pos
in
103 then Printf.sprintf
"(pos=%d len=%d left=%d)" pos
len left
105 let len = min
left 10 in
112 | `exclam
-> "exclam"
113 | `question
-> "question"
114 | `doctype
-> "doctype"
115 | `comment
-> "comment"
118 type attr
= string * string
119 and attrs
= attr list
123 | Vopen
of string * attrs
* bool
126 and 'a v
= { f
: 'a v
-> vp
-> int -> int -> 'a v
; accu
: 'a
}
129 let r_comment_terminator = Str.regexp
"-->"
130 and r_CDATA_terminator
= Str.regexp
"\\]\\]>"
131 and r_q_terminator
= Str.regexp
"\\?>" in
133 let slen = String.length s
in
135 let find_substr pos
subs r
=
138 Str.search_forward r s
pos
140 parse_error ("cannot find substring " ^
subs) s
pos
144 let begins_with pos prefix
= Utils.substratis s
pos prefix
in
145 let find_non_white pos =
148 then parse_error "cannot find non white space character" s
pos;
149 if iswhite s
.[i
] then forward (i
+1) else i
in
155 let rec find_non_name i
=
156 if i
>= slen then parse_error "cannot find non name character" s
pos;
157 if isname s
.[i
] then find_non_name (i
+1) else i
161 non_name_pos, String.sub s
pos (non_name_pos - pos)
164 let rec collect v
pos t
=
165 if pos >= slen && t
!= `text
166 then parse_error ("not enough data for " ^
ts t
) s
pos;
172 String.index_from s
pos '
<'
174 let rec trailsbywhite i
=
175 if pos+i
= String.length s
178 if not
(iswhite s
.[pos+i
])
179 then parse_error "garbage at the end" s
pos
180 else trailsbywhite (i
+1)
186 then v
.f v Vend
pos slen, slen
188 let start_of_text_pos = find_non_white pos in
189 let end_of_text_pos =
190 if start_of_text_pos < ltpos
193 if i
= start_of_text_pos || not
(iswhite s
.[i
])
198 else start_of_text_pos
201 if start_of_text_pos != end_of_text_pos
202 then v.f
v Vdata
start_of_text_pos end_of_text_pos
205 collect v (ltpos+1) `lt
210 | '
/'
-> (pos+1), `close
211 | '
!'
-> (pos+1), `exclam
212 | '?'
-> (pos+1), `question
213 | c when isname c -> pos, `tag
214 | _
-> parse_error "invalid data after <" s
pos
219 let tag_name_pos = find_non_white pos in
220 let tag_name_end_pos, close_tag_name
= getname tag_name_pos in
221 let close_tag_pos = find_non_white tag_name_end_pos in
222 if s
.[close_tag_pos] != '
>'
223 then parse_error "missing >" s
pos;
224 let pos'
= close_tag_pos + 1 in
225 let v = v.f
v (Vclose close_tag_name
) pos pos'
in
231 String.index_from s
pos '
>'
233 parse_error "doctype is not terminated" s
pos
235 collect v (close_tag_pos+1) `text
240 find_substr pos "-->" r_comment_terminator
242 parse_error "comment is not terminated" s
pos
244 collect v (pos+3) `text
247 if begins_with pos "[CDATA["
249 let cdata_start = pos+7 in
250 let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator
in
251 let v = v.f
v Vcdata
cdata_start cdata_end in
252 collect v (cdata_end+3) `text
254 if begins_with pos "DOCTYPE"
256 collect v (pos+7) `doctype
258 if begins_with pos "--"
259 then collect v (pos+2) `comment
260 else parse_error "unknown shit after exclamation mark" s
pos
265 let pos = find_substr pos "?>" r_q_terminator
in
266 collect v (pos+2) `text
269 let pos'
, name
= getname pos in
270 let attrs, pos'
, closed
= collect_attributes
pos'
in
271 let v = v.f
v (Vopen
(name
, attrs, closed
)) pos pos'
in
274 and collect_attributes
pos =
277 let pos, name
= getname pos in
278 let pos = find_non_white pos in
283 then parse_error "not enough data for attribute" s
pos;
286 if not
(qc = '
\''
|| qc = '
\"'
)
287 then parse_error "assignment is not followed by a quote" s
pos;
292 then parse_error "not enough data for attribute value" s
pos;
294 if s
.[i
] = qc then i
else find (i
+1)
299 let vallen = closing_q_pos - (qpos+1) in
300 let val'
= String.sub s
(qpos+1) vallen in
301 (name
, val'
), closing_q_pos+1
303 else parse_error "attribute name not followed by '='" s
pos
306 let pos = find_non_white pos in
308 then accu
, pos+1, false
310 if slen - pos > 2 && s
.[pos] = '
/'
&& s
.[pos+1] = '
>'
311 then accu
, pos+2, true
315 let nameval, pos = nameval pos in
316 let accu = nameval :: accu in
319 else parse_error "malformed attribute list" s
pos;
325 let _, _ = collect v 0 `text
in