parser.ml

   1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
   2
   3 let r_comment_terminator = Str.regexp "-->";;
   4 let r_CDATA_terminator = Str.regexp "\\]\\]>";;
   5 let r_q_terminator = Str.regexp "\\?>";;
   6
   7 let iswhite = function
   8   | '\r' | '\n' | '\t' | ' ' -> true
   9   | _ -> false
  10 ;;
  11
  12 let isname = function
  13   | '.' | '-' | '_' | ':' -> true
  14   | c -> (c >= '0' && c <= '9')
  15       || (c >= 'a' && c <= 'z')
  16       || (c >= 'A' && c <= 'Z')
  17 ;;
  18
  19 exception Parse_error of string * string * int;;
  20
  21 let parse_error msg s pos =
  22   raise (Parse_error (msg, s, pos))
  23 ;;
  24
  25 let enent s pos len =
  26   let b = Buffer.create len in
  27   let rec loop i =
  28     if i - pos = len
  29     then Buffer.contents b
  30     else (
  31       begin match s.[i] with
  32       | '<' -> Buffer.add_string b "&lt;"
  33       | '>' -> Buffer.add_string b "&gt;"
  34       | '\'' -> Buffer.add_string b "&apos;"
  35       | '\"' -> Buffer.add_string b "&quot;"
  36       | '&' -> Buffer.add_string b "&amp;"
  37       | c ->
  38           let code = Char.code c in
  39           if code < 32 || code > 127
  40           then (
  41             Buffer.add_string b "&#";
  42             Buffer.add_string b (string_of_int code);
  43             Buffer.add_char b ';';
  44           )
  45           else Buffer.add_char b c
  46       end;
  47       loop (i+1)
  48     )
  49   in
  50   loop pos
  51 ;;
  52
  53 let unent b s pos len =
  54   let rec loop i =
  55     if i = pos + len
  56     then ()
  57     else
  58       let amppos =
  59         try
  60           String.index_from s i '&'
  61         with Not_found -> -1
  62       in
  63       if amppos = -1 || amppos >= pos + len
  64       then (
  65         Buffer.add_substring b s i (pos + len - i)
  66       )
  67       else (
  68         Buffer.add_substring b s i (amppos - i);
  69         if amppos = i + len then failwith "lonely amp";
  70
  71         let semipos =
  72           try
  73             let semipos = String.index_from s (amppos+1) ';' in
  74             if semipos >= pos + len then raise Not_found;
  75             semipos
  76           with Not_found -> failwith "amp not followed by semicolon"
  77         in
  78
  79         let subslen = semipos-amppos-1 in
  80         if subslen = 0 then failwith "empty amp";
  81
  82         let subs = String.sub s (amppos+1) subslen in
  83
  84         if subs.[0] = '#'
  85         then (
  86           if subslen = 1 then failwith "empty amp followed by hash";
  87           let code =
  88             if subs.[1] = 'x'
  89             then (
  90               Scanf.sscanf subs "#x%x" (fun n -> n)
  91             )
  92             else (
  93               int_of_string (String.sub subs 1 (subslen-1))
  94             )
  95           in
  96           let c = Char.unsafe_chr code in
  97           Buffer.add_char b c
  98         )
  99         else (
 100           match subs with
 101           | "lt" -> Buffer.add_char b '<'
 102           | "gt" -> Buffer.add_char b '>'
 103           | "amp" -> Buffer.add_char b '&'
 104           | "apos" -> Buffer.add_char b '\''
 105           | "quot" -> Buffer.add_char b '\"'
 106           | _ -> failwith ("unknown amp " ^ String.escaped subs)
 107         );
 108         loop (semipos+1)
 109       )
 110   in
 111   loop pos
 112 ;;
 113
 114 let subs s pos =
 115   let len = String.length s in
 116   let left = len - pos in
 117   if left < 0
 118   then
 119     Printf.sprintf "(pos=%d len=%d left=%d)"
 120       pos len left
 121   else
 122     let len = min left 10 in
 123     let s = String.sub s pos len in
 124     s;
 125 ;;
 126
 127 let ts = function
 128   | `text -> "text"
 129   | `lt -> "lt"
 130   | `close -> "close"
 131   | `exclam -> "exclam"
 132   | `question -> "question"
 133   | `doctype -> "doctype"
 134   | `comment -> "comment"
 135   | `tag -> "tag"
 136 ;;
 137
 138 type attr = string * string
 139 and attrs = attr list
 140 and vp =
 141     | Vdata
 142     | Vcdata
 143     | Vopen of string * attrs * bool
 144     | Vclose of string
 145     | Vend
 146 and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
 147 ;;
 148
 149 let parse v s =
 150   let slen = String.length s in
 151
 152   let find_substr pos subs r =
 153     let pos =
 154       try
 155         Str.search_forward r s pos
 156       with Not_found ->
 157         parse_error ("cannot find substring " ^ subs) s pos
 158     in
 159     pos
 160   in
 161
 162   let begins_with pos prefix =
 163     let prefixlen = String.length prefix in
 164     if String.length s - pos >= prefixlen
 165     then
 166       let rec cmp i =
 167         i = prefixlen || (s.[pos+i] = prefix.[i]) && cmp (i+1)
 168       in
 169       cmp 0
 170     else
 171       false
 172   in
 173
 174   let find_non_white pos =
 175     let rec forward i =
 176       if i >= slen
 177       then parse_error "cannot find non white space character" s pos;
 178       if iswhite s.[i] then forward (i+1) else i in
 179     forward pos
 180   in
 181
 182   let getname pos =
 183     let non_name_pos =
 184       let rec find_non_name i =
 185         if i >= slen then parse_error "cannot find non name character" s pos;
 186         if isname s.[i] then find_non_name (i+1) else i
 187       in
 188       find_non_name pos
 189     in
 190     non_name_pos, String.sub s pos (non_name_pos - pos)
 191   in
 192
 193   let rec collect v pos t =
 194     if pos >= slen && t != `text
 195     then parse_error ("not enough data for " ^ ts t) s pos;
 196
 197     match t with
 198     | `text ->
 199         let ltpos =
 200           try
 201             String.index_from s pos '<'
 202           with Not_found ->
 203             let rec trailsbywhite i =
 204               if pos+i = String.length s
 205               then -1
 206               else (
 207                 if not (iswhite s.[pos+i])
 208                 then parse_error "garbage at the end" s pos
 209                 else trailsbywhite (i+1)
 210               )
 211             in
 212             trailsbywhite 0
 213         in
 214         if ltpos = -1
 215         then
 216           v.f v Vend pos slen, slen
 217         else
 218           let start_of_text_pos = find_non_white pos in
 219           let end_of_text_pos =
 220             if start_of_text_pos < ltpos
 221             then
 222               let rec find i =
 223                 if i = start_of_text_pos || not (iswhite s.[i])
 224                 then i+1
 225                 else find (i-1)
 226               in
 227               find (ltpos-1)
 228             else start_of_text_pos
 229           in
 230           let v =
 231             if start_of_text_pos != end_of_text_pos
 232             then v.f v Vdata start_of_text_pos end_of_text_pos
 233             else v
 234           in
 235           collect v (ltpos+1) `lt
 236
 237     | `lt ->
 238         let pos, t =
 239           match s.[pos] with
 240           | '/' -> (pos+1), `close
 241           | '!' -> (pos+1), `exclam
 242           | '?' -> (pos+1), `question
 243           | c when isname c -> pos, `tag
 244           | _ -> parse_error "invalid data after <" s pos
 245         in
 246         collect v pos t
 247
 248     | `close ->
 249         let tag_name_pos = find_non_white pos in
 250         let tag_name_end_pos, close_tag_name = getname tag_name_pos in
 251         let close_tag_pos = find_non_white tag_name_end_pos in
 252         if s.[close_tag_pos] != '>'
 253         then parse_error "missing >" s pos;
 254         let pos' = close_tag_pos + 1 in
 255         let v = v.f v (Vclose close_tag_name) pos pos' in
 256         collect v pos' `text
 257
 258     | `doctype ->
 259         let close_tag_pos =
 260           try
 261             String.index_from s pos '>'
 262           with Not_found ->
 263             parse_error "doctype is not terminated" s pos
 264         in
 265         collect v (close_tag_pos+1) `text
 266
 267     | `comment ->
 268         let pos =
 269           try
 270             find_substr pos "-->" r_comment_terminator
 271           with Not_found ->
 272             parse_error "comment is not terminated" s pos
 273         in
 274         collect v (pos+3) `text
 275
 276     | `exclam ->
 277         if begins_with pos "[CDATA["
 278         then
 279           let cdata_start = pos+7 in
 280           let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
 281           let v = v.f v Vcdata cdata_start cdata_end in
 282           collect v (cdata_end+3) `text
 283         else (
 284           if begins_with pos "DOCTYPE"
 285           then
 286             collect v (pos+7) `doctype
 287           else (
 288             if begins_with pos "--"
 289             then collect v (pos+2) `comment
 290             else parse_error "unknown shit after exclamation mark" s pos
 291           )
 292         )
 293
 294     | `question ->
 295         let pos = find_substr pos "?>" r_q_terminator in
 296         collect v (pos+2) `text
 297
 298     | `tag ->
 299         let pos', name = getname pos in
 300         let attrs, pos', closed = collect_attributes pos' in
 301         let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
 302         collect v pos' `text
 303
 304   and collect_attributes pos =
 305     let rec f accu pos =
 306       let nameval pos =
 307         let pos, name = getname pos in
 308         let pos = find_non_white pos in
 309         if s.[pos] = '='
 310         then
 311           let qpos = pos+1 in
 312           if qpos = slen
 313           then parse_error "not enough data for attribute" s pos;
 314
 315           let qc = s.[qpos] in
 316           if not (qc = '\'' || qc = '\"')
 317           then parse_error "assignment is not followed by a quote" s pos;
 318
 319           let closing_q_pos =
 320             let rec find i =
 321               if i = slen
 322               then parse_error "not enough data for attribute value" s pos;
 323
 324               if s.[i] = qc then i else find (i+1)
 325             in
 326             find (qpos+1)
 327           in
 328
 329           let vallen = closing_q_pos - (qpos+1) in
 330           let val' = String.sub s (qpos+1) vallen in
 331           (name, val'), closing_q_pos+1
 332
 333         else parse_error "attribute name not followed by '='" s pos
 334       in
 335
 336       let pos = find_non_white pos in
 337       if s.[pos] = '>'
 338       then
 339         accu, pos+1, false
 340       else (
 341         if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
 342         then
 343           accu, pos+2, true
 344         else (
 345           if isname s.[pos]
 346           then (
 347             let nameval, pos = nameval pos in
 348             let accu = nameval :: accu in
 349             f accu pos
 350           )
 351           else parse_error "malformed attribute list" s pos;
 352         )
 353       )
 354     in
 355     f [] pos
 356   in
 357   let _, _ = collect v 0 `text in
 358   v.accu;
 359 ;;