parser.ml

   1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
   2
   3 let dolog fmt = Printf.kprintf prerr_endline fmt;;
   4 let r_comment_terminator = Str.regexp "-->";;
   5 let r_CDATA_terminator = Str.regexp "\\]\\]>";;
   6 let r_q_terminator = Str.regexp "\\?>";;
   7
   8 let iswhite = function
   9   | '\r' | '\n' | '\t' | ' ' -> true
  10   | _ -> false
  11 ;;
  12
  13 let isname = function
  14   | '.' | '-' | '_' | ':' -> true
  15   | c -> (c >= '0' && c <= '9')
  16       || (c >= 'a' && c <= 'z')
  17       || (c >= 'A' && c <= 'Z')
  18 ;;
  19
  20 exception Parse_error of string * string * int
  21
  22 let error msg s pos =
  23   raise (Parse_error (msg, s, pos))
  24 ;;
  25
  26 let enent s pos len =
  27   let b = Buffer.create len in
  28   let rec loop i =
  29     if i - pos = len
  30     then Buffer.contents b
  31     else (
  32       begin match s.[i] with
  33       | '<' -> Buffer.add_string b "&lt;"
  34       | '>' -> Buffer.add_string b "&gt;"
  35       | '\'' -> Buffer.add_string b "&apos;"
  36       | '"' -> Buffer.add_string b "&quot;"
  37       | '&' -> Buffer.add_string b "&amp;"
  38       | c ->
  39           let code = Char.code c in
  40           if code < 32 || code = 127
  41           then (
  42             Buffer.add_string b "&#";
  43             Buffer.add_string b (string_of_int code);
  44             Buffer.add_char b ';';
  45           )
  46           else Buffer.add_char b c
  47       end;
  48       loop (i+1)
  49     )
  50   in
  51   loop pos
  52 ;;
  53
  54 let unent b s pos len =
  55   let rec loop i =
  56     if i = pos + len
  57     then ()
  58     else
  59       let amppos =
  60         try
  61           String.index_from s i '&'
  62         with Not_found -> -1
  63       in
  64       if amppos = -1 || amppos >= pos + len
  65       then (
  66         Buffer.add_substring b s i (pos + len - i)
  67       )
  68       else (
  69         Buffer.add_substring b s i (amppos - i);
  70         if amppos = i + len then failwith "lonely amp";
  71
  72         let semipos =
  73           try
  74             let semipos = String.index_from s (amppos+1) ';' in
  75             if semipos >= pos + len then raise Not_found;
  76             semipos
  77           with Not_found -> failwith "amp not followed by semi colon"
  78         in
  79
  80         let subslen = semipos-amppos-1 in
  81         if subslen = 0 then failwith "empty amp";
  82
  83         let subs = String.sub s (amppos+1) subslen in
  84
  85         if subs.[0] = '#'
  86         then (
  87           if subslen = 1 then failwith "empty amp followed by hash";
  88           let code =
  89             if subs.[1] = 'x'
  90             then (
  91               subs.[0] <- '0';
  92               int_of_string subs
  93             )
  94             else (
  95               int_of_string (String.sub subs 1 (subslen-1))
  96             )
  97           in
  98           let c = Char.unsafe_chr code in
  99           Buffer.add_char b c
 100         )
 101         else (
 102           match subs with
 103           | "lt" -> Buffer.add_char b '<'
 104           | "gt" -> Buffer.add_char b '>'
 105           | "amp" -> Buffer.add_char b '&'
 106           | "apos" -> Buffer.add_char b '\''
 107           | "quot" -> Buffer.add_char b '"'
 108           | _ -> failwith ("unknown amp " ^ String.escaped subs)
 109         );
 110         loop (semipos+1)
 111       )
 112   in
 113   loop pos
 114 ;;
 115
 116 let subs s pos =
 117   let len = String.length s in
 118   let left = len - pos in
 119   if left < 0
 120   then
 121     Printf.sprintf "(pos=%d len=%d left=%d)"
 122       pos len left
 123   else
 124     let len = min left 10 in
 125     let s = String.sub s pos len in
 126     s;
 127 ;;
 128
 129 let ts = function
 130   | `text -> "text"
 131   | `lt -> "lt"
 132   | `close -> "close"
 133   | `exclam -> "exclam"
 134   | `question -> "question"
 135   | `doctype -> "doctype"
 136   | `comment -> "comment"
 137   | `tag -> "tag"
 138 ;;
 139
 140 type attr = string * string
 141 and attrs = attr list
 142 and vp =
 143     | Vdata
 144     | Vcdata
 145     | Vopen of string * attrs * bool
 146     | Vclose of string
 147     | Vend
 148 and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
 149 ;;
 150
 151 let parse v s =
 152   let slen = String.length s in
 153
 154   let find_substr pos subs r =
 155     let pos =
 156       try
 157         Str.search_forward r s pos
 158       with Not_found ->
 159         error ("couldn't find substring " ^ subs) s pos
 160     in
 161     pos
 162   in
 163
 164   let begins_with pos prefix =
 165     let prefixlen = String.length prefix in
 166     if String.length s - pos >= prefixlen
 167     then
 168       let rec cmp i =
 169         i = prefixlen || (s.[pos+i] = prefix.[i]) && cmp (i+1)
 170       in
 171       cmp 0
 172     else
 173       false
 174   in
 175
 176   let find_non_white pos =
 177     let rec forward i =
 178       if i >= slen then error "couldn't find non white space character" s pos;
 179       if iswhite s.[i] then forward (i+1) else i in
 180     forward pos
 181   in
 182
 183   let getname pos =
 184     let non_name_pos =
 185       let rec find_non_name i =
 186         if i >= slen then error "couldn't find  non name character" s pos;
 187         if isname s.[i] then find_non_name (i+1) else i
 188       in
 189       find_non_name pos
 190     in
 191     non_name_pos, String.sub s pos (non_name_pos - pos)
 192   in
 193
 194   let rec collect v pos t =
 195     if pos >= slen && t != `text
 196     then error ("not enough data for " ^ ts t) s pos;
 197
 198     match t with
 199     | `text ->
 200         let ltpos =
 201           try
 202             String.index_from s pos '<'
 203           with Not_found ->
 204             let rec trailsbywhite i =
 205               if pos+i = String.length s
 206               then -1
 207               else (
 208                 if not (iswhite s.[pos+i])
 209                 then error "garbage at the end" s pos
 210                 else trailsbywhite (i+1)
 211               )
 212             in
 213             trailsbywhite 0
 214         in
 215         if ltpos = -1
 216         then
 217           v.f v Vend pos slen, slen
 218         else
 219           let start_of_text_pos = find_non_white pos in
 220           let end_of_text_pos =
 221             if start_of_text_pos < ltpos
 222             then
 223               let rec find i =
 224                 if i = start_of_text_pos || not (iswhite s.[i])
 225                 then i+1
 226                 else find (i-1)
 227               in
 228               find (ltpos-1)
 229             else start_of_text_pos
 230           in
 231           let v =
 232             if start_of_text_pos != end_of_text_pos
 233             then v.f v Vdata start_of_text_pos end_of_text_pos
 234             else v
 235           in
 236           collect v (ltpos+1) `lt
 237
 238     | `lt ->
 239         let pos, t =
 240           match s.[pos] with
 241           | '/' -> (pos+1), `close
 242           | '!' -> (pos+1), `exclam
 243           | '?' -> (pos+1), `question
 244           | c when isname c -> pos, `tag
 245           | _ -> error "invalid data after <" s pos
 246         in
 247         collect v pos t
 248
 249     | `close ->
 250         let tag_name_pos = find_non_white pos in
 251         let tag_name_end_pos, close_tag_name = getname tag_name_pos in
 252         let close_tag_pos = find_non_white tag_name_end_pos in
 253         if s.[close_tag_pos] != '>'
 254         then error "missing >" s pos;
 255         let pos' = close_tag_pos + 1 in
 256         let v = v.f v (Vclose close_tag_name) pos pos' in
 257         collect v pos' `text
 258
 259     | `doctype ->
 260         let close_tag_pos =
 261           try
 262             String.index_from s pos '>'
 263           with Not_found ->
 264             error "doctype is not terminated" s pos
 265         in
 266         collect v (close_tag_pos+1) `text
 267
 268     | `comment ->
 269         let pos =
 270           try
 271             find_substr pos "-->" r_comment_terminator
 272           with Not_found ->
 273             error "comment is not terminated" s pos
 274         in
 275         collect v (pos+3) `text
 276
 277     | `exclam ->
 278         if begins_with pos "[CDATA["
 279         then
 280           let cdata_start = pos+7 in
 281           let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
 282           let v = v.f v Vcdata cdata_start cdata_end in
 283           collect v (cdata_end+3) `text
 284         else (
 285           if begins_with pos "DOCTYPE"
 286           then
 287             collect v (pos+7) `doctype
 288           else (
 289             if begins_with pos "--"
 290             then collect v (pos+2) `comment
 291             else error "unknown shit after exclamation mark" s pos
 292           )
 293         )
 294
 295     | `question ->
 296         let pos = find_substr pos "?>" r_q_terminator in
 297         collect v (pos+2) `text
 298
 299     | `tag ->
 300         let pos', name = getname pos in
 301         let attrs, pos', closed = collect_attributes pos' in
 302         let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
 303         collect v pos' `text
 304
 305   and collect_attributes pos =
 306     let rec f accu pos =
 307       let nameval pos =
 308         let pos, name = getname pos in
 309         let pos = find_non_white pos in
 310         if s.[pos] = '='
 311         then
 312           let qpos = pos+1 in
 313           if qpos = slen
 314           then error "not enough data for attribute" s pos;
 315
 316           let qc = s.[qpos] in
 317           if not (qc = '\'' || qc = '"')
 318           then error "assignment is not followed by a quote" s pos;
 319
 320           let closing_q_pos =
 321             let rec find i =
 322               if i = slen
 323               then error "not enough data for attribute value" s pos;
 324
 325               if s.[i] = qc then i else find (i+1)
 326             in
 327             find (qpos+1)
 328           in
 329
 330           let vallen = closing_q_pos - (qpos+1) in
 331           let val' = String.sub s (qpos+1) vallen in
 332           (name, val'), closing_q_pos+1
 333
 334         else error "attribute name not followed by '='" s pos
 335       in
 336
 337       let pos = find_non_white pos in
 338       if s.[pos] = '>'
 339       then
 340         accu, pos+1, false
 341       else (
 342         if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
 343         then
 344           accu, pos+2, true
 345         else (
 346           if isname s.[pos]
 347           then (
 348             let nameval, pos = nameval pos in
 349             let accu = nameval :: accu in
 350             f accu pos
 351           )
 352           else error "malformed attribute list" s pos;
 353         )
 354       )
 355     in
 356     f [] pos
 357   in
 358   let _, _ = collect v 0 `text in
 359   v.accu;
 360 ;;