parser.ml

   1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
   2
   3 let r_comment_terminator = Str.regexp "-->";;
   4 let r_CDATA_terminator = Str.regexp "\\]\\]>";;
   5 let r_q_terminator = Str.regexp "\\?>";;
   6
   7 let iswhite = function
   8   | '\r' | '\n' | '\t' | ' ' -> true
   9   | _ -> false
  10 ;;
  11
  12 let isname = function
  13   | '.' | '-' | '_' | ':' -> true
  14   | c -> (c >= '0' && c <= '9')
  15       || (c >= 'a' && c <= 'z')
  16       || (c >= 'A' && c <= 'Z')
  17 ;;
  18
  19 exception Parse_error of string * string * int
  20
  21 let parse_error msg s pos =
  22   raise (Parse_error (msg, s, pos))
  23 ;;
  24
  25 let enent s pos len =
  26   let b = Buffer.create len in
  27   let rec loop i =
  28     if i - pos = len
  29     then Buffer.contents b
  30     else (
  31       begin match s.[i] with
  32       | '<' -> Buffer.add_string b "&lt;"
  33       | '>' -> Buffer.add_string b "&gt;"
  34       | '\'' -> Buffer.add_string b "&apos;"
  35       | '"' -> Buffer.add_string b "&quot;"
  36       | '&' -> Buffer.add_string b "&amp;"
  37       | c ->
  38           let code = Char.code c in
  39           if code < 32 || code > 127
  40           then (
  41             Buffer.add_string b "&#";
  42             Buffer.add_string b (string_of_int code);
  43             Buffer.add_char b ';';
  44           )
  45           else Buffer.add_char b c
  46       end;
  47       loop (i+1)
  48     )
  49   in
  50   loop pos
  51 ;;
  52
  53 let unent b s pos len =
  54   let rec loop i =
  55     if i = pos + len
  56     then ()
  57     else
  58       let amppos =
  59         try
  60           String.index_from s i '&'
  61         with Not_found -> -1
  62       in
  63       if amppos = -1 || amppos >= pos + len
  64       then (
  65         Buffer.add_substring b s i (pos + len - i)
  66       )
  67       else (
  68         Buffer.add_substring b s i (amppos - i);
  69         if amppos = i + len then failwith "lonely amp";
  70
  71         let semipos =
  72           try
  73             let semipos = String.index_from s (amppos+1) ';' in
  74             if semipos >= pos + len then raise Not_found;
  75             semipos
  76           with Not_found -> failwith "amp not followed by semicolon"
  77         in
  78
  79         let subslen = semipos-amppos-1 in
  80         if subslen = 0 then failwith "empty amp";
  81
  82         let subs = String.sub s (amppos+1) subslen in
  83
  84         if subs.[0] = '#'
  85         then (
  86           if subslen = 1 then failwith "empty amp followed by hash";
  87           let code =
  88             if subs.[1] = 'x'
  89             then (
  90               subs.[0] <- '0';
  91               int_of_string subs
  92             )
  93             else (
  94               int_of_string (String.sub subs 1 (subslen-1))
  95             )
  96           in
  97           let c = Char.unsafe_chr code in
  98           Buffer.add_char b c
  99         )
 100         else (
 101           match subs with
 102           | "lt" -> Buffer.add_char b '<'
 103           | "gt" -> Buffer.add_char b '>'
 104           | "amp" -> Buffer.add_char b '&'
 105           | "apos" -> Buffer.add_char b '\''
 106           | "quot" -> Buffer.add_char b '"'
 107           | _ -> failwith ("unknown amp " ^ String.escaped subs)
 108         );
 109         loop (semipos+1)
 110       )
 111   in
 112   loop pos
 113 ;;
 114
 115 let subs s pos =
 116   let len = String.length s in
 117   let left = len - pos in
 118   if left < 0
 119   then
 120     Printf.sprintf "(pos=%d len=%d left=%d)"
 121       pos len left
 122   else
 123     let len = min left 10 in
 124     let s = String.sub s pos len in
 125     s;
 126 ;;
 127
 128 let ts = function
 129   | `text -> "text"
 130   | `lt -> "lt"
 131   | `close -> "close"
 132   | `exclam -> "exclam"
 133   | `question -> "question"
 134   | `doctype -> "doctype"
 135   | `comment -> "comment"
 136   | `tag -> "tag"
 137 ;;
 138
 139 type attr = string * string
 140 and attrs = attr list
 141 and vp =
 142     | Vdata
 143     | Vcdata
 144     | Vopen of string * attrs * bool
 145     | Vclose of string
 146     | Vend
 147 and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
 148 ;;
 149
 150 let parse v s =
 151   let slen = String.length s in
 152
 153   let find_substr pos subs r =
 154     let pos =
 155       try
 156         Str.search_forward r s pos
 157       with Not_found ->
 158         parse_error ("couldn't find substring " ^ subs) s pos
 159     in
 160     pos
 161   in
 162
 163   let begins_with pos prefix =
 164     let prefixlen = String.length prefix in
 165     if String.length s - pos >= prefixlen
 166     then
 167       let rec cmp i =
 168         i = prefixlen || (s.[pos+i] = prefix.[i]) && cmp (i+1)
 169       in
 170       cmp 0
 171     else
 172       false
 173   in
 174
 175   let find_non_white pos =
 176     let rec forward i =
 177       if i >= slen
 178       then parse_error "couldn't find non white space character" s pos;
 179       if iswhite s.[i] then forward (i+1) else i in
 180     forward pos
 181   in
 182
 183   let getname pos =
 184     let non_name_pos =
 185       let rec find_non_name i =
 186         if i >= slen then parse_error "couldn't find  non name character" s pos;
 187         if isname s.[i] then find_non_name (i+1) else i
 188       in
 189       find_non_name pos
 190     in
 191     non_name_pos, String.sub s pos (non_name_pos - pos)
 192   in
 193
 194   let rec collect v pos t =
 195     if pos >= slen && t != `text
 196     then parse_error ("not enough data for " ^ ts t) s pos;
 197
 198     match t with
 199     | `text ->
 200         let ltpos =
 201           try
 202             String.index_from s pos '<'
 203           with Not_found ->
 204             let rec trailsbywhite i =
 205               if pos+i = String.length s
 206               then -1
 207               else (
 208                 if not (iswhite s.[pos+i])
 209                 then parse_error "garbage at the end" s pos
 210                 else trailsbywhite (i+1)
 211               )
 212             in
 213             trailsbywhite 0
 214         in
 215         if ltpos = -1
 216         then
 217           v.f v Vend pos slen, slen
 218         else
 219           let start_of_text_pos = find_non_white pos in
 220           let end_of_text_pos =
 221             if start_of_text_pos < ltpos
 222             then
 223               let rec find i =
 224                 if i = start_of_text_pos || not (iswhite s.[i])
 225                 then i+1
 226                 else find (i-1)
 227               in
 228               find (ltpos-1)
 229             else start_of_text_pos
 230           in
 231           let v =
 232             if start_of_text_pos != end_of_text_pos
 233             then v.f v Vdata start_of_text_pos end_of_text_pos
 234             else v
 235           in
 236           collect v (ltpos+1) `lt
 237
 238     | `lt ->
 239         let pos, t =
 240           match s.[pos] with
 241           | '/' -> (pos+1), `close
 242           | '!' -> (pos+1), `exclam
 243           | '?' -> (pos+1), `question
 244           | c when isname c -> pos, `tag
 245           | _ -> parse_error "invalid data after <" s pos
 246         in
 247         collect v pos t
 248
 249     | `close ->
 250         let tag_name_pos = find_non_white pos in
 251         let tag_name_end_pos, close_tag_name = getname tag_name_pos in
 252         let close_tag_pos = find_non_white tag_name_end_pos in
 253         if s.[close_tag_pos] != '>'
 254         then parse_error "missing >" s pos;
 255         let pos' = close_tag_pos + 1 in
 256         let v = v.f v (Vclose close_tag_name) pos pos' in
 257         collect v pos' `text
 258
 259     | `doctype ->
 260         let close_tag_pos =
 261           try
 262             String.index_from s pos '>'
 263           with Not_found ->
 264             parse_error "doctype is not terminated" s pos
 265         in
 266         collect v (close_tag_pos+1) `text
 267
 268     | `comment ->
 269         let pos =
 270           try
 271             find_substr pos "-->" r_comment_terminator
 272           with Not_found ->
 273             parse_error "comment is not terminated" s pos
 274         in
 275         collect v (pos+3) `text
 276
 277     | `exclam ->
 278         if begins_with pos "[CDATA["
 279         then
 280           let cdata_start = pos+7 in
 281           let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
 282           let v = v.f v Vcdata cdata_start cdata_end in
 283           collect v (cdata_end+3) `text
 284         else (
 285           if begins_with pos "DOCTYPE"
 286           then
 287             collect v (pos+7) `doctype
 288           else (
 289             if begins_with pos "--"
 290             then collect v (pos+2) `comment
 291             else parse_error "unknown shit after exclamation mark" s pos
 292           )
 293         )
 294
 295     | `question ->
 296         let pos = find_substr pos "?>" r_q_terminator in
 297         collect v (pos+2) `text
 298
 299     | `tag ->
 300         let pos', name = getname pos in
 301         let attrs, pos', closed = collect_attributes pos' in
 302         let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
 303         collect v pos' `text
 304
 305   and collect_attributes pos =
 306     let rec f accu pos =
 307       let nameval pos =
 308         let pos, name = getname pos in
 309         let pos = find_non_white pos in
 310         if s.[pos] = '='
 311         then
 312           let qpos = pos+1 in
 313           if qpos = slen
 314           then parse_error "not enough data for attribute" s pos;
 315
 316           let qc = s.[qpos] in
 317           if not (qc = '\'' || qc = '"')
 318           then parse_error "assignment is not followed by a quote" s pos;
 319
 320           let closing_q_pos =
 321             let rec find i =
 322               if i = slen
 323               then parse_error "not enough data for attribute value" s pos;
 324
 325               if s.[i] = qc then i else find (i+1)
 326             in
 327             find (qpos+1)
 328           in
 329
 330           let vallen = closing_q_pos - (qpos+1) in
 331           let val' = String.sub s (qpos+1) vallen in
 332           (name, val'), closing_q_pos+1
 333
 334         else parse_error "attribute name not followed by '='" s pos
 335       in
 336
 337       let pos = find_non_white pos in
 338       if s.[pos] = '>'
 339       then
 340         accu, pos+1, false
 341       else (
 342         if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
 343         then
 344           accu, pos+2, true
 345         else (
 346           if isname s.[pos]
 347           then (
 348             let nameval, pos = nameval pos in
 349             let accu = nameval :: accu in
 350             f accu pos
 351           )
 352           else parse_error "malformed attribute list" s pos;
 353         )
 354       )
 355     in
 356     f [] pos
 357   in
 358   let _, _ = collect v 0 `text in
 359   v.accu;
 360 ;;