parser.ml

   1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
   2
   3 let iswhite = function
   4   | '\r' | '\n' | '\t' | ' ' -> true
   5   | _ -> false
   6 ;;
   7
   8 let isname = function
   9   | '.' | '-' | '_' | ':' -> true
  10   | c -> (c >= '0' && c <= '9')
  11          || (c >= 'a' && c <= 'z')
  12          || (c >= 'A' && c <= 'Z')
  13 ;;
  14
  15 exception Parse_error of string * string * int;;
  16
  17 let parse_error msg s pos = raise (Parse_error (msg, s, pos));;
  18
  19 let enent s pos len =
  20   let b = Buffer.create len in
  21   let rec loop i =
  22     if i - pos = len
  23     then Buffer.contents b
  24     else (
  25       begin match s.[i] with
  26       | '<' -> Buffer.add_string b "&lt;"
  27       | '>' -> Buffer.add_string b "&gt;"
  28       | '\'' -> Buffer.add_string b "&apos;"
  29       | '\"' -> Buffer.add_string b "&quot;"
  30       | '&' -> Buffer.add_string b "&amp;"
  31       | c ->
  32          let code = Char.code c in
  33          if code < 32 || code > 127
  34          then (
  35            Buffer.add_string b "&#";
  36            Buffer.add_string b (string_of_int code);
  37            Buffer.add_char b ';';
  38          )
  39          else Buffer.add_char b c
  40       end;
  41       loop (i+1)
  42     )
  43   in
  44   loop pos
  45 ;;
  46
  47 let unent b s pos len =
  48   let rec loop i =
  49     if i != pos + len
  50     then
  51       let amppos =
  52         try
  53           String.index_from s i '&'
  54         with Not_found -> -1
  55       in
  56       if amppos = -1 || amppos >= pos + len
  57       then Buffer.add_substring b s i (pos + len - i)
  58       else (
  59         Buffer.add_substring b s i (amppos - i);
  60         if amppos = i + len then Utils.error "lonely amp";
  61
  62         let semipos =
  63           try
  64             let semipos = String.index_from s (amppos+1) ';' in
  65             if semipos >= pos + len then raise Not_found;
  66             semipos
  67           with Not_found ->
  68             Utils.error "amp not followed by semicolon at %d" amppos
  69         in
  70
  71         let subslen = semipos-amppos-1 in
  72         if subslen = 0 then Utils.error "empty amp at %d" amppos;
  73
  74         let subs = String.sub s (amppos+1) subslen in
  75
  76         if subs.[0] = '#'
  77         then (
  78           if subslen = 1
  79           then Utils.error "empty amp followed by hash at %d" amppos;
  80           let code =
  81             if subs.[1] = 'x'
  82             then Scanf.sscanf subs "#x%x" (fun n -> n)
  83             else int_of_string (String.sub subs 1 (subslen-1))
  84           in
  85           let c = Char.unsafe_chr code in
  86           Buffer.add_char b c
  87         )
  88         else (
  89           match subs with
  90           | "lt" -> Buffer.add_char b '<'
  91           | "gt" -> Buffer.add_char b '>'
  92           | "amp" -> Buffer.add_char b '&'
  93           | "apos" -> Buffer.add_char b '\''
  94           | "quot" -> Buffer.add_char b '\"'
  95           | _ -> Utils.error "unknown amp %S" subs
  96         );
  97         loop (semipos+1)
  98       )
  99   in
 100   loop pos
 101 ;;
 102
 103 let subs s pos =
 104   let len = String.length s in
 105   let left = len - pos in
 106   if left < 0
 107   then Printf.sprintf "(pos=%d len=%d left=%d)" pos len left
 108   else
 109     let len = min left 10 in
 110     String.sub s pos len
 111 ;;
 112
 113 let ts = function
 114   | `text -> "text"
 115   | `lt -> "lt"
 116   | `close -> "close"
 117   | `exclam -> "exclam"
 118   | `question -> "question"
 119   | `doctype -> "doctype"
 120   | `comment -> "comment"
 121   | `tag -> "tag"
 122 ;;
 123
 124 type attr = string * string
 125  and attrs = attr list
 126  and vp =
 127    | Vdata
 128    | Vcdata
 129    | Vopen of string * attrs * bool
 130    | Vclose of string
 131    | Vend
 132  and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
 133 ;;
 134
 135 let parse v s =
 136   let r_comment_terminator = Str.regexp "-->"
 137   and r_CDATA_terminator = Str.regexp "\\]\\]>"
 138   and r_q_terminator = Str.regexp "\\?>" in
 139
 140   let slen = String.length s in
 141
 142   let find_substr pos subs r =
 143     let pos =
 144       try
 145         Str.search_forward r s pos
 146       with Not_found ->
 147         parse_error ("cannot find substring " ^ subs) s pos
 148     in
 149     pos
 150   in
 151   let begins_with pos prefix = Utils.substratis s pos prefix in
 152   let find_non_white pos =
 153     let rec forward i =
 154       if i >= slen
 155       then parse_error "cannot find non white space character" s pos;
 156       if iswhite s.[i] then forward (i+1) else i in
 157     forward pos
 158   in
 159
 160   let getname pos =
 161     let non_name_pos =
 162       let rec find_non_name i =
 163         if i >= slen then parse_error "cannot find non name character" s pos;
 164         if isname s.[i] then find_non_name (i+1) else i
 165       in
 166       find_non_name pos
 167     in
 168     non_name_pos, String.sub s pos (non_name_pos - pos)
 169   in
 170
 171   let rec collect v pos t =
 172     if pos >= slen && t != `text
 173     then parse_error ("not enough data for " ^ ts t) s pos;
 174
 175     match t with
 176     | `text ->
 177        let ltpos =
 178          try
 179            String.index_from s pos '<'
 180          with Not_found ->
 181            let rec trailsbywhite i =
 182              if pos+i = String.length s
 183              then -1
 184              else (
 185                if not (iswhite s.[pos+i])
 186                then parse_error "garbage at the end" s pos
 187                else trailsbywhite (i+1)
 188              )
 189            in
 190            trailsbywhite 0
 191        in
 192        if ltpos = -1
 193        then v.f v Vend pos slen, slen
 194        else
 195          let start_of_text_pos = find_non_white pos in
 196          let end_of_text_pos =
 197            if start_of_text_pos < ltpos
 198            then
 199              let rec find i =
 200                if i = start_of_text_pos || not (iswhite s.[i])
 201                then i+1
 202                else find (i-1)
 203              in
 204              find (ltpos-1)
 205            else start_of_text_pos
 206          in
 207          let v =
 208            if start_of_text_pos != end_of_text_pos
 209            then v.f v Vdata start_of_text_pos end_of_text_pos
 210            else v
 211          in
 212          collect v (ltpos+1) `lt
 213
 214     | `lt ->
 215        let pos, t =
 216          match s.[pos] with
 217          | '/' -> (pos+1), `close
 218          | '!' -> (pos+1), `exclam
 219          | '?' -> (pos+1), `question
 220          | c when isname c -> pos, `tag
 221          | _ -> parse_error "invalid data after <" s pos
 222        in
 223        collect v pos t
 224
 225     | `close ->
 226        let tag_name_pos = find_non_white pos in
 227        let tag_name_end_pos, close_tag_name = getname tag_name_pos in
 228        let close_tag_pos = find_non_white tag_name_end_pos in
 229        if s.[close_tag_pos] != '>'
 230        then parse_error "missing >" s pos;
 231        let pos' = close_tag_pos + 1 in
 232        let v = v.f v (Vclose close_tag_name) pos pos' in
 233        collect v pos' `text
 234
 235     | `doctype ->
 236        let close_tag_pos =
 237          try
 238            String.index_from s pos '>'
 239          with Not_found ->
 240            parse_error "doctype is not terminated" s pos
 241        in
 242        collect v (close_tag_pos+1) `text
 243
 244     | `comment ->
 245        let pos =
 246          try
 247            find_substr pos "-->" r_comment_terminator
 248          with Not_found ->
 249            parse_error "comment is not terminated" s pos
 250        in
 251        collect v (pos+3) `text
 252
 253     | `exclam ->
 254        if begins_with pos "[CDATA["
 255        then
 256          let cdata_start = pos+7 in
 257          let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
 258          let v = v.f v Vcdata cdata_start cdata_end in
 259          collect v (cdata_end+3) `text
 260        else (
 261          if begins_with pos "DOCTYPE"
 262          then
 263            collect v (pos+7) `doctype
 264          else (
 265            if begins_with pos "--"
 266            then collect v (pos+2) `comment
 267            else parse_error "unknown shit after exclamation mark" s pos
 268          )
 269        )
 270
 271     | `question ->
 272        let pos = find_substr pos "?>" r_q_terminator in
 273        collect v (pos+2) `text
 274
 275     | `tag ->
 276        let pos', name = getname pos in
 277        let attrs, pos', closed = collect_attributes pos' in
 278        let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
 279        collect v pos' `text
 280
 281   and collect_attributes pos =
 282     let rec f accu pos =
 283       let nameval pos =
 284         let pos, name = getname pos in
 285         let pos = find_non_white pos in
 286         if s.[pos] = '='
 287         then
 288           let qpos = pos+1 in
 289           if qpos = slen
 290           then parse_error "not enough data for attribute" s pos;
 291
 292           let qc = s.[qpos] in
 293           if not (qc = '\'' || qc = '\"')
 294           then parse_error "assignment is not followed by a quote" s pos;
 295
 296           let closing_q_pos =
 297             let rec find i =
 298               if i = slen
 299               then parse_error "not enough data for attribute value" s pos;
 300
 301               if s.[i] = qc then i else find (i+1)
 302             in
 303             find (qpos+1)
 304           in
 305
 306           let vallen = closing_q_pos - (qpos+1) in
 307           let val' = String.sub s (qpos+1) vallen in
 308           (name, val'), closing_q_pos+1
 309
 310           else parse_error "attribute name not followed by '='" s pos
 311       in
 312
 313       let pos = find_non_white pos in
 314       if s.[pos] = '>'
 315       then accu, pos+1, false
 316       else (
 317         if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
 318         then accu, pos+2, true
 319         else (
 320           if isname s.[pos]
 321           then (
 322             let nameval, pos = nameval pos in
 323             let accu = nameval :: accu in
 324             f accu pos
 325           )
 326           else parse_error "malformed attribute list" s pos;
 327         )
 328       )
 329     in
 330     f [] pos
 331   in
 332   let _, _ = collect v 0 `text in
 333   v.accu;
 334 ;;