parser.ml

   1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
   2
   3 let iswhite = function
   4   | '\r' | '\n' | '\t' | ' ' -> true
   5   | _ -> false
   6 ;;
   7
   8 let isname = function
   9   | '.' | '-' | '_' | ':' -> true
  10   | c -> (c >= '0' && c <= '9')
  11          || (c >= 'a' && c <= 'z')
  12          || (c >= 'A' && c <= 'Z')
  13 ;;
  14
  15 exception Parse_error of string * string * int;;
  16
  17 let parse_error msg s pos =
  18   raise (Parse_error (msg, s, pos))
  19 ;;
  20
  21 let enent s pos len =
  22   let b = Buffer.create len in
  23   let rec loop i =
  24     if i - pos = len
  25     then Buffer.contents b
  26     else (
  27       begin match s.[i] with
  28       | '<' -> Buffer.add_string b "&lt;"
  29       | '>' -> Buffer.add_string b "&gt;"
  30       | '\'' -> Buffer.add_string b "&apos;"
  31       | '\"' -> Buffer.add_string b "&quot;"
  32       | '&' -> Buffer.add_string b "&amp;"
  33       | c ->
  34          let code = Char.code c in
  35          if code < 32 || code > 127
  36          then (
  37            Buffer.add_string b "&#";
  38            Buffer.add_string b (string_of_int code);
  39            Buffer.add_char b ';';
  40          )
  41          else Buffer.add_char b c
  42       end;
  43       loop (i+1)
  44     )
  45   in
  46   loop pos
  47 ;;
  48
  49 let unent b s pos len =
  50   let rec loop i =
  51     if i != pos + len
  52     then
  53       let amppos =
  54         try
  55           String.index_from s i '&'
  56         with Not_found -> -1
  57       in
  58       if amppos = -1 || amppos >= pos + len
  59       then Buffer.add_substring b s i (pos + len - i)
  60       else (
  61         Buffer.add_substring b s i (amppos - i);
  62         if amppos = i + len then Utils.error "lonely amp";
  63
  64         let semipos =
  65           try
  66             let semipos = String.index_from s (amppos+1) ';' in
  67             if semipos >= pos + len then raise Not_found;
  68             semipos
  69           with Not_found ->
  70             Utils.error "amp not followed by semicolon at %d" amppos
  71         in
  72
  73         let subslen = semipos-amppos-1 in
  74         if subslen = 0 then Utils.error "empty amp at %d" amppos;
  75
  76         let subs = String.sub s (amppos+1) subslen in
  77
  78         if subs.[0] = '#'
  79         then (
  80           if subslen = 1
  81           then Utils.error "empty amp followed by hash at %d" amppos;
  82           let code =
  83             if subs.[1] = 'x'
  84             then Scanf.sscanf subs "#x%x" (fun n -> n)
  85             else int_of_string (String.sub subs 1 (subslen-1))
  86           in
  87           let c = Char.unsafe_chr code in
  88           Buffer.add_char b c
  89         )
  90         else (
  91           match subs with
  92           | "lt" -> Buffer.add_char b '<'
  93           | "gt" -> Buffer.add_char b '>'
  94           | "amp" -> Buffer.add_char b '&'
  95           | "apos" -> Buffer.add_char b '\''
  96           | "quot" -> Buffer.add_char b '\"'
  97           | _ -> Utils.error "unknown amp %S" subs
  98         );
  99         loop (semipos+1)
 100       )
 101   in
 102   loop pos
 103 ;;
 104
 105 let subs s pos =
 106   let len = String.length s in
 107   let left = len - pos in
 108   if left < 0
 109   then Printf.sprintf "(pos=%d len=%d left=%d)" pos len left
 110   else
 111     let len = min left 10 in
 112     String.sub s pos len
 113 ;;
 114
 115 let ts = function
 116   | `text -> "text"
 117   | `lt -> "lt"
 118   | `close -> "close"
 119   | `exclam -> "exclam"
 120   | `question -> "question"
 121   | `doctype -> "doctype"
 122   | `comment -> "comment"
 123   | `tag -> "tag"
 124 ;;
 125
 126 type attr = string * string
 127  and attrs = attr list
 128  and vp =
 129    | Vdata
 130    | Vcdata
 131    | Vopen of string * attrs * bool
 132    | Vclose of string
 133    | Vend
 134  and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
 135 ;;
 136
 137 let parse v s =
 138   let r_comment_terminator = Str.regexp "-->"
 139   and r_CDATA_terminator = Str.regexp "\\]\\]>"
 140   and r_q_terminator = Str.regexp "\\?>" in
 141
 142   let slen = String.length s in
 143
 144   let find_substr pos subs r =
 145     let pos =
 146       try
 147         Str.search_forward r s pos
 148       with Not_found ->
 149         parse_error ("cannot find substring " ^ subs) s pos
 150     in
 151     pos
 152   in
 153   let begins_with pos prefix = Utils.substratis s pos prefix in
 154   let find_non_white pos =
 155     let rec forward i =
 156       if i >= slen
 157       then parse_error "cannot find non white space character" s pos;
 158       if iswhite s.[i] then forward (i+1) else i in
 159     forward pos
 160   in
 161
 162   let getname pos =
 163     let non_name_pos =
 164       let rec find_non_name i =
 165         if i >= slen then parse_error "cannot find non name character" s pos;
 166         if isname s.[i] then find_non_name (i+1) else i
 167       in
 168       find_non_name pos
 169     in
 170     non_name_pos, String.sub s pos (non_name_pos - pos)
 171   in
 172
 173   let rec collect v pos t =
 174     if pos >= slen && t != `text
 175     then parse_error ("not enough data for " ^ ts t) s pos;
 176
 177     match t with
 178     | `text ->
 179        let ltpos =
 180          try
 181            String.index_from s pos '<'
 182          with Not_found ->
 183            let rec trailsbywhite i =
 184              if pos+i = String.length s
 185              then -1
 186              else (
 187                if not (iswhite s.[pos+i])
 188                then parse_error "garbage at the end" s pos
 189                else trailsbywhite (i+1)
 190              )
 191            in
 192            trailsbywhite 0
 193        in
 194        if ltpos = -1
 195        then
 196          v.f v Vend pos slen, slen
 197        else
 198          let start_of_text_pos = find_non_white pos in
 199          let end_of_text_pos =
 200            if start_of_text_pos < ltpos
 201            then
 202              let rec find i =
 203                if i = start_of_text_pos || not (iswhite s.[i])
 204                then i+1
 205                else find (i-1)
 206              in
 207              find (ltpos-1)
 208            else start_of_text_pos
 209          in
 210          let v =
 211            if start_of_text_pos != end_of_text_pos
 212            then v.f v Vdata start_of_text_pos end_of_text_pos
 213            else v
 214          in
 215          collect v (ltpos+1) `lt
 216
 217     | `lt ->
 218        let pos, t =
 219          match s.[pos] with
 220          | '/' -> (pos+1), `close
 221          | '!' -> (pos+1), `exclam
 222          | '?' -> (pos+1), `question
 223          | c when isname c -> pos, `tag
 224          | _ -> parse_error "invalid data after <" s pos
 225        in
 226        collect v pos t
 227
 228     | `close ->
 229        let tag_name_pos = find_non_white pos in
 230        let tag_name_end_pos, close_tag_name = getname tag_name_pos in
 231        let close_tag_pos = find_non_white tag_name_end_pos in
 232        if s.[close_tag_pos] != '>'
 233        then parse_error "missing >" s pos;
 234        let pos' = close_tag_pos + 1 in
 235        let v = v.f v (Vclose close_tag_name) pos pos' in
 236        collect v pos' `text
 237
 238     | `doctype ->
 239        let close_tag_pos =
 240          try
 241            String.index_from s pos '>'
 242          with Not_found ->
 243            parse_error "doctype is not terminated" s pos
 244        in
 245        collect v (close_tag_pos+1) `text
 246
 247     | `comment ->
 248        let pos =
 249          try
 250            find_substr pos "-->" r_comment_terminator
 251          with Not_found ->
 252            parse_error "comment is not terminated" s pos
 253        in
 254        collect v (pos+3) `text
 255
 256     | `exclam ->
 257        if begins_with pos "[CDATA["
 258        then
 259          let cdata_start = pos+7 in
 260          let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
 261          let v = v.f v Vcdata cdata_start cdata_end in
 262          collect v (cdata_end+3) `text
 263        else (
 264          if begins_with pos "DOCTYPE"
 265          then
 266            collect v (pos+7) `doctype
 267          else (
 268            if begins_with pos "--"
 269            then collect v (pos+2) `comment
 270            else parse_error "unknown shit after exclamation mark" s pos
 271          )
 272        )
 273
 274     | `question ->
 275        let pos = find_substr pos "?>" r_q_terminator in
 276        collect v (pos+2) `text
 277
 278     | `tag ->
 279        let pos', name = getname pos in
 280        let attrs, pos', closed = collect_attributes pos' in
 281        let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
 282        collect v pos' `text
 283
 284   and collect_attributes pos =
 285     let rec f accu pos =
 286       let nameval pos =
 287         let pos, name = getname pos in
 288         let pos = find_non_white pos in
 289         if s.[pos] = '='
 290         then
 291           let qpos = pos+1 in
 292           if qpos = slen
 293           then parse_error "not enough data for attribute" s pos;
 294
 295           let qc = s.[qpos] in
 296           if not (qc = '\'' || qc = '\"')
 297           then parse_error "assignment is not followed by a quote" s pos;
 298
 299           let closing_q_pos =
 300             let rec find i =
 301               if i = slen
 302               then parse_error "not enough data for attribute value" s pos;
 303
 304               if s.[i] = qc then i else find (i+1)
 305             in
 306             find (qpos+1)
 307           in
 308
 309           let vallen = closing_q_pos - (qpos+1) in
 310           let val' = String.sub s (qpos+1) vallen in
 311           (name, val'), closing_q_pos+1
 312
 313           else parse_error "attribute name not followed by '='" s pos
 314       in
 315
 316       let pos = find_non_white pos in
 317       if s.[pos] = '>'
 318       then
 319         accu, pos+1, false
 320       else (
 321         if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
 322         then
 323           accu, pos+2, true
 324         else (
 325           if isname s.[pos]
 326           then (
 327             let nameval, pos = nameval pos in
 328             let accu = nameval :: accu in
 329             f accu pos
 330           )
 331           else parse_error "malformed attribute list" s pos;
 332         )
 333       )
 334     in
 335     f [] pos
 336   in
 337   let _, _ = collect v 0 `text in
 338   v.accu;
 339 ;;