parser.ml

   1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
   2
   3 let r_comment_terminator = Str.regexp "-->";;
   4 let r_CDATA_terminator = Str.regexp "\\]\\]>";;
   5 let r_q_terminator = Str.regexp "\\?>";;
   6
   7 let iswhite = function
   8   | '\r' | '\n' | '\t' | ' ' -> true
   9   | _ -> false
  10 ;;
  11
  12 let isname = function
  13   | '.' | '-' | '_' | ':' -> true
  14   | c -> (c >= '0' && c <= '9')
  15          || (c >= 'a' && c <= 'z')
  16          || (c >= 'A' && c <= 'Z')
  17 ;;
  18
  19 exception Parse_error of string * string * int;;
  20
  21 let parse_error msg s pos =
  22   raise (Parse_error (msg, s, pos))
  23 ;;
  24
  25 let enent s pos len =
  26   let b = Buffer.create len in
  27   let rec loop i =
  28     if i - pos = len
  29     then Buffer.contents b
  30     else (
  31       begin match s.[i] with
  32       | '<' -> Buffer.add_string b "&lt;"
  33       | '>' -> Buffer.add_string b "&gt;"
  34       | '\'' -> Buffer.add_string b "&apos;"
  35       | '\"' -> Buffer.add_string b "&quot;"
  36       | '&' -> Buffer.add_string b "&amp;"
  37       | c ->
  38          let code = Char.code c in
  39          if code < 32 || code > 127
  40          then (
  41            Buffer.add_string b "&#";
  42            Buffer.add_string b (string_of_int code);
  43            Buffer.add_char b ';';
  44          )
  45          else Buffer.add_char b c
  46       end;
  47       loop (i+1)
  48     )
  49   in
  50   loop pos
  51 ;;
  52
  53 let unent b s pos len =
  54   let rec loop i =
  55     if i = pos + len
  56     then ()
  57     else
  58       let amppos =
  59         try
  60           String.index_from s i '&'
  61         with Not_found -> -1
  62       in
  63       if amppos = -1 || amppos >= pos + len
  64       then (
  65         Buffer.add_substring b s i (pos + len - i)
  66       )
  67       else (
  68         Buffer.add_substring b s i (amppos - i);
  69         if amppos = i + len then failwith "lonely amp";
  70
  71         let semipos =
  72           try
  73             let semipos = String.index_from s (amppos+1) ';' in
  74             if semipos >= pos + len then raise Not_found;
  75             semipos
  76           with Not_found -> failwith "amp not followed by semicolon"
  77         in
  78
  79         let subslen = semipos-amppos-1 in
  80         if subslen = 0 then failwith "empty amp";
  81
  82         let subs = String.sub s (amppos+1) subslen in
  83
  84         if subs.[0] = '#'
  85         then (
  86           if subslen = 1 then failwith "empty amp followed by hash";
  87           let code =
  88             if subs.[1] = 'x'
  89             then (
  90               Scanf.sscanf subs "#x%x" (fun n -> n)
  91             )
  92             else (
  93               int_of_string (String.sub subs 1 (subslen-1))
  94             )
  95           in
  96           let c = Char.unsafe_chr code in
  97           Buffer.add_char b c
  98         )
  99         else (
 100           match subs with
 101           | "lt" -> Buffer.add_char b '<'
 102           | "gt" -> Buffer.add_char b '>'
 103           | "amp" -> Buffer.add_char b '&'
 104           | "apos" -> Buffer.add_char b '\''
 105           | "quot" -> Buffer.add_char b '\"'
 106           | _ -> failwith ("unknown amp " ^ String.escaped subs)
 107         );
 108         loop (semipos+1)
 109       )
 110   in
 111   loop pos
 112 ;;
 113
 114 let subs s pos =
 115   let len = String.length s in
 116   let left = len - pos in
 117   if left < 0
 118   then
 119     Printf.sprintf "(pos=%d len=%d left=%d)"
 120                    pos len left
 121   else
 122     let len = min left 10 in
 123     let s = String.sub s pos len in
 124     s;
 125 ;;
 126
 127 let ts = function
 128   | `text -> "text"
 129   | `lt -> "lt"
 130   | `close -> "close"
 131   | `exclam -> "exclam"
 132   | `question -> "question"
 133   | `doctype -> "doctype"
 134   | `comment -> "comment"
 135   | `tag -> "tag"
 136 ;;
 137
 138 type attr = string * string
 139  and attrs = attr list
 140  and vp =
 141    | Vdata
 142    | Vcdata
 143    | Vopen of string * attrs * bool
 144    | Vclose of string
 145    | Vend
 146  and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
 147 ;;
 148
 149 let parse v s =
 150   let slen = String.length s in
 151
 152   let find_substr pos subs r =
 153     let pos =
 154       try
 155         Str.search_forward r s pos
 156       with Not_found ->
 157         parse_error ("cannot find substring " ^ subs) s pos
 158     in
 159     pos
 160   in
 161   let begins_with pos prefix = Utils.substratis s pos prefix in
 162   let find_non_white pos =
 163     let rec forward i =
 164       if i >= slen
 165       then parse_error "cannot find non white space character" s pos;
 166       if iswhite s.[i] then forward (i+1) else i in
 167     forward pos
 168   in
 169
 170   let getname pos =
 171     let non_name_pos =
 172       let rec find_non_name i =
 173         if i >= slen then parse_error "cannot find non name character" s pos;
 174         if isname s.[i] then find_non_name (i+1) else i
 175       in
 176       find_non_name pos
 177     in
 178     non_name_pos, String.sub s pos (non_name_pos - pos)
 179   in
 180
 181   let rec collect v pos t =
 182     if pos >= slen && t != `text
 183     then parse_error ("not enough data for " ^ ts t) s pos;
 184
 185     match t with
 186     | `text ->
 187        let ltpos =
 188          try
 189            String.index_from s pos '<'
 190          with Not_found ->
 191            let rec trailsbywhite i =
 192              if pos+i = String.length s
 193              then -1
 194              else (
 195                if not (iswhite s.[pos+i])
 196                then parse_error "garbage at the end" s pos
 197                else trailsbywhite (i+1)
 198              )
 199            in
 200            trailsbywhite 0
 201        in
 202        if ltpos = -1
 203        then
 204          v.f v Vend pos slen, slen
 205        else
 206          let start_of_text_pos = find_non_white pos in
 207          let end_of_text_pos =
 208            if start_of_text_pos < ltpos
 209            then
 210              let rec find i =
 211                if i = start_of_text_pos || not (iswhite s.[i])
 212                then i+1
 213                else find (i-1)
 214              in
 215              find (ltpos-1)
 216            else start_of_text_pos
 217          in
 218          let v =
 219            if start_of_text_pos != end_of_text_pos
 220            then v.f v Vdata start_of_text_pos end_of_text_pos
 221            else v
 222          in
 223          collect v (ltpos+1) `lt
 224
 225     | `lt ->
 226        let pos, t =
 227          match s.[pos] with
 228          | '/' -> (pos+1), `close
 229          | '!' -> (pos+1), `exclam
 230          | '?' -> (pos+1), `question
 231          | c when isname c -> pos, `tag
 232          | _ -> parse_error "invalid data after <" s pos
 233        in
 234        collect v pos t
 235
 236     | `close ->
 237        let tag_name_pos = find_non_white pos in
 238        let tag_name_end_pos, close_tag_name = getname tag_name_pos in
 239        let close_tag_pos = find_non_white tag_name_end_pos in
 240        if s.[close_tag_pos] != '>'
 241        then parse_error "missing >" s pos;
 242        let pos' = close_tag_pos + 1 in
 243        let v = v.f v (Vclose close_tag_name) pos pos' in
 244        collect v pos' `text
 245
 246     | `doctype ->
 247        let close_tag_pos =
 248          try
 249            String.index_from s pos '>'
 250          with Not_found ->
 251            parse_error "doctype is not terminated" s pos
 252        in
 253        collect v (close_tag_pos+1) `text
 254
 255     | `comment ->
 256        let pos =
 257          try
 258            find_substr pos "-->" r_comment_terminator
 259          with Not_found ->
 260            parse_error "comment is not terminated" s pos
 261        in
 262        collect v (pos+3) `text
 263
 264     | `exclam ->
 265        if begins_with pos "[CDATA["
 266        then
 267          let cdata_start = pos+7 in
 268          let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
 269          let v = v.f v Vcdata cdata_start cdata_end in
 270          collect v (cdata_end+3) `text
 271        else (
 272          if begins_with pos "DOCTYPE"
 273          then
 274            collect v (pos+7) `doctype
 275          else (
 276            if begins_with pos "--"
 277            then collect v (pos+2) `comment
 278            else parse_error "unknown shit after exclamation mark" s pos
 279          )
 280        )
 281
 282     | `question ->
 283        let pos = find_substr pos "?>" r_q_terminator in
 284        collect v (pos+2) `text
 285
 286     | `tag ->
 287        let pos', name = getname pos in
 288        let attrs, pos', closed = collect_attributes pos' in
 289        let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
 290        collect v pos' `text
 291
 292   and collect_attributes pos =
 293     let rec f accu pos =
 294       let nameval pos =
 295         let pos, name = getname pos in
 296         let pos = find_non_white pos in
 297         if s.[pos] = '='
 298         then
 299           let qpos = pos+1 in
 300           if qpos = slen
 301           then parse_error "not enough data for attribute" s pos;
 302
 303           let qc = s.[qpos] in
 304           if not (qc = '\'' || qc = '\"')
 305           then parse_error "assignment is not followed by a quote" s pos;
 306
 307           let closing_q_pos =
 308             let rec find i =
 309               if i = slen
 310               then parse_error "not enough data for attribute value" s pos;
 311
 312               if s.[i] = qc then i else find (i+1)
 313             in
 314             find (qpos+1)
 315           in
 316
 317           let vallen = closing_q_pos - (qpos+1) in
 318           let val' = String.sub s (qpos+1) vallen in
 319           (name, val'), closing_q_pos+1
 320
 321           else parse_error "attribute name not followed by '='" s pos
 322       in
 323
 324       let pos = find_non_white pos in
 325       if s.[pos] = '>'
 326       then
 327         accu, pos+1, false
 328       else (
 329         if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
 330         then
 331           accu, pos+2, true
 332         else (
 333           if isname s.[pos]
 334           then (
 335             let nameval, pos = nameval pos in
 336             let accu = nameval :: accu in
 337             f accu pos
 338           )
 339           else parse_error "malformed attribute list" s pos;
 340         )
 341       )
 342     in
 343     f [] pos
 344   in
 345   let _, _ = collect v 0 `text in
 346   v.accu;
 347 ;;