parser.ml

   1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
   2
   3 let r_comment_terminator = Str.regexp "-->";;
   4 let r_CDATA_terminator = Str.regexp "\\]\\]>";;
   5 let r_q_terminator = Str.regexp "\\?>";;
   6
   7 let iswhite = function
   8   | '\r' | '\n' | '\t' | ' ' -> true
   9   | _ -> false
  10 ;;
  11
  12 let isname = function
  13   | '.' | '-' | '_' | ':' -> true
  14   | c -> (c >= '0' && c <= '9')
  15          || (c >= 'a' && c <= 'z')
  16          || (c >= 'A' && c <= 'Z')
  17 ;;
  18
  19 exception Parse_error of string * string * int;;
  20
  21 let parse_error msg s pos =
  22   raise (Parse_error (msg, s, pos))
  23 ;;
  24
  25 let enent s pos len =
  26   let b = Buffer.create len in
  27   let rec loop i =
  28     if i - pos = len
  29     then Buffer.contents b
  30     else (
  31       begin match s.[i] with
  32       | '<' -> Buffer.add_string b "&lt;"
  33       | '>' -> Buffer.add_string b "&gt;"
  34       | '\'' -> Buffer.add_string b "&apos;"
  35       | '\"' -> Buffer.add_string b "&quot;"
  36       | '&' -> Buffer.add_string b "&amp;"
  37       | c ->
  38          let code = Char.code c in
  39          if code < 32 || code > 127
  40          then (
  41            Buffer.add_string b "&#";
  42            Buffer.add_string b (string_of_int code);
  43            Buffer.add_char b ';';
  44          )
  45          else Buffer.add_char b c
  46       end;
  47       loop (i+1)
  48     )
  49   in
  50   loop pos
  51 ;;
  52
  53 let unent b s pos len =
  54   let rec loop i =
  55     if i != pos + len
  56     then
  57       let amppos =
  58         try
  59           String.index_from s i '&'
  60         with Not_found -> -1
  61       in
  62       if amppos = -1 || amppos >= pos + len
  63       then (
  64         Buffer.add_substring b s i (pos + len - i)
  65       )
  66       else (
  67         Buffer.add_substring b s i (amppos - i);
  68         if amppos = i + len then Utils.error "lonely amp";
  69
  70         let semipos =
  71           try
  72             let semipos = String.index_from s (amppos+1) ';' in
  73             if semipos >= pos + len then raise Not_found;
  74             semipos
  75           with Not_found ->
  76             Utils.error "amp not followed by semicolon at %d" amppos
  77         in
  78
  79         let subslen = semipos-amppos-1 in
  80         if subslen = 0 then Utils.error "empty amp at %d" amppos;
  81
  82         let subs = String.sub s (amppos+1) subslen in
  83
  84         if subs.[0] = '#'
  85         then (
  86           if subslen = 1
  87           then Utils.error "empty amp followed by hash at %d" amppos;
  88           let code =
  89             if subs.[1] = 'x'
  90             then Scanf.sscanf subs "#x%x" (fun n -> n)
  91             else int_of_string (String.sub subs 1 (subslen-1))
  92           in
  93           let c = Char.unsafe_chr code in
  94           Buffer.add_char b c
  95         )
  96         else (
  97           match subs with
  98           | "lt" -> Buffer.add_char b '<'
  99           | "gt" -> Buffer.add_char b '>'
 100           | "amp" -> Buffer.add_char b '&'
 101           | "apos" -> Buffer.add_char b '\''
 102           | "quot" -> Buffer.add_char b '\"'
 103           | _ -> Utils.error "unknown amp %S" subs
 104         );
 105         loop (semipos+1)
 106       )
 107   in
 108   loop pos
 109 ;;
 110
 111 let subs s pos =
 112   let len = String.length s in
 113   let left = len - pos in
 114   if left < 0
 115   then
 116     Printf.sprintf "(pos=%d len=%d left=%d)"
 117                    pos len left
 118   else
 119     let len = min left 10 in
 120     let s = String.sub s pos len in
 121     s;
 122 ;;
 123
 124 let ts = function
 125   | `text -> "text"
 126   | `lt -> "lt"
 127   | `close -> "close"
 128   | `exclam -> "exclam"
 129   | `question -> "question"
 130   | `doctype -> "doctype"
 131   | `comment -> "comment"
 132   | `tag -> "tag"
 133 ;;
 134
 135 type attr = string * string
 136  and attrs = attr list
 137  and vp =
 138    | Vdata
 139    | Vcdata
 140    | Vopen of string * attrs * bool
 141    | Vclose of string
 142    | Vend
 143  and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
 144 ;;
 145
 146 let parse v s =
 147   let slen = String.length s in
 148
 149   let find_substr pos subs r =
 150     let pos =
 151       try
 152         Str.search_forward r s pos
 153       with Not_found ->
 154         parse_error ("cannot find substring " ^ subs) s pos
 155     in
 156     pos
 157   in
 158   let begins_with pos prefix = Utils.substratis s pos prefix in
 159   let find_non_white pos =
 160     let rec forward i =
 161       if i >= slen
 162       then parse_error "cannot find non white space character" s pos;
 163       if iswhite s.[i] then forward (i+1) else i in
 164     forward pos
 165   in
 166
 167   let getname pos =
 168     let non_name_pos =
 169       let rec find_non_name i =
 170         if i >= slen then parse_error "cannot find non name character" s pos;
 171         if isname s.[i] then find_non_name (i+1) else i
 172       in
 173       find_non_name pos
 174     in
 175     non_name_pos, String.sub s pos (non_name_pos - pos)
 176   in
 177
 178   let rec collect v pos t =
 179     if pos >= slen && t != `text
 180     then parse_error ("not enough data for " ^ ts t) s pos;
 181
 182     match t with
 183     | `text ->
 184        let ltpos =
 185          try
 186            String.index_from s pos '<'
 187          with Not_found ->
 188            let rec trailsbywhite i =
 189              if pos+i = String.length s
 190              then -1
 191              else (
 192                if not (iswhite s.[pos+i])
 193                then parse_error "garbage at the end" s pos
 194                else trailsbywhite (i+1)
 195              )
 196            in
 197            trailsbywhite 0
 198        in
 199        if ltpos = -1
 200        then
 201          v.f v Vend pos slen, slen
 202        else
 203          let start_of_text_pos = find_non_white pos in
 204          let end_of_text_pos =
 205            if start_of_text_pos < ltpos
 206            then
 207              let rec find i =
 208                if i = start_of_text_pos || not (iswhite s.[i])
 209                then i+1
 210                else find (i-1)
 211              in
 212              find (ltpos-1)
 213            else start_of_text_pos
 214          in
 215          let v =
 216            if start_of_text_pos != end_of_text_pos
 217            then v.f v Vdata start_of_text_pos end_of_text_pos
 218            else v
 219          in
 220          collect v (ltpos+1) `lt
 221
 222     | `lt ->
 223        let pos, t =
 224          match s.[pos] with
 225          | '/' -> (pos+1), `close
 226          | '!' -> (pos+1), `exclam
 227          | '?' -> (pos+1), `question
 228          | c when isname c -> pos, `tag
 229          | _ -> parse_error "invalid data after <" s pos
 230        in
 231        collect v pos t
 232
 233     | `close ->
 234        let tag_name_pos = find_non_white pos in
 235        let tag_name_end_pos, close_tag_name = getname tag_name_pos in
 236        let close_tag_pos = find_non_white tag_name_end_pos in
 237        if s.[close_tag_pos] != '>'
 238        then parse_error "missing >" s pos;
 239        let pos' = close_tag_pos + 1 in
 240        let v = v.f v (Vclose close_tag_name) pos pos' in
 241        collect v pos' `text
 242
 243     | `doctype ->
 244        let close_tag_pos =
 245          try
 246            String.index_from s pos '>'
 247          with Not_found ->
 248            parse_error "doctype is not terminated" s pos
 249        in
 250        collect v (close_tag_pos+1) `text
 251
 252     | `comment ->
 253        let pos =
 254          try
 255            find_substr pos "-->" r_comment_terminator
 256          with Not_found ->
 257            parse_error "comment is not terminated" s pos
 258        in
 259        collect v (pos+3) `text
 260
 261     | `exclam ->
 262        if begins_with pos "[CDATA["
 263        then
 264          let cdata_start = pos+7 in
 265          let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
 266          let v = v.f v Vcdata cdata_start cdata_end in
 267          collect v (cdata_end+3) `text
 268        else (
 269          if begins_with pos "DOCTYPE"
 270          then
 271            collect v (pos+7) `doctype
 272          else (
 273            if begins_with pos "--"
 274            then collect v (pos+2) `comment
 275            else parse_error "unknown shit after exclamation mark" s pos
 276          )
 277        )
 278
 279     | `question ->
 280        let pos = find_substr pos "?>" r_q_terminator in
 281        collect v (pos+2) `text
 282
 283     | `tag ->
 284        let pos', name = getname pos in
 285        let attrs, pos', closed = collect_attributes pos' in
 286        let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
 287        collect v pos' `text
 288
 289   and collect_attributes pos =
 290     let rec f accu pos =
 291       let nameval pos =
 292         let pos, name = getname pos in
 293         let pos = find_non_white pos in
 294         if s.[pos] = '='
 295         then
 296           let qpos = pos+1 in
 297           if qpos = slen
 298           then parse_error "not enough data for attribute" s pos;
 299
 300           let qc = s.[qpos] in
 301           if not (qc = '\'' || qc = '\"')
 302           then parse_error "assignment is not followed by a quote" s pos;
 303
 304           let closing_q_pos =
 305             let rec find i =
 306               if i = slen
 307               then parse_error "not enough data for attribute value" s pos;
 308
 309               if s.[i] = qc then i else find (i+1)
 310             in
 311             find (qpos+1)
 312           in
 313
 314           let vallen = closing_q_pos - (qpos+1) in
 315           let val' = String.sub s (qpos+1) vallen in
 316           (name, val'), closing_q_pos+1
 317
 318           else parse_error "attribute name not followed by '='" s pos
 319       in
 320
 321       let pos = find_non_white pos in
 322       if s.[pos] = '>'
 323       then
 324         accu, pos+1, false
 325       else (
 326         if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
 327         then
 328           accu, pos+2, true
 329         else (
 330           if isname s.[pos]
 331           then (
 332             let nameval, pos = nameval pos in
 333             let accu = nameval :: accu in
 334             f accu pos
 335           )
 336           else parse_error "malformed attribute list" s pos;
 337         )
 338       )
 339     in
 340     f [] pos
 341   in
 342   let _, _ = collect v 0 `text in
 343   v.accu;
 344 ;;