parser.ml

   1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
   2
   3 let iswhite = function
   4   | '\r' | '\n' | '\t' | ' ' -> true
   5   | _ -> false
   6
   7 let isname = function
   8   | '.' | '-' | '_' | ':' -> true
   9   | c -> (c >= '0' && c <= '9')
  10          || (c >= 'a' && c <= 'z')
  11          || (c >= 'A' && c <= 'Z')
  12
  13 exception Parse_error of string * string * int
  14
  15 let parse_error msg s pos = raise (Parse_error (msg, s, pos))
  16
  17 let enent s pos len =
  18   let b = Buffer.create len in
  19   let rec loop i =
  20     if i - pos = len
  21     then Buffer.contents b
  22     else (
  23       begin match s.[i] with
  24       | '<' -> Buffer.add_string b "&lt;"
  25       | '>' -> Buffer.add_string b "&gt;"
  26       | '\'' -> Buffer.add_string b "&apos;"
  27       | '\"' -> Buffer.add_string b "&quot;"
  28       | '&' -> Buffer.add_string b "&amp;"
  29       | c ->
  30          let code = Char.code c in
  31          if code < 32 || code > 127
  32          then (
  33            Buffer.add_string b "&#";
  34            Buffer.add_string b (string_of_int code);
  35            Buffer.add_char b ';';
  36          )
  37          else Buffer.add_char b c
  38       end;
  39       loop (i+1)
  40     )
  41   in
  42   loop pos
  43
  44 let unent b s pos len =
  45   let rec loop i =
  46     if i != pos + len
  47     then
  48       let amppos =
  49         try
  50           String.index_from s i '&'
  51         with Not_found -> -1
  52       in
  53       if amppos = -1 || amppos >= pos + len
  54       then Buffer.add_substring b s i (pos + len - i)
  55       else (
  56         Buffer.add_substring b s i (amppos - i);
  57         if amppos = i + len then Utils.error "lonely amp";
  58
  59         let semipos =
  60           try
  61             let semipos = String.index_from s (amppos+1) ';' in
  62             if semipos >= pos + len then raise Not_found;
  63             semipos
  64           with Not_found ->
  65             Utils.error "amp not followed by semicolon at %d" amppos
  66         in
  67
  68         let subslen = semipos-amppos-1 in
  69         if subslen = 0 then Utils.error "empty amp at %d" amppos;
  70
  71         let subs = String.sub s (amppos+1) subslen in
  72
  73         if subs.[0] = '#'
  74         then (
  75           if subslen = 1
  76           then Utils.error "empty amp followed by hash at %d" amppos;
  77           let code =
  78             if subs.[1] = 'x'
  79             then Scanf.sscanf subs "#x%x" (fun n -> n)
  80             else int_of_string (String.sub subs 1 (subslen-1))
  81           in
  82           let c = Char.unsafe_chr code in
  83           Buffer.add_char b c
  84         )
  85         else (
  86           match subs with
  87           | "lt" -> Buffer.add_char b '<'
  88           | "gt" -> Buffer.add_char b '>'
  89           | "amp" -> Buffer.add_char b '&'
  90           | "apos" -> Buffer.add_char b '\''
  91           | "quot" -> Buffer.add_char b '\"'
  92           | _ -> Utils.error "unknown amp %S" subs
  93         );
  94         loop (semipos+1)
  95       )
  96   in
  97   loop pos
  98
  99 let subs s pos =
 100   let len = String.length s in
 101   let left = len - pos in
 102   if left < 0
 103   then Printf.sprintf "(pos=%d len=%d left=%d)" pos len left
 104   else
 105     let len = min left 10 in
 106     String.sub s pos len
 107
 108 let ts = function
 109   | `text -> "text"
 110   | `lt -> "lt"
 111   | `close -> "close"
 112   | `exclam -> "exclam"
 113   | `question -> "question"
 114   | `doctype -> "doctype"
 115   | `comment -> "comment"
 116   | `tag -> "tag"
 117
 118 type attr = string * string
 119  and attrs = attr list
 120  and vp =
 121    | Vdata
 122    | Vcdata
 123    | Vopen of string * attrs * bool
 124    | Vclose of string
 125    | Vend
 126  and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
 127
 128 let parse v s =
 129   let r_comment_terminator = Str.regexp "-->"
 130   and r_CDATA_terminator = Str.regexp "\\]\\]>"
 131   and r_q_terminator = Str.regexp "\\?>" in
 132
 133   let slen = String.length s in
 134
 135   let find_substr pos subs r =
 136     let pos =
 137       try
 138         Str.search_forward r s pos
 139       with Not_found ->
 140         parse_error ("cannot find substring " ^ subs) s pos
 141     in
 142     pos
 143   in
 144   let begins_with pos prefix = Utils.substratis s pos prefix in
 145   let find_non_white pos =
 146     let rec forward i =
 147       if i >= slen
 148       then parse_error "cannot find non white space character" s pos;
 149       if iswhite s.[i] then forward (i+1) else i in
 150     forward pos
 151   in
 152
 153   let getname pos =
 154     let non_name_pos =
 155       let rec find_non_name i =
 156         if i >= slen then parse_error "cannot find non name character" s pos;
 157         if isname s.[i] then find_non_name (i+1) else i
 158       in
 159       find_non_name pos
 160     in
 161     non_name_pos, String.sub s pos (non_name_pos - pos)
 162   in
 163
 164   let rec collect v pos t =
 165     if pos >= slen && t != `text
 166     then parse_error ("not enough data for " ^ ts t) s pos;
 167
 168     match t with
 169     | `text ->
 170        let ltpos =
 171          try
 172            String.index_from s pos '<'
 173          with Not_found ->
 174            let rec trailsbywhite i =
 175              if pos+i = String.length s
 176              then -1
 177              else (
 178                if not (iswhite s.[pos+i])
 179                then parse_error "garbage at the end" s pos
 180                else trailsbywhite (i+1)
 181              )
 182            in
 183            trailsbywhite 0
 184        in
 185        if ltpos = -1
 186        then v.f v Vend pos slen, slen
 187        else
 188          let start_of_text_pos = find_non_white pos in
 189          let end_of_text_pos =
 190            if start_of_text_pos < ltpos
 191            then
 192              let rec find i =
 193                if i = start_of_text_pos || not (iswhite s.[i])
 194                then i+1
 195                else find (i-1)
 196              in
 197              find (ltpos-1)
 198            else start_of_text_pos
 199          in
 200          let v =
 201            if start_of_text_pos != end_of_text_pos
 202            then v.f v Vdata start_of_text_pos end_of_text_pos
 203            else v
 204          in
 205          collect v (ltpos+1) `lt
 206
 207     | `lt ->
 208        let pos, t =
 209          match s.[pos] with
 210          | '/' -> (pos+1), `close
 211          | '!' -> (pos+1), `exclam
 212          | '?' -> (pos+1), `question
 213          | c when isname c -> pos, `tag
 214          | _ -> parse_error "invalid data after <" s pos
 215        in
 216        collect v pos t
 217
 218     | `close ->
 219        let tag_name_pos = find_non_white pos in
 220        let tag_name_end_pos, close_tag_name = getname tag_name_pos in
 221        let close_tag_pos = find_non_white tag_name_end_pos in
 222        if s.[close_tag_pos] != '>'
 223        then parse_error "missing >" s pos;
 224        let pos' = close_tag_pos + 1 in
 225        let v = v.f v (Vclose close_tag_name) pos pos' in
 226        collect v pos' `text
 227
 228     | `doctype ->
 229        let close_tag_pos =
 230          try
 231            String.index_from s pos '>'
 232          with Not_found ->
 233            parse_error "doctype is not terminated" s pos
 234        in
 235        collect v (close_tag_pos+1) `text
 236
 237     | `comment ->
 238        let pos =
 239          try
 240            find_substr pos "-->" r_comment_terminator
 241          with Not_found ->
 242            parse_error "comment is not terminated" s pos
 243        in
 244        collect v (pos+3) `text
 245
 246     | `exclam ->
 247        if begins_with pos "[CDATA["
 248        then
 249          let cdata_start = pos+7 in
 250          let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
 251          let v = v.f v Vcdata cdata_start cdata_end in
 252          collect v (cdata_end+3) `text
 253        else (
 254          if begins_with pos "DOCTYPE"
 255          then
 256            collect v (pos+7) `doctype
 257          else (
 258            if begins_with pos "--"
 259            then collect v (pos+2) `comment
 260            else parse_error "unknown shit after exclamation mark" s pos
 261          )
 262        )
 263
 264     | `question ->
 265        let pos = find_substr pos "?>" r_q_terminator in
 266        collect v (pos+2) `text
 267
 268     | `tag ->
 269        let pos', name = getname pos in
 270        let attrs, pos', closed = collect_attributes pos' in
 271        let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
 272        collect v pos' `text
 273
 274   and collect_attributes pos =
 275     let rec f accu pos =
 276       let nameval pos =
 277         let pos, name = getname pos in
 278         let pos = find_non_white pos in
 279         if s.[pos] = '='
 280         then
 281           let qpos = pos+1 in
 282           if qpos = slen
 283           then parse_error "not enough data for attribute" s pos;
 284
 285           let qc = s.[qpos] in
 286           if not (qc = '\'' || qc = '\"')
 287           then parse_error "assignment is not followed by a quote" s pos;
 288
 289           let closing_q_pos =
 290             let rec find i =
 291               if i = slen
 292               then parse_error "not enough data for attribute value" s pos;
 293
 294               if s.[i] = qc then i else find (i+1)
 295             in
 296             find (qpos+1)
 297           in
 298
 299           let vallen = closing_q_pos - (qpos+1) in
 300           let val' = String.sub s (qpos+1) vallen in
 301           (name, val'), closing_q_pos+1
 302
 303           else parse_error "attribute name not followed by '='" s pos
 304       in
 305
 306       let pos = find_non_white pos in
 307       if s.[pos] = '>'
 308       then accu, pos+1, false
 309       else (
 310         if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
 311         then accu, pos+2, true
 312         else (
 313           if isname s.[pos]
 314           then (
 315             let nameval, pos = nameval pos in
 316             let accu = nameval :: accu in
 317             f accu pos
 318           )
 319           else parse_error "malformed attribute list" s pos;
 320         )
 321       )
 322     in
 323     f [] pos
 324   in
 325   let _, _ = collect v 0 `text in
 326   v.accu