parser.ml

   1 (* based on Tor Andersson's XML parser from MuPDF's XPS module *)
   2
   3 let r_comment_terminator = Str.regexp "-->";;
   4 let r_CDATA_terminator = Str.regexp "\\]\\]>";;
   5 let r_q_terminator = Str.regexp "\\?>";;
   6
   7 let iswhite = function
   8   | '\r' | '\n' | '\t' | ' ' -> true
   9   | _ -> false
  10 ;;
  11
  12 let isname = function
  13   | '.' | '-' | '_' | ':' -> true
  14   | c -> (c >= '0' && c <= '9')
  15          || (c >= 'a' && c <= 'z')
  16          || (c >= 'A' && c <= 'Z')
  17 ;;
  18
  19 exception Parse_error of string * string * int;;
  20
  21 let parse_error msg s pos =
  22   raise (Parse_error (msg, s, pos))
  23 ;;
  24
  25 let enent s pos len =
  26   let b = Buffer.create len in
  27   let rec loop i =
  28     if i - pos = len
  29     then Buffer.contents b
  30     else (
  31       begin match s.[i] with
  32       | '<' -> Buffer.add_string b "&lt;"
  33       | '>' -> Buffer.add_string b "&gt;"
  34       | '\'' -> Buffer.add_string b "&apos;"
  35       | '\"' -> Buffer.add_string b "&quot;"
  36       | '&' -> Buffer.add_string b "&amp;"
  37       | c ->
  38          let code = Char.code c in
  39          if code < 32 || code > 127
  40          then (
  41            Buffer.add_string b "&#";
  42            Buffer.add_string b (string_of_int code);
  43            Buffer.add_char b ';';
  44          )
  45          else Buffer.add_char b c
  46       end;
  47       loop (i+1)
  48     )
  49   in
  50   loop pos
  51 ;;
  52
  53 let unent b s pos len =
  54   let rec loop i =
  55     if i = pos + len
  56     then ()
  57     else
  58       let amppos =
  59         try
  60           String.index_from s i '&'
  61         with Not_found -> -1
  62       in
  63       if amppos = -1 || amppos >= pos + len
  64       then (
  65         Buffer.add_substring b s i (pos + len - i)
  66       )
  67       else (
  68         Buffer.add_substring b s i (amppos - i);
  69         if amppos = i + len then Utils.error "lonely amp";
  70
  71         let semipos =
  72           try
  73             let semipos = String.index_from s (amppos+1) ';' in
  74             if semipos >= pos + len then raise Not_found;
  75             semipos
  76           with Not_found ->
  77             Utils.error "amp not followed by semicolon at %d" amppos
  78         in
  79
  80         let subslen = semipos-amppos-1 in
  81         if subslen = 0 then Utils.error "empty amp at %d" amppos;
  82
  83         let subs = String.sub s (amppos+1) subslen in
  84
  85         if subs.[0] = '#'
  86         then (
  87           if subslen = 1
  88           then Utils.error "empty amp followed by hash at %d" amppos;
  89           let code =
  90             if subs.[1] = 'x'
  91             then Scanf.sscanf subs "#x%x" (fun n -> n)
  92             else int_of_string (String.sub subs 1 (subslen-1))
  93           in
  94           let c = Char.unsafe_chr code in
  95           Buffer.add_char b c
  96         )
  97         else (
  98           match subs with
  99           | "lt" -> Buffer.add_char b '<'
 100           | "gt" -> Buffer.add_char b '>'
 101           | "amp" -> Buffer.add_char b '&'
 102           | "apos" -> Buffer.add_char b '\''
 103           | "quot" -> Buffer.add_char b '\"'
 104           | _ -> Utils.error "unknown amp %S" subs
 105         );
 106         loop (semipos+1)
 107       )
 108   in
 109   loop pos
 110 ;;
 111
 112 let subs s pos =
 113   let len = String.length s in
 114   let left = len - pos in
 115   if left < 0
 116   then
 117     Printf.sprintf "(pos=%d len=%d left=%d)"
 118                    pos len left
 119   else
 120     let len = min left 10 in
 121     let s = String.sub s pos len in
 122     s;
 123 ;;
 124
 125 let ts = function
 126   | `text -> "text"
 127   | `lt -> "lt"
 128   | `close -> "close"
 129   | `exclam -> "exclam"
 130   | `question -> "question"
 131   | `doctype -> "doctype"
 132   | `comment -> "comment"
 133   | `tag -> "tag"
 134 ;;
 135
 136 type attr = string * string
 137  and attrs = attr list
 138  and vp =
 139    | Vdata
 140    | Vcdata
 141    | Vopen of string * attrs * bool
 142    | Vclose of string
 143    | Vend
 144  and 'a v = { f : 'a v -> vp -> int -> int -> 'a v; accu : 'a }
 145 ;;
 146
 147 let parse v s =
 148   let slen = String.length s in
 149
 150   let find_substr pos subs r =
 151     let pos =
 152       try
 153         Str.search_forward r s pos
 154       with Not_found ->
 155         parse_error ("cannot find substring " ^ subs) s pos
 156     in
 157     pos
 158   in
 159   let begins_with pos prefix = Utils.substratis s pos prefix in
 160   let find_non_white pos =
 161     let rec forward i =
 162       if i >= slen
 163       then parse_error "cannot find non white space character" s pos;
 164       if iswhite s.[i] then forward (i+1) else i in
 165     forward pos
 166   in
 167
 168   let getname pos =
 169     let non_name_pos =
 170       let rec find_non_name i =
 171         if i >= slen then parse_error "cannot find non name character" s pos;
 172         if isname s.[i] then find_non_name (i+1) else i
 173       in
 174       find_non_name pos
 175     in
 176     non_name_pos, String.sub s pos (non_name_pos - pos)
 177   in
 178
 179   let rec collect v pos t =
 180     if pos >= slen && t != `text
 181     then parse_error ("not enough data for " ^ ts t) s pos;
 182
 183     match t with
 184     | `text ->
 185        let ltpos =
 186          try
 187            String.index_from s pos '<'
 188          with Not_found ->
 189            let rec trailsbywhite i =
 190              if pos+i = String.length s
 191              then -1
 192              else (
 193                if not (iswhite s.[pos+i])
 194                then parse_error "garbage at the end" s pos
 195                else trailsbywhite (i+1)
 196              )
 197            in
 198            trailsbywhite 0
 199        in
 200        if ltpos = -1
 201        then
 202          v.f v Vend pos slen, slen
 203        else
 204          let start_of_text_pos = find_non_white pos in
 205          let end_of_text_pos =
 206            if start_of_text_pos < ltpos
 207            then
 208              let rec find i =
 209                if i = start_of_text_pos || not (iswhite s.[i])
 210                then i+1
 211                else find (i-1)
 212              in
 213              find (ltpos-1)
 214            else start_of_text_pos
 215          in
 216          let v =
 217            if start_of_text_pos != end_of_text_pos
 218            then v.f v Vdata start_of_text_pos end_of_text_pos
 219            else v
 220          in
 221          collect v (ltpos+1) `lt
 222
 223     | `lt ->
 224        let pos, t =
 225          match s.[pos] with
 226          | '/' -> (pos+1), `close
 227          | '!' -> (pos+1), `exclam
 228          | '?' -> (pos+1), `question
 229          | c when isname c -> pos, `tag
 230          | _ -> parse_error "invalid data after <" s pos
 231        in
 232        collect v pos t
 233
 234     | `close ->
 235        let tag_name_pos = find_non_white pos in
 236        let tag_name_end_pos, close_tag_name = getname tag_name_pos in
 237        let close_tag_pos = find_non_white tag_name_end_pos in
 238        if s.[close_tag_pos] != '>'
 239        then parse_error "missing >" s pos;
 240        let pos' = close_tag_pos + 1 in
 241        let v = v.f v (Vclose close_tag_name) pos pos' in
 242        collect v pos' `text
 243
 244     | `doctype ->
 245        let close_tag_pos =
 246          try
 247            String.index_from s pos '>'
 248          with Not_found ->
 249            parse_error "doctype is not terminated" s pos
 250        in
 251        collect v (close_tag_pos+1) `text
 252
 253     | `comment ->
 254        let pos =
 255          try
 256            find_substr pos "-->" r_comment_terminator
 257          with Not_found ->
 258            parse_error "comment is not terminated" s pos
 259        in
 260        collect v (pos+3) `text
 261
 262     | `exclam ->
 263        if begins_with pos "[CDATA["
 264        then
 265          let cdata_start = pos+7 in
 266          let cdata_end = find_substr cdata_start "]]>" r_CDATA_terminator in
 267          let v = v.f v Vcdata cdata_start cdata_end in
 268          collect v (cdata_end+3) `text
 269        else (
 270          if begins_with pos "DOCTYPE"
 271          then
 272            collect v (pos+7) `doctype
 273          else (
 274            if begins_with pos "--"
 275            then collect v (pos+2) `comment
 276            else parse_error "unknown shit after exclamation mark" s pos
 277          )
 278        )
 279
 280     | `question ->
 281        let pos = find_substr pos "?>" r_q_terminator in
 282        collect v (pos+2) `text
 283
 284     | `tag ->
 285        let pos', name = getname pos in
 286        let attrs, pos', closed = collect_attributes pos' in
 287        let v = v.f v (Vopen (name, attrs, closed)) pos pos' in
 288        collect v pos' `text
 289
 290   and collect_attributes pos =
 291     let rec f accu pos =
 292       let nameval pos =
 293         let pos, name = getname pos in
 294         let pos = find_non_white pos in
 295         if s.[pos] = '='
 296         then
 297           let qpos = pos+1 in
 298           if qpos = slen
 299           then parse_error "not enough data for attribute" s pos;
 300
 301           let qc = s.[qpos] in
 302           if not (qc = '\'' || qc = '\"')
 303           then parse_error "assignment is not followed by a quote" s pos;
 304
 305           let closing_q_pos =
 306             let rec find i =
 307               if i = slen
 308               then parse_error "not enough data for attribute value" s pos;
 309
 310               if s.[i] = qc then i else find (i+1)
 311             in
 312             find (qpos+1)
 313           in
 314
 315           let vallen = closing_q_pos - (qpos+1) in
 316           let val' = String.sub s (qpos+1) vallen in
 317           (name, val'), closing_q_pos+1
 318
 319           else parse_error "attribute name not followed by '='" s pos
 320       in
 321
 322       let pos = find_non_white pos in
 323       if s.[pos] = '>'
 324       then
 325         accu, pos+1, false
 326       else (
 327         if slen - pos > 2 && s.[pos] = '/' && s.[pos+1] = '>'
 328         then
 329           accu, pos+2, true
 330         else (
 331           if isname s.[pos]
 332           then (
 333             let nameval, pos = nameval pos in
 334             let accu = nameval :: accu in
 335             f accu pos
 336           )
 337           else parse_error "malformed attribute list" s pos;
 338         )
 339       )
 340     in
 341     f [] pos
 342   in
 343   let _, _ = collect v 0 `text in
 344   v.accu;
 345 ;;