Lib/email/FeedParser.py

   1 # Copyright (C) 2004-2006 Python Software Foundation
   2 # Authors: Baxter, Wouters and Warsaw
   3 # Contact: email-sig@python.org
   4
   5 """FeedParser - An email feed parser.
   6
   7 The feed parser implements an interface for incrementally parsing an email
   8 message, line by line.  This has advantages for certain applications, such as
   9 those reading email messages off a socket.
  10
  11 FeedParser.feed() is the primary interface for pushing new data into the
  12 parser.  It returns when there's nothing more it can do with the available
  13 data.  When you have no more data to push into the parser, call .close().
  14 This completes the parsing and returns the root message object.
  15
  16 The other advantage of this parser is that it will never throw a parsing
  17 exception.  Instead, when it finds something unexpected, it adds a 'defect' to
  18 the current message.  Defects are just instances that live on the message
  19 object's .defects attribute.
  20 """
  21
  22 import re
  23 from email import Errors
  24 from email import Message
  25
# Matches one line terminator (CRLF, bare CR or bare LF).  Used with .match()
# below, i.e. to recognize a line that *starts* with a line terminator.
NLCRE = re.compile('\r\n|\r|\n')
# Same terminators with a capturing group; used with .match() to find a line
# terminator at the beginning of a string.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# A line terminator anchored at the end of the string; used with .search() to
# find (and then strip) a trailing line terminator.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Used with .split() to crack pushed data into lines.  The capturing group
# makes re.split() keep the terminators in the resulting list.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any
# character except controls, SP, and ":".
# The pattern accepts, at the start of a line, either the literal 'From ', a
# run of one or more ftext characters followed by ':', or a single tab/space
# (a continuation line).
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel handed back (by identity) when no complete line is available
# yet and the input has not been closed.
NeedMoreData = object()
  37
  38
  39 \f
  40 class BufferedSubFile(object):
  41     """A file-ish object that can have new data loaded into it.
  42
  43     You can also push and pop line-matching predicates onto a stack.  When the
  44     current predicate matches the current line, a false EOF response
  45     (i.e. empty string) is returned instead.  This lets the parser adhere to a
  46     simple abstraction -- it parses until EOF closes the current message.
  47     """
  48     def __init__(self):
  49         # The last partial line pushed into this object.
  50         self._partial = ''
  51         # The list of full, pushed lines, in reverse order
  52         self._lines = []
  53         # The stack of false-EOF checking predicates.
  54         self._eofstack = []
  55         # A flag indicating whether the file has been closed or not.
  56         self._closed = False
  57
  58     def push_eof_matcher(self, pred):
  59         self._eofstack.append(pred)
  60
  61     def pop_eof_matcher(self):
  62         return self._eofstack.pop()
  63
  64     def close(self):
  65         # Don't forget any trailing partial line.
  66         self._lines.append(self._partial)
  67         self._partial = ''
  68         self._closed = True
  69
  70     def readline(self):
  71         if not self._lines:
  72             if self._closed:
  73                 return ''
  74             return NeedMoreData
  75         # Pop the line off the stack and see if it matches the current
  76         # false-EOF predicate.
  77         line = self._lines.pop()
  78         # RFC 2046, section 5.1.2 requires us to recognize outer level
  79         # boundaries at any level of inner nesting.  Do this, but be sure it's
  80         # in the order of most to least nested.
  81         for ateof in self._eofstack[::-1]:
  82             if ateof(line):
  83                 # We're at the false EOF.  But push the last line back first.
  84                 self._lines.append(line)
  85                 return ''
  86         return line
  87
  88     def unreadline(self, line):
  89         # Let the consumer push a line back into the buffer.
  90         assert line is not NeedMoreData
  91         self._lines.append(line)
  92
  93     def push(self, data):
  94         """Push some new data into this object."""
  95         # Handle any previous leftovers
  96         data, self._partial = self._partial + data, ''
  97         # Crack into lines, but preserve the newlines on the end of each
  98         parts = NLCRE_crack.split(data)
  99         # The *ahem* interesting behaviour of re.split when supplied grouping
 100         # parentheses is that the last element of the resulting list is the
 101         # data after the final RE.  In the case of a NL/CR terminated string,
 102         # this is the empty string.
 103         self._partial = parts.pop()
 104         # parts is a list of strings, alternating between the line contents
 105         # and the eol character(s).  Gather up a list of lines after
 106         # re-attaching the newlines.
 107         lines = []
 108         for i in range(len(parts) // 2):
 109             lines.append(parts[i*2] + parts[i*2+1])
 110         self.pushlines(lines)
 111
 112     def pushlines(self, lines):
 113         # Reverse and insert at the front of the lines.
 114         self._lines[:0] = lines[::-1]
 115
 116     def is_closed(self):
 117         return self._closed
 118
 119     def __iter__(self):
 120         return self
 121
 122     def next(self):
 123         line = self.readline()
 124         if line == '':
 125             raise StopIteration
 126         return line
 127
 128
 129 \f
class FeedParser:
    """A feed-style parser of email.

    Data is supplied in arbitrary chunks through feed(); close() finishes the
    parse and returns the root message object.  The actual parsing is done by
    the _parsegen() generator, which yields NeedMoreData whenever it runs out
    of buffered lines and is resumed by the next feed()/close() call.
    """

    def __init__(self, _factory=Message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer shared by all (possibly nested) invocations of the
        # parsing generator.
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root is at index 0.
        self._msgstack = []
        # Creating the generator does not run any of its code; the first
        # call to self._parse() does.
        self._parse = self._parsegen().next
        # The message on top of the stack, and the most recently created (or
        # most recently completed multipart) message, respectively.
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        # Resume the parsing generator.  StopIteration only means that the
        # generator has finished, which is not an error here.
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(Errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        # Create a message with the factory, attach it to the message
        # currently on top of the stack (if any) and make it the current one.
        msg = self._factory()
        # Subparts of a multipart/digest default to message/rfc822.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        # Pop and return the top of the message stack; the new top (or None
        # when the stack is now empty) becomes the current message.
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        # Generator that parses one message (headers plus body) from
        # self._input, recursing into itself for subparts.  It yields
        # NeedMoreData whenever the input has no complete line available.
        #
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                # Drive the nested generator, forwarding its requests for
                # more data; any other yielded value ends the loop.
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(Errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            # Becomes the line terminator of the end boundary, if one is seen
            # and it had a terminator.
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the same boundary line on the next pass,
                        # this time as a part separator.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(Errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): this loop consumes the remaining lines but
                # never appends them to `epilogue`, so the epilogue set below
                # is always the empty string.  The comment above suggests the
                # lines were meant to be kept -- confirm before changing.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        # Passed a list of lines that make up the headers for the current msg
        #
        # lastheader is the name of the header being accumulated ('' when
        # there is none); lastvalue holds its value as a list of lines (the
        # first line without the name, followed by any continuation lines).
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = Errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A non-continuation line ends the header accumulated so far, so
            # store it on the current message before looking at this line.
            if lastheader:
                # XXX reconsider the joining of folded lines
                # NOTE(review): the [:-1] drops the final character before
                # the CR/LF strip; this presumably assumes the joined value
                # ends with a line terminator, whereas the last-header case
                # at the bottom of this method only uses rstrip -- confirm.
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = Errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = Errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')