Lib/email/feedparser.py

   1 # Copyright (C) 2004-2006 Python Software Foundation
   2 # Authors: Baxter, Wouters and Warsaw
   3 # Contact: email-sig@python.org
   4
   5 """FeedParser - An email feed parser.
   6
   7 The feed parser implements an interface for incrementally parsing an email
   8 message, line by line.  This has advantages for certain applications, such as
   9 those reading email messages off a socket.
  10
  11 FeedParser.feed() is the primary interface for pushing new data into the
  12 parser.  It returns when there's nothing more it can do with the available
  13 data.  When you have no more data to push into the parser, call .close().
  14 This completes the parsing and returns the root message object.
  15
  16 The other advantage of this parser is that it will never throw a parsing
  17 exception.  Instead, when it finds something unexpected, it adds a 'defect' to
  18 the current message.  Defects are just instances that live on the message
  19 object's .defects attribute.
  20 """
  21
  22 __all__ = ['FeedParser']
  23
  24 import re
  25
  26 from email import errors
  27 from email import message
  28
# Matches one line terminator (CRLF, bare CR or bare LF).  Used with .match()
# in this module, i.e. it recognises a line that *starts* with a newline.
NLCRE = re.compile('\r\n|\r|\n')
# Same terminators, as a group.  Used with .match() to strip a newline from
# the beginning of a string.
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# Same terminators anchored at the end of the string.  Used with .search() to
# strip a single trailing newline.
NLCRE_eol = re.compile('(\r\n|\r|\n)$')
# Same terminators, as a group.  Used with .split(); because of the group the
# terminators themselves are kept in the resulting list.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any
# character except controls, SP, and ":".
# The pattern accepts three kinds of line: a unix-from line ('From '), a
# field name followed by a colon, or a continuation line starting with a tab
# or a space.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel returned/yielded when no complete line is available yet and
# the input has not been closed.  Always compared by identity ("is").
NeedMoreData = object()
  40
  41
  42 \f
  43 class BufferedSubFile(object):
  44     """A file-ish object that can have new data loaded into it.
  45
  46     You can also push and pop line-matching predicates onto a stack.  When the
  47     current predicate matches the current line, a false EOF response
  48     (i.e. empty string) is returned instead.  This lets the parser adhere to a
  49     simple abstraction -- it parses until EOF closes the current message.
  50     """
  51     def __init__(self):
  52         # The last partial line pushed into this object.
  53         self._partial = ''
  54         # The list of full, pushed lines, in reverse order
  55         self._lines = []
  56         # The stack of false-EOF checking predicates.
  57         self._eofstack = []
  58         # A flag indicating whether the file has been closed or not.
  59         self._closed = False
  60
  61     def push_eof_matcher(self, pred):
  62         self._eofstack.append(pred)
  63
  64     def pop_eof_matcher(self):
  65         return self._eofstack.pop()
  66
  67     def close(self):
  68         # Don't forget any trailing partial line.
  69         self._lines.append(self._partial)
  70         self._partial = ''
  71         self._closed = True
  72
  73     def readline(self):
  74         if not self._lines:
  75             if self._closed:
  76                 return ''
  77             return NeedMoreData
  78         # Pop the line off the stack and see if it matches the current
  79         # false-EOF predicate.
  80         line = self._lines.pop()
  81         # RFC 2046, section 5.1.2 requires us to recognize outer level
  82         # boundaries at any level of inner nesting.  Do this, but be sure it's
  83         # in the order of most to least nested.
  84         for ateof in self._eofstack[::-1]:
  85             if ateof(line):
  86                 # We're at the false EOF.  But push the last line back first.
  87                 self._lines.append(line)
  88                 return ''
  89         return line
  90
  91     def unreadline(self, line):
  92         # Let the consumer push a line back into the buffer.
  93         assert line is not NeedMoreData
  94         self._lines.append(line)
  95
  96     def push(self, data):
  97         """Push some new data into this object."""
  98         # Handle any previous leftovers
  99         data, self._partial = self._partial + data, ''
 100         # Crack into lines, but preserve the newlines on the end of each
 101         parts = NLCRE_crack.split(data)
 102         # The *ahem* interesting behaviour of re.split when supplied grouping
 103         # parentheses is that the last element of the resulting list is the
 104         # data after the final RE.  In the case of a NL/CR terminated string,
 105         # this is the empty string.
 106         self._partial = parts.pop()
 107         # parts is a list of strings, alternating between the line contents
 108         # and the eol character(s).  Gather up a list of lines after
 109         # re-attaching the newlines.
 110         lines = []
 111         for i in range(len(parts) // 2):
 112             lines.append(parts[i*2] + parts[i*2+1])
 113         self.pushlines(lines)
 114
 115     def pushlines(self, lines):
 116         # Reverse and insert at the front of the lines.
 117         self._lines[:0] = lines[::-1]
 118
 119     def is_closed(self):
 120         return self._closed
 121
 122     def __iter__(self):
 123         return self
 124
 125     def next(self):
 126         line = self.readline()
 127         if line == '':
 128             raise StopIteration
 129         return line
 130
 131
 132 \f
class FeedParser:
    """A feed-style parser of email.

    Data is handed over in arbitrary chunks with feed(); it is buffered in a
    BufferedSubFile and consumed line by line by the _parsegen() generator,
    which yields NeedMoreData whenever no complete line is available.
    close() ends the input, finishes parsing and returns the root message.
    Problems found while parsing are recorded on the affected message's
    .defects list.
    """

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer that the parsing generator reads from.
        self._input = BufferedSubFile()
        # Stack of messages being parsed; the root message is at the bottom.
        self._msgstack = []
        # Bound .next of the parsing generator.  Creating the generator does
        # not run any of its code; that happens on the first call.
        self._parse = self._parsegen().next
        # The message on top of the stack (None when the stack is empty).
        self._cur = None
        # The message whose trailing newline may have to be stripped once the
        # next boundary is seen (see the multipart handling in _parsegen).
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        """Make the parser store everything after the headers as payload."""
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator until it yields or finishes."""
        try:
            self._parse()
        except StopIteration:
            # The generator has run to completion; nothing more to do.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message and make it the current one.

        The message is created with the factory, attached to the message on
        top of the stack (if any), pushed on the stack and stored as both
        the current and the last message.
        """
        msg = self._factory()
        # Parts of a multipart/digest default to message/rfc822.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the top message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator parsing one message (headers, then body) from the input.

        Yields NeedMoreData each time the input buffer has no complete line
        and is not closed; the caller resumes the generator after feeding
        more data.  Subparts are parsed by recursive _parsegen() generators
        whose NeedMoreData values are passed on to the caller.  The message
        created here is left on the message stack; popping it is up to the
        caller.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                # A line starting with a newline ends the nested header block.
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            # Set to the end boundary's line terminator (or None) once the
            # end boundary has been seen.
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Put the boundary line back so that the next pass
                        # through the loop handles it as a part separator.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): the lines read by this loop are consumed but
                # never appended to `epilogue`, so the epilogue assigned below
                # is always the empty string.  Confirm whether discarding
                # them is intended.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header lines collected by _parsegen on the current message.

        lines is the list of raw header lines, each one having matched
        headerRE.  Continuation lines are joined to the preceding header, a
        unix-from line in first position is stored with set_unixfrom(), and
        irregularities are appended to the current message's defects.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A non-continuation line ends the header collected so far.
            if lastheader:
                # XXX reconsider the joining of folded lines
                # [:-1] drops the final character of the joined value
                # (presumably its line terminator); rstrip then removes any
                # CR/LF characters still at the end.
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')