Lib/rfc822.py

   1 """RFC 2822 message manipulation.
   2
   3 Note: This is only a very rough sketch of a full RFC-822 parser; in particular
   4 the tokenizing of addresses does not adhere to all the quoting rules.
   5
   6 Note: RFC 2822 is a long awaited update to RFC 822.  This module should
   7 conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
    8 effort at RFC 2822 updates has been made, but a thorough audit has not been
   9 performed.  Consider any RFC 2822 non-conformance to be a bug.
  10
  11     RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
  12     RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
  13
  14 Directions for use:
  15
  16 To create a Message object: first open a file, e.g.:
  17
  18   fp = open(file, 'r')
  19
  20 You can use any other legal way of getting an open file object, e.g. use
  21 sys.stdin or call os.popen().  Then pass the open file object to the Message()
  22 constructor:
  23
  24   m = Message(fp)
  25
  26 This class can work with any input object that supports a readline method.  If
  27 the input object has seek and tell capability, the rewindbody method will
  28 work; also illegal lines will be pushed back onto the input stream.  If the
  29 input object lacks seek but has an `unread' method that can push back a line
  30 of input, Message will use that to push back illegal lines.  Thus this class
  31 can be used to parse messages coming from a buffered stream.
  32
  33 The optional `seekable' argument is provided as a workaround for certain stdio
  34 libraries in which tell() discards buffered data before discovering that the
  35 lseek() system call doesn't work.  For maximum portability, you should set the
   36 seekable argument to zero to prevent that initial tell() call when passing in
   37 an unseekable object such as a file object created from a socket object.  If
  38 it is 1 on entry -- which it is by default -- the tell() method of the open
  39 file object is called once; if this raises an exception, seekable is reset to
  40 0.  For other nonzero values of seekable, this test is not made.
  41
  42 To get the text of a particular header there are several methods:
  43
  44   str = m.getheader(name)
  45   str = m.getrawheader(name)
  46
  47 where name is the name of the header, e.g. 'Subject'.  The difference is that
  48 getheader() strips the leading and trailing whitespace, while getrawheader()
  49 doesn't.  Both functions retain embedded whitespace (including newlines)
  50 exactly as they are specified in the header, and leave the case of the text
  51 unchanged.
  52
  53 For addresses and address lists there are functions
  54
  55   realname, mailaddress = m.getaddr(name)
  56   list = m.getaddrlist(name)
  57
  58 where the latter returns a list of (realname, mailaddr) tuples.
  59
  60 There is also a method
  61
  62   time = m.getdate(name)
  63
  64 which parses a Date-like field and returns a time-compatible tuple,
  65 i.e. a tuple such as returned by time.localtime() or accepted by
  66 time.mktime().
  67
  68 See the class definition for lower level access methods.
  69
  70 There are also some utility functions here.
  71 """
  72 # Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
  73
  74 import time
  75
  76 from warnings import warnpy3k
  77 warnpy3k("in 3.x, rfc822 has been removed in favor of the email package",
  78          stacklevel=2)
  79
# Names exported by "from rfc822 import *".
__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]

# The exact line values that Message.islast() accepts as the blank line
# terminating the header section (CRLF and bare LF line endings).
_blanklines = ('\r\n', '\n')            # Optimization for islast()
  83
  84
  85 class Message:
  86     """Represents a single RFC 2822-compliant message."""
  87
  88     def __init__(self, fp, seekable = 1):
  89         """Initialize the class instance and read the headers."""
  90         if seekable == 1:
  91             # Exercise tell() to make sure it works
  92             # (and then assume seek() works, too)
  93             try:
  94                 fp.tell()
  95             except (AttributeError, IOError):
  96                 seekable = 0
  97         self.fp = fp
  98         self.seekable = seekable
  99         self.startofheaders = None
 100         self.startofbody = None
 101         #
 102         if self.seekable:
 103             try:
 104                 self.startofheaders = self.fp.tell()
 105             except IOError:
 106                 self.seekable = 0
 107         #
 108         self.readheaders()
 109         #
 110         if self.seekable:
 111             try:
 112                 self.startofbody = self.fp.tell()
 113             except IOError:
 114                 self.seekable = 0
 115
 116     def rewindbody(self):
 117         """Rewind the file to the start of the body (if seekable)."""
 118         if not self.seekable:
 119             raise IOError, "unseekable file"
 120         self.fp.seek(self.startofbody)
 121
 122     def readheaders(self):
 123         """Read header lines.
 124
 125         Read header lines up to the entirely blank line that terminates them.
 126         The (normally blank) line that ends the headers is skipped, but not
 127         included in the returned list.  If a non-header line ends the headers,
 128         (which is an error), an attempt is made to backspace over it; it is
 129         never included in the returned list.
 130
 131         The variable self.status is set to the empty string if all went well,
 132         otherwise it is an error message.  The variable self.headers is a
 133         completely uninterpreted list of lines contained in the header (so
 134         printing them will reproduce the header exactly as it appears in the
 135         file).
 136         """
 137         self.dict = {}
 138         self.unixfrom = ''
 139         self.headers = lst = []
 140         self.status = ''
 141         headerseen = ""
 142         firstline = 1
 143         startofline = unread = tell = None
 144         if hasattr(self.fp, 'unread'):
 145             unread = self.fp.unread
 146         elif self.seekable:
 147             tell = self.fp.tell
 148         while 1:
 149             if tell:
 150                 try:
 151                     startofline = tell()
 152                 except IOError:
 153                     startofline = tell = None
 154                     self.seekable = 0
 155             line = self.fp.readline()
 156             if not line:
 157                 self.status = 'EOF in headers'
 158                 break
 159             # Skip unix From name time lines
 160             if firstline and line.startswith('From '):
 161                 self.unixfrom = self.unixfrom + line
 162                 continue
 163             firstline = 0
 164             if headerseen and line[0] in ' \t':
 165                 # It's a continuation line.
 166                 lst.append(line)
 167                 x = (self.dict[headerseen] + "\n " + line.strip())
 168                 self.dict[headerseen] = x.strip()
 169                 continue
 170             elif self.iscomment(line):
 171                 # It's a comment.  Ignore it.
 172                 continue
 173             elif self.islast(line):
 174                 # Note! No pushback here!  The delimiter line gets eaten.
 175                 break
 176             headerseen = self.isheader(line)
 177             if headerseen:
 178                 # It's a legal header line, save it.
 179                 lst.append(line)
 180                 self.dict[headerseen] = line[len(headerseen)+1:].strip()
 181                 continue
 182             else:
 183                 # It's not a header line; throw it back and stop here.
 184                 if not self.dict:
 185                     self.status = 'No headers'
 186                 else:
 187                     self.status = 'Non-header line where header expected'
 188                 # Try to undo the read.
 189                 if unread:
 190                     unread(line)
 191                 elif tell:
 192                     self.fp.seek(startofline)
 193                 else:
 194                     self.status = self.status + '; bad seek'
 195                 break
 196
 197     def isheader(self, line):
 198         """Determine whether a given line is a legal header.
 199
 200         This method should return the header name, suitably canonicalized.
 201         You may override this method in order to use Message parsing on tagged
 202         data in RFC 2822-like formats with special header formats.
 203         """
 204         i = line.find(':')
 205         if i > 0:
 206             return line[:i].lower()
 207         return None
 208
 209     def islast(self, line):
 210         """Determine whether a line is a legal end of RFC 2822 headers.
 211
 212         You may override this method if your application wants to bend the
 213         rules, e.g. to strip trailing whitespace, or to recognize MH template
 214         separators ('--------').  For convenience (e.g. for code reading from
 215         sockets) a line consisting of \r\n also matches.
 216         """
 217         return line in _blanklines
 218
 219     def iscomment(self, line):
 220         """Determine whether a line should be skipped entirely.
 221
 222         You may override this method in order to use Message parsing on tagged
 223         data in RFC 2822-like formats that support embedded comments or
 224         free-text data.
 225         """
 226         return False
 227
 228     def getallmatchingheaders(self, name):
 229         """Find all header lines matching a given header name.
 230
 231         Look through the list of headers and find all lines matching a given
 232         header name (and their continuation lines).  A list of the lines is
 233         returned, without interpretation.  If the header does not occur, an
 234         empty list is returned.  If the header occurs multiple times, all
 235         occurrences are returned.  Case is not important in the header name.
 236         """
 237         name = name.lower() + ':'
 238         n = len(name)
 239         lst = []
 240         hit = 0
 241         for line in self.headers:
 242             if line[:n].lower() == name:
 243                 hit = 1
 244             elif not line[:1].isspace():
 245                 hit = 0
 246             if hit:
 247                 lst.append(line)
 248         return lst
 249
 250     def getfirstmatchingheader(self, name):
 251         """Get the first header line matching name.
 252
 253         This is similar to getallmatchingheaders, but it returns only the
 254         first matching header (and its continuation lines).
 255         """
 256         name = name.lower() + ':'
 257         n = len(name)
 258         lst = []
 259         hit = 0
 260         for line in self.headers:
 261             if hit:
 262                 if not line[:1].isspace():
 263                     break
 264             elif line[:n].lower() == name:
 265                 hit = 1
 266             if hit:
 267                 lst.append(line)
 268         return lst
 269
 270     def getrawheader(self, name):
 271         """A higher-level interface to getfirstmatchingheader().
 272
 273         Return a string containing the literal text of the header but with the
 274         keyword stripped.  All leading, trailing and embedded whitespace is
 275         kept in the string, however.  Return None if the header does not
 276         occur.
 277         """
 278
 279         lst = self.getfirstmatchingheader(name)
 280         if not lst:
 281             return None
 282         lst[0] = lst[0][len(name) + 1:]
 283         return ''.join(lst)
 284
 285     def getheader(self, name, default=None):
 286         """Get the header value for a name.
 287
 288         This is the normal interface: it returns a stripped version of the
 289         header value for a given header name, or None if it doesn't exist.
 290         This uses the dictionary version which finds the *last* such header.
 291         """
 292         return self.dict.get(name.lower(), default)
 293     get = getheader
 294
 295     def getheaders(self, name):
 296         """Get all values for a header.
 297
 298         This returns a list of values for headers given more than once; each
 299         value in the result list is stripped in the same way as the result of
 300         getheader().  If the header is not given, return an empty list.
 301         """
 302         result = []
 303         current = ''
 304         have_header = 0
 305         for s in self.getallmatchingheaders(name):
 306             if s[0].isspace():
 307                 if current:
 308                     current = "%s\n %s" % (current, s.strip())
 309                 else:
 310                     current = s.strip()
 311             else:
 312                 if have_header:
 313                     result.append(current)
 314                 current = s[s.find(":") + 1:].strip()
 315                 have_header = 1
 316         if have_header:
 317             result.append(current)
 318         return result
 319
 320     def getaddr(self, name):
 321         """Get a single address from a header, as a tuple.
 322
 323         An example return value:
 324         ('Guido van Rossum', 'guido@cwi.nl')
 325         """
 326         # New, by Ben Escoto
 327         alist = self.getaddrlist(name)
 328         if alist:
 329             return alist[0]
 330         else:
 331             return (None, None)
 332
 333     def getaddrlist(self, name):
 334         """Get a list of addresses from a header.
 335
 336         Retrieves a list of addresses from a header, where each address is a
 337         tuple as returned by getaddr().  Scans all named headers, so it works
 338         properly with multiple To: or Cc: headers for example.
 339         """
 340         raw = []
 341         for h in self.getallmatchingheaders(name):
 342             if h[0] in ' \t':
 343                 raw.append(h)
 344             else:
 345                 if raw:
 346                     raw.append(', ')
 347                 i = h.find(':')
 348                 if i > 0:
 349                     addr = h[i+1:]
 350                 raw.append(addr)
 351         alladdrs = ''.join(raw)
 352         a = AddressList(alladdrs)
 353         return a.addresslist
 354
 355     def getdate(self, name):
 356         """Retrieve a date field from a header.
 357
 358         Retrieves a date field from the named header, returning a tuple
 359         compatible with time.mktime().
 360         """
 361         try:
 362             data = self[name]
 363         except KeyError:
 364             return None
 365         return parsedate(data)
 366
 367     def getdate_tz(self, name):
 368         """Retrieve a date field from a header as a 10-tuple.
 369
 370         The first 9 elements make up a tuple compatible with time.mktime(),
 371         and the 10th is the offset of the poster's time zone from GMT/UTC.
 372         """
 373         try:
 374             data = self[name]
 375         except KeyError:
 376             return None
 377         return parsedate_tz(data)
 378
 379
 380     # Access as a dictionary (only finds *last* header of each type):
 381
 382     def __len__(self):
 383         """Get the number of headers in a message."""
 384         return len(self.dict)
 385
 386     def __getitem__(self, name):
 387         """Get a specific header, as from a dictionary."""
 388         return self.dict[name.lower()]
 389
 390     def __setitem__(self, name, value):
 391         """Set the value of a header.
 392
 393         Note: This is not a perfect inversion of __getitem__, because any
 394         changed headers get stuck at the end of the raw-headers list rather
 395         than where the altered header was.
 396         """
 397         del self[name] # Won't fail if it doesn't exist
 398         self.dict[name.lower()] = value
 399         text = name + ": " + value
 400         for line in text.split("\n"):
 401             self.headers.append(line + "\n")
 402
 403     def __delitem__(self, name):
 404         """Delete all occurrences of a specific header, if it is present."""
 405         name = name.lower()
 406         if not name in self.dict:
 407             return
 408         del self.dict[name]
 409         name = name + ':'
 410         n = len(name)
 411         lst = []
 412         hit = 0
 413         for i in range(len(self.headers)):
 414             line = self.headers[i]
 415             if line[:n].lower() == name:
 416                 hit = 1
 417             elif not line[:1].isspace():
 418                 hit = 0
 419             if hit:
 420                 lst.append(i)
 421         for i in reversed(lst):
 422             del self.headers[i]
 423
 424     def setdefault(self, name, default=""):
 425         lowername = name.lower()
 426         if lowername in self.dict:
 427             return self.dict[lowername]
 428         else:
 429             text = name + ": " + default
 430             for line in text.split("\n"):
 431                 self.headers.append(line + "\n")
 432             self.dict[lowername] = default
 433             return default
 434
 435     def has_key(self, name):
 436         """Determine whether a message contains the named header."""
 437         return name.lower() in self.dict
 438
 439     def __contains__(self, name):
 440         """Determine whether a message contains the named header."""
 441         return name.lower() in self.dict
 442
 443     def __iter__(self):
 444         return iter(self.dict)
 445
 446     def keys(self):
 447         """Get all of a message's header field names."""
 448         return self.dict.keys()
 449
 450     def values(self):
 451         """Get all of a message's header field values."""
 452         return self.dict.values()
 453
 454     def items(self):
 455         """Get all of a message's headers.
 456
 457         Returns a list of name, value tuples.
 458         """
 459         return self.dict.items()
 460
 461     def __str__(self):
 462         return ''.join(self.headers)
 463
 464
 465 # Utility functions
 466 # -----------------
 467
 468 # XXX Should fix unquote() and quote() to be really conformant.
 469 # XXX The inverses of the parse functions may also be useful.
 470
 471
 472 def unquote(s):
 473     """Remove quotes from a string."""
 474     if len(s) > 1:
 475         if s.startswith('"') and s.endswith('"'):
 476             return s[1:-1].replace('\\\\', '\\').replace('\\"', '"')
 477         if s.startswith('<') and s.endswith('>'):
 478             return s[1:-1]
 479     return s
 480
 481
 482 def quote(s):
 483     """Add quotes around a string."""
 484     return s.replace('\\', '\\\\').replace('"', '\\"')
 485
 486
 487 def parseaddr(address):
 488     """Parse an address into a (realname, mailaddr) tuple."""
 489     a = AddressList(address)
 490     lst = a.addresslist
 491     if not lst:
 492         return (None, None)
 493     return lst[0]
 494
 495
 496 class AddrlistClass:
 497     """Address parser class by Ben Escoto.
 498
 499     To understand what this class does, it helps to have a copy of
 500     RFC 2822 in front of you.
 501
 502     http://www.faqs.org/rfcs/rfc2822.html
 503
 504     Note: this class interface is deprecated and may be removed in the future.
 505     Use rfc822.AddressList instead.
 506     """
 507
 508     def __init__(self, field):
 509         """Initialize a new instance.
 510
 511         `field' is an unparsed address header field, containing one or more
 512         addresses.
 513         """
 514         self.specials = '()<>@,:;.\"[]'
 515         self.pos = 0
 516         self.LWS = ' \t'
 517         self.CR = '\r\n'
 518         self.atomends = self.specials + self.LWS + self.CR
 519         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
 520         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
 521         # syntax, so allow dots in phrases.
 522         self.phraseends = self.atomends.replace('.', '')
 523         self.field = field
 524         self.commentlist = []
 525
 526     def gotonext(self):
 527         """Parse up to the start of the next address."""
 528         while self.pos < len(self.field):
 529             if self.field[self.pos] in self.LWS + '\n\r':
 530                 self.pos = self.pos + 1
 531             elif self.field[self.pos] == '(':
 532                 self.commentlist.append(self.getcomment())
 533             else: break
 534
 535     def getaddrlist(self):
 536         """Parse all addresses.
 537
 538         Returns a list containing all of the addresses.
 539         """
 540         result = []
 541         ad = self.getaddress()
 542         while ad:
 543             result += ad
 544             ad = self.getaddress()
 545         return result
 546
 547     def getaddress(self):
 548         """Parse the next address."""
 549         self.commentlist = []
 550         self.gotonext()
 551
 552         oldpos = self.pos
 553         oldcl = self.commentlist
 554         plist = self.getphraselist()
 555
 556         self.gotonext()
 557         returnlist = []
 558
 559         if self.pos >= len(self.field):
 560             # Bad email address technically, no domain.
 561             if plist:
 562                 returnlist = [(' '.join(self.commentlist), plist[0])]
 563
 564         elif self.field[self.pos] in '.@':
 565             # email address is just an addrspec
 566             # this isn't very efficient since we start over
 567             self.pos = oldpos
 568             self.commentlist = oldcl
 569             addrspec = self.getaddrspec()
 570             returnlist = [(' '.join(self.commentlist), addrspec)]
 571
 572         elif self.field[self.pos] == ':':
 573             # address is a group
 574             returnlist = []
 575
 576             fieldlen = len(self.field)
 577             self.pos += 1
 578             while self.pos < len(self.field):
 579                 self.gotonext()
 580                 if self.pos < fieldlen and self.field[self.pos] == ';':
 581                     self.pos += 1
 582                     break
 583                 returnlist = returnlist + self.getaddress()
 584
 585         elif self.field[self.pos] == '<':
 586             # Address is a phrase then a route addr
 587             routeaddr = self.getrouteaddr()
 588
 589             if self.commentlist:
 590                 returnlist = [(' '.join(plist) + ' (' + \
 591                          ' '.join(self.commentlist) + ')', routeaddr)]
 592             else: returnlist = [(' '.join(plist), routeaddr)]
 593
 594         else:
 595             if plist:
 596                 returnlist = [(' '.join(self.commentlist), plist[0])]
 597             elif self.field[self.pos] in self.specials:
 598                 self.pos += 1
 599
 600         self.gotonext()
 601         if self.pos < len(self.field) and self.field[self.pos] == ',':
 602             self.pos += 1
 603         return returnlist
 604
 605     def getrouteaddr(self):
 606         """Parse a route address (Return-path value).
 607
 608         This method just skips all the route stuff and returns the addrspec.
 609         """
 610         if self.field[self.pos] != '<':
 611             return
 612
 613         expectroute = 0
 614         self.pos += 1
 615         self.gotonext()
 616         adlist = ""
 617         while self.pos < len(self.field):
 618             if expectroute:
 619                 self.getdomain()
 620                 expectroute = 0
 621             elif self.field[self.pos] == '>':
 622                 self.pos += 1
 623                 break
 624             elif self.field[self.pos] == '@':
 625                 self.pos += 1
 626                 expectroute = 1
 627             elif self.field[self.pos] == ':':
 628                 self.pos += 1
 629             else:
 630                 adlist = self.getaddrspec()
 631                 self.pos += 1
 632                 break
 633             self.gotonext()
 634
 635         return adlist
 636
 637     def getaddrspec(self):
 638         """Parse an RFC 2822 addr-spec."""
 639         aslist = []
 640
 641         self.gotonext()
 642         while self.pos < len(self.field):
 643             if self.field[self.pos] == '.':
 644                 aslist.append('.')
 645                 self.pos += 1
 646             elif self.field[self.pos] == '"':
 647                 aslist.append('"%s"' % self.getquote())
 648             elif self.field[self.pos] in self.atomends:
 649                 break
 650             else: aslist.append(self.getatom())
 651             self.gotonext()
 652
 653         if self.pos >= len(self.field) or self.field[self.pos] != '@':
 654             return ''.join(aslist)
 655
 656         aslist.append('@')
 657         self.pos += 1
 658         self.gotonext()
 659         return ''.join(aslist) + self.getdomain()
 660
 661     def getdomain(self):
 662         """Get the complete domain name from an address."""
 663         sdlist = []
 664         while self.pos < len(self.field):
 665             if self.field[self.pos] in self.LWS:
 666                 self.pos += 1
 667             elif self.field[self.pos] == '(':
 668                 self.commentlist.append(self.getcomment())
 669             elif self.field[self.pos] == '[':
 670                 sdlist.append(self.getdomainliteral())
 671             elif self.field[self.pos] == '.':
 672                 self.pos += 1
 673                 sdlist.append('.')
 674             elif self.field[self.pos] in self.atomends:
 675                 break
 676             else: sdlist.append(self.getatom())
 677         return ''.join(sdlist)
 678
 679     def getdelimited(self, beginchar, endchars, allowcomments = 1):
 680         """Parse a header fragment delimited by special characters.
 681
 682         `beginchar' is the start character for the fragment.  If self is not
 683         looking at an instance of `beginchar' then getdelimited returns the
 684         empty string.
 685
 686         `endchars' is a sequence of allowable end-delimiting characters.
 687         Parsing stops when one of these is encountered.
 688
 689         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
 690         within the parsed fragment.
 691         """
 692         if self.field[self.pos] != beginchar:
 693             return ''
 694
 695         slist = ['']
 696         quote = 0
 697         self.pos += 1
 698         while self.pos < len(self.field):
 699             if quote == 1:
 700                 slist.append(self.field[self.pos])
 701                 quote = 0
 702             elif self.field[self.pos] in endchars:
 703                 self.pos += 1
 704                 break
 705             elif allowcomments and self.field[self.pos] == '(':
 706                 slist.append(self.getcomment())
 707                 continue        # have already advanced pos from getcomment
 708             elif self.field[self.pos] == '\\':
 709                 quote = 1
 710             else:
 711                 slist.append(self.field[self.pos])
 712             self.pos += 1
 713
 714         return ''.join(slist)
 715
 716     def getquote(self):
 717         """Get a quote-delimited fragment from self's field."""
 718         return self.getdelimited('"', '"\r', 0)
 719
 720     def getcomment(self):
 721         """Get a parenthesis-delimited fragment from self's field."""
 722         return self.getdelimited('(', ')\r', 1)
 723
 724     def getdomainliteral(self):
 725         """Parse an RFC 2822 domain-literal."""
 726         return '[%s]' % self.getdelimited('[', ']\r', 0)
 727
 728     def getatom(self, atomends=None):
 729         """Parse an RFC 2822 atom.
 730
 731         Optional atomends specifies a different set of end token delimiters
 732         (the default is to use self.atomends).  This is used e.g. in
 733         getphraselist() since phrase endings must not include the `.' (which
 734         is legal in phrases)."""
 735         atomlist = ['']
 736         if atomends is None:
 737             atomends = self.atomends
 738
 739         while self.pos < len(self.field):
 740             if self.field[self.pos] in atomends:
 741                 break
 742             else: atomlist.append(self.field[self.pos])
 743             self.pos += 1
 744
 745         return ''.join(atomlist)
 746
 747     def getphraselist(self):
 748         """Parse a sequence of RFC 2822 phrases.
 749
 750         A phrase is a sequence of words, which are in turn either RFC 2822
 751         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
 752         runs of continuous whitespace into one space.
 753         """
 754         plist = []
 755
 756         while self.pos < len(self.field):
 757             if self.field[self.pos] in self.LWS:
 758                 self.pos += 1
 759             elif self.field[self.pos] == '"':
 760                 plist.append(self.getquote())
 761             elif self.field[self.pos] == '(':
 762                 self.commentlist.append(self.getcomment())
 763             elif self.field[self.pos] in self.phraseends:
 764                 break
 765             else:
 766                 plist.append(self.getatom(self.phraseends))
 767
 768         return plist
 769
 770 class AddressList(AddrlistClass):
 771     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
 772     def __init__(self, field):
 773         AddrlistClass.__init__(self, field)
 774         if field:
 775             self.addresslist = self.getaddrlist()
 776         else:
 777             self.addresslist = []
 778
 779     def __len__(self):
 780         return len(self.addresslist)
 781
 782     def __str__(self):
 783         return ", ".join(map(dump_address_pair, self.addresslist))
 784
 785     def __add__(self, other):
 786         # Set union
 787         newaddr = AddressList(None)
 788         newaddr.addresslist = self.addresslist[:]
 789         for x in other.addresslist:
 790             if not x in self.addresslist:
 791                 newaddr.addresslist.append(x)
 792         return newaddr
 793
 794     def __iadd__(self, other):
 795         # Set union, in-place
 796         for x in other.addresslist:
 797             if not x in self.addresslist:
 798                 self.addresslist.append(x)
 799         return self
 800
 801     def __sub__(self, other):
 802         # Set difference
 803         newaddr = AddressList(None)
 804         for x in self.addresslist:
 805             if not x in other.addresslist:
 806                 newaddr.addresslist.append(x)
 807         return newaddr
 808
 809     def __isub__(self, other):
 810         # Set difference, in-place
 811         for x in other.addresslist:
 812             if x in self.addresslist:
 813                 self.addresslist.remove(x)
 814         return self
 815
 816     def __getitem__(self, index):
 817         # Make indexing, slices, and 'in' work
 818         return self.addresslist[index]
 819
 820 def dump_address_pair(pair):
 821     """Dump a (name, address) pair in a canonicalized form."""
 822     if pair[0]:
 823         return '"' + pair[0] + '" <' + pair[1] + '>'
 824     else:
 825         return pair[1]
 826
 827 # Parse a date field
 828
# Month names recognized by the date parser, all lower case: the twelve
# three-letter abbreviations first, then the twelve full names in the same
# order.  The parser maps a match to index + 1 and subtracts 12 when that
# exceeds 12, so either spelling gives the month number 1..12.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
# Lower-case day-name abbreviations; a leading date token matching one of
# these is treated as a weekday and skipped by the date parser.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# NOTE(review): the values look like signed HHMM offsets from UTC (e.g. EST
# is -500); confirm against the code that consumes this table.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
 848
 849
 850 def parsedate_tz(data):
 851     """Convert a date string to a time tuple.
 852
 853     Accounts for military timezones.
 854     """
 855     if not data:
 856         return None
 857     data = data.split()
 858     if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
 859         # There's a dayname here. Skip it
 860         del data[0]
 861     else:
 862         # no space after the "weekday,"?
 863         i = data[0].rfind(',')
 864         if i >= 0:
 865             data[0] = data[0][i+1:]
 866     if len(data) == 3: # RFC 850 date, deprecated
 867         stuff = data[0].split('-')
 868         if len(stuff) == 3:
 869             data = stuff + data[1:]
 870     if len(data) == 4:
 871         s = data[3]
 872         i = s.find('+')
 873         if i > 0:
 874             data[3:] = [s[:i], s[i+1:]]
 875         else:
 876             data.append('') # Dummy tz
 877     if len(data) < 5:
 878         return None
 879     data = data[:5]
 880     [dd, mm, yy, tm, tz] = data
 881     mm = mm.lower()
 882     if not mm in _monthnames:
 883         dd, mm = mm, dd.lower()
 884         if not mm in _monthnames:
 885             return None
 886     mm = _monthnames.index(mm)+1
 887     if mm > 12: mm = mm - 12
 888     if dd[-1] == ',':
 889         dd = dd[:-1]
 890     i = yy.find(':')
 891     if i > 0:
 892         yy, tm = tm, yy
 893     if yy[-1] == ',':
 894         yy = yy[:-1]
 895     if not yy[0].isdigit():
 896         yy, tz = tz, yy
 897     if tm[-1] == ',':
 898         tm = tm[:-1]
 899     tm = tm.split(':')
 900     if len(tm) == 2:
 901         [thh, tmm] = tm
 902         tss = '0'
 903     elif len(tm) == 3:
 904         [thh, tmm, tss] = tm
 905     else:
 906         return None
 907     try:
 908         yy = int(yy)
 909         dd = int(dd)
 910         thh = int(thh)
 911         tmm = int(tmm)
 912         tss = int(tss)
 913     except ValueError:
 914         return None
 915     tzoffset = None
 916     tz = tz.upper()
 917     if tz in _timezones:
 918         tzoffset = _timezones[tz]
 919     else:
 920         try:
 921             tzoffset = int(tz)
 922         except ValueError:
 923             pass
 924     # Convert a timezone offset into seconds ; -0500 -> -18000
 925     if tzoffset:
 926         if tzoffset < 0:
 927             tzsign = -1
 928             tzoffset = -tzoffset
 929         else:
 930             tzsign = 1
 931         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
 932     return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
 933
 934
 935 def parsedate(data):
 936     """Convert a time string to a time tuple."""
 937     t = parsedate_tz(data)
 938     if t is None:
 939         return t
 940     return t[:9]
 941
 942
 943 def mktime_tz(data):
 944     """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
 945     if data[9] is None:
 946         # No zone info, so localtime is better assumption than GMT
 947         return time.mktime(data[:8] + (-1,))
 948     else:
 949         t = time.mktime(data[:8] + (0,))
 950         return t - data[9] - time.timezone
 951
 952 def formatdate(timeval=None):
 953     """Returns time format preferred for Internet standards.
 954
 955     Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123
 956
 957     According to RFC 1123, day and month names must always be in
 958     English.  If not for that, this code could use strftime().  It
 959     can't because strftime() honors the locale and could generated
 960     non-English names.
 961     """
 962     if timeval is None:
 963         timeval = time.time()
 964     timeval = time.gmtime(timeval)
 965     return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
 966             ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[timeval[6]],
 967             timeval[2],
 968             ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
 969              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[timeval[1]-1],
 970                                 timeval[0], timeval[3], timeval[4], timeval[5])
 971
 972
# When used as a script, run a small test program.
# The optional first command line argument is the name of a file containing
# one message in RFC-822 format; it defaults to $HOME/Mail/inbox/1.
 976
 977 if __name__ == '__main__':
 978     import sys, os
 979     file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
 980     if sys.argv[1:]: file = sys.argv[1]
 981     f = open(file, 'r')
 982     m = Message(f)
 983     print 'From:', m.getaddr('from')
 984     print 'To:', m.getaddrlist('to')
 985     print 'Subject:', m.getheader('subject')
 986     print 'Date:', m.getheader('date')
 987     date = m.getdate_tz('date')
 988     tz = date[-1]
 989     date = time.localtime(mktime_tz(date))
 990     if date:
 991         print 'ParsedDate:', time.asctime(date),
 992         hhmmss = tz
 993         hhmm, ss = divmod(hhmmss, 60)
 994         hh, mm = divmod(hhmm, 60)
 995         print "%+03d%02d" % (hh, mm),
 996         if ss: print ".%02d" % ss,
 997         print
 998     else:
 999         print 'ParsedDate:', None
1000     m.rewindbody()
1001     n = 0
1002     while f.readline():
1003         n += 1
1004     print 'Lines:', n
1005     print '-'*70
1006     print 'len =', len(m)
1007     if 'Date' in m: print 'Date =', m['Date']
1008     if 'X-Nonsense' in m: pass
1009     print 'keys =', m.keys()
1010     print 'values =', m.values()
1011     print 'items =', m.items()