Lib/rfc822.py

   1 """RFC 2822 message manipulation.
   2
   3 Note: This is only a very rough sketch of a full RFC-822 parser; in particular
   4 the tokenizing of addresses does not adhere to all the quoting rules.
   5
   6 Note: RFC 2822 is a long awaited update to RFC 822.  This module should
   7 conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
   8 efforts at RFC 2822 updates have been made, but a thorough audit has not been
   9 performed.  Consider any RFC 2822 non-conformance to be a bug.
  10
  11     RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
  12     RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
  13
  14 Directions for use:
  15
  16 To create a Message object: first open a file, e.g.:
  17
  18   fp = open(file, 'r')
  19
  20 You can use any other legal way of getting an open file object, e.g. use
  21 sys.stdin or call os.popen().  Then pass the open file object to the Message()
  22 constructor:
  23
  24   m = Message(fp)
  25
  26 This class can work with any input object that supports a readline method.  If
  27 the input object has seek and tell capability, the rewindbody method will
  28 work; also illegal lines will be pushed back onto the input stream.  If the
  29 input object lacks seek but has an `unread' method that can push back a line
  30 of input, Message will use that to push back illegal lines.  Thus this class
  31 can be used to parse messages coming from a buffered stream.
  32
  33 The optional `seekable' argument is provided as a workaround for certain stdio
  34 libraries in which tell() discards buffered data before discovering that the
  35 lseek() system call doesn't work.  For maximum portability, you should set the
  36 seekable argument to zero to prevent that initial tell() when passing in
  37 an unseekable object such as a file object created from a socket object.  If
  38 it is 1 on entry -- which it is by default -- the tell() method of the open
  39 file object is called once; if this raises an exception, seekable is reset to
  40 0.  For other nonzero values of seekable, this test is not made.
  41
  42 To get the text of a particular header there are several methods:
  43
  44   str = m.getheader(name)
  45   str = m.getrawheader(name)
  46
  47 where name is the name of the header, e.g. 'Subject'.  The difference is that
  48 getheader() strips the leading and trailing whitespace, while getrawheader()
  49 doesn't.  Both functions retain embedded whitespace (including newlines)
  50 exactly as they are specified in the header, and leave the case of the text
  51 unchanged.
  52
  53 For addresses and address lists there are functions
  54
  55   realname, mailaddress = m.getaddr(name)
  56   list = m.getaddrlist(name)
  57
  58 where the latter returns a list of (realname, mailaddr) tuples.
  59
  60 There is also a method
  61
  62   time = m.getdate(name)
  63
  64 which parses a Date-like field and returns a time-compatible tuple,
  65 i.e. a tuple such as returned by time.localtime() or accepted by
  66 time.mktime().
  67
  68 See the class definition for lower level access methods.
  69
  70 There are also some utility functions here.
  71 """
  72 # Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
  73
  74 import time
  75
  76 __all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]  # names exported by "from <this module> import *"
  77
  78 _blanklines = ('\r\n', '\n')            # Lines that end the header section; built once so Message.islast() is a plain membership test
  79
  80
  81 class Message:
  82     """Represents a single RFC 2822-compliant message."""
  83
  84     def __init__(self, fp, seekable = 1):
  85         """Initialize the class instance and read the headers."""
  86         if seekable == 1:
  87             # Exercise tell() to make sure it works
  88             # (and then assume seek() works, too)
  89             try:
  90                 fp.tell()
  91             except (AttributeError, IOError):
  92                 seekable = 0
  93         self.fp = fp
  94         self.seekable = seekable
  95         self.startofheaders = None
  96         self.startofbody = None
  97         #
  98         if self.seekable:
  99             try:
 100                 self.startofheaders = self.fp.tell()
 101             except IOError:
 102                 self.seekable = 0
 103         #
 104         self.readheaders()
 105         #
 106         if self.seekable:
 107             try:
 108                 self.startofbody = self.fp.tell()
 109             except IOError:
 110                 self.seekable = 0
 111
 112     def rewindbody(self):
 113         """Rewind the file to the start of the body (if seekable)."""
 114         if not self.seekable:
 115             raise IOError, "unseekable file"
 116         self.fp.seek(self.startofbody)
 117
 118     def readheaders(self):
 119         """Read header lines.
 120
 121         Read header lines up to the entirely blank line that terminates them.
 122         The (normally blank) line that ends the headers is skipped, but not
 123         included in the returned list.  If a non-header line ends the headers,
 124         (which is an error), an attempt is made to backspace over it; it is
 125         never included in the returned list.
 126
 127         The variable self.status is set to the empty string if all went well,
 128         otherwise it is an error message.  The variable self.headers is a
 129         completely uninterpreted list of lines contained in the header (so
 130         printing them will reproduce the header exactly as it appears in the
 131         file).
 132         """
 133         self.dict = {}
 134         self.unixfrom = ''
 135         self.headers = lst = []
 136         self.status = ''
 137         headerseen = ""
 138         firstline = 1
 139         startofline = unread = tell = None
 140         if hasattr(self.fp, 'unread'):
 141             unread = self.fp.unread
 142         elif self.seekable:
 143             tell = self.fp.tell
 144         while 1:
 145             if tell:
 146                 try:
 147                     startofline = tell()
 148                 except IOError:
 149                     startofline = tell = None
 150                     self.seekable = 0
 151             line = self.fp.readline()
 152             if not line:
 153                 self.status = 'EOF in headers'
 154                 break
 155             # Skip unix From name time lines
 156             if firstline and line.startswith('From '):
 157                 self.unixfrom = self.unixfrom + line
 158                 continue
 159             firstline = 0
 160             if headerseen and line[0] in ' \t':
 161                 # It's a continuation line.
 162                 lst.append(line)
 163                 x = (self.dict[headerseen] + "\n " + line.strip())
 164                 self.dict[headerseen] = x.strip()
 165                 continue
 166             elif self.iscomment(line):
 167                 # It's a comment.  Ignore it.
 168                 continue
 169             elif self.islast(line):
 170                 # Note! No pushback here!  The delimiter line gets eaten.
 171                 break
 172             headerseen = self.isheader(line)
 173             if headerseen:
 174                 # It's a legal header line, save it.
 175                 lst.append(line)
 176                 self.dict[headerseen] = line[len(headerseen)+1:].strip()
 177                 continue
 178             else:
 179                 # It's not a header line; throw it back and stop here.
 180                 if not self.dict:
 181                     self.status = 'No headers'
 182                 else:
 183                     self.status = 'Non-header line where header expected'
 184                 # Try to undo the read.
 185                 if unread:
 186                     unread(line)
 187                 elif tell:
 188                     self.fp.seek(startofline)
 189                 else:
 190                     self.status = self.status + '; bad seek'
 191                 break
 192
 193     def isheader(self, line):
 194         """Determine whether a given line is a legal header.
 195
 196         This method should return the header name, suitably canonicalized.
 197         You may override this method in order to use Message parsing on tagged
 198         data in RFC 2822-like formats with special header formats.
 199         """
 200         i = line.find(':')
 201         if i > 0:
 202             return line[:i].lower()
 203         return None
 204
 205     def islast(self, line):
 206         """Determine whether a line is a legal end of RFC 2822 headers.
 207
 208         You may override this method if your application wants to bend the
 209         rules, e.g. to strip trailing whitespace, or to recognize MH template
 210         separators ('--------').  For convenience (e.g. for code reading from
 211         sockets) a line consisting of \r\n also matches.
 212         """
 213         return line in _blanklines
 214
 215     def iscomment(self, line):
 216         """Determine whether a line should be skipped entirely.
 217
 218         You may override this method in order to use Message parsing on tagged
 219         data in RFC 2822-like formats that support embedded comments or
 220         free-text data.
 221         """
 222         return False
 223
 224     def getallmatchingheaders(self, name):
 225         """Find all header lines matching a given header name.
 226
 227         Look through the list of headers and find all lines matching a given
 228         header name (and their continuation lines).  A list of the lines is
 229         returned, without interpretation.  If the header does not occur, an
 230         empty list is returned.  If the header occurs multiple times, all
 231         occurrences are returned.  Case is not important in the header name.
 232         """
 233         name = name.lower() + ':'
 234         n = len(name)
 235         lst = []
 236         hit = 0
 237         for line in self.headers:
 238             if line[:n].lower() == name:
 239                 hit = 1
 240             elif not line[:1].isspace():
 241                 hit = 0
 242             if hit:
 243                 lst.append(line)
 244         return lst
 245
 246     def getfirstmatchingheader(self, name):
 247         """Get the first header line matching name.
 248
 249         This is similar to getallmatchingheaders, but it returns only the
 250         first matching header (and its continuation lines).
 251         """
 252         name = name.lower() + ':'
 253         n = len(name)
 254         lst = []
 255         hit = 0
 256         for line in self.headers:
 257             if hit:
 258                 if not line[:1].isspace():
 259                     break
 260             elif line[:n].lower() == name:
 261                 hit = 1
 262             if hit:
 263                 lst.append(line)
 264         return lst
 265
 266     def getrawheader(self, name):
 267         """A higher-level interface to getfirstmatchingheader().
 268
 269         Return a string containing the literal text of the header but with the
 270         keyword stripped.  All leading, trailing and embedded whitespace is
 271         kept in the string, however.  Return None if the header does not
 272         occur.
 273         """
 274
 275         lst = self.getfirstmatchingheader(name)
 276         if not lst:
 277             return None
 278         lst[0] = lst[0][len(name) + 1:]
 279         return ''.join(lst)
 280
 281     def getheader(self, name, default=None):
 282         """Get the header value for a name.
 283
 284         This is the normal interface: it returns a stripped version of the
 285         header value for a given header name, or None if it doesn't exist.
 286         This uses the dictionary version which finds the *last* such header.
 287         """
 288         return self.dict.get(name.lower(), default)
 289     get = getheader
 290
 291     def getheaders(self, name):
 292         """Get all values for a header.
 293
 294         This returns a list of values for headers given more than once; each
 295         value in the result list is stripped in the same way as the result of
 296         getheader().  If the header is not given, return an empty list.
 297         """
 298         result = []
 299         current = ''
 300         have_header = 0
 301         for s in self.getallmatchingheaders(name):
 302             if s[0].isspace():
 303                 if current:
 304                     current = "%s\n %s" % (current, s.strip())
 305                 else:
 306                     current = s.strip()
 307             else:
 308                 if have_header:
 309                     result.append(current)
 310                 current = s[s.find(":") + 1:].strip()
 311                 have_header = 1
 312         if have_header:
 313             result.append(current)
 314         return result
 315
 316     def getaddr(self, name):
 317         """Get a single address from a header, as a tuple.
 318
 319         An example return value:
 320         ('Guido van Rossum', 'guido@cwi.nl')
 321         """
 322         # New, by Ben Escoto
 323         alist = self.getaddrlist(name)
 324         if alist:
 325             return alist[0]
 326         else:
 327             return (None, None)
 328
 329     def getaddrlist(self, name):
 330         """Get a list of addresses from a header.
 331
 332         Retrieves a list of addresses from a header, where each address is a
 333         tuple as returned by getaddr().  Scans all named headers, so it works
 334         properly with multiple To: or Cc: headers for example.
 335         """
 336         raw = []
 337         for h in self.getallmatchingheaders(name):
 338             if h[0] in ' \t':
 339                 raw.append(h)
 340             else:
 341                 if raw:
 342                     raw.append(', ')
 343                 i = h.find(':')
 344                 if i > 0:
 345                     addr = h[i+1:]
 346                 raw.append(addr)
 347         alladdrs = ''.join(raw)
 348         a = AddressList(alladdrs)
 349         return a.addresslist
 350
 351     def getdate(self, name):
 352         """Retrieve a date field from a header.
 353
 354         Retrieves a date field from the named header, returning a tuple
 355         compatible with time.mktime().
 356         """
 357         try:
 358             data = self[name]
 359         except KeyError:
 360             return None
 361         return parsedate(data)
 362
 363     def getdate_tz(self, name):
 364         """Retrieve a date field from a header as a 10-tuple.
 365
 366         The first 9 elements make up a tuple compatible with time.mktime(),
 367         and the 10th is the offset of the poster's time zone from GMT/UTC.
 368         """
 369         try:
 370             data = self[name]
 371         except KeyError:
 372             return None
 373         return parsedate_tz(data)
 374
 375
 376     # Access as a dictionary (only finds *last* header of each type):
 377
 378     def __len__(self):
 379         """Get the number of headers in a message."""
 380         return len(self.dict)
 381
 382     def __getitem__(self, name):
 383         """Get a specific header, as from a dictionary."""
 384         return self.dict[name.lower()]
 385
 386     def __setitem__(self, name, value):
 387         """Set the value of a header.
 388
 389         Note: This is not a perfect inversion of __getitem__, because any
 390         changed headers get stuck at the end of the raw-headers list rather
 391         than where the altered header was.
 392         """
 393         del self[name] # Won't fail if it doesn't exist
 394         self.dict[name.lower()] = value
 395         text = name + ": " + value
 396         for line in text.split("\n"):
 397             self.headers.append(line + "\n")
 398
 399     def __delitem__(self, name):
 400         """Delete all occurrences of a specific header, if it is present."""
 401         name = name.lower()
 402         if not name in self.dict:
 403             return
 404         del self.dict[name]
 405         name = name + ':'
 406         n = len(name)
 407         lst = []
 408         hit = 0
 409         for i in range(len(self.headers)):
 410             line = self.headers[i]
 411             if line[:n].lower() == name:
 412                 hit = 1
 413             elif not line[:1].isspace():
 414                 hit = 0
 415             if hit:
 416                 lst.append(i)
 417         for i in reversed(lst):
 418             del self.headers[i]
 419
 420     def setdefault(self, name, default=""):
 421         lowername = name.lower()
 422         if lowername in self.dict:
 423             return self.dict[lowername]
 424         else:
 425             text = name + ": " + default
 426             for line in text.split("\n"):
 427                 self.headers.append(line + "\n")
 428             self.dict[lowername] = default
 429             return default
 430
 431     def has_key(self, name):
 432         """Determine whether a message contains the named header."""
 433         return name.lower() in self.dict
 434
 435     def __contains__(self, name):
 436         """Determine whether a message contains the named header."""
 437         return name.lower() in self.dict
 438
 439     def __iter__(self):
 440         return iter(self.dict)
 441
 442     def keys(self):
 443         """Get all of a message's header field names."""
 444         return self.dict.keys()
 445
 446     def values(self):
 447         """Get all of a message's header field values."""
 448         return self.dict.values()
 449
 450     def items(self):
 451         """Get all of a message's headers.
 452
 453         Returns a list of name, value tuples.
 454         """
 455         return self.dict.items()
 456
 457     def __str__(self):
 458         return ''.join(self.headers)
 459
 460
 461 # Utility functions
 462 # -----------------
 463
 464 # XXX Should fix unquote() and quote() to be really conformant.
 465 # XXX The inverses of the parse functions may also be useful.
 466
 467
 468 def unquote(s):
 469     """Remove quotes from a string."""
 470     if len(s) > 1:
 471         if s.startswith('"') and s.endswith('"'):
 472             return s[1:-1].replace('\\\\', '\\').replace('\\"', '"')
 473         if s.startswith('<') and s.endswith('>'):
 474             return s[1:-1]
 475     return s
 476
 477
 478 def quote(s):
 479     """Add quotes around a string."""
 480     return s.replace('\\', '\\\\').replace('"', '\\"')
 481
 482
 483 def parseaddr(address):
 484     """Parse an address into a (realname, mailaddr) tuple."""
 485     a = AddressList(address)
 486     lst = a.addresslist
 487     if not lst:
 488         return (None, None)
 489     return lst[0]
 490
 491
 492 class AddrlistClass:
 493     """Address parser class by Ben Escoto.
 494
 495     To understand what this class does, it helps to have a copy of
 496     RFC 2822 in front of you.
 497
 498     http://www.faqs.org/rfcs/rfc2822.html
 499
 500     Note: this class interface is deprecated and may be removed in the future.
 501     Use rfc822.AddressList instead.
 502     """
 503
 504     def __init__(self, field):
 505         """Initialize a new instance.
 506
 507         `field' is an unparsed address header field, containing one or more
 508         addresses.
 509         """
 510         self.specials = '()<>@,:;.\"[]'
 511         self.pos = 0
 512         self.LWS = ' \t'
 513         self.CR = '\r\n'
 514         self.atomends = self.specials + self.LWS + self.CR
 515         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
 516         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
 517         # syntax, so allow dots in phrases.
 518         self.phraseends = self.atomends.replace('.', '')
 519         self.field = field
 520         self.commentlist = []
 521
 522     def gotonext(self):
 523         """Parse up to the start of the next address."""
 524         while self.pos < len(self.field):
 525             if self.field[self.pos] in self.LWS + '\n\r':
 526                 self.pos = self.pos + 1
 527             elif self.field[self.pos] == '(':
 528                 self.commentlist.append(self.getcomment())
 529             else: break
 530
 531     def getaddrlist(self):
 532         """Parse all addresses.
 533
 534         Returns a list containing all of the addresses.
 535         """
 536         result = []
 537         ad = self.getaddress()
 538         while ad:
 539             result += ad
 540             ad = self.getaddress()
 541         return result
 542
 543     def getaddress(self):
 544         """Parse the next address."""
 545         self.commentlist = []
 546         self.gotonext()
 547
 548         oldpos = self.pos
 549         oldcl = self.commentlist
 550         plist = self.getphraselist()
 551
 552         self.gotonext()
 553         returnlist = []
 554
 555         if self.pos >= len(self.field):
 556             # Bad email address technically, no domain.
 557             if plist:
 558                 returnlist = [(' '.join(self.commentlist), plist[0])]
 559
 560         elif self.field[self.pos] in '.@':
 561             # email address is just an addrspec
 562             # this isn't very efficient since we start over
 563             self.pos = oldpos
 564             self.commentlist = oldcl
 565             addrspec = self.getaddrspec()
 566             returnlist = [(' '.join(self.commentlist), addrspec)]
 567
 568         elif self.field[self.pos] == ':':
 569             # address is a group
 570             returnlist = []
 571
 572             fieldlen = len(self.field)
 573             self.pos += 1
 574             while self.pos < len(self.field):
 575                 self.gotonext()
 576                 if self.pos < fieldlen and self.field[self.pos] == ';':
 577                     self.pos += 1
 578                     break
 579                 returnlist = returnlist + self.getaddress()
 580
 581         elif self.field[self.pos] == '<':
 582             # Address is a phrase then a route addr
 583             routeaddr = self.getrouteaddr()
 584
 585             if self.commentlist:
 586                 returnlist = [(' '.join(plist) + ' (' + \
 587                          ' '.join(self.commentlist) + ')', routeaddr)]
 588             else: returnlist = [(' '.join(plist), routeaddr)]
 589
 590         else:
 591             if plist:
 592                 returnlist = [(' '.join(self.commentlist), plist[0])]
 593             elif self.field[self.pos] in self.specials:
 594                 self.pos += 1
 595
 596         self.gotonext()
 597         if self.pos < len(self.field) and self.field[self.pos] == ',':
 598             self.pos += 1
 599         return returnlist
 600
 601     def getrouteaddr(self):
 602         """Parse a route address (Return-path value).
 603
 604         This method just skips all the route stuff and returns the addrspec.
 605         """
 606         if self.field[self.pos] != '<':
 607             return
 608
 609         expectroute = 0
 610         self.pos += 1
 611         self.gotonext()
 612         adlist = ""
 613         while self.pos < len(self.field):
 614             if expectroute:
 615                 self.getdomain()
 616                 expectroute = 0
 617             elif self.field[self.pos] == '>':
 618                 self.pos += 1
 619                 break
 620             elif self.field[self.pos] == '@':
 621                 self.pos += 1
 622                 expectroute = 1
 623             elif self.field[self.pos] == ':':
 624                 self.pos += 1
 625             else:
 626                 adlist = self.getaddrspec()
 627                 self.pos += 1
 628                 break
 629             self.gotonext()
 630
 631         return adlist
 632
 633     def getaddrspec(self):
 634         """Parse an RFC 2822 addr-spec."""
 635         aslist = []
 636
 637         self.gotonext()
 638         while self.pos < len(self.field):
 639             if self.field[self.pos] == '.':
 640                 aslist.append('.')
 641                 self.pos += 1
 642             elif self.field[self.pos] == '"':
 643                 aslist.append('"%s"' % self.getquote())
 644             elif self.field[self.pos] in self.atomends:
 645                 break
 646             else: aslist.append(self.getatom())
 647             self.gotonext()
 648
 649         if self.pos >= len(self.field) or self.field[self.pos] != '@':
 650             return ''.join(aslist)
 651
 652         aslist.append('@')
 653         self.pos += 1
 654         self.gotonext()
 655         return ''.join(aslist) + self.getdomain()
 656
 657     def getdomain(self):
 658         """Get the complete domain name from an address."""
 659         sdlist = []
 660         while self.pos < len(self.field):
 661             if self.field[self.pos] in self.LWS:
 662                 self.pos += 1
 663             elif self.field[self.pos] == '(':
 664                 self.commentlist.append(self.getcomment())
 665             elif self.field[self.pos] == '[':
 666                 sdlist.append(self.getdomainliteral())
 667             elif self.field[self.pos] == '.':
 668                 self.pos += 1
 669                 sdlist.append('.')
 670             elif self.field[self.pos] in self.atomends:
 671                 break
 672             else: sdlist.append(self.getatom())
 673         return ''.join(sdlist)
 674
 675     def getdelimited(self, beginchar, endchars, allowcomments = 1):
 676         """Parse a header fragment delimited by special characters.
 677
 678         `beginchar' is the start character for the fragment.  If self is not
 679         looking at an instance of `beginchar' then getdelimited returns the
 680         empty string.
 681
 682         `endchars' is a sequence of allowable end-delimiting characters.
 683         Parsing stops when one of these is encountered.
 684
 685         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
 686         within the parsed fragment.
 687         """
 688         if self.field[self.pos] != beginchar:
 689             return ''
 690
 691         slist = ['']
 692         quote = 0
 693         self.pos += 1
 694         while self.pos < len(self.field):
 695             if quote == 1:
 696                 slist.append(self.field[self.pos])
 697                 quote = 0
 698             elif self.field[self.pos] in endchars:
 699                 self.pos += 1
 700                 break
 701             elif allowcomments and self.field[self.pos] == '(':
 702                 slist.append(self.getcomment())
 703                 continue        # have already advanced pos from getcomment
 704             elif self.field[self.pos] == '\\':
 705                 quote = 1
 706             else:
 707                 slist.append(self.field[self.pos])
 708             self.pos += 1
 709
 710         return ''.join(slist)
 711
 712     def getquote(self):
 713         """Get a quote-delimited fragment from self's field."""
 714         return self.getdelimited('"', '"\r', 0)
 715
 716     def getcomment(self):
 717         """Get a parenthesis-delimited fragment from self's field."""
 718         return self.getdelimited('(', ')\r', 1)
 719
 720     def getdomainliteral(self):
 721         """Parse an RFC 2822 domain-literal."""
 722         return '[%s]' % self.getdelimited('[', ']\r', 0)
 723
 724     def getatom(self, atomends=None):
 725         """Parse an RFC 2822 atom.
 726
 727         Optional atomends specifies a different set of end token delimiters
 728         (the default is to use self.atomends).  This is used e.g. in
 729         getphraselist() since phrase endings must not include the `.' (which
 730         is legal in phrases)."""
 731         atomlist = ['']
 732         if atomends is None:
 733             atomends = self.atomends
 734
 735         while self.pos < len(self.field):
 736             if self.field[self.pos] in atomends:
 737                 break
 738             else: atomlist.append(self.field[self.pos])
 739             self.pos += 1
 740
 741         return ''.join(atomlist)
 742
 743     def getphraselist(self):
 744         """Parse a sequence of RFC 2822 phrases.
 745
 746         A phrase is a sequence of words, which are in turn either RFC 2822
 747         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
 748         runs of continuous whitespace into one space.
 749         """
 750         plist = []
 751
 752         while self.pos < len(self.field):
 753             if self.field[self.pos] in self.LWS:
 754                 self.pos += 1
 755             elif self.field[self.pos] == '"':
 756                 plist.append(self.getquote())
 757             elif self.field[self.pos] == '(':
 758                 self.commentlist.append(self.getcomment())
 759             elif self.field[self.pos] in self.phraseends:
 760                 break
 761             else:
 762                 plist.append(self.getatom(self.phraseends))
 763
 764         return plist
 765
 766 class AddressList(AddrlistClass):
 767     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
 768     def __init__(self, field):
 769         AddrlistClass.__init__(self, field)
 770         if field:
 771             self.addresslist = self.getaddrlist()
 772         else:
 773             self.addresslist = []
 774
 775     def __len__(self):
 776         return len(self.addresslist)
 777
 778     def __str__(self):
 779         return ", ".join(map(dump_address_pair, self.addresslist))
 780
 781     def __add__(self, other):
 782         # Set union
 783         newaddr = AddressList(None)
 784         newaddr.addresslist = self.addresslist[:]
 785         for x in other.addresslist:
 786             if not x in self.addresslist:
 787                 newaddr.addresslist.append(x)
 788         return newaddr
 789
 790     def __iadd__(self, other):
 791         # Set union, in-place
 792         for x in other.addresslist:
 793             if not x in self.addresslist:
 794                 self.addresslist.append(x)
 795         return self
 796
 797     def __sub__(self, other):
 798         # Set difference
 799         newaddr = AddressList(None)
 800         for x in self.addresslist:
 801             if not x in other.addresslist:
 802                 newaddr.addresslist.append(x)
 803         return newaddr
 804
 805     def __isub__(self, other):
 806         # Set difference, in-place
 807         for x in other.addresslist:
 808             if x in self.addresslist:
 809                 self.addresslist.remove(x)
 810         return self
 811
 812     def __getitem__(self, index):
 813         # Make indexing, slices, and 'in' work
 814         return self.addresslist[index]
 815
 816 def dump_address_pair(pair):
 817     """Dump a (name, address) pair in a canonicalized form."""
 818     if pair[0]:
 819         return '"' + pair[0] + '" <' + pair[1] + '>'
 820     else:
 821         return pair[1]
 822
 823 # Parse a date field
 824
 825 _monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
 826                'aug', 'sep', 'oct', 'nov', 'dec',  # three-letter abbreviations (indices 0-11)
 827                'january', 'february', 'march', 'april', 'may', 'june', 'july',
 828                'august', 'september', 'october', 'november', 'december']  # full names (indices 12-23); parsedate_tz() subtracts 12 when index+1 exceeds 12
 829 _daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']  # lower-case weekday abbreviations; parsedate_tz() drops a leading token found here
 830
 831 # The timezone table does not include the military time zones defined
 832 # in RFC822, other than Z.  According to RFC1123, the description in
 833 # RFC822 gets the signs wrong, so we can't rely on any such time
 834 # zones.  RFC1123 recommends that numeric timezone indicators be used
 835 # instead of timezone names.
 836
 837 _timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,  # NOTE(review): values look like signed HHMM offsets from UTC -- confirm against the code that reads this table
 838               'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
 839               'EST': -500, 'EDT': -400,  # Eastern
 840               'CST': -600, 'CDT': -500,  # Central
 841               'MST': -700, 'MDT': -600,  # Mountain
 842               'PST': -800, 'PDT': -700   # Pacific
 843               }
 844
 845
 846 def parsedate_tz(data):
 847     """Convert a date string to a time tuple.
 848
 849     Accounts for military timezones.
 850     """
 851     if not data:
 852         return None
 853     data = data.split()
 854     if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
 855         # There's a dayname here. Skip it
 856         del data[0]
 857     else:
 858         # no space after the "weekday,"?
 859         i = data[0].rfind(',')
 860         if i >= 0:
 861             data[0] = data[0][i+1:]
 862     if len(data) == 3: # RFC 850 date, deprecated
 863         stuff = data[0].split('-')
 864         if len(stuff) == 3:
 865             data = stuff + data[1:]
 866     if len(data) == 4:
 867         s = data[3]
 868         i = s.find('+')
 869         if i > 0:
 870             data[3:] = [s[:i], s[i+1:]]
 871         else:
 872             data.append('') # Dummy tz
 873     if len(data) < 5:
 874         return None
 875     data = data[:5]
 876     [dd, mm, yy, tm, tz] = data
 877     mm = mm.lower()
 878     if not mm in _monthnames:
 879         dd, mm = mm, dd.lower()
 880         if not mm in _monthnames:
 881             return None
 882     mm = _monthnames.index(mm)+1
 883     if mm > 12: mm = mm - 12
 884     if dd[-1] == ',':
 885         dd = dd[:-1]
 886     i = yy.find(':')
 887     if i > 0:
 888         yy, tm = tm, yy
 889     if yy[-1] == ',':
 890         yy = yy[:-1]
 891     if not yy[0].isdigit():
 892         yy, tz = tz, yy
 893     if tm[-1] == ',':
 894         tm = tm[:-1]
 895     tm = tm.split(':')
 896     if len(tm) == 2:
 897         [thh, tmm] = tm
 898         tss = '0'
 899     elif len(tm) == 3:
 900         [thh, tmm, tss] = tm
 901     else:
 902         return None
 903     try:
 904         yy = int(yy)
 905         dd = int(dd)
 906         thh = int(thh)
 907         tmm = int(tmm)
 908         tss = int(tss)
 909     except ValueError:
 910         return None
 911     tzoffset = None
 912     tz = tz.upper()
 913     if tz in _timezones:
 914         tzoffset = _timezones[tz]
 915     else:
 916         try:
 917             tzoffset = int(tz)
 918         except ValueError:
 919             pass
 920     # Convert a timezone offset into seconds ; -0500 -> -18000
 921     if tzoffset:
 922         if tzoffset < 0:
 923             tzsign = -1
 924             tzoffset = -tzoffset
 925         else:
 926             tzsign = 1
 927         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
 928     return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
 929
 930
 931 def parsedate(data):
 932     """Convert a time string to a time tuple."""
 933     t = parsedate_tz(data)
 934     if t is None:
 935         return t
 936     return t[:9]
 937
 938
 939 def mktime_tz(data):
 940     """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
 941     if data[9] is None:
 942         # No zone info, so localtime is better assumption than GMT
 943         return time.mktime(data[:8] + (-1,))
 944     else:
 945         t = time.mktime(data[:8] + (0,))
 946         return t - data[9] - time.timezone
 947
 948 def formatdate(timeval=None):
 949     """Returns time format preferred for Internet standards.
 950
 951     Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123
 952
 953     According to RFC 1123, day and month names must always be in
 954     English.  If not for that, this code could use strftime().  It
 955     can't because strftime() honors the locale and could generated
 956     non-English names.
 957     """
 958     if timeval is None:
 959         timeval = time.time()
 960     timeval = time.gmtime(timeval)
 961     return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
 962             ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[timeval[6]],
 963             timeval[2],
 964             ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
 965              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[timeval[1]-1],
 966                                 timeval[0], timeval[3], timeval[4], timeval[5])
 967
 968
 969 # When used as script, run a small test program.
 970 # The first command line argument must be a filename containing one
 971 # message in RFC-822 format.
 972
 973 if __name__ == '__main__':
 974     import sys, os
 975     file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
 976     if sys.argv[1:]: file = sys.argv[1]
 977     f = open(file, 'r')
 978     m = Message(f)
 979     print 'From:', m.getaddr('from')
 980     print 'To:', m.getaddrlist('to')
 981     print 'Subject:', m.getheader('subject')
 982     print 'Date:', m.getheader('date')
 983     date = m.getdate_tz('date')
 984     tz = date[-1]
 985     date = time.localtime(mktime_tz(date))
 986     if date:
 987         print 'ParsedDate:', time.asctime(date),
 988         hhmmss = tz
 989         hhmm, ss = divmod(hhmmss, 60)
 990         hh, mm = divmod(hhmm, 60)
 991         print "%+03d%02d" % (hh, mm),
 992         if ss: print ".%02d" % ss,
 993         print
 994     else:
 995         print 'ParsedDate:', None
 996     m.rewindbody()
 997     n = 0
 998     while f.readline():
 999         n += 1
1000     print 'Lines:', n
1001     print '-'*70
1002     print 'len =', len(m)
1003     if 'Date' in m: print 'Date =', m['Date']
1004     if 'X-Nonsense' in m: pass
1005     print 'keys =', m.keys()
1006     print 'values =', m.values()
1007     print 'items =', m.items()