Lib/rfc822.py

   1 """RFC 2822 message manipulation.
   2
   3 Note: This is only a very rough sketch of a full RFC-822 parser; in particular
   4 the tokenizing of addresses does not adhere to all the quoting rules.
   5
   6 Note: RFC 2822 is a long awaited update to RFC 822.  This module should
   7 conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
efforts at RFC 2822 updates have been made, but a thorough audit has not been
   9 performed.  Consider any RFC 2822 non-conformance to be a bug.
  10
  11     RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
  12     RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
  13
  14 Directions for use:
  15
  16 To create a Message object: first open a file, e.g.:
  17
  18   fp = open(file, 'r')
  19
  20 You can use any other legal way of getting an open file object, e.g. use
  21 sys.stdin or call os.popen().  Then pass the open file object to the Message()
  22 constructor:
  23
  24   m = Message(fp)
  25
  26 This class can work with any input object that supports a readline method.  If
  27 the input object has seek and tell capability, the rewindbody method will
  28 work; also illegal lines will be pushed back onto the input stream.  If the
  29 input object lacks seek but has an `unread' method that can push back a line
  30 of input, Message will use that to push back illegal lines.  Thus this class
  31 can be used to parse messages coming from a buffered stream.
  32
  33 The optional `seekable' argument is provided as a workaround for certain stdio
  34 libraries in which tell() discards buffered data before discovering that the
  35 lseek() system call doesn't work.  For maximum portability, you should set the
seekable argument to zero to prevent that initial tell() when passing in
an unseekable object such as a file object created from a socket object.  If
  38 it is 1 on entry -- which it is by default -- the tell() method of the open
  39 file object is called once; if this raises an exception, seekable is reset to
  40 0.  For other nonzero values of seekable, this test is not made.
  41
  42 To get the text of a particular header there are several methods:
  43
  44   str = m.getheader(name)
  45   str = m.getrawheader(name)
  46
  47 where name is the name of the header, e.g. 'Subject'.  The difference is that
  48 getheader() strips the leading and trailing whitespace, while getrawheader()
  49 doesn't.  Both functions retain embedded whitespace (including newlines)
  50 exactly as they are specified in the header, and leave the case of the text
  51 unchanged.
  52
  53 For addresses and address lists there are functions
  54
  55   realname, mailaddress = m.getaddr(name)
  56   list = m.getaddrlist(name)
  57
  58 where the latter returns a list of (realname, mailaddr) tuples.
  59
  60 There is also a method
  61
  62   time = m.getdate(name)
  63
  64 which parses a Date-like field and returns a time-compatible tuple,
  65 i.e. a tuple such as returned by time.localtime() or accepted by
  66 time.mktime().
  67
  68 See the class definition for lower level access methods.
  69
  70 There are also some utility functions here.
  71 """
  72 # Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
  73
  74 import time
  75
# Names exported by "from rfc822 import *".
__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]

# The two line values that Message.islast() accepts as the end of the
# header section (a bare newline, or CRLF as seen when reading sockets).
_blanklines = ('\r\n', '\n')            # Optimization for islast()
  79
  80
  81 class Message:
  82     """Represents a single RFC 2822-compliant message."""
  83
  84     def __init__(self, fp, seekable = 1):
  85         """Initialize the class instance and read the headers."""
  86         if seekable == 1:
  87             # Exercise tell() to make sure it works
  88             # (and then assume seek() works, too)
  89             try:
  90                 fp.tell()
  91             except (AttributeError, IOError):
  92                 seekable = 0
  93         self.fp = fp
  94         self.seekable = seekable
  95         self.startofheaders = None
  96         self.startofbody = None
  97         #
  98         if self.seekable:
  99             try:
 100                 self.startofheaders = self.fp.tell()
 101             except IOError:
 102                 self.seekable = 0
 103         #
 104         self.readheaders()
 105         #
 106         if self.seekable:
 107             try:
 108                 self.startofbody = self.fp.tell()
 109             except IOError:
 110                 self.seekable = 0
 111
 112     def rewindbody(self):
 113         """Rewind the file to the start of the body (if seekable)."""
 114         if not self.seekable:
 115             raise IOError, "unseekable file"
 116         self.fp.seek(self.startofbody)
 117
 118     def readheaders(self):
 119         """Read header lines.
 120
 121         Read header lines up to the entirely blank line that terminates them.
 122         The (normally blank) line that ends the headers is skipped, but not
 123         included in the returned list.  If a non-header line ends the headers,
 124         (which is an error), an attempt is made to backspace over it; it is
 125         never included in the returned list.
 126
 127         The variable self.status is set to the empty string if all went well,
 128         otherwise it is an error message.  The variable self.headers is a
 129         completely uninterpreted list of lines contained in the header (so
 130         printing them will reproduce the header exactly as it appears in the
 131         file).
 132         """
 133         self.dict = {}
 134         self.unixfrom = ''
 135         self.headers = lst = []
 136         self.status = ''
 137         headerseen = ""
 138         firstline = 1
 139         startofline = unread = tell = None
 140         if hasattr(self.fp, 'unread'):
 141             unread = self.fp.unread
 142         elif self.seekable:
 143             tell = self.fp.tell
 144         while 1:
 145             if tell:
 146                 try:
 147                     startofline = tell()
 148                 except IOError:
 149                     startofline = tell = None
 150                     self.seekable = 0
 151             line = self.fp.readline()
 152             if not line:
 153                 self.status = 'EOF in headers'
 154                 break
 155             # Skip unix From name time lines
 156             if firstline and line.startswith('From '):
 157                 self.unixfrom = self.unixfrom + line
 158                 continue
 159             firstline = 0
 160             if headerseen and line[0] in ' \t':
 161                 # It's a continuation line.
 162                 lst.append(line)
 163                 x = (self.dict[headerseen] + "\n " + line.strip())
 164                 self.dict[headerseen] = x.strip()
 165                 continue
 166             elif self.iscomment(line):
 167                 # It's a comment.  Ignore it.
 168                 continue
 169             elif self.islast(line):
 170                 # Note! No pushback here!  The delimiter line gets eaten.
 171                 break
 172             headerseen = self.isheader(line)
 173             if headerseen:
 174                 # It's a legal header line, save it.
 175                 lst.append(line)
 176                 self.dict[headerseen] = line[len(headerseen)+1:].strip()
 177                 continue
 178             else:
 179                 # It's not a header line; throw it back and stop here.
 180                 if not self.dict:
 181                     self.status = 'No headers'
 182                 else:
 183                     self.status = 'Non-header line where header expected'
 184                 # Try to undo the read.
 185                 if unread:
 186                     unread(line)
 187                 elif tell:
 188                     self.fp.seek(startofline)
 189                 else:
 190                     self.status = self.status + '; bad seek'
 191                 break
 192
 193     def isheader(self, line):
 194         """Determine whether a given line is a legal header.
 195
 196         This method should return the header name, suitably canonicalized.
 197         You may override this method in order to use Message parsing on tagged
 198         data in RFC 2822-like formats with special header formats.
 199         """
 200         i = line.find(':')
 201         if i > 0:
 202             return line[:i].lower()
 203         return None
 204
 205     def islast(self, line):
 206         """Determine whether a line is a legal end of RFC 2822 headers.
 207
 208         You may override this method if your application wants to bend the
 209         rules, e.g. to strip trailing whitespace, or to recognize MH template
 210         separators ('--------').  For convenience (e.g. for code reading from
 211         sockets) a line consisting of \r\n also matches.
 212         """
 213         return line in _blanklines
 214
 215     def iscomment(self, line):
 216         """Determine whether a line should be skipped entirely.
 217
 218         You may override this method in order to use Message parsing on tagged
 219         data in RFC 2822-like formats that support embedded comments or
 220         free-text data.
 221         """
 222         return False
 223
 224     def getallmatchingheaders(self, name):
 225         """Find all header lines matching a given header name.
 226
 227         Look through the list of headers and find all lines matching a given
 228         header name (and their continuation lines).  A list of the lines is
 229         returned, without interpretation.  If the header does not occur, an
 230         empty list is returned.  If the header occurs multiple times, all
 231         occurrences are returned.  Case is not important in the header name.
 232         """
 233         name = name.lower() + ':'
 234         n = len(name)
 235         lst = []
 236         hit = 0
 237         for line in self.headers:
 238             if line[:n].lower() == name:
 239                 hit = 1
 240             elif not line[:1].isspace():
 241                 hit = 0
 242             if hit:
 243                 lst.append(line)
 244         return lst
 245
 246     def getfirstmatchingheader(self, name):
 247         """Get the first header line matching name.
 248
 249         This is similar to getallmatchingheaders, but it returns only the
 250         first matching header (and its continuation lines).
 251         """
 252         name = name.lower() + ':'
 253         n = len(name)
 254         lst = []
 255         hit = 0
 256         for line in self.headers:
 257             if hit:
 258                 if not line[:1].isspace():
 259                     break
 260             elif line[:n].lower() == name:
 261                 hit = 1
 262             if hit:
 263                 lst.append(line)
 264         return lst
 265
 266     def getrawheader(self, name):
 267         """A higher-level interface to getfirstmatchingheader().
 268
 269         Return a string containing the literal text of the header but with the
 270         keyword stripped.  All leading, trailing and embedded whitespace is
 271         kept in the string, however.  Return None if the header does not
 272         occur.
 273         """
 274
 275         lst = self.getfirstmatchingheader(name)
 276         if not lst:
 277             return None
 278         lst[0] = lst[0][len(name) + 1:]
 279         return ''.join(lst)
 280
 281     def getheader(self, name, default=None):
 282         """Get the header value for a name.
 283
 284         This is the normal interface: it returns a stripped version of the
 285         header value for a given header name, or None if it doesn't exist.
 286         This uses the dictionary version which finds the *last* such header.
 287         """
 288         return self.dict.get(name.lower(), default)
 289     get = getheader
 290
 291     def getheaders(self, name):
 292         """Get all values for a header.
 293
 294         This returns a list of values for headers given more than once; each
 295         value in the result list is stripped in the same way as the result of
 296         getheader().  If the header is not given, return an empty list.
 297         """
 298         result = []
 299         current = ''
 300         have_header = 0
 301         for s in self.getallmatchingheaders(name):
 302             if s[0].isspace():
 303                 if current:
 304                     current = "%s\n %s" % (current, s.strip())
 305                 else:
 306                     current = s.strip()
 307             else:
 308                 if have_header:
 309                     result.append(current)
 310                 current = s[s.find(":") + 1:].strip()
 311                 have_header = 1
 312         if have_header:
 313             result.append(current)
 314         return result
 315
 316     def getaddr(self, name):
 317         """Get a single address from a header, as a tuple.
 318
 319         An example return value:
 320         ('Guido van Rossum', 'guido@cwi.nl')
 321         """
 322         # New, by Ben Escoto
 323         alist = self.getaddrlist(name)
 324         if alist:
 325             return alist[0]
 326         else:
 327             return (None, None)
 328
 329     def getaddrlist(self, name):
 330         """Get a list of addresses from a header.
 331
 332         Retrieves a list of addresses from a header, where each address is a
 333         tuple as returned by getaddr().  Scans all named headers, so it works
 334         properly with multiple To: or Cc: headers for example.
 335         """
 336         raw = []
 337         for h in self.getallmatchingheaders(name):
 338             if h[0] in ' \t':
 339                 raw.append(h)
 340             else:
 341                 if raw:
 342                     raw.append(', ')
 343                 i = h.find(':')
 344                 if i > 0:
 345                     addr = h[i+1:]
 346                 raw.append(addr)
 347         alladdrs = ''.join(raw)
 348         a = AddressList(alladdrs)
 349         return a.addresslist
 350
 351     def getdate(self, name):
 352         """Retrieve a date field from a header.
 353
 354         Retrieves a date field from the named header, returning a tuple
 355         compatible with time.mktime().
 356         """
 357         try:
 358             data = self[name]
 359         except KeyError:
 360             return None
 361         return parsedate(data)
 362
 363     def getdate_tz(self, name):
 364         """Retrieve a date field from a header as a 10-tuple.
 365
 366         The first 9 elements make up a tuple compatible with time.mktime(),
 367         and the 10th is the offset of the poster's time zone from GMT/UTC.
 368         """
 369         try:
 370             data = self[name]
 371         except KeyError:
 372             return None
 373         return parsedate_tz(data)
 374
 375
 376     # Access as a dictionary (only finds *last* header of each type):
 377
 378     def __len__(self):
 379         """Get the number of headers in a message."""
 380         return len(self.dict)
 381
 382     def __getitem__(self, name):
 383         """Get a specific header, as from a dictionary."""
 384         return self.dict[name.lower()]
 385
 386     def __setitem__(self, name, value):
 387         """Set the value of a header.
 388
 389         Note: This is not a perfect inversion of __getitem__, because any
 390         changed headers get stuck at the end of the raw-headers list rather
 391         than where the altered header was.
 392         """
 393         del self[name] # Won't fail if it doesn't exist
 394         self.dict[name.lower()] = value
 395         text = name + ": " + value
 396         for line in text.split("\n"):
 397             self.headers.append(line + "\n")
 398
 399     def __delitem__(self, name):
 400         """Delete all occurrences of a specific header, if it is present."""
 401         name = name.lower()
 402         if not name in self.dict:
 403             return
 404         del self.dict[name]
 405         name = name + ':'
 406         n = len(name)
 407         lst = []
 408         hit = 0
 409         for i in range(len(self.headers)):
 410             line = self.headers[i]
 411             if line[:n].lower() == name:
 412                 hit = 1
 413             elif not line[:1].isspace():
 414                 hit = 0
 415             if hit:
 416                 lst.append(i)
 417         for i in reversed(lst):
 418             del self.headers[i]
 419
 420     def setdefault(self, name, default=""):
 421         lowername = name.lower()
 422         if lowername in self.dict:
 423             return self.dict[lowername]
 424         else:
 425             text = name + ": " + default
 426             for line in text.split("\n"):
 427                 self.headers.append(line + "\n")
 428             self.dict[lowername] = default
 429             return default
 430
 431     def has_key(self, name):
 432         """Determine whether a message contains the named header."""
 433         return name.lower() in self.dict
 434
 435     def __contains__(self, name):
 436         """Determine whether a message contains the named header."""
 437         return name.lower() in self.dict
 438
 439     def __iter__(self):
 440         return iter(self.dict)
 441
 442     def keys(self):
 443         """Get all of a message's header field names."""
 444         return self.dict.keys()
 445
 446     def values(self):
 447         """Get all of a message's header field values."""
 448         return self.dict.values()
 449
 450     def items(self):
 451         """Get all of a message's headers.
 452
 453         Returns a list of name, value tuples.
 454         """
 455         return self.dict.items()
 456
 457     def __str__(self):
 458         return ''.join(self.headers)
 459
 460
 461 # Utility functions
 462 # -----------------
 463
 464 # XXX Should fix unquote() and quote() to be really conformant.
 465 # XXX The inverses of the parse functions may also be useful.
 466
 467
 468 def unquote(s):
 469     """Remove quotes from a string."""
 470     if len(s) > 1:
 471         if s.startswith('"') and s.endswith('"'):
 472             return s[1:-1].replace('\\\\', '\\').replace('\\"', '"')
 473         if s.startswith('<') and s.endswith('>'):
 474             return s[1:-1]
 475     return s
 476
 477
 478 def quote(s):
 479     """Add quotes around a string."""
 480     return s.replace('\\', '\\\\').replace('"', '\\"')
 481
 482
 483 def parseaddr(address):
 484     """Parse an address into a (realname, mailaddr) tuple."""
 485     a = AddressList(address)
 486     lst = a.addresslist
 487     if not lst:
 488         return (None, None)
 489     return lst[0]
 490
 491
 492 class AddrlistClass:
 493     """Address parser class by Ben Escoto.
 494
 495     To understand what this class does, it helps to have a copy of
 496     RFC 2822 in front of you.
 497
 498     http://www.faqs.org/rfcs/rfc2822.html
 499
 500     Note: this class interface is deprecated and may be removed in the future.
 501     Use rfc822.AddressList instead.
 502     """
 503
 504     def __init__(self, field):
 505         """Initialize a new instance.
 506
 507         `field' is an unparsed address header field, containing one or more
 508         addresses.
 509         """
 510         self.specials = '()<>@,:;.\"[]'
 511         self.pos = 0
 512         self.LWS = ' \t'
 513         self.CR = '\r\n'
 514         self.atomends = self.specials + self.LWS + self.CR
 515         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
 516         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
 517         # syntax, so allow dots in phrases.
 518         self.phraseends = self.atomends.replace('.', '')
 519         self.field = field
 520         self.commentlist = []
 521
 522     def gotonext(self):
 523         """Parse up to the start of the next address."""
 524         while self.pos < len(self.field):
 525             if self.field[self.pos] in self.LWS + '\n\r':
 526                 self.pos = self.pos + 1
 527             elif self.field[self.pos] == '(':
 528                 self.commentlist.append(self.getcomment())
 529             else: break
 530
 531     def getaddrlist(self):
 532         """Parse all addresses.
 533
 534         Returns a list containing all of the addresses.
 535         """
 536         result = []
 537         ad = self.getaddress()
 538         while ad:
 539             result += ad
 540             ad = self.getaddress()
 541         return result
 542
 543     def getaddress(self):
 544         """Parse the next address."""
 545         self.commentlist = []
 546         self.gotonext()
 547
 548         oldpos = self.pos
 549         oldcl = self.commentlist
 550         plist = self.getphraselist()
 551
 552         self.gotonext()
 553         returnlist = []
 554
 555         if self.pos >= len(self.field):
 556             # Bad email address technically, no domain.
 557             if plist:
 558                 returnlist = [(' '.join(self.commentlist), plist[0])]
 559
 560         elif self.field[self.pos] in '.@':
 561             # email address is just an addrspec
 562             # this isn't very efficient since we start over
 563             self.pos = oldpos
 564             self.commentlist = oldcl
 565             addrspec = self.getaddrspec()
 566             returnlist = [(' '.join(self.commentlist), addrspec)]
 567
 568         elif self.field[self.pos] == ':':
 569             # address is a group
 570             returnlist = []
 571
 572             fieldlen = len(self.field)
 573             self.pos += 1
 574             while self.pos < len(self.field):
 575                 self.gotonext()
 576                 if self.pos < fieldlen and self.field[self.pos] == ';':
 577                     self.pos += 1
 578                     break
 579                 returnlist = returnlist + self.getaddress()
 580
 581         elif self.field[self.pos] == '<':
 582             # Address is a phrase then a route addr
 583             routeaddr = self.getrouteaddr()
 584
 585             if self.commentlist:
 586                 returnlist = [(' '.join(plist) + ' (' + \
 587                          ' '.join(self.commentlist) + ')', routeaddr)]
 588             else: returnlist = [(' '.join(plist), routeaddr)]
 589
 590         else:
 591             if plist:
 592                 returnlist = [(' '.join(self.commentlist), plist[0])]
 593             elif self.field[self.pos] in self.specials:
 594                 self.pos += 1
 595
 596         self.gotonext()
 597         if self.pos < len(self.field) and self.field[self.pos] == ',':
 598             self.pos += 1
 599         return returnlist
 600
 601     def getrouteaddr(self):
 602         """Parse a route address (Return-path value).
 603
 604         This method just skips all the route stuff and returns the addrspec.
 605         """
 606         if self.field[self.pos] != '<':
 607             return
 608
 609         expectroute = 0
 610         self.pos += 1
 611         self.gotonext()
 612         adlist = ""
 613         while self.pos < len(self.field):
 614             if expectroute:
 615                 self.getdomain()
 616                 expectroute = 0
 617             elif self.field[self.pos] == '>':
 618                 self.pos += 1
 619                 break
 620             elif self.field[self.pos] == '@':
 621                 self.pos += 1
 622                 expectroute = 1
 623             elif self.field[self.pos] == ':':
 624                 self.pos += 1
 625             else:
 626                 adlist = self.getaddrspec()
 627                 self.pos += 1
 628                 break
 629             self.gotonext()
 630
 631         return adlist
 632
 633     def getaddrspec(self):
 634         """Parse an RFC 2822 addr-spec."""
 635         aslist = []
 636
 637         self.gotonext()
 638         while self.pos < len(self.field):
 639             if self.field[self.pos] == '.':
 640                 aslist.append('.')
 641                 self.pos += 1
 642             elif self.field[self.pos] == '"':
 643                 aslist.append('"%s"' % self.getquote())
 644             elif self.field[self.pos] in self.atomends:
 645                 break
 646             else: aslist.append(self.getatom())
 647             self.gotonext()
 648
 649         if self.pos >= len(self.field) or self.field[self.pos] != '@':
 650             return ''.join(aslist)
 651
 652         aslist.append('@')
 653         self.pos += 1
 654         self.gotonext()
 655         return ''.join(aslist) + self.getdomain()
 656
 657     def getdomain(self):
 658         """Get the complete domain name from an address."""
 659         sdlist = []
 660         while self.pos < len(self.field):
 661             if self.field[self.pos] in self.LWS:
 662                 self.pos += 1
 663             elif self.field[self.pos] == '(':
 664                 self.commentlist.append(self.getcomment())
 665             elif self.field[self.pos] == '[':
 666                 sdlist.append(self.getdomainliteral())
 667             elif self.field[self.pos] == '.':
 668                 self.pos += 1
 669                 sdlist.append('.')
 670             elif self.field[self.pos] in self.atomends:
 671                 break
 672             else: sdlist.append(self.getatom())
 673         return ''.join(sdlist)
 674
 675     def getdelimited(self, beginchar, endchars, allowcomments = 1):
 676         """Parse a header fragment delimited by special characters.
 677
 678         `beginchar' is the start character for the fragment.  If self is not
 679         looking at an instance of `beginchar' then getdelimited returns the
 680         empty string.
 681
 682         `endchars' is a sequence of allowable end-delimiting characters.
 683         Parsing stops when one of these is encountered.
 684
 685         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
 686         within the parsed fragment.
 687         """
 688         if self.field[self.pos] != beginchar:
 689             return ''
 690
 691         slist = ['']
 692         quote = 0
 693         self.pos += 1
 694         while self.pos < len(self.field):
 695             if quote == 1:
 696                 slist.append(self.field[self.pos])
 697                 quote = 0
 698             elif self.field[self.pos] in endchars:
 699                 self.pos += 1
 700                 break
 701             elif allowcomments and self.field[self.pos] == '(':
 702                 slist.append(self.getcomment())
 703             elif self.field[self.pos] == '\\':
 704                 quote = 1
 705             else:
 706                 slist.append(self.field[self.pos])
 707             self.pos += 1
 708
 709         return ''.join(slist)
 710
 711     def getquote(self):
 712         """Get a quote-delimited fragment from self's field."""
 713         return self.getdelimited('"', '"\r', 0)
 714
 715     def getcomment(self):
 716         """Get a parenthesis-delimited fragment from self's field."""
 717         return self.getdelimited('(', ')\r', 1)
 718
 719     def getdomainliteral(self):
 720         """Parse an RFC 2822 domain-literal."""
 721         return '[%s]' % self.getdelimited('[', ']\r', 0)
 722
 723     def getatom(self, atomends=None):
 724         """Parse an RFC 2822 atom.
 725
 726         Optional atomends specifies a different set of end token delimiters
 727         (the default is to use self.atomends).  This is used e.g. in
 728         getphraselist() since phrase endings must not include the `.' (which
 729         is legal in phrases)."""
 730         atomlist = ['']
 731         if atomends is None:
 732             atomends = self.atomends
 733
 734         while self.pos < len(self.field):
 735             if self.field[self.pos] in atomends:
 736                 break
 737             else: atomlist.append(self.field[self.pos])
 738             self.pos += 1
 739
 740         return ''.join(atomlist)
 741
 742     def getphraselist(self):
 743         """Parse a sequence of RFC 2822 phrases.
 744
 745         A phrase is a sequence of words, which are in turn either RFC 2822
 746         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
 747         runs of continuous whitespace into one space.
 748         """
 749         plist = []
 750
 751         while self.pos < len(self.field):
 752             if self.field[self.pos] in self.LWS:
 753                 self.pos += 1
 754             elif self.field[self.pos] == '"':
 755                 plist.append(self.getquote())
 756             elif self.field[self.pos] == '(':
 757                 self.commentlist.append(self.getcomment())
 758             elif self.field[self.pos] in self.phraseends:
 759                 break
 760             else:
 761                 plist.append(self.getatom(self.phraseends))
 762
 763         return plist
 764
 765 class AddressList(AddrlistClass):
 766     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
 767     def __init__(self, field):
 768         AddrlistClass.__init__(self, field)
 769         if field:
 770             self.addresslist = self.getaddrlist()
 771         else:
 772             self.addresslist = []
 773
 774     def __len__(self):
 775         return len(self.addresslist)
 776
 777     def __str__(self):
 778         return ", ".join(map(dump_address_pair, self.addresslist))
 779
 780     def __add__(self, other):
 781         # Set union
 782         newaddr = AddressList(None)
 783         newaddr.addresslist = self.addresslist[:]
 784         for x in other.addresslist:
 785             if not x in self.addresslist:
 786                 newaddr.addresslist.append(x)
 787         return newaddr
 788
 789     def __iadd__(self, other):
 790         # Set union, in-place
 791         for x in other.addresslist:
 792             if not x in self.addresslist:
 793                 self.addresslist.append(x)
 794         return self
 795
 796     def __sub__(self, other):
 797         # Set difference
 798         newaddr = AddressList(None)
 799         for x in self.addresslist:
 800             if not x in other.addresslist:
 801                 newaddr.addresslist.append(x)
 802         return newaddr
 803
 804     def __isub__(self, other):
 805         # Set difference, in-place
 806         for x in other.addresslist:
 807             if x in self.addresslist:
 808                 self.addresslist.remove(x)
 809         return self
 810
 811     def __getitem__(self, index):
 812         # Make indexing, slices, and 'in' work
 813         return self.addresslist[index]
 814
 815 def dump_address_pair(pair):
 816     """Dump a (name, address) pair in a canonicalized form."""
 817     if pair[0]:
 818         return '"' + pair[0] + '" <' + pair[1] + '>'
 819     else:
 820         return pair[1]
 821
 822 # Parse a date field
 823
# Lowercase month names: the twelve three-letter abbreviations followed by
# the twelve full names.  parsedate_tz() turns a name into a month number
# with index()+1 and subtracts 12 when the result is above 12, i.e. when a
# full name matched.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
# Lowercase day-of-week abbreviations; parsedate_tz() uses them to spot and
# drop a leading day name.
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
 829
# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

# Maps an upper-case timezone abbreviation to its offset from UTC.
# NOTE(review): the values look like signed HHMM figures (e.g. -500 for
# EST, five hours behind) -- confirm against the code that reads the table.
_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }
 843
 844
 845 def parsedate_tz(data):
 846     """Convert a date string to a time tuple.
 847
 848     Accounts for military timezones.
 849     """
 850     if not data:
 851         return None
 852     data = data.split()
 853     if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
 854         # There's a dayname here. Skip it
 855         del data[0]
 856     if len(data) == 3: # RFC 850 date, deprecated
 857         stuff = data[0].split('-')
 858         if len(stuff) == 3:
 859             data = stuff + data[1:]
 860     if len(data) == 4:
 861         s = data[3]
 862         i = s.find('+')
 863         if i > 0:
 864             data[3:] = [s[:i], s[i+1:]]
 865         else:
 866             data.append('') # Dummy tz
 867     if len(data) < 5:
 868         return None
 869     data = data[:5]
 870     [dd, mm, yy, tm, tz] = data
 871     mm = mm.lower()
 872     if not mm in _monthnames:
 873         dd, mm = mm, dd.lower()
 874         if not mm in _monthnames:
 875             return None
 876     mm = _monthnames.index(mm)+1
 877     if mm > 12: mm = mm - 12
 878     if dd[-1] == ',':
 879         dd = dd[:-1]
 880     i = yy.find(':')
 881     if i > 0:
 882         yy, tm = tm, yy
 883     if yy[-1] == ',':
 884         yy = yy[:-1]
 885     if not yy[0].isdigit():
 886         yy, tz = tz, yy
 887     if tm[-1] == ',':
 888         tm = tm[:-1]
 889     tm = tm.split(':')
 890     if len(tm) == 2:
 891         [thh, tmm] = tm
 892         tss = '0'
 893     elif len(tm) == 3:
 894         [thh, tmm, tss] = tm
 895     else:
 896         return None
 897     try:
 898         yy = int(yy)
 899         dd = int(dd)
 900         thh = int(thh)
 901         tmm = int(tmm)
 902         tss = int(tss)
 903     except ValueError:
 904         return None
 905     tzoffset = None
 906     tz = tz.upper()
 907     if tz in _timezones:
 908         tzoffset = _timezones[tz]
 909     else:
 910         try:
 911             tzoffset = int(tz)
 912         except ValueError:
 913             pass
 914     # Convert a timezone offset into seconds ; -0500 -> -18000
 915     if tzoffset:
 916         if tzoffset < 0:
 917             tzsign = -1
 918             tzoffset = -tzoffset
 919         else:
 920             tzsign = 1
 921         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
 922     return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
 923
 924
 925 def parsedate(data):
 926     """Convert a time string to a time tuple."""
 927     t = parsedate_tz(data)
 928     if t is None:
 929         return t
 930     return t[:9]
 931
 932
 933 def mktime_tz(data):
 934     """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
 935     if data[9] is None:
 936         # No zone info, so localtime is better assumption than GMT
 937         return time.mktime(data[:8] + (-1,))
 938     else:
 939         t = time.mktime(data[:8] + (0,))
 940         return t - data[9] - time.timezone
 941
 942 def formatdate(timeval=None):
 943     """Returns time format preferred for Internet standards.
 944
 945     Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123
 946
 947     According to RFC 1123, day and month names must always be in
 948     English.  If not for that, this code could use strftime().  It
 949     can't because strftime() honors the locale and could generated
 950     non-English names.
 951     """
 952     if timeval is None:
 953         timeval = time.time()
 954     timeval = time.gmtime(timeval)
 955     return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
 956             ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[timeval[6]],
 957             timeval[2],
 958             ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
 959              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[timeval[1]-1],
 960                                 timeval[0], timeval[3], timeval[4], timeval[5])
 961
 962
# When used as script, run a small test program.
# The first command line argument, if given, names a file containing one
# message in RFC-822 format; otherwise $HOME/Mail/inbox/1 is read.
 966
 967 if __name__ == '__main__':
 968     import sys, os
 969     file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
 970     if sys.argv[1:]: file = sys.argv[1]
 971     f = open(file, 'r')
 972     m = Message(f)
 973     print 'From:', m.getaddr('from')
 974     print 'To:', m.getaddrlist('to')
 975     print 'Subject:', m.getheader('subject')
 976     print 'Date:', m.getheader('date')
 977     date = m.getdate_tz('date')
 978     tz = date[-1]
 979     date = time.localtime(mktime_tz(date))
 980     if date:
 981         print 'ParsedDate:', time.asctime(date),
 982         hhmmss = tz
 983         hhmm, ss = divmod(hhmmss, 60)
 984         hh, mm = divmod(hhmm, 60)
 985         print "%+03d%02d" % (hh, mm),
 986         if ss: print ".%02d" % ss,
 987         print
 988     else:
 989         print 'ParsedDate:', None
 990     m.rewindbody()
 991     n = 0
 992     while f.readline():
 993         n += 1
 994     print 'Lines:', n
 995     print '-'*70
 996     print 'len =', len(m)
 997     if 'Date' in m: print 'Date =', m['Date']
 998     if 'X-Nonsense' in m: pass
 999     print 'keys =', m.keys()
1000     print 'values =', m.values()
1001     print 'items =', m.items()