Lib/email/_parseaddr.py

   1 # Copyright (C) 2002-2007 Python Software Foundation
   2 # Contact: email-sig@python.org
   3
   4 """Email address parsing code.
   5
   6 Lifted directly from rfc822.py.  This should eventually be rewritten.
   7 """
   8
   9 __all__ = [
  10     'mktime_tz',
  11     'parsedate',
  12     'parsedate_tz',
  13     'quote',
  14     ]
  15
  16 import time
  17
  18 SPACE = ' '
  19 EMPTYSTRING = ''
  20 COMMASPACE = ', '
  21
  22 # Parse a date field
  23 _monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
  24                'aug', 'sep', 'oct', 'nov', 'dec',
  25                'january', 'february', 'march', 'april', 'may', 'june', 'july',
  26                'august', 'september', 'october', 'november', 'december']
  27
  28 _daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
  29
  30 # The timezone table does not include the military time zones defined
  31 # in RFC822, other than Z.  According to RFC1123, the description in
  32 # RFC822 gets the signs wrong, so we can't rely on any such time
  33 # zones.  RFC1123 recommends that numeric timezone indicators be used
  34 # instead of timezone names.
  35
  36 _timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
  37               'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
  38               'EST': -500, 'EDT': -400,  # Eastern
  39               'CST': -600, 'CDT': -500,  # Central
  40               'MST': -700, 'MDT': -600,  # Mountain
  41               'PST': -800, 'PDT': -700   # Pacific
  42               }
  43
  44
  45 def parsedate_tz(data):
  46     """Convert a date string to a time tuple.
  47
  48     Accounts for military timezones.
  49     """
  50     data = data.split()
  51     # The FWS after the comma after the day-of-week is optional, so search and
  52     # adjust for this.
  53     if data[0].endswith(',') or data[0].lower() in _daynames:
  54         # There's a dayname here. Skip it
  55         del data[0]
  56     else:
  57         i = data[0].rfind(',')
  58         if i >= 0:
  59             data[0] = data[0][i+1:]
  60     if len(data) == 3: # RFC 850 date, deprecated
  61         stuff = data[0].split('-')
  62         if len(stuff) == 3:
  63             data = stuff + data[1:]
  64     if len(data) == 4:
  65         s = data[3]
  66         i = s.find('+')
  67         if i > 0:
  68             data[3:] = [s[:i], s[i+1:]]
  69         else:
  70             data.append('') # Dummy tz
  71     if len(data) < 5:
  72         return None
  73     data = data[:5]
  74     [dd, mm, yy, tm, tz] = data
  75     mm = mm.lower()
  76     if mm not in _monthnames:
  77         dd, mm = mm, dd.lower()
  78         if mm not in _monthnames:
  79             return None
  80     mm = _monthnames.index(mm) + 1
  81     if mm > 12:
  82         mm -= 12
  83     if dd[-1] == ',':
  84         dd = dd[:-1]
  85     i = yy.find(':')
  86     if i > 0:
  87         yy, tm = tm, yy
  88     if yy[-1] == ',':
  89         yy = yy[:-1]
  90     if not yy[0].isdigit():
  91         yy, tz = tz, yy
  92     if tm[-1] == ',':
  93         tm = tm[:-1]
  94     tm = tm.split(':')
  95     if len(tm) == 2:
  96         [thh, tmm] = tm
  97         tss = '0'
  98     elif len(tm) == 3:
  99         [thh, tmm, tss] = tm
 100     else:
 101         return None
 102     try:
 103         yy = int(yy)
 104         dd = int(dd)
 105         thh = int(thh)
 106         tmm = int(tmm)
 107         tss = int(tss)
 108     except ValueError:
 109         return None
 110     tzoffset = None
 111     tz = tz.upper()
 112     if tz in _timezones:
 113         tzoffset = _timezones[tz]
 114     else:
 115         try:
 116             tzoffset = int(tz)
 117         except ValueError:
 118             pass
 119     # Convert a timezone offset into seconds ; -0500 -> -18000
 120     if tzoffset:
 121         if tzoffset < 0:
 122             tzsign = -1
 123             tzoffset = -tzoffset
 124         else:
 125             tzsign = 1
 126         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
 127     # Daylight Saving Time flag is set to -1, since DST is unknown.
 128     return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
 129
 130
 131 def parsedate(data):
 132     """Convert a time string to a time tuple."""
 133     t = parsedate_tz(data)
 134     if isinstance(t, tuple):
 135         return t[:9]
 136     else:
 137         return t
 138
 139
 140 def mktime_tz(data):
 141     """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
 142     if data[9] is None:
 143         # No zone info, so localtime is better assumption than GMT
 144         return time.mktime(data[:8] + (-1,))
 145     else:
 146         t = time.mktime(data[:8] + (0,))
 147         return t - data[9] - time.timezone
 148
 149
 150 def quote(str):
 151     """Add quotes around a string."""
 152     return str.replace('\\', '\\\\').replace('"', '\\"')
 153
 154
 155 class AddrlistClass:
 156     """Address parser class by Ben Escoto.
 157
 158     To understand what this class does, it helps to have a copy of RFC 2822 in
 159     front of you.
 160
 161     Note: this class interface is deprecated and may be removed in the future.
 162     Use rfc822.AddressList instead.
 163     """
 164
 165     def __init__(self, field):
 166         """Initialize a new instance.
 167
 168         `field' is an unparsed address header field, containing
 169         one or more addresses.
 170         """
 171         self.specials = '()<>@,:;.\"[]'
 172         self.pos = 0
 173         self.LWS = ' \t'
 174         self.CR = '\r\n'
 175         self.FWS = self.LWS + self.CR
 176         self.atomends = self.specials + self.LWS + self.CR
 177         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
 178         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
 179         # syntax, so allow dots in phrases.
 180         self.phraseends = self.atomends.replace('.', '')
 181         self.field = field
 182         self.commentlist = []
 183
 184     def gotonext(self):
 185         """Parse up to the start of the next address."""
 186         while self.pos < len(self.field):
 187             if self.field[self.pos] in self.LWS + '\n\r':
 188                 self.pos += 1
 189             elif self.field[self.pos] == '(':
 190                 self.commentlist.append(self.getcomment())
 191             else:
 192                 break
 193
 194     def getaddrlist(self):
 195         """Parse all addresses.
 196
 197         Returns a list containing all of the addresses.
 198         """
 199         result = []
 200         while self.pos < len(self.field):
 201             ad = self.getaddress()
 202             if ad:
 203                 result += ad
 204             else:
 205                 result.append(('', ''))
 206         return result
 207
 208     def getaddress(self):
 209         """Parse the next address."""
 210         self.commentlist = []
 211         self.gotonext()
 212
 213         oldpos = self.pos
 214         oldcl = self.commentlist
 215         plist = self.getphraselist()
 216
 217         self.gotonext()
 218         returnlist = []
 219
 220         if self.pos >= len(self.field):
 221             # Bad email address technically, no domain.
 222             if plist:
 223                 returnlist = [(SPACE.join(self.commentlist), plist[0])]
 224
 225         elif self.field[self.pos] in '.@':
 226             # email address is just an addrspec
 227             # this isn't very efficient since we start over
 228             self.pos = oldpos
 229             self.commentlist = oldcl
 230             addrspec = self.getaddrspec()
 231             returnlist = [(SPACE.join(self.commentlist), addrspec)]
 232
 233         elif self.field[self.pos] == ':':
 234             # address is a group
 235             returnlist = []
 236
 237             fieldlen = len(self.field)
 238             self.pos += 1
 239             while self.pos < len(self.field):
 240                 self.gotonext()
 241                 if self.pos < fieldlen and self.field[self.pos] == ';':
 242                     self.pos += 1
 243                     break
 244                 returnlist = returnlist + self.getaddress()
 245
 246         elif self.field[self.pos] == '<':
 247             # Address is a phrase then a route addr
 248             routeaddr = self.getrouteaddr()
 249
 250             if self.commentlist:
 251                 returnlist = [(SPACE.join(plist) + ' (' +
 252                                ' '.join(self.commentlist) + ')', routeaddr)]
 253             else:
 254                 returnlist = [(SPACE.join(plist), routeaddr)]
 255
 256         else:
 257             if plist:
 258                 returnlist = [(SPACE.join(self.commentlist), plist[0])]
 259             elif self.field[self.pos] in self.specials:
 260                 self.pos += 1
 261
 262         self.gotonext()
 263         if self.pos < len(self.field) and self.field[self.pos] == ',':
 264             self.pos += 1
 265         return returnlist
 266
 267     def getrouteaddr(self):
 268         """Parse a route address (Return-path value).
 269
 270         This method just skips all the route stuff and returns the addrspec.
 271         """
 272         if self.field[self.pos] != '<':
 273             return
 274
 275         expectroute = False
 276         self.pos += 1
 277         self.gotonext()
 278         adlist = ''
 279         while self.pos < len(self.field):
 280             if expectroute:
 281                 self.getdomain()
 282                 expectroute = False
 283             elif self.field[self.pos] == '>':
 284                 self.pos += 1
 285                 break
 286             elif self.field[self.pos] == '@':
 287                 self.pos += 1
 288                 expectroute = True
 289             elif self.field[self.pos] == ':':
 290                 self.pos += 1
 291             else:
 292                 adlist = self.getaddrspec()
 293                 self.pos += 1
 294                 break
 295             self.gotonext()
 296
 297         return adlist
 298
 299     def getaddrspec(self):
 300         """Parse an RFC 2822 addr-spec."""
 301         aslist = []
 302
 303         self.gotonext()
 304         while self.pos < len(self.field):
 305             if self.field[self.pos] == '.':
 306                 aslist.append('.')
 307                 self.pos += 1
 308             elif self.field[self.pos] == '"':
 309                 aslist.append('"%s"' % self.getquote())
 310             elif self.field[self.pos] in self.atomends:
 311                 break
 312             else:
 313                 aslist.append(self.getatom())
 314             self.gotonext()
 315
 316         if self.pos >= len(self.field) or self.field[self.pos] != '@':
 317             return EMPTYSTRING.join(aslist)
 318
 319         aslist.append('@')
 320         self.pos += 1
 321         self.gotonext()
 322         return EMPTYSTRING.join(aslist) + self.getdomain()
 323
 324     def getdomain(self):
 325         """Get the complete domain name from an address."""
 326         sdlist = []
 327         while self.pos < len(self.field):
 328             if self.field[self.pos] in self.LWS:
 329                 self.pos += 1
 330             elif self.field[self.pos] == '(':
 331                 self.commentlist.append(self.getcomment())
 332             elif self.field[self.pos] == '[':
 333                 sdlist.append(self.getdomainliteral())
 334             elif self.field[self.pos] == '.':
 335                 self.pos += 1
 336                 sdlist.append('.')
 337             elif self.field[self.pos] in self.atomends:
 338                 break
 339             else:
 340                 sdlist.append(self.getatom())
 341         return EMPTYSTRING.join(sdlist)
 342
 343     def getdelimited(self, beginchar, endchars, allowcomments=True):
 344         """Parse a header fragment delimited by special characters.
 345
 346         `beginchar' is the start character for the fragment.
 347         If self is not looking at an instance of `beginchar' then
 348         getdelimited returns the empty string.
 349
 350         `endchars' is a sequence of allowable end-delimiting characters.
 351         Parsing stops when one of these is encountered.
 352
 353         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
 354         within the parsed fragment.
 355         """
 356         if self.field[self.pos] != beginchar:
 357             return ''
 358
 359         slist = ['']
 360         quote = False
 361         self.pos += 1
 362         while self.pos < len(self.field):
 363             if quote:
 364                 slist.append(self.field[self.pos])
 365                 quote = False
 366             elif self.field[self.pos] in endchars:
 367                 self.pos += 1
 368                 break
 369             elif allowcomments and self.field[self.pos] == '(':
 370                 slist.append(self.getcomment())
 371                 continue        # have already advanced pos from getcomment
 372             elif self.field[self.pos] == '\\':
 373                 quote = True
 374             else:
 375                 slist.append(self.field[self.pos])
 376             self.pos += 1
 377
 378         return EMPTYSTRING.join(slist)
 379
 380     def getquote(self):
 381         """Get a quote-delimited fragment from self's field."""
 382         return self.getdelimited('"', '"\r', False)
 383
 384     def getcomment(self):
 385         """Get a parenthesis-delimited fragment from self's field."""
 386         return self.getdelimited('(', ')\r', True)
 387
 388     def getdomainliteral(self):
 389         """Parse an RFC 2822 domain-literal."""
 390         return '[%s]' % self.getdelimited('[', ']\r', False)
 391
 392     def getatom(self, atomends=None):
 393         """Parse an RFC 2822 atom.
 394
 395         Optional atomends specifies a different set of end token delimiters
 396         (the default is to use self.atomends).  This is used e.g. in
 397         getphraselist() since phrase endings must not include the `.' (which
 398         is legal in phrases)."""
 399         atomlist = ['']
 400         if atomends is None:
 401             atomends = self.atomends
 402
 403         while self.pos < len(self.field):
 404             if self.field[self.pos] in atomends:
 405                 break
 406             else:
 407                 atomlist.append(self.field[self.pos])
 408             self.pos += 1
 409
 410         return EMPTYSTRING.join(atomlist)
 411
 412     def getphraselist(self):
 413         """Parse a sequence of RFC 2822 phrases.
 414
 415         A phrase is a sequence of words, which are in turn either RFC 2822
 416         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
 417         runs of continuous whitespace into one space.
 418         """
 419         plist = []
 420
 421         while self.pos < len(self.field):
 422             if self.field[self.pos] in self.FWS:
 423                 self.pos += 1
 424             elif self.field[self.pos] == '"':
 425                 plist.append(self.getquote())
 426             elif self.field[self.pos] == '(':
 427                 self.commentlist.append(self.getcomment())
 428             elif self.field[self.pos] in self.phraseends:
 429                 break
 430             else:
 431                 plist.append(self.getatom(self.phraseends))
 432
 433         return plist
 434
 435 class AddressList(AddrlistClass):
 436     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
 437     def __init__(self, field):
 438         AddrlistClass.__init__(self, field)
 439         if field:
 440             self.addresslist = self.getaddrlist()
 441         else:
 442             self.addresslist = []
 443
 444     def __len__(self):
 445         return len(self.addresslist)
 446
 447     def __add__(self, other):
 448         # Set union
 449         newaddr = AddressList(None)
 450         newaddr.addresslist = self.addresslist[:]
 451         for x in other.addresslist:
 452             if not x in self.addresslist:
 453                 newaddr.addresslist.append(x)
 454         return newaddr
 455
 456     def __iadd__(self, other):
 457         # Set union, in-place
 458         for x in other.addresslist:
 459             if not x in self.addresslist:
 460                 self.addresslist.append(x)
 461         return self
 462
 463     def __sub__(self, other):
 464         # Set difference
 465         newaddr = AddressList(None)
 466         for x in self.addresslist:
 467             if not x in other.addresslist:
 468                 newaddr.addresslist.append(x)
 469         return newaddr
 470
 471     def __isub__(self, other):
 472         # Set difference, in-place
 473         for x in other.addresslist:
 474             if x in self.addresslist:
 475                 self.addresslist.remove(x)
 476         return self
 477
 478     def __getitem__(self, index):
 479         # Make indexing, slices, and 'in' work
 480         return self.addresslist[index]