Issue #5768: Change to Unicode output logic and test case for same.
[python.git] / Lib / rfc822.py
blob64cd702cd0c6b7cef1d9e4adf215a69bfd823a3f
1 """RFC 2822 message manipulation.
3 Note: This is only a very rough sketch of a full RFC-822 parser; in particular
4 the tokenizing of addresses does not adhere to all the quoting rules.
6 Note: RFC 2822 is a long awaited update to RFC 822. This module should
7 conform to RFC 2822, and is thus mis-named (it's not worth renaming it). Some
8 effort at RFC 2822 updates has been made, but a thorough audit has not been
9 performed. Consider any RFC 2822 non-conformance to be a bug.
11 RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
12 RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
14 Directions for use:
16 To create a Message object: first open a file, e.g.:
18 fp = open(file, 'r')
20 You can use any other legal way of getting an open file object, e.g. use
21 sys.stdin or call os.popen(). Then pass the open file object to the Message()
22 constructor:
24 m = Message(fp)
26 This class can work with any input object that supports a readline method. If
27 the input object has seek and tell capability, the rewindbody method will
28 work; also illegal lines will be pushed back onto the input stream. If the
29 input object lacks seek but has an `unread' method that can push back a line
30 of input, Message will use that to push back illegal lines. Thus this class
31 can be used to parse messages coming from a buffered stream.
33 The optional `seekable' argument is provided as a workaround for certain stdio
34 libraries in which tell() discards buffered data before discovering that the
35 lseek() system call doesn't work. For maximum portability, you should set the
36 seekable argument to zero to prevent that initial tell() when passing in
37 an unseekable object such as a file object created from a socket object. If
38 it is 1 on entry -- which it is by default -- the tell() method of the open
39 file object is called once; if this raises an exception, seekable is reset to
40 0. For other nonzero values of seekable, this test is not made.
42 To get the text of a particular header there are several methods:
44 str = m.getheader(name)
45 str = m.getrawheader(name)
47 where name is the name of the header, e.g. 'Subject'. The difference is that
48 getheader() strips the leading and trailing whitespace, while getrawheader()
49 doesn't. Both functions retain embedded whitespace (including newlines)
50 exactly as they are specified in the header, and leave the case of the text
51 unchanged.
53 For addresses and address lists there are functions
55 realname, mailaddress = m.getaddr(name)
56 list = m.getaddrlist(name)
58 where the latter returns a list of (realname, mailaddr) tuples.
60 There is also a method
62 time = m.getdate(name)
64 which parses a Date-like field and returns a time-compatible tuple,
65 i.e. a tuple such as returned by time.localtime() or accepted by
66 time.mktime().
68 See the class definition for lower level access methods.
70 There are also some utility functions here.
71 """
72 # Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
74 import time
76 from warnings import warnpy3k
77 warnpy3k("in 3.x, rfc822 has been removed in favor of the email package",
78 stacklevel=2)
80 __all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]
_blanklines = ('\r\n', '\n')            # Optimization for islast()


class Message:
    """Represents a single RFC 2822-compliant message.

    The headers are read from a file-like object when the instance is
    created.  They are kept in two forms:

    self.headers -- the raw header lines, in file order, uninterpreted
    self.dict    -- lower-cased header name -> stripped value; when a header
                    occurs several times only the *last* one is remembered
    """

    def __init__(self, fp, seekable=1):
        """Initialize the class instance and read the headers.

        fp is any object with a readline() method.  seekable tells whether
        fp supports tell() and seek(); when it is exactly 1 (the default)
        this is verified by calling fp.tell() once, and seekable is reset
        to 0 if that call fails.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None

        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0

        self.readheaders()

        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the underlying file is not seekable.
        """
        if not self.seekable:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that terminates them.
        The (normally blank) line that ends the headers is skipped, but not
        included in the returned list.  If a non-header line ends the headers,
        (which is an error), an attempt is made to backspace over it; it is
        never included in the returned list.

        The variable self.status is set to the empty string if all went well,
        otherwise it is an error message.  The variable self.headers is a
        completely uninterpreted list of lines contained in the header (so
        printing them will reproduce the header exactly as it appears in the
        file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lst = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Pick the push-back strategy: an explicit unread() wins over
        # remembering the offset of each line and seeking back to it.
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line.startswith('From '):
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lst.append(line)
                x = (self.dict[headerseen] + "\n " + line.strip())
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lst.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats with special header formats.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC 2822 headers.

        You may override this method if your application wants to bend the
        rules, e.g. to strip trailing whitespace, or to recognize MH template
        separators ('--------').  For convenience (e.g. for code reading from
        sockets) a line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats that support embedded comments or
        free-text data.
        """
        return False

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines matching a given
        header name (and their continuation lines).  A list of the lines is
        returned, without interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple times, all
        occurrences are returned.  Case is not important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                # A new, different header ends the current match.
                hit = 0
            if hit:
                lst.append(line)
        return lst

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns only the
        first matching header (and its continuation lines).
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if hit:
                if not line[:1].isspace():
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                lst.append(line)
        return lst

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the header but with the
        keyword stripped.  All leading, trailing and embedded whitespace is
        kept in the string, however.  Return None if the header does not
        occur.
        """
        lst = self.getfirstmatchingheader(name)
        if not lst:
            return None
        # Drop "<name>:" from the first line only.
        lst[0] = lst[0][len(name) + 1:]
        return ''.join(lst)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped version of the
        header value for a given header name, or default (None unless
        given) if it doesn't exist.  This uses the dictionary version which
        finds the *last* such header.
        """
        return self.dict.get(name.lower(), default)
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once; each
        value in the result list is stripped in the same way as the result of
        getheader().  If the header is not given, return an empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0].isspace():
                # Continuation line: fold it into the value being built.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) when the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                if raw:
                    # Separate the values of repeated headers.
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddressList(alladdrs)
        return a.addresslist

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning a tuple
        compatible with time.mktime().  Returns None if the header is
        absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with time.mktime(),
        and the 10th is the offset of the poster's time zone from GMT/UTC.
        Returns None if the header is absent.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)

    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary."""
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because any
        changed headers get stuck at the end of the raw-headers list rather
        than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        doomed = []
        hit = 0
        for i, line in enumerate(self.headers):
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                hit = 0
            if hit:
                doomed.append(i)
        # Delete from the back so the remaining indexes stay valid.
        for i in reversed(doomed):
            del self.headers[i]

    def setdefault(self, name, default=""):
        """Return the value of header name, adding it as default if absent."""
        lowername = name.lower()
        if lowername in self.dict:
            return self.dict[lowername]
        else:
            text = name + ": " + default
            for line in text.split("\n"):
                self.headers.append(line + "\n")
            self.dict[lowername] = default
            return default

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __iter__(self):
        """Iterate over the (lower-cased) header names."""
        return iter(self.dict)

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined back together."""
        return ''.join(self.headers)
465 # Utility functions
466 # -----------------
468 # XXX Should fix unquote() and quote() to be really conformant.
469 # XXX The inverses of the parse functions may also be useful.
def unquote(s):
    """Strip one level of enclosing "..." or <...> from s.

    For a double-quoted string the backslash escapes for backslash and
    double quote are also undone.  Anything else is returned unchanged.
    """
    if len(s) <= 1:
        return s
    first, last = s[0], s[-1]
    inner = s[1:-1]
    if first == '"' and last == '"':
        inner = inner.replace('\\\\', '\\')
        return inner.replace('\\"', '"')
    if first == '<' and last == '>':
        return inner
    return s
def quote(s):
    """Escape s for use inside a quoted string.

    Backslashes are doubled and double quotes get a leading backslash.
    The surrounding quote characters themselves are not added.
    """
    with_escaped_backslashes = s.replace('\\', '\\\\')
    return with_escaped_backslashes.replace('"', '\\"')
def parseaddr(address):
    """Return the first address in *address* as (realname, mailaddr).

    The pair (None, None) is returned when nothing could be parsed.
    """
    parsed = AddressList(address).addresslist
    if parsed:
        return parsed[0]
    return (None, None)
class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC 2822 in front of you.

    http://www.faqs.org/rfcs/rfc2822.html

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing one or more
        addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and collects any comments into self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses.  Parsing stops at
        the first position where getaddress() yields nothing.
        """
        result = []
        ad = self.getaddress()
        while ad:
            result += ad
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one entry for a plain
        address, several for a group, and an empty list if nothing could be
        parsed at the current position.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        # Snapshot (not alias) the comments seen so far: getphraselist()
        # appends to self.commentlist, and if we have to start over below
        # the same comments would otherwise be collected a second time.
        oldcl = self.commentlist[:]
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                realname = (' '.join(plist) + ' (' +
                            ' '.join(self.commentlist) + ')')
                returnlist = [(realname, routeaddr)]
            else:
                returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the parser cannot get stuck on it.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not positioned at a `<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos += 1
        self.gotonext()
        adlist = ""
        while self.pos < len(self.field):
            if expectroute:
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character that ended the addr-spec
                # (normally the closing `>').
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec."""
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            # Local part only, no domain.
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.  If self is not
        looking at an instance of `beginchar' then getdelimited returns the
        empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos += 1
        while self.pos < len(self.field):
            if quote == 1:
                # Character after a backslash is taken literally.
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist
class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (realname, mailaddr) pairs live in self.addresslist.  The
    +, -, += and -= operators treat two lists as sets of such pairs.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None, '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if not field:
            self.addresslist = []
        else:
            self.addresslist = self.getaddrlist()

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, comma separated."""
        return ", ".join([dump_address_pair(pair)
                          for pair in self.addresslist])

    def __add__(self, other):
        """Set union: a new list with other's unseen entries appended."""
        mine = self.addresslist
        union = AddressList(None)
        union.addresslist = mine[:] + [pair for pair in other.addresslist
                                       if pair not in mine]
        return union

    def __iadd__(self, other):
        """Set union, in-place."""
        mine = self.addresslist
        for pair in other.addresslist:
            if pair in mine:
                continue
            mine.append(pair)
        return self

    def __sub__(self, other):
        """Set difference: a new list without the entries found in other."""
        theirs = other.addresslist
        difference = AddressList(None)
        difference.addresslist.extend([pair for pair in self.addresslist
                                       if pair not in theirs])
        return difference

    def __isub__(self, other):
        """Set difference, in-place."""
        mine = self.addresslist
        for pair in other.addresslist:
            if pair not in mine:
                continue
            mine.remove(pair)
        return self

    def __getitem__(self, index):
        """Index into the address list.

        Makes indexing, slices, and 'in' work.
        """
        return self.addresslist[index]
def dump_address_pair(pair):
    """Format a (name, address) pair canonically.

    With a non-empty name the result is '"name" <address>'; otherwise it
    is just the address.
    """
    name, addr = pair[0], pair[1]
    if not name:
        return addr
    return '"' + name + '" <' + addr + '>'
# Parse a date field

_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-tuple.

    The first nine items are (year, month, day, hour, minute, second, 0, 1,
    0), suitable for time.mktime(); the last three of those are fixed
    placeholders.  The tenth item is the zone's offset from UTC in
    seconds, or None when the string has no recognizable zone.  Zone
    names are looked up in _timezones (of the military zones only Z is
    known); numeric +hhmm/-hhmm offsets are accepted as well.

    Returns None if data is empty or cannot be parsed.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: nothing to parse.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        # no space after the "weekday,"?
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("08:49:37-0500"); split it
        # off, keeping the sign so that int() below sees it.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if not mm in _monthnames:
        # Maybe the month comes before the day ("Nov 6").
        dd, mm = mm, dd.lower()
        if not mm in _monthnames:
            return None
    mm = _monthnames.index(mm)+1
    if mm > 12: mm = mm - 12
    # endswith() rather than [-1]: these fields can be empty for
    # malformed input such as "-Nov-94".
    if dd.endswith(','):
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Time and year are in swapped positions (asctime style).
        yy, tm = tm, yy
    if yy.endswith(','):
        yy = yy[:-1]
    if not yy:
        return None
    if not yy[0].isdigit():
        # Zone and year are in swapped positions.
        yy, tz = tz, yy
    if tm.endswith(','):
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
def parsedate(data):
    """Convert a date string to a 9-item time tuple.

    This is parsedate_tz() with the trailing zone offset dropped; None
    is passed through when the string cannot be parsed.
    """
    parsed = parsedate_tz(data)
    if parsed is not None:
        parsed = parsed[:9]
    return parsed
def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp.

    data[9] is the zone offset in seconds, or None when the zone is
    unknown, in which case the fields are taken to be local time.
    """
    offset = data[9]
    if offset is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    # Read the fields as local standard time, then undo both the given
    # zone offset and the local standard-time offset.
    as_local = time.mktime(data[:8] + (0,))
    return as_local - offset - time.timezone
def formatdate(timeval=None):
    """Return timeval in the date format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    timeval is a number of seconds since the epoch; the current time is
    used when it is None.

    According to RFC 1123, day and month names must always be in
    English.  If not for that, this code could use strftime().  It
    can't because strftime() honors the locale and could generate
    non-English names.
    """
    if timeval is None:
        timeval = time.time()
    year, month, day, hour, minute, second, weekday = time.gmtime(timeval)[:7]
    day_name = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[weekday]
    month_name = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[month - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        day_name, day, month_name, year, hour, minute, second)
# When used as script, run a small test program.
# The first command line argument must be a filename containing one
# message in RFC-822 format.

if __name__ == '__main__':
    import sys
    import os
    if sys.argv[1:]:
        path = sys.argv[1]
    else:
        # Only consult $HOME when no file was named on the command line.
        path = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    # Single-argument print(...) behaves the same as a statement (Python 2)
    # and as a function call (Python 3).
    with open(path, 'r') as f:
        m = Message(f)
        print('From: %s' % (m.getaddr('from'),))
        print('To: %s' % (m.getaddrlist('to'),))
        print('Subject: %s' % (m.getheader('subject'),))
        print('Date: %s' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        if date:
            # Only touch the tuple once we know the header parsed.
            tz = date[-1]
            parts = ['ParsedDate:', time.asctime(time.localtime(mktime_tz(date)))]
            if tz is not None:
                # tz is the zone offset in seconds; show it as +hhmm[ .ss].
                hhmm, ss = divmod(tz, 60)
                hh, mm = divmod(hhmm, 60)
                parts.append("%+03d%02d" % (hh, mm))
                if ss:
                    parts.append(".%02d" % ss)
            print(' '.join(parts))
        else:
            print('ParsedDate: None')
        m.rewindbody()
        n = 0
        while f.readline():
            n += 1
    print('Lines: %s' % (n,))
    print('-' * 70)
    print('len = %s' % (len(m),))
    if 'Date' in m:
        print('Date = %s' % (m['Date'],))
    if 'X-Nonsense' in m:
        pass
    print('keys = %s' % (m.keys(),))
    print('values = %s' % (m.values(),))
    print('items = %s' % (m.items(),))
1011 print 'items =', m.items()