Propagate exceptions from shutdown() if raiseExceptions is not set.
[python.git] / Lib / cookielib.py
blobf0a89a555cd37f71b81500becc1482cd5bf663e8
1 """HTTP cookie handling for web clients.

This module has (now fairly distant) origins in Gisle Aas' Perl module
HTTP::Cookies, from the libwww-perl library.

Docstrings, comments and debug strings in this code refer to the
attributes of the HTTP cookie system as cookie-attributes, to distinguish
them clearly from Python attributes.

Class diagram (note that the classes which do not derive from
FileCookieJar are not distributed with the Python standard library, but
are available from http://wwwsearch.sf.net/):

                        CookieJar____
                        /     \      \
            FileCookieJar      \      \
             /    |   \         \      \
 MozillaCookieJar | LWPCookieJar \      \
                  |               |      \
                  |   ---MSIEBase |       \
                  |  /      |     |        \
                  | /   MSIEDBCookieJar BSDDBCookieJar
                  |/
               MSIECookieJar

26 """
28 import sys, re, urlparse, copy, time, urllib, logging
29 try:
30 import threading as _threading
31 except ImportError:
32 import dummy_threading as _threading
33 import httplib # only for the default HTTP port
34 from calendar import timegm
36 debug = logging.getLogger("cookielib").debug
38 DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
39 MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
40 "instance initialised with one)")
def reraise_unmasked_exceptions(unmasked=()):
    """Re-raise the exception being handled if it must not be trapped.

    Meant to be called from inside one of this module's catch-all
    ``except:`` handlers.  KeyboardInterrupt, SystemExit, MemoryError and
    any extra exception classes given in the tuple `unmasked` are re-raised;
    every other exception is swallowed and reported through a warning that
    carries the formatted traceback.
    """
    never_trapped = unmasked + (KeyboardInterrupt, SystemExit, MemoryError)
    exc_type = sys.exc_info()[0]
    if issubclass(exc_type, never_trapped):
        raise
    # The exception is being swallowed: leave a trace of it as a warning.
    import warnings, traceback, StringIO
    buf = StringIO.StringIO()
    traceback.print_exc(None, buf)
    warnings.warn("cookielib bug!\n%s" % buf.getvalue(), stacklevel=2)
58 # Date/time conversion
59 # -----------------------------------------------------------------------------
EPOCH_YEAR = 1970


def _timegm(tt):
    """Convert a UTC time tuple to seconds since the epoch, or None.

    Only the first six items of `tt` (year, month, day, hour, minute,
    second) are range-checked; None is returned when any of them falls
    outside the accepted range, otherwise the result of calendar.timegm.
    """
    year, month, mday, hour, minute, sec = tt[:6]
    fields_ok = (
        year >= EPOCH_YEAR
        and 1 <= month <= 12
        and 1 <= mday <= 31
        and 0 <= hour <= 24
        and 0 <= minute <= 59
        and 0 <= sec <= 61
    )
    if not fields_ok:
        return None
    return timegm(tt)
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased month abbreviations, used for case-insensitive lookups.
MONTHS_LOWER = [month.lower() for month in MONTHS]


def time2isoz(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like "YYYY-MM-DD hh:mm:ssZ",
    representing Universal Time (UTC, aka GMT).  An example of this format is:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec = time.gmtime(t)[:6]
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % (
        year, mon, mday, hour, minute, sec)


def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, minute, sec, wday = time.gmtime(t)[:7]
    # The weekday is followed by a comma, as in the documented format above
    # (the comma was previously missing from the format string).
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, minute, sec)
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")


def offset_from_tz_string(tz):
    """Return the UTC offset in seconds described by the string `tz`.

    Names listed in UTC_ZONES give 0.  Numeric zones such as "-0800",
    "+01:30" or "5" give hours (and optional minutes) converted to seconds,
    negated for a leading "-".  Anything else gives None.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if match is None:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Turn parsed date fields (strings or None) into seconds since epoch.

    `mon` may be an English month abbreviation or a number from 1 to 12.
    Missing clock fields count as zero and a missing `tz` as UTC.  Returns
    None when the month, the field ranges or the timezone are not accepted.
    """
    # Translate the month to a number; month numbers start with 1 (January).
    try:
        mon = MONTHS_LOWER.index(mon.lower()) + 1
    except ValueError:
        # Not a known name: maybe it is already a number.
        try:
            month_number = int(mon)
        except ValueError:
            return None
        if not 1 <= month_number <= 12:
            return None
        mon = month_number

    # Make sure clock elements are defined.
    if hr is None:
        hr = 0
    if min is None:
        min = 0
    if sec is None:
        sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    sec = int(sec)

    if yr < 1000:
        # Find the "obvious" year: start in the current century, then move
        # one century if that lands more than 50 years from the current year.
        this_year = time.localtime(time.time())[0]
        year_in_century = this_year % 100
        gap = year_in_century - yr
        yr = yr + this_year - year_in_century
        if gap > 50:
            yr = yr + 100
        elif gap < -50:
            yr = yr - 100

    # Convert UTC time tuple to seconds since epoch (not timezone-adjusted).
    t = _timegm((yr, mon, day, hr, min, sec, tz))
    if t is None:
        return None

    # Adjust using the timezone string, to get absolute time since epoch.
    if tz is None:
        tz = "UTC"
    offset = offset_from_tz_string(tz.upper())
    if offset is None:
        return None
    return t - offset
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)


def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    Return value is an integer.

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # Fast exit for a strictly conforming string.
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # The strict pattern only checks the *shape* of the month
            # ("Foo" matches); an unknown name is handed to the loose
            # parser below, which reports it by returning None.
            pass
        else:
            # Seconds are converted with int(), not float(), so that the
            # result is an integer as documented.
            tt = (int(g[2]), mon, int(g[0]),
                  int(g[3]), int(g[4]), int(g[5]))
            return _timegm(tt)

    # No, we need some messy parsing...

    # Clean up: leading whitespace and the (useless) weekday.
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)

    # Loose regexp parse; tz is the time zone specifier string.
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format
    day, mon, yr, hr, minute, sec, tz = m.groups()

    return _str2time(day, mon, yr, hr, minute, sec, tz)
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)


def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    Fractional seconds are accepted and truncated to whole seconds.

    """
    # Clean up.
    text = text.lstrip()

    # Loose regexp parse.
    m = ISO_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # XXX there's an extra bit of the timezone I'm ignoring here: is
    #   this the right thing to do?
    yr, mon, day, hr, minute, sec, tz, _ = m.groups()

    if sec is not None:
        # The pattern allows "29.5", but the seconds field is later
        # converted with int(), which would raise ValueError on the
        # fractional part; keep only the whole seconds.
        sec = sec.split(".", 1)[0]

    return _str2time(day, mon, yr, hr, minute, sec, tz)
304 # Header parsing
305 # -----------------------------------------------------------------------------
def unmatched(match):
    """Return the searched string with the matched span cut out."""
    subject = match.string
    return subject[:match.start(0)] + subject[match.end(0):]
HEADER_TOKEN_RE = re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE = re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")


def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    `header_values` is a sequence of header strings (not a single string).
    Within each string, tokens and ``name=value`` parameters are separated
    by ";" or whitespace, values may be double-quoted (with backslash
    escapes), and a "," starts a new list of pairs.  Several strings are
    handled as if they were joined by ",".

    A token that has no "=value" part gets the value None.  Syntactically
    incorrect headers will not necessarily be parsed as you would want.

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    assert not isinstance(header_values, basestring)
    result = []
    for text in header_values:
        orig_text = text
        pairs = []
        while text:
            token = HEADER_TOKEN_RE.search(text)
            if token is not None:
                # Consume the token, then look for its value.
                text = text[:token.start()] + text[token.end():]
                name = token.group(1)
                quoted = HEADER_QUOTED_VALUE_RE.search(text)
                if quoted is not None:
                    # Quoted value: drop the quotes, undo backslash escapes.
                    text = text[:quoted.start()] + text[quoted.end():]
                    value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
                else:
                    plain = HEADER_VALUE_RE.search(text)
                    if plain is not None:
                        # Unquoted value.
                        text = text[:plain.start()] + text[plain.end():]
                        value = plain.group(1).rstrip()
                    else:
                        # No value, a lone token.
                        value = None
                pairs.append((name, value))
                continue

            stripped = text.lstrip()
            if stripped.startswith(","):
                # Concatenated headers, as per RFC 2616 section 4.2.
                text = stripped[1:]
                if pairs:
                    result.append(pairs)
                pairs = []
                continue

            # Skip junk.
            non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", text)
            assert nr_junk_chars > 0, (
                "split_header_words bug: '%s', '%s', %s" %
                (orig_text, text, pairs))
            text = non_junk
        if pairs:
            result.append(pairs)
    return result
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")


def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    Takes a list of lists of (key, value) pairs and produces a single header
    value.  Pairs of one list are joined with "; " and the lists with ", ".
    A value of None renders the bare key; any other value is double-quoted
    (with '"' and backslash escaped) unless it consists of word characters
    only.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    def render(key, value):
        # Render one pair as "key" or "key=value", quoting where needed.
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        attr = [render(k, v) for k, v in pairs]
        if attr:
            headers.append("; ".join(attr))
    return ", ".join(headers)
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    Returns one list of (key, value) pairs per non-empty header.  An
    attribute without "=" gets the value None, known attribute names are
    lower-cased, an "expires" value is converted with http2time (None if
    invalid), and ("version", "0") is appended when no version attribute
    was seen.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "":
                continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            # The first parameter is the cookie's own name=value pair; only
            # the following ones are cookie-attributes.
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                # A bare "expires" (no "=value") leaves v as None; it used
                # to crash on v.startswith() and is now passed through.
                if k == "expires" and v is not None:
                    # Convert expires date to seconds since epoch.
                    if v.startswith('"'):
                        v = v[1:]
                    if v.endswith('"'):
                        v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
IPV4_RE = re.compile(r"\.\d+$")


def is_HDN(text):
    """Return True if text is a host domain name."""
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text) or text == "":
        # Looks like a dotted number (IP address), or is empty.
        return False
    # A host domain name neither starts nor ends with a dot.
    return not (text[0] == "." or text[-1] == ".")
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not B.startswith("."):
        return False
    # "A has the form NB" means that A *ends* with B.  Merely containing B
    # is not enough: x.y.com.evil.org must not domain-match .y.com.  N is
    # non-empty because A != B was established above.
    if not A.endswith(B):
        return False
    if not is_HDN(A):
        return False
    if not is_HDN(B[1:]):
        return False
    return True
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Only text that looks like a dotted
    number (as matched by IPV4_RE) is rejected.

    """
    return not IPV4_RE.search(text)
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If either looks like an IP address, only equality
    matches.  Otherwise a B with an initial dot matches any A ending in B,
    and a B without one matches only an equal A.

    """
    A = A.lower()
    B = B.lower()
    if not (liberal_is_HDN(A) and liberal_is_HDN(B)):
        # At least one IP address: only equal addresses match.
        return A == B
    if B.startswith("."):
        return A.endswith(B)
    return A == B
cut_port_re = re.compile(r":\d+$")


def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        # No host in the URL itself: fall back on the Host header.
        netloc = request.get_header("Host", "")

    # Remove port, if present.
    return cut_port_re.sub("", netloc, 1).lower()
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective
    name is the request-host with ".local" appended when the request-host
    contains no dot.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
def request_path(request):
    """request-URI, as defined by RFC 2965."""
    parts = urlparse.urlparse(request.get_full_url())
    path, parameters, query, frag = parts[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    escaped = escape_path(path)
    req_path = urlparse.urlunparse(("", "", escaped, "", query, frag))
    if req_path.startswith("/"):
        return req_path
    # Fix bad RFC 2396 absoluteURI.
    return "/" + req_path
def request_port(request):
    """Return the port of the request's host, as a string.

    The port is whatever follows the first ":" in request.get_host(), or
    DEFAULT_HTTP_PORT when there is no ":".  None is returned (and a debug
    message logged) when the port is not numeric.
    """
    host = request.get_host()
    if ":" not in host:
        return DEFAULT_HTTP_PORT
    port = host.split(":", 1)[1]
    try:
        int(port)
    except ValueError:
        debug("nonnumeric port: '%s'", port)
        return None
    return port
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")


def uppercase_escaped_char(match):
    """Return the matched %-escape with its two hex digits upper-cased."""
    return "%" + match.group(1).upper()


def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    if "." not in h:
        return h
    # h has the form A.B, where A is everything before the first dot.
    b = h.split(".", 1)[1]
    if is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(request_host(request), origin_reach)
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        `version` and `expires` are converted to int unless they are None,
        `domain` is lower-cased, and `rest` (the nonstandard attributes) is
        shallow-copied.  Raises ValueError if `port` is None while
        `port_specified` is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value
        self.port = port
        self.port_specified = port_specified
        # Normalise case, as per RFC 2965 section 3.3.3.
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot
        self.path = path
        self.path_specified = path_specified
        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return whether a nonstandard cookie-attribute `name` is stored."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return nonstandard cookie-attribute `name`, or `default`."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Store `value` as nonstandard cookie-attribute `name`."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= `now`.

        `now` defaults to the current time; a cookie whose expires is None
        never counts as expired.
        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        port_part = ""
        if self.port is not None:
            port_part = ":" + self.port
        limit = self.domain + port_part + self.path
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        attr_names = ("version", "name", "value",
                      "port", "port_specified",
                      "domain", "domain_specified", "domain_initial_dot",
                      "path", "path_specified",
                      "secure", "expires", "discard", "comment",
                      "comment_url",
                      )
        args = ["%s=%s" % (attr_name, repr(getattr(self, attr_name)))
                for attr_name in attr_names]
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        This base implementation accepts every domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        This base implementation accepts every path.
        """
        return True
827 class DefaultCookiePolicy(CookiePolicy):
828 """Implements the standard rules for accepting and returning cookies."""
830 DomainStrictNoDots = 1
831 DomainStrictNonDomain = 2
832 DomainRFC2965Match = 4
834 DomainLiberal = 0
835 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
def __init__(self,
             blocked_domains=None, allowed_domains=None,
             netscape=True, rfc2965=False,
             rfc2109_as_netscape=None,
             hide_cookie2=False,
             strict_domain=False,
             strict_rfc2965_unverifiable=True,
             strict_ns_unverifiable=False,
             strict_ns_domain=DomainLiberal,
             strict_ns_set_initial_dollar=False,
             strict_ns_set_path=False,
             ):
    """Constructor arguments should be passed as keyword arguments only."""
    # Domain lists are stored as tuples; "no allow-list" stays None.
    if blocked_domains is None:
        self._blocked_domains = ()
    else:
        self._blocked_domains = tuple(blocked_domains)

    if allowed_domains is not None:
        allowed_domains = tuple(allowed_domains)
    self._allowed_domains = allowed_domains

    # Every policy switch is kept under the name of its argument.
    self.netscape = netscape
    self.rfc2965 = rfc2965
    self.rfc2109_as_netscape = rfc2109_as_netscape
    self.hide_cookie2 = hide_cookie2
    self.strict_domain = strict_domain
    self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
    self.strict_ns_unverifiable = strict_ns_unverifiable
    self.strict_ns_domain = strict_ns_domain
    self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
    self.strict_ns_set_path = strict_ns_set_path
def blocked_domains(self):
    """Return the sequence of blocked domains (as a tuple)."""
    blocked = self._blocked_domains
    return blocked
def set_blocked_domains(self, blocked_domains):
    """Set the sequence of blocked domains."""
    # A tuple copy is stored, so later changes to the argument have no effect.
    frozen = tuple(blocked_domains)
    self._blocked_domains = frozen
def is_blocked(self, domain):
    """Return True if `domain` matches an entry of the block-list."""
    return any(user_domain_match(domain, blocked_domain)
               for blocked_domain in self._blocked_domains)
def allowed_domains(self):
    """Return None, or the sequence of allowed domains (as a tuple)."""
    allowed = self._allowed_domains
    return allowed
def set_allowed_domains(self, allowed_domains):
    """Set the sequence of allowed domains, or None."""
    if allowed_domains is None:
        # None means "no allow-list": every domain is allowed.
        self._allowed_domains = None
    else:
        self._allowed_domains = tuple(allowed_domains)
def is_not_allowed(self, domain):
    """Return True if an allow-list is set and `domain` matches none of it."""
    allowed = self._allowed_domains
    if allowed is None:
        # No allow-list at all: nothing is disallowed.
        return False
    return not any(user_domain_match(domain, allowed_domain)
                   for allowed_domain in allowed)
def set_ok(self, cookie, request):
    """
    If you override .set_ok(), be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    Runs the set_ok_<name> checks in a fixed order and stops at the first
    one that rejects the cookie.

    """
    debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    assert cookie.name is not None

    check_names = ("version", "verifiability", "name", "path", "domain",
                   "port")
    for check_name in check_names:
        check = getattr(self, "set_ok_" + check_name)
        if not check(cookie, request):
            return False

    return True
def set_ok_version(self, cookie, request):
    """Reject cookies without a version or of a switched-off protocol."""
    version = cookie.version
    if version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        debug(" Set-Cookie2 without version attribute (%s=%s)",
              cookie.name, cookie.value)
        return False
    if version > 0 and not self.rfc2965:
        debug(" RFC 2965 cookies are switched off")
        return False
    if version == 0 and not self.netscape:
        debug(" Netscape cookies are switched off")
        return False
    return True
def set_ok_verifiability(self, cookie, request):
    """Reject third-party cookies set during unverifiable transactions.

    Only applies when the strict_*_unverifiable switch for the cookie's
    protocol version is on.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        debug(" third-party RFC 2965 cookie during "
              "unverifiable transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        debug(" third-party Netscape cookie during "
              "unverifiable transaction")
        return False
    return True
def set_ok_name(self, cookie, request):
    """Reject "$"-prefixed Netscape cookie names when that check is on."""
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    dollar_hack = (cookie.version == 0 and
                   self.strict_ns_set_initial_dollar and
                   cookie.name.startswith("$"))
    if dollar_hack:
        debug(" illegal name (starts with '$'): '%s'", cookie.name)
        return False
    return True
def set_ok_path(self, cookie, request):
    """Reject a specified path that is not a prefix of the request path.

    Applies to RFC 2965 cookies, and to Netscape cookies only when
    strict_ns_set_path is on.
    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    path_is_checked = (cookie.version > 0 or
                       (cookie.version == 0 and self.strict_ns_set_path))
    if path_is_checked and not req_path.startswith(cookie.path):
        debug(" path attribute %s is not a prefix of request "
              "path %s", cookie.path, req_path)
        return False
    return True
def set_ok_domain(self, cookie, request):
    """Check the cookie's domain against the block/allow lists and the host.

    A cookie whose domain was not specified explicitly only has to pass the
    user block-list and allow-list.  Otherwise the domain is checked against
    the (effective) request-host, with extra rules for RFC 2965 cookies and
    for the strict_domain / strict_ns_domain switches.
    """
    if self.is_blocked(cookie.domain):
        debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if not cookie.domain_specified:
        return True

    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    if self.strict_domain and (domain.count(".") >= 2):
        last_dot = domain.rfind(".")
        prev_dot = domain.rfind(".", 0, last_dot)
        if prev_dot == 0:  # domain like .foo.bar
            tld = domain[last_dot+1:]
            sld = domain[prev_dot+1:last_dot]
            if (sld.lower() in (
                    "co", "ac",
                    "com", "edu", "org", "net", "gov", "mil", "int") and
                    len(tld) == 2):
                # domain like .co.uk
                debug(" country-code second level domain %s", domain)
                return False

    if domain.startswith("."):
        undotted_domain = domain[1:]
    else:
        undotted_domain = domain
    if "." not in undotted_domain and domain != ".local":
        debug(" non-local domain %s contains no embedded dot",
              domain)
        return False

    if cookie.version == 0:
        # Netscape rule: the host (possibly with an added initial dot) must
        # end with the domain.
        if not (erhn.endswith(domain) or
                erhn.startswith(".") or
                ("."+erhn).endswith(domain)):
            debug(" effective request-host %s (even with added "
                  "initial dot) does not end end with %s",
                  erhn, domain)
            return False

    if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainRFC2965Match)):
        if not domain_match(erhn, domain):
            debug(" effective request-host %s does not domain-match "
                  "%s", erhn, domain)
            return False

    if (cookie.version > 0 or
            (self.strict_ns_domain & self.DomainStrictNoDots)):
        # What is left of the request-host once the domain is cut off its
        # end must not contain a dot.
        host_prefix = req_host[:-len(domain)]
        if "." in host_prefix and not IPV4_RE.search(req_host):
            debug(" host prefix %s for domain %s contains a dot",
                  host_prefix, domain)
            return False

    return True
def set_ok_port(self, cookie, request):
    """Check that the request port is in the cookie's port list, if any.

    A cookie without a specified port is always accepted.  Otherwise the
    comma-separated ports are scanned in order: a non-numeric entry rejects
    the cookie, an entry equal to the request port accepts it.
    """
    if not cookie.port_specified:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"
    else:
        req_port = str(req_port)

    for candidate in cookie.port.split(","):
        try:
            int(candidate)
        except ValueError:
            debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == req_port:
            return True

    debug(" request port (%s) not found in %s",
          req_port, cookie.port)
    return False
def return_ok(self, cookie, request):
    """Return true if the cookie may be sent back with this request.

    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    # Each aspect is handled by a method named return_ok_<aspect>; the
    # first one that objects vetoes the cookie.
    aspects = ("version", "verifiability", "secure", "expires", "port",
               "domain")
    for aspect in aspects:
        check = getattr(self, "return_ok_" + aspect)
        if check(cookie, request):
            continue
        return False
    return True
def return_ok_version(self, cookie, request):
    """Return false if the cookie's protocol version is switched off.

    Cookies with version > 0 need self.rfc2965 to be true; version 0
    cookies need self.netscape to be true.
    """
    version = cookie.version
    if version > 0:
        if not self.rfc2965:
            debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0:
        if not self.netscape:
            debug(" Netscape cookies are switched off")
            return False
    return True
def return_ok_verifiability(self, cookie, request):
    """Return false for third-party cookies on unverifiable transactions.

    The restriction only applies when the request is unverifiable and
    is_third_party() reports it as third-party, and then only if the strict
    flag matching the cookie's version is set on this policy.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True

    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        debug(" third-party RFC 2965 cookie during unverifiable "
              "transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        debug(" third-party Netscape cookie during unverifiable "
              "transaction")
        return False
    return True
def return_ok_secure(self, cookie, request):
    """Return false if a secure cookie would go out on a non-https request."""
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    debug(" secure cookie with non-secure request")
    return False
def return_ok_expires(self, cookie, request):
    """Return false if the cookie has expired as of self._now."""
    expired = cookie.is_expired(self._now)
    if not expired:
        return True
    debug(" cookie expired")
    return False
def return_ok_port(self, cookie, request):
    """Return false if the cookie is port-restricted and the port differs.

    A cookie whose port attribute is empty or None may be returned on any
    port.  Otherwise the request port (taken to be "80" when
    request_port() gives None) must be one of the comma-separated entries.
    """
    if not cookie.port:
        return True

    wanted = request_port(request)
    if wanted is None:
        wanted = "80"

    if wanted in cookie.port.split(","):
        return True

    debug(" request port %s does not match cookie port %s",
          wanted, cookie.port)
    return False
def return_ok_domain(self, cookie, request):
    """Return true if the cookie's domain permits returning it for request.

    Checks performed, in order:

    - for version 0 cookies with no explicit domain, when the
      DomainStrictNonDomain bit of self.strict_ns_domain is set, the cookie
      domain must equal the effective request-host exactly;
    - cookies with version > 0 must domain-match the effective request-host
      (see domain_match());
    - version 0 cookies must be a suffix of the effective request-host,
      compared on a whole-label boundary.
    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    # A cookie stored without a leading dot (a host-only cookie) must not
    # be suffix-matched as a bare string: "example.com" would otherwise
    # match the unrelated host "evilexample.com".  Anchoring the domain
    # with a dot forces the match onto a label boundary.
    if domain and not domain.startswith("."):
        dotted_domain = "." + domain
    else:
        dotted_domain = domain

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if (cookie.version == 0 and
        (self.strict_ns_domain & self.DomainStrictNonDomain) and
        not cookie.domain_specified and domain != erhn):
        debug(" cookie with unspecified domain does not string-compare "
              "equal to request domain")
        return False

    if cookie.version > 0 and not domain_match(erhn, domain):
        debug(" effective request-host name %s does not domain-match "
              "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if cookie.version == 0 and not ("." + erhn).endswith(dotted_domain):
        debug(" request-host %s does not match Netscape cookie domain "
              "%s", req_host, domain)
        return False
    return True
def domain_return_ok(self, domain, request):
    """Return false if no cookie stored under domain can match request.

    This is a liberal pre-check made once per stored domain, before any
    individual cookie is examined.  It rejects the domain when neither the
    request-host nor the effective request-host ends with it (compared on
    a whole-label boundary), or when the user's block/allow lists exclude
    it.
    """
    # Liberal check of domain.  This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host(request)
    if not req_host.startswith("."):
        req_host = "." + req_host
    if not erhn.startswith("."):
        erhn = "." + erhn

    # Anchor the stored domain with a leading dot as well, so that a
    # dotless domain such as "example.com" cannot match the unrelated host
    # "evilexample.com" by plain string suffix.
    if domain and not domain.startswith("."):
        dotted_domain = "." + domain
    else:
        dotted_domain = domain
    if not (req_host.endswith(dotted_domain) or
            erhn.endswith(dotted_domain)):
        #debug(" request domain %s does not match cookie domain %s",
        #      req_host, domain)
        return False

    if self.is_blocked(domain):
        debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        debug(" domain %s is not in user allow-list", domain)
        return False

    return True
def path_return_ok(self, path, request):
    """Return true if the request path falls under the cookie path.

    The request path matches when it equals the cookie path, or when the
    cookie path is a prefix of it that ends on a "/" boundary.  A bare
    string prefix is not enough: cookie path "/foo" must not match the
    request path "/foobar".
    """
    debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path == path:
        return True
    pathlen = len(path)
    if (req_path.startswith(path) and
        (path.endswith("/") or req_path[pathlen:pathlen + 1] == "/")):
        return True

    debug(" %s does not path-match %s", req_path, path)
    return False
def vals_sorted_by_key(adict):
    """Return a list of adict's values, ordered by ascending key."""
    ordered_keys = sorted(adict.keys())
    return [adict[key] for key in ordered_keys]
def deepvalues(mapping):
    """Yield the leaf values of a nested mapping, depth-first.

    At each level the values are visited in ascending order of their keys.
    A value that has an ``items`` attribute is treated as a nested mapping
    and descended into; any other value is yielded as a leaf.
    """
    # The values of this level are collected into a list before anything
    # is yielded.
    ordered_values = [mapping[key] for key in sorted(mapping.keys())]
    for item in ordered_values:
        try:
            item.items
        except AttributeError:
            is_nested = False
        else:
            is_nested = True

        if is_nested:
            for leaf in deepvalues(item):
                yield leaf
        else:
            yield item
# Sentinel class, passed as the second (default) parameter to the
# dict.get() method.  Comparing the result against Absent with "is"
# distinguishes a dict key that is absent from one that is present with a
# None value.
class Absent: pass
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    Cookies are held in self._cookies, a nested dictionary keyed by domain,
    then path, then cookie name.  add_cookie_header(), set_cookie_if_ok(),
    set_cookie(), extract_cookies(), clear_session_cookies() and
    clear_expired_cookies() hold self._cookies_lock while they work and
    release it even when an exception is raised.

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"

    def __init__(self, policy=None):
        """Create an empty jar.

        policy decides which cookies are accepted and returned; when it is
        None a DefaultCookiePolicy instance is created.
        """
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the policy object consulted by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that the policy allows.

        The policy is asked about the domain first, then about each path,
        then about each individual cookie.
        """
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    debug(" not returning cookie")
                    continue
                debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list passed in is sorted in place.

        """
        # add cookies in order of most specific (ie. longest) path first;
        # the sort is stable, so cookies with equally long paths keep their
        # relative order
        cookies.sort(key=lambda cookie: len(cookie.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    # drop the dot again if it was added by the jar rather
                    # than sent by the server
                    if (not cookie.domain_initial_dot and
                        domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        Expired cookies are cleared from the jar afterwards.

        """
        debug("add_cookie_header")
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header(
                            "Cookie2", '$Version="1"')
                        break
        finally:
            # the policy and the request object are caller-supplied and may
            # raise; never leave the jar locked
            self._cookies_lock.release()

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with an unusable standard attribute are left out of the
        result.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        debug(" missing or invalid value for expires "
                              "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (TypeError, ValueError):
                        # TypeError covers a Max-Age with no value at all
                        # (v is None); only this cookie is rejected.
                        debug(" missing or invalid (non-numeric) value for "
                              "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age
                    #   is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                        k not in ("port", "comment", "commenturl")):
                        debug(" missing value for %s attribute", k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one tuple made by _normalized_cookie_tuples.

        Missing path, domain and port attributes are defaulted from the
        request.  Returns None, after removing any matching stored cookie,
        when the expiry time is not in the future.
        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            version = int(version)
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0:
                path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                  domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects that can be built from attrs_set."""
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie:
                cookies.append(cookie)
        return cookies

    def _process_rfc2109_cookies(self, cookies):
        """Flag version 1 cookies as RFC 2109, optionally downgrading them.

        The downgrade to version 0 is controlled by the policy's
        rfc2109_as_netscape attribute; when that is missing or None, the
        cookies are downgraded exactly when RFC 2965 handling is off.
        """
        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
        if rfc2109_as_ns is None:
            rfc2109_as_ns = not self._policy.rfc2965
        for cookie in cookies:
            if cookie.version == 1:
                cookie.rfc2109 = True
                if rfc2109_as_ns:
                    # treat 2109 cookies as Netscape cookies rather than
                    # as RFC2965 cookies
                    cookie.version = 0

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        # The catch-all handlers below are deliberate: input that is bad in
        # unexpected ways must not break the caller, and
        # reraise_unmasked_exceptions() re-raises what must not be trapped.
        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except:
            reraise_unmasked_exceptions()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except:
                reraise_unmasked_exceptions()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                ns_cookies = [
                    ns_cookie for ns_cookie in ns_cookies
                    if (ns_cookie.domain, ns_cookie.path,
                        ns_cookie.name) not in lookup]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)
        finally:
            # the policy is caller-supplied and may raise
            self._cookies_lock.release()

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        c = self._cookies
        self._cookies_lock.acquire()
        try:
            if cookie.domain not in c:
                c[cookie.domain] = {}
            c2 = c[cookie.domain]
            if cookie.path not in c2:
                c2[cookie.path] = {}
            c3 = c2[cookie.path]
            c3[cookie.name] = cookie
        finally:
            self._cookies_lock.release()

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        debug("extract_cookies: %s", response.info())
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.  Raises ValueError if
        name is given without domain and path, or path without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        self._cookies_lock.acquire()
        try:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        self._cookies_lock.acquire()
        try:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def __iter__(self):
        """Iterate over every stored cookie (see deepvalues())."""
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        count = 0
        for cookie in self:
            count = count + 1
        return count

    def __repr__(self):
        """Return the class followed by the repr() of each cookie."""
        r = [repr(cookie) for cookie in self]
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        """Return the class followed by the str() of each cookie."""
        r = [str(cookie) for cookie in self]
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
# Error that FileCookieJar.revert() documents as raised (alongside plain
# IOError) when reverting from a saved file fails.
# It derives from IOError for backwards-compatibility with Python 2.4.0, so
# code that catches IOError also catches LoadError.
class LoadError(IOError): pass
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file."""

    def __init__(self, filename=None, delayload=False, policy=None):
        """Create a jar associated with filename.

        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Concatenating with a string is the string-likeness test.
            # KeyboardInterrupt and SystemExit are allowed to propagate.
            try:
                filename+""
            except Exception:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file.

        Not implemented in this class: always raises NotImplementedError.
        """
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.  The file is opened here and handed to self._really_load(),
        then closed whether or not loading succeeds.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:
            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except (LoadError, IOError):
                # put the previous cookies back before reporting failure
                self._cookies = old_state
                raise
        finally:
            # release on the failure path too
            self._cookies_lock.release()
1763 from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1764 from _MozillaCookieJar import MozillaCookieJar