Add missing issue number in Misc/NEWS entry.
[python.git] / Lib / cookielib.py
blob6b59794869a07491de8276d1f8322ed43cbb4740
1 """HTTP cookie handling for web clients.
3 This module has (now fairly distant) origins in Gisle Aas' Perl module
4 HTTP::Cookies, from the libwww-perl library.
6 Docstrings, comments and debug strings in this code refer to the
7 attributes of the HTTP cookie system as cookie-attributes, to distinguish
8 them clearly from Python attributes.
10 Class diagram (note that BSDDBCookieJar and the MSIE* classes are not
11 distributed with the Python standard library, but are available from
12 http://wwwsearch.sf.net/):
14 CookieJar____
15 / \ \
16 FileCookieJar \ \
17 / | \ \ \
18 MozillaCookieJar | LWPCookieJar \ \
19 | | \
20 | ---MSIEBase | \
21 | / | | \
22 | / MSIEDBCookieJar BSDDBCookieJar
24 MSIECookieJar
26 """
28 __all__ = ['Cookie', 'CookieJar', 'CookiePolicy', 'DefaultCookiePolicy',
29 'FileCookieJar', 'LWPCookieJar', 'lwp_cookie_str', 'LoadError',
30 'MozillaCookieJar']
32 import re, urlparse, copy, time, urllib
33 try:
34 import threading as _threading
35 except ImportError:
36 import dummy_threading as _threading
37 import httplib # only for the default HTTP port
38 from calendar import timegm
debug = False   # set to True to enable debugging via the logging module
logger = None   # created lazily by _debug() the first time it is needed

def _debug(*args):
    """Pass *args* on to the "cookielib" logger when debugging is enabled.

    While the module-level ``debug`` flag is false this does nothing and
    returns None.  The logging module is imported, and the logger looked
    up, only when the first message is actually emitted.
    """
    global logger
    if debug:
        if not logger:
            import logging
            logger = logging.getLogger("cookielib")
        return logger.debug(*args)
    return None
53 DEFAULT_HTTP_PORT = str(httplib.HTTP_PORT)
54 MISSING_FILENAME_TEXT = ("a filename was not supplied (nor was the CookieJar "
55 "instance initialised with one)")
57 def _warn_unhandled_exception():
58 # There are a few catch-all except: statements in this module, for
59 # catching input that's bad in unexpected ways. Warn if any
60 # exceptions are caught there.
61 import warnings, traceback, StringIO
62 f = StringIO.StringIO()
63 traceback.print_exc(None, f)
64 msg = f.getvalue()
65 warnings.warn("cookielib bug!\n%s" % msg, stacklevel=2)
68 # Date/time conversion
69 # -----------------------------------------------------------------------------
71 EPOCH_YEAR = 1970
72 def _timegm(tt):
73 year, month, mday, hour, min, sec = tt[:6]
74 if ((year >= EPOCH_YEAR) and (1 <= month <= 12) and (1 <= mday <= 31) and
75 (0 <= hour <= 24) and (0 <= min <= 59) and (0 <= sec <= 61)):
76 return timegm(tt)
77 else:
78 return None
# Weekday and month abbreviations, indexed like the fields of a time tuple
# (Monday is 0, January is 0).
DAYS = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
MONTHS = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
          "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
# Lower-cased copies of MONTHS, for case-insensitive lookup by name.
MONTHS_LOWER = [month.lower() for month in MONTHS]
def time2isoz(t=None):
    """Format a time given in seconds since the epoch as a UTC string.

    When *t* is omitted (or None) the current time is used.

    The result looks like "YYYY-MM-DD hh:mm:ssZ" and is always expressed
    in Universal Time (UTC, aka GMT).  For example:

    1994-11-24 08:49:37Z

    """
    if t is None:
        t = time.time()
    utc = time.gmtime(t)
    # the first six fields are year, month, day, hour, minute, second
    return "%04d-%02d-%02d %02d:%02d:%02dZ" % tuple(utc[:6])
def time2netscape(t=None):
    """Return a string representing time in seconds since epoch, t.

    If the function is called without an argument, it will use the current
    time.

    The format of the returned string is like this:

    Wed, DD-Mon-YYYY HH:MM:SS GMT

    """
    if t is None:
        t = time.time()
    year, mon, mday, hour, min, sec, wday = time.gmtime(t)[:7]
    # The weekday is followed by a comma, as in the format documented above
    # (the comma used to be missing from the generated string).
    return "%s, %02d-%s-%04d %02d:%02d:%02d GMT" % (
        DAYS[wday], mday, MONTHS[mon-1], year, hour, min, sec)
UTC_ZONES = {"GMT": None, "UTC": None, "UT": None, "Z": None}

TIMEZONE_RE = re.compile(r"^([-+])?(\d\d?):?(\d\d)?$")
def offset_from_tz_string(tz):
    """Return the offset from UTC, in seconds, described by the string *tz*.

    Zone names listed in UTC_ZONES give 0.  Numeric zones made of an
    optional sign, one or two hour digits and optional two-digit minutes
    (for example "-0800" or "+05:30") give a signed number of seconds.
    None is returned for anything else.
    """
    if tz in UTC_ZONES:
        return 0
    match = TIMEZONE_RE.search(tz)
    if match is None:
        return None
    sign, hours, minutes = match.groups()
    seconds = 3600 * int(hours)
    if minutes:
        seconds = seconds + 60 * int(minutes)
    if sign == '-':
        seconds = -seconds
    return seconds
def _str2time(day, mon, yr, hr, min, sec, tz):
    """Convert date/time fields picked out of a string to seconds since epoch.

    day, yr -- strings of digits
    mon -- a month name found in MONTHS_LOWER (any case) or a number 1-12
    hr, min, sec -- strings of digits, or None when the clock part was
        absent (treated as 0); sec may carry a fractional part, which is
        discarded
    tz -- timezone string, or None for UTC

    Returns None if the month is not recognised, if _timegm() rejects the
    resulting time, or if offset_from_tz_string() does not recognise the
    timezone.
    """
    # translate month name to number
    # month numbers start with 1 (January)
    try:
        mon = MONTHS_LOWER.index(mon.lower())+1
    except ValueError:
        # maybe it's already a number
        try:
            imon = int(mon)
        except ValueError:
            return None
        if 1 <= imon <= 12:
            mon = imon
        else:
            return None

    # make sure clock elements are defined
    if hr is None: hr = 0
    if min is None: min = 0
    if sec is None: sec = 0

    yr = int(yr)
    day = int(day)
    hr = int(hr)
    min = int(min)
    # ISO_DATE_RE lets the seconds carry a fraction (e.g. "29.5"), which
    # int() rejects with ValueError; convert via float() and drop the
    # fraction instead of crashing.
    sec = int(float(sec))

    if yr < 1000:
        # find "obvious" year: pick the century that puts the year within
        # 50 years of the current (local) year
        cur_yr = time.localtime(time.time())[0]
        m = cur_yr % 100
        tmp = yr
        yr = yr + cur_yr - m
        m = m - tmp
        if abs(m) > 50:
            if m > 0: yr = yr + 100
            else: yr = yr - 100

    # convert UTC time tuple to seconds since epoch (not timezone-adjusted)
    t = _timegm((yr, mon, day, hr, min, sec, tz))

    if t is not None:
        # adjust time using timezone string, to get absolute time since epoch
        if tz is None:
            tz = "UTC"
        tz = tz.upper()
        offset = offset_from_tz_string(tz)
        if offset is None:
            return None
        t = t - offset

    return t
STRICT_DATE_RE = re.compile(
    r"^[SMTWF][a-z][a-z], (\d\d) ([JFMASOND][a-z][a-z]) "
    r"(\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$")
WEEKDAY_RE = re.compile(
    r"^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*", re.I)
LOOSE_HTTP_DATE_RE = re.compile(
    r"""^
    (\d\d?)            # day
       (?:\s+|[-\/])
    (\w+)              # month
        (?:\s+|[-\/])
    (\d+)              # year
    (?:
          (?:\s+|:)    # separator before clock
       (\d\d?):(\d\d)  # hour:min
       (?::(\d\d))?    # optional seconds
    )?                 # optional clock
       \s*
    ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone
       \s*
    (?:\(\w+\))?       # ASCII representation of timezone in parens.
       \s*$""", re.X)
def http2time(text):
    """Returns time in seconds since epoch of time represented by a string.

    The result is a whole number of seconds.  (The strict fast path parses
    the seconds field with float(), so the value it returns may be a float.)

    None is returned if the format of str is unrecognized, the time is outside
    the representable range, or the timezone string is not recognized.  If the
    string contains no timezone, UTC is assumed.

    The timezone in the string may be numerical (like "-0800" or "+0100") or a
    string timezone (like "UTC", "GMT", "BST" or "EST").  Currently, only the
    timezone strings equivalent to UTC (zero offset) are known to the function.

    The function loosely parses the following formats:

    Wed, 09 Feb 1994 22:23:32 GMT       -- HTTP format
    Tuesday, 08-Feb-94 14:15:29 GMT     -- old rfc850 HTTP format
    Tuesday, 08-Feb-1994 14:15:29 GMT   -- broken rfc850 HTTP format
    09 Feb 1994 22:23:32 GMT            -- HTTP format (no weekday)
    08-Feb-94 14:15:29 GMT              -- rfc850 format (no weekday)
    08-Feb-1994 14:15:29 GMT            -- broken rfc850 format (no weekday)

    The parser ignores leading and trailing whitespace.  The time may be
    absent.

    If the year is given with only 2 digits, the function will select the
    century that makes the year closest to the current date.

    """
    # fast exit for strictly conforming string
    m = STRICT_DATE_RE.search(text)
    if m:
        g = m.groups()
        try:
            mon = MONTHS_LOWER.index(g[1].lower()) + 1
        except ValueError:
            # STRICT_DATE_RE only checks the *shape* of the month
            # ("Foo" matches), so an unknown name must be reported as an
            # unrecognised date rather than escaping as ValueError.
            return None
        tt = (int(g[2]), mon, int(g[0]),
              int(g[3]), int(g[4]), float(g[5]))
        return _timegm(tt)

    # No, we need some messy parsing...

    # clean up
    text = text.lstrip()
    text = WEEKDAY_RE.sub("", text, 1)  # Useless weekday

    # loose regexp parse
    m = LOOSE_HTTP_DATE_RE.search(text)
    if m is None:
        return None  # bad format

    # tz is time zone specifier string
    day, mon, yr, hr, min, sec, tz = m.groups()
    return _str2time(day, mon, yr, hr, min, sec, tz)
ISO_DATE_RE = re.compile(
    r"""^
    (\d{4})              # year
       [-\/]?
    (\d\d?)              # numerical month
       [-\/]?
    (\d\d?)              # day
   (?:
         (?:\s+|[-:Tt])  # separator before clock
      (\d\d?):?(\d\d)    # hour:min
      (?::?(\d\d(?:\.\d*)?))?  # optional seconds (and fractional)
   )?                    # optional clock
      \s*
   ([-+]?\d\d?:?(:?\d\d)?
    |Z|z)?               # timezone  (Z is "zero meridian", i.e. GMT)
      \s*$""", re.X)
def iso2time(text):
    """
    As for http2time, but parses the ISO 8601 formats:

    1994-02-03 14:15:29 -0100    -- ISO 8601 format
    1994-02-03 14:15:29          -- zone is optional
    1994-02-03                   -- only date
    1994-02-03T14:15:29          -- Use T as separator
    19940203T141529Z             -- ISO 8601 compact format
    19940203                     -- only date

    """
    # leading whitespace is not part of the date
    match = ISO_DATE_RE.search(text.lstrip())
    if match is None:
        return None  # bad format

    # The pattern has eight groups.  The eighth is nested inside the
    # timezone group (the seventh), so its text is already part of tz and
    # it is not passed on separately.
    yr, mon, day, hr, min, sec, tz = match.groups()[:7]
    return _str2time(day, mon, yr, hr, min, sec, tz)
314 # Header parsing
315 # -----------------------------------------------------------------------------
def unmatched(match):
    """Return the subject string of *match* with the matched span cut out."""
    subject = match.string
    begin, stop = match.span()
    return subject[:begin] + subject[stop:]
HEADER_TOKEN_RE =        re.compile(r"^\s*([^=\s;,]+)")
HEADER_QUOTED_VALUE_RE = re.compile(r"^\s*=\s*\"([^\"\\]*(?:\\.[^\"\\]*)*)\"")
HEADER_VALUE_RE =        re.compile(r"^\s*=\s*([^\s;,]*)")
HEADER_ESCAPE_RE = re.compile(r"\\(.)")
def split_header_words(header_values):
    r"""Parse header values into a list of lists containing key,value pairs.

    The function knows how to deal with ",", ";" and "=" as well as quoted
    values after "=".  A list of space separated tokens are parsed as if they
    were separated by ";".

    header_values must be a sequence of strings, not a single string.  If it
    contains several values they are treated as if they were one value
    separated by comma ",".

    This makes the function useful for header fields that follow this syntax
    (BNF as from the HTTP/1.1 specification, but with the requirement for
    tokens relaxed):

      headers           = #header
      header            = (token | parameter) *( [";"] (token | parameter))

      token             = 1*<any CHAR except CTLs or separators>
      separators        = "(" | ")" | "<" | ">" | "@"
                        | "," | ";" | ":" | "\" | <">
                        | "/" | "[" | "]" | "?" | "="
                        | "{" | "}" | SP | HT

      quoted-string     = ( <"> *(qdtext | quoted-pair ) <"> )
      qdtext            = <any TEXT except <">>
      quoted-pair       = "\" CHAR

      parameter         = attribute "=" value
      attribute         = token
      value             = token | quoted-string

    Each header becomes a list of (key, value) pairs; a simple token that is
    not part of a parameter gets the value None.  Syntactically incorrect
    headers will not necessarily be parsed as you would want.

    Some examples:

    >>> split_header_words(['foo="bar"; port="80,81"; discard, bar=baz'])
    [[('foo', 'bar'), ('port', '80,81'), ('discard', None)], [('bar', 'baz')]]
    >>> split_header_words(['text/html; charset="iso-8859-1"'])
    [[('text/html', None), ('charset', 'iso-8859-1')]]
    >>> split_header_words([r'Basic realm="\"foo\bar\""'])
    [[('Basic', None), ('realm', '"foobar"')]]

    """
    # (str, unicode) -- the two concrete types covered by basestring
    assert not isinstance(header_values, (type(""), type(u"")))
    parsed = []
    for header in header_values:
        rest = header
        pairs = []
        while rest:
            token = HEADER_TOKEN_RE.search(rest)
            if token is None:
                stripped = rest.lstrip()
                if stripped.startswith(","):
                    # concatenated headers, as per RFC 2616 section 4.2
                    rest = stripped[1:]
                    if pairs:
                        parsed.append(pairs)
                    pairs = []
                else:
                    # skip junk
                    non_junk, nr_junk_chars = re.subn(r"^[=\s;]*", "", rest)
                    assert nr_junk_chars > 0, (
                        "split_header_words bug: '%s', '%s', %s" %
                        (header, rest, pairs))
                    rest = non_junk
                continue

            # Every pattern used here is anchored at the start of the
            # string, so dropping the matched text is a plain slice.
            name = token.group(1)
            rest = rest[token.end():]
            quoted = HEADER_QUOTED_VALUE_RE.search(rest)
            if quoted:
                rest = rest[quoted.end():]
                value = HEADER_ESCAPE_RE.sub(r"\1", quoted.group(1))
            else:
                plain = HEADER_VALUE_RE.search(rest)
                if plain:
                    rest = rest[plain.end():]
                    value = plain.group(1).rstrip()
                else:
                    # no value, a lone token
                    value = None
            pairs.append((name, value))
        if pairs:
            parsed.append(pairs)
    return parsed
HEADER_JOIN_ESCAPE_RE = re.compile(r"([\"\\])")
def join_header_words(lists):
    """Do the inverse (almost) of the conversion done by split_header_words.

    *lists* is a list of lists of (key, value) pairs; the result is a single
    header value.  Pairs within one inner list are joined with "; " and the
    inner lists with ", ".  A value of None produces the bare key, and a
    value made of anything but word characters is quoted.

    >>> join_header_words([[("text/plain", None), ("charset", "iso-8859/1")]])
    'text/plain; charset="iso-8859/1"'
    >>> join_header_words([[("text/plain", None)], [("charset", "iso-8859/1")]])
    'text/plain, charset="iso-8859/1"'

    """
    def render(key, value):
        # one "key" or "key=value" item, quoting the value when required
        if value is None:
            return key
        if not re.search(r"^\w+$", value):
            # escape " and \ before wrapping the value in double quotes
            value = '"%s"' % HEADER_JOIN_ESCAPE_RE.sub(r"\\\1", value)
        return "%s=%s" % (key, value)

    headers = []
    for pairs in lists:
        items = [render(key, value) for key, value in pairs]
        if items:
            headers.append("; ".join(items))
    return ", ".join(headers)
def parse_ns_headers(ns_headers):
    """Ad-hoc parser for Netscape protocol cookie-attributes.

    The old Netscape cookie format for Set-Cookie can for instance contain
    an unquoted "," in the expires field, so we have to use this ad-hoc
    parser instead of split_header_words.

    XXX This may not make the best possible effort to parse all the crap
    that Netscape Cookie headers contain.  Ronald Tschalar's HTTPClient
    parser is probably better, so could do worse than following that if
    this ever gives any trouble.

    Currently, this is also used for parsing RFC 2109 cookies.

    ns_headers is a sequence of header value strings.  The result holds one
    list of (key, value) pairs per non-empty header.  The first pair is the
    cookie's own name and value, left untouched.  In the pairs that follow,
    known attribute names are lower-cased, an attribute without "=" gets the
    value None, and the value of "expires" is replaced by the result of
    http2time() (None if the date is invalid).  A ("version", "0") pair is
    appended when the header carries no version attribute.

    """
    known_attrs = ("expires", "domain", "path", "secure",
                   # RFC 2109 attrs (may turn up in Netscape cookies, too)
                   # "version" is listed so that "Version=1" is recognised
                   # whatever its case; otherwise it would be missed below
                   # and a contradictory ("version", "0") pair appended.
                   "version", "port", "max-age")

    result = []
    for ns_header in ns_headers:
        pairs = []
        version_set = False
        for ii, param in enumerate(re.split(r";\s*", ns_header)):
            param = param.rstrip()
            if param == "": continue
            if "=" not in param:
                k, v = param, None
            else:
                k, v = re.split(r"\s*=\s*", param, 1)
                k = k.lstrip()
            if ii != 0:
                lc = k.lower()
                if lc in known_attrs:
                    k = lc
                if k == "version":
                    # This is an RFC 2109 cookie.
                    version_set = True
                if k == "expires" and v is not None:
                    # convert expires date to seconds since epoch
                    # (a bare "expires" with no "=" has v None and is
                    # passed through instead of crashing on v.startswith)
                    if v.startswith('"'): v = v[1:]
                    if v.endswith('"'): v = v[:-1]
                    v = http2time(v)  # None if invalid
            pairs.append((k, v))

        if pairs:
            if not version_set:
                pairs.append(("version", "0"))
            result.append(pairs)

    return result
IPV4_RE = re.compile(r"\.\d+$")
def is_HDN(text):
    """Return True if text is a host domain name.

    A string counts as a host domain name here when it is non-empty, does
    not end in a dot followed by digits (IPV4_RE), and neither starts nor
    ends with a dot.
    """
    # XXX
    # This may well be wrong.  Which RFC is HDN defined in, if any (for
    #  the purposes of RFC 2965)?
    # For the current implementation, what about IPv6?  Remember to look
    #  at other uses of IPV4_RE also, if change this.
    if IPV4_RE.search(text) or text == "":
        return False
    return not (text[0] == "." or text[-1] == ".")
def domain_match(A, B):
    """Return True if domain A domain-matches domain B, according to RFC 2965.

    A and B may be host domain names or IP addresses.

    RFC 2965, section 1:

    Host names can be specified either as an IP address or a HDN string.
    Sometimes we compare one host name with another.  (Such comparisons SHALL
    be case-insensitive.)  Host A's name domain-matches host B's if

         *  their host name strings string-compare equal; or

         * A is a HDN string and has the form NB, where N is a non-empty
            name string, B has the form .B', and B' is a HDN string.  (So,
            x.y.com domain-matches .Y.com but not Y.com.)

    Note that domain-match is not a commutative operation: a.b.c.com
    domain-matches .c.com, but not the reverse.

    """
    # Note that, if A or B are IP addresses, the only relevant part of the
    # definition of the domain-match algorithm is the direct string-compare.
    A = A.lower()
    B = B.lower()
    if A == B:
        return True
    if not B.startswith("."):
        # B does not have the form .B'
        return False
    if not A.endswith(B):
        # A does not have the form NB.  B has to be a *suffix* of A: merely
        # occurring somewhere inside it is not enough, or
        # "x.y.com.evil.org" would domain-match ".y.com".
        return False
    # A ends with B and differs from it, so N is guaranteed to be non-empty.
    return is_HDN(A) and is_HDN(B[1:])
def liberal_is_HDN(text):
    """Return True if text is a sort-of-like a host domain name.

    For accepting/blocking domains.  Only strings that end in a dot
    followed by digits (IPV4_RE) are rejected.

    """
    return not IPV4_RE.search(text)
def user_domain_match(A, B):
    """For blocking/accepting domains.

    A and B may be host domain names or IP addresses.  The comparison is
    case-insensitive.  If either string fails liberal_is_HDN(), only exact
    equality matches.  Otherwise a B that starts with a dot matches any A
    ending with it, and a B without a leading dot matches only an equal A.

    """
    host = A.lower()
    pattern = B.lower()
    if not (liberal_is_HDN(host) and liberal_is_HDN(pattern)):
        # at least one IP address: only identical strings match
        return host == pattern
    if pattern.startswith("."):
        return host.endswith(pattern)
    return host == pattern
cut_port_re = re.compile(r":\d+$")
def request_host(request):
    """Return request-host, as defined by RFC 2965.

    Variation from RFC: returned value is lowercased, for convenient
    comparison.

    The host is taken from the request's full URL; when the URL has no
    network location, the request's "Host" header is used instead.

    """
    netloc = urlparse.urlparse(request.get_full_url())[1]
    if netloc == "":
        netloc = request.get_header("Host", "")

    # remove port, if present
    return cut_port_re.sub("", netloc, 1).lower()
def eff_request_host(request):
    """Return a tuple (request-host, effective request-host name).

    As defined by RFC 2965, except both are lowercased.  The effective
    name is the request-host with ".local" appended when the request-host
    contains no dot; otherwise the two are the same.

    """
    req_host = request_host(request)
    if "." not in req_host and not IPV4_RE.search(req_host):
        return req_host, req_host + ".local"
    return req_host, req_host
def request_path(request):
    """request-URI, as defined by RFC 2965.

    Built from the path, parameters, query and fragment of the request's
    full URL.  The path (with its parameters) is passed through
    escape_path(), and the result always starts with "/".
    """
    parts = urlparse.urlparse(request.get_full_url())
    path, parameters, query, frag = parts[2:]
    if parameters:
        path = "%s;%s" % (path, parameters)
    escaped = escape_path(path)
    req_path = urlparse.urlunparse(("", "", escaped, "", query, frag))
    if req_path.startswith("/"):
        return req_path
    # fix bad RFC 2396 absoluteURI
    return "/" + req_path
def request_port(request):
    """Return the port of the request's host, as a string.

    Everything after the first ":" in request.get_host() is taken as the
    port.  DEFAULT_HTTP_PORT is returned when there is no ":", and None
    (after a debug message) when the port is not numeric.
    """
    host = request.get_host()
    colon = host.find(':')
    if colon < 0:
        return DEFAULT_HTTP_PORT
    port = host[colon+1:]
    try:
        int(port)
    except ValueError:
        _debug("nonnumeric port: '%s'", port)
        return None
    return port
# Characters in addition to A-Z, a-z, 0-9, '_', '.', and '-' that don't
# need to be escaped to form a valid HTTP URL (RFCs 2396 and 1738).
HTTP_PATH_SAFE = "%/;:@&=+$,!~*'()"
ESCAPED_CHAR_RE = re.compile(r"%([0-9a-fA-F][0-9a-fA-F])")
def uppercase_escaped_char(match):
    """Return the %XX escape matched by ESCAPED_CHAR_RE with upper-case hex."""
    hex_digits = match.group(1)
    return "%" + hex_digits.upper()
def escape_path(path):
    """Escape any invalid characters in HTTP URL, and uppercase all escapes."""
    # There's no knowing what character encoding was used to create URLs
    # containing %-escapes, but since we have to pick one to escape invalid
    # path characters, we pick UTF-8, as recommended in the HTML 4.0
    # specification:
    # http://www.w3.org/TR/REC-html40/appendix/notes.html#h-B.2.1
    # And here, kind of: draft-fielding-uri-rfc2396bis-03
    # (And in draft IRI specification: draft-duerst-iri-05)
    # (And here, for new URI schemes: RFC 2718)
    if isinstance(path, unicode):
        path = path.encode("utf-8")
    quoted = urllib.quote(path, HTTP_PATH_SAFE)
    # existing and newly added %XX escapes alike end up with upper-case hex
    return ESCAPED_CHAR_RE.sub(uppercase_escaped_char, quoted)
def reach(h):
    """Return reach of host h, as defined by RFC 2965, section 1.

    The reach R of a host name H is defined as follows:

       *  If

          -  H is the host domain name of a host; and,

          -  H has the form A.B; and

          -  A has no embedded (that is, interior) dots; and

          -  B has at least one embedded dot, or B is the string "local".
             then the reach of H is .B.

       *  Otherwise, the reach of H is H.

    >>> reach("www.acme.com")
    '.acme.com'
    >>> reach("acme.com")
    'acme.com'
    >>> reach("acme.local")
    '.local'

    """
    first_dot = h.find(".")
    if first_dot < 0:
        # no dot at all, so h cannot have the form A.B
        return h
    # b is everything after the first dot; a would be h[:first_dot]
    b = h[first_dot+1:]
    if is_HDN(h) and ("." in b or b == "local"):
        return "." + b
    return h
def is_third_party(request):
    """Return True if the request is to a third-party host.

    RFC 2965, section 3.3.6:

        An unverifiable transaction is to a third-party host if its request-
        host U does not domain-match the reach R of the request-host O in the
        origin transaction.

    """
    req_host = request_host(request)
    origin_reach = reach(request.get_origin_req_host())
    return not domain_match(req_host, origin_reach)
class Cookie:
    """HTTP Cookie.

    This class represents both Netscape and RFC 2965 cookies.

    This is deliberately a very simple class.  It just holds attributes.  It's
    possible to construct Cookie instances that don't comply with the cookie
    standards.  CookieJar.make_cookies is the factory function for Cookie
    objects -- it deals with cookie parsing, supplying defaults, and
    normalising to the representation used in this class.  CookiePolicy is
    responsible for checking them to see whether they should be accepted from
    and returned to the server.

    Note that the port may be present in the headers, but unspecified ("Port"
    rather than"Port=80", for example); if this is the case, port is None.

    """

    def __init__(self, version, name, value,
                 port, port_specified,
                 domain, domain_specified, domain_initial_dot,
                 path, path_specified,
                 secure,
                 expires,
                 discard,
                 comment,
                 comment_url,
                 rest,
                 rfc2109=False,
                 ):
        """Store the cookie-attributes on the instance.

        version and expires are converted to int unless they are None, the
        domain is lower-cased, and *rest* (the non-standard
        cookie-attributes) is shallow-copied.  Raises ValueError if port is
        None while port_specified is True.
        """
        if version is not None:
            version = int(version)
        if expires is not None:
            expires = int(expires)
        if port is None and port_specified is True:
            raise ValueError("if port is None, port_specified must be false")

        self.version = version
        self.name = name
        self.value = value

        self.port = port
        self.port_specified = port_specified

        # normalise case, as per RFC 2965 section 3.3.3
        self.domain = domain.lower()
        self.domain_specified = domain_specified
        # Sigh.  We need to know whether the domain given in the
        # cookie-attribute had an initial dot, in order to follow RFC 2965
        # (as clarified in draft errata).  Needed for the returned $Domain
        # value.
        self.domain_initial_dot = domain_initial_dot

        self.path = path
        self.path_specified = path_specified

        self.secure = secure
        self.expires = expires
        self.discard = discard
        self.comment = comment
        self.comment_url = comment_url
        self.rfc2109 = rfc2109

        # private copy, so later changes do not touch the caller's mapping
        self._rest = copy.copy(rest)

    def has_nonstandard_attr(self, name):
        """Return whether the non-standard cookie-attribute *name* is set."""
        return name in self._rest

    def get_nonstandard_attr(self, name, default=None):
        """Return the non-standard cookie-attribute *name*, or *default*."""
        return self._rest.get(name, default)

    def set_nonstandard_attr(self, name, value):
        """Set the non-standard cookie-attribute *name* to *value*."""
        self._rest[name] = value

    def is_expired(self, now=None):
        """Return True if the cookie has an expiry time that is <= *now*.

        *now* defaults to the current time.  A cookie whose expires is None
        is never expired.
        """
        if now is None:
            now = time.time()
        return (self.expires is not None) and (self.expires <= now)

    def __str__(self):
        if self.value is None:
            namevalue = self.name
        else:
            namevalue = "%s=%s" % (self.name, self.value)
        limit = self.domain
        if self.port is not None:
            limit = limit + ":" + self.port
        limit = limit + self.path
        return "<Cookie %s for %s>" % (namevalue, limit)

    def __repr__(self):
        public_attrs = ("version", "name", "value",
                        "port", "port_specified",
                        "domain", "domain_specified", "domain_initial_dot",
                        "path", "path_specified",
                        "secure", "expires", "discard", "comment",
                        "comment_url",
                        )
        args = ["%s=%s" % (name, repr(getattr(self, name)))
                for name in public_attrs]
        # _rest is stored under a private name but passed in as "rest"
        args.append("rest=%s" % repr(self._rest))
        args.append("rfc2109=%s" % repr(self.rfc2109))
        return "Cookie(%s)" % ", ".join(args)
class CookiePolicy:
    """Defines which cookies get accepted from and returned to server.

    May also modify cookies, though this is probably a bad idea.

    The subclass DefaultCookiePolicy defines the standard rules for Netscape
    and RFC 2965 cookies -- override that if you want a customised policy.

    """

    def set_ok(self, cookie, request):
        """Return true if (and only if) cookie should be accepted from server.

        Currently, pre-expired cookies never get this far -- the CookieJar
        class deletes such cookies itself.

        Subclasses must override this; the base implementation raises
        NotImplementedError.

        """
        raise NotImplementedError()

    def return_ok(self, cookie, request):
        """Return true if (and only if) cookie should be returned to server.

        Subclasses must override this; the base implementation raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def domain_return_ok(self, domain, request):
        """Return false if cookies should not be returned, given cookie domain.

        The base implementation accepts every domain.
        """
        return True

    def path_return_ok(self, path, request):
        """Return false if cookies should not be returned, given cookie path.

        The base implementation accepts every path.
        """
        return True
837 class DefaultCookiePolicy(CookiePolicy):
838 """Implements the standard rules for accepting and returning cookies."""
840 DomainStrictNoDots = 1
841 DomainStrictNonDomain = 2
842 DomainRFC2965Match = 4
844 DomainLiberal = 0
845 DomainStrict = DomainStrictNoDots|DomainStrictNonDomain
847 def __init__(self,
848 blocked_domains=None, allowed_domains=None,
849 netscape=True, rfc2965=False,
850 rfc2109_as_netscape=None,
851 hide_cookie2=False,
852 strict_domain=False,
853 strict_rfc2965_unverifiable=True,
854 strict_ns_unverifiable=False,
855 strict_ns_domain=DomainLiberal,
856 strict_ns_set_initial_dollar=False,
857 strict_ns_set_path=False,
859 """Constructor arguments should be passed as keyword arguments only."""
860 self.netscape = netscape
861 self.rfc2965 = rfc2965
862 self.rfc2109_as_netscape = rfc2109_as_netscape
863 self.hide_cookie2 = hide_cookie2
864 self.strict_domain = strict_domain
865 self.strict_rfc2965_unverifiable = strict_rfc2965_unverifiable
866 self.strict_ns_unverifiable = strict_ns_unverifiable
867 self.strict_ns_domain = strict_ns_domain
868 self.strict_ns_set_initial_dollar = strict_ns_set_initial_dollar
869 self.strict_ns_set_path = strict_ns_set_path
871 if blocked_domains is not None:
872 self._blocked_domains = tuple(blocked_domains)
873 else:
874 self._blocked_domains = ()
876 if allowed_domains is not None:
877 allowed_domains = tuple(allowed_domains)
878 self._allowed_domains = allowed_domains
880 def blocked_domains(self):
881 """Return the sequence of blocked domains (as a tuple)."""
882 return self._blocked_domains
883 def set_blocked_domains(self, blocked_domains):
884 """Set the sequence of blocked domains."""
885 self._blocked_domains = tuple(blocked_domains)
def is_blocked(self, domain):
    """Return True if *domain* matches an entry of the user block-list.

    Entries are compared with user_domain_match().
    """
    for candidate in self._blocked_domains:
        if user_domain_match(domain, candidate):
            break
    else:
        # ran through the whole block-list without a match
        return False
    return True
893 def allowed_domains(self):
894 """Return None, or the sequence of allowed domains (as a tuple)."""
895 return self._allowed_domains
896 def set_allowed_domains(self, allowed_domains):
897 """Set the sequence of allowed domains, or None."""
898 if allowed_domains is not None:
899 allowed_domains = tuple(allowed_domains)
900 self._allowed_domains = allowed_domains
def is_not_allowed(self, domain):
    """Return True if an allow-list is set and *domain* matches none of it.

    With no allow-list (None) nothing is disallowed.  Entries are compared
    with user_domain_match().
    """
    allowed = self._allowed_domains
    if allowed is None:
        return False
    for candidate in allowed:
        if user_domain_match(domain, candidate):
            break
    else:
        # the allow-list exists, but the domain is not on it
        return True
    return False
def set_ok(self, cookie, request):
    """Return True if the cookie passes every set_ok_* check.

    If you override .set_ok(), be sure to call this method.  If it returns
    false, so should your subclass (assuming your subclass wants to be more
    strict about which cookies to accept).

    """
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    assert cookie.name is not None

    # The checks run in this order; the first one that fails decides.
    checks = ("version", "verifiability", "name", "path", "domain", "port")
    for check in checks:
        method = getattr(self, "set_ok_" + check)
        if not method(cookie, request):
            return False

    return True
def set_ok_version(self, cookie, request):
    """Return False if the cookie's protocol version is not acceptable.

    That is: no version at all, a version above 0 while RFC 2965 cookies
    are switched off, or version 0 while Netscape cookies are switched off.
    """
    version = cookie.version
    if version is None:
        # Version is always set to 0 by parse_ns_headers if it's a Netscape
        # cookie, so this must be an invalid RFC 2965 cookie.
        _debug(" Set-Cookie2 without version attribute (%s=%s)",
               cookie.name, cookie.value)
        return False
    if version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif version == 0:
        if not self.netscape:
            _debug(" Netscape cookies are switched off")
            return False
    return True
def set_ok_verifiability(self, cookie, request):
    """Return False for a cookie set during an unverifiable third-party
    transaction, if the strictness switch for its protocol says so.
    """
    if not (request.is_unverifiable() and is_third_party(request)):
        return True
    if cookie.version > 0:
        if self.strict_rfc2965_unverifiable:
            _debug(" third-party RFC 2965 cookie during "
                   "unverifiable transaction")
            return False
    elif cookie.version == 0:
        if self.strict_ns_unverifiable:
            _debug(" third-party Netscape cookie during "
                   "unverifiable transaction")
            return False
    return True
def set_ok_name(self, cookie, request):
    """Return False for a version 0 cookie whose name starts with "$",
    when strict_ns_set_initial_dollar is set.
    """
    # Try and stop servers setting V0 cookies designed to hack other
    # servers that know both V0 and V1 protocols.
    if cookie.version != 0 or not self.strict_ns_set_initial_dollar:
        return True
    if cookie.name.startswith("$"):
        _debug(" illegal name (starts with '$'): '%s'", cookie.name)
        return False
    return True
def set_ok_path(self, cookie, request):
    """Return False if a specified cookie path is not a prefix of the
    request path.

    The check applies to cookies with a version above 0, and to version 0
    cookies only when strict_ns_set_path is set.
    """
    if not cookie.path_specified:
        return True
    req_path = request_path(request)
    path_is_checked = (cookie.version > 0 or
                       (cookie.version == 0 and self.strict_ns_set_path))
    if path_is_checked and not req_path.startswith(cookie.path):
        _debug(" path attribute %s is not a prefix of request "
               "path %s", cookie.path, req_path)
        return False
    return True
def set_ok_domain(self, cookie, request):
    """Return False if the cookie's domain is not acceptable for the request.

    The user block-list and allow-list are consulted first.  The remaining
    checks only apply when the domain was specified by the cookie-attribute:
    the country-code second-level-domain check (strict_domain), the
    embedded-dot check, and the checks relating the domain to the
    (effective) request-host, which depend on the cookie version and on the
    strict_ns_domain flags.
    """
    if self.is_blocked(cookie.domain):
        _debug(" domain %s is in user block-list", cookie.domain)
        return False
    if self.is_not_allowed(cookie.domain):
        _debug(" domain %s is not in user allow-list", cookie.domain)
        return False
    if not cookie.domain_specified:
        return True

    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    if self.strict_domain and (domain.count(".") >= 2):
        # XXX This should probably be compared with the Konqueror
        # (kcookiejar.cpp) and Mozilla implementations, but it's a
        # losing battle.
        last_dot = domain.rfind(".")
        prev_dot = domain.rfind(".", 0, last_dot)
        if prev_dot == 0:  # domain like .foo.bar
            tld = domain[last_dot+1:]
            sld = domain[prev_dot+1:last_dot]
            generic_slds = (
                "co", "ac", "com", "edu", "org", "net",
                "gov", "mil", "int", "aero", "biz", "cat", "coop",
                "info", "jobs", "mobi", "museum", "name", "pro",
                "travel", "eu")
            if sld.lower() in generic_slds and len(tld) == 2:
                # domain like .co.uk
                _debug(" country-code second level domain %s", domain)
                return False

    if domain.startswith("."):
        undotted_domain = domain[1:]
    else:
        undotted_domain = domain
    if "." not in undotted_domain and domain != ".local":
        _debug(" non-local domain %s contains no embedded dot",
               domain)
        return False

    # Netscape cookies: the request-host, with or without an added
    # initial dot, must end with the domain.
    if (cookie.version == 0 and
        not erhn.endswith(domain) and
        not erhn.startswith(".") and
        not ("."+erhn).endswith(domain)):
        _debug(" effective request-host %s (even with added "
               "initial dot) does not end end with %s",
               erhn, domain)
        return False

    if ((cookie.version > 0 or
         (self.strict_ns_domain & self.DomainRFC2965Match)) and
        not domain_match(erhn, domain)):
        _debug(" effective request-host %s does not domain-match "
               "%s", erhn, domain)
        return False

    if (cookie.version > 0 or
        (self.strict_ns_domain & self.DomainStrictNoDots)):
        # what is left of the request-host once the domain is cut off
        host_prefix = req_host[:-len(domain)]
        if "." in host_prefix and not IPV4_RE.search(req_host):
            _debug(" host prefix %s for domain %s contains a dot",
                   host_prefix, domain)
            return False

    return True
def set_ok_port(self, cookie, request):
    """Return True if the cookie's Port attribute allows it to be set.

    A cookie without an explicit port list is always accepted.  Otherwise
    every entry of the comma-separated list must be numeric, and one of
    them must equal the request port ("80" when the request port is
    unknown).

    """
    if not cookie.port_specified:
        return True

    wanted = request_port(request)
    if wanted is None:
        wanted = "80"
    else:
        wanted = str(wanted)

    matched = False
    for candidate in cookie.port.split(","):
        # Entries are validated in order; a malformed entry that precedes
        # the matching one rejects the cookie.
        try:
            int(candidate)
        except ValueError:
            _debug(" bad port %s (not numeric)", candidate)
            return False
        if candidate == wanted:
            matched = True
            break

    if not matched:
        _debug(" request port (%s) not found in %s",
               wanted, cookie.port)
        return False
    return True
def return_ok(self, cookie, request):
    """
    If you override .return_ok(), be sure to call this method.  If it
    returns false, so should your subclass (assuming your subclass wants to
    be more strict about which cookies to return).

    Runs each return_ok_<check>() method in turn and stops at the first
    one that rejects the cookie.

    """
    # Path has already been checked by .path_return_ok(), and domain
    # blocking done by .domain_return_ok().
    _debug(" - checking cookie %s=%s", cookie.name, cookie.value)

    checks = ("version", "verifiability", "secure", "expires", "port",
              "domain")
    for check in checks:
        predicate = getattr(self, "return_ok_" + check)
        if not predicate(cookie, request):
            return False
    return True
def return_ok_version(self, cookie, request):
    """Return False if the cookie's protocol version is switched off.

    Version > 0 cookies need self.rfc2965; version 0 cookies need
    self.netscape.

    """
    if cookie.version > 0:
        if not self.rfc2965:
            _debug(" RFC 2965 cookies are switched off")
            return False
    elif cookie.version == 0 and not self.netscape:
        _debug(" Netscape cookies are switched off")
        return False
    return True
def return_ok_verifiability(self, cookie, request):
    """Return False for third-party cookies the strictness flags forbid.

    Only applies when the request is unverifiable and third-party.  The
    flag consulted depends on the cookie version:
    strict_rfc2965_unverifiable for version > 0 and
    strict_ns_unverifiable for version 0.

    """
    suspicious = request.is_unverifiable() and is_third_party(request)
    if not suspicious:
        return True

    if cookie.version > 0 and self.strict_rfc2965_unverifiable:
        _debug(" third-party RFC 2965 cookie during unverifiable "
               "transaction")
        return False
    if cookie.version == 0 and self.strict_ns_unverifiable:
        _debug(" third-party Netscape cookie during unverifiable "
               "transaction")
        return False
    return True
def return_ok_secure(self, cookie, request):
    """Return False if a secure cookie would go out on a non-https request."""
    if not cookie.secure:
        return True
    if request.get_type() == "https":
        return True
    _debug(" secure cookie with non-secure request")
    return False
def return_ok_expires(self, cookie, request):
    """Return False if the cookie has expired as of self._now."""
    expired = cookie.is_expired(self._now)
    if expired:
        _debug(" cookie expired")
        return False
    return True
def return_ok_port(self, cookie, request):
    """Return False if the cookie is port-restricted to other ports.

    A cookie with no port restriction is always returnable.  Otherwise
    the request port ("80" when unknown) must appear in the cookie's
    comma-separated port list.

    """
    if not cookie.port:
        return True

    req_port = request_port(request)
    if req_port is None:
        req_port = "80"

    if req_port in cookie.port.split(","):
        return True

    _debug(" request port %s does not match cookie port %s",
           req_port, cookie.port)
    return False
def return_ok_domain(self, cookie, request):
    """Return True if the cookie's domain permits returning it on request.

    Version > 0 cookies must domain-match the effective request-host.
    Version 0 (Netscape) cookies use a suffix test against the dotted
    request-host.

    """
    req_host, erhn = eff_request_host(request)
    domain = cookie.domain

    # Compare against a dot-prefixed domain so that the suffix test below
    # only succeeds on a label boundary.  Without the dot, a cookie stored
    # for "example.com" would also match the host "evilexample.com".
    if domain and not domain.startswith("."):
        dotted_domain = "." + domain
    else:
        dotted_domain = domain

    # strict check of non-domain cookies: Mozilla does this, MSIE5 doesn't
    if (cookie.version == 0 and
        (self.strict_ns_domain & self.DomainStrictNonDomain) and
        not cookie.domain_specified and domain != erhn):
        _debug(" cookie with unspecified domain does not string-compare "
               "equal to request domain")
        return False

    if cookie.version > 0 and not domain_match(erhn, domain):
        _debug(" effective request-host name %s does not domain-match "
               "RFC 2965 cookie domain %s", erhn, domain)
        return False
    if cookie.version == 0 and not ("."+erhn).endswith(dotted_domain):
        _debug(" request-host %s does not match Netscape cookie domain "
               "%s", req_host, domain)
        return False
    return True
def domain_return_ok(self, domain, request):
    """Return False if no cookie stored under domain can match request.

    This is a cheap pre-filter applied to a whole domain before any
    individual cookie is examined.  It also applies the user block-list
    and allow-list.

    """
    # Liberal check of domain.  This is here as an optimization to avoid
    # having to load lots of MSIE cookie files unless necessary.
    req_host, erhn = eff_request_host(request)
    if not req_host.startswith("."):
        req_host = "."+req_host
    if not erhn.startswith("."):
        erhn = "."+erhn
    # Dot-prefix the cookie domain as well, so that the suffix test only
    # succeeds on a label boundary ("example.com" must not match the host
    # "evilexample.com").
    if domain and not domain.startswith("."):
        dotted_domain = "." + domain
    else:
        dotted_domain = domain
    if not (req_host.endswith(dotted_domain) or
            erhn.endswith(dotted_domain)):
        #_debug(" request domain %s does not match cookie domain %s",
        #       req_host, domain)
        return False

    if self.is_blocked(domain):
        _debug(" domain %s is in user block-list", domain)
        return False
    if self.is_not_allowed(domain):
        _debug(" domain %s is not in user allow-list", domain)
        return False

    return True
def path_return_ok(self, path, request):
    """Return True if the request path falls under the cookie path.

    The request path matches when it equals the cookie path, or when the
    cookie path is a prefix of it that ends on a "/" boundary.  A plain
    prefix test is not enough: cookie path "/foo" must not match request
    path "/foobar".

    """
    _debug("- checking cookie path=%s", path)
    req_path = request_path(request)
    if req_path == path:
        return True
    pathlen = len(path)
    if (req_path.startswith(path) and
        (path.endswith("/") or req_path[pathlen:pathlen+1] == "/")):
        return True

    _debug(" %s does not path-match %s", req_path, path)
    return False
def vals_sorted_by_key(adict):
    """Return a list of adict's values, ordered by their sorted keys."""
    return [adict.get(key) for key in sorted(adict.keys())]
def deepvalues(mapping):
    """Iterates over nested mapping, depth-first, in sorted order by key."""
    # Snapshot this level's values into a list before yielding anything,
    # so that a consumer may delete entries while iterating.
    ordered_values = [mapping.get(key) for key in sorted(mapping.keys())]
    for item in ordered_values:
        is_leaf = True
        try:
            # Anything with an .items attribute is treated as a nested
            # mapping and descended into.
            item.items
        except AttributeError:
            pass
        else:
            is_leaf = False
            for nested in deepvalues(item):
                yield nested
        if is_leaf:
            yield item
# Used as second parameter to dict.get() method, to distinguish absent
# dict key from one with a None value.
class Absent:
    """Sentinel meaning 'key not present' (as opposed to a None value)."""
class CookieJar:
    """Collection of HTTP cookies.

    You may not need to know about this class: try
    urllib2.build_opener(HTTPCookieProcessor).open(url).

    Cookies are held in a three-level dictionary, self._cookies, indexed by
    domain, then path, then name.  Methods that take self._cookies_lock
    (a re-entrant lock) are noted as such in their own code.

    """

    non_word_re = re.compile(r"\W")
    quote_re = re.compile(r"([\"\\])")
    strict_domain_re = re.compile(r"\.?[^.]*")
    domain_re = re.compile(r"[^.]*")
    dots_re = re.compile(r"^\.+")

    magic_re = r"^\#LWP-Cookies-(\d+\.\d+)"

    def __init__(self, policy=None):
        """Create an empty jar.

        policy decides which cookies are accepted and returned; a
        DefaultCookiePolicy is created when none is given.

        """
        if policy is None:
            policy = DefaultCookiePolicy()
        self._policy = policy

        self._cookies_lock = _threading.RLock()
        self._cookies = {}

    def set_policy(self, policy):
        """Replace the policy object consulted by this jar."""
        self._policy = policy

    def _cookies_for_domain(self, domain, request):
        """Return the cookies stored under domain that may go with request."""
        cookies = []
        if not self._policy.domain_return_ok(domain, request):
            return []
        _debug("Checking %s for cookies to return", domain)
        cookies_by_path = self._cookies[domain]
        for path in cookies_by_path.keys():
            if not self._policy.path_return_ok(path, request):
                continue
            cookies_by_name = cookies_by_path[path]
            for cookie in cookies_by_name.values():
                if not self._policy.return_ok(cookie, request):
                    _debug(" not returning cookie")
                    continue
                _debug(" it's a match")
                cookies.append(cookie)
        return cookies

    def _cookies_for_request(self, request):
        """Return a list of cookies to be returned to server."""
        cookies = []
        for domain in self._cookies.keys():
            cookies.extend(self._cookies_for_domain(domain, request))
        return cookies

    def _cookie_attrs(self, cookies):
        """Return a list of cookie-attributes to be returned to server.

        like ['foo="bar"; $Path="/"', ...]

        The $Version attribute is also added when appropriate (currently only
        once per request).

        Note that the cookies list is sorted in place.

        """
        # add cookies in order of most specific (ie. longest) path first
        cookies.sort(key=lambda arg: len(arg.path), reverse=True)

        version_set = False

        attrs = []
        for cookie in cookies:
            # set version of Cookie header
            # XXX
            # What should it be if multiple matching Set-Cookie headers have
            #  different versions themselves?
            # Answer: there is no answer; was supposed to be settled by
            #  RFC 2965 errata, but that may never appear...
            version = cookie.version
            if not version_set:
                version_set = True
                if version > 0:
                    attrs.append("$Version=%s" % version)

            # quote cookie value if necessary
            # (not for Netscape protocol, which already has any quotes
            #  intact, due to the poorly-specified Netscape Cookie: syntax)
            if ((cookie.value is not None) and
                self.non_word_re.search(cookie.value) and version > 0):
                value = self.quote_re.sub(r"\\\1", cookie.value)
            else:
                value = cookie.value

            # add cookie-attributes to be returned in Cookie header
            if cookie.value is None:
                attrs.append(cookie.name)
            else:
                attrs.append("%s=%s" % (cookie.name, value))
            if version > 0:
                if cookie.path_specified:
                    attrs.append('$Path="%s"' % cookie.path)
                if cookie.domain.startswith("."):
                    domain = cookie.domain
                    if (not cookie.domain_initial_dot and
                        domain.startswith(".")):
                        domain = domain[1:]
                    attrs.append('$Domain="%s"' % domain)
                if cookie.port is not None:
                    p = "$Port"
                    if cookie.port_specified:
                        p = p + ('="%s"' % cookie.port)
                    attrs.append(p)

        return attrs

    def add_cookie_header(self, request):
        """Add correct Cookie: header to request (urllib2.Request object).

        The Cookie2 header is also added unless policy.hide_cookie2 is true.

        Expired cookies are purged from the jar afterwards.

        """
        _debug("add_cookie_header")
        self._cookies_lock.acquire()
        try:

            self._policy._now = self._now = int(time.time())

            cookies = self._cookies_for_request(request)

            attrs = self._cookie_attrs(cookies)
            if attrs:
                # never overwrite a Cookie header the caller set explicitly
                if not request.has_header("Cookie"):
                    request.add_unredirected_header(
                        "Cookie", "; ".join(attrs))

            # if necessary, advertise that we know RFC 2965
            if (self._policy.rfc2965 and not self._policy.hide_cookie2 and
                not request.has_header("Cookie2")):
                for cookie in cookies:
                    if cookie.version != 1:
                        request.add_unredirected_header("Cookie2", '$Version="1"')
                        break

        finally:
            self._cookies_lock.release()

        self.clear_expired_cookies()

    def _normalized_cookie_tuples(self, attrs_set):
        """Return list of tuples containing normalised cookie information.

        attrs_set is the list of lists of key,value pairs extracted from
        the Set-Cookie or Set-Cookie2 headers.

        Tuples are name, value, standard, rest, where name and value are the
        cookie name and value, standard is a dictionary containing the standard
        cookie-attributes (discard, secure, version, expires or max-age,
        domain, path and port) and rest is a dictionary containing the rest of
        the cookie-attributes.

        Cookies with a malformed standard attribute are dropped from the
        result.  Reads self._now to convert Max-Age into an absolute time.

        """
        cookie_tuples = []

        boolean_attrs = "discard", "secure"
        value_attrs = ("version",
                       "expires", "max-age",
                       "domain", "path", "port",
                       "comment", "commenturl")

        for cookie_attrs in attrs_set:
            name, value = cookie_attrs[0]

            # Build dictionary of standard cookie-attributes (standard) and
            # dictionary of other cookie-attributes (rest).

            # Note: expiry time is normalised to seconds since epoch.  V0
            # cookies should have the Expires cookie-attribute, and V1 cookies
            # should have Max-Age, but since V1 includes RFC 2109 cookies (and
            # since V0 cookies may be a mish-mash of Netscape and RFC 2109), we
            # accept either (but prefer Max-Age).
            max_age_set = False

            bad_cookie = False

            standard = {}
            rest = {}
            for k, v in cookie_attrs[1:]:
                lc = k.lower()
                # don't lose case distinction for unknown fields
                if lc in value_attrs or lc in boolean_attrs:
                    k = lc
                if k in boolean_attrs and v is None:
                    # boolean cookie-attribute is present, but has no value
                    # (like "discard", rather than "port=80")
                    v = True
                if k in standard:
                    # only first value is significant
                    continue
                if k == "domain":
                    if v is None:
                        _debug(" missing value for domain attribute")
                        bad_cookie = True
                        break
                    # RFC 2965 section 3.3.3
                    v = v.lower()
                if k == "expires":
                    if max_age_set:
                        # Prefer max-age to expires (like Mozilla)
                        continue
                    if v is None:
                        _debug(" missing or invalid value for expires "
                               "attribute: treating as session cookie")
                        continue
                if k == "max-age":
                    max_age_set = True
                    try:
                        v = int(v)
                    except (TypeError, ValueError):
                        # TypeError covers a Max-Age attribute with no value
                        # at all (v is None); ValueError a non-numeric one.
                        _debug(" missing or invalid (non-numeric) value for "
                               "max-age attribute")
                        bad_cookie = True
                        break
                    # convert RFC 2965 Max-Age to seconds since epoch
                    # XXX Strictly you're supposed to follow RFC 2616
                    #   age-calculation rules.  Remember that zero Max-Age
                    #   is a request to discard (old and new) cookie, though.
                    k = "expires"
                    v = self._now + v
                if (k in value_attrs) or (k in boolean_attrs):
                    if (v is None and
                        k not in ("port", "comment", "commenturl")):
                        _debug(" missing value for %s attribute", k)
                        bad_cookie = True
                        break
                    standard[k] = v
                else:
                    rest[k] = v

            if bad_cookie:
                continue

            cookie_tuples.append((name, value, standard, rest))

        return cookie_tuples

    def _cookie_from_cookie_tuple(self, tup, request):
        """Build a Cookie from one normalised tuple, filling in defaults.

        tup is a (name, value, standard, rest) tuple as produced by
        ._normalized_cookie_tuples().  Missing path, domain and port are
        defaulted from request.

        Returns None instead of a Cookie when the version attribute is not
        an integer, or when the expiry time is not in the future.  In the
        latter case any stored cookie with the same domain, path and name
        is removed from the jar.

        """
        # standard is dict of standard cookie-attributes, rest is dict of the
        # rest of them
        name, value, standard, rest = tup

        # set the easy defaults
        version = standard.get("version", None)
        if version is not None:
            try:
                version = int(version)
            except ValueError:
                # Invalid version: ignore just this cookie rather than let
                # the error abort processing of the whole header set.
                return None
        secure = standard.get("secure", False)
        # (discard is also set if expires is Absent)
        discard = standard.get("discard", False)
        comment = standard.get("comment", None)
        comment_url = standard.get("commenturl", None)

        domain = standard.get("domain", Absent)
        path = standard.get("path", Absent)
        port = standard.get("port", Absent)
        expires = standard.get("expires", Absent)

        # set default path
        if path is not Absent and path != "":
            path_specified = True
            path = escape_path(path)
        else:
            path_specified = False
            path = request_path(request)
            i = path.rfind("/")
            if i != -1:
                if version == 0:
                    # Netscape spec parts company from reality here
                    path = path[:i]
                else:
                    path = path[:i+1]
            if len(path) == 0: path = "/"

        # set default domain
        domain_specified = domain is not Absent
        # but first we have to remember whether it starts with a dot
        domain_initial_dot = False
        if domain_specified:
            domain_initial_dot = bool(domain.startswith("."))
        if domain is Absent:
            req_host, erhn = eff_request_host(request)
            domain = erhn
        elif not domain.startswith("."):
            domain = "."+domain

        # set default port
        port_specified = False
        if port is not Absent:
            if port is None:
                # Port attr present, but has no value: default to request port.
                # Cookie should then only be sent back on that port.
                port = request_port(request)
            else:
                port_specified = True
                port = re.sub(r"\s+", "", port)
        else:
            # No port attr present.  Cookie can be sent back on any port.
            port = None

        # set default expires and discard
        if expires is Absent:
            expires = None
            discard = True
        elif expires <= self._now:
            # Expiry date in past is request to delete cookie.  This can't be
            # in DefaultCookiePolicy, because can't delete cookies there.
            try:
                self.clear(domain, path, name)
            except KeyError:
                pass
            _debug("Expiring cookie, domain='%s', path='%s', name='%s'",
                   domain, path, name)
            return None

        return Cookie(version,
                      name, value,
                      port, port_specified,
                      domain, domain_specified, domain_initial_dot,
                      path, path_specified,
                      secure,
                      expires,
                      discard,
                      comment,
                      comment_url,
                      rest)

    def _cookies_from_attrs_set(self, attrs_set, request):
        """Return the Cookie objects that attrs_set describes.

        Entries for which no cookie can be built are left out.

        """
        cookie_tuples = self._normalized_cookie_tuples(attrs_set)

        cookies = []
        for tup in cookie_tuples:
            cookie = self._cookie_from_cookie_tuple(tup, request)
            if cookie: cookies.append(cookie)
        return cookies

    def _process_rfc2109_cookies(self, cookies):
        """Flag version 1 cookies as RFC 2109 and optionally downgrade them.

        When the policy's rfc2109_as_netscape is unset (missing or None),
        downgrading to version 0 happens exactly when RFC 2965 handling is
        switched off.

        """
        rfc2109_as_ns = getattr(self._policy, 'rfc2109_as_netscape', None)
        if rfc2109_as_ns is None:
            rfc2109_as_ns = not self._policy.rfc2965
        for cookie in cookies:
            if cookie.version == 1:
                cookie.rfc2109 = True
                if rfc2109_as_ns:
                    # treat 2109 cookies as Netscape cookies rather than
                    # as RFC2965 cookies
                    cookie.version = 0

    def make_cookies(self, response, request):
        """Return sequence of Cookie objects extracted from response object."""
        # get cookie-attributes for RFC 2965 and Netscape protocols
        headers = response.info()
        rfc2965_hdrs = headers.getheaders("Set-Cookie2")
        ns_hdrs = headers.getheaders("Set-Cookie")

        rfc2965 = self._policy.rfc2965
        netscape = self._policy.netscape

        if ((not rfc2965_hdrs and not ns_hdrs) or
            (not ns_hdrs and not rfc2965) or
            (not rfc2965_hdrs and not netscape) or
            (not netscape and not rfc2965)):
            return []  # no relevant cookie headers: quick exit

        # Parsing is best-effort: any unexpected failure is reported as a
        # warning and that set of headers yields no cookies.
        try:
            cookies = self._cookies_from_attrs_set(
                split_header_words(rfc2965_hdrs), request)
        except Exception:
            _warn_unhandled_exception()
            cookies = []

        if ns_hdrs and netscape:
            try:
                # RFC 2109 and Netscape cookies
                ns_cookies = self._cookies_from_attrs_set(
                    parse_ns_headers(ns_hdrs), request)
            except Exception:
                _warn_unhandled_exception()
                ns_cookies = []
            self._process_rfc2109_cookies(ns_cookies)

            # Look for Netscape cookies (from Set-Cookie headers) that match
            # corresponding RFC 2965 cookies (from Set-Cookie2 headers).
            # For each match, keep the RFC 2965 cookie and ignore the Netscape
            # cookie (RFC 2965 section 9.1).  Actually, RFC 2109 cookies are
            # bundled in with the Netscape cookies for this purpose, which is
            # reasonable behaviour.
            if rfc2965:
                lookup = {}
                for cookie in cookies:
                    lookup[(cookie.domain, cookie.path, cookie.name)] = None

                def no_matching_rfc2965(ns_cookie, lookup=lookup):
                    key = ns_cookie.domain, ns_cookie.path, ns_cookie.name
                    return key not in lookup
                ns_cookies = [ns_cookie for ns_cookie in ns_cookies
                              if no_matching_rfc2965(ns_cookie)]

            if ns_cookies:
                cookies.extend(ns_cookies)

        return cookies

    def set_cookie_if_ok(self, cookie, request):
        """Set a cookie if policy says it's OK to do so."""
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            if self._policy.set_ok(cookie, request):
                self.set_cookie(cookie)

        finally:
            self._cookies_lock.release()

    def set_cookie(self, cookie):
        """Set a cookie, without checking whether or not it should be set."""
        self._cookies_lock.acquire()
        try:
            # Look the store up only once the lock is held.
            by_domain = self._cookies
            if cookie.domain not in by_domain:
                by_domain[cookie.domain] = {}
            by_path = by_domain[cookie.domain]
            if cookie.path not in by_path:
                by_path[cookie.path] = {}
            by_name = by_path[cookie.path]
            by_name[cookie.name] = cookie
        finally:
            self._cookies_lock.release()

    def extract_cookies(self, response, request):
        """Extract cookies from response, where allowable given the request."""
        _debug("extract_cookies: %s", response.info())
        self._cookies_lock.acquire()
        try:
            self._policy._now = self._now = int(time.time())

            for cookie in self.make_cookies(response, request):
                if self._policy.set_ok(cookie, request):
                    _debug(" setting cookie: %s", cookie)
                    self.set_cookie(cookie)
        finally:
            self._cookies_lock.release()

    def clear(self, domain=None, path=None, name=None):
        """Clear some cookies.

        Invoking this method without arguments will clear all cookies.  If
        given a single argument, only cookies belonging to that domain will be
        removed.  If given two arguments, cookies belonging to the specified
        path within that domain are removed.  If given three arguments, then
        the cookie with the specified name, path and domain is removed.

        Raises KeyError if no matching cookie exists.
        Raises ValueError if name is given without domain and path, or path
        is given without domain.

        """
        if name is not None:
            if (domain is None) or (path is None):
                raise ValueError(
                    "domain and path must be given to remove a cookie by name")
            del self._cookies[domain][path][name]
        elif path is not None:
            if domain is None:
                raise ValueError(
                    "domain must be given to remove cookies by path")
            del self._cookies[domain][path]
        elif domain is not None:
            del self._cookies[domain]
        else:
            self._cookies = {}

    def clear_session_cookies(self):
        """Discard all session cookies.

        Note that the .save() method won't save session cookies anyway, unless
        you ask otherwise by passing a true ignore_discard argument.

        """
        self._cookies_lock.acquire()
        try:
            for cookie in self:
                if cookie.discard:
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def clear_expired_cookies(self):
        """Discard all expired cookies.

        You probably don't need to call this method: expired cookies are never
        sent back to the server (provided you're using DefaultCookiePolicy),
        this method is called by CookieJar itself every so often, and the
        .save() method won't save expired cookies anyway (unless you ask
        otherwise by passing a true ignore_expires argument).

        """
        self._cookies_lock.acquire()
        try:
            now = time.time()
            for cookie in self:
                if cookie.is_expired(now):
                    self.clear(cookie.domain, cookie.path, cookie.name)
        finally:
            self._cookies_lock.release()

    def __iter__(self):
        """Iterate over every stored cookie."""
        return deepvalues(self._cookies)

    def __len__(self):
        """Return number of contained cookies."""
        i = 0
        for cookie in self: i = i + 1
        return i

    def __repr__(self):
        r = []
        for cookie in self: r.append(repr(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))

    def __str__(self):
        r = []
        for cookie in self: r.append(str(cookie))
        return "<%s[%s]>" % (self.__class__, ", ".join(r))
# derives from IOError for backwards-compatibility with Python 2.4.0
class LoadError(IOError):
    """Error raised when a cookies file cannot be loaded."""
class FileCookieJar(CookieJar):
    """CookieJar that can be loaded from and saved to a file.

    This class does not define a file format itself: .save() is not
    implemented here, and .load() delegates the parsing to a
    ._really_load() method that this class does not define.

    """

    def __init__(self, filename=None, delayload=False, policy=None):
        """
        Cookies are NOT loaded from the named file until either the .load() or
        .revert() method is called.

        Raises ValueError if filename is given but is not string-like.

        """
        CookieJar.__init__(self, policy)
        if filename is not None:
            # Duck-type check: anything that can be concatenated with a
            # string is accepted as a filename.
            try:
                filename+""
            except Exception:
                raise ValueError("filename must be string-like")
        self.filename = filename
        self.delayload = bool(delayload)

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Save cookies to a file."""
        raise NotImplementedError()

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        f = open(filename)
        try:
            self._really_load(f, filename, ignore_discard, ignore_expires)
        finally:
            f.close()

    def revert(self, filename=None,
               ignore_discard=False, ignore_expires=False):
        """Clear all cookies and reload cookies from a saved file.

        Raises LoadError (or IOError) if reversion is not successful; the
        object's state will not be altered if this happens.

        """
        if filename is None:
            if self.filename is not None: filename = self.filename
            else: raise ValueError(MISSING_FILENAME_TEXT)

        self._cookies_lock.acquire()
        try:

            old_state = copy.deepcopy(self._cookies)
            self._cookies = {}
            try:
                self.load(filename, ignore_discard, ignore_expires)
            except Exception:
                # Restore the previous cookies whatever went wrong, not
                # only on LoadError/IOError, then let the error propagate.
                self._cookies = old_state
                raise

        finally:
            self._cookies_lock.release()
1788 from _LWPCookieJar import LWPCookieJar, lwp_cookie_str
1789 from _MozillaCookieJar import MozillaCookieJar