Fullscreen support, UI fixes, reset improved
[smpy-maemo.git] / ClientForm.py
blob69ba3f0bb58b45f7f98b6784e33080c8c53f54d2
1 """HTML form handling for web clients.
3 ClientForm is a Python module for handling HTML forms on the client
4 side, useful for parsing HTML forms, filling them in and returning the
5 completed forms to the server. It has developed from a port of Gisle
6 Aas' Perl module HTML::Form, from the libwww-perl library, but the
7 interface is not the same.
9 The most useful docstring is the one for HTMLForm.
11 RFC 1866: HTML 2.0
12 RFC 1867: Form-based File Upload in HTML
13 RFC 2388: Returning Values from Forms: multipart/form-data
14 HTML 3.2 Specification, W3C Recommendation 14 January 1997 (for ISINDEX)
15 HTML 4.01 Specification, W3C Recommendation 24 December 1999
18 Copyright 2002-2007 John J. Lee <jjl@pobox.com>
19 Copyright 2005 Gary Poster
20 Copyright 2005 Zope Corporation
21 Copyright 1998-2000 Gisle Aas.
23 This code is free software; you can redistribute it and/or modify it
24 under the terms of the BSD or ZPL 2.1 licenses (see the file
25 COPYING.txt included with the distribution).
27 """
29 # XXX
30 # Remove parser testing hack
31 # safeUrl()-ize action
32 # Switch to unicode throughout (would be 0.3.x)
33 # See Wichert Akkerman's 2004-01-22 message to c.l.py.
34 # Add charset parameter to Content-type headers? How to find value??
35 # Add some more functional tests
36 # Especially single and multiple file upload on the internet.
37 # Does file upload work when name is missing? Sourceforge tracker form
38 # doesn't like it. Check standards, and test with Apache. Test
39 # binary upload with Apache.
40 # mailto submission & enctype text/plain
41 # I'm not going to fix this unless somebody tells me what real servers
42 # that want this encoding actually expect: If enctype is
43 # application/x-www-form-urlencoded and there's a FILE control present.
44 # Strictly, it should be 'name=data' (see HTML 4.01 spec., section
45 # 17.13.2), but I send "name=" ATM. What about multiple file upload??
47 # Would be nice, but I'm not going to do it myself:
48 # -------------------------------------------------
49 # Maybe a 0.4.x?
50 # Replace by_label etc. with moniker / selector concept. Allows, eg.,
51 # a choice between selection by value / id / label / element
52 # contents. Or choice between matching labels exactly or by
53 # substring. Etc.
54 # Remove deprecated methods.
55 # ...what else?
56 # Work on DOMForm.
57 # XForms? Don't know if there's a need here.
59 __all__ = ['AmbiguityError', 'CheckboxControl', 'Control',
60 'ControlNotFoundError', 'FileControl', 'FormParser', 'HTMLForm',
61 'HiddenControl', 'IgnoreControl', 'ImageControl', 'IsindexControl',
62 'Item', 'ItemCountError', 'ItemNotFoundError', 'Label',
63 'ListControl', 'LocateError', 'Missing', 'NestingRobustFormParser',
64 'ParseError', 'ParseFile', 'ParseFileEx', 'ParseResponse',
65 'ParseResponseEx', 'PasswordControl', 'RadioControl',
66 'RobustFormParser', 'ScalarControl', 'SelectControl',
67 'SubmitButtonControl', 'SubmitControl', 'TextControl',
68 'TextareaControl', 'XHTMLCompatibleFormParser']
70 try: True
71 except NameError:
72 True = 1
73 False = 0
75 try: bool
76 except NameError:
77 def bool(expr):
78 if expr: return True
79 else: return False
try:
    import logging
    import inspect
except ImportError:
    def debug(msg, *args, **kwds):
        """No-op stand-in used when logging/inspect cannot be imported."""
        pass
else:
    _logger = logging.getLogger("ClientForm")
    # While true, debug() returns at once and skips the (slow) stack
    # inspection.  _show_debug_messages() switches it off.
    OPTIMIZATION_HACK = True

    def debug(msg, *args, **kwds):
        """Log msg at DEBUG level, prefixed with the caller's function name.

        msg is a %-style format string and args are its arguments; kwds are
        passed through to Logger.debug().  Does nothing while
        OPTIMIZATION_HACK is true.
        """
        if OPTIMIZATION_HACK:
            return

        caller_name = inspect.stack()[1][3]
        # '%%s' leaves a literal '%s' slot in front of msg for the caller name
        extended_msg = '%%s %s' % msg
        extended_args = (caller_name,)+args
        _logger.debug(extended_msg, *extended_args, **kwds)

    def _show_debug_messages():
        """Enable debug() output and send it to sys.stdout."""
        global OPTIMIZATION_HACK
        # imported here because this definition precedes the module-level
        # "import sys"
        import sys
        OPTIMIZATION_HACK = False
        _logger.setLevel(logging.DEBUG)
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG)
        _logger.addHandler(handler)
108 import sys, urllib, urllib2, types, mimetools, copy, urlparse, \
109 htmlentitydefs, re, random
110 from cStringIO import StringIO
112 import sgmllib
113 # monkeypatch to fix http://www.python.org/sf/803422 :-(
114 sgmllib.charref = re.compile("&#(x?[0-9a-fA-F]+)[^0-9a-fA-F]")
116 # HTMLParser.HTMLParser is recent, so live without it if it's not available
117 # (also, sgmllib.SGMLParser is much more tolerant of bad HTML)
118 try:
119 import HTMLParser
120 except ImportError:
121 HAVE_MODULE_HTMLPARSER = False
122 else:
123 HAVE_MODULE_HTMLPARSER = True
try:
    import warnings
except ImportError:
    def deprecation(message, stack_offset=0):
        """Do nothing: the warnings module is not available."""
        pass
else:
    def deprecation(message, stack_offset=0):
        """Emit a DeprecationWarning carrying message.

        stack_offset is added to the base stack level of 3 so that wrappers
        can attribute the warning to a frame further up the call stack.
        """
        depth = 3 + stack_offset
        warnings.warn(message, DeprecationWarning, stacklevel=depth)
134 VERSION = "0.2.7"
136 CHUNK = 1024 # size of chunks fed to parser, in bytes
138 DEFAULT_ENCODING = "latin-1"
140 class Missing: pass
_compress_re = re.compile(r"\s+")


def compress_text(text):
    """Strip text and collapse every internal whitespace run to one space."""
    trimmed = text.strip()
    return _compress_re.sub(" ", trimmed)
def normalize_line_endings(text):
    """Return text with every lone LF or lone CR replaced by CRLF.

    Existing CRLF pairs are left as they are.
    """
    lone_lf = r"(?:(?<!\r)\n)"  # LF not preceded by CR
    lone_cr = r"(?:\r(?!\n))"   # CR not followed by LF
    return re.sub(lone_lf + "|" + lone_cr, "\r\n", text)
149 # This version of urlencode is from my Python 1.5.2 back-port of the
150 # Python 2.1 CVS maintenance branch of urllib. It will accept a sequence
151 # of pairs instead of a mapping -- the 2.0 version only accepts a mapping.
def urlencode(query,doseq=False,):
    """Encode a mapping or a sequence of two-element tuples as a URL query
    string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    Raises TypeError if query is neither a mapping nor a non-string
    sequence whose first element is a tuple.
    """
    if hasattr(query,"items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items do not work with len(); non-empty strings
            # fail the tuple check.  Zero-length sequences of all types
            # succeed, which matches the treatment of empty dicts.
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError()
        except TypeError:
            raise TypeError("not a valid non-string sequence or mapping "
                            "object")

    l = []
    if not doseq:
        # preserve old behavior: every value is simply str()-ed
        for k, v in query:
            k = urllib.quote_plus(str(k))
            v = urllib.quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = urllib.quote_plus(str(k))
            if isinstance(v, types.StringType):
                l.append(k + '=' + urllib.quote_plus(v))
            elif isinstance(v, types.UnicodeType):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = urllib.quote_plus(v.encode("ASCII","replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    len(v)
                except TypeError:
                    # not a sequence
                    l.append(k + '=' + urllib.quote_plus(str(v)))
                else:
                    # one parameter per sequence element
                    for elt in v:
                        l.append(k + '=' + urllib.quote_plus(str(elt)))
    return '&'.join(l)
def unescape(data, entities, encoding=DEFAULT_ENCODING):
    """Replace entity and character references found in data.

    entities maps complete references (e.g. "&amp;") to their replacement.
    Non-str replacements are encoded with encoding.  A reference that is
    unknown, or whose replacement cannot be encoded, is left as it was.
    data is returned unchanged when it is None or contains no "&".
    """
    if data is None or "&" not in data:
        return data

    def replace_entities(match, entities=entities, encoding=encoding):
        ent = match.group()
        if ent[1] == "#":
            # numeric character reference: drop the "&#" and ";"
            return unescape_charref(ent[2:-1], encoding)

        repl = entities.get(ent)
        if repl is None:
            return ent
        if type(repl) == type(""):
            return repl
        try:
            return repl.encode(encoding)
        except UnicodeError:
            return ent

    return re.sub(r"&#?[A-Za-z0-9]+?;", replace_entities, data)
def unescape_charref(data, encoding):
    """Convert the body of a numeric character reference to a character.

    data is the text between "&#" and ";", e.g. "65" or "x41" (a leading
    "x" selects hexadecimal).  With encoding None the unicode character is
    returned; otherwise it is encoded with encoding.

    If data is not a valid number in its base, is outside the range
    accepted by unichr(), or cannot be represented in encoding, the
    reference is returned verbatim as "&#<data>;" instead of raising.
    """
    name, base = data, 10
    if name.startswith("x"):
        name, base= name[1:], 16
    try:
        codepoint = int(name, base)
        uc = unichr(codepoint)
    except (ValueError, OverflowError):
        # malformed markup such as "&#ff;" or "&#99999999999;"
        return "&#%s;" % data
    if encoding is None:
        return uc
    try:
        repl = uc.encode(encoding)
    except UnicodeError:
        repl = "&#%s;" % data
    return repl
def get_entitydefs():
    """Build a mapping from "&name;" to the unicode character it denotes.

    Uses htmlentitydefs.name2codepoint when the module provides it, and
    otherwise decodes the latin-1 / character-reference values found in
    htmlentitydefs.entitydefs.
    """
    import htmlentitydefs
    from codecs import latin_1_decode

    result = {}
    try:
        codepoints = htmlentitydefs.name2codepoint
    except AttributeError:
        # older htmlentitydefs: values are latin-1 bytes or "&#NNN;" strings
        for name, char in htmlentitydefs.entitydefs.items():
            uc = latin_1_decode(char)[0]
            if uc.startswith("&#") and uc.endswith(";"):
                uc = unescape_charref(uc[2:-1], None)
            result["&%s;" % name] = uc
        return result

    for name, codepoint in codepoints.items():
        result["&%s;" % name] = unichr(codepoint)
    return result
def issequence(x):
    """Return True if x supports integer indexing.

    An empty sequence (x[0] raises IndexError) counts; objects whose
    x[0] raises TypeError or KeyError, such as ints and dicts, do not.
    """
    try:
        x[0]
    except IndexError:
        # indexable, just empty
        return True
    except (TypeError, KeyError):
        return False
    else:
        return True
def isstringlike(x):
    """Return True if x can be concatenated with a str (x + "" works)."""
    try:
        x+""
    except Exception:
        # a failed concatenation means "not string-like"; KeyboardInterrupt
        # and SystemExit are deliberately allowed to propagate
        return False
    else:
        return True
289 def choose_boundary():
290 """Return a string usable as a multipart boundary."""
291 # follow IE and firefox
292 nonce = "".join([str(random.randint(0, sys.maxint-1)) for i in 0,1,2])
293 return "-"*27 + nonce
295 # This cut-n-pasted MimeWriter from standard library is here so can add
296 # to HTTP headers rather than message body when appropriate. It also uses
297 # \r\n in place of \n. This is a bit nasty.
class MimeWriter:

    """Generic MIME writer.

    Methods:

    __init__()
    addheader()
    flushheaders()
    startbody()
    startmultipartbody()
    nextpart()
    lastpart()

    A MIME writer is much more primitive than a MIME parser.  It
    doesn't seek around on the output file, and it doesn't use large
    amounts of buffer space, so you have to write the parts in the
    order they should occur on the output file.  It does buffer the
    headers you add, allowing you to rearrange their order.

    General usage is:

    f = <open the output file>
    w = MimeWriter(f)
    ...call w.addheader(key, value) 0 or more times...

    followed by either:

    f = w.startbody(content_type)
    ...call f.write(data) for body data...

    or:

    w.startmultipartbody(subtype)
    for each part:
        subwriter = w.nextpart()
        ...use the subwriter's methods to create the subpart...
    w.lastpart()

    The subwriter is another MimeWriter instance, and should be
    treated in the same way as the toplevel MimeWriter.  This way,
    writing recursive body parts is easy.

    Warning: don't forget to call lastpart()!

    XXX There should be more state so calls made in the wrong order
    are detected.

    Some special cases:

    - startbody() just returns the file passed to the constructor;
      but don't use this knowledge, as it may be changed.

    - startmultipartbody() actually returns a file as well;
      this can be used to write the initial 'if you can read this your
      mailer is not MIME-aware' message.

    - If you call flushheaders(), the headers accumulated so far are
      written out (and forgotten); this is useful if you don't need a
      body part at all, e.g. for a subpart of type message/rfc822
      that's (mis)used to store some header-like information.

    - Passing a keyword argument 'prefix=<flag>' to addheader(),
      start*body() affects where the header is inserted; 0 means
      append at the end, 1 means insert at the start; default is
      append for addheader(), but insert for start*body(), which use
      it to determine where the Content-type header goes.

    """

    def __init__(self, fp, http_hdrs=None):
        """Write to fp; http_hdrs is a list receiving (key, value) pairs
        for headers added with add_to_http_hdrs true."""
        self._http_hdrs = http_hdrs
        self._fp = fp
        self._headers = []
        self._boundary = []       # stack of open multipart boundaries
        self._first_part = True   # no boundary written yet for this body

    def addheader(self, key, value, prefix=0,
                  add_to_http_hdrs=0):
        """Buffer a header, or append it to the HTTP header list.

        prefix is ignored if add_to_http_hdrs is true.
        """
        lines = value.split("\r\n")
        # drop empty leading and trailing lines
        while lines and not lines[-1]: del lines[-1]
        while lines and not lines[0]: del lines[0]
        if add_to_http_hdrs:
            # HTTP headers are stored unfolded, as a single line
            value = "".join(lines)
            self._http_hdrs.append((key, value))
        else:
            # continuation lines are folded with one leading space
            for i in range(1, len(lines)):
                lines[i] = "    " + lines[i].strip()
            value = "\r\n".join(lines) + "\r\n"
            line = key + ": " + value
            if prefix:
                self._headers.insert(0, line)
            else:
                self._headers.append(line)

    def flushheaders(self):
        """Write the buffered headers to the file and forget them."""
        self._fp.writelines(self._headers)
        self._headers = []

    def startbody(self, ctype=None, plist=None, prefix=1,
                  add_to_http_hdrs=0, content_type=1):
        """Emit the Content-type header (if any), flush the headers and
        return the file object to write the body to.

        plist is an optional sequence of (name, value) parameters appended
        to ctype.  prefix is ignored if add_to_http_hdrs is true.
        """
        if content_type and ctype:
            for name, value in (plist or ()):
                ctype = ctype + ';\r\n %s=%s' % (name, value)
            self.addheader("Content-type", ctype, prefix=prefix,
                           add_to_http_hdrs=add_to_http_hdrs)
        self.flushheaders()
        # the blank line separating headers from body is only needed when
        # the headers went to the file
        if not add_to_http_hdrs: self._fp.write("\r\n")
        self._first_part = True
        return self._fp

    def startmultipartbody(self, subtype, boundary=None, plist=None, prefix=1,
                           add_to_http_hdrs=0, content_type=1):
        """Start a multipart/<subtype> body and return the file object.

        A boundary is generated with choose_boundary() unless one is given.
        """
        boundary = boundary or choose_boundary()
        self._boundary.append(boundary)
        return self.startbody("multipart/" + subtype,
                              [("boundary", boundary)] + list(plist or ()),
                              prefix=prefix,
                              add_to_http_hdrs=add_to_http_hdrs,
                              content_type=content_type)

    def nextpart(self):
        """Write the boundary opening the next part and return a new
        writer (of the same class, on the same file) for that part."""
        boundary = self._boundary[-1]
        if self._first_part:
            self._first_part = False
        else:
            self._fp.write("\r\n")
        self._fp.write("--" + boundary + "\r\n")
        return self.__class__(self._fp)

    def lastpart(self):
        """Write the closing boundary of the current multipart body."""
        if self._first_part:
            # no part was written: still emit one opening boundary
            self.nextpart()
        boundary = self._boundary.pop()
        self._fp.write("\r\n--" + boundary + "--\r\n")
# LocateError is the common base of the three "locate" failures below;
# ItemCountError is a separate ValueError subclass.
class LocateError(ValueError):
    pass


class AmbiguityError(LocateError):
    pass


class ControlNotFoundError(LocateError):
    pass


class ItemNotFoundError(LocateError):
    pass


class ItemCountError(ValueError):
    pass
448 # for backwards compatibility, ParseError derives from exceptions that were
449 # raised by versions of ClientForm <= 0.2.5
# SGMLLIB_PARSEERROR is the exception the sgmllib-based parsers catch and
# re-raise as ParseError; ParseError inherits from every parser exception
# available so that handlers written for older releases keep matching.
if HAVE_MODULE_HTMLPARSER:
    SGMLLIB_PARSEERROR = sgmllib.SGMLParseError
    class ParseError(sgmllib.SGMLParseError,
                     HTMLParser.HTMLParseError,
                     ):
        pass
elif hasattr(sgmllib, "SGMLParseError"):
    SGMLLIB_PARSEERROR = sgmllib.SGMLParseError
    class ParseError(sgmllib.SGMLParseError):
        pass
else:
    # this sgmllib has no dedicated exception class
    SGMLLIB_PARSEERROR = RuntimeError
    class ParseError(RuntimeError):
        pass
class _AbstractFormParser:
    """Parser mix-in that collects form data; results are in self.forms.

    Each entry of self.forms is a tuple
    ((name, action, method, enctype), attrs, controls), where controls is a
    list of (type, name, attrs) tuples.  forms[0] holds the controls found
    outside any FORM element.  Labels with a "for" attribute are collected
    in self.labels.

    Subclasses must provide unescape_attr_if_required().
    """
    # thanks to Moshe Zadka for an example of sgmllib/htmllib usage
    def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
        """entitydefs maps "&name;" to replacement text (defaults to
        get_entitydefs()); encoding is used when unescaping references."""
        if entitydefs is None:
            entitydefs = get_entitydefs()
        self._entitydefs = entitydefs
        self._encoding = encoding

        self.base = None
        self.forms = []
        self.labels = []
        self._current_label = None
        self._current_form = None
        self._select = None
        self._optgroup = None
        self._option = None
        self._textarea = None

        # forms[0] will contain all controls that are outside of any form
        # self._global_form is an alias for self.forms[0]
        self._global_form = None
        self.start_form([])
        self.end_form()
        self._current_form = self._global_form = self.forms[0]

    def do_base(self, attrs):
        """Record the href of a BASE element in self.base."""
        debug("%s", attrs)
        for key, value in attrs:
            if key == "href":
                self.base = self.unescape_attr_if_required(value)

    def end_body(self):
        """Close any label or form still open at the end of BODY."""
        debug("")
        if self._current_label is not None:
            self.end_label()
        if self._current_form is not self._global_form:
            self.end_form()

    def start_form(self, attrs):
        """Begin a new form; raises ParseError if one is already open."""
        debug("%s", attrs)
        if self._current_form is not self._global_form:
            raise ParseError("nested FORMs")
        name = None
        action = None
        enctype = "application/x-www-form-urlencoded"
        method = "GET"
        d = {}
        for key, value in attrs:
            if key == "name":
                name = self.unescape_attr_if_required(value)
            elif key == "action":
                action = self.unescape_attr_if_required(value)
            elif key == "method":
                method = self.unescape_attr_if_required(value.upper())
            elif key == "enctype":
                enctype = self.unescape_attr_if_required(value.lower())
            d[key] = self.unescape_attr_if_required(value)
        controls = []
        self._current_form = (name, action, method, enctype), d, controls

    def end_form(self):
        """Store the open form in self.forms and revert to the global form."""
        debug("")
        if self._current_label is not None:
            self.end_label()
        if self._current_form is self._global_form:
            raise ParseError("end of FORM before start")
        self.forms.append(self._current_form)
        self._current_form = self._global_form

    def start_select(self, attrs):
        """Begin a SELECT and record a control entry for it."""
        debug("%s", attrs)
        if self._select is not None:
            raise ParseError("nested SELECTs")
        if self._textarea is not None:
            raise ParseError("SELECT inside TEXTAREA")
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)

        self._select = d
        self._add_label(d)

        # entry carrying only the SELECT's own attributes
        self._append_select_control({"__select": d})

    def end_select(self):
        """Finish the current SELECT, closing a pending OPTION first."""
        debug("")
        if self._select is None:
            raise ParseError("end of SELECT before start")

        if self._option is not None:
            self._end_option()

        self._select = None

    def start_optgroup(self, attrs):
        """Remember the attributes of the OPTGROUP being entered."""
        debug("%s", attrs)
        if self._select is None:
            raise ParseError("OPTGROUP outside of SELECT")
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)

        self._optgroup = d

    def end_optgroup(self):
        """Leave the current OPTGROUP."""
        debug("")
        if self._optgroup is None:
            raise ParseError("end of OPTGROUP before start")
        self._optgroup = None

    def _start_option(self, attrs):
        """Begin an OPTION, implicitly closing a previous unclosed one."""
        debug("%s", attrs)
        if self._select is None:
            raise ParseError("OPTION outside of SELECT")
        if self._option is not None:
            self._end_option()

        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)

        self._option = {}
        self._option.update(d)
        # an option inherits "disabled" from its enclosing OPTGROUP
        if (self._optgroup and self._optgroup.has_key("disabled") and
            not self._option.has_key("disabled")):
            self._option["disabled"] = None

    def _end_option(self):
        """Finish the current OPTION and append it to the form's controls.

        The stripped text content supplies the default value and label.
        """
        debug("")
        if self._option is None:
            raise ParseError("end of OPTION before start")

        contents = self._option.get("contents", "").strip()
        self._option["contents"] = contents
        if not self._option.has_key("value"):
            self._option["value"] = contents
        if not self._option.has_key("label"):
            self._option["label"] = contents
        # stuff dict of SELECT HTML attrs into a special private key
        #  (gets deleted again later)
        self._option["__select"] = self._select
        self._append_select_control(self._option)
        self._option = None

    def _append_select_control(self, attrs):
        """Append a ("select", name, attrs) entry to the current form."""
        debug("%s", attrs)
        controls = self._current_form[2]
        name = self._select.get("name")
        controls.append(("select", name, attrs))

    def start_textarea(self, attrs):
        """Begin a TEXTAREA; its text is accumulated by handle_data()."""
        debug("%s", attrs)
        if self._textarea is not None:
            raise ParseError("nested TEXTAREAs")
        if self._select is not None:
            raise ParseError("TEXTAREA inside SELECT")
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        self._add_label(d)

        self._textarea = d

    def end_textarea(self):
        """Append the finished TEXTAREA to the current form's controls."""
        debug("")
        if self._textarea is None:
            raise ParseError("end of TEXTAREA before start")
        controls = self._current_form[2]
        name = self._textarea.get("name")
        controls.append(("textarea", name, self._textarea))
        self._textarea = None

    def start_label(self, attrs):
        """Begin a LABEL, closing any label that is still open."""
        debug("%s", attrs)
        if self._current_label:
            self.end_label()
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        taken = bool(d.get("for"))  # empty id is invalid
        d["__text"] = ""
        d["__taken"] = taken
        if taken:
            self.labels.append(d)
        self._current_label = d

    def end_label(self):
        """Close the current LABEL, if there is one."""
        debug("")
        label = self._current_label
        if label is None:
            # something is ugly in the HTML, but we're ignoring it
            return
        self._current_label = None
        # if it is staying around, it is True in all cases
        del label["__taken"]

    def _add_label(self, d):
        """Attach the open label to control attrs d, unless it is taken."""
        #debug("%s", d)
        if self._current_label is not None:
            if self._current_label["__taken"]:
                self.end_label()  # be fuzzy
            else:
                self._current_label["__taken"] = True
                d["__label"] = self._current_label

    def handle_data(self, data):
        """Accumulate text for the open OPTION, TEXTAREA or LABEL."""
        debug("%s", data)

        if self._option is not None:
            # self._option is a dictionary of the OPTION element's HTML
            # attributes, but it has two special keys, one of which is the
            # special "contents" key contains text between OPTION tags (the
            # other is the "__select" key: see the end_option method)
            map = self._option
            key = "contents"
        elif self._textarea is not None:
            map = self._textarea
            key = "value"
            data = normalize_line_endings(data)
        # not if within option or textarea
        elif self._current_label is not None:
            map = self._current_label
            key = "__text"
        else:
            return

        if not map.has_key(key):
            # according to http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.1
            # line break immediately after start tags or immediately before end
            # tags must be ignored, but real browsers only ignore a line break
            # after a start tag, so we'll do that.  Only the first chunk of
            # text (nothing stored yet) can follow the start tag, and only a
            # single line break is dropped.
            if data[0:2] == "\r\n":
                data = data[2:]
            elif data[0:1] in ["\n", "\r"]:
                data = data[1:]
            map[key] = data
        else:
            map[key] = map[key] + data

    def do_button(self, attrs):
        """Record a BUTTON element as a control of type "<type>button"."""
        debug("%s", attrs)
        d = {}
        d["type"] = "submit"  # default
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        controls = self._current_form[2]

        name = d.get("name")
        # we don't want to lose information, so use a type string that
        # doesn't clash with INPUT TYPE={SUBMIT,RESET,BUTTON}
        # e.g. type for BUTTON/RESET is "resetbutton"
        #     (type for INPUT/RESET is "reset")
        control_type = d["type"]+"button"
        self._add_label(d)
        controls.append((control_type, name, d))

    def do_input(self, attrs):
        """Record an INPUT element; the type defaults to "text"."""
        debug("%s", attrs)
        d = {}
        d["type"] = "text"  # default
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        controls = self._current_form[2]

        control_type = d["type"]
        name = d.get("name")
        self._add_label(d)
        controls.append((control_type, name, d))

    def do_isindex(self, attrs):
        """Record an ISINDEX element."""
        debug("%s", attrs)
        d = {}
        for key, val in attrs:
            d[key] = self.unescape_attr_if_required(val)
        controls = self._current_form[2]

        self._add_label(d)
        # isindex doesn't have type or name HTML attributes
        controls.append(("isindex", None, d))

    def handle_entityref(self, name):
        """Feed the replacement of entity reference &name; to handle_data."""
        #debug("%s", name)
        self.handle_data(unescape(
            '&%s;' % name, self._entitydefs, self._encoding))

    def handle_charref(self, name):
        """Feed the character for reference &#name; to handle_data."""
        #debug("%s", name)
        self.handle_data(unescape_charref(name, self._encoding))

    def unescape_attr(self, name):
        """Return the string name with its references unescaped."""
        #debug("%s", name)
        return unescape(name, self._entitydefs, self._encoding)

    def unescape_attrs(self, attrs):
        """Return a copy of dict attrs with every value unescaped.

        Values that are themselves mappings are processed recursively.
        """
        #debug("%s", attrs)
        escaped_attrs = {}
        for key, val in attrs.items():
            try:
                val.items
            except AttributeError:
                escaped_attrs[key] = self.unescape_attr(val)
            else:
                # e.g. "__select" -- yuck!
                escaped_attrs[key] = self.unescape_attrs(val)
        return escaped_attrs

    # unknown references are passed through to the data verbatim
    def unknown_entityref(self, ref): self.handle_data("&%s;" % ref)
    def unknown_charref(self, ref): self.handle_data("&#%s;" % ref)
781 if not HAVE_MODULE_HTMLPARSER:
782 class XHTMLCompatibleFormParser:
783 def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
784 raise ValueError("HTMLParser could not be imported")
785 else:
786 class XHTMLCompatibleFormParser(_AbstractFormParser, HTMLParser.HTMLParser):
787 """Good for XHTML, bad for tolerance of incorrect HTML."""
788 # thanks to Michael Howitz for this!
789 def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
790 HTMLParser.HTMLParser.__init__(self)
791 _AbstractFormParser.__init__(self, entitydefs, encoding)
793 def feed(self, data):
794 try:
795 HTMLParser.HTMLParser.feed(self, data)
796 except HTMLParser.HTMLParseError, exc:
797 raise ParseError(exc)
799 def start_option(self, attrs):
800 _AbstractFormParser._start_option(self, attrs)
802 def end_option(self):
803 _AbstractFormParser._end_option(self)
805 def handle_starttag(self, tag, attrs):
806 try:
807 method = getattr(self, "start_" + tag)
808 except AttributeError:
809 try:
810 method = getattr(self, "do_" + tag)
811 except AttributeError:
812 pass # unknown tag
813 else:
814 method(attrs)
815 else:
816 method(attrs)
818 def handle_endtag(self, tag):
819 try:
820 method = getattr(self, "end_" + tag)
821 except AttributeError:
822 pass # unknown tag
823 else:
824 method()
826 def unescape(self, name):
827 # Use the entitydefs passed into constructor, not
828 # HTMLParser.HTMLParser's entitydefs.
829 return self.unescape_attr(name)
831 def unescape_attr_if_required(self, name):
832 return name # HTMLParser.HTMLParser already did it
833 def unescape_attrs_if_required(self, attrs):
834 return attrs # ditto
class _AbstractSgmllibParser(_AbstractFormParser):
    """Adaptations of _AbstractFormParser shared by sgmllib-based parsers."""

    def do_option(self, attrs):
        _AbstractFormParser._start_option(self, attrs)

    if sys.version_info[:2] < (2,5):
        # attribute values arrive still escaped, so unescape them here
        def unescape_attr_if_required(self, name):
            return self.unescape_attr(name)
        def unescape_attrs_if_required(self, attrs):
            return self.unescape_attrs(attrs)
    else:
        # we override this attr to decode hex charrefs
        entity_or_charref = re.compile(
            '&(?:([a-zA-Z][-.a-zA-Z0-9]*)|#(x?[0-9a-fA-F]+))(;?)')
        def convert_entityref(self, name):
            return unescape("&%s;" % name, self._entitydefs, self._encoding)
        def convert_charref(self, name):
            return unescape_charref("%s" % name, self._encoding)
        def unescape_attr_if_required(self, name):
            return name  # sgmllib already did it
        def unescape_attrs_if_required(self, attrs):
            return attrs  # ditto
861 class FormParser(_AbstractSgmllibParser, sgmllib.SGMLParser):
862 """Good for tolerance of incorrect HTML, bad for XHTML."""
863 def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
864 sgmllib.SGMLParser.__init__(self)
865 _AbstractFormParser.__init__(self, entitydefs, encoding)
867 def feed(self, data):
868 try:
869 sgmllib.SGMLParser.feed(self, data)
870 except SGMLLIB_PARSEERROR, exc:
871 raise ParseError(exc)
875 # sigh, must support mechanize by allowing dynamic creation of classes based on
876 # its bundled copy of BeautifulSoup (which was necessary because of dependency
877 # problems)
879 def _create_bs_classes(bs,
880 icbinbs,
882 class _AbstractBSFormParser(_AbstractSgmllibParser):
883 bs_base_class = None
884 def __init__(self, entitydefs=None, encoding=DEFAULT_ENCODING):
885 _AbstractFormParser.__init__(self, entitydefs, encoding)
886 self.bs_base_class.__init__(self)
887 def handle_data(self, data):
888 _AbstractFormParser.handle_data(self, data)
889 self.bs_base_class.handle_data(self, data)
890 def feed(self, data):
891 try:
892 self.bs_base_class.feed(self, data)
893 except SGMLLIB_PARSEERROR, exc:
894 raise ParseError(exc)
897 class RobustFormParser(_AbstractBSFormParser, bs):
898 """Tries to be highly tolerant of incorrect HTML."""
899 pass
900 RobustFormParser.bs_base_class = bs
901 class NestingRobustFormParser(_AbstractBSFormParser, icbinbs):
902 """Tries to be highly tolerant of incorrect HTML.
904 Different from RobustFormParser in that it more often guesses nesting
905 above missing end tags (see BeautifulSoup docs).
908 pass
909 NestingRobustFormParser.bs_base_class = icbinbs
911 return RobustFormParser, NestingRobustFormParser
913 try:
914 if sys.version_info[:2] < (2, 2):
915 raise ImportError # BeautifulSoup uses generators
916 import BeautifulSoup
917 except ImportError:
918 pass
919 else:
920 RobustFormParser, NestingRobustFormParser = _create_bs_classes(
921 BeautifulSoup.BeautifulSoup, BeautifulSoup.ICantBelieveItsBeautifulSoup
925 #FormParser = XHTMLCompatibleFormParser # testing hack
926 #FormParser = RobustFormParser # testing hack
def ParseResponseEx(response,
                    select_default=False,
                    form_parser_class=FormParser,
                    request_class=urllib2.Request,
                    entitydefs=None,
                    encoding=DEFAULT_ENCODING,

                    # private
                    _urljoin=urlparse.urljoin,
                    _urlparse=urlparse.urlparse,
                    _urlunparse=urlparse.urlunparse,
                    ):
    """Identical to ParseResponse, except that:

    1. The returned list contains an extra item.  The first form in the list
    contains all controls not contained in any FORM element.

    2. The arguments ignore_errors and backwards_compat have been removed.

    3. Backwards-compatibility mode (backwards_compat=True) is not available.
    """
    return _ParseFileEx(
        response, response.geturl(),
        select_default=select_default,
        ignore_errors=False,
        form_parser_class=form_parser_class,
        request_class=request_class,
        entitydefs=entitydefs,
        backwards_compat=False,
        encoding=encoding,
        _urljoin=_urljoin,
        _urlparse=_urlparse,
        _urlunparse=_urlunparse,
        )
def ParseFileEx(file, base_uri,
                select_default=False,
                form_parser_class=FormParser,
                request_class=urllib2.Request,
                entitydefs=None,
                encoding=DEFAULT_ENCODING,

                # private
                _urljoin=urlparse.urljoin,
                _urlparse=urlparse.urlparse,
                _urlunparse=urlparse.urlunparse,
                ):
    """Identical to ParseFile, except that:

    1. The returned list contains an extra item.  The first form in the list
    contains all controls not contained in any FORM element.

    2. The arguments ignore_errors and backwards_compat have been removed.

    3. Backwards-compatibility mode (backwards_compat=True) is not available.
    """
    return _ParseFileEx(
        file, base_uri,
        select_default=select_default,
        ignore_errors=False,
        form_parser_class=form_parser_class,
        request_class=request_class,
        entitydefs=entitydefs,
        backwards_compat=False,
        encoding=encoding,
        _urljoin=_urljoin,
        _urlparse=_urlparse,
        _urlunparse=_urlunparse,
        )
def ParseResponse(response, *args, **kwds):
    """Parse HTTP response and return a list of HTMLForm instances.

    The return value of urllib2.urlopen can be conveniently passed to this
    function as the response parameter.

    ClientForm.ParseError is raised on parse errors.

    response: file-like object (supporting read() method) with a method
     geturl(), returning the URI of the HTTP response
    select_default: for multiple-selection SELECT controls and RADIO controls,
     pick the first item as the default if none are selected in the HTML
    form_parser_class: parser class to instantiate (see the note on parsers
     below)
    request_class: class to return from .click() method (default is
     urllib2.Request)
    entitydefs: mapping like {"&amp;": "&", ...} containing HTML entity
     definitions (a sensible default is used)
    encoding: character encoding used for encoding numeric character references
     when matching link text.  ClientForm does not attempt to find the encoding
     in a META HTTP-EQUIV attribute in the document itself (mechanize, for
     example, does do that and will pass the correct value to ClientForm using
     this parameter).

    backwards_compat: boolean that determines whether the returned HTMLForm
     objects are backwards-compatible with old code.  If backwards_compat is
     true:

     - ClientForm 0.1 code will continue to work as before.

     - Label searches that do not specify a nr (number or count) will always
       get the first match, even if other controls match.  If
       backwards_compat is False, label searches that have ambiguous results
       will raise an AmbiguityError.

     - Item label matching is done by strict string comparison rather than
       substring matching.

     - De-selecting individual list items is allowed even if the Item is
       disabled.

    The backwards_compat argument will be deprecated in a future release.

    Pass a true value for select_default if you want the behaviour specified by
    RFC 1866 (the HTML 2.0 standard), which is to select the first item in a
    RADIO or multiple-selection SELECT control if none were selected in the
    HTML.  Most browsers (including Microsoft Internet Explorer (IE) and
    Netscape Navigator) instead leave all items unselected in these cases.  The
    W3C HTML 4.0 standard leaves this behaviour undefined in the case of
    multiple-selection SELECT controls, but insists that at least one RADIO
    button should be checked at all times, in contradiction to browser
    behaviour.

    There is a choice of parsers.  ClientForm.XHTMLCompatibleFormParser (uses
    HTMLParser.HTMLParser) works best for XHTML, ClientForm.FormParser (uses
    sgmllib.SGMLParser) (the default) works better for ordinary grubby HTML.
    Note that HTMLParser is only available in Python 2.2 and later.  You can
    pass your own class in here as a hack to work around bad HTML, but at your
    own risk: there is no well-defined interface.

    """
    all_forms = _ParseFileEx(response, response.geturl(), *args, **kwds)
    # the first entry collects controls outside any FORM; callers of this
    # function do not get it
    return all_forms[1:]
def ParseFile(file, base_uri, *args, **kwds):
    """Parse an HTML document and return its forms as HTMLForm instances.

    ClientForm.ParseError is raised on parse errors.

    file: file-like object (supporting read() method) holding HTML that
     contains zero or more forms
    base_uri: the URI of the document (a BASE element in the document, if
     present, takes precedence over this when the form is submitted)

    All remaining positional and keyword arguments are passed through
    unchanged; see ParseResponse.__doc__ for their meaning.

    """
    parsed = _ParseFileEx(file, base_uri, *args, **kwds)
    # The first entry of the parse result is never handed to the caller.
    # NOTE(review): presumably entry 0 collects controls found outside any
    # FORM element -- confirm against the parser class.
    return parsed[1:]
1075 def _ParseFileEx(file, base_uri,
1076 select_default=False,
1077 ignore_errors=False,
1078 form_parser_class=FormParser,
1079 request_class=urllib2.Request,
1080 entitydefs=None,
1081 backwards_compat=True,
1082 encoding=DEFAULT_ENCODING,
1083 _urljoin=urlparse.urljoin,
1084 _urlparse=urlparse.urlparse,
1085 _urlunparse=urlparse.urlunparse,
1087 if backwards_compat:
1088 deprecation("operating in backwards-compatibility mode", 1)
1089 fp = form_parser_class(entitydefs, encoding)
1090 while 1:
1091 data = file.read(CHUNK)
1092 try:
1093 fp.feed(data)
1094 except ParseError, e:
1095 e.base_uri = base_uri
1096 raise
1097 if len(data) != CHUNK: break
1098 if fp.base is not None:
1099 # HTML BASE element takes precedence over document URI
1100 base_uri = fp.base
1101 labels = [] # Label(label) for label in fp.labels]
1102 id_to_labels = {}
1103 for l in fp.labels:
1104 label = Label(l)
1105 labels.append(label)
1106 for_id = l["for"]
1107 coll = id_to_labels.get(for_id)
1108 if coll is None:
1109 id_to_labels[for_id] = [label]
1110 else:
1111 coll.append(label)
1112 forms = []
1113 for (name, action, method, enctype), attrs, controls in fp.forms:
1114 if action is None:
1115 action = base_uri
1116 else:
1117 action = _urljoin(base_uri, action)
1118 # would be nice to make HTMLForm class (form builder) pluggable
1119 form = HTMLForm(
1120 action, method, enctype, name, attrs, request_class,
1121 forms, labels, id_to_labels, backwards_compat)
1122 form._urlparse = _urlparse
1123 form._urlunparse = _urlunparse
1124 for ii in range(len(controls)):
1125 type, name, attrs = controls[ii]
1126 # index=ii*10 allows ImageControl to return multiple ordered pairs
1127 form.new_control(
1128 type, name, attrs, select_default=select_default, index=ii*10)
1129 forms.append(form)
1130 for form in forms:
1131 form.fixup()
1132 return forms
class Label:
    """Text label attached to a control or list item.

    Public attributes:

    id: value of the label's "for" attribute (None if absent)
    attrs: the attribute mapping the label was built from
    text: read-only; the stripped label text when the _backwards_compat flag
     is true, otherwise the result of compress_text() on that text

    """

    def __init__(self, attrs):
        self.id = attrs.get("for")
        stripped = attrs.get("__text").strip()
        self._text = stripped
        self._ctext = compress_text(stripped)
        self.attrs = attrs
        # maintained by HTMLForm
        self._backwards_compat = False

    def __getattr__(self, name):
        # Only reached for names missing from the instance dictionary.
        if name != "text":
            return getattr(Label, name)
        if self._backwards_compat:
            return self._text
        return self._ctext

    def __setattr__(self, name, value):
        if name != "text":
            self.__dict__[name] = value
            return
        # don't see any need for this, so make it read-only
        raise AttributeError("text attribute is read-only")

    def __str__(self):
        shown = (self.id, self.text)
        return "<Label(id=%r, text=%r)>" % shown
1161 def _get_label(attrs):
1162 text = attrs.get("__label")
1163 if text is not None:
1164 return Label(text)
1165 else:
1166 return None
class Control:
    """An HTML form control.

    An HTMLForm contains a sequence of Controls.  The Controls in an HTMLForm
    are accessed using the HTMLForm.find_control method or the
    HTMLForm.controls attribute.

    Control instances are usually constructed using the ParseFile /
    ParseResponse functions.  If you use those functions, you can ignore the
    rest of this paragraph.  A Control is only properly initialised after the
    fixup method has been called.  In fact, this is only strictly necessary
    for ListControl instances, because a ListControl is assembled from
    single-item ListControls and its initial value(s) can only be known once
    the whole sequence has been seen.

    The types and values that are acceptable for assignment to the value
    attribute are defined by subclasses.

    If the disabled attribute is true, this represents the state typically
    represented by browsers by 'greying out' a control.  A disabled Control
    raises AttributeError if an attempt is made to change its value, and it
    is not 'successful' as defined by the W3C HTML 4 standard -- ie. it
    contributes no data to the return value of the HTMLForm.click* methods.
    To enable a control, set the disabled attribute to a false value.

    If the readonly attribute is true, the Control will raise AttributeError
    if an attempt is made to change its value.  To make a control writable,
    set the readonly attribute to a false value.

    All controls have the disabled and readonly attributes, not only those
    that may have the HTML attributes of the same names.

    On assignment to the value attribute, the following exceptions are
    raised: TypeError, AttributeError (if the value attribute should not be
    assigned to, because the control is disabled, for example) and
    ValueError.

    If the name or value attributes are None, or the value is an empty list,
    or if the control is disabled, the control is not successful.

    Public attributes:

    type: string describing type of control (see the keys of the
     HTMLForm.type2class dictionary for the allowable values) (readonly)
    name: name of control (readonly)
    value: current value of control (subclasses may allow a single value, a
     sequence of values, or either)
    disabled: disabled state
    readonly: readonly state
    id: value of id HTML attribute

    """

    def __init__(self, type, name, attrs, index=None):
        """
        type: string describing type of control (see the keys of the
         HTMLForm.type2class dictionary for the allowable values)
        name: control name
        attrs: HTML attributes of control's HTML element

        """
        raise NotImplementedError()

    def add_to_form(self, form):
        """Remember form as the owner and append self to form.controls."""
        self._form = form
        form.controls.append(self)

    def fixup(self):
        """Hook run once all controls are known; nothing to do by default."""
        pass

    def is_of_kind(self, kind):
        raise NotImplementedError()

    def clear(self):
        raise NotImplementedError()

    def __getattr__(self, name):
        raise NotImplementedError()

    def __setattr__(self, name, value):
        raise NotImplementedError()

    def pairs(self):
        """Return list of (key, value) pairs suitable for passing to urlencode.
        """
        result = []
        for index, key, value in self._totally_ordered_pairs():
            result.append((key, value))
        return result

    def _totally_ordered_pairs(self):
        """Return list of (key, value, index) tuples.

        Like pairs, but allows preserving correct ordering even where several
        controls are involved.

        """
        raise NotImplementedError()

    def _write_mime_data(self, mw, name, value):
        """Write data for a subitem of this control to a MimeWriter."""
        # called by HTMLForm
        part = mw.nextpart()
        disposition = 'form-data; name="%s"' % name
        part.addheader("Content-disposition", disposition, 1)
        body = part.startbody(prefix=0)
        body.write(value)

    def __str__(self):
        raise NotImplementedError()

    def get_labels(self):
        """Return all labels (Label instances) for this control.

        If the control was surrounded by a <label> tag, that will be the first
        label; all other labels, connected by 'for' and 'id', are in the order
        that appear in the HTML.

        """
        found = []
        own_label = self._label
        if own_label:
            found.append(own_label)
        own_id = self.id
        if own_id:
            found.extend(self._form._id_to_labels.get(own_id, ()))
        return found
1288 #---------------------------------------------------
class ScalarControl(Control):
    """Control whose value is not restricted to one of a prescribed set.

    Some ScalarControls don't accept any value attribute.  Otherwise, takes a
    single value, which must be string-like.

    Additional read-only public attribute:

    attrs: dictionary mapping the names of original HTML attributes of the
     control to their values

    """

    def __init__(self, type, name, attrs, index=None):
        self._index = index
        self._label = _get_label(attrs)
        # type and name are read-only through __setattr__, so they are
        # written straight into the instance dictionary.
        self.__dict__["type"] = type.lower()
        self.__dict__["name"] = name
        self._value = attrs.get("value")
        self.disabled = attrs.has_key("disabled")
        self.readonly = attrs.has_key("readonly")
        self.id = attrs.get("id")

        # keep a private copy of the HTML attributes
        self.attrs = attrs.copy()

        self._clicked = False

        self._urlparse = urlparse.urlparse
        self._urlunparse = urlparse.urlunparse

    def __getattr__(self, name):
        # Only called when normal lookup fails; "value" lives under _value.
        if name != "value":
            raise AttributeError("%s instance has no attribute '%s'" %
                                 (self.__class__.__name__, name))
        return self.__dict__["_value"]

    def __setattr__(self, name, value):
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        if name != "value":
            self.__dict__[name] = value
            return
        # Assigning the value: type check first, then readonly, then disabled.
        if not isstringlike(value):
            raise TypeError("must assign a string")
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        if self.disabled:
            raise AttributeError("control '%s' is disabled" % self.name)
        self.__dict__["_value"] = value

    def _totally_ordered_pairs(self):
        """Return [(index, name, value)], or [] when nameless, valueless or
        disabled."""
        name = self.name
        value = self.value
        unsuccessful = name is None or value is None or self.disabled
        if unsuccessful:
            return []
        return [(self._index, name, value)]

    def clear(self):
        """Set the value to None; raises AttributeError if readonly."""
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self.__dict__["_value"] = None

    def __str__(self):
        name = self.name
        if name is None:
            name = "<None>"
        value = self.value
        if value is None:
            value = "<None>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        info = ", ".join(flags)
        if info:
            info = " (%s)" % info

        return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
1366 #---------------------------------------------------
class TextControl(ScalarControl):
    """Textual input control.

    Covers:

    INPUT/TEXT
    INPUT/PASSWORD
    INPUT/HIDDEN
    TEXTAREA

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # hidden fields start out read-only
        if self.type == "hidden":
            self.readonly = True
        # a missing value attribute is treated as the empty string
        if self._value is None:
            self._value = ""

    def is_of_kind(self, kind):
        return kind == "text"
1386 #---------------------------------------------------
class FileControl(ScalarControl):
    """File upload with INPUT TYPE=FILE.

    The value attribute of a FileControl is always None.  Use add_file instead.

    Additional public method: add_file

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        self._value = None
        # list of (file_object, content_type, filename) tuples
        self._upload_data = []

    def is_of_kind(self, kind):
        return kind == "file"

    def clear(self):
        """Forget all added files; raises AttributeError if readonly."""
        if self.readonly:
            raise AttributeError("control '%s' is readonly" % self.name)
        self._upload_data = []

    def __setattr__(self, name, value):
        if name not in ("value", "name", "type"):
            self.__dict__[name] = value
            return
        raise AttributeError("%s attribute is readonly" % name)

    def add_file(self, file_object, content_type=None, filename=None):
        """Queue a file for upload.

        file_object: object with a read method
        content_type: None (meaning "application/octet-stream") or string-like
        filename: None or string-like

        Raises TypeError if any argument is of the wrong kind.

        """
        if not hasattr(file_object, "read"):
            raise TypeError("file-like object must have read method")
        if content_type is not None and not isstringlike(content_type):
            raise TypeError("content type must be None or string-like")
        if filename is not None and not isstringlike(filename):
            raise TypeError("filename must be None or string-like")
        if content_type is None:
            content_type = "application/octet-stream"
        entry = (file_object, content_type, filename)
        self._upload_data.append(entry)

    def _totally_ordered_pairs(self):
        # XXX should it be successful even if unnamed?
        if self.name is None or self.disabled:
            return []
        return [(self._index, self.name, "")]

    def _write_mime_data(self, mw, _name, _value):
        """Write the queued file(s) to the MimeWriter mw.

        One file is written as a single part; several files are written as a
        nested "mixed" multipart body; with no files nothing is written.

        """
        # called by HTMLForm
        # assert _name == self.name and _value == ''
        uploads = self._upload_data
        count = len(uploads)
        if count == 0:
            return
        if count == 1:
            # single file
            file_object, content_type, filename = uploads[0]
            part = mw.nextpart()
            if filename:
                fn_part = '; filename="%s"' % filename
            else:
                fn_part = ""
            disp = 'form-data; name="%s"%s' % (self.name, fn_part)
            part.addheader("Content-disposition", disp, prefix=1)
            body = part.startbody(content_type, prefix=0)
            body.write(file_object.read())
            return
        # multiple files
        outer = mw.nextpart()
        disp = 'form-data; name="%s"' % self.name
        outer.addheader("Content-disposition", disp, prefix=1)
        outer.startmultipartbody("mixed", prefix=0)
        for file_object, content_type, filename in uploads:
            inner = outer.nextpart()
            if filename:
                fn_part = '; filename="%s"' % filename
            else:
                fn_part = ""
            disp = "file%s" % fn_part
            inner.addheader("Content-disposition", disp, prefix=1)
            body = inner.startbody(content_type, prefix=0)
            body.write(file_object.read())
        outer.lastpart()

    def __str__(self):
        name = self.name
        if name is None:
            name = "<None>"

        if self._upload_data:
            shown = []
            for file_object, ctype, filename in self._upload_data:
                if filename is None:
                    filename = "<Unnamed file>"
                shown.append(filename)
            value = ", ".join(shown)
        else:
            value = "<No files added>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        info = ", ".join(flags)
        if info:
            info = " (%s)" % info

        return "<%s(%s=%s)%s>" % (self.__class__.__name__, name, value, info)
1482 #---------------------------------------------------
class IsindexControl(ScalarControl):
    """ISINDEX control.

    ISINDEX is the odd-one-out of HTML form controls.  In fact, it isn't
    really part of regular HTML forms at all, and predates it.  You're only
    allowed one ISINDEX per HTML document.  ISINDEX and regular form
    submission are mutually exclusive -- either submit a form, or the ISINDEX.

    Having said this, since ISINDEX controls may appear in forms (which is
    probably bad HTML), ParseFile / ParseResponse will include them in the
    HTMLForm instances it returns.  You can set the ISINDEX's value, as with
    any other control (but note that ISINDEX controls have no name, so you'll
    need to use the type argument of set_value!).  When you submit the form,
    the ISINDEX will not be successful (ie., no data will get returned to the
    server as a result of its presence), unless you click on the ISINDEX
    control, in which case the ISINDEX gets submitted instead of the form:

    form.set_value("my isindex value", type="isindex")
    urllib2.urlopen(form.click(type="isindex"))

    ISINDEX elements outside of FORMs are ignored.  If you want to submit one
    by hand, do it like so:

    url = urlparse.urljoin(page_uri, "?"+urllib.quote_plus("my isindex value"))
    result = urllib2.urlopen(url)

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # a missing value attribute is treated as the empty string
        if self._value is None:
            self._value = ""

    def is_of_kind(self, kind):
        return kind in ["text", "clickable"]

    def _totally_ordered_pairs(self):
        # never contributes data to an ordinary form submission
        return []

    def _click(self, form, coord, return_type, request_class=urllib2.Request):
        """Build the ISINDEX submission for form.

        Returns [] for return_type "pairs", the tuple (url, None, []) for
        "request_data", and request_class(url) for anything else.

        """
        # Relative URL for ISINDEX submission: instead of "foo=bar+baz",
        # want "bar+baz".
        # This doesn't seem to be specified in HTML 4.01 spec. (ISINDEX is
        # deprecated in 4.01, but it should still say how to submit it).
        # Submission of ISINDEX is explained in the HTML 3.2 spec, though.
        parsed = self._urlparse(form.action)
        # replace the last two URL components with the quoted value and None
        head, (old_query, old_frag) = parsed[:-2], parsed[-2:]
        replaced = head + (urllib.quote_plus(self.value), None)
        url = self._urlunparse(replaced)

        if return_type == "pairs":
            return []
        if return_type == "request_data":
            return url, None, []
        return request_class(url)

    def __str__(self):
        value = self.value
        if value is None:
            value = "<None>"

        flags = []
        if self.disabled:
            flags.append("disabled")
        if self.readonly:
            flags.append("readonly")
        info = ", ".join(flags)
        if info:
            info = " (%s)" % info

        return "<%s(%s)%s>" % (self.__class__.__name__, value, info)
1552 #---------------------------------------------------
class IgnoreControl(ScalarControl):
    """Control that we're not interested in.

    Covers:

    INPUT/RESET
    BUTTON/RESET
    INPUT/BUTTON
    BUTTON/BUTTON

    These controls are always unsuccessful, in the terminology of HTML 4 (ie.
    they never require any information to be returned to the server).

    BUTTON/BUTTON is used to generate events for script embedded in HTML.

    The value attribute of IgnoreControl is always None.

    """

    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # discard whatever value the HTML supplied
        self._value = None

    def is_of_kind(self, kind):
        return False

    def __setattr__(self, name, value):
        if name in ("name", "type"):
            raise AttributeError("%s attribute is readonly" % name)
        if name == "value":
            raise AttributeError(
                "control '%s' is ignored, hence read-only" % self.name)
        self.__dict__[name] = value
1587 #---------------------------------------------------
1588 # ListControls
1590 # helpers and subsidiary classes
class Item:
    """One selectable entry of a list control.

    Public attributes:

    name: the item's HTML "value" attribute
    attrs: the HTML attribute mapping the item was built from
    id: value of the id HTML attribute (None if absent)
    disabled: boolean; the only attribute besides selected that may be
     assigned
    selected: reading gives the stored selection state; assigning is
     delegated to the owning control's _set_selected_state method

    """

    def __init__(self, control, attrs, index=None):
        label = _get_label(attrs)
        if label:
            own_labels = [label]
        else:
            own_labels = []
        # __setattr__ rejects most names, so fill the instance dict directly.
        state = self.__dict__
        state["name"] = attrs["value"]
        state["_labels"] = own_labels
        state["attrs"] = attrs
        state["_control"] = control
        state["disabled"] = attrs.has_key("disabled")
        state["_selected"] = False
        state["id"] = attrs.get("id")
        state["_index"] = index
        control.items.append(self)

    def get_labels(self):
        """Return all labels (Label instances) for this item.

        For items that represent radio buttons or checkboxes, if the item was
        surrounded by a <label> tag, that will be the first label; all other
        labels, connected by 'for' and 'id', are in the order that appear in
        the HTML.

        For items that represent select options, if the option had a label
        attribute, that will be the first label.  If the option has contents
        (text within the option tags) and it is not the same as the label
        attribute (if any), that will be a label.  There is nothing in the
        spec to my knowledge that makes an option with an id unable to be the
        target of a label's for attribute, so those are included, if any, for
        the sake of consistency and completeness.

        """
        found = list(self._labels)
        if self.id:
            by_id = self._control._form._id_to_labels
            found.extend(by_id.get(self.id, ()))
        return found

    def __getattr__(self, name):
        if name != "selected":
            raise AttributeError(name)
        return self._selected

    def __setattr__(self, name, value):
        if name == "disabled":
            self.__dict__["disabled"] = bool(value)
        elif name == "selected":
            self._control._set_selected_state(self, value)
        else:
            raise AttributeError(name)

    def __str__(self):
        text = self.name
        if self.selected:
            text = "*" + text
        if self.disabled:
            text = "(%s)" % text
        return text

    def __repr__(self):
        # XXX appending the attrs without distinguishing them from name and id
        # is silly
        pairs = [("name", self.name), ("id", self.id)] + self.attrs.items()
        rendered = []
        for key, value in pairs:
            rendered.append("%s=%r" % (key, value))
        return "<%s %s>" % (self.__class__.__name__, " ".join(rendered))
def disambiguate(items, nr, **kwds):
    """Pick one entry out of the matches in items.

    items: sequence of candidates
    nr: index of the wanted candidate, or None to demand a unique match
    kwds: the search criteria; only used to build the error message

    Raises ItemNotFoundError if items is empty or has no entry at index nr,
    and AmbiguityError if nr is None and there is more than one candidate.

    """
    msg = " ".join(["%s=%r" % (key, value) for key, value in kwds.items()])
    if not items:
        raise ItemNotFoundError(msg)
    if nr is None:
        # no index given: the match has to be unique
        if len(items) > 1:
            raise AmbiguityError(msg)
        nr = 0
    if nr >= len(items):
        raise ItemNotFoundError(msg)
    return items[nr]
1675 class ListControl(Control):
1676 """Control representing a sequence of items.
1678 The value attribute of a ListControl represents the successful list items
1679 in the control. The successful list items are those that are selected and
1680 not disabled.
1682 ListControl implements both list controls that take a length-1 value
1683 (single-selection) and those that take length >1 values
1684 (multiple-selection).
1686 ListControls accept sequence values only. Some controls only accept
1687 sequences of length 0 or 1 (RADIO, and single-selection SELECT).
1688 In those cases, ItemCountError is raised if len(sequence) > 1. CHECKBOXes
1689 and multiple-selection SELECTs (those having the "multiple" HTML attribute)
1690 accept sequences of any length.
1692 Note the following mistake:
1694 control.value = some_value
1695 assert control.value == some_value # not necessarily true
1697 The reason for this is that the value attribute always gives the list items
1698 in the order they were listed in the HTML.
1700 ListControl items can also be referred to by their labels instead of names.
1701 Use the label argument to .get(), and the .set_value_by_label(),
1702 .get_value_by_label() methods.
1704 Note that, rather confusingly, though SELECT controls are represented in
1705 HTML by SELECT elements (which contain OPTION elements, representing
1706 individual list items), CHECKBOXes and RADIOs are not represented by *any*
1707 element. Instead, those controls are represented by a collection of INPUT
1708 elements. For example, this is a SELECT control, named "control1":
1710 <select name="control1">
1711 <option>foo</option>
1712 <option value="1">bar</option>
1713 </select>
1715 and this is a CHECKBOX control, named "control2":
1717 <input type="checkbox" name="control2" value="foo" id="cbe1">
1718 <input type="checkbox" name="control2" value="bar" id="cbe2">
1720 The id attribute of a CHECKBOX or RADIO ListControl is always that of its
1721 first element (for example, "cbe1" above).
1724 Additional read-only public attribute: multiple.
1728 # ListControls are built up by the parser from their component items by
1729 # creating one ListControl per item, consolidating them into a single
1730 # master ListControl held by the HTMLForm:
1732 # -User calls form.new_control(...)
1733 # -Form creates Control, and calls control.add_to_form(self).
1734 # -Control looks for a Control with the same name and type in the form,
1735 # and if it finds one, merges itself with that control by calling
1736 # control.merge_control(self). The first Control added to the form, of
1737 # a particular name and type, is the only one that survives in the
1738 # form.
1739 # -Form calls control.fixup for all its controls. ListControls in the
1740 # form know they can now safely pick their default values.
1742 # To create a ListControl without an HTMLForm, use:
1744 # control.merge_control(new_control)
1746 # (actually, it's much easier just to use ParseFile)
1748 _label = None
1750 def __init__(self, type, name, attrs={}, select_default=False,
1751 called_as_base_class=False, index=None):
1753 select_default: for RADIO and multiple-selection SELECT controls, pick
1754 the first item as the default if no 'selected' HTML attribute is
1755 present
1758 if not called_as_base_class:
1759 raise NotImplementedError()
1761 self.__dict__["type"] = type.lower()
1762 self.__dict__["name"] = name
1763 self._value = attrs.get("value")
1764 self.disabled = False
1765 self.readonly = False
1766 self.id = attrs.get("id")
1767 self._closed = False
1769 # As Controls are merged in with .merge_control(), self.attrs will
1770 # refer to each Control in turn -- always the most recently merged
1771 # control. Each merged-in Control instance corresponds to a single
1772 # list item: see ListControl.__doc__.
1773 self.items = []
1774 self._form = None
1776 self._select_default = select_default
1777 self._clicked = False
def clear(self):
    """Assign an empty list to the value attribute."""
    nothing_selected = []
    self.value = nothing_selected
def is_of_kind(self, kind):
    """Return whether this control is of the given kind.

    "list" always matches, "multilist" matches when the multiple attribute
    is true, "singlelist" when it is false; nothing else matches.

    """
    if kind == "list":
        return True
    if kind == "multilist":
        return bool(self.multiple)
    if kind == "singlelist":
        return not self.multiple
    return False
def get_items(self, name=None, label=None, id=None,
              exclude_disabled=False):
    """Return matching items by name or label.

    Every criterion that is not None must match.  A label matches by
    equality when the form's backwards_compat flag is true and by substring
    otherwise.  Items are returned in control order.

    Raises TypeError if name, label or id is given but is not string-like.

    For argument docs, see the docstring for .get()

    """
    if name is not None and not isstringlike(name):
        raise TypeError("item name must be string-like")
    if label is not None and not isstringlike(label):
        raise TypeError("item label must be string-like")
    if id is not None and not isstringlike(id):
        raise TypeError("item id must be string-like")
    matches = []  # order is important
    compat = self._form.backwards_compat
    for item in self.items:
        if exclude_disabled and item.disabled:
            continue
        if name is not None and item.name != name:
            continue
        if label is not None:
            label_matched = False
            for candidate in item.get_labels():
                if compat:
                    hit = candidate.text == label
                else:
                    hit = candidate.text.find(label) > -1
                if hit:
                    label_matched = True
                    break
            if not label_matched:
                continue
        if id is not None and item.id != id:
            continue
        matches.append(item)
    return matches
def get(self, name=None, label=None, id=None, nr=None,
        exclude_disabled=False):
    """Return item by name or label, disambiguating if necessary with nr.

    All arguments must be passed by name, with the exception of 'name',
    which may be used as a positional argument.

    If name is specified, then the item must have the indicated name.

    If label is specified, then the item must have a label whose
    whitespace-compressed, stripped, text substring-matches the indicated
    label string (eg. label="please choose" will match
    " Do please choose an item ").

    If id is specified, then the item must have the indicated id.

    nr is an optional 0-based index of the items matching the query.

    If nr is the default None value and more than one item is found, raises
    AmbiguityError (unless the HTMLForm instance's backwards_compat
    attribute is true).

    If no item is found, or if items are found but nr is specified and not
    found, raises ItemNotFoundError.

    Optionally excludes disabled items.

    """
    if nr is None:
        if self._form.backwards_compat:
            # old behaviour: silently take the first match  :-/
            nr = 0
    candidates = self.get_items(name, label, id, exclude_disabled)
    return disambiguate(candidates, nr, name=name, label=label, id=id)
1857 def _get(self, name, by_label=False, nr=None, exclude_disabled=False):
1858 # strictly for use by deprecated methods
1859 if by_label:
1860 name, label = None, name
1861 else:
1862 name, label = name, None
1863 return self.get(name, label, nr, exclude_disabled)
def toggle(self, name, by_label=False, nr=None):
    """Deprecated: given a name or label and optional disambiguating index
    nr, toggle the matching item's selection.

    Selecting items follows the behavior described in the docstring of the
    'get' method.

    if the item is disabled, or this control is disabled or readonly,
    raise AttributeError.

    """
    deprecation(
        "item = control.get(...); item.selected = not item.selected")
    item = self._get(name, by_label, nr)
    flipped = not item.selected
    self._set_selected_state(item, flipped)
def set(self, selected, name, by_label=False, nr=None):
    """Deprecated: given a name or label and optional disambiguating index
    nr, set the matching item's selection to the bool value of selected.

    Selecting items follows the behavior described in the docstring of the
    'get' method.

    if the item is disabled, or this control is disabled or readonly,
    raise AttributeError.

    """
    deprecation(
        "control.get(...).selected = <boolean>")
    item = self._get(name, by_label, nr)
    self._set_selected_state(item, selected)
1896 def _set_selected_state(self, item, action):
1897 # action:
1898 # bool False: off
1899 # bool True: on
1900 if self.disabled:
1901 raise AttributeError("control '%s' is disabled" % self.name)
1902 if self.readonly:
1903 raise AttributeError("control '%s' is readonly" % self.name)
1904 action == bool(action)
1905 compat = self._form.backwards_compat
1906 if not compat and item.disabled:
1907 raise AttributeError("item is disabled")
1908 else:
1909 if compat and item.disabled and action:
1910 raise AttributeError("item is disabled")
1911 if self.multiple:
1912 item.__dict__["_selected"] = action
1913 else:
1914 if not action:
1915 item.__dict__["_selected"] = False
1916 else:
1917 for o in self.items:
1918 o.__dict__["_selected"] = False
1919 item.__dict__["_selected"] = True
def toggle_single(self, by_label=None):
    """Deprecated: toggle the selection of the single item in this control.

    Raises ItemCountError if the control does not contain only one item.

    by_label argument is ignored, and included only for backwards
    compatibility.

    """
    deprecation(
        "control.items[0].selected = not control.items[0].selected")
    if len(self.items) != 1:
        raise ItemCountError(
            "'%s' is not a single-item control" % self.name)
    only_item = self.items[0]
    flipped = not only_item.selected
    self._set_selected_state(only_item, flipped)
def set_single(self, selected, by_label=None):
    """Deprecated: set the selection of the single item in this control.

    Raises ItemCountError if the control does not contain only one item.

    by_label argument is ignored, and included only for backwards
    compatibility.

    """
    deprecation(
        "control.items[0].selected = <boolean>")
    if len(self.items) != 1:
        raise ItemCountError(
            "'%s' is not a single-item control" % self.name)
    only_item = self.items[0]
    self._set_selected_state(only_item, selected)
def get_item_disabled(self, name, by_label=False, nr=None):
    """Get disabled state of named list item in a ListControl."""
    deprecation(
        "control.get(...).disabled")
    item = self._get(name, by_label, nr)
    return item.disabled
def set_item_disabled(self, disabled, name, by_label=False, nr=None):
    """Set disabled state of named list item in a ListControl.

    disabled: boolean disabled state

    """
    deprecation(
        "control.get(...).disabled = <boolean>")
    item = self._get(name, by_label, nr)
    item.disabled = disabled
def set_all_items_disabled(self, disabled):
    """Set disabled state of all list items in a ListControl.

    disabled: boolean disabled state

    """
    for item in self.items:
        setattr(item, "disabled", disabled)
def get_item_attrs(self, name, by_label=False, nr=None):
    """Return dictionary of HTML attributes for a single ListControl item.

    The HTML element types that describe list items are: OPTION for SELECT
    controls, INPUT for the rest.  These elements have HTML attributes that
    you may occasionally want to know about -- for example, the "alt" HTML
    attribute gives a text string describing the item (graphical browsers
    usually display this as a tooltip).

    The returned dictionary maps HTML attribute names to values.  The names
    and values are taken from the original HTML.

    """
    deprecation(
        "control.get(...).attrs")
    item = self._get(name, by_label, nr)
    return item.attrs
def close_control(self):
    """Mark this control as closed.

    add_to_form does not merge a new control into a closed one, even when
    name and type match; the new control is added separately instead.

    """
    setattr(self, "_closed", True)
def add_to_form(self, form):
    """Attach this control to form, merging into an open sibling if any.

    A nameless control is always added as a separate control.  Otherwise
    the form's controls are searched from the most recently added
    backwards for one with the same name and type.  If that control is
    still open this control's items are merged into it; if it is closed,
    or no such control exists, this control is added on its own.

    """
    assert self._form is None or form == self._form, (
        "can't add control to more than one form")
    self._form = form

    merge_target = None
    if self.name is not None:
        position = len(form.controls)
        while position > 0:
            position = position - 1
            candidate = form.controls[position]
            if candidate.name == self.name and candidate.type == self.type:
                # only the nearest match is considered
                if not candidate._closed:
                    merge_target = candidate
                break

    if merge_target is None:
        Control.add_to_form(self, form)
    else:
        merge_target.merge_control(self)
def merge_control(self, control):
    """Append the items of control to this control's items.

    Both controls must agree on whether they are multiple-selection.

    """
    assert bool(self.multiple) == bool(control.multiple)
    # usually, isinstance(control, self.__class__)
    incoming = control.items
    self.items.extend(incoming)
def fixup(self):
    """Point every item back at this control once merging is finished.

    ListControls are built up from component list items (which are also
    ListControls) during parsing.  This method should be called after all
    items have been added.  See ListControl.__doc__ for the reason this is
    required.

    """
    # Need to set default selection where no item was indicated as being
    # selected by the HTML:

    # CHECKBOX:
    #  Nothing should be selected.
    # SELECT/single, SELECT/multiple and RADIO:
    #  RFC 1866 (HTML 2.0): says first item should be selected.
    #  W3C HTML 4.01 Specification: says that client behaviour is
    #   undefined in this case.  For RADIO, exactly one must be selected,
    #   though which one is undefined.
    #  Both Netscape and Microsoft Internet Explorer (IE) choose first
    #   item for SELECT/single.  However, both IE5 and Mozilla (both 1.0
    #   and Firebird 0.6) leave all items unselected for RADIO and
    #   SELECT/multiple.

    # Since both Netscape and IE all choose the first item for
    # SELECT/single, we do the same.  OTOH, both Netscape and IE
    # leave SELECT/multiple with nothing selected, in violation of RFC 1866
    # (but not in violation of the W3C HTML 4 standard); the same is true
    # of RADIO (which *is* in violation of the HTML 4 standard).  We follow
    # RFC 1866 if the _select_default attribute is set, and Netscape and IE
    # otherwise.  RFC 1866 and HTML 4 are always violated insofar as you
    # can deselect all items in a RadioControl.

    # The defaulting itself is done by the subclasses' fixup methods; here
    # we only set items' controls to self, now that we've merged.
    for item in self.items:
        item.__dict__["_control"] = self
2059 def __getattr__(self, name):
2060 if name == "value":
2061 compat = self._form.backwards_compat
2062 if self.name is None:
2063 return []
2064 return [o.name for o in self.items if o.selected and
2065 (not o.disabled or compat)]
2066 else:
2067 raise AttributeError("%s instance has no attribute '%s'" %
2068 (self.__class__.__name__, name))
2070 def __setattr__(self, name, value):
2071 if name == "value":
2072 if self.disabled:
2073 raise AttributeError("control '%s' is disabled" % self.name)
2074 if self.readonly:
2075 raise AttributeError("control '%s' is readonly" % self.name)
2076 self._set_value(value)
2077 elif name in ("name", "type", "multiple"):
2078 raise AttributeError("%s attribute is readonly" % name)
2079 else:
2080 self.__dict__[name] = value
def _set_value(self, value):
    """Set the whole selection from a sequence of item names.

    An empty sequence deselects every item (disabled items are skipped
    unless the form's backwards_compat flag is set).  Otherwise the work
    is handed to _multiple_set_value or _single_set_value.

    Raises TypeError if value is None or string-like, and ItemCountError
    if more than one name is given for a single-selection control.

    """
    if value is None or isstringlike(value):
        raise TypeError("ListControl, must set a sequence")
    if not value:
        compat = self._form.backwards_compat
        for item in self.items:
            if not item.disabled or compat:
                item.selected = False
        return
    if self.multiple:
        self._multiple_set_value(value)
        return
    if len(value) > 1:
        raise ItemCountError(
            "single selection list, must set sequence of "
            "length 0 or 1")
    self._single_set_value(value)
2099 def _get_items(self, name, target=1):
2100 all_items = self.get_items(name)
2101 items = [o for o in all_items if not o.disabled]
2102 if len(items) < target:
2103 if len(all_items) < target:
2104 raise ItemNotFoundError(
2105 "insufficient items with name %r" % name)
2106 else:
2107 raise AttributeError(
2108 "insufficient non-disabled items with name %s" % name)
2109 on = []
2110 off = []
2111 for o in items:
2112 if o.selected:
2113 on.append(o)
2114 else:
2115 off.append(o)
2116 return on, off
2118 def _single_set_value(self, value):
2119 assert len(value) == 1
2120 on, off = self._get_items(value[0])
2121 assert len(on) <= 1
2122 if not on:
2123 off[0].selected = True
2125 def _multiple_set_value(self, value):
2126 compat = self._form.backwards_compat
2127 turn_on = [] # transactional-ish
2128 turn_off = [item for item in self.items if
2129 item.selected and (not item.disabled or compat)]
2130 names = {}
2131 for nn in value:
2132 if nn in names.keys():
2133 names[nn] += 1
2134 else:
2135 names[nn] = 1
2136 for name, count in names.items():
2137 on, off = self._get_items(name, count)
2138 for i in range(count):
2139 if on:
2140 item = on[0]
2141 del on[0]
2142 del turn_off[turn_off.index(item)]
2143 else:
2144 item = off[0]
2145 del off[0]
2146 turn_on.append(item)
2147 for item in turn_off:
2148 item.selected = False
2149 for item in turn_on:
2150 item.selected = True
def set_value_by_label(self, value):
    """Set the value of control by item labels.

    value is expected to be an iterable of strings that are substrings of
    the item labels that should be selected.  Before substring matching is
    performed, the original label text is whitespace-compressed
    (consecutive whitespace characters are converted to a single space
    character) and leading and trailing whitespace is stripped.  Ambiguous
    labels are accepted without complaint if the form's backwards_compat is
    True; otherwise, it will not complain as long as all ambiguous labels
    share the same item name (e.g. OPTION value).

    Raises TypeError for a string-like value, ItemCountError if several
    labels are given for a single-selection control, AmbiguityError for
    an ambiguous label, and ItemNotFoundError if no unused item matches.

    """
    if isstringlike(value):
        raise TypeError(value)
    if not self.multiple and len(value) > 1:
        raise ItemCountError(
            "single selection list, must set sequence of "
            "length 0 or 1")
    chosen = []
    for label_text in value:
        candidates = self.get_items(label=label_text)
        if len(candidates) > 1:
            if self._form.backwards_compat:
                # OK, we'll guess :-(  Assume first available item.
                candidates = candidates[:1]
            else:
                # ambiguous labels are fine as long as item names (e.g.
                # OPTION values) are same
                first_name = candidates[0].name
                for other in candidates[1:]:
                    if other.name != first_name:
                        raise AmbiguityError(label_text)
        for candidate in candidates:
            # For the multiple-item case, we could try to be smarter,
            # saving them up and trying to resolve, but that's too much.
            if self._form.backwards_compat or candidate not in chosen:
                chosen.append(candidate)
                break
        else:  # all of them are used
            raise ItemNotFoundError(label_text)
    # now we have all the items that should be on
    # let's just turn everything off and then back on.
    self.value = []
    for item in chosen:
        item.selected = True
def get_value_by_label(self):
    """Return the value of the control as given by normalized labels.

    For each selected item (disabled items are skipped unless the form's
    backwards_compat flag is set) the text of its first label with
    non-empty text is reported, or None if it has no such label.

    """
    texts = []
    compat = self._form.backwards_compat
    for item in self.items:
        if not ((not item.disabled or compat) and item.selected):
            continue
        first_text = None
        for label in item.get_labels():
            if label.text:
                first_text = label.text
                break
        texts.append(first_text)
    return texts
def possible_items(self, by_label=False):
    """Deprecated: return the names or labels of all possible items.

    Includes disabled items, which may be misleading for some use cases.

    by_label: if true, return for each item the text of its first label
      with non-empty text (None if there is none) instead of its name

    """
    deprecation(
        "[item.name for item in self.items]")
    if not by_label:
        return [item.name for item in self.items]
    texts = []
    for item in self.items:
        first_text = None
        for label in item.get_labels():
            if label.text:
                first_text = label.text
                break
        texts.append(first_text)
    return texts
2232 def _totally_ordered_pairs(self):
2233 if self.disabled or self.name is None:
2234 return []
2235 else:
2236 return [(o._index, self.name, o.name) for o in self.items
2237 if o.selected and not o.disabled]
2239 def __str__(self):
2240 name = self.name
2241 if name is None: name = "<None>"
2243 display = [str(o) for o in self.items]
2245 infos = []
2246 if self.disabled: infos.append("disabled")
2247 if self.readonly: infos.append("readonly")
2248 info = ", ".join(infos)
2249 if info: info = " (%s)" % info
2251 return "<%s(%s=[%s])%s>" % (self.__class__.__name__,
2252 name, ", ".join(display), info)
class RadioControl(ListControl):
    """
    Covers:

    INPUT/RADIO

    A single-selection list control.  Each INPUT element is parsed into a
    RadioControl holding one item.

    """
    def __init__(self, type, name, attrs, select_default=False, index=None):
        # an INPUT without a value attribute is given the value "on"
        attrs.setdefault("value", "on")
        ListControl.__init__(self, type, name, attrs, select_default,
                             called_as_base_class=True, index=index)
        self.__dict__["multiple"] = False
        item = Item(self, attrs, index)
        # written straight into the instance dict rather than through the
        # item's `selected` attribute
        item.__dict__["_selected"] = attrs.has_key("checked")

    def fixup(self):
        """Normalise the selection once all items have been added.

        If several enabled items are selected only the last one stays
        selected.  If none is, the first enabled item is selected when
        _select_default is set.

        """
        ListControl.fixup(self)
        active = [item for item in self.items
                  if item.selected and not item.disabled]
        if active:
            # Ensure only one item selected.  Choose the last one,
            # following IE and Firefox.
            for item in active[:-1]:
                item.selected = False
            return
        if not self._select_default:
            return
        for item in self.items:
            if not item.disabled:
                item.selected = True
                break

    def get_labels(self):
        """Return [] -- the control as a whole has no labels."""
        return []
class CheckboxControl(ListControl):
    """
    Covers:

    INPUT/CHECKBOX

    A multiple-selection list control.  Each INPUT element is parsed into
    a CheckboxControl holding one item.

    """
    def __init__(self, type, name, attrs, select_default=False, index=None):
        # an INPUT without a value attribute is given the value "on"
        attrs.setdefault("value", "on")
        ListControl.__init__(self, type, name, attrs, select_default,
                             called_as_base_class=True, index=index)
        self.__dict__["multiple"] = True
        item = Item(self, attrs, index)
        # written straight into the instance dict rather than through the
        # item's `selected` attribute
        item.__dict__["_selected"] = attrs.has_key("checked")

    def get_labels(self):
        """Return [] -- the control as a whole has no labels."""
        return []
class SelectControl(ListControl):
    """
    Covers:

    SELECT (and OPTION)


    OPTION 'values', in HTML parlance, are Item 'names' in ClientForm parlance.

    SELECT control values and labels are subject to some messy defaulting
    rules.  For example, if the HTML representation of the control is:

    <SELECT name=year>
      <OPTION value=0 label="2002">current year</OPTION>
      <OPTION value=1>2001</OPTION>
      <OPTION>2000</OPTION>
    </SELECT>

    The items, in order, have labels "2002", "2001" and "2000", whereas their
    names (the OPTION values) are "0", "1" and "2000" respectively.  Note that
    the value of the last OPTION in this example defaults to its contents, as
    specified by RFC 1866, as do the labels of the second and third OPTIONs.

    The OPTION labels are sometimes more meaningful than the OPTION values,
    which can make for more maintainable code.

    Additional read-only public attribute: attrs

    The attrs attribute is a dictionary of the original HTML attributes of the
    SELECT element.  Other ListControls do not have this attribute, because in
    other cases the control as a whole does not correspond to any single HTML
    element.  control.get(...).attrs may be used as usual to get at the HTML
    attributes of the HTML elements corresponding to individual list items (for
    SELECT controls, these are OPTION elements).

    Another special case is that the Item.attrs dictionaries have a special key
    "contents" which does not correspond to any real HTML attribute, but rather
    contains the contents of the OPTION element:

    <OPTION>this bit</OPTION>

    """
    # HTML attributes here are treated slightly differently from other list
    # controls:
    # -The SELECT HTML attributes dictionary is stuffed into the OPTION
    #  HTML attributes dictionary under the "__select" key.
    # -The content of each OPTION element is stored under the special
    #  "contents" key of the dictionary.
    # After all this, the dictionary is passed to the SelectControl constructor
    # as the attrs argument, as usual.  However:
    # -The first SelectControl constructed when building up a SELECT control
    #  has a constructor attrs argument containing only the __select key -- so
    #  this SelectControl represents an empty SELECT control.
    # -Subsequent SelectControls have both OPTION HTML-attribute in attrs and
    #  the __select dictionary containing the SELECT HTML-attributes.

    def __init__(self, type, name, attrs, select_default=False, index=None):
        """Build a SELECT control holding zero or one OPTION items.

        attrs: OPTION HTML attributes, plus the SELECT element's attributes
          under the "__select" key and the OPTION contents under "contents"
          (see the comment above); the caller's dictionary is not modified

        """
        # fish out the SELECT HTML attributes from the OPTION HTML attributes
        # dictionary
        self.attrs = attrs["__select"].copy()
        self.__dict__["_label"] = _get_label(self.attrs)
        self.__dict__["id"] = self.attrs.get("id")
        self.__dict__["multiple"] = self.attrs.has_key("multiple")
        # the majority of the contents, label, and value dance already happened
        contents = attrs.get("contents")
        attrs = attrs.copy()
        del attrs["__select"]

        ListControl.__init__(self, type, name, self.attrs, select_default,
                             called_as_base_class=True, index=index)
        self.disabled = self.attrs.has_key("disabled")
        self.readonly = self.attrs.has_key("readonly")
        if attrs.has_key("value"):
            # otherwise it is a marker 'select started' token
            o = Item(self, attrs, index)
            o.__dict__["_selected"] = attrs.has_key("selected")
            # add 'label' label and contents label, if different.  If both are
            # provided, the 'label' label is used for display in HTML
            # 4.0-compliant browsers (and any lower spec? not sure) while the
            # contents are used for display in older or less-compliant
            # browsers.  We make label objects for both, if the values are
            # different.
            label = attrs.get("label")
            if label:
                o._labels.append(Label({"__text": label}))
                if contents and contents != label:
                    o._labels.append(Label({"__text": contents}))
            elif contents:
                o._labels.append(Label({"__text": contents}))

    def fixup(self):
        """Apply the default selection once all OPTIONs have been merged.

        If nothing is selected, the first enabled item is selected for a
        single-selection control (or for any control when _select_default
        is set).  If a single-selection control has several items selected,
        only the last one stays selected.

        """
        ListControl.fixup(self)
        # Firefox doesn't exclude disabled items from those considered here
        # (i.e. from 'found', for both branches of the if below).  Note that
        # IE6 doesn't support the disabled attribute on OPTIONs at all.
        found = [o for o in self.items if o.selected]
        if not found:
            if not self.multiple or self._select_default:
                for o in self.items:
                    if not o.disabled:
                        # Lift the control's own disabled flag while the
                        # default is applied (presumably selecting an item
                        # of a disabled control is refused -- confirm
                        # against Item), then put it back on the control.
                        # It used to be written to the item instead, which
                        # left the control enabled for good.
                        was_disabled = self.disabled
                        self.disabled = False
                        try:
                            o.selected = True
                        finally:
                            self.disabled = was_disabled
                        break
        elif not self.multiple:
            # Ensure only one item selected.  Choose the last one,
            # following IE and Firefox.
            for o in found[:-1]:
                o.selected = False
2421 #---------------------------------------------------
class SubmitControl(ScalarControl):
    """
    Covers:

    INPUT/SUBMIT
    BUTTON/SUBMIT

    A clickable control.  It contributes its name/value pair to the form
    data only while it is being clicked.

    """
    def __init__(self, type, name, attrs, index=None):
        ScalarControl.__init__(self, type, name, attrs, index)
        # IE5 defaults SUBMIT value to "Submit Query"; Firebird 0.6 leaves it
        # blank, Konqueror 3.1 defaults to "Submit".  HTML spec. doesn't seem
        # to define this.
        if self.value is None: self.value = ""
        self.readonly = True

    def get_labels(self):
        """Return the button text (if any) followed by the regular labels."""
        res = []
        if self.value:
            res.append(Label({"__text": self.value}))
        res.extend(ScalarControl.get_labels(self))
        return res

    def is_of_kind(self, kind): return kind == "clickable"

    def _click(self, form, coord, return_type, request_class=urllib2.Request):
        """Return form._switch_click(...) with this control marked clicked.

        coord is remembered in _clicked for the duration of the call.

        """
        self._clicked = coord
        try:
            return form._switch_click(return_type, request_class)
        finally:
            # always reset, so that a failed click does not leave the
            # control looking clicked for later submissions
            self._clicked = False

    def _totally_ordered_pairs(self):
        """Return the control's pairs, or [] unless it is being clicked."""
        if not self._clicked:
            return []
        return ScalarControl._totally_ordered_pairs(self)
2459 #---------------------------------------------------
class ImageControl(SubmitControl):
    """
    Covers:

    INPUT/IMAGE

    Coordinates are specified using one of the HTMLForm.click* methods.

    """
    def __init__(self, type, name, attrs, index=None):
        SubmitControl.__init__(self, type, name, attrs, index)
        # SubmitControl makes itself readonly; image controls are not
        self.readonly = False

    def _totally_ordered_pairs(self):
        """Return the click coordinates (and the value, if any) as pairs.

        Yields "<name>.x" and "<name>.y", followed by name/value when the
        control has a non-empty value.  The result is [] if the control is
        disabled, nameless, or not being clicked.

        """
        coord = self._clicked
        if self.disabled or not coord:
            return []
        name = self.name
        if name is None:
            return []
        base = self._index
        pairs = []
        pairs.append((base, "%s.x" % name, str(coord[0])))
        pairs.append((base+1, "%s.y" % name, str(coord[1])))
        value = self._value
        if value:
            pairs.append((base+2, name, value))
        return pairs

    # skip SubmitControl's value-as-label behaviour
    get_labels = ScalarControl.get_labels
# aliases, just to make str(control) and str(form) clearer
class PasswordControl(TextControl):
    pass

class HiddenControl(TextControl):
    pass

class TextareaControl(TextControl):
    pass

class SubmitButtonControl(SubmitControl):
    pass
def is_listcontrol(control):
    """Return the result of asking control whether it is of kind "list"."""
    return control.is_of_kind("list")
2500 class HTMLForm:
2501 """Represents a single HTML <form> ... </form> element.
2503 A form consists of a sequence of controls that usually have names, and
2504 which can take on various values. The values of the various types of
2505 controls represent variously: text, zero-or-one-of-many or many-of-many
2506 choices, and files to be uploaded. Some controls can be clicked on to
2507 submit the form, and clickable controls' values sometimes include the
2508 coordinates of the click.
2510 Forms can be filled in with data to be returned to the server, and then
2511 submitted, using the click method to generate a request object suitable for
2512 passing to urllib2.urlopen (or the click_request_data or click_pairs
2513 methods if you're not using urllib2).
2515 import ClientForm
2516 forms = ClientForm.ParseFile(html, base_uri)
2517 form = forms[0]
2519 form["query"] = "Python"
2520 form.find_control("nr_results").get("lots").selected = True
2522 response = urllib2.urlopen(form.click())
2524 Usually, HTMLForm instances are not created directly. Instead, the
2525 ParseFile or ParseResponse factory functions are used. If you do construct
2526 HTMLForm objects yourself, however, note that an HTMLForm instance is only
2527 properly initialised after the fixup method has been called (ParseFile and
2528 ParseResponse do this for you). See ListControl.__doc__ for the reason
2529 this is required.
2531 Indexing a form (form["control_name"]) returns the named Control's value
2532 attribute. Assignment to a form index (form["control_name"] = something)
2533 is equivalent to assignment to the named Control's value attribute. If you
2534 need to be more specific than just supplying the control's name, use the
2535 set_value and get_value methods.
2537 ListControl values are lists of item names (specifically, the names of the
2538 items that are selected and not disabled, and hence are "successful" -- ie.
2539 cause data to be returned to the server). The list item's name is the
2540 value of the corresponding HTML element's "value" attribute.
2542 Example:
2544 <INPUT type="CHECKBOX" name="cheeses" value="leicester"></INPUT>
2545 <INPUT type="CHECKBOX" name="cheeses" value="cheddar"></INPUT>
2547 defines a CHECKBOX control with name "cheeses" which has two items, named
2548 "leicester" and "cheddar".
2550 Another example:
2552 <SELECT name="more_cheeses">
2553 <OPTION>1</OPTION>
2554 <OPTION value="2" label="CHEDDAR">cheddar</OPTION>
2555 </SELECT>
2557 defines a SELECT control with name "more_cheeses" which has two items,
2558 named "1" and "2" (because the OPTION element's value HTML attribute
2559 defaults to the element contents -- see SelectControl.__doc__ for more on
2560 these defaulting rules).
2562 To select, deselect or otherwise manipulate individual list items, use the
2563 HTMLForm.find_control() and ListControl.get() methods. To set the whole
2564 value, do as for any other control: use indexing or the set_/get_value
2565 methods.
2567 Example:
2569 # select *only* the item named "cheddar"
2570 form["cheeses"] = ["cheddar"]
2571 # select "cheddar", leave other items unaffected
2572 form.find_control("cheeses").get("cheddar").selected = True
2574 Some controls (RADIO and SELECT without the multiple attribute) can only
2575 have zero or one items selected at a time. Some controls (CHECKBOX and
2576 SELECT with the multiple attribute) can have multiple items selected at a
2577 time. To set the whole value of a ListControl, assign a sequence to a form
2578 index:
2580 form["cheeses"] = ["cheddar", "leicester"]
2582 If the ListControl is not multiple-selection, the assigned list must be of
2583 length zero or one.
2585 To check if a control has an item, if an item is selected, or if an item is
2586 successful (selected and not disabled), respectively:
2588 "cheddar" in [item.name for item in form.find_control("cheeses").items]
2589 "cheddar" in [item.name for item in form.find_control("cheeses").items if
2590 item.selected]
2591 "cheddar" in form["cheeses"] # (or "cheddar" in form.get_value("cheeses"))
2593 Note that some list items may be disabled (see below).
2595 Note the following mistake:
2597 form[control_name] = control_value
2598 assert form[control_name] == control_value # not necessarily true
2600 The reason for this is that form[control_name] always gives the list items
2601 in the order they were listed in the HTML.
2603 List items (hence list values, too) can be referred to in terms of list
2604 item labels rather than list item names using the appropriate label
2605 arguments. Note that each item may have several labels.
2607 The question of default values of OPTION contents, labels and values is
2608 somewhat complicated: see SelectControl.__doc__ and
2609 ListControl.get_item_attrs.__doc__ if you think you need to know.
2611 Controls can be disabled or readonly. In either case, the control's value
2612 cannot be changed until you clear those flags (see example below).
2613 Disabled is the state typically represented by browsers by 'greying out' a
2614 control. Disabled controls are not 'successful' -- they don't cause data
2615 to get returned to the server. Readonly controls usually appear in
2616 browsers as read-only text boxes. Readonly controls are successful. List
2617 items can also be disabled. Attempts to select or deselect disabled items
2618 fail with AttributeError.
2620 If a lot of controls are readonly, it can be useful to do this:
2622 form.set_all_readonly(False)
2624 To clear a control's value attribute, so that it is not successful (until a
2625 value is subsequently set):
2627 form.clear("cheeses")
2629 More examples:
2631 control = form.find_control("cheeses")
2632 control.disabled = False
2633 control.readonly = False
2634 control.get("gruyere").disabled = True
2635 control.items[0].selected = True
2637 See the various Control classes for further documentation. Many methods
2638 take name, type, kind, id, label and nr arguments to specify the control to
2639 be operated on: see HTMLForm.find_control.__doc__.
2641 ControlNotFoundError (subclass of ValueError) is raised if the specified
2642 control can't be found. This includes occasions where a non-ListControl
2643 is found, but the method (set, for example) requires a ListControl.
2644 ItemNotFoundError (subclass of ValueError) is raised if a list item can't
2645 be found. ItemCountError (subclass of ValueError) is raised if an attempt
2646 is made to select more than one item and the control doesn't allow that, or
2647 set/get_single are called and the control contains more than one item.
2648 AttributeError is raised if a control or item is readonly or disabled and
2649 an attempt is made to alter its value.
2651 Security note: Remember that any passwords you store in HTMLForm instances
2652 will be saved to disk in the clear if you pickle them (directly or
2653 indirectly). The simplest solution to this is to avoid pickling HTMLForm
2654 objects. You could also pickle before filling in any password, or just set
2655 the password to "" before pickling.
2658 Public attributes:
2660 action: full (absolute URI) form action
2661 method: "GET" or "POST"
2662 enctype: form transfer encoding MIME type
2663 name: name of form (None if no name was specified)
2664 attrs: dictionary mapping original HTML form attributes to their values
2666 controls: list of Control instances; do not alter this list
2667 (instead, call form.new_control to make a Control and add it to the
2668 form, or control.add_to_form if you already have a Control instance)
2672 Methods for form filling:
2673 -------------------------
2675 Most of these methods have very similar arguments. See
2676 HTMLForm.find_control.__doc__ for details of the name, type, kind, label
2677 and nr arguments.
2679 def find_control(self,
2680 name=None, type=None, kind=None, id=None, predicate=None,
2681 nr=None, label=None)
2683 get_value(name=None, type=None, kind=None, id=None, nr=None,
2684 by_label=False, # by_label is deprecated
2685 label=None)
2686 set_value(value,
2687 name=None, type=None, kind=None, id=None, nr=None,
2688 by_label=False, # by_label is deprecated
2689 label=None)
2691 clear_all()
2692 clear(name=None, type=None, kind=None, id=None, nr=None, label=None)
2694 set_all_readonly(readonly)
2697 Method applying only to FileControls:
2699 add_file(file_object,
2700 content_type="application/octet-stream", filename=None,
2701 name=None, id=None, nr=None, label=None)
2704 Methods applying only to clickable controls:
2706 click(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
2707 click_request_data(name=None, type=None, id=None, nr=0, coord=(1,1),
2708 label=None)
2709 click_pairs(name=None, type=None, id=None, nr=0, coord=(1,1), label=None)
# Maps a lower-cased control type string to the Control class that
# new_control instantiates for it.  Types missing from this table are
# handled by new_control (TextControl, or IgnoreControl when
# ignore_unknown is set).
type2class = {
    "text": TextControl,
    "password": PasswordControl,
    "hidden": HiddenControl,
    "textarea": TextareaControl,

    "isindex": IsindexControl,

    "file": FileControl,

    # plain buttons and reset buttons are mapped to IgnoreControl
    "button": IgnoreControl,
    "buttonbutton": IgnoreControl,
    "reset": IgnoreControl,
    "resetbutton": IgnoreControl,

    "submit": SubmitControl,
    "submitbutton": SubmitButtonControl,
    "image": ImageControl,

    "radio": RadioControl,
    "checkbox": CheckboxControl,
    "select": SelectControl,
    }
2737 #---------------------------------------------------
2738 # Initialisation. Use ParseResponse / ParseFile instead.
def __init__(self, action, method="GET",
             enctype="application/x-www-form-urlencoded",
             name=None, attrs=None,
             request_class=urllib2.Request,
             forms=None, labels=None, id_to_labels=None,
             backwards_compat=True):
    """
    In the usual case, use ParseResponse (or ParseFile) to create new
    HTMLForm objects.

    action: full (absolute URI) form action
    method: "GET" or "POST"
    enctype: form transfer encoding MIME type
    name: name of form
    attrs: dictionary mapping original HTML form attributes to their values
      (copied; an empty dictionary is used if None)

    """
    self.action = action
    self.method = method
    self.enctype = enctype
    self.name = name
    if attrs is None:
        self.attrs = {}
    else:
        self.attrs = attrs.copy()
    # must exist before backwards_compat is assigned below, because
    # __setattr__ walks self.controls for that attribute
    self.controls = []
    self._request_class = request_class

    # these attributes are used by zope.testbrowser
    self._forms = forms  # this is a semi-public API!
    self._labels = labels  # this is a semi-public API!
    self._id_to_labels = id_to_labels  # this is a semi-public API!

    self.backwards_compat = backwards_compat  # note __setattr__

    self._urlunparse = urlparse.urlunparse
    self._urlparse = urlparse.urlparse
2778 def __getattr__(self, name):
2779 if name == "backwards_compat":
2780 return self._backwards_compat
2781 return getattr(HTMLForm, name)
2783 def __setattr__(self, name, value):
2784 # yuck
2785 if name == "backwards_compat":
2786 name = "_backwards_compat"
2787 value = bool(value)
2788 for cc in self.controls:
2789 try:
2790 items = cc.items
2791 except AttributeError:
2792 continue
2793 else:
2794 for ii in items:
2795 for ll in ii.get_labels():
2796 ll._backwards_compat = value
2797 self.__dict__[name] = value
def new_control(self, type, name, attrs,
                ignore_unknown=False, select_default=False, index=None):
    """Adds a new control to the form.

    This is usually called by ParseFile and ParseResponse.  Don't call it
    yourself unless you're building your own Control instances.

    Note that controls representing lists of items are built up from
    controls holding only a single list item.  See ListControl.__doc__ for
    further information.

    type: type of control (see Control.__doc__ for a list)
    attrs: HTML attributes of control
    ignore_unknown: if true, use a dummy Control instance for controls of
      unknown type; otherwise, use a TextControl
    select_default: for RADIO and multiple-selection SELECT controls, pick
      the first item as the default if no 'selected' HTML attribute is
      present (this defaulting happens when the HTMLForm.fixup method is
      called)
    index: index of corresponding element in HTML (see
      MoreFormTests.test_interspersed_controls for motivation)

    """
    type = type.lower()
    control_class = self.type2class.get(type)
    if control_class is None:
        if ignore_unknown:
            control_class = IgnoreControl
        else:
            control_class = TextControl

    # the control gets its own copy of the attributes
    attrs_copy = attrs.copy()
    if issubclass(control_class, ListControl):
        control = control_class(type, name, attrs_copy, select_default, index)
    else:
        control = control_class(type, name, attrs_copy, index)

    if type == "select" and len(attrs) == 1:
        # A select with a single attribute starts a new SELECT element:
        # close the most recently added select control so that the new one
        # is not merged into it.
        position = len(self.controls)
        while position > 0:
            position = position - 1
            previous = self.controls[position]
            if previous.type == "select":
                previous.close_control()
                break

    control.add_to_form(self)
    control._urlparse = self._urlparse
    control._urlunparse = self._urlunparse
def fixup(self):
    """Normalise form after all controls have been added.

    This is usually called by ParseFile and ParseResponse.  Don't call it
    yourself unless you're building your own Control instances.

    This method should only be called once, after all controls have been
    added to the form.

    """
    for each_control in self.controls:
        each_control.fixup()
    # re-assign the flag to itself so that it is applied again now that
    # the controls are complete
    current = self._backwards_compat
    self.backwards_compat = current
2861 #---------------------------------------------------
2862 def __str__(self):
2863 header = "%s%s %s %s" % (
2864 (self.name and self.name+" " or ""),
2865 self.method, self.action, self.enctype)
2866 rep = [header]
2867 for control in self.controls:
2868 rep.append(" %s" % str(control))
2869 return "<%s>" % "\n".join(rep)
2871 #---------------------------------------------------
2872 # Form-filling methods.
2874 def __getitem__(self, name):
2875 return self.find_control(name).value
2876 def __contains__(self, name):
2877 return bool(self.find_control(name))
2878 def __setitem__(self, name, value):
2879 control = self.find_control(name)
2880 try:
2881 control.value = value
2882 except AttributeError, e:
2883 raise ValueError(str(e))
def get_value(self,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False,  # by_label is deprecated
              label=None):
    """Return value of control.

    If only the name argument is supplied, equivalent to

    form[name]

    With the deprecated by_label argument the control's
    get_value_by_label() result is returned instead; NotImplementedError
    is raised if the control has no such method.

    """
    if by_label:
        deprecation("form.get_value_by_label(...)")
    control = self.find_control(name, type, kind, id, label=label, nr=nr)
    if not by_label:
        return control.value
    try:
        getter = control.get_value_by_label
    except AttributeError:
        raise NotImplementedError(
            "control '%s' does not yet support by_label" % control.name)
    return getter()
def set_value(self, value,
              name=None, type=None, kind=None, id=None, nr=None,
              by_label=False,  # by_label is deprecated
              label=None):
    """Set value of control.

    If only name and value arguments are supplied, equivalent to

    form[name] = value

    With the deprecated by_label argument the value is passed to the
    control's set_value_by_label() instead; NotImplementedError is raised
    if the control has no such method.

    """
    if by_label:
        # point users at the setter (the message used to name the getter)
        deprecation("form.set_value_by_label(...)")
    c = self.find_control(name, type, kind, id, label=label, nr=nr)
    if by_label:
        try:
            meth = c.set_value_by_label
        except AttributeError:
            raise NotImplementedError(
                "control '%s' does not yet support by_label" % c.name)
        else:
            meth(value)
    else:
        c.value = value
def get_value_by_label(
    self, name=None, type=None, kind=None, id=None, label=None, nr=None):
    """Return the get_value_by_label() result of the specified control.

    The control is located with find_control().

    All arguments should be passed by name.

    """
    return self.find_control(
        name, type, kind, id, label=label, nr=nr).get_value_by_label()
def set_value_by_label(
    self, value,
    name=None, type=None, kind=None, id=None, label=None, nr=None):
    """Pass *value* to set_value_by_label() of the specified control.

    The control is located with find_control().

    All arguments should be passed by name.

    """
    self.find_control(
        name, type, kind, id, label=label, nr=nr).set_value_by_label(value)
def set_all_readonly(self, readonly):
    """Set the readonly attribute of every control to bool(readonly)."""
    flag = bool(readonly)
    for ctl in self.controls:
        ctl.readonly = flag
def clear_all(self):
    """Clear the value attributes of all controls in the form.

    Calls clear() on each control in turn.  See HTMLForm.clear.__doc__.

    """
    for ctl in self.controls:
        ctl.clear()
def clear(self,
          name=None, type=None, kind=None, id=None, nr=None, label=None):
    """Clear the value attribute of a control.

    The control is located with find_control() and its clear() method is
    called.

    As a result, the affected control will not be successful until a value
    is subsequently set.  AttributeError is raised on readonly controls.

    """
    self.find_control(name, type, kind, id, label=label, nr=nr).clear()
2979 #---------------------------------------------------
2980 # Form-filling methods applying only to ListControls.
def possible_items(self,  # deprecated
                   name=None, type=None, kind=None, id=None,
                   nr=None, by_label=False, label=None):
    """Return a list of all values that the specified control can take.

    The list control is located with _find_list_control(); by_label is
    forwarded to the control's possible_items() method.

    """
    return self._find_list_control(
        name, type, kind, id, label, nr).possible_items(by_label)
def set(self, selected, item_name,  # deprecated
        name=None, type=None, kind=None, id=None, nr=None,
        by_label=False, label=None):
    """Select / deselect named list item.

    selected: boolean selected state

    The list control is located with _find_list_control(); selected,
    item_name and by_label are forwarded to its set() method.

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.set(selected, item_name, by_label)
def toggle(self, item_name,  # deprecated
           name=None, type=None, kind=None, id=None, nr=None,
           by_label=False, label=None):
    """Toggle selected state of named list item.

    The list control is located with _find_list_control(); item_name and
    by_label are forwarded to its toggle() method.

    """
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.toggle(item_name, by_label)
def set_single(self, selected,  # deprecated
               name=None, type=None, kind=None, id=None,
               nr=None, by_label=None, label=None):
    """Select / deselect list item in a control having only one item.

    If the control has multiple list items, ItemCountError is raised.

    This is just a convenience method, so you don't need to know the item's
    name -- the item name in these single-item controls is usually
    something meaningless like "1" or "on".

    For example, if a checkbox has a single item named "on", the following
    two calls are equivalent:

    control.toggle("on")
    control.toggle_single()

    """  # by_label ignored and deprecated
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.set_single(selected)
def toggle_single(self, name=None, type=None, kind=None, id=None,
                  nr=None, by_label=None, label=None):  # deprecated
    """Toggle selected state of list item in control having only one item.

    The rest is as for HTMLForm.set_single.__doc__.

    """  # by_label ignored and deprecated
    list_control = self._find_list_control(name, type, kind, id, label, nr)
    list_control.toggle_single()
3035 #---------------------------------------------------
3036 # Form-filling method applying only to FileControls.
def add_file(self, file_object, content_type=None, filename=None,
             name=None, id=None, nr=None, label=None):
    """Add a file to be uploaded.

    file_object: file-like object (with read method) from which to read
     data to upload
    content_type: MIME content type of data to upload
    filename: filename to pass to server

    If filename is None, no filename is sent to the server.

    If content_type is None, the content type is guessed based on the
    filename and the data read from the file object.

    XXX
    At the moment, guessed content type is always application/octet-stream.
    Use sndhdr, imghdr modules.  Should also try to guess HTML, XML, and
    plain text.

    Note the following useful HTML attributes of file upload controls (see
    HTML 4.01 spec, section 17):

    accept: comma-separated list of content types that the server will
     handle correctly; you can use this to filter out non-conforming files
    size: XXX IIRC, this is indicative of whether form wants multiple or
     single files
    maxlength: XXX hint of max content length in bytes?

    The control is located with find_control() using type "file" together
    with the name, id, label and nr arguments.

    """
    file_control = self.find_control(
        name, "file", id=id, label=label, nr=nr)
    file_control.add_file(file_object, content_type, filename)
3070 #---------------------------------------------------
3071 # Form submission methods, applying only to clickable controls.
def click(self, name=None, type=None, id=None, nr=0, coord=(1,1),
          request_class=urllib2.Request,
          label=None):
    """Return request that would result from clicking on a control.

    The request object is a urllib2.Request instance, which you can pass to
    urllib2.urlopen (or ClientCookie.urlopen).

    Only some control types (INPUT/SUBMIT & BUTTON/SUBMIT buttons and
    IMAGEs) can be clicked.

    Will click on the first clickable control, subject to the name, type
    and nr arguments (as for find_control).  If no name, type, id or number
    is specified and there are no clickable controls, a request will be
    returned for the form in its current, un-clicked, state.

    ControlNotFoundError is raised if any of name, type, id or nr is
    specified but no matching control is found.  ValueError is raised if
    the HTMLForm has an enctype attribute that is not recognised.

    You can optionally specify a coordinate to click at, which only makes a
    difference if you clicked on an image.

    Note that the request_class argument is not used here: the form's own
    _request_class attribute is what gets passed on to _click().

    """
    request = self._click(name, type, id, label, nr, coord, "request",
                          self._request_class)
    return request
def click_request_data(self,
                       name=None, type=None, id=None,
                       nr=0, coord=(1,1),
                       request_class=urllib2.Request,
                       label=None):
    """As for click method, but return a tuple (url, data, headers).

    You can use this data to send a request to the server.  This is useful
    if you're using httplib or urllib rather than urllib2.  Otherwise, use
    the click method.

    # Untested.  Have to subclass to add headers, I think -- so use urllib2
    # instead!
    import urllib
    url, data, hdrs = form.click_request_data()
    r = urllib.urlopen(url, data)

    # Untested.  I don't know of any reason to use httplib -- you can get
    # just as much control with urllib2.
    import httplib, urlparse
    url, data, hdrs = form.click_request_data()
    tup = urlparse.urlparse(url)
    host, path = tup[1], urlparse.urlunparse((None, None)+tup[2:])
    conn = httplib.HTTPConnection(host)
    if data:
        conn.request("POST", path, data, hdrs)
    else:
        conn.request("GET", path, headers=hdrs)
    r = conn.getresponse()

    Note that the request_class argument is not used here: the form's own
    _request_class attribute is what gets passed on to _click().

    """
    request_data = self._click(name, type, id, label, nr, coord,
                               "request_data", self._request_class)
    return request_data
def click_pairs(self, name=None, type=None, id=None,
                nr=0, coord=(1,1),
                label=None):
    """As for click_request_data, but returns a list of (key, value) pairs.

    You can use this list as an argument to ClientForm.urlencode.  This is
    usually only useful if you're using httplib or urllib rather than
    urllib2 or ClientCookie.  It may also be useful if you want to manually
    tweak the keys and/or values, but this should not be necessary.
    Otherwise, use the click method.

    Note that this method is only useful for forms of MIME type
    x-www-form-urlencoded.  In particular, it does not return the
    information required for file upload.  If you need file upload and are
    not using urllib2, use click_request_data.

    Also note that Python 2.0's urllib.urlencode is slightly broken: it
    only accepts a mapping, not a sequence of pairs, as an argument.  This
    messes up any ordering in the argument.  Use ClientForm.urlencode
    instead.

    """
    pairs = self._click(name, type, id, label, nr, coord, "pairs",
                        self._request_class)
    return pairs
3159 #---------------------------------------------------
def find_control(self,
                 name=None, type=None, kind=None, id=None,
                 predicate=None, nr=None,
                 label=None):
    """Locate and return some specific control within the form.

    At least one of the name, type, kind, id, label, predicate and nr
    arguments must be supplied; otherwise ValueError is raised.  If no
    matching control is found, ControlNotFoundError is raised.

    If name is specified, then the control must have the indicated name.

    If type is specified then the control must have the specified type (in
    addition to the types possible for <input> HTML tags: "text",
    "password", "hidden", "submit", "image", "button", "radio", "checkbox",
    "file" we also have "reset", "buttonbutton", "submitbutton",
    "resetbutton", "textarea", "select" and "isindex").

    If kind is specified, then the control must fall into the specified
    group, each of which satisfies a particular interface.  The types are
    "text", "list", "multilist", "singlelist", "clickable" and "file".

    If id is specified, then the control must have the indicated id.

    If predicate is specified, then the control must match that function.
    The predicate function is passed the control as its single argument,
    and should return a boolean value indicating whether the control
    matched.

    nr, if supplied, is the sequence number of the control (where 0 is the
    first).  Note that control 0 is the first control matching all the
    other arguments (if supplied); it is not necessarily the first control
    in the form.  If no nr is supplied, AmbiguityError is raised if
    multiple controls match the other arguments (unless the
    .backwards_compat attribute is true).

    If label is specified, then the control must have this label.  Note
    that radio controls and checkboxes never have labels: their items do.

    """
    criteria = (name, type, kind, id, label, predicate, nr)
    supplied = [arg for arg in criteria if arg is not None]
    if not supplied:
        raise ValueError(
            "at least one argument must be supplied to specify control")
    return self._find_control(name, type, kind, id, label, predicate, nr)
3208 #---------------------------------------------------
3209 # Private methods.
def _find_list_control(self,
                       name=None, type=None, kind=None, id=None,
                       label=None, nr=None):
    """Locate a control via _find_control() with is_listcontrol as predicate.

    At least one of name, type, kind, id, label and nr must be supplied;
    otherwise ValueError is raised.

    """
    for criterion in (name, type, kind, id, label, nr):
        if criterion is not None:
            return self._find_control(name, type, kind, id, label,
                                      is_listcontrol, nr)
    raise ValueError(
        "at least one argument must be supplied to specify control")
def _find_control(self, name, type, kind, id, label, predicate, nr):
    """Return the control that satisfies every supplied criterion.

    name, type, kind, id, label: string-like, or None to ignore the
     criterion.  name may also be Missing, in which case controls whose
     name is None are accepted as well.  A control matches label when the
     label string occurs within the text of one of the control's labels;
     an empty label is ignored.
    predicate: callable taking a control, or None
    nr: non-negative index into the sequence of matching controls, or None

    When nr is given, the nr-th matching control is returned.  When nr is
    None and self.backwards_compat is true, the first match is returned.
    Otherwise exactly one control must match.

    TypeError is raised for an argument of the wrong type and ValueError
    for a negative nr.  AmbiguityError is raised if several controls match
    and no nr was given; ControlNotFoundError is raised if no control (or
    no nr-th control) matches.

    """
    if ((name is not None) and (name is not Missing) and
        not isstringlike(name)):
        raise TypeError("control name must be string-like")
    if (type is not None) and not isstringlike(type):
        raise TypeError("control type must be string-like")
    if (kind is not None) and not isstringlike(kind):
        raise TypeError("control kind must be string-like")
    if (id is not None) and not isstringlike(id):
        raise TypeError("control id must be string-like")
    if (label is not None) and not isstringlike(label):
        raise TypeError("control label must be string-like")
    if (predicate is not None) and not callable(predicate):
        raise TypeError("control predicate must be callable")
    if (nr is not None) and nr < 0:
        raise ValueError("control number must be a positive integer")

    # nr is counted down while scanning; keep the caller's value for the
    # error description.
    orig_nr = nr
    found = None
    ambiguous = False
    if nr is None and self.backwards_compat:
        # backwards-compatible behaviour: silently take the first match
        nr = 0

    for control in self.controls:
        if ((name is not None and name != control.name) and
            (name is not Missing or control.name is not None)):
            continue
        if type is not None and type != control.type:
            continue
        if kind is not None and not control.is_of_kind(kind):
            continue
        if id is not None and id != control.id:
            continue
        if predicate and not predicate(control):
            continue
        if label:
            for l in control.get_labels():
                if l.text.find(label) > -1:
                    break
            else:
                # none of this control's labels contains the text
                continue
        if nr is not None:
            if nr == 0:
                return control  # early exit: unambiguous due to nr
            nr -= 1
            continue
        if found:
            ambiguous = True
            break
        found = control

    if found and not ambiguous:
        return found

    description = []
    if name is not None: description.append("name %s" % repr(name))
    if type is not None: description.append("type '%s'" % type)
    if kind is not None: description.append("kind '%s'" % kind)
    if id is not None: description.append("id '%s'" % id)
    if label is not None: description.append("label '%s'" % label)
    if predicate is not None:
        description.append("predicate %s" % predicate)
    # Compare against None so that an explicit nr of 0 is reported too.
    if orig_nr is not None: description.append("nr %d" % orig_nr)
    description = ", ".join(description)

    if ambiguous:
        raise AmbiguityError("more than one control matching "+description)
    # Not ambiguous and nothing returned above, so nothing matched.
    raise ControlNotFoundError("no control matching "+description)
def _click(self, name, type, id, label, nr, coord, return_type,
           request_class=urllib2.Request):
    """Click the matching clickable control, or submit the form un-clicked.

    The control is looked up with _find_control() using kind "clickable"
    and its _click() method produces the result.  If no control is found
    and none was explicitly requested (name, type and id are all None and
    nr is 0), the result for the form in its current state is returned via
    _switch_click(); otherwise ControlNotFoundError is propagated.

    """
    try:
        control = self._find_control(
            name, type, "clickable", id, label, None, nr)
    except ControlNotFoundError:
        nothing_requested = (name is None and type is None and
                             id is None and nr == 0)
        if not nothing_requested:
            raise
        # no clickable controls, but no control was explicitly requested,
        # so return state without clicking any control
        return self._switch_click(return_type, request_class)
    return control._click(self, coord, return_type, request_class)
def _pairs(self):
    """Return sequence of (key, value) pairs suitable for urlencoding."""
    result = []
    for entry in self._pairs_and_controls():
        # entry is (index, key, value, control_index); keep key and value
        (i, k, v, c_i) = entry
        result.append((k, v))
    return result
def _pairs_and_controls(self):
    """Return sequence of (index, key, value, control_index)
    of totally ordered pairs suitable for urlencoding.

    control_index is the index of the control in self.controls

    """
    ordered = []
    control_index = 0
    for control in self.controls:
        for ii, key, val in control._totally_ordered_pairs():
            ordered.append((ii, key, val, control_index))
        control_index += 1

    # Plain tuple sort: entries are ordered by index first; any entries
    # with equal indexes are ordered by their remaining fields.
    ordered.sort()

    return ordered
def _request_data(self):
    """Return a tuple (url, data, headers).

    For GET forms the urlencoded pairs replace the query part of the
    action URL, data is None and headers is empty.  For POST forms the
    action's query is kept and data is the encoded form body, either
    urlencoded or multipart/form-data according to self.enctype.  The
    fragment of the action URL is dropped in both cases.

    ValueError is raised for an unknown method or an enctype that is not
    supported for the method.

    """
    verb = self.method.upper()
    split = self._urlparse(self.action)
    leading = split[:-2]
    query, frag = split[-2:]

    if verb not in ("GET", "POST"):
        raise ValueError("Unknown method '%s'" % verb)

    if verb == "GET":
        if self.enctype != "application/x-www-form-urlencoded":
            raise ValueError(
                "unknown GET form encoding type '%s'" % self.enctype)
        uri = self._urlunparse(leading + (urlencode(self._pairs()), None))
        return uri, None, []

    # POST
    uri = self._urlunparse(leading + (query, None))
    if self.enctype == "application/x-www-form-urlencoded":
        return (uri, urlencode(self._pairs()),
                [("Content-type", self.enctype)])
    if self.enctype != "multipart/form-data":
        raise ValueError(
            "unknown POST form encoding type '%s'" % self.enctype)

    body = StringIO()
    http_hdrs = []
    writer = MimeWriter(body, http_hdrs)
    writer.startmultipartbody("form-data", add_to_http_hdrs=True, prefix=0)
    # each control writes its own part(s), in total pair order
    for ii, k, v, control_index in self._pairs_and_controls():
        self.controls[control_index]._write_mime_data(writer, k, v)
    writer.lastpart()
    return uri, body.getvalue(), http_hdrs
def _switch_click(self, return_type, request_class=urllib2.Request):
    """Return the form's submission data in the shape named by return_type.

    "pairs" gives the result of _pairs(), "request_data" the result of
    _request_data(); anything else gives a request_class instance built
    from the request data, with its headers added.

    """
    # This is called by HTMLForm and clickable Controls to hide switching
    # on return_type.
    if return_type == "pairs":
        return self._pairs()
    req_data = self._request_data()
    if return_type == "request_data":
        return req_data

    req = request_class(req_data[0], req_data[1])
    for key, val in req_data[2]:
        if key.lower() == "content-type":
            # falls back to add_header when the request object has no
            # add_unredirected_header (pre-2.4 and not using ClientCookie)
            add_hdr = getattr(req, "add_unredirected_header",
                              req.add_header)
        else:
            add_hdr = req.add_header
        add_hdr(key, val)
    return req