2 # Copyright (c) 2011 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
7 """Extracts registration forms from the corresponding HTML files.
Used for extracting forms within HTML files. This script is used in
conjunction with the webforms_aggregator.py script, which aggregates web pages
with fillable forms (i.e., registration forms).
The purpose of this script is to strip out all non-form elements that may be
causing parsing errors and timeout issues when running browser_tests.
This script extracts all forms from an HTML file.
If there are multiple forms per downloaded site, one file is created per form.
20 Used as a standalone script but assumes that it is run from the directory in
21 which it is checked into.
23 Usage: forms_extractor.py [options]
26 -l LOG_LEVEL, --log_level=LOG_LEVEL,
27 LOG_LEVEL: debug, info, warning or error [default: error]
  -j, --js  strips only the javascript elements from the web page.
29 -h, --help show this help message and exit
34 from optparse
import OptionParser
class FormsExtractor(object):
    """Extracts HTML files, leaving only registration forms from the HTML file.

    Every '*.html' file found in the input directory is first cleaned of
    <script> elements, 'href="javascript:..."' attributes and inline 'on...'
    event attributes. Depending on the argument given to Extract(), either the
    cleaned page is saved as a whole, or each <form> element is saved to its
    own numbered file.

    The regular expressions are plain raw strings (not |ur| literals) so that
    the class can be loaded by both Python 2 and Python 3.
    """
    _HTML_FILES_PATTERN = r'*.html'
    _HTML_FILE_PREFIX = r'grabber-'
    _FORM_FILE_PREFIX = r'grabber-stripped-'

    _REGISTRATION_PAGES_DIR = os.path.join(
        os.pardir, 'test', 'data', 'autofill', 'heuristics', 'input')
    _EXTRACTED_FORMS_DIR = os.path.join(
        os.pardir, 'test', 'data', 'autofill', 'heuristics', 'input')

    # The logger and the handler registry are class attributes, so they are
    # shared by all instances; the registry prevents the same console handler
    # from being attached more than once.
    logger = logging.getLogger(__name__)
    log_handlers = {'StreamHandler': None}

    # Flags shared by every pattern below: unicode, dot-matches-newline,
    # case-insensitive and verbose (whitespace and '#' comments are ignored).
    _RE_FLAGS = re.U | re.S | re.I | re.X

    # This pattern is used for retrieving the form location comment located at
    # the top of each downloaded HTML file indicating where the form
    # originated from.
    _RE_FORM_LOCATION_PATTERN = re.compile(
        r"""
        <!--Form\s{1}Location:  # Starting of form location comment.
        .*?                     # Any characters (non-greedy).
        -->                     # Ending of the form comment.
        """, _RE_FLAGS)

    # This pattern is used for removing all script code.
    _RE_SCRIPT_PATTERN = re.compile(
        r"""
        <script       # A new opening '<script' tag.
        \b            # The end of the word 'script'.
        .*?           # Any characters (non-greedy).
        >             # Ending of the (opening) tag: '>'.
        .*?           # Any characters (non-greedy) between the tags.
        </script\s*>  # The '</script>' closing tag.
        """, _RE_FLAGS)

    # This pattern is used for removing all href js code.
    _RE_HREF_JS_PATTERN = re.compile(
        r"""
        \bhref             # The word href and its beginning.
        \s*=\s*            # The '=' with all whitespace before and after it.
        (?P<quote>[\'\"])  # A single or double quote which is captured.
        \s*javascript\s*:  # The word 'javascript:' with any whitespace possible.
        .*?                # Any characters (non-greedy) between the quotes.
        \1                 # The previously captured single or double quote.
        """, _RE_FLAGS)

    # Expression matching one js event attribute. It is kept as a string
    # because it is embedded in the two compiled patterns that follow. It must
    # start and end with a newline so that the verbose-mode comments of the
    # neighbouring fragments are terminated properly.
    _RE_EVENT_EXPR = r"""
        \b                 # The beginning of a new word.
        on\w+?             # All words starting with 'on' (non-greedy)
                           # example: |onmouseover|.
        \s*=\s*            # The '=' with all whitespace before and after it.
        (?P<quote>[\'\"])  # A captured single or double quote.
        .*?                # Any characters (non-greedy) between the quotes.
        \1                 # The previously captured single or double quote.
        """

    # This pattern is used for removing code with js events, such as |onload|.
    # By adding the leading |<[^<>]*?| and the trailing |[^<>]*?>| the pattern
    # matches whole tags such as '<tr class="nav"
    # onmouseover="mOvr1(this);" onmouseout="mOut1(this);">'
    _RE_TAG_WITH_EVENTS_PATTERN = re.compile(
        r"""
        <        # Matches character '<'.
        [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).""" +
        _RE_EVENT_EXPR +
        r"""
        [^<>]*?  # Matches any characters except '<' and '>' (non-greedy).
        >        # Matches character '>'.
        """, _RE_FLAGS)

    # Adds whitespace chars at the end of the matched event. Also match
    # trailing whitespaces for JS events. Do not match leading whitespace.
    # For example: |< /form>| is invalid HTML and does not exist but
    # |</form >| is considered valid HTML.
    _RE_EVENT_PATTERN = re.compile(_RE_EVENT_EXPR + r'\s*', _RE_FLAGS)

    # This pattern is used for finding form elements.
    _RE_FORM_PATTERN = re.compile(
        r"""
        <form       # A new opening '<form' tag.
        \b          # The end of the word 'form'.
        .*?         # Any characters (non-greedy).
        >           # Ending of the (opening) tag: '>'.
        .*?         # Any characters (non-greedy) between the tags.
        </form\s*>  # The '</form>' closing tag.
        """, _RE_FLAGS)

    def __init__(self, input_dir=_REGISTRATION_PAGES_DIR,
                 output_dir=_EXTRACTED_FORMS_DIR, logging_level=None):
        """Creates a FormsExtractor object.

        Args:
          input_dir: the directory of HTML files.
          output_dir: the directory where the registration form files will be
                      saved. It is created if it does not exist yet.
          logging_level: verbosity level, default is None. A false value
                         detaches the shared console handler.

        Raises:
          IOError exception if input directory doesn't exist.
        """
        console = self.log_handlers['StreamHandler']
        if logging_level:
            if console is None:
                console = logging.StreamHandler()
                console.setLevel(logging.DEBUG)
                self.log_handlers['StreamHandler'] = console
                self.logger.addHandler(console)
            self.logger.setLevel(logging_level)
        elif console is not None:
            self.logger.removeHandler(console)
            self.log_handlers['StreamHandler'] = None

        self._input_dir = input_dir
        self._output_dir = output_dir
        if not os.path.isdir(self._input_dir):
            error_msg = 'Directory "%s" doesn\'t exist.' % self._input_dir
            self.logger.error('Error: %s', error_msg)
            raise IOError(error_msg)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        self._form_location_comment = ''

    def _SubstituteAllEvents(self, matchobj):
        """Remove all js events that are present as attributes within a tag.

        Args:
          matchobj: A regexp |re.MatchObject| containing text that has at
                    least one event. Example: |<tr class="nav"
                    onmouseover="mOvr1(this);" onmouseout="mOut1(this);">|.

        Returns:
          The text containing the tag with all the attributes except for the
          attributes with events. Example: |<tr class="nav">|.
        """
        return self._RE_EVENT_PATTERN.sub('', matchobj.group(0))

    def _StripJavascript(self, html):
        """Returns |html| without scripts, javascript hrefs and js events."""
        without_scripts = self._RE_SCRIPT_PATTERN.sub('', html)
        without_js_hrefs = self._RE_HREF_JS_PATTERN.sub('', without_scripts)
        return self._RE_TAG_WITH_EVENTS_PATTERN.sub(
            self._SubstituteAllEvents, without_js_hrefs)

    def _WriteFile(self, path, chunks):
        """Writes the strings in |chunks| to |path|, one after the other.

        Returns:
          True on success; False if an IOError occurred (the error is logged).
        """
        try:
            with open(path, 'w') as out:
                for chunk in chunks:
                    out.write(chunk)
        except IOError as e:
            self.logger.error('Error: %s', e)
            return False
        return True

    def Extract(self, strip_js_only):
        """Extracts and saves the extracted registration forms.

        Iterates through all the HTML files.

        Args:
          strip_js_only: If True, only Javascript is stripped from the HTML
                         content. Otherwise, all non-form elements are
                         stripped.
        """
        pathname_pattern = os.path.join(self._input_dir,
                                        self._HTML_FILES_PATTERN)
        html_files = [f for f in glob.glob(pathname_pattern)
                      if os.path.isfile(f)]
        for filename in html_files:
            self.logger.info('Stripping file "%s" ...', filename)
            # Plain text mode: the legacy 'U' flag is rejected by Python 3.11+,
            # where universal newlines are already the text-mode default.
            with open(filename) as f:
                html_content = self._StripJavascript(f.read())

            base = os.path.split(filename)[1]  # Path dropped.
            base = base.replace(self._HTML_FILE_PREFIX, '', 1)
            stem, extension = os.path.splitext(base)
            # Output names are concatenated instead of being produced with
            # the '%' operator, so a '%' in the source file name cannot be
            # misread as a conversion specifier.
            stem = os.path.join(self._output_dir,
                                self._FORM_FILE_PREFIX + stem)

            if strip_js_only:
                succeeded = self._WriteFile(stem + extension, [html_content])
            else:  # Remove all non form elements.
                match = self._RE_FORM_LOCATION_PATTERN.search(html_content)
                # '\n' rather than os.linesep: the file is written in text
                # mode, which already translates '\n' to the platform line
                # ending.
                form_location_comment = match.group() + '\n' if match else ''
                succeeded = True
                forms = self._RE_FORM_PATTERN.finditer(html_content)
                for form_number, form_match in enumerate(forms, start=1):
                    # NOTE(review): numbered name is stem + number + extension
                    # with no separator -- confirm this matches the expected
                    # naming of the extracted files.
                    numbered_form_filename = (
                        stem + str(form_number) + extension)
                    if not self._WriteFile(
                            numbered_form_filename,
                            [form_location_comment, form_match.group()]):
                        succeeded = False
            # Success is reported only when every output file was written.
            if succeeded:
                self.logger.info('\tFile "%s" extracted SUCCESSFULLY!',
                                 filename)
def main():
    """Parses the command line and runs the forms extraction.

    Returns:
      0 on success, 1 if the given log level is not one of debug, info,
      warning or error (the usage text is printed in that case).
    """
    parser = OptionParser()
    parser.add_option(
        '-l', '--log_level', metavar='LOG_LEVEL', default='error',
        help='LOG_LEVEL: debug, info, warning or error [default: %default]')
    parser.add_option(
        '-j', '--js', dest='js', action='store_true', default=False,
        help='Removes all javascript elements [default: %default]')

    # Positional arguments are not used by this script.
    options, _ = parser.parse_args()
    level_name = options.log_level.upper()
    if level_name not in ('DEBUG', 'INFO', 'WARNING', 'ERROR'):
        # Written as a call so the line parses on both Python 2 and Python 3.
        print('Wrong log_level argument.')
        parser.print_help()
        return 1

    # Map the validated name to the numeric constant of the logging module.
    extractor = FormsExtractor(logging_level=getattr(logging, level_name))
    extractor.Extract(options.js)
    return 0


if __name__ == '__main__':
    # Raising SystemExit with main()'s return value sets the process exit
    # status without needing any further import.
    raise SystemExit(main())