# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import base64
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface

# Maps a git file action to the single-letter code used by the SVN parser.
FILE_CHANGE_TYPE_MAP = {
    'add': 'A',
    'copy': 'C',
    'delete': 'D',
    'modify': 'M',
    'rename': 'R'
}


def _ConvertToFileChangeType(file_action):
  # TODO(stgao): verify impact on code that checks the file change type.
  return file_action[0].upper()
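
# For example, _ConvertToFileChangeType('modify') returns 'M' and
# _ConvertToFileChangeType('add') returns 'A', matching the values in
# FILE_CHANGE_TYPE_MAP above.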


class GitParser(ParserInterface):
  """Parser for Git repository in googlesource.

  Attributes:
    parsed_deps: A map from component path to its repository name, regression,
                 etc.
    url_parts_map: A map from url type to its url parts. These parts are
                   appended to the base url to form different urls.
  """

  def __init__(self, parsed_deps, url_parts_map):
    self.component_to_url_map = parsed_deps
    self.url_parts_map = url_parts_map
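
  # A minimal sketch of the expected shapes (illustrative values only; the
  # real maps are built from the parsed DEPS configuration):
  #   parsed_deps = {
  #       'src/': {'repository':
  #                'https://chromium.googlesource.com/chromium/src'}}
  #   url_parts_map = {
  #       'changelog_url': '/+log/%s..%s',  # gitiles commit log
  #       'revision_url': '/+/%s'}          # single commit page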

  def ParseChangelog(self, component_path, range_start, range_end):
    file_to_revision_map = {}
    revision_map = {}
    base_url = self.component_to_url_map[component_path]['repository']
    changelog_url = base_url + self.url_parts_map['changelog_url']
    revision_url = base_url + self.url_parts_map['revision_url']

    # Retrieve data from the url; return empty maps if it fails. Html url is
    # a url where the changelog can be parsed from html.
    url = changelog_url % (range_start, range_end)
    html_url = url + '?pretty=fuller'
    response = crash_utils.GetDataFromURL(html_url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, try parsing from
    # JSON instead.
    try:
      dom = minidom.parseString(response)
    except ExpatError:
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # The revision information is in the divs from the third one to the
    # second to last one.
    divs = dom.getElementsByTagName('div')[2:-1]
    pres = dom.getElementsByTagName('pre')
    uls = dom.getElementsByTagName('ul')

    # Divs, pres and uls each contain revision information for one CL, so
    # they should have the same length.
    if not divs or len(divs) != len(pres) or len(pres) != len(uls):
      self.ParseChangelogFromJSON(range_start, range_end, changelog_url,
                                  revision_url, revision_map,
                                  file_to_revision_map)
      return (revision_map, file_to_revision_map)

    # Iterate through the divs and parse revisions.
    for (div, pre, ul) in zip(divs, pres, uls):
      # Create a new revision object for each revision.
      revision = {}

      # There must be three <tr>s. If not, this page is wrong.
      trs = div.getElementsByTagName('tr')
      if len(trs) != 3:
        continue

      # Retrieve the git hash.
      githash = trs[0].getElementsByTagName('a')[0].firstChild.nodeValue

      # Retrieve and set the author.
      author = trs[1].getElementsByTagName(
          'td')[0].firstChild.nodeValue.split('<')[0]
      revision['author'] = author
      revision['time'] = trs[1].getElementsByTagName(
          'td')[1].firstChild.nodeValue

      # Retrieve and set the message.
      revision['message'] = pre.firstChild.nodeValue

      # Set the url of this CL.
      revision_url_part = self.url_parts_map['revision_url'] % githash
      revision['url'] = base_url + revision_url_part

      # Go through the changed files; they are in <li> elements.
      lis = ul.getElementsByTagName('li')
      for li in lis:
        # Retrieve the path and action of the changed file.
        file_path = li.getElementsByTagName('a')[0].firstChild.nodeValue
        file_change_type = li.getElementsByTagName('span')[
            0].getAttribute('class')

        # Normalize the file action so that it matches the SVN parser.
        file_change_type = _ConvertToFileChangeType(file_change_type)

        # Add the changed file to the map.
        if file_path not in file_to_revision_map:
          file_to_revision_map[file_path] = []
        file_to_revision_map[file_path].append((githash, file_change_type))

      # Add this revision object to the map.
      revision_map[githash] = revision

    # Parse one revision for the start of the range, because googlesource
    # does not include the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

    return (revision_map, file_to_revision_map)
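
  # For example, a successful ParseChangelog call might return (values
  # illustrative):
  #   revision_map = {'abc123...': {'author': 'someone@chromium.org',
  #                                 'time': '...', 'message': '...',
  #                                 'url': 'https://.../+/abc123...'}}
  #   file_to_revision_map = {'foo/bar.cc': [('abc123...', 'M')]}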

  def ParseChangelogFromJSON(self, range_start, range_end, changelog_url,
                             revision_url, revision_map, file_to_revision_map):
    """Parses the changelog by going over the JSON file.

    Args:
      range_start: Starting range of the regression.
      range_end: Ending range of the regression.
      changelog_url: The url to retrieve the changelog from.
      revision_url: The url to retrieve an individual revision from.
      revision_map: A map from a git hash number to its revision information.
      file_to_revision_map: A map from a file to a git hash in which it occurs.
    """
    # Compute the URLs from the given range, and retrieve the changelog. Stop
    # if it fails.
    changelog_url %= (range_start, range_end)
    json_url = changelog_url + '?format=json'
    response = crash_utils.GetDataFromURL(json_url)
    if not response:
      return

    # Parse the changelog from the returned string, which should start with
    # the ")]}'\n" anti-XSSI prefix, so start from the 6th character.
    revisions = crash_utils.LoadJSON(response[5:])
    if not revisions:
      return
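
    # A minimal sketch of the decoded changelog JSON (field names assumed
    # from the accesses below; values are illustrative):
    #   {'log': [{'commit': 'abc123...'}, {'commit': 'def456...'}]}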

    # Parse each individual revision in the log.
    for revision in revisions['log']:
      githash = revision['commit']
      self.ParseRevision(revision_url, githash, revision_map,
                         file_to_revision_map)

    # Parse the revision with range_start, because googlesource ignores
    # the start of the range.
    self.ParseRevision(revision_url, range_start, revision_map,
                       file_to_revision_map)

  def ParseRevision(self, revision_url, githash, revision_map,
                    file_to_revision_map):

    # Retrieve data from the URL; return if it fails.
    url = revision_url % githash
    response = crash_utils.GetDataFromURL(url + '?format=json')
    if not response:
      return

    # Load the JSON object from the string (skipping the anti-XSSI prefix).
    # If it fails, terminate the function.
    json_revision = crash_utils.LoadJSON(response[5:])
    if not json_revision:
      return

    # Create a map representing this revision and get the githash from the
    # JSON object.
    revision = {}
    githash = json_revision['commit']

    # Set the author, message and URL of this CL.
    revision['author'] = json_revision['author']['name']
    revision['time'] = json_revision['author']['time']
    revision['message'] = json_revision['message']
    revision['url'] = url

    # Iterate through the changed files.
    for diff in json_revision['tree_diff']:
      file_path = diff['new_path']
      file_change_type = diff['type']

      # Normalize the file action so that it fits with svn_repository_parser.
      file_change_type = _ConvertToFileChangeType(file_change_type)

      # Add the file to the map.
      if file_path not in file_to_revision_map:
        file_to_revision_map[file_path] = []
      file_to_revision_map[file_path].append((githash, file_change_type))

    # Add this CL to the map.
    revision_map[githash] = revision
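
  # A minimal sketch of the commit JSON that ParseRevision consumes (field
  # names assumed from the accesses above; values are illustrative):
  #   {
  #     'commit': 'abc123...',
  #     'author': {'name': 'someone@chromium.org', 'time': 'Mon Jan 01 ...'},
  #     'message': 'Fix a crash in ...',
  #     'tree_diff': [{'type': 'modify', 'new_path': 'foo/bar.cc'}]
  #   }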

  def ParseLineDiff(self, path, component, file_change_type, githash):
    changed_line_numbers = []
    changed_line_contents = []
    base_url = self.component_to_url_map[component]['repository']
    backup_url = (base_url + self.url_parts_map['revision_url']) % githash

    # If the file is added, copied or renamed (not modified), treat it as if
    # it is not changed.
    if file_change_type in ('A', 'C', 'R'):
      # TODO(stgao): Maybe return whole file change for Add, Rename, and Copy?
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve the diff data from the URL; if it fails, return empty lines.
    url = (base_url + self.url_parts_map['diff_url']) % (githash, path)
    data = crash_utils.GetDataFromURL(url + '?format=text')
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Decode the returned object to line diff info.
    diff = base64.b64decode(data).splitlines()

    # Iterate through the lines in the diff. current_line is -1 whenever we
    # are not inside a diff chunk.
    current_line = -1
    for line in diff:
      # If the line starts with @@, a new chunk starts.
      if line.startswith('@@'):
        current_line = int(line.split('+')[1].split(',')[0])
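        # For example, the hunk header '@@ -10,7 +12,6 @@' splits to '12',
        # so the chunk starts at line 12 of the new version of the file.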

      # If we are in a chunk.
      elif current_line != -1:
        # If the line is either added or modified.
        if line.startswith('+'):
          changed_line_numbers.append(current_line)
          changed_line_contents.append(line[2:])

        # Do not increment the current line if the change is a delete.
        if not line.startswith('-'):
          current_line += 1

    # Return the url without '?format=text'.
    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    base_url = self.component_to_url_map[component]['repository']

    # Retrieve the blame JSON file from googlesource. If it fails, return None.
    url_part = self.url_parts_map['blame_url'] % (revision, file_path)
    blame_url = base_url + url_part
    json_string = crash_utils.GetDataFromURL(blame_url)
    if not json_string:
      return

    # Parse the JSON object from the string, which should start with the
    # ")]}'\n" anti-XSSI prefix, so start from the 6th character.
    annotation = crash_utils.LoadJSON(json_string[5:])
    if not annotation:
      return

    # Go through the regions, each of which is a run of consecutive lines
    # with the same revision and author.
    for blame_line in annotation['regions']:
      start = blame_line['start']
      count = blame_line['count']

      # For each region, check if the line we want the blame info of falls in
      # this region.
      if start <= line <= start + count - 1:
        # If we are in the right region, get the information from the line.
        revision = blame_line['commit']
        author = blame_line['author']['name']
        revision_url_parts = self.url_parts_map['revision_url'] % revision
        revision_url = base_url + revision_url_parts
        # TODO(jeun): Add a way to get content from the JSON object.
        content = None

        (revision_info, _) = self.ParseChangelog(component, revision, revision)
        message = revision_info[revision]['message']
        time = revision_info[revision]['time']
        return (content, revision, author, revision_url, message, time)

    # Return None if the line is not in any region.
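
  # A minimal sketch of the blame JSON consumed above (field names assumed
  # from the accesses; values are illustrative). Each region covers `count`
  # consecutive lines starting at `start`:
  #   {'regions': [{'start': 1, 'count': 30, 'commit': 'abc123...',
  #                 'author': {'name': 'someone@chromium.org'}}]}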