tools/findit/svn_repository_parser.py

   1 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 import xml.dom.minidom as minidom
   6 from xml.parsers.expat import ExpatError
   7
   8 import crash_utils
   9 from repository_parser_interface import ParserInterface
  10
  11
  12 # This number is 6 because each linediff page in src.chromium.org should
  13 # contain the following tables: table with revision number, table with actual
  14 # diff, table with dropdown menu, table with legend, a border table and a table
  15 # containing page information.
  16 NUM_TABLES_IN_LINEDIFF_PAGE = 6
  17 # Each of the linediff info should contain 3 tds, one for changed line number,
  18 # and two for line contents before/after.
  19 NUM_TDS_IN_LINEDIFF_PAGE = 3
  20
  21
  22 class SVNParser(ParserInterface):
  23   """Parser for SVN repository using chromium.org, for components in config.
  24
  25   Attributes:
  26     component_to_urls_map: A map from component to its urls, where the
  27                            urls are for changelog, revision, line diff and
  27                            annotation (blame).
  28   """
  29
  30   def __init__(self, url_map):
  31     self.component_to_urls_map = url_map
  32
  33   def ParseChangelog(self, component, range_start, range_end):
  34     file_to_revision_map = {}
  35     revision_map = {}
  36
  37     # Check if the current component is supported by reading the components
  38     # parsed from config file. If it is not, fail.
  39
  40     url_map = self.component_to_urls_map.get(component)
  41     if not url_map:
  42       return (revision_map, file_to_revision_map)
  43
  44     # Retrieve data from the url, return empty map if fails.
  45     revision_range_str = '%s:%s' % (range_start, range_end)
  46     url = url_map['changelog_url'] % revision_range_str
  47     response = crash_utils.GetDataFromURL(url)
  48     if not response:
  49       return (revision_map, file_to_revision_map)
  50
  51     # Parse xml out of the returned string. If it fails, return empty map.
  52     try:
  53       xml_revisions = minidom.parseString(response)
  54     except ExpatError:
  55       return (revision_map, file_to_revision_map)
  56
  57     # Iterate through the returned XML object.
  58     revisions = xml_revisions.getElementsByTagName('logentry')
  59     for revision in revisions:
  60       # Create new revision object for each of the revision.
  61       revision_object = {}
  62
  63       # Set author of the CL.
  64       revision_object['author'] = revision.getElementsByTagName(
  65           'author')[0].firstChild.nodeValue
  66
  67       # Get the revision number from xml.
  68       revision_number = int(revision.getAttribute('revision'))
  69
  70       # Iterate through the changed paths in the CL.
  71       paths = revision.getElementsByTagName('paths')
  72       if paths:
  73         for changed_path in paths[0].getElementsByTagName('path'):
  74           # Get path and file change type from the xml.
  75           file_path = changed_path.firstChild.nodeValue
  76           file_change_type = changed_path.getAttribute('action')
  77
  78           if file_path.startswith('/trunk/'):
  79             file_path = file_path[len('/trunk/'):]
  80
  81           # Add file to the map.
  82           if file_path not in file_to_revision_map:
  83             file_to_revision_map[file_path] = []
  84           file_to_revision_map[file_path].append(
  85               (revision_number, file_change_type))
  86
  87       # Set commit message of the CL.
  88       revision_object['message'] = revision.getElementsByTagName('msg')[
  89           0].firstChild.nodeValue
  90
  91       # Set url of this CL.
  92       revision_url = url_map['revision_url'] % revision_number
  93       revision_object['url'] = revision_url
  94
  95       # Add this CL to the revision map.
  96       revision_map[revision_number] = revision_object
  97
  98     return (revision_map, file_to_revision_map)
  99
 100   def ParseLineDiff(self, path, component, file_change_type, revision_number):
 101     changed_line_numbers = []
 102     changed_line_contents = []
 103
 104     url_map = self.component_to_urls_map.get(component)
 105     if not url_map:
 106       return (None, None, None)
 107
 108     # If the file is added (not modified), treat it as if it is not changed.
 109     backup_url = url_map['revision_url'] % revision_number
 110     if file_change_type == 'A':
 111       return (backup_url, changed_line_numbers, changed_line_contents)
 112
 113     # Retrieve data from the url. If no data is retrieved, return empty lists.
 114     url = url_map['diff_url'] % (path, revision_number - 1,
 115                                  revision_number, revision_number)
 116     data = crash_utils.GetDataFromURL(url)
 117     if not data:
 118       return (backup_url, changed_line_numbers, changed_line_contents)
 119
 120     line_diff_html = minidom.parseString(data)
 121     tables = line_diff_html.getElementsByTagName('table')
 122     # If there are not NUM_TABLES tables in the html page, there should be an
 123     # error in the html page.
 124     if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
 125       return (backup_url, changed_line_numbers, changed_line_contents)
 126
 127     # Diff content is in the second table. Each line of the diff content
 128     # is in <tr>.
 129     trs = tables[1].getElementsByTagName('tr')
 130     prefix_len = len('vc_diff_')
 131
 132     # Filter trs so that it only contains diff chunk with contents.
 133     filtered_trs = []
 134     for tr in trs:
 135       tr_class = tr.getAttribute('class')
 136
 137       # Check for the classes of the <tr>s.
 138       if tr_class:
 139         tr_class = tr_class[prefix_len:]
 140
 141         # Do not have to add header.
 142         if tr_class == 'header' or tr_class == 'chunk_header':
 143           continue
 144
 145         # If the class of tr is empty, this page does not have any change.
 146         if tr_class == 'empty':
 147           return (backup_url, changed_line_numbers, changed_line_contents)
 148
 149       filtered_trs.append(tr)
 150
 151     # Iterate through filtered trs, and grab line diff information.
 152     for tr in filtered_trs:
 153       tds = tr.getElementsByTagName('td')
 154
 155       # If there aren't 3 tds, this line does should not contain line diff.
 156       if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
 157         continue
 158
 159       # If line number information is not in hyperlink, ignore this line.
 160       try:
 161         line_num = tds[0].getElementsByTagName('a')[0].firstChild.nodeValue
 162         left_diff_type = tds[1].getAttribute('class')[prefix_len:]
 163         right_diff_type = tds[2].getAttribute('class')[prefix_len:]
 164       except IndexError:
 165         continue
 166
 167       # Treat the line as modified only if both left and right diff has type
 168       # changed or both have different change type, and if the change is not
 169       # deletion.
 170       if (left_diff_type != right_diff_type) or (
 171           left_diff_type == 'change' and right_diff_type == 'change'):
 172
 173         # Check if the line content is not empty.
 174         try:
 175           new_line = tds[2].firstChild.nodeValue
 176         except AttributeError:
 177           new_line = ''
 178
 179         if not (left_diff_type == 'remove' and right_diff_type == 'empty'):
 180           changed_line_numbers.append(int(line_num))
 181           changed_line_contents.append(new_line.strip())
 182
 183     return (url, changed_line_numbers, changed_line_contents)
 184
 185   def ParseBlameInfo(self, component, file_path, line, revision):
 186     url_map = self.component_to_urls_map.get(component)
 187     if not url_map:
 188       return None
 189
 190     # Retrieve blame data from url, return None if fails.
 191     url = url_map['blame_url'] % (file_path, revision, revision)
 192     data = crash_utils.GetDataFromURL(url)
 193     if not data:
 194       return None
 195
 196     blame_html = minidom.parseString(data)
 197
 198     title = blame_html.getElementsByTagName('title')
 199     # If the returned html page is an exception page, return None.
 200     if title[0].firstChild.nodeValue == 'ViewVC Exception':
 201       return None
 202
 203     # Each of the blame result is in <tr>.
 204     blame_results = blame_html.getElementsByTagName('tr')
 205     try:
 206       blame_result = blame_results[line]
 207     except IndexError:
 208       return None
 209
 210     # There must be 4 <td> for each <tr>. If not, this page is wrong.
 211     tds = blame_result.getElementsByTagName('td')
 212     if len(tds) != 4:
 213       return None
 214
 215     # The third <td> has the line content, separated by <span>s. Combine
 216     # those to get a string of changed line. If it has nothing, the line
 217     # is empty.
 218     line_content = ''
 219     if tds[3].hasChildNodes():
 220       contents = tds[3].childNodes
 221
 222       for content in contents:
 223         # Nodetype 3 means it is text node.
 224         if content.nodeType == minidom.Node.TEXT_NODE:
 225           line_content += content.nodeValue
 226         else:
 227           line_content += content.firstChild.nodeValue
 228
 229       line_content = line_content.strip()
 230
 231     # If the current line has the same author/revision as the previous lines,
 232     # the result is not shown. Propagate up until we find the line with info.
 233     while not tds[1].firstChild:
 234       line -= 1
 235       blame_result = blame_results[line]
 236       tds = blame_result.getElementsByTagName('td')
 237     author = tds[1].firstChild.nodeValue
 238
 239     # Revision can either be in hyperlink or plain text.
 240     try:
 241       revision = tds[2].getElementsByTagName('a')[0].firstChild.nodeValue
 242     except IndexError:
 243       revision = tds[2].firstChild.nodeValue
 244
 245     (revision_info, _) = self.ParseChangelog(component, revision, revision)
 246     message = revision_info[int(revision)]['message']
 247
 248     # Return the parsed information.
 249     revision_url = url_map['revision_url'] % int(revision)
 250     return (line_content, revision, author, revision_url, message)