sw vba: move SwWordBasic to its own file
[LibreOffice.git] / bin / crashreportScraper.py
blob7d57ab1f747b5c67a2730b3fbd34b37de92b237e
1 #!/usr/bin/env python3
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 # Use this script to retrieve information from https://crashreport.libreoffice.org
10 # about a specific version of LibreOffice
11 # Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
13 import argparse
14 import requests
15 from bs4 import BeautifulSoup
16 import sys
17 import os
18 import math
19 from datetime import datetime
20 import urllib.parse
def convert_str_to_date(value):
    """Convert a crashreport date string (e.g. 'March 15, 2023, 10:30 a.m.')
    into a 'yy/mm/dd' string."""
    # Strip dots first so 'a.m.'/'p.m.' and 'Sept.' normalize cleanly.
    cleaned = value.replace('.', '')
    # The site spells some month names longer than strptime's %b expects.
    for long_name, short_name in (('March', 'Mar'), ('April', 'Apr'),
                                  ('June', 'Jun'), ('July', 'Jul'),
                                  ('Sept', 'Sep')):
        cleaned = cleaned.replace(long_name, short_name)
    # Drop the trailing time component, keeping only the date part.
    cleaned = ", ".join(cleaned.split(", ")[:-1])
    parsed = datetime.strptime(cleaned, '%b %d, %Y')
    return parsed.strftime('%y/%m/%d')
def parse_version_url(url):
    """Scrape the per-version crash overview page and return a dict mapping
    crash signature -> [report count, first seen yy/mm/dd, last seen yy/mm/dd].

    Exits the script on a request timeout."""
    try:
        response_text = requests.get(url, timeout=200).text
        page = BeautifulSoup(response_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    crash_reports = {}
    for row in page.find("table", {"id": "data-table"}).tbody.find_all("tr"):
        cells = row.find_all("td")
        signature = cells[0].a.text.strip()
        crash_reports[signature] = [
            int(cells[1].text.strip()),                  # total report count
            convert_str_to_date(cells[5].text.strip()),  # first report date
            convert_str_to_date(cells[6].text.strip()),  # last report date
        ]
    return crash_reports
def parse_reports_and_get_most_recent_report_from_last_page(url):
    """Scrape a crash-signature page and return a tuple
    (total report count, report ID, version, OS) describing the newest-version
    Windows report on the last results page.

    Raises requests.exceptions.Timeout on a slow request and AttributeError
    when an expected table is missing; the caller skips such signatures.
    """
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    count = 0
    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        raise

    # Sum per-OS report counts to get the overall total.
    for tr in os_tab.find_all("tr"):
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    # There are 50 reports on each page.
    # Go to the last page based on the total count to get a recent report.
    last_page = math.ceil(count / 50)
    if last_page > 1:
        url = url + "?page=" + str(last_page)
        try:
            html_text = requests.get(url, timeout=200).text
            soup = BeautifulSoup(html_text, 'html.parser')
        except requests.exceptions.Timeout:
            print("Timeout")
            raise

    def version_key(version_str):
        # Compare versions numerically: a plain string comparison would rank
        # '7.2.9.1' above '7.2.10.1'. Non-numeric components sort lowest.
        try:
            return tuple(int(part) for part in version_str.split('.'))
        except ValueError:
            return (0,)

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    version, currentVersion = "", ""
    OS, currentOS = "", ""

    for tr in reports.find_all("tr"):
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        currentVersion = td_list[2].text.strip().split(': ')[1]
        currentOS = td_list[3].text.strip()

        # Prefer the report with the newest version on Windows;
        # symbols on linux are not very informative generally.
        if currentOS == "windows" and (
                not version or version_key(currentVersion) > version_key(version)):
            version = currentVersion
            ID = currentID
            OS = currentOS

    # No Windows report found: fall back to the last report seen.
    if not version:
        version = currentVersion

    if not ID:
        ID = currentID

    if not OS:
        OS = currentOS

    return count, ID, version, OS
def parse_details_and_get_info(url, gitRepo):
    """Scrape a crash-details page and return (reason, stack, codeLine):
    the crash reason text, up to 11 stack frames with a source location,
    and the matching source lines read from *gitRepo* (one per frame,
    blank when the file is absent locally).

    Raises requests.exceptions.Timeout on a slow request."""
    try:
        page_html = requests.get(url, timeout=200).text
        soup = BeautifulSoup(page_html, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    details_rows = soup.find("div", {"id": "details"}).tbody.find_all("tr")
    reason = details_rows[8].td.text.strip()

    stack = ""
    codeLine = ""

    frame_count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for row in frames.find_all("tr"):
        cells = row.find_all("td")
        frame_source = cells[3].text.strip()
        if not frame_source or frame_count > 10:
            continue
        # Normalize Windows buildslave paths to repository-relative ones.
        frame_source = frame_source.replace("\\", "/").replace(
            "C:/cygwin64/home/buildslave/source/libo-core/", "")
        stack += frame_source + "\n"
        frame_count += 1

        location = frame_source.split(":")
        source_path = location[0]
        source_line = location[1]
        try:
            with open(os.path.join(gitRepo, source_path)) as code_file:
                for line_no, text in enumerate(code_file, start=1):
                    if line_no == int(source_line):
                        codeLine += text.strip().replace("\"", "'") + "\n"
        except FileNotFoundError:
            # Keep the per-frame alignment with a blank entry.
            codeLine += "\n"
            continue

    # Quote multiline fields so each survives as a single spreadsheet cell.
    if stack:
        stack = "\"" + stack + "\""

    if codeLine:
        codeLine = "\"" + codeLine + "\""

    return reason, stack, codeLine
if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)

    args = parser.parse_args()

    crashes = parse_version_url(
        "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    crashesInFile = []
    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    bInsertHeader = False
    if os.path.exists(fileName):
        # Resume support: remember the signatures already written so a
        # rerun appends only the missing ones.
        with open(fileName, "r") as f:
            for line in f.readlines():
                crashesInFile.append(line.split("\t")[0])
    else:
        bInsertHeader = True

    with open(fileName, "a") as f:
        if bInsertHeader:
            # Append '\n' explicitly instead of smuggling it into the field
            # list via implicit string concatenation.
            header = '\t'.join(["Name", "Count", "First report", "Last Report",
                    "ID", "Version", "Reason", "OS", "Stack", "Code Lines"])
            f.write(header + '\n')
            f.flush()

        for k, lDate in crashes.items():
            if k not in crashesInFile:
                print("Parsing " + k)
                try:
                    crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                        "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                    crashReason, crashStack, codeLine = parse_details_and_get_info(
                        "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                    # Join the fields only; joining '\n' as if it were a field
                    # used to leave a stray trailing tab on every data row.
                    line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
                            crashID, crashVersion, crashReason, crashOS, crashStack, codeLine])
                    f.write(line + '\n')
                    f.flush()
                except (requests.exceptions.Timeout, AttributeError):
                    # Skip signatures whose pages time out or lack the
                    # expected tables; they can be retried on a later run.
                    continue