sw vba: move SwWordBasic to its own file
[LibreOffice.git] / bin / crashreportScraper.py
blob7d57ab1f747b5c67a2730b3fbd34b37de92b237e
1 #!/usr/bin/env python3
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 # Use this script to retrieve information from https://crashreport.libreoffice.org
10 # about a specific version of LibreOffice
11 # Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
13 import argparse
14 import requests
15 from bs4 import BeautifulSoup
16 import sys
17 import os
18 import math
19 from datetime import datetime
20 import urllib.parse
def convert_str_to_date(value):
    """Convert a crashreport date string (e.g. 'March 15, 2023, 10:30 a.m.')
    into a 'yy/mm/dd' string."""
    # Strip dots first so 'a.m.'/'p.m.' and 'Sept.' normalize cleanly.
    cleaned = value.replace('.', '')
    # The site spells some month names longer than strptime's %b expects.
    for long_name, short_name in (('March', 'Mar'), ('April', 'Apr'),
                                  ('June', 'Jun'), ('July', 'Jul'),
                                  ('Sept', 'Sep')):
        cleaned = cleaned.replace(long_name, short_name)
    # Drop the trailing time component, keeping only the date part.
    cleaned = ", ".join(cleaned.split(", ")[:-1])
    parsed = datetime.strptime(cleaned, '%b %d, %Y')
    return parsed.strftime('%y/%m/%d')
def parse_version_url(url):
    """Scrape the per-version crash overview page and return a dict mapping
    crash signature -> [report count, first seen yy/mm/dd, last seen yy/mm/dd].

    Exits the script on a request timeout."""
    try:
        response_text = requests.get(url, timeout=200).text
        page = BeautifulSoup(response_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    crash_reports = {}
    for row in page.find("table", {"id": "data-table"}).tbody.find_all("tr"):
        cells = row.find_all("td")
        signature = cells[0].a.text.strip()
        crash_reports[signature] = [
            int(cells[1].text.strip()),                  # total report count
            convert_str_to_date(cells[5].text.strip()),  # first report date
            convert_str_to_date(cells[6].text.strip()),  # last report date
        ]
    return crash_reports
def parse_reports_and_get_most_recent_report_from_last_page(url):
    """Scrape a crash-signature page and return a tuple
    (total report count, report ID, version, OS) describing the newest-version
    Windows report on the last results page.

    Raises requests.exceptions.Timeout on a slow request and AttributeError
    when an expected table is missing; the caller skips such signatures.
    """
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    count = 0
    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        raise

    # Sum per-OS report counts to get the overall total.
    for tr in os_tab.find_all("tr"):
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    # There are 50 reports on each page.
    # Go to the last page based on the total count to get a recent report.
    last_page = math.ceil(count / 50)
    if last_page > 1:
        url = url + "?page=" + str(last_page)
        try:
            html_text = requests.get(url, timeout=200).text
            soup = BeautifulSoup(html_text, 'html.parser')
        except requests.exceptions.Timeout:
            print("Timeout")
            raise

    def version_key(version_str):
        # Compare versions numerically: a plain string comparison would rank
        # '7.2.9.1' above '7.2.10.1'. Non-numeric components sort lowest.
        try:
            return tuple(int(part) for part in version_str.split('.'))
        except ValueError:
            return (0,)

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    version, currentVersion = "", ""
    OS, currentOS = "", ""

    for tr in reports.find_all("tr"):
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        currentVersion = td_list[2].text.strip().split(': ')[1]
        currentOS = td_list[3].text.strip()

        # Prefer the report with the newest version on Windows;
        # symbols on linux are not very informative generally.
        if currentOS == "windows" and (
                not version or version_key(currentVersion) > version_key(version)):
            version = currentVersion
            ID = currentID
            OS = currentOS

    # No Windows report found: fall back to the last report seen.
    if not version:
        version = currentVersion

    if not ID:
        ID = currentID

    if not OS:
        OS = currentOS

    return count, ID, version, OS
def parse_details_and_get_info(url, gitRepo):
    """Scrape a crash-details page and return (reason, stack, codeLine):
    the crash reason text, up to 11 stack frames with a source location,
    and the matching source lines read from *gitRepo* (one per frame,
    blank when the file is absent locally).

    Raises requests.exceptions.Timeout on a slow request."""
    try:
        page_html = requests.get(url, timeout=200).text
        soup = BeautifulSoup(page_html, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    details_rows = soup.find("div", {"id": "details"}).tbody.find_all("tr")
    reason = details_rows[8].td.text.strip()

    stack = ""
    codeLine = ""

    frame_count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for row in frames.find_all("tr"):
        cells = row.find_all("td")
        frame_source = cells[3].text.strip()
        if not frame_source or frame_count > 10:
            continue
        # Normalize Windows buildslave paths to repository-relative ones.
        frame_source = frame_source.replace("\\", "/").replace(
            "C:/cygwin64/home/buildslave/source/libo-core/", "")
        stack += frame_source + "\n"
        frame_count += 1

        location = frame_source.split(":")
        source_path = location[0]
        source_line = location[1]
        try:
            with open(os.path.join(gitRepo, source_path)) as code_file:
                for line_no, text in enumerate(code_file, start=1):
                    if line_no == int(source_line):
                        codeLine += text.strip().replace("\"", "'") + "\n"
        except FileNotFoundError:
            # Keep the per-frame alignment with a blank entry.
            codeLine += "\n"
            continue

    # Quote multiline fields so each survives as a single spreadsheet cell.
    if stack:
        stack = "\"" + stack + "\""

    if codeLine:
        codeLine = "\"" + codeLine + "\""

    return reason, stack, codeLine
if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)

    args = parser.parse_args()

    crashes = parse_version_url(
        "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    crashesInFile = []
    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    bInsertHeader = False
    if os.path.exists(fileName):
        # Resume support: remember the signatures already written so a
        # rerun appends only the missing ones.
        with open(fileName, "r") as f:
            for line in f.readlines():
                crashesInFile.append(line.split("\t")[0])
    else:
        bInsertHeader = True

    with open(fileName, "a") as f:
        if bInsertHeader:
            # Append '\n' explicitly instead of smuggling it into the field
            # list via implicit string concatenation.
            header = '\t'.join(["Name", "Count", "First report", "Last Report",
                    "ID", "Version", "Reason", "OS", "Stack", "Code Lines"])
            f.write(header + '\n')
            f.flush()

        for k, lDate in crashes.items():
            if k not in crashesInFile:
                print("Parsing " + k)
                try:
                    crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                        "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                    crashReason, crashStack, codeLine = parse_details_and_get_info(
                        "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                    # Join the fields only; joining '\n' as if it were a field
                    # used to leave a stray trailing tab on every data row.
                    line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
                            crashID, crashVersion, crashReason, crashOS, crashStack, codeLine])
                    f.write(line + '\n')
                    f.flush()
                except (requests.exceptions.Timeout, AttributeError):
                    # Skip signatures whose pages time out or lack the
                    # expected tables; they can be retried on a later run.
                    continue