Revert "workaround compile failures in libxml, with clang-17"
[LibreOffice.git] / bin / get-bugzilla-attachments-by-mimetype
blob6ce2a82d30ebe7651c631fd02ce95dbd572f23da
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# This file is part of the LibreOffice project.

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


# This digs through a pile of bugzillas and populates the cwd with a big
# collection of bug-docs in per-filetype dirs, named by bug-id, with a
# prefix to indicate which bug-tracker they came from, e.g.
#
# fdo-bugid-X.suffix
# rhbz-bugid-X.suffix
# moz-bugid-X.suffix
#
# where X is the nth attachment of that type in the bug.

# The results are stored in the current directory, categorized by the
# extension of the downloaded file. When a file already exists, it is
# assumed to have been downloaded by a previous run and to be up to date.

from __future__ import print_function
import feedparser
import base64
import datetime
import glob
import re
import os, os.path
import stat
import sys
import threading
try:
    import queue
except ImportError:
    import Queue as queue
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
try:
    import xmlrpc.client as xmlrpclib
except ImportError:
    import xmlrpclib
from xml.dom import minidom
from xml.sax.saxutils import escape
from attachment_mimetypes import mimetypes

def urlopen_retry(url):
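    """Open a URL, retrying a few times on IOError before giving up."""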
    maxretries = 3
    for i in range(maxretries + 1):
        try:
            return urlopen(url)
        except IOError as e:
            print("caught IOError: " + str(e))
            if maxretries == i:
                raise
            print("retrying...")

def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
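    """Fetch a bug page as XML (&ctype=xml), walk its attachments and save
    every one whose mimetype matches, decoding the base64 <data> payload
    into <suffix>/<prefix><bugid>-<n>.<suffix>."""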
    id = url.rsplit('=', 2)[1]
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url + "&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for attachment in dom.getElementsByTagName('attachment'):
        attachmentid += 1
        print(" mimetype is", end=' ')
        for node in attachment.childNodes:
            if node.nodeName == 'type':
                # check if attachment is deleted
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                print(node.firstChild.nodeValue, end=' ')
                if node.firstChild.nodeValue.lower() != mimetype.lower():
                    print('skipping')
                    break
            elif node.nodeName == 'data':
                # check if attachment is deleted (e.g. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
                if os.path.isfile(download):
                    print("assuming " + download + " is up to date")
                    continue

                # prevent re-downloading FDO attachments from TDF
                if prefix == "tdf" and int(id) < 88776:
                    fdodownload = download.replace("tdf", "fdo")
                    if os.path.isfile(fdodownload):
                        print("assuming FDO " + fdodownload + " is up to date")
                        continue

                # write to a temp file and rename when complete, so an
                # interrupted run does not leave a truncated document behind
                print('downloading as ' + download)
                tmpfile = download + ".tmp"
                f = open(tmpfile, 'wb')
                f.write(base64.b64decode(node.firstChild.nodeValue))
                f.close()
                os.rename(tmpfile, download)
                break

def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
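    """Novell bugzilla does not expose attachment bodies without a login,
    so scrape the "Created an attachment (id=...)" comments and fetch each
    attachment directly via attachment.cgi instead."""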
    id = url.rsplit('=', 2)[1]
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url + "&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for comment in dom.getElementsByTagName('thetext'):
        commentText = comment.firstChild.nodeValue
        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
        if not match:
            continue

        attachmentid += 1

        download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
        if os.path.isfile(download):
            print("assuming " + download + " is up to date")
            continue

        realAttachmentId = match.group(1)
        handle = urlopen_retry(novellattach + realAttachmentId)
        if not handle:
            print("attachment %s is not accessible" % realAttachmentId)
            continue
        print(" mimetype is", end=' ')

        info = handle.info()
        # Python 3's message object has get_content_type; the old Python 2
        # mimetools message only has gettype
        if hasattr(info, 'get_content_type'):
            remoteMime = info.get_content_type()
        else:
            remoteMime = info.gettype()
        print(remoteMime, end=' ')
        if remoteMime != mimetype:
            print("skipping")
            continue

        print('downloading as ' + download)
        tmpfile = download + ".tmp"
        f = open(tmpfile, 'wb')
        f.write(handle.read())
        f.close()
        os.rename(tmpfile, download)

def create_query(mimetype):
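    """Build a Bugzilla advanced-search query for bugs that have an
    attachment of the given mimetype."""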
    query = dict()
    query['query_format'] = 'advanced'
    query['field0-0-0'] = 'attachments.mimetype'
    query['type0-0-0'] = 'equals'
    query['value0-0-0'] = mimetype
    return query

def get_downloaded_files(prefix, suffix):
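    """List the files already downloaded for this tracker prefix and extension."""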
    return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))

def get_file_bz_ids(files, prefix):
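    """Derive the set of bug ids covered by the given downloaded files."""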
    return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])

def get_changed_date(files):
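    """Return the modification date of the newest of the given files."""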
    newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
    # Subtract a day to avoid timezone differences. The worst thing that
    # can happen is that we are going to process more bugs than necessary.
    return datetime.date.fromtimestamp(newest - 24 * 60 * 60)

def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
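    """Search a bugzilla over XML-RPC (Bug.search) for bugs with matching
    attachments: first an incremental pass over recently changed bugs, then
    a full pass that is skipped if every found bug is already downloaded."""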
    try:
        os.mkdir(suffix)
    except OSError:
        pass

    def process(query, full, have=[]):
        try:
            proxy = xmlrpclib.ServerProxy(rpcurl)
            result = proxy.Bug.search(query)
            bugs = result['bugs']
            print(str(len(bugs)) + ' bugs to process')

            if full:
                available = set([str(bug['id']) for bug in bugs])
                # we already have files from all available bugs
                if available.difference(set(have)) == set():
                    print("assuming all downloaded files are up to date")
                    return

            for bug in bugs:
                url = showurl + str(bug['id'])
                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
        except xmlrpclib.Fault as err:
            print("A fault occurred")
            print("Fault code: %s" % err.faultCode)
            print(err.faultString)

    query = create_query(mimetype)
    query['column_list'] = 'bug_id'

    files = get_downloaded_files(prefix, suffix)

    if files:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'days_elapsed'
        query_changed['type0-1-0'] = 'lessthaneq'
        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))

def get_through_rss_query(queryurl, mimetype, prefix, suffix):
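    """Search a bugzilla's buglist.cgi RSS feed for bugs with matching
    attachments, using the same incremental-then-full strategy as
    get_through_rpc_query."""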
    try:
        os.mkdir(suffix)
    except OSError:
        pass

    # Getting detailed bug information and downloading an attachment body is
    # not possible without logging in to Novell bugzilla;
    # get_novell_bug_via_xml is a workaround for that situation.
    get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml

    def process(query, full, have=[]):
        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
        print('url is ' + url)
        d = feedparser.parse(url)
        print(str(len(d['entries'])) + ' bugs to process')

        entries = d['entries']

        if full:
            available = set([str(entry['id'].split('=')[-1]) for entry in entries])
            # we already have files from all available bugs
            if available.difference(set(have)) == set():
                print("assuming all downloaded files are up to date")
                return

        for entry in entries:
            try:
                get_bug_function(entry['id'], mimetype, prefix, suffix)
            except KeyboardInterrupt:
                raise # Ctrl+C should work
            except:
                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))

    query = create_query(escape(mimetype.replace("+", "%2B")))
    query['ctype'] = 'rss'

    files = get_downloaded_files(prefix, suffix)

    if files:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'delta_ts'
        query_changed['type0-1-0'] = 'greaterthaneq'
        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))

# Since searching for bugs having attachments with specific mimetypes is not
# available in the Launchpad API, we iterate over all bugs of the most
# interesting source packages.
launchpad_pkgs = (
    "abiword",
    "calibre",
    "calligra",
    "gnumeric",
    "inkscape",
    "koffice",
    "libabw",
    "libcdr",
    "libe-book",
    "libetonyek",
    "libfreehand",
    "libmspub",
    "libmwaw",
    "liborcus",
    "libpagemaker",
    "libreoffice",
    "libvisio",
    "libwpd",
    "libwpg",
    "libwps",
    "openoffice.org",
    "python-uniconvertor",
    "scribus",
    "sk1",
    "unoconv",
)

def get_launchpad_bugs(prefix):
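    """Walk the Ubuntu bug tasks of the packages in launchpad_pkgs via
    launchpadlib and download every attachment with a known mimetype."""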
    # the launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad

    launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
    ubuntu = launchpad.distributions["ubuntu"]

    for pkg in launchpad_pkgs:
        srcpkg = ubuntu.getSourcePackage(name=pkg)
        pkgbugs = srcpkg.searchTasks(status=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"])

        for bugtask in pkgbugs:
            bug = bugtask.bug
            id = str(bug.id)
            print("parsing " + id + " status: " + bugtask.status + " title: " + bug.title[:50])
            attachmentid = 0
            for attachment in bug.attachments:
                attachmentid += 1
                handle = attachment.data.open()
                if handle.content_type not in mimetypes:
                    # not a mimetype we are interested in, skip
                    continue

                suffix = mimetypes[handle.content_type]
                if not os.path.isdir(suffix):
                    try:
                        os.mkdir(suffix)
                    except OSError:
                        pass

                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix

                if os.path.isfile(download):
                    print("assuming " + id + " is up to date")
                    break

                print('mimetype is ' + handle.content_type + ' downloading as ' + download)

                tmpfile = download + ".tmp"
                f = open(tmpfile, "wb")
                f.write(handle.read())
                f.close()
                os.rename(tmpfile, download)

rss_bugzillas = (
    # note: currently abisource has an expired TLS cert
    # ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), # added for abiword
    ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
    ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
    ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
    ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
    ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
    ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
    # It seems something has changed and it is no longer possible to
    # download any files from there.
    # NOTE: I am leaving it in the list, commented out, just so someone
    # does not add it back immediately .-)
    # ( 'novell', 'https://bugzilla.novell.com/buglist.cgi' ),
    # note: running this script against bz.apache.org apparently causes one's IP
    # to be banned or something; you won't get new files in any case...
    # ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
    ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
)

redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in to get details of bugs, such as
# attachment bodies. As a dirty workaround, we parse comments containing
# "Created an attachment (id=xxxxxx)" and download the attachments manually.
# python-bugzilla claims to support Novell bugzilla login, but it is not
# working right now, and the Novell bugzilla login system is a nightmare.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='

class manage_threads(threading.Thread):
    def run(self):
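        """Worker thread: pull (uri, mimetype, prefix, extension) jobs off
        the shared queue and run get_through_rss_query for each, until the
        queue runs empty."""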
        #print(threading.current_thread().get_ident())
        while True:
            # Try to receive a job from the queue
            try:
                # Get job from queue
                # Use job parameters to call our query
                # Then let the queue know we are done with this job
                (uri, mimetype, prefix, extension) = jobs.get(True, 6)
                try:
                    get_through_rss_query(uri, mimetype, prefix, extension)
                finally:
                    jobs.task_done()
            except KeyboardInterrupt:
                raise # Ctrl+C should work
            except queue.Empty:
                break

def generate_multi_threading():
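    """Start the worker threads, queue one job per (bugzilla, mimetype)
    pair and wait for the queue to drain."""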
    # Initialize threads
    for i in range(max_threads):
        manage_threads().start()

    for (prefix, uri) in rss_bugzillas:

        # Create a job for every mimetype for a bugzilla
        for (mimetype, extension) in mimetypes.items():
            # It seems that bugzilla has problems returning that many results
            # (10000 results is probably a limit set somewhere), so we would
            # never get through the complete list anyway; skip this query.
            if mimetype == 'text/html' and prefix == 'moz':
                continue

            jobs.put([uri, mimetype, prefix, extension], block=True)
            print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)

        # Continue when all mimetypes are done for a bugzilla
        print("STARTED all bugtracker " + prefix)

    jobs.join()

# Number of threads to create (1 = no multi-threading; default = 20)
max_threads = int(os.environ.get('PARALLELISM', 20))
jobs = queue.Queue()
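
# Example invocation, overriding the default worker count via the
# PARALLELISM environment variable read above:
#   PARALLELISM=4 ./get-bugzilla-attachments-by-mimetype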

generate_multi_threading()

for (mimetype, extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)

try:
    get_launchpad_bugs("lp")
except ImportError:
    print("launchpadlib unavailable, skipping Ubuntu tracker")

# vim:set shiftwidth=4 softtabstop=4 expandtab: