Revert "workaround compile failures in libxml, with clang-17"
[LibreOffice.git] / bin / get-bugzilla-attachments-by-mimetype
blob6ce2a82d30ebe7651c631fd02ce95dbd572f23da
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# This file is part of the LibreOffice project.

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


# This digs through a pile of bugzillas and populates the cwd with a big
# collection of bug-docs in per-filetype dirs, named by bug-id, with a
# prefix to indicate which bug-tracker they came from, e.g.
#
# fdo-bugid-X.suffix
# rhbz-bugid-X.suffix
# moz-bugid-X.suffix
#
# where X is the nth attachment of that type in the bug.

# The results are stored in the current directory, categorized by the
# extension of the downloaded file. When a file already exists, it is
# assumed to have been downloaded by a previous run and to be up to date.

from __future__ import print_function
import feedparser
import base64
import datetime
import glob
import re
import os, os.path
import stat
import sys
import threading
try:
    import queue
except ImportError:
    import Queue as queue
try:
    from urllib.request import urlopen
except ImportError:
    from urllib import urlopen
try:
    import xmlrpc.client as xmlrpclib
except ImportError:
    import xmlrpclib
from xml.dom import minidom
from xml.sax.saxutils import escape
from attachment_mimetypes import mimetypes

def urlopen_retry(url):
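    """Open a URL, retrying a few times on IOError before giving up."""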
    maxretries = 3
    for i in range(maxretries + 1):
        try:
            return urlopen(url)
        except IOError as e:
            print("caught IOError: " + str(e))
            if maxretries == i:
                raise
            print("retrying...")

def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
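    """Fetch a bug page as XML (&ctype=xml), walk its attachments and save
    every one whose mimetype matches, decoding the base64 <data> payload
    into <suffix>/<prefix><bugid>-<n>.<suffix>."""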
    id = url.rsplit('=', 2)[1]
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url + "&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for attachment in dom.getElementsByTagName('attachment'):
        attachmentid += 1
        print(" mimetype is", end=' ')
        for node in attachment.childNodes:
            if node.nodeName == 'type':
                # check if attachment is deleted
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                print(node.firstChild.nodeValue, end=' ')
                if node.firstChild.nodeValue.lower() != mimetype.lower():
                    print('skipping')
                    break
            elif node.nodeName == 'data':
                # check if attachment is deleted (e.g. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
                if not node.firstChild:
                    print('deleted attachment, skipping')
                    continue

                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
                if os.path.isfile(download):
                    print("assuming " + download + " is up to date")
                    continue

                # prevent re-downloading FDO attachments from TDF
                if prefix == "tdf" and int(id) < 88776:
                    fdodownload = download.replace("tdf", "fdo")
                    if os.path.isfile(fdodownload):
                        print("assuming FDO " + fdodownload + " is up to date")
                        continue

                # write to a temp file and rename when complete, so an
                # interrupted run does not leave a truncated document behind
                print('downloading as ' + download)
                tmpfile = download + ".tmp"
                f = open(tmpfile, 'wb')
                f.write(base64.b64decode(node.firstChild.nodeValue))
                f.close()
                os.rename(tmpfile, download)
                break

def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
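    """Novell bugzilla does not expose attachment bodies without a login,
    so scrape the "Created an attachment (id=...)" comments and fetch each
    attachment directly via attachment.cgi instead."""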
    id = url.rsplit('=', 2)[1]
    print("id is " + prefix + id + " " + suffix)
    print("parsing " + id)
    sock = urlopen_retry(url + "&ctype=xml")
    dom = minidom.parse(sock)
    sock.close()
    attachmentid = 0
    for comment in dom.getElementsByTagName('thetext'):
        commentText = comment.firstChild.nodeValue
        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
        if not match:
            continue

        attachmentid += 1

        download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
        if os.path.isfile(download):
            print("assuming " + download + " is up to date")
            continue

        realAttachmentId = match.group(1)
        handle = urlopen_retry(novellattach + realAttachmentId)
        if not handle:
            print("attachment %s is not accessible" % realAttachmentId)
            continue
        print(" mimetype is", end=' ')

        info = handle.info()
        # Python 3's message object has get_content_type; the old Python 2
        # mimetools message only has gettype
        if hasattr(info, 'get_content_type'):
            remoteMime = info.get_content_type()
        else:
            remoteMime = info.gettype()
        print(remoteMime, end=' ')
        if remoteMime != mimetype:
            print("skipping")
            continue

        print('downloading as ' + download)
        tmpfile = download + ".tmp"
        f = open(tmpfile, 'wb')
        f.write(handle.read())
        f.close()
        os.rename(tmpfile, download)

def create_query(mimetype):
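    """Build a Bugzilla advanced-search query for bugs that have an
    attachment of the given mimetype."""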
    query = dict()
    query['query_format'] = 'advanced'
    query['field0-0-0'] = 'attachments.mimetype'
    query['type0-0-0'] = 'equals'
    query['value0-0-0'] = mimetype
    return query

def get_downloaded_files(prefix, suffix):
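    """List the files already downloaded for this tracker prefix and extension."""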
    return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))

def get_file_bz_ids(files, prefix):
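    """Derive the set of bug ids covered by the given downloaded files."""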
    return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])

def get_changed_date(files):
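    """Return the modification date of the newest of the given files."""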
    newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
    # Subtract a day to avoid timezone differences. The worst thing that
    # can happen is that we are going to process more bugs than necessary.
    return datetime.date.fromtimestamp(newest - 24 * 60 * 60)

def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
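    """Search a bugzilla over XML-RPC (Bug.search) for bugs with matching
    attachments: first an incremental pass over recently changed bugs, then
    a full pass that is skipped if every found bug is already downloaded."""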
    try:
        os.mkdir(suffix)
    except OSError:
        pass

    def process(query, full, have=[]):
        try:
            proxy = xmlrpclib.ServerProxy(rpcurl)
            result = proxy.Bug.search(query)
            bugs = result['bugs']
            print(str(len(bugs)) + ' bugs to process')

            if full:
                available = set([str(bug['id']) for bug in bugs])
                # we already have files from all available bugs
                if available.difference(set(have)) == set():
                    print("assuming all downloaded files are up to date")
                    return

            for bug in bugs:
                url = showurl + str(bug['id'])
                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
        except xmlrpclib.Fault as err:
            print("A fault occurred")
            print("Fault code: %s" % err.faultCode)
            print(err.faultString)

    query = create_query(mimetype)
    query['column_list'] = 'bug_id'

    files = get_downloaded_files(prefix, suffix)

    if files:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'days_elapsed'
        query_changed['type0-1-0'] = 'lessthaneq'
        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))

def get_through_rss_query(queryurl, mimetype, prefix, suffix):
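    """Search a bugzilla's buglist.cgi RSS feed for bugs with matching
    attachments, using the same incremental-then-full strategy as
    get_through_rpc_query."""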
    try:
        os.mkdir(suffix)
    except OSError:
        pass

    # Getting detailed bug information and downloading an attachment body is
    # not possible without logging in to Novell bugzilla;
    # get_novell_bug_via_xml is a workaround for that situation.
    get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml

    def process(query, full, have=[]):
        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
        print('url is ' + url)
        d = feedparser.parse(url)
        print(str(len(d['entries'])) + ' bugs to process')

        entries = d['entries']

        if full:
            available = set([str(entry['id'].split('=')[-1]) for entry in entries])
            # we already have files from all available bugs
            if available.difference(set(have)) == set():
                print("assuming all downloaded files are up to date")
                return

        for entry in entries:
            try:
                get_bug_function(entry['id'], mimetype, prefix, suffix)
            except KeyboardInterrupt:
                raise # Ctrl+C should work
            except:
                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))

    query = create_query(escape(mimetype.replace("+", "%2B")))
    query['ctype'] = 'rss'

    files = get_downloaded_files(prefix, suffix)

    if files:
        print('looking for updated bugs having %s attachment(s)' % mimetype)
        query_changed = query.copy()
        query_changed['field0-1-0'] = 'delta_ts'
        query_changed['type0-1-0'] = 'greaterthaneq'
        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
        process(query_changed, False)

    print('looking for all bugs having %s attachment(s)' % mimetype)
    process(query, True, get_file_bz_ids(files, prefix))

# Since searching for bugs having attachments with specific mimetypes is not
# available in the Launchpad API, we iterate over all bugs of the most
# interesting source packages.
launchpad_pkgs = (
    "abiword",
    "calibre",
    "calligra",
    "gnumeric",
    "inkscape",
    "koffice",
    "libabw",
    "libcdr",
    "libe-book",
    "libetonyek",
    "libfreehand",
    "libmspub",
    "libmwaw",
    "liborcus",
    "libpagemaker",
    "libreoffice",
    "libvisio",
    "libwpd",
    "libwpg",
    "libwps",
    "openoffice.org",
    "python-uniconvertor",
    "scribus",
    "sk1",
    "unoconv",
)

def get_launchpad_bugs(prefix):
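    """Walk the Ubuntu bug tasks of the packages in launchpad_pkgs via
    launchpadlib and download every attachment with a known mimetype."""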
    # the launchpadlib python module is required to download launchpad attachments
    from launchpadlib.launchpad import Launchpad

    launchpad = Launchpad.login_anonymously("attachmentdownload", "production")
    ubuntu = launchpad.distributions["ubuntu"]

    for pkg in launchpad_pkgs:
        srcpkg = ubuntu.getSourcePackage(name=pkg)
        pkgbugs = srcpkg.searchTasks(status=["New", "Fix Committed", "Invalid", "Won't Fix", "Confirmed", "Triaged", "In Progress", "Incomplete", "Incomplete (with response)", "Incomplete (without response)", "Fix Released", "Opinion", "Expired"])

        for bugtask in pkgbugs:
            bug = bugtask.bug
            id = str(bug.id)
            print("parsing " + id + " status: " + bugtask.status + " title: " + bug.title[:50])
            attachmentid = 0
            for attachment in bug.attachments:
                attachmentid += 1
                handle = attachment.data.open()
                if handle.content_type not in mimetypes:
                    # not a mimetype we are interested in, skip
                    continue

                suffix = mimetypes[handle.content_type]
                if not os.path.isdir(suffix):
                    try:
                        os.mkdir(suffix)
                    except OSError:
                        pass

                download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix

                if os.path.isfile(download):
                    print("assuming " + id + " is up to date")
                    break

                print('mimetype is ' + handle.content_type + ' downloading as ' + download)

                tmpfile = download + ".tmp"
                f = open(tmpfile, "wb")
                f.write(handle.read())
                f.close()
                os.rename(tmpfile, download)

rss_bugzillas = (
    # note: currently abisource has an expired TLS cert
    # ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), # added for abiword
    ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
    ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
    ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
    ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
    ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
    ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
    # It seems something has changed and it is no longer possible to
    # download any files from there.
    # NOTE: I am leaving it in the list, commented out, just so someone
    # does not add it back immediately .-)
    # ( 'novell', 'https://bugzilla.novell.com/buglist.cgi' ),
    # note: running this script against bz.apache.org apparently causes one's IP
    # to be banned or something; you won't get new files in any case...
    # ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
    ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
)

redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='

# Novell Bugzilla requires users to log in to get details of bugs, such as
# attachment bodies. As a dirty workaround, we parse comments containing
# "Created an attachment (id=xxxxxx)" and download the attachments manually.
# python-bugzilla claims to support Novell bugzilla login, but it is not
# working right now, and the Novell bugzilla login system is a nightmare.
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='

class manage_threads(threading.Thread):
    def run(self):
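        """Worker thread: pull (uri, mimetype, prefix, extension) jobs off
        the shared queue and run get_through_rss_query for each, until the
        queue runs empty."""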
        #print(threading.current_thread().get_ident())
        while True:
            # Try to receive a job from the queue
            try:
                # Get job from queue
                # Use job parameters to call our query
                # Then let the queue know we are done with this job
                (uri, mimetype, prefix, extension) = jobs.get(True, 6)
                try:
                    get_through_rss_query(uri, mimetype, prefix, extension)
                finally:
                    jobs.task_done()
            except KeyboardInterrupt:
                raise # Ctrl+C should work
            except queue.Empty:
                break

def generate_multi_threading():
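    """Start the worker threads, queue one job per (bugzilla, mimetype)
    pair and wait for the queue to drain."""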
    # Initialize threads
    for i in range(max_threads):
        manage_threads().start()

    for (prefix, uri) in rss_bugzillas:

        # Create a job for every mimetype for a bugzilla
        for (mimetype, extension) in mimetypes.items():
            # It seems that bugzilla has problems returning that many results
            # (10000 results is probably a limit set somewhere), so we would
            # never get through the complete list anyway; skip this query.
            if mimetype == 'text/html' and prefix == 'moz':
                continue

            jobs.put([uri, mimetype, prefix, extension], block=True)
            print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)

        # Continue when all mimetypes are done for a bugzilla
        print("STARTED all bugtracker " + prefix)

    jobs.join()

# Number of threads to create (1 = no multi-threading; default = 20)
max_threads = int(os.environ.get('PARALLELISM', 20))
jobs = queue.Queue()
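
# Example invocation, overriding the default worker count via the
# PARALLELISM environment variable read above:
#   PARALLELISM=4 ./get-bugzilla-attachments-by-mimetype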

generate_multi_threading()

for (mimetype, extension) in mimetypes.items():
    get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)

try:
    get_launchpad_bugs("lp")
except ImportError:
    print("launchpadlib unavailable, skipping Ubuntu tracker")

# vim:set shiftwidth=4 softtabstop=4 expandtab: