python3: upgrade to release 3.8.13
[LibreOffice.git] / bin / get-forum-attachments.py
blob92d30ccb183ae19dcf9d2383fc41291b3a958931
1 #!/usr/bin/env python3
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 import argparse
10 import configparser
11 import hashlib
12 import magic
13 import os
14 import requests
15 import shutil
16 import sys
17 import tempfile
19 from bs4 import BeautifulSoup
20 from attachment_mimetypes import mimetypes
21 from concurrent.futures import ThreadPoolExecutor, as_completed
22 from requests.adapters import HTTPAdapter
23 from requests.packages.urllib3.util.retry import Retry
# Forums to crawl, keyed by a short language/site code.
#
# lang : [url, doLogin, startIndex]
#   url        -- base URL of the forum; the attachment query string is
#                 appended to it by get_attachments_from_url()
#   doLogin    -- True when login() must succeed before attachments are probed
#   startIndex -- first attachment id to probe when no saved index file exists
forums = {
    # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
    'en': ["https://forum.openoffice.org/en/forum", False, 0],
    'es': ["https://forum.openoffice.org/es/forum", False, 0],
    'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
    'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
    'it': ["https://forum.openoffice.org/it/forum", False, 0],
    'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
    'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
    'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
    'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
    'tr': ["https://forum.libreoffice.org.tr", False, 0],
    'de': ["https://www.openoffice-forum.de", False, 0],
    'de2': ["https://www.libreoffice-forum.de", False, 0],
    'de3': ["https://de.openoffice.info", False, 0],
    # Others
    'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
    'mso-en': ["https://www.msofficeforums.com", True, 0],
    'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
    'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
}
def get_attachment_query(lang):
    """Return the URL path and query prefix used to fetch an attachment by id.

    Forum codes starting with "mso" use the ``attachment.php`` endpoint; every
    other code uses ``download/file.php``. The caller appends the numeric
    attachment id to the returned string.
    """
    is_mso_forum = lang.startswith("mso")
    return "/attachment.php?attachmentid=" if is_mso_forum else "/download/file.php?id="
def login(session, url, configFile):
    """Log in to a forum through its ``login.php`` form.

    Args:
        session: Object whose ``post(url, data)`` method sends the login form
            (a ``requests.Session`` in this script). The caller reuses the
            same session for the attachment requests afterwards.
        url: Base URL of the forum; ``/login.php?do=login`` is appended.
        configFile: Path to an INI file with a ``[login]`` section that
            provides ``username`` and ``password``.

    Returns:
        True when the response page contains one of the known confirmation
        messages. False when the credentials cannot be read, the response
        status is not 200, or no confirmation message is found.
    """
    config = configparser.ConfigParser()
    try:
        config.read(configFile)
        username = config.get('login', 'username')
        password = config.get('login', 'password')
    except configparser.Error as err:
        # A missing section/option or a malformed file is a failed login,
        # not a reason to crash the worker thread.
        print("Can't read login credentials from " + str(configFile) + ": " + str(err))
        return False

    # The form sends the MD5 hex digest of the password and leaves the
    # clear-text password field empty; both digest fields get the same value.
    # NOTE(review): the vb_login_* field names look like a vBulletin login
    # form -- confirm against the target forums.
    md5Password = hashlib.md5(password.encode()).hexdigest()
    resp = session.post(url + '/login.php?do=login', {
        'vb_login_username': username,
        'vb_login_password': '',
        'vb_login_md5password': md5Password,
        'vb_login_md5password_utf': md5Password,
        'cookieuser': 1,
        'do': 'login',
        's': '',
        'securitytoken': 'guest'
    })

    if resp.status_code != 200:
        return False

    # A successful login is recognised by its confirmation text (English or
    # German) appearing in any paragraph of the returned page.
    successMarkers = ('Thank you for logging in', 'Danke für Ihre Anmeldung')
    soup = BeautifulSoup(resp.content, 'lxml')
    for p in soup.find_all("p"):
        text = p.get_text()
        if any(marker in text for marker in successMarkers):
            return True

    return False
def get_attachments_from_url(lang, config, pathes):
    """Probe one forum's attachment ids in sequence and save recognised files.

    Args:
        lang: Forum code; selects the attachment query string and is used in
            the index-file name and in the downloaded file names.
        config: Sequence ``[url, doLogin, startIndex]`` describing the forum.
        pathes: Object with an ``outdir`` attribute (output directory) and a
            ``config`` attribute (login config file passed to ``login()``).

    Attachment ids are probed from the start index upwards with a HEAD
    request. An HTML response counts as "no such attachment"; after 200 of
    these in a row the forum is considered exhausted. Anything else is
    downloaded, its MIME type is sniffed, and files whose type is listed in
    ``mimetypes`` are stored as
    ``<outdir>/<suffix>/forum-<lang>-<id>.<suffix>``. The id of the last
    stored file is written to ``<outdir>/<lang>.index`` so a later run
    resumes after it.

    Returns:
        None. Progress is reported on stdout.
    """
    url, doLogin, startIndex = config[0], config[1], config[2]

    print("Checking " + url)

    # Keep the index and resume from there
    indexFile = os.path.join(pathes.outdir, lang + ".index")
    if os.path.isfile(indexFile):
        with open(indexFile) as f:
            savedIndex = f.readline().strip()
        try:
            startIndex = int(savedIndex) + 1
        except ValueError:
            # An empty or corrupt index file (e.g. from an interrupted run)
            # falls back to the configured start index instead of crashing.
            print("Ignoring invalid index file " + indexFile)

    # The context manager closes the session's connections on every exit path.
    with requests.Session() as session:
        retry = Retry(connect=3, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        if doLogin:
            if not login(session, url, pathes.config):
                print("Can't log in to " + url)
                return

        # Loop-invariant part of every attachment URL.
        queryUrl = url + get_attachment_query(lang)

        invalidCount = 0
        for i in range(startIndex, 999999):
            fileUrl = queryUrl + str(i)

            h = session.head(fileUrl)
            # The Content-Type header may be absent; treat that as "not HTML"
            # and let the MIME sniffing below decide, rather than failing on
            # a membership test against None.
            content_type = h.headers.get('content-type') or ''
            if "html" in content_type:
                # Let's assume this is an invalid file link
                invalidCount += 1

                # Let's assume, if we get 200 invalid files, that there are no more files
                if invalidCount == 200:
                    print("No more attachments found in " + url)
                    break
                continue

            invalidCount = 0

            r = session.get(fileUrl, allow_redirects=True)
            with tempfile.NamedTemporaryFile() as tmp:
                tmp.write(r.content)
                # The file is re-read by name below, so buffered data must be
                # on disk first; otherwise the sniffed and copied content can
                # be empty or truncated.
                tmp.flush()

                mimetype = magic.from_file(tmp.name, mime=True)
                if mimetype not in mimetypes:
                    continue

                suffix = mimetypes[mimetype]
                suffixDir = os.path.join(pathes.outdir, suffix)
                # Tolerates an existing directory (also when another worker
                # creates it concurrently) but still reports real errors.
                os.makedirs(suffixDir, exist_ok=True)

                download = os.path.join(suffixDir,
                                        "forum-" + lang + '-' + str(i) + '.' + suffix)

                print("Downloading as " + download)
                shutil.copy(tmp.name, download)

                # Save the index
                # NOTE(review): placed in the successful-download branch;
                # confirm against the original file's indentation.
                with open(indexFile, 'w') as f:
                    f.write(str(i))
if __name__ == '__main__':
    # Command line: --outdir is an existing directory that receives the
    # downloads and index files; --config is the INI file used for logins.
    parser = argparse.ArgumentParser()

    parser.add_argument('--outdir', action='store', dest="outdir", required=True)
    parser.add_argument('--config', action="store", dest="config", required=True)

    pathes = parser.parse_args()

    if not os.path.isdir(pathes.outdir):
        print("Outdir folder doesn't exists")
        sys.exit(1)
    elif not os.path.isfile(pathes.config):
        print("Config file doesn't exists")
        sys.exit(1)

    failedForums = []
    # by default, 10 at a time seems to work fine
    with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor:
        # Remember which forum each future belongs to so a failure can be
        # attributed to it.
        processes = {
            executor.submit(get_attachments_from_url, lang, config, pathes): lang
            for lang, config in forums.items()
        }

        for task in as_completed(processes):
            try:
                result = task.result()
            except Exception as err:
                # One failing forum must not stop result collection for the
                # others; report it and carry on.
                failedForums.append(processes[task])
                print("Forum '" + processes[task] + "' failed: " + repr(err))
                continue
            if result:
                print(result)

    # Keep a non-zero exit status when any worker raised.
    if failedForums:
        sys.exit(1)