# This file is part of the LibreOffice project.
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
19 from bs4
import BeautifulSoup
20 from attachment_mimetypes
import mimetypes
21 from concurrent
.futures
import ThreadPoolExecutor
, as_completed
22 from requests
.adapters
import HTTPAdapter
23 from requests
.packages
.urllib3
.util
.retry
import Retry
# Forums to scan, keyed by a short language/site tag. Each value is a list
# laid out as [url, doLogin, startIndex] (see the legend at the end of the
# table). Tags beginning with "mso" are fetched through the attachment.php
# query; all other tags use download/file.php.
forums = {
    # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
    'en': ["https://forum.openoffice.org/en/forum", False, 0],
    'es': ["https://forum.openoffice.org/es/forum", False, 0],
    'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
    'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
    'it': ["https://forum.openoffice.org/it/forum", False, 0],
    'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
    'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
    'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
    'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
    'tr': ["https://forum.libreoffice.org.tr", False, 0],
    'de': ["https://www.openoffice-forum.de", False, 0],
    'de2': ["https://www.libreoffice-forum.de", False, 0],
    'de3': ["https://de.openoffice.info", False, 0],

    'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
    'mso-en': ["https://www.msofficeforums.com", True, 0],
    'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
    'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
    # lang : [url, doLogin, startIndex]
}
def get_attachment_query(lang):
    """Return the URL path prefix used to request an attachment by id.

    Forum tags that start with ``"mso"`` use the ``attachment.php`` query;
    every other tag uses ``download/file.php``. The numeric attachment id
    is expected to be appended to the returned string.

    :param lang: forum tag (a key of the forum table), as a string.
    :returns: path-and-query prefix beginning with ``/``.
    """
    if lang.startswith("mso"):
        return "/attachment.php?attachmentid="
    return "/download/file.php?id="
def login(session, url, configFile):
    """Log in to the forum at *url* with credentials read from *configFile*.

    The ``username`` and ``password`` options are read from the ``[login]``
    section of the config file and posted to ``<url>/login.php?do=login``.
    The response body is then searched, paragraph by paragraph, for a known
    "logged in" message (English or German).

    :param session: HTTP session object providing ``post(url, data)``.
    :param url: base URL of the forum (no trailing slash).
    :param configFile: path of an INI file with a ``[login]`` section.
    :returns: True when a success message is found in the response; False
        on a non-200 status or when no success message is present.
    """
    config = configparser.ConfigParser()
    config.read(configFile)
    username = config.get('login', 'username')
    password = config.get('login', 'password')

    # The form sends the MD5 hex digest of the password in both digest
    # fields; the plain-text password field is deliberately left empty.
    md5_password = hashlib.md5(password.encode()).hexdigest()

    resp = session.post(url + '/login.php?do=login', {
        'vb_login_username': username,
        'vb_login_password': '',
        'vb_login_md5password': md5_password,
        'vb_login_md5password_utf': md5_password,
        # NOTE(review): the numbering embedded in the recovered listing
        # skips three lines right before 'securitytoken'; additional form
        # fields may belong here -- confirm against the upstream script.
        'securitytoken': 'guest',
    })

    if resp.status_code != 200:
        return False

    success_messages = (
        'Thank you for logging in',
        'Danke für Ihre Anmeldung',
    )
    soup = BeautifulSoup(resp.content, 'lxml')
    for p in soup.find_all("p"):
        text = p.get_text()
        if any(message in text for message in success_messages):
            return True

    return False
def get_attachments_from_url(lang, config, pathes):
    """Download the recognised attachments of one forum into the output dir.

    Attachment ids are probed one by one, starting at the configured start
    index or, when ``<outdir>/<lang>.index`` exists, at the id after the one
    stored there. Each id is first checked with a HEAD request; an HTML
    content type is treated as "no such attachment". Anything else is
    downloaded, its MIME type is sniffed from the content, and if that type
    is a key of ``mimetypes`` the file is stored as
    ``<outdir>/<suffix>/forum-<lang>-<id>.<suffix>``.

    :param lang: forum tag, used for the query style and in file names.
    :param config: ``[url, doLogin, startIndex]`` entry for this forum.
    :param pathes: object with ``outdir`` and ``config`` attributes (the
        parsed command-line arguments).
    :returns: None.
    """
    # NOTE(review): the assignments of url / doLogin, `header`, the
    # invalidCount bookkeeping, the temp-file write, the directory creation
    # and the index write were missing from the recovered listing and are
    # restored here from how the names are used -- confirm against upstream.
    url = config[0]
    doLogin = config[1]
    startIndex = config[2]

    print("Checking " + url)

    # Keep the index and resume from there
    indexFile = os.path.join(pathes.outdir, lang + ".index")
    if os.path.isfile(indexFile):
        with open(indexFile) as f:
            startIndex = int(f.readline().rstrip()) + 1

    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    if doLogin and not login(session, url, pathes.config):
        print("Can't log in to " + url)
        return

    invalidCount = 0
    for i in range(startIndex, 999999):
        fileUrl = url + get_attachment_query(lang) + str(i)

        h = session.head(fileUrl)
        header = h.headers
        # A response without a Content-Type header yields None here; fall
        # back to '' so the substring test below cannot raise TypeError.
        content_type = header.get('content-type') or ''
        if "html" in content_type:
            # Let's assume this is an invalid file link
            invalidCount += 1

            # Let's assume, if we get 200 invalid files, that there are no more files
            if invalidCount == 200:
                print("No more attachments found in " + url)
                break
            continue

        # A real file was found: restart the run of invalid links.
        invalidCount = 0

        r = session.get(fileUrl, allow_redirects=True)
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(r.content)
            # The file is re-opened by name below, so buffered bytes must
            # reach the disk first.
            tmp.flush()
            mimetype = magic.from_file(tmp.name, mime=True)
            if mimetype in mimetypes:
                suffix = mimetypes[mimetype]
                suffixDir = os.path.join(pathes.outdir, suffix)
                os.makedirs(suffixDir, exist_ok=True)

                download = os.path.join(
                    suffixDir,
                    "forum-" + lang + '-' + str(i) + '.' + suffix)

                print("Downloading as " + download)
                shutil.copy(tmp.name, download)

        # Save the index of the last processed attachment so that a later
        # run resumes right after it.
        with open(indexFile, 'w') as f:
            f.write(str(i))
if __name__ == '__main__':
    # Command line: --outdir <existing directory> --config <INI file>.
    parser = argparse.ArgumentParser()

    parser.add_argument('--outdir', action='store', dest="outdir", required=True)
    parser.add_argument('--config', action="store", dest="config", required=True)

    pathes = parser.parse_args()

    # Refuse to start with unusable paths instead of failing later inside
    # the worker threads.
    if not os.path.isdir(pathes.outdir):
        print("Outdir folder doesn't exists")
        raise SystemExit(1)
    elif not os.path.isfile(pathes.config):
        print("Config file doesn't exists")
        raise SystemExit(1)

    processes = []
    # by default, 10 at a time seems to work fine
    with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor:
        # One task per forum; each task walks that forum's attachment ids.
        for lang, config in forums.items():
            processes.append(executor.submit(get_attachments_from_url, lang, config, pathes))

    for task in as_completed(processes):
        # result() re-raises any exception raised inside the worker.
        result = task.result()