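"""twatbot.py - scrape, archive and serve tweets ("twats") and mastodon toots.

Accounts and #hashtag searches listed in a watchlist are polled through
nitter instances (or directly through mastodon for @user@host entries),
stored as per-user json files, optionally mirrored (images, videos,
cards, ...), and served as a searchable html timeline by a small
built-in http server.
"""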
from twat import get_twats, mirror_twat, get_effective_twat_id, unshorten_urls, fetch_nitter_picture
from mastodon import get_toots, fetch_mastodon_picture
from rocksock import RocksockProxyFromURL
from nitter import set_invalid_nitter, get_nitter_instance
import time
import json
import argparse
import os.path
import os
import random
import sys
import urllib
from HTMLParser import HTMLParser
from http2 import RsHttp
import threading
from soup_parser import soupify
import hashlib
import paths
from utils import safe_write, retry_makedirs
import socket, errno
import misc
import re
import collections
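# note: this is Python 2 code (HTMLParser module import, xrange,
# sorted(cmp=...), implicit integer division); run it with a python2
# interpreter.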
title = "twatscrape"
tweets = dict()
tweet_cache = dict()
disabled_users = dict()
watchlist = []
new_accounts = []
all_tweets = []
site_dirs = [
	"/css",
]
nitters = {}
def replace_url_in_twat(twat, args=None):

	user = twat['user'].lower()

	soup = soupify(twat["text"])

	# linked files
	for a in soup.body.find_all('a'):
		## replace /search?q= links
		if a.attrs['href'].startswith('/search'):
			twat['text'] = twat['text'].replace('/search?q=', '/index.html?search=')

		## @username : replace when local
		elif 'title' in a.attrs:
			username = a.attrs['href'].split('/')[1]
			at_link = user_at_link(username.lower())
			if username.find('@') == -1:
				rebuild = '<b>%s<a href="https://%s/%s">%s</a></b>' % (at_link, random.choice(args.instances), username, username)
			else:
				_, u, h = username.split('@')
				rebuild = '<b>%s<a href="https://%s/@%s">%s</a></b>' % (at_link, h, u, username)
			# this fails when nonascii chars are present in a['title']
			# XXX: would be nice to remove that 'title' attr, which would solve the issue
			try: twat['text'] = twat['text'].replace(str(a), rebuild)
			except Exception as e:
				print('replace_url_in_twats: %s' % e)

	return twat['text']
def build_searchbox(variables):
	link = make_index_link(variables, exclude=['search', 'find', 'user'])

	if 'search' in variables and len(variables['search']):
		fill = urllib.unquote_plus(variables['search'])
		search_value = fill
	else:
		fill = 'foo "bar baz" -quux'
		search_value = ''

	user_sel = ['<center><table><tr>']
	i = 0
	for user in sorted(watchlist, key=str.lower):
		if user[0] == '#': continue
		selected = '' if (not 'user' in variables or not user in variables['user']) else ' checked'
		user_sel.append("""<td width="33%%"><label class="hide_until_hover"><input id="u_%s" class="hide_until_hover" type="checkbox" value="%s"%s>%s</label></td>""" % (user, user, selected, user))
		i = i + 1
		if i >= 3:
			user_sel.append('</tr><tr>')
			i = 0
	user_sel.append('</tr></table></center>')

	ret = [
		'<div class="searchbox">',
		' <form name="search" id="searchbox" onsubmit="searchbar_check()" method="get" action=\'%s\'>' % link,
		' <input class="searchbar hide_until_hover" name="search" type="text" value="%s" placeholder=\'%s\'/>' % (search_value, fill),
		' <input class="submit hide_until_hover" type="submit" value="&#8629">',
		' <div class="userlist">%s</div>' % '\n'.join(user_sel),
		' <input name="user" id="user" type="hidden" value="">',
		' </form><br />',
		'</div>'
	]

	if len(search_value) or 'user' in variables:
		ret.insert(7, '<span class="gotoindex hide_until_hover"><a href="%s">%s</a></span>' % (link, link))

	return '\n'.join(ret)
def build_iconbar(twat, variables, quoted):
	bar = '\n<div class="iconbar">'

	## anchor / next
	if not quoted:
		il = make_index_link(variables, ['page'])
		if not '?' in il: il += '?'
		else: il += '&'
		id = get_effective_twat_id(twat)
		il2 = il + 'find_next=%s' % id
		bar += '<a href="%s" name="%s">%s</a>' % (il2, id, '&#9194;')
		il2 = il + 'find=%s' % id
		bar += '<a href="%s" name="%s">%s</a>' % (il2, id, '&#9875;')
		il2 = il + 'find_prev=%s' % id
		bar += '<a href="%s" name="%s">%s</a>' % (il2, id, '&#9193;')

	## twitter
	#bar += '&nbsp;<a target="_blank" href="https://api.twitter.com/1.1/statuses/retweets/%d.json" title="retweet">%s</a>' % (int(twat['id']), '&#128038;')
	## wayback machine
	bar += '&nbsp;<a target="_blank" href="https://web.archive.org/save/https://twitter.com/%s/status/%s" title="wayback">%s</a>' % (twat['user'], twat['id'], '&#9852;')
	## json file
	bar += '&nbsp;<a target="_blank" href="%s">%s</a>' % (paths.get_user_json(twat['owner']), '&#128190;')

	bar += '</div>\n'
	return bar
class MLStripper(HTMLParser):
	def __init__(self):
		self.reset()
		self.fed = []
	def handle_data(self, d):
		self.fed.append(d)
	def get_data(self):
		return ''.join(self.fed)

def strip_tags(html):
	s = MLStripper()
	s.feed(html)
	return s.get_data()
def file_exists(fn):
	return os.path.exists(fn)

def in_twatlist(user, twat):
	eid = get_effective_twat_id(twat)
	if user in tweet_cache and eid in tweet_cache[user]: return True
	else: return False

def add_twatlist(user, twat, insert_pos):
	if not user in tweets: tweets[user] = list()
	if not user in tweet_cache: tweet_cache[user] = dict()
	tweets[user].insert(insert_pos, twat)
	tweet_cache[user][get_effective_twat_id(twat)] = True

def write_user_tweets(user):
	fn = paths.get_user_json(user)
	content = json.dumps(tweets[user], sort_keys=True, indent=4)
	safe_write(fn, content)

def remove_known_retweets(lst):
	nl = []
	for x in lst:
		if "rid" in x and x["user"] in tweet_cache and x["id"] in tweet_cache[x["user"]]: pass
		else: nl.append(x)
	return nl

def format_time(stmp):
	return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(stmp))

def add_owner_to_list(user, lst):
	nl = []
	for x in lst:
		y = x.copy()
		y["owner"] = user
		nl.append(y)
	return nl
def js_searchbox():
	return ("<script>"
		"function searchbar_check() {"
		" f = document.getElementById('searchbox');"
		" a = f.elements;"
		" var l = a.length;"
		" var s = '';"
		" for(var i = 0; i < l; i++) {"
		"  if(a[i].id.substring(0,2) === 'u_') {"
		"   if(a[i].checked) s += ',' + a[i].value;"
		"  }"
		" }"
		" var u = document.getElementById('user');"
		" u.value = s.substring(1);"
		"}"
		"</script>")
def html_header():
	global all_tweets
	header = """<!DOCTYPE html><html><head>
<title>%s</title><meta charset="utf-8"/>""" % args.title

	## check user box
	header += js_searchbox()

	## autorefresh the page ?
	if args.refresh: header += """<meta http-equiv="refresh" content="%d" >""" % args.refresh
	header += """<link rel='stylesheet' type='text/css' href='css/%s.css'></head><body>""" % args.theme
	if len(all_tweets): header += '<a class="export" href=/export download="twats.json">export %d tweets</a>' % len(all_tweets)

	return header
def user_at_link(user):
	if user in watchlist:
		return '<a href="?user=%s">@</a>' % user

	if user.find('@') == -1:
		return '<a href="https://%s/%s">@</a>' % (random.choice(args.instances), user)
	else:
		_, u, h = user.split('@')
		return '<a href="https://%s/@%s">@</a>' % (h, u)

def replace_twat_text(text):
	try: text = text.decode('utf8').replace('\n', '<br>') #replace( u'\xa0', ' ').replace(u'\0xe2', ' ')
	except: return text
	return text
def htmlize_twat(twat, variables, quoted=False):
	tw = '<div class="twat-container">'
	tweet_pic = None
	retweet_pic = None

	if not 'rid' in twat:
		retweet_str = ""
		if paths.has_profile_pic(twat['owner']): tweet_pic = paths.get_profile_pic(twat['owner'])

	else:
		if paths.has_profile_pic(twat['user']): tweet_pic = paths.get_profile_pic(twat['user'])
		else: tweet_pic = "data:image/gif;base64,R0lGODdhAQABAIAAAP///////ywAAAAAAQABAAACAkQBADs="

		if paths.has_profile_pic(twat['owner']): retweet_pic = paths.get_profile_pic(twat['owner'])

		if twat['user'].find('@') == -1:
			retweet_str = " (RT %s<a target='_blank' href='https://%s/%s/status/%s'>%s</a>)" % \
				(user_at_link(twat['user']), random.choice(args.instances), twat['user'], twat['id'], twat['user'])
		else:
			_, u, h = twat['user'].split('@')
			retweet_str = " (RT %s<a target='_blank' href='https://%s/@%s/%s'>%s</a>)" % \
				(user_at_link(twat['user']), h, u, twat['id'], twat['user'].lstrip('@'))

	if tweet_pic: tw += '<div class="profile_picture"><img width="100%%" height="100%%" src="%s"></div>' % tweet_pic
	if retweet_pic: tw += '<div class="profile_picture_retweet"><img width="100%%" height="100%%" src="%s"></div>' % retweet_pic

	user_str = user_at_link(twat["owner"].lower())
	user_str += "<a target='_blank' href='https://%s/%s/status/%s'>%s</a>%s" % \
		(random.choice(args.instances), twat["owner"], get_effective_twat_id(twat), twat["owner"], retweet_str)

	tw += '\n<div class="twat-title">'

	## add icon bar
	if args.iconbar: tw += build_iconbar(twat, variables, quoted)

	time_str = 'unknown' if twat["time"] == 0 else format_time(twat["time"])
	tw += '%s&nbsp;-&nbsp;%s' % (user_str, time_str)

	tw += '\n</div>\n'

	## replace urls in twats
	twat['text'] = replace_url_in_twat(twat, args=args)
	## strip html ?
	if args.nohtml: twat['text'] = strip_tags(twat['text'])

	tw += '<p class="twat-text">%s</p>\n' % (replace_twat_text(twat['text']))

	if 'curl' in twat and args.iframe > 0:
		user = twat['user'].lower()
		ifu = paths.get_user(user) + '/%s-%s' % (twat['id'], "card.html")
		if (not 'c' in args.mirror) or (not file_exists(ifu)):
			ifu = twat['curl']
		tw += '<span class="twat-iframe"><iframe src="%s"></iframe></span>\n' % ifu

	if 'images' in twat:
		tw += '<p class="twat-image">'
		if len(twat['images']) > 1: wdth = (100 / len(twat['images'])) - 1
		else: wdth = 100

		for i in twat['images']:
			if args.images <= 0:
				tw += '<a href="%s">%s</a>' % (i, i)
			else:
				img_path = paths.get_user(twat['user']) + "/%s-%s" % (twat['id'], i.split('/')[-1])
				if not file_exists(img_path): img_path = i
				span_or_div = "span"
				img_class = "img"
				div_class = ""
				if args.upstream_img:
					href = i
					title = "view remote image"
				elif 'video' in twat or 'ext_tw_video_thumb' in i:
					mp4_path = paths.get_user(twat['user']) + '/%s.mp4' % str(twat['id'])
					if os.path.exists(mp4_path):
						href = mp4_path
						title = "view local video"
					else:
						href = "https://twitter.com/i/status/" + twat['id']
						title = "view remote video"
					img_class = ""
					div_class = "video-thumbnail"
					span_or_div = "div"
				else:
					href = img_path
					title = "view local image"
				tw += '<a href="%s" title="%s"><%s class="%s"><img class="%s" src="%s" width="%d%%"></%s></a>' % (href, title, span_or_div, div_class, img_class, img_path, wdth, span_or_div)

		tw += '</p>\n'

	if 'quote' in twat:
		pseudo_twat = {
			'user': twat['quote']['user'],
			'owner': twat['quote']['user'],
			'id': twat['quote']['id'],
			'text': twat['quote']['text'],
			'time': 0
		}
		tw += htmlize_twat(pseudo_twat, variables, quoted=True)

	tw += '</div>\n'

	return tw
def retweet_time(twat):
	if 'rid_time' in twat: return twat['rid_time']
	if 'fetched' in twat: return twat['fetched']
	return twat['time']

def sort_tweets_func(x, y):
	# somewhere in 2017, the numbering scheme of twitter changed
	# that's a pity because the twat id is the most accurate
	# sorting indicator, so we use it on all tweets > 2018
	timestamp_2018 = 1514764800  # 01/01/2018
	if x['time'] > timestamp_2018 and y['time'] > timestamp_2018:
		try:
			t1 = int(get_effective_twat_id(x))
			t2 = int(get_effective_twat_id(y))
		except:
			return -1
		if t1 == t2: return 0
		elif t1 > t2: return 1
		else: return -1
	else:
		t1 = retweet_time(x) if 'rid' in x else x["time"]
		t2 = retweet_time(y) if 'rid' in y else y["time"]
		if t1 == t2: return 0
		elif t1 > t2: return 1
		else: return -1

def sort_tweets(twts):
	return sorted(twts, cmp=sort_tweets_func, reverse=True)
def get_all_tweets(remove_dupes=False):
	global blacklist, whitelist
	all_tweets = []
	use_whitelist = True if len(whitelist) else False
	for user in tweets:
		if user in blacklist: continue
		if use_whitelist and not user in whitelist: continue
		all_tweets.extend(add_owner_to_list(user, tweets[user]))

	all_tweets = sort_tweets(all_tweets)
	if remove_dupes: all_tweets = remove_known_retweets(all_tweets)
	return all_tweets
def find_tweet_page(all_tweets, twid, offset):
	# returns (page number, effective id of the tweet to anchor to)
	for i in xrange(0, len(all_tweets)):
		if get_effective_twat_id(all_tweets[i]) == twid:
			if i + offset >= 0 and i + offset < len(all_tweets):
				i += offset
			return int(i / args.tpp), get_effective_twat_id(all_tweets[i])
	# not found: fall back to the first page, keep the requested id as anchor
	return 0, twid
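# search strings follow the syntax hinted at in build_searchbox(), e.g.
#   parse_search('foo "bar baz" -quux')
# yields three terms: match 'foo', match the phrase 'bar baz', exclude 'quux'.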
def parse_search(str):
	class SearchTerm():
		def __init__(self, str):
			self.exclude = (str[0] == '-')
			self.term = str if not self.exclude else str[1:]
		def match(self, text):
			return (self.exclude and not self.term in text) or (not self.exclude and self.term in text)
	terms = []
	s = ''
	in_quotes = False
	for i in xrange(len(str)):
		handled = False
		if str[i] in ' "':
			if str[i] == ' ':
				if not in_quotes:
					if len(s): terms.append(SearchTerm(s))
					s = ''
					handled = True
			if str[i] == '"':
				if in_quotes:
					if len(s): terms.append(SearchTerm(s))
					s = ''
					handled = True
					in_quotes = False
				else:
					in_quotes = True
					handled = True
		if not handled:
			s += str[i]
	if len(s): terms.append(SearchTerm(s))
	return terms
def find_tweets(all_tweets, search=None, users=None):
	terms = parse_search(urllib.unquote_plus(search).lower()) if search else []
	match_tweets = []
	for i in xrange(0, len(all_tweets)):
		match = True
		for t in terms:
			if not t.match(all_tweets[i]['text'].lower()):
				match = False
				break
		if match and users and not all_tweets[i]['owner'].lower() in users:
			match = False
		if match: match_tweets.append(all_tweets[i])
	return match_tweets
# return tuple of html, redirect_url
# only one of both is set to something other than ""
def render_site(variables={}):
	global all_tweets
	html = []

	page = 0 if not 'page' in variables else int(variables['page'])
	if 'find' in variables:
		find_offset = 0
		find = variables['find']
		variables.pop('find', None)
	elif 'find_next' in variables:
		find_offset = -1
		find = variables['find_next']
		variables.pop('find_next', None)
	elif 'find_prev' in variables:
		find_offset = 1
		find = variables['find_prev']
		variables.pop('find_prev', None)
	else:
		find_offset = None
		find = ''
	search = None if not 'search' in variables else variables['search']
	users = None if not 'user' in variables else urllib.unquote_plus(variables['user']).lower().split(',')

	# don't remove duplicates if users is specified: this could remove retweets
	remove_dupes = True if not users else False

	all_tweets = get_all_tweets(remove_dupes)
	if users or search: all_tweets = find_tweets(all_tweets, search=search, users=users)
	if find != '':
		variables['page'], tid = find_tweet_page(all_tweets, find, find_offset)
		return "", make_index_link(variables) + '#%s' % tid

	pagetotalf = len(all_tweets) / float(args.tpp)
	pagetotal = int(pagetotalf)
	if pagetotalf > pagetotal: pagetotal += 1

	max = (page + 1) * args.tpp
	if max > len(all_tweets): max = len(all_tweets)

	for i in xrange(page * args.tpp, max):
		twat = all_tweets[i]
		html.append(htmlize_twat(twat, variables))

	if len(html):
		return write_html(html=html, variables=variables, pages=pagetotal), ""

	return "", ""
def render_empty(variables={}):
	html = ['<div class="error_message"><p class="twatter">&#129296;</p><p class="error_text">There is nothing here..<p><p><a href="/">Back to index</a></p></div>']
	return write_html(html=html, variables=variables)

def make_index_link(variables, exclude=None):
	exclude = exclude if exclude else []
	s = '/index.html'
	t = [ '%s=%s' % (x, str(variables[x])) for x in variables if not x in exclude ]
	if len(t): return '%s?%s' % (s, '&'.join(t))
	return s
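# page_selection() returns the page numbers worth linking to: the first and
# last `margin` pages plus a window around the current page. for example,
# curr=7, total=30, margin=5 yields [0..12] and [25..29];
# page_selection_html() renders '...' wherever the sequence jumps.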
def page_selection(curr, total, margin=5):
	set = []
	for i in xrange(0, margin):
		set.append(i)
	for i in xrange(curr - margin, curr):
		if not i in set: set.append(i)
	for i in xrange(curr, curr + margin + 1):
		if not i in set: set.append(i)
	for i in xrange(total - margin, total):
		if not i in set: set.append(i)
	i = 0
	while i < len(set):
		if set[i] >= total or set[i] < 0: set.pop(i)
		else: i = i + 1
	return set

def page_selection_html(variables, page, pages):
	div = []
	sel = page_selection(page, pages)
	for i in xrange(len(sel)):
		if i > 0 and sel[i] - sel[i-1] != 1:
			div.append('...')
		if sel[i] == page:
			div.append(str(page))
		else:
			variables['page'] = sel[i]
			indx = make_index_link(variables)
			div.append('<a class="menu" href="%s">%d</a>' % (indx, sel[i]))
	variables['page'] = page
	return div
def write_html(html, variables=None, pages=0):
	ht = [ html_header() ]
	page = int(variables['page']) if 'page' in variables else 0

	div = page_selection_html(variables, page, pages)
	if len(div):
		ht.append('\n<div class="menu">%s</div>\n' % '&nbsp;'.join(div))

	ht.extend(html)

	if len(div):
		ht.append('\n<div class="menu">%s</div>\n' % '&nbsp;'.join(div))

	ht.append(build_searchbox(variables))
	ht.append("\n</body></html>")

	return "\n".join(ht).encode('utf-8')
def fetch_more_tweets_callback(item, twats):
	# iterate over last 20 tweets only as this is called once per page with the full list
	twats_per_page = 20
	if len(twats) < twats_per_page: twats_per_page = len(twats)
	for i in xrange(1, twats_per_page + 1):
		twat = twats[i * -1]
		if 'pinned' in twat and twat['pinned'] == 1: continue
		user = twat['user'] if item[0] == '#' else item.lower()
		if in_twatlist(user, twat): return False
	return True
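# fetch new twats/toots for `item` (a username, a '#keyword' search, or a
# @user@host mastodon account), insert them into the in-memory cache, write
# the per-user json, optionally mirror media and profile pictures, and return
# the (http, host) pair so the connection can be reused by the caller.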
def scrape(item, http, host, search, user_agent):
	global nitters
	global mastodon_rshttp
	item = item.lower()

	if item in new_accounts:
		count = args.count
		checkfn = None
		new_accounts.remove(item)
	else:
		checkfn = fetch_more_tweets_callback
		count = args.count if item[0] == '#' else -1

	if item.count('@') < 2:
		fetch_profile_picture = fetch_nitter_picture
		twats, nitters, host, http, page = get_twats(item, proxies=args.proxy, count=count, http=http, checkfn=checkfn, nitters=nitters, host=host, search=search, user_agent=user_agent, blacklist=blacklist, whitelist=whitelist)
	else:
		fetch_profile_picture = fetch_mastodon_picture
		twats, http = get_toots(item, proxies=args.proxy, count=count, http=http, checkfn=checkfn, user_agent=user_agent, blacklist=args.blacklist, whitelist=args.whitelist)
		mastodon_rshttp[host] = http
		# get_toots() does not report a page count; assume a single page so the
		# progress line below has something to print
		page = 1

	insert_pos = dict()
	new = False
	user = None if item[0] == '#' else item
	insert_pos_total = 0
	elapsed_time = time.time()
	for t in twats:
		if search: user = t['user'].lower()
		if not user in insert_pos: insert_pos[user] = 0

		if not in_twatlist(user, t):
			new = True
			if args.unshorten: t = unshorten_urls(t, proxies=args.proxy, shorteners=shorteners)
			add_twatlist(user, t, insert_pos[user])
			insert_pos[user] += 1
			insert_pos_total += 1
			if 'quote_tweet' in t:
				if '@' in t['quote_tweet']['user']:
					_, foo, bar = t['quote_tweet']['user'].split('@')
					http = None if not bar in mastodon_rshttp else mastodon_rshttp[bar]

				if not os.path.isdir(paths.get_user(t['quote_tweet']['user'])): retry_makedirs(paths.get_user(t['quote_tweet']['user']))
				if args.fetch_profile_picture: fetch_profile_picture(t['quote_tweet']['user'], args.proxy, twhttp=http, nitters=nitters, user_agent=user_agent)
			if 'user' in t:
				if '@' in t['user']:
					_, foo, bar = t['user'].split('@')
					http = None if not bar in mastodon_rshttp else mastodon_rshttp[bar]

				if not os.path.isdir(paths.get_user(t['user'])): retry_makedirs(paths.get_user(t['user']))
				if args.fetch_profile_picture: fetch_profile_picture(t['user'], args.proxy, twhttp=http, nitters=nitters, user_agent=user_agent)
			if args.mirror: mirror_twat(t, args=args)
			sys.stdout.write('\r[%s] %s: extracting from %d page(s): +%d twat(s)' % (misc.get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), item, page, insert_pos_total))
			sys.stdout.flush()

	if new:
		if search:
			for user in insert_pos.keys(): write_user_tweets(user)
		else:
			write_user_tweets(item)
	elapsed_time = (time.time() - elapsed_time)
	sys.stdout.write('done (%s)\n' % misc.get_timestamp("%H:%M:%S", elapsed_time))
	sys.stdout.flush()
	return http, host
def resume_retry_mirroring(done):
	start_time = time.time()
	print('resume_retry_mirroring: thread started')
	infoticks = time.time()
	for user in watchlist:
		for t in tweets[user]:
			if done.is_set(): break
			elif (time.time() - infoticks) > 300:
				print('resume_retry_mirroring: thread is still running')
				infoticks = time.time()
			mirror_twat(t, args=args)
	elapsed_time = time.time() - start_time
	print('resume_retry_mirroring: end of thread, duration: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
	done.set()

def load_user_json(user):
	tweet_cache[user] = dict()
	try:
		tweets[user] = json.loads(open(paths.get_user_json(user), 'r').read())
		for i in xrange(len(tweets[user])):
			tweet_cache[user][get_effective_twat_id(tweets[user][i])] = True
	except:
		tweets[user] = []

def json_loads():
	for user in watchlist:
		if not user in tweets:
			load_user_json(user)
def serve_loop(hs, done):
	client_threads = []
	while not done.is_set():
		c = hs.wait_client()

		evt_done = threading.Event()
		cthread = threading.Thread(target=httpsrv_client_thread, args=(c, evt_done))
		cthread.daemon = True
		cthread.start()

		ctrm = []
		for ct, ct_done in client_threads:
			if ct_done.is_set():
				ctrm.append((ct, ct_done))
				ct.join()

		if len(ctrm):
			client_threads = [ x for x in client_threads if not x in ctrm ]

		client_threads.append((cthread, evt_done))
def forbidden_page():
	return (
		'<!DOCTYPE html>\n'
		' <head>\n'
		' <style>div.e{position:fixed;top:25%;bottom:25%;left:25%;right:25%;font-size:150px;text-align:center;}</style>\n'
		' <title>Forbidden</title>\n'
		' </head>\n'
		' <body>\n'
		' <div class="e">&#128405;</div>\n'
		' </body>\n'
		'</html>')
def configpage(req={}, variables={}):
	html = ''
	redir = ''
	if not 'postdata' in req:
		content = ''
		with open(args.watchlist, 'r') as handle: content = ''.join(handle.readlines())
		html = [
			'<div class="watchlist"><form name="configuration" action="config.html" method="post">\n',
			'<label for=watchlist>watchlist</label><textarea id=watchlist name="watchlist" cols="30" rows="20" placeholder="watchlist, one per line">%s</textarea>\n' % content,
			'<label for=whitelist>whitelist</label><textarea id=whitelist name="whitelist" cols="30" rows="20" placeholder="whitelist, one per line">%s</textarea>\n' % '\n'.join(whitelist.keys()),
			'<label for=blacklist>blacklist</label><textarea id=blacklist name="blacklist" cols="30" rows="20" placeholder="blacklist, one per line">%s</textarea><br/>\n' % '\n'.join(blacklist.keys()),
			'<input type="submit" value="save and apply">\n',
			'</form></div>\n'
		]
		html = write_html(html=html, variables=variables)

	else:
		redir = 'index.html'
		for item in req['postdata']:
			if item == 'watchlist':
				with open(args.watchlist, 'w') as handle: handle.write(req['postdata'][item])
				load_watchlist()
			elif item == 'blacklist':
				with open(args.blacklist, 'w') as handle: handle.write(req['postdata'][item])
				load_list(item)
			elif item == 'whitelist':
				with open(args.whitelist, 'w') as handle: handle.write(req['postdata'][item])
				load_list(item)

	return html, redir
def variables_from_request(req):
	variables = {}
	variables['page'] = 0
	if '?' in req['url']:
		a, b = req['url'].split('?')
		l = b.split('&')
		for d in l:
			if not '=' in d: continue
			e, f = d.split('=')
			if len(f): variables[e.lower()] = f

	return variables
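# request routing: '/' redirects to /index.html; /index.html renders the
# timeline; /config.html edits the watch/white/blacklists (if --config is
# enabled); /export dumps the currently rendered tweets as json; /robots.txt
# turns crawlers away; any other existing file below the working directory
# is served as-is, directories are forbidden.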
def httpsrv_client_thread(c, evt_done):
	req = c.read_request()
	if req is None: pass
	elif len(watchlist) == 0:
		c.redirect('/config.html')
	elif os.path.isdir(req['url'][1:]):
		c.send(403, 'Forbidden', forbidden_page())
	elif req['url'] == '/':
		c.redirect('/index.html')
	elif req['url'].startswith('/index.html'):
		variables = variables_from_request(req)
		r, redir = render_site(variables)
		if redir != "":
			c.redirect(redir)
		else:
			if r == '': r = render_empty(variables=variables)
			c.send(200, "OK", r)
	elif not '..' in req['url'] and file_exists(os.getcwd() + req['url']):
		c.serve_file(os.getcwd() + req['url'])
	elif req['url'] == '/robots.txt':
		c.send(200, "OK", "User-agent: *\nDisallow: /")

	elif req['url'] == '/export':
		global all_tweets
		c.send(200, 'OK', json.dumps(all_tweets, sort_keys=True, indent=4))

	elif req['url'].startswith('/config.html'):
		if args.config > 0:
			variables = variables_from_request(req)
			r, redir = configpage(req, variables)
		else:
			redir = '/index.html'
		if redir != "":
			c.redirect(redir)
		else:
			if r == '': r = render_empty(variables=variables)
			c.send(200, "OK", r)

	else:
		c.send(404, "Not Found", "the requested file does not exist!!!1")
	c.disconnect()
	evt_done.set()
def start_server(ip, port):
	done = threading.Event()
	from httpsrv import HttpSrv
	hs = HttpSrv(ip, port)
	try:
		hs.setup()
	except socket.error as e:
		if e.errno == errno.EADDRINUSE:
			sys.stderr.write((
				"ERROR: server socket address in use\n"
				"wait a couple seconds and try again.\n"
				"in case you're in pdb, you need to quit it\n"))
			sys.exit(1)
		else:
			raise e

	t = threading.Thread(target=serve_loop, args=(hs, done))
	t.daemon = True
	t.start()
	return t, done
whitelist_hash = None
whitelist = dict()
blacklist_hash = None
blacklist = dict()
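# whitelist/blacklist files contain one account per line (lowercased on
# load); empty lines and lines starting with ';' are ignored.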
def load_list(item):
	if item == 'whitelist':
		global whitelist_hash, whitelist
		old_hash = whitelist_hash
		fname = args.whitelist
	else:
		global blacklist_hash, blacklist
		old_hash = blacklist_hash
		fname = args.blacklist

	wl = dict()
	for x in open(fname, 'r').readlines():
		x = x.rstrip().lower()
		if not len(x): continue
		if x.startswith(';'): continue
		else: wl[x] = 1

	if not len(wl): return
	newhash = hashlib.md5(''.join(wl.keys())).hexdigest()
	if newhash != old_hash:
		print('reloading %s' % item)
		if item == 'whitelist':
			whitelist_hash = newhash
			whitelist = wl
		else:
			blacklist_hash = newhash
			blacklist = wl
wl_hash = None
def load_watchlist():
	global watchlist, wl_hash
	has_keywords = False
	wl = []
	for x in open(args.watchlist, 'r').readlines():
		x = x.rstrip().lower()
		if not len(x): continue  # skip empty lines instead of crashing on x[0]
		if x[0] == ';':
			username = x[1:]
			disabled_users[username] = True
		elif x[0] == '#':
			if not has_keywords: has_keywords = True
			username = x if x.find(' ') == -1 else x.replace(' ', '+')
		else:
			username = x

		if not username[0] == '#' and not os.path.exists(paths.get_user_json(username)):
			new_accounts.append(username)
			if not os.path.exists(paths.get_user(username)):
				retry_makedirs(paths.get_user(username))

		wl.append(username)
	newhash = hashlib.md5(''.join(wl)).hexdigest()
	if newhash != wl_hash:
		print('reloading watchlist')
		wl_hash = newhash
		watchlist = wl
		json_loads()

	if has_keywords and os.path.exists('users'):
		for file in os.listdir('users'):
			d = os.path.join('users', file)
			if os.path.isdir(d): load_user_json(d)
def sort_keywords(interests):
	return sorted(interests.items(), key=lambda kv: (kv[1], kv[0]))

def get_keywords(username):
	js = paths.get_user_json(username)

	with open(js, 'r') as h:
		interests = {}
		j = json.load(h)

		lines = [ twat['text'] for twat in j ]

		for line in lines:
			line = line.lower().strip()
			for word in line.split():
				if word[0] == '#':
					if len(word) > 3:
						interests[word[1:]] = 1 if not word[1:] in interests else (interests[word[1:]] + 1)
				elif re.match('^[a-z0-9]{5,}$', word):
					interests[word] = 1 if not word in interests else (interests[word] + 1)

	sample = len(interests) if len(interests) < 10 else 10
	# random.sample() needs a sequence; passing the dict itself raises TypeError on python2
	return random.sample(interests.keys(), sample)
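# example invocation (every flag has a default, see the help strings below):
#   python2 twatbot.py --watchlist watchlist.txt --mirror if --port 1999
# mirrors images and files for everyone on the watchlist and serves the
# result on http://localhost:1999.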
if __name__ == '__main__':
	parser = argparse.ArgumentParser()
	parser.add_argument('--dir', help="where to save twats (default: current directory)", type=str, default=None, required=False)
	parser.add_argument('--watchlist', help="specify watchlist to use (default: watchlist.txt)", type=str, default='watchlist.txt', required=False)
	parser.add_argument('--blacklist', help="specify a file containing user accounts to ignore (default: blacklist.txt)", type=str, default="blacklist.txt", required=False)
	parser.add_argument('--whitelist', help="only save twats from those user accounts (default: whitelist.txt)", type=str, default="whitelist.txt", required=False)
	parser.add_argument('--randomize-watchlist', help="randomize watchlist on each loop (default: 0)", type=int, default=0, required=False)
	parser.add_argument('--refresh', help="refresh html page every X seconds - 0: disabled (default: 0)", type=int, default=0, required=False)
	parser.add_argument('--title', help="define title (default: %s)" % title, type=str, default=title, required=False)
	parser.add_argument('--theme', help="select theme (default: fancy)", default='fancy', type=str, required=False)
	parser.add_argument('--config', help="enable the /config.html page (default: 1)", default=1, type=int, required=False)
	parser.add_argument('--iframe', help="show iframe (default: 1)", default=1, type=int, required=False)
	parser.add_argument('--profile', help="check profile every X second(s) (default: 60)", default=60, type=int, required=False)
	parser.add_argument('--images', help="show images (default: 1)", default=1, type=int, required=False)
	parser.add_argument('--tpp', help="twats per page (default: very high number)", default=99999999999, type=int, required=False)
	parser.add_argument('--proxy', help="use a proxy (syntax: socks5://ip:port)", default=None, type=str, required=False)
	parser.add_argument('--iconbar', help="show icon bar (default: 1)", default=1, type=int, required=False)
	parser.add_argument('--unshorten', help='unshorten shortened links (default: 0)', default=0, type=int, required=False)
	parser.add_argument('--nohtml', help="strip html from tweets (default: 0)", default=0, type=int, required=False)
	parser.add_argument('--mirror', help="mirror [i]mages, [f]iles, [e]mojis, [c]ards, [v]ideos (default: None)", default='', type=str, required=False)
	parser.add_argument('--mirror-size', help="maximum file size allowed to mirror (in MB) - default: no limit", default=0, type=int, required=False)
	parser.add_argument('--ext', help="space-delimited extensions to fetch when mirroring files (default: None)", default=None, type=str, required=False)
	parser.add_argument('--count', help="fetch $count latest tweets for a new account (default: 0). -1: whole timeline", default=0, type=int, required=False)
	parser.add_argument('--upstream-img', help="make images point to the default url (default: 0)", default=0, type=int, required=False)
	parser.add_argument('--resume', help="resume/retry mirroring at startup (default: 0)", default=None, type=int, required=False)
	parser.add_argument('--port', help="port of the integrated webserver (default: 1999)", default=1999, type=int, required=False)
	parser.add_argument('--listenip', help="listen ip of the integrated webserver (default: localhost)", default="localhost", type=str, required=False)
	parser.add_argument('--ytdl', help="define full path to youtube-dl", default=None, type=str, required=False)
	parser.add_argument('--ytdl-upgrade', help="define whether youtube-dl should be upgraded on startup (default: False)", default=False, type=bool, required=False)
	parser.add_argument('--instances', help="define nitter instance(s), comma separated (default: letsencrypt instances)", default=None, type=str, required=False)
	parser.add_argument('--user-agent', help="define user agent to use (default: curl/7.74.0)", default="curl/7.74.0", type=str, required=False)
	parser.add_argument('--random-user-agent', help="use a random user agent (default: False)", default=False, type=bool, required=False)
	parser.add_argument('--user-agent-file', help="file containing user agents (default: useragent.txt)", default='useragent.txt', type=str, required=False)
	parser.add_argument('--once', help="run once then exit (default: False)", default=False, type=bool, required=False)
	parser.add_argument('--random-instances', help="randomize nitter instances (default: False)", default=False, type=bool, required=False)
	parser.add_argument('--fetch-profile-picture', help="fetch profile pictures (default: True)", default=True, type=bool, required=False)
	parser.add_argument('--interests', help="also fetch interests extracted from profile (default: False)", default=False, type=bool, required=False)
	parser.add_argument('--maxpage', help="go maximum $maxpage pages in the past (default: 1000)", default=1000, type=int, required=False)
	args = parser.parse_args()

	if args.instances:
		args.instances = [ instance.strip() for instance in args.instances.split(',') ]
	else:
		with open('nitter_instances.txt', 'r') as h:
			args.instances = [ r.strip() for r in h.readlines() ]
	if args.random_instances: random.shuffle(args.instances)

	nitters = {}
	for instance in args.instances:
		nitters[instance] = {'fail_ticks': 0, 'ban_time': 0}
	if args.mirror and 'v' in args.mirror:
		args.rawproxy = args.proxy
		if not args.ytdl: args.ytdl = 'youtube-dl'
		# check if youtube-dl exists: os.system() does not raise when the
		# binary is missing, it returns a non-zero exit status instead
		if os.system('%s --help > /dev/null 2>&1' % args.ytdl) != 0:
			print('youtube-dl not found, videos won\'t be downloaded (path: %s)' % args.ytdl)
			args.mirror = args.mirror.replace('v', '')
		## update on startup
		elif args.ytdl_upgrade:
			if args.proxy:
				status = os.system('%s --proxy %s -U > /dev/null 2>&1' % (args.ytdl, args.rawproxy))
			else:
				status = os.system('%s -U > /dev/null 2>&1' % args.ytdl)
			if status != 0:
				print('Could not upgrade youtube-dl (path: %s).' % args.ytdl)
	if args.mirror_size > 0:
		args.mirror_size = args.mirror_size * 1024 * 1024

	shorteners = {}
	if args.unshorten:
		with open('shorteners.txt', 'r') as f:
			for i in f.readlines():
				i = i.strip()
				if len(i): shorteners[i] = True

	if args.dir:
		if not os.path.exists(args.dir):
			retry_makedirs(args.dir)
		for d in site_dirs:
			if not os.path.exists(args.dir + d):
				os.symlink(os.getcwd() + d, args.dir + d)
		os.chdir(args.dir)

	args.proxy = [RocksockProxyFromURL(args.proxy)] if args.proxy else None
	if args.random_user_agent:
		with open(args.user_agent_file, 'r') as f:
			useragents = [ line.strip() for line in f.readlines() ]
	nitter_rshttp = None
	host = None
	mastodon_rshttp = dict()

	_ = load_watchlist()
	for li in [ 'whitelist', 'blacklist' ]: load_list(li)

	## resume/retry mirroring process
	mirroring_done = threading.Event()
	if args.resume and args.mirror:
		thread_resume_mirroring = threading.Thread(target=resume_retry_mirroring, args=(mirroring_done,))
		thread_resume_mirroring.start()
	else: mirroring_done.set()
	start_server(args.listenip, args.port)

	# honor --user-agent; replaced each loop iteration when --random-user-agent is set
	user_agent = args.user_agent
	interests = dict()
	known_interests = dict()
	while True:
		try:
			if args.random_user_agent: user_agent = random.choice(useragents)
			if args.randomize_watchlist > 0: random.shuffle(watchlist)

			for item in watchlist:
				if item in disabled_users:
					continue

				elif item.count('@') >= 2:
					_, _, host = item.split('@')
					if not host in mastodon_rshttp: mastodon_rshttp[host] = None
					mastodon_rshttp[host], _ = scrape(item=item, http=mastodon_rshttp[host], host=host, search=False, user_agent=user_agent)

				else:
					search = True if item[0] == '#' else False
					nitter_rshttp, host = scrape(item, nitter_rshttp, host, search, user_agent)
					if args.interests and not search:
						interest = get_keywords(item)
						if len(interest): interests[item] = interest

			if args.interests and interests:
				for username in interests.keys():
					if not username in known_interests: known_interests[username] = dict()
					for interest in interests[username]:
						if interest in known_interests[username]:
							last = known_interests[username][interest]
							if (time.time() - last) < (3600 * (24 * 7)): continue

						known_interests[username][interest] = time.time()
						nitter_rshttp, host = scrape('@%s+%s' % (username, interest), nitter_rshttp, host, True, user_agent)

			if args.once: break
			time.sleep(args.profile)

		except KeyboardInterrupt:
			break
	try:
		if not mirroring_done.is_set():
			mirroring_done.set()
			time.sleep(1)
			# python threads cannot be killed; signal the event above and wait
			# for the mirroring thread to notice it and return
			thread_resume_mirroring.join()
	except:
		pass