from twat import get_twats, mirror_twat, get_effective_twat_id, unshorten_urls, fetch_nitter_picture
from mastodon import get_toots, fetch_mastodon_picture
from rocksock import RocksockProxyFromURL
from nitter import set_invalid_nitter, get_nitter_instance
from HTMLParser import HTMLParser
from http2 import RsHttp
from soup_parser import soupify
from utils import safe_write, retry_makedirs
## stdlib modules used throughout (the original import lines are elided
## in this excerpt)
import argparse, errno, hashlib, json, os, random, re, socket, sys, threading, time, urllib
## project-local helpers used below
import misc, paths

## module state; most initializers sit in elided lines of the original,
## so names and types here are inferred from their use further down
title = 'twatscrape'  # assumed placeholder, the real default title is elided
tweets = dict()
tweet_cache = dict()
all_tweets = list()
watchlist = list()
wl_hash = None
whitelist = dict()
blacklist = dict()
new_accounts = list()
nitters = dict()
shorteners = dict()
interests = dict()
disabled_users = dict()

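## replace_url_in_twat: rewrite the anchors inside a twat's html: local
## /search?q= links are pointed at the archive's own index, and @username
## links are pointed at a nitter (or mastodon) instance instead of twitter.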
def replace_url_in_twat(twat, args=None):
    user = twat['user'].lower()
    soup = soupify(twat["text"])
    for a in soup.body.find_all('a'):
        ## replace /search?q= links
        if a.attrs['href'].startswith('/search'):
            twat['text'] = twat['text'].replace('/search?q=', '/index.html?search=')
        ## @username : replace when local
        elif 'title' in a.attrs:
            username = a.attrs['href'].split('/')[1]
            at_link = user_at_link(username.lower())
            if username.find('@') == -1:
                rebuild = '<b>%s<a href="https://%s/%s">%s</a></b>' % (at_link, random.choice(args.instances), username, username)
            else:
                _, u, h = username.split('@')
                rebuild = '<b>%s<a href="https://%s/@%s">%s</a></b>' % (at_link, h, u, username)
            # this fails when nonascii chars are present in a['title']
            # XXX: would be nice to remove that 'title' attr, which would solve the issue
            try: twat['text'] = twat['text'].replace(str(a), rebuild)
            except Exception as e:
                print('replace_url_in_twat: %s' % e)
    return twat['text']

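## build_searchbox: render the search form plus one checkbox per watched
## account; the checked users are collected into the hidden "user" field
## by the javascript emitted in js_searchbox().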
def build_searchbox(variables):
    link = make_index_link(variables, exclude=['search', 'find', 'user'])
    if 'search' in variables and len(variables['search']):
        fill = urllib.unquote_plus(variables['search'])
        search_value = fill
    else:
        fill = 'foo "bar baz" -quux'
        search_value = ''
    user_sel = ['<center><table><tr>']
    cnt = 0
    for user in sorted(watchlist, key=str.lower):
        if user[0] == '#': continue
        selected = '' if (not 'user' in variables or not user in variables['user']) else ' checked'
        user_sel.append("""<td width="33%%"><label class="hide_until_hover"><input id="u_%s" class="hide_until_hover" type="checkbox" value="%s"%s>%s</label></td>""" % (user, user, selected, user))
        cnt += 1
        ## row-break condition elided in the excerpt; the 33% cell width
        ## suggests three users per row
        if cnt % 3 == 0: user_sel.append('</tr><tr>')
    user_sel.append('</tr></table></center>')
    ret = [
        '<div class="searchbox">',
        ' <form name="search" id="searchbox" onsubmit="searchbar_check()" method="get" action= \'%s\'>' % link,
        ' <input class="searchbar hide_until_hover" name="search" type="text" value="%s" placeholder=\'%s\'/>' % (search_value, fill),
        ' <input class="submit hide_until_hover" type="submit" value="↵">',
        ' <div class="userlist">%s</div>' % '\n'.join(user_sel),
        ' <input name="user" id="user" type="hidden" value="">',
        ' </form>',
        '</div>',
    ]
    if len(search_value) or 'user' in variables:
        ret.insert(7, '<span class="gotoindex hide_until_hover"><a href="%s">%s</a></span>' % (link, link))
    return '\n'.join(ret)

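## build_iconbar: per-twat navigation icons: jump to the next/previous
## twat in the index (⏪/⏩), a permalink anchor (⚓), a wayback-machine
## save link (♼) and a link to the owner's raw json (💾).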
def build_iconbar(twat, variables, quoted):
    bar = '\n<div class="iconbar">'
    il = make_index_link(variables, ['page'])
    if not '?' in il: il += '?'
    else: il += '&'
    id = get_effective_twat_id(twat)
    il2 = il + 'find_next=%s' % id
    bar += '<a href="%s" name="%s">%s</a>' % (il2, id, '⏪')
    il2 = il + 'find=%s' % id
    bar += '<a href="%s" name="%s">%s</a>' % (il2, id, '⚓')
    il2 = il + 'find_prev=%s' % id
    bar += '<a href="%s" name="%s">%s</a>' % (il2, id, '⏩')
    #bar += ' <a target="_blank" href="https://api.twitter.com/1.1/statuses/retweets/%d.json" title="retweet">%s</a>' % (int(twat['id']), '🐦')
    bar += ' <a target="_blank" href="https://web.archive.org/save/https://twitter.com/%s/status/%s" title="wayback">%s</a>' % (twat['user'], twat['id'], '♼')
    bar += ' <a target="_blank" href="%s">%s</a>' % (paths.get_user_json(twat['owner']), '💾')
    return bar + '</div>'

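## MLStripper/strip_tags: the classic HTMLParser-based tag stripper, used
## when --nohtml is set; file_exists is a trivial wrapper kept for
## readability at its call sites.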
class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def file_exists(fn):
    return os.path.exists(fn)

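## the in-memory store: tweets[user] is a list of twat dicts, newest
## first; tweet_cache[user] maps effective twat ids to True so membership
## checks stay O(1).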
def in_twatlist(user, twat):
    eid = get_effective_twat_id(twat)
    if user in tweet_cache and eid in tweet_cache[user]: return True
    return False

def add_twatlist(user, twat, insert_pos):
    if not user in tweets: tweets[user] = list()
    if not user in tweet_cache: tweet_cache[user] = dict()
    tweets[user].insert(insert_pos, twat)
    tweet_cache[user][get_effective_twat_id(twat)] = True

def write_user_tweets(user):
    fn = paths.get_user_json(user)
    content = json.dumps(tweets[user], sort_keys=True, indent=4)
    safe_write(fn, content)

def remove_known_retweets(lst):
    ret = list()
    for x in lst:
        ## drop retweets whose original twat we already have
        if "rid" in x and x["user"] in tweet_cache and x["id"] in tweet_cache[x["user"]]: pass
        else: ret.append(x)
    return ret

def format_time(stmp):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(stmp))

def add_owner_to_list(user, lst):
    ## tag each twat with the account it was fetched for, so mixed lists
    ## (index page, search results) can tell retweets apart (body elided
    ## in the excerpt; reconstructed from its use in get_all_tweets)
    for x in lst: x['owner'] = user
    return lst

183 "function searchbar_check() {"
184 " f = document.getElementById('searchbox');"
188 " for(var i = 0; i < l; i++) {"
189 " if(a[i].id.substring(0,2) === 'u_') {"
190 " if(a[i].checked) s += ',' + a[i].value;"
193 " var u = document.getElementById('user');"
194 " u.value = s.substring(1);"
def html_header():
    header = """<!DOCTYPE html><html><head>
<title>%s</title><meta charset="utf-8"/>""" % args.title
    header += js_searchbox()
    ## autorefresh the page ?
    if args.refresh: header += """<meta http-equiv="refresh" content="%d" >""" % args.refresh
    header += """<link rel='stylesheet' type='text/css' href='css/%s.css'></head><body>""" % args.theme
    if len(all_tweets): header += '<a class="export" href=/export download="twats.json">export %d tweets</a>' % len(all_tweets)
    return header

def user_at_link(user):
    if user in watchlist:
        return '<a href="?user=%s">@</a>' % user
    if user.find('@') == -1:
        return '<a href="https://%s/%s">@</a>' % (random.choice(args.instances), user)
    _, u, h = user.split('@')
    return '<a href="https://%s/@%s">@</a>' % (h, u)

def replace_twat_text(text):
    try: text = text.decode('utf8').replace('\n', '<br>') #replace( u'\xa0', ' ').replace(u'\0xe2', ' ')
    except: pass
    return text

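## htmlize_twat: render one twat dict into its html container: profile
## picture(s), title bar with iconbar and timestamp, the (rewritten) text,
## an optional card iframe, mirrored images/videos and, for quote tweets,
## a recursive call with a pseudo twat built from the quote.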
def htmlize_twat(twat, variables, quoted=False):
    tw = '<div class="twat-container">'
    ## defaults for branches elided in this excerpt
    tweet_pic = None
    retweet_pic = None
    retweet_str = ''
    if not 'rid' in twat:
        if paths.has_profile_pic(twat['owner']): tweet_pic = paths.get_profile_pic(twat['owner'])
    else:
        if paths.has_profile_pic(twat['user']): tweet_pic = paths.get_profile_pic(twat['user'])
        else: tweet_pic = "data:image/gif;base64,R0lGODdhAQABAIAAAP///////ywAAAAAAQABAAACAkQBADs="
        if paths.has_profile_pic(twat['owner']): retweet_pic = paths.get_profile_pic(twat['owner'])
        if twat['user'].find('@') == -1:
            retweet_str = " (RT %s<a target='_blank' href='https://%s/%s/status/%s'>%s</a>)" % \
                (user_at_link(twat['user']), random.choice(args.instances), twat['user'], twat['id'], twat['user'])
        else:
            _, u, h = twat['user'].split('@')
            retweet_str = " (RT %s<a target='_blank' href='https://%s/@%s/%s'>%s</a>)" % \
                (user_at_link(twat['user']), h, u, twat['id'], twat['user'].lstrip('@'))

    if tweet_pic: tw += '<div class="profile_picture"><img width="100%%" height="100%%" src="%s"></div>' % tweet_pic
    if retweet_pic: tw += '<div class="profile_picture_retweet"><img width="100%%" height="100%%" src="%s"></div>' % retweet_pic

    user_str = user_at_link(twat["owner"].lower())
    user_str += "<a target='_blank' href='https://%s/%s/status/%s'>%s</a>%s" % \
        (random.choice(args.instances), twat["owner"], get_effective_twat_id(twat), twat["owner"], retweet_str)

    tw += '\n<div class="twat-title">'
    if args.iconbar: tw += build_iconbar(twat, variables, quoted)
    time_str = 'unknown' if twat["time"] == 0 else format_time(twat["time"])
    tw += '%s - %s' % (user_str, time_str)
    tw += '</div>\n'

    ## replace urls in twats
    twat['text'] = replace_url_in_twat(twat, args=args)
    if args.nohtml: twat['text'] = strip_tags(twat['text'])
    tw += '<p class="twat-text">%s</p>\n' % (replace_twat_text(twat['text']))

    if 'curl' in twat and args.iframe > 0:
        user = twat['user'].lower()
        ifu = paths.get_user(user) + '/%s-%s' % (twat['id'], "card.html")
        if (not 'c' in args.mirror) or (not file_exists(ifu)):
            ## no mirrored card: point the iframe at the upstream card url
            ifu = twat['curl']
        tw += '<span class="twat-iframe"><iframe src="%s"></iframe></span>\n' % ifu

    if 'images' in twat and args.images:
        tw += '<p class="twat-image">'
        if len(twat['images']) > 1: wdth = (100/len(twat['images'])) - 1
        else: wdth = 100
        for i in twat['images']:
            ## guard elided in the excerpt: presumably emit plain links
            ## when html embedding is disabled
            if args.nohtml:
                tw += '<a href="%s">%s</a>' % (i, i)
                continue
            img_path = paths.get_user(twat['user']) + "/%s-%s" % (twat['id'], i.split('/')[-1])
            if not file_exists(img_path): img_path = i
            ## defaults; the lines initializing these are elided in the excerpt
            span_or_div = "span"
            div_class = "img"
            img_class = "img"
            if args.upstream_img:
                href = i
                title = "view remote image"
            elif 'video' in twat or 'ext_tw_video_thumb' in i:
                mp4_path = paths.get_user(twat['user']) + '/%s.mp4' % str(twat['id'])
                if os.path.exists(mp4_path):
                    href = mp4_path
                    title = "view local video"
                else:
                    href = "https://twitter.com/i/status/" + twat['id']
                    title = "view remote video"
                span_or_div = "div"
                div_class = "video-thumbnail"
            else:
                href = img_path
                title = "view local image"
            tw += '<a href="%s" title="%s"><%s class="%s"><img class="%s" src="%s" width="%d%%"></%s></a>' % (href, title, span_or_div, div_class, img_class, img_path, wdth, span_or_div)
        tw += '</p>\n'

    if 'quote' in twat:
        pseudo_twat = {
            'user' : twat['quote']['user'],
            'owner' : twat['quote']['user'],
            'id' : twat['quote']['id'],
            'text' : twat['quote']['text'],
            'time' : 0,  # renders as 'unknown'; the real field is elided in the excerpt
        }
        tw += htmlize_twat(pseudo_twat, variables, quoted=True)

    return tw + '</div>'

def retweet_time(twat):
    if 'rid_time' in twat: return twat['rid_time']
    if 'fetched' in twat: return twat['fetched']
    ## fallback elided in the excerpt; the twat's own time is the only
    ## remaining candidate
    return twat['time']

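## sort_tweets_func: an old-style cmp function (hence the 1/0/-1 returns
## and the cmp= keyword in sort_tweets below): twats from 2018 onwards
## compare by id, which is monotonic, older ones by timestamp.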
def sort_tweets_func(x, y):
    # somewhere in 2017, the numbering scheme of twitter changed
    # that's a pity because the twat id is the most accurate
    # sorting indicator, so we use it on all tweets > 2018
    timestamp_2018 = 1514764800 # 01/01/2018
    if x['time'] > timestamp_2018 and y['time'] > timestamp_2018:
        t1 = int(get_effective_twat_id(x))
        t2 = int(get_effective_twat_id(y))
        if t1 == t2: return 0
        elif t1 > t2: return 1
        else: return -1
    t1 = retweet_time(x) if 'rid' in x else x["time"]
    t2 = retweet_time(y) if 'rid' in y else y["time"]
    if t1 == t2: return 0
    elif t1 > t2: return 1
    else: return -1

def sort_tweets(twts):
    return sorted(twts, cmp=sort_tweets_func, reverse=True)

def get_all_tweets(remove_dupes=False):
    global blacklist, whitelist
    global all_tweets  # html_header() reads this to show the export link
    all_tweets = list()
    use_whitelist = True if len(whitelist) else False
    for user in tweets:
        if user in blacklist: continue
        if use_whitelist and not user in whitelist: continue
        all_tweets.extend(add_owner_to_list(user, tweets[user]))
    all_tweets = sort_tweets(all_tweets)
    if remove_dupes: all_tweets = remove_known_retweets(all_tweets)
    return all_tweets

def find_tweet_page(all_tweets, twid, offset):
    for i in xrange(0, len(all_tweets)):
        if get_effective_twat_id(all_tweets[i]) == twid:
            if i + offset >= 0 and i < len(all_tweets):
                i += offset  # step to the neighbour for find_next/find_prev
            return int(i / args.tpp), get_effective_twat_id(all_tweets[i])

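## parse_search: turn a query like  foo "bar baz" -quux  into a list of
## SearchTerm objects; a leading dash negates a term, double quotes group
## words into a single term.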
def parse_search(str):
    class SearchTerm():
        def __init__(self, str):
            self.exclude = (str[0] == '-')
            self.term = str if not self.exclude else str[1:]
        def match(self, text):
            return (self.exclude and not self.term in text) or (not self.exclude and self.term in text)
    ## tokenizer: only the term-flush statements survive in this excerpt,
    ## the quote/space handling around them is reconstructed
    terms = []
    s = ''
    in_quotes = False
    for i in xrange(len(str)):
        c = str[i]
        if c == '"':
            if in_quotes:
                if len(s): terms.append(SearchTerm(s))
                s = ''
            in_quotes = not in_quotes
        elif c == ' ' and not in_quotes:
            if len(s): terms.append(SearchTerm(s))
            s = ''
        else:
            s += c
    if len(s): terms.append(SearchTerm(s))
    return terms

def find_tweets(all_tweets, search=None, users=None):
    terms = parse_search(urllib.unquote_plus(search).lower()) if search else []
    match_tweets = []
    for i in xrange(0, len(all_tweets)):
        match = True
        for t in terms:
            if not t.match(all_tweets[i]['text'].lower()):
                match = False
                break
        if match and users and not all_tweets[i]['owner'].lower() in users:
            match = False
        if match: match_tweets.append(all_tweets[i])
    return match_tweets

# return tuple of html, redirect_url
# only one of both is set to something other than ""
def render_site(variables = {}):
    find = None
    find_offset = 0
    page = 0 if not 'page' in variables else int(variables['page'])
    if 'find' in variables:
        find = variables['find']
        variables.pop('find', None)
    elif 'find_next' in variables:
        ## offsets assumed: the list is sorted newest-first, so "next"
        ## steps towards the newer neighbour
        find_offset = -1
        find = variables['find_next']
        variables.pop('find_next', None)
    elif 'find_prev' in variables:
        find_offset = 1
        find = variables['find_prev']
        variables.pop('find_prev', None)

    search = None if not 'search' in variables else variables['search']
    users = None if not 'user' in variables else urllib.unquote_plus(variables['user']).lower().split(',')

    # don't remove duplicates if users is specified: this could remove retweets
    remove_dupes = True if not users else False
    all_tweets = get_all_tweets(remove_dupes)
    if users or search: all_tweets = find_tweets(all_tweets, search=search, users=users)

    if find:
        variables['page'], tid = find_tweet_page(all_tweets, find, find_offset)
        return "", make_index_link(variables) + '#%s' % tid

    pagetotalf = len(all_tweets) / float(args.tpp)
    pagetotal = int(pagetotalf)
    if pagetotalf > pagetotal: pagetotal += 1

    html = []
    max = (page+1)*args.tpp
    if max > len(all_tweets): max = len(all_tweets)
    for i in xrange(page*args.tpp, max):
        twat = all_tweets[i]
        html.append(htmlize_twat(twat, variables))
    return write_html(html=html, variables=variables, pages=pagetotal), ""

def render_empty(variables = {}):
    html = ['<div class="error_message"><p class="twatter">🤐</p><p class="error_text">There is nothing here..<p><p><a href="/">Back to index</a></p></div>']
    return write_html(html=html, variables=variables)

def make_index_link(variables, exclude=None):
    exclude = exclude if exclude else []
    s = '/index.html'  # base target assumed; the line defining it is elided
    t = [ '%s=%s' % (x, str(variables[x])) for x in variables if not x in exclude ]
    if len(t): return '%s?%s' % (s, '&'.join(t))
    return s

def page_selection(curr, total, margin=5):
    set = []
    ## first pages, a window around the current page, and the last pages
    for i in xrange(0, margin):
        if not i in set: set.append(i)
    for i in xrange(curr - margin, curr):
        if not i in set: set.append(i)
    for i in xrange(curr, curr+margin+1):
        if not i in set: set.append(i)
    for i in xrange(total-margin, total):
        if not i in set: set.append(i)
    ## walk backwards so pop() doesn't shift pending indexes
    for i in xrange(len(set) - 1, -1, -1):
        if set[i] >= total or set[i] < 0: set.pop(i)
    return set

def page_selection_html(variables, page, pages):
    div = []
    sel = page_selection(page, pages)
    for i in xrange(len(sel)):
        if i > 0 and sel[i] - sel[i-1] != 1:
            div.append('..')  # gap marker assumed for the elided line
        if sel[i] == page:
            ## the current page is plain text, not a link
            div.append(str(page))
        else:
            variables['page'] = sel[i]
            indx = make_index_link(variables)
            div.append('<a class="menu" href="%s">%d</a>' % (indx, sel[i]))
    variables['page'] = page
    return div

def write_html(html, variables=None, pages=0):
    ht = [ html_header() ]
    page = int(variables['page']) if 'page' in variables else 0
    ## guard condition elided in the excerpt; a page menu only makes
    ## sense with more than one page
    if pages > 1:
        div = page_selection_html(variables, page, pages)
        ht.append('\n<div class="menu">%s</div>\n' % ' '.join(div))
    [ ht.append(i) for i in html ]
    if pages > 1:
        ht.append('\n<div class="menu">%s</div>\n' % ' '.join(div))
    ht.append(build_searchbox(variables))
    ht.append("\n</body></html>")
    return "\n".join(ht).encode('utf-8')

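## fetch_more_tweets_callback: handed to the scrapers as checkfn; returns
## False as soon as one of the newest twats on the page is already known,
## which stops the pagination walk.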
def fetch_more_tweets_callback(item, twats):
    # iterate over last 20 tweets only as this is called once per page with the full list
    twats_per_page = 20
    if len(twats) < twats_per_page: twats_per_page = len(twats)
    for i in xrange(1, twats_per_page + 1):
        twat = twats[-i]
        if 'pinned' in twat and twat['pinned'] == 1: continue
        user = twat['user'] if item[0] == '#' else item.lower()
        if in_twatlist(user, twat): return False
    return True

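## scrape: fetch new twats/toots for one watchlist item (an account, a
## #hashtag search, or a user@host mastodon account), insert the unseen
## ones, fetch profile pictures and mirror media, then write the per-user
## json files; returns the (http, host) pair for connection reuse.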
def scrape(item, http, host, search, user_agent):
    global mastodon_rshttp
    global nitters  # reassigned below from get_twats()' return value
    if item in new_accounts:
        ## a new account: fetch its backlog without the known-twat
        ## early-exit callback (details elided in the excerpt)
        checkfn = None
        count = args.count
        new_accounts.remove(item)
    else:
        checkfn = fetch_more_tweets_callback
        count = args.count if item[0] == '#' else -1

    if item.count('@') < 2:
        fetch_profile_picture = fetch_nitter_picture
        twats, nitters, host, http, page = get_twats(item, proxies=args.proxy, count=count, http=http, checkfn=checkfn, nitters=nitters, host=host, search=search, user_agent=user_agent, blacklist=blacklist, whitelist=whitelist)
    else:
        fetch_profile_picture = fetch_mastodon_picture
        twats, http = get_toots(item, proxies=args.proxy, count=count, http=http, checkfn=checkfn, user_agent=user_agent, blacklist=args.blacklist, whitelist=args.whitelist)
        mastodon_rshttp[host] = http
        page = 1  # get_toots() reports no page count

    user = None if item[0] == '#' else item
    insert_pos = dict()
    insert_pos_total = 0
    elapsed_time = time.time()
    for t in twats:
        if search: user = t['user'].lower()
        if not user in insert_pos: insert_pos[user] = 0
        if not in_twatlist(user, t):
            if args.unshorten: t = unshorten_urls(t, proxies=args.proxy, shorteners=shorteners)
            add_twatlist(user, t, insert_pos[user])
            insert_pos[user] += 1
            insert_pos_total += 1
            if 'quote_tweet' in t:
                if '@' in t['quote_tweet']['user']:
                    _, foo, bar = t['quote_tweet']['user'].split('@')
                    http = None if not bar in mastodon_rshttp else mastodon_rshttp[bar]
                if not os.path.isdir(paths.get_user(t['quote_tweet']['user'])): retry_makedirs(paths.get_user(t['quote_tweet']['user']))
                if args.fetch_profile_picture: fetch_profile_picture(t['quote_tweet']['user'], args.proxy, twhttp=http, nitters=nitters, user_agent=user_agent)
            if '@' in t['user']:
                _, foo, bar = t['user'].split('@')
                http = None if not bar in mastodon_rshttp else mastodon_rshttp[bar]
            if not os.path.isdir(paths.get_user(t['user'])): retry_makedirs(paths.get_user(t['user']))
            if args.fetch_profile_picture: fetch_profile_picture(t['user'], args.proxy, twhttp=http, nitters=nitters, user_agent=user_agent)
            if args.mirror: mirror_twat(t, args=args)
            sys.stdout.write('\r[%s] %s: extracting from %d page(s): +%d twat(s)' % (misc.get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), item, page, insert_pos_total))
            sys.stdout.flush()
    if search:
        ## hashtag searches return twats from many accounts
        for user in insert_pos.keys(): write_user_tweets(user)
    else:
        write_user_tweets(item)
    elapsed_time = (time.time() - elapsed_time)
    sys.stdout.write('done (%s)\n' % misc.get_timestamp("%H:%M:%S", elapsed_time))
    return http, host

def resume_retry_mirroring(done):
    start_time = time.time()
    print('resume_retry_mirroring: thread started')
    infoticks = time.time()
    for user in watchlist:
        for t in tweets[user]:
            if done.is_set(): break
            elif (time.time() - infoticks) > 300:
                print('resume_retry_mirroring: thread is still running')
                infoticks = time.time()
            mirror_twat(t, args=args)
    elapsed_time = time.time() - start_time
    print('resume_retry_mirroring: end of thread, duration: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))

def load_user_json(user):
    tweet_cache[user] = dict()
    tweets[user] = json.loads(open(paths.get_user_json(user), 'r').read())
    for i in xrange(len(tweets[user])):
        tweet_cache[user][get_effective_twat_id(tweets[user][i])] = True

## preload the archives of all watched users (the surrounding context of
## this loop is elided in the excerpt)
for user in watchlist:
    if not user in tweets:
        load_user_json(user)

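## serve_loop: a thread-per-client http server loop; finished client
## threads are reaped on every iteration via their done events.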
def serve_loop(hs, done):
    client_threads = []
    while not done.is_set():
        c = hs.wait_client()  # accept call assumed; the exact HttpSrv api is elided
        evt_done = threading.Event()
        cthread = threading.Thread(target=httpsrv_client_thread, args=(c, evt_done))
        cthread.daemon = True
        cthread.start()
        ## reap finished client threads
        ctrm = []
        for ct, ct_done in client_threads:
            if ct_done.is_set():
                ct.join()
                ctrm.append((ct, ct_done))
        if len(ctrm):
            client_threads = [ x for x in client_threads if not x in ctrm ]
        client_threads.append((cthread, evt_done))

def forbidden_page():
    return ('<!DOCTYPE html><html><head>\n'
        ' <style>div.e{position:fixed;top:25%;bottom:25%;left:25%;right:25%;font-size:150px;text-align:center;}</style>\n'
        ' <title>Forbidden</title>\n'
        '</head><body>\n'
        ' <div class="e">🖕</div>\n'
        '</body></html>\n')

def configpage(req = {}, variables={}):
    ## GET: show the three editable lists; POST: write them back to disk
    if not 'postdata' in req:
        content = ''
        with open(args.watchlist, 'r') as handle: content = ''.join(handle.readlines())
        html = [
            '<div class="watchlist"><form name="configuration" action="config.html" method="post">\n',
            '<label for=watchlist>watchlist</label><textarea id=watchlist name="watchlist" cols="30" rows="20" placeholder="watchlist, one per line">%s</textarea>\n' % content,
            '<label for=whitelist>whitelist</label><textarea id=whitelist name="whitelist" cols="30" rows="20" placeholder="whitelist, one per line">%s</textarea>\n' % '\n'.join(whitelist.keys()),
            '<label for=blacklist>blacklist</label><textarea id=blacklist name="blacklist" cols="30" rows="20" placeholder="blacklist, one per line">%s</textarea><br/>\n' % '\n'.join(blacklist.keys()),
            '<input type="submit" value="save and apply">\n',
            '</form></div>\n',
        ]
        html = write_html(html=html, variables=variables)
        return html, ""
    for item in req['postdata']:
        if item == 'watchlist':
            with open(args.watchlist, 'w') as handle: handle.write(req['postdata'][item])
        elif item == 'blacklist':
            with open(args.blacklist, 'w') as handle: handle.write(req['postdata'][item])
        elif item == 'whitelist':
            with open(args.whitelist, 'w') as handle: handle.write(req['postdata'][item])
    return "", ""

def variables_from_request(req):
    variables = dict()
    variables['page'] = 0
    if '?' in req['url']:
        a, b = req['url'].split('?')
        for d in b.split('&'):
            if not '=' in d: continue
            e, f = d.split('=', 1)
            if len(f): variables[e.lower()] = f
    return variables

def httpsrv_client_thread(c, evt_done):
    req = c.read_request()
    if req is None:
        pass
    elif len(watchlist) == 0:
        c.redirect('/config.html')
    elif os.path.isdir(req['url'][1:]):
        c.send(403, 'Forbidden', forbidden_page())
    elif req['url'] == '/':
        c.redirect('/index.html')
    elif req['url'].startswith('/index.html'):
        variables = variables_from_request(req)
        r, redir = render_site(variables)
        if redir != '':
            c.redirect(redir)
        else:
            if r == '': r = render_empty(variables=variables)
            c.send(200, "OK", r)
    elif not '..' in req['url'] and file_exists(os.getcwd() + req['url']):
        c.serve_file(os.getcwd() + req['url'])
    elif req['url'] == '/robots.txt':
        c.send(200, "OK", "User-agent: *\nDisallow: /")
    elif req['url'] == '/export':
        all_tweets = get_all_tweets()
        c.send(200, 'OK', json.dumps(all_tweets, sort_keys=True, indent=4))
    elif req['url'].startswith('/config.html') and args.config:
        ## the --config guard is assumed here from the flag's help text
        variables = variables_from_request(req)
        r, redir = configpage(req, variables)
        if 'postdata' in req:
            ## after saving, bounce back to the index
            redir = '/index.html'
        if redir != '':
            c.redirect(redir)
        else:
            if r == '': r = render_empty(variables=variables)
            c.send(200, "OK", r)
    else:
        c.send(404, "not exist", "the requested file does not exist!!!1")
    ## signal serve_loop that this thread can be reaped
    evt_done.set()

def start_server(ip, port):
    done = threading.Event()
    from httpsrv import HttpSrv
    hs = HttpSrv(ip, port)
    try:
        hs.setup()  # bind/listen call assumed; the exact HttpSrv api is elided
    except socket.error as e:
        if e.errno == errno.EADDRINUSE:
            print(("ERROR: server socket address in use\n"
                "wait a couple seconds and try again.\n"
                "in case you're in pdb, you need to quit it\n"))
            sys.exit(1)
        raise
    t = threading.Thread(target=serve_loop, args=(hs, done))
    t.daemon = True
    t.start()

= None
784 blacklist_hash
= None
787 if item
== 'whitelist':
788 global whitelist_hash
, whitelist
789 old_hash
= whitelist_hash
790 fname
= args
.whitelist
792 global blacklist_hash
, blacklist
793 old_hash
= blacklist_hash
794 fname
= args
.blacklist
797 for x
in open(fname
, 'r').readlines():
798 x
= x
.rstrip().lower()
799 if not len(x
): continue
800 if x
.startswith(';'): continue
803 if not len(wl
): return
804 newhash
= hashlib
.md5( ''.join(wl
.keys())).hexdigest()
805 if newhash
!= old_hash
:
806 print('reloading %s' %item
)
807 if item
== 'whitelist':
808 whitelist_hash
= newhash
811 blacklist_hash
= newhash
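## load_watchlist: (re)read the watchlist file; lines starting with ';'
## mark disabled users, entries containing spaces are keyword searches
## (the spaces become '+'), and '#' entries are hashtag searches.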
def load_watchlist():
    global watchlist, wl_hash
    wl = list()
    has_keywords = False
    for x in open(args.watchlist, 'r').readlines():
        x = x.rstrip().lower()
        if not len(x): continue
        if x.startswith(';'):
            ## disabled entries stay in the file but are skipped
            username = x[1:]
            disabled_users[username] = True
            continue
        if x.find(' ') != -1:
            if not has_keywords: has_keywords = True
        username = x if x.find(' ') == -1 else x.replace(' ', '+')
        wl.append(username)
        if not username[0] == '#' and not os.path.exists(paths.get_user_json(username)):
            new_accounts.append(username)
        if not os.path.exists(paths.get_user(username)):
            retry_makedirs(paths.get_user(username))
    newhash = hashlib.md5(''.join(wl)).hexdigest()
    if newhash != wl_hash:
        print('reloading watchlist')
        wl_hash = newhash
        watchlist = wl
    if has_keywords and os.path.exists('users'):
        for file in os.listdir('users'):
            d = os.path.join('users', file)
            if os.path.isdir(d): load_user_json(d)

def sort_keywords(interests):
    return sorted(interests.items(), key = lambda kv: (kv[1], kv[0]))

def get_keywords(username):
    js = paths.get_user_json(username)
    interests = dict()
    if not os.path.exists(js): return []
    with open(js, 'r') as h:
        j = json.loads(h.read())
    lines = [ twat['text'] for twat in j ]
    for line in lines:
        line = line.lower().strip()
        for word in line.split():
            if word.startswith('#'):
                interests[word[1:]] = 1 if not word[1:] in interests else (interests[word[1:]] + 1)
            elif re.match('^[a-z0-9]{5,}$', word):
                interests[word] = 1 if not word in interests else (interests[word] + 1)
    sample = len(interests) if len(interests) < 10 else 10
    ## random.sample() needs a sequence, not a dict
    return random.sample(interests.keys(), sample)

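## main: parse the command line, prepare nitter instances, optional
## youtube-dl support and proxies, load the lists, start the web server,
## then poll the watchlist forever (every --profile seconds).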
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', help="where to save twats (default: current directory)", type=str, default=None, required=False)
    parser.add_argument('--watchlist', help="specify watchlist to use (default: watchlist.txt)", type=str, default='watchlist.txt', required=False)
    parser.add_argument('--blacklist', help="specify a file containing user accounts to ignore (default: blacklist.txt)", type=str, default="blacklist.txt", required=False)
    parser.add_argument('--whitelist', help="only save twats from those user accounts (default: whitelist.txt)", type=str, default="whitelist.txt", required=False)
    parser.add_argument('--randomize-watchlist', help="randomize watchlist on each loop (default: 0)", type=int, default=0, required=False)
    parser.add_argument('--refresh', help="refresh html page every X seconds - 0: disabled (default: 0)", type=int, default=0, required=False)
    parser.add_argument('--title', help="define title (default: %s)" % title, type=str, default=title, required=False)
    parser.add_argument('--theme', help="select theme (default: fancy)", default='fancy', type=str, required=False)
    parser.add_argument('--config', help="enable the /config.html page (default: 1)", default=1, type=int, required=False)
    parser.add_argument('--iframe', help="show iframe (default: 1)", default=1, type=int, required=False)
    parser.add_argument('--profile', help="check profile every X second(s) (default: 60)", default=60, type=int, required=False)
    parser.add_argument('--images', help="show images (default: 1)", default=1, type=int, required=False)
    parser.add_argument('--tpp', help="twats per page (default: very high number)", default=99999999999, type=int, required=False)
    parser.add_argument('--proxy', help="use a proxy (syntax: socks5://ip:port)", default=None, type=str, required=False)
    parser.add_argument('--iconbar', help="show iconbar (default: 1)", default=1, type=int, required=False)
    parser.add_argument('--unshorten', help="unshorten shortened links (default: 0)", default=0, type=int, required=False)
    parser.add_argument('--nohtml', help="strip html from tweets (default: 0)", default=0, type=int, required=False)
    parser.add_argument('--mirror', help="mirror [i]mages, [f]iles, [e]mojis, [c]ards, [v]ideos (default: None)", default='', type=str, required=False)
    parser.add_argument('--mirror-size', help="maximum file size allowed to mirror (in MB) - default: no limit", default=0, type=int, required=False)
    parser.add_argument('--ext', help="space-delimited extensions to fetch when mirroring files (default: None)", default=None, type=str, required=False)
    parser.add_argument('--count', help="fetch $count latest tweets for a new account (default: 20). -1: whole timeline", default=0, type=int, required=False)
    parser.add_argument('--upstream-img', help="make images point to the upstream url (default: 0)", default=0, type=int, required=False)
    parser.add_argument('--resume', help="resume/retry mirroring at startup - default: 0", default=None, type=int, required=False)
    parser.add_argument('--port', help="port of the integrated webserver - default: 1999", default=1999, type=int, required=False)
    parser.add_argument('--listenip', help="listen ip of the integrated webserver - default: localhost", default="localhost", type=str, required=False)
    parser.add_argument('--ytdl', help="define full path to youtube-dl", default=None, type=str, required=False)
    parser.add_argument('--ytdl-upgrade', help="define whether youtube-dl should be upgraded on startup - default: False", default=False, type=bool, required=False)
    parser.add_argument('--instances', help="define nitter instance(s), comma separated - default: letsencrypt instances", default=None, type=str, required=False)
    parser.add_argument('--user-agent', help="define user agent to use", default="curl/7.74.0", type=str, required=False)
    parser.add_argument('--random-user-agent', help="use random user agent", default=False, type=bool, required=False)
    parser.add_argument('--user-agent-file', help="file containing user agents", default='useragent.txt', type=str, required=False)
    parser.add_argument('--once', help="run once then exit", default=False, type=bool, required=False)
    parser.add_argument('--random-instances', help="randomize nitter instances (default: False)", default=False, type=bool, required=False)
    parser.add_argument('--fetch-profile-picture', help="fetch profile pictures (default: True)", default=True, type=bool, required=False)
    parser.add_argument('--interests', help="also fetch interests extracted from profile (default: False)", default=False, type=bool, required=False)
    parser.add_argument('--maxpage', help="go maximum $maxpage pages into the past (default: 1000)", default=1000, type=int, required=False)

    args = parser.parse_args()

    if args.instances:
        args.instances = [ instance.strip() for instance in args.instances.split(',') ]
    elif os.path.exists('nitter_instances.txt'):
        with open('nitter_instances.txt', 'r') as h:
            args.instances = [ r.strip() for r in h.readlines() ]
    if args.random_instances: random.shuffle(args.instances)

    for instance in args.instances:
        nitters[instance] = {'fail_ticks': 0, 'ban_time': 0}

    if args.mirror and 'v' in args.mirror:
        args.rawproxy = args.proxy
        if not args.ytdl: args.ytdl = 'youtube-dl'
        # check if youtube-dl exists (the exit-status handling around these
        # os.system() calls is reconstructed)
        if os.system('%s --help > /dev/null 2>&1' % args.ytdl) == 0:
            if args.ytdl_upgrade:
                if args.rawproxy:
                    ret = os.system('%s --proxy %s -U > /dev/null 2>&1' % (args.ytdl, args.rawproxy))
                else:
                    ret = os.system('%s -U > /dev/null 2>&1' % args.ytdl)
                if ret != 0:
                    print('Could not upgrade youtube-dl (path: %s).' % args.ytdl)
        else:
            print('youtube-dl not found, videos won\'t be downloaded (path: %s)' % args.ytdl)
            args.mirror = args.mirror.replace('v','')

    if args.mirror_size > 0:
        args.mirror_size = args.mirror_size * 1024*1024

    if os.path.exists('shorteners.txt'):
        with open('shorteners.txt', 'r') as f:
            for i in f.readlines():
                i = i.strip()
                if len(i): shorteners[i] = True

    if args.dir:
        if not os.path.exists(args.dir):
            retry_makedirs(args.dir)
        ## keep the static assets reachable from the target directory
        ## (the exact list of symlinked paths is elided in the excerpt)
        for d in ('/css', '/js'):
            if not os.path.exists(args.dir + d):
                os.symlink(os.getcwd() + d, args.dir + d)
        os.chdir(args.dir)  # assumed: work from the target directory afterwards

    args.proxy = [RocksockProxyFromURL(args.proxy)] if args.proxy else None

    if args.random_user_agent:
        with open(args.user_agent_file, 'r') as f:
            useragents = [ ua.strip() for ua in f.readlines() ]

    mastodon_rshttp = dict()
    load_watchlist()
    for li in [ 'whitelist', 'blacklist' ]: load_list(li)

    ## resume/retry mirroring process
    mirroring_done = threading.Event()
    if args.resume and args.mirror:
        thread_resume_mirroring = threading.Thread(target=resume_retry_mirroring, args=(mirroring_done,))
        thread_resume_mirroring.start()
    else: mirroring_done.set()

    start_server(args.listenip, args.port)

    user_agent = args.user_agent
    nitter_rshttp = None
    host = None
    known_interests = dict()
    try:
        while True:
            ## the hash checks in load_watchlist()/load_list() make periodic
            ## reloading cheap (these call sites are elided in the excerpt)
            load_watchlist()
            for li in [ 'whitelist', 'blacklist' ]: load_list(li)
            if args.random_user_agent: user_agent = random.choice(useragents)
            if args.randomize_watchlist > 0: random.shuffle(watchlist)
            for item in watchlist:
                if item in disabled_users:
                    continue
                elif item.count('@') >= 2:
                    _, _, host = item.split('@')
                    if not host in mastodon_rshttp: mastodon_rshttp[host] = None
                    mastodon_rshttp[host], _ = scrape(item=item, http=mastodon_rshttp[host], host=host, search=False, user_agent=user_agent)
                else:
                    search = True if item[0] == '#' else False
                    nitter_rshttp, host = scrape(item, nitter_rshttp, host, search, user_agent)
                    if args.interests and not search:
                        interest = get_keywords(item)
                        if len(interest): interests[item] = interest
            if args.interests and interests:
                for username in interests.keys():
                    if not username in known_interests: known_interests[username] = dict()
                    for interest in interests[username]:
                        if interest in known_interests[username]:
                            last = known_interests[username][interest]
                            ## don't re-search an interest more than once a week
                            if (time.time() - last) < (3600*(24*7)): continue
                        known_interests[username][interest] = time.time()
                        nitter_rshttp, host = scrape('@%s+%s' % (username, interest), nitter_rshttp, host, True, user_agent)
            if args.once: break
            time.sleep(args.profile)
    except KeyboardInterrupt:
        pass
    if not mirroring_done.is_set():
        mirroring_done.set()
        ## threading.Thread has no terminate(); setting the event above asks
        ## the mirroring thread to stop, join() then waits for it
        thread_resume_mirroring.join()