fix picture fetching
[rofl0r-twatscrape.git] / twat.py
blob 4b5dd298c6612184fd591dc40e4e2f38188ecb86
from http2 import RsHttp, _parse_url
from soup_parser import soupify
from nitter import nitter_get, nitter_connect, get_nitter_instance, set_invalid_nitter
from mastodon import mastodon_get
import time, datetime, calendar
import json
import os.path
import hashlib
import re
import random
import paths
import misc
import sys
import rsparse
from utils import retry_write, retry_makedirs

# the effective id of a twat is the retweet id, if it's a retweet
def get_effective_twat_id(twat):
    if 'rid' in twat: return twat['rid']
    return twat['id']
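# Example (ids are made up): a plain twat {'id': '123'} has effective id '123',
# while a retweet {'id': '456', 'rid': '123'} resolves to the retweeted id '123',
# so both copies of the same content collapse onto one id.
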
def _split_url(url):
    url = url.encode('utf-8') if isinstance(url, unicode) else url
    host, port, ssl, uri = _parse_url(url)
    result = {'host':host, 'port':port, 'ssl':ssl, 'uri':uri}
    aa = uri.split('#')
    if len(aa) > 1:
        result['anchor'] = aa[1]
    else:
        aa = uri.split('/')
        if aa[-1] != "" and '.' in aa[-1]:
            result['filename'] = aa[-1]
    return result
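# Rough sketch of the returned dict, assuming _parse_url() splits the URL the
# usual way (example values only):
#   _split_url('https://example.com/media/pic.jpg')
#     -> {'host': 'example.com', 'port': 443, 'ssl': True,
#         'uri': '/media/pic.jpg', 'filename': 'pic.jpg'}
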
def _hash(str):
    value = str.encode('utf-8') if isinstance(str, unicode) else str
    return hashlib.md5(value).hexdigest()

def _get_real_location(url, proxies=None):
    url_components = _split_url(url)

    http = RsHttp(url_components['host'], ssl=url_components['ssl'], port=url_components['port'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent="curl/7.74.0")

    if not http.connect(): return url
    hdr = http.head(url_components['uri'])

    for line in hdr.split('\n'):
        # split only on the first ': ' so target urls containing ': ' survive intact
        if line.lower().startswith('location: '): return line.split(': ', 1)[1].strip()

    return url
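# Usage sketch: resolve a shortened link to its redirect target, falling back
# to the original URL when the HEAD request fails or no Location header shows
# up ('t.co' is only an illustrative shortener host):
#   real = _get_real_location('https://t.co/abcdef')
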
def _mirror_file(url_components, user, tid, args=None, content_type=None, force=False):
    if 'filename' in url_components:
        outname = paths.get_user(user) + '/%s-%s' % (tid, url_components['filename'])
        ext = url_components['filename'].split('.')[-1]
    else:
        outname = paths.get_user(user) + '/%s-%s' % (tid, url_components['uri'].split('/')[3])
        ext = None
    if not force and os.path.exists(outname):
        return

    http = RsHttp(url_components['host'], ssl=url_components['ssl'], port=url_components['port'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=args.proxy, user_agent="curl/7.74.0")

    ## do nothing if we cannot connect
    if not http.connect(): return None

    if content_type:

        if ext is not None and args.ext: filtre = str(args.ext).split(',')
        else: filtre = []

        hdr = http.head(url_components['uri'])

        ## max mirror size
        if args.mirror_size:
            # extract second part of the Content-Length: line
            value = [ str(i.split(':')[1]).strip() for i in hdr.split('\n') if i.lower().startswith('content-length:') ]
            if not len(value) or int(value[0]) > args.mirror_size: return

        # extract second part of the Content-Type: line
        value = [ str(i.split(':')[1]).strip() for i in hdr.split('\n') if i.lower().startswith('content-type:') ]

        ## server does not provide Content-Type info
        if not len(value): return
        # content type contains ';' (usually when html)
        elif ';' in value[0]: value[0] = value[0].split(';')[0]
        value = value[0].split('/')

        ## when filtering extensions (--ext)
        ## if unset, everything is mirrored
        if len(filtre):
            ## values don't match anything
            if len(value) < 2 or (not value[0] in filtre and not value[1] in filtre): return

        # XXX : mirror html files
        ## we actually don't save html files
        ## what about making automated save
        ## thru the wayback machine ?
        if 'html' in value: return

    ## previous http object cannot be re-used
    http = RsHttp(url_components['host'], ssl=url_components['ssl'], port=url_components['port'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=args.proxy, user_agent="curl/7.74.0")

    ## do nothing if we cannot connect
    if not http.connect(): return

    extras = []
    if 'filename' in url_components and url_components['filename'] == 'card.html' and 'twitter.com' in url_components['host']:
        extras.append("Referer: https://twitter.com/")

    hdr, res = http.get(url_components['uri'], extras=extras)
    if res == '' and hdr != "":
        # print http error code when things go wrong
        print "%s%s : %s" % (url_components['host'], url_components['uri'], hdr.split('\n')[0])
        return

    res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
    filehash = _hash(res_bytes)
    out_fn = 'data/%s.%s' % (filehash, ext)
    if not os.path.exists(out_fn):
        retry_write(out_fn, res_bytes)

    if os.path.lexists(outname): os.unlink(outname)
    os.symlink('../../data/%s.%s' % (filehash, ext), outname)
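# Storage layout sketch (paths illustrative): the payload is written once,
# content-addressed by its md5 under data/, and each twat gets a per-user
# symlink pointing back at it, so attachments shared by several twats are
# only downloaded once:
#   data/0123abcd<...>.jpg
#   <userdir>/<tid>-pic.jpg -> ../../data/0123abcd<...>.jpg
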
def unshorten_urls(twat, proxies=None, shorteners={}):
    soup = soupify(twat["text"])
    for a in soup.body.find_all('a'):
        if not 'href' in a.attrs: continue
        href = a.attrs['href']
        comp = _split_url(href)
        if comp['host'] in shorteners:
            try: twat['text'] = twat['text'].decode('utf8').replace( href, _get_real_location(href, proxies=proxies))
            except: pass

    return twat
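# Usage sketch (hypothetical shortener table): rewrite every shortened href in
# the twat's text with its resolved target before the twat is stored.
#   twat = unshorten_urls(twat, proxies=None, shorteners={'t.co': 1, 'bit.ly': 1})
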
def mirror_twat(twat, args=None):

    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()

    if not os.path.isdir('data'): retry_makedirs('data')

    ## soupify user's text
    soup = soupify(twat["text"])

    ## try to automatically mirror links posted by the user,
    ## if it matches the extension list.

    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        # XXX: unsupported nitter feature
        # this displays fine when loading from twitter in a regular browser,
        # which is probably converted using some js code
        # TODO: check if nitter handles card:// stuff..
        unsupported_schemes = ['card://']
        # only mirror the card when its url does not use an unsupported scheme
        if not any(url.startswith(scheme) for scheme in unsupported_schemes):
            url_components = _split_url(url)
            url_components['filename'] = 'card.html' #% twat['id']
            _mirror_file(url_components, user, twat['id'], args)

    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])

                if 'filename' in url_components:
                    _mirror_file(url_components, user, twat['id'], args, content_type=True)

    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        if not os.path.exists('data/%s.mp4' % tid):
            if args.proxy:
                os.system('%s --proxy %s -o data/%s.mp4 %s > /dev/null 2>&1' % (args.ytdl, args.rawproxy, tid, url))
            else:
                os.system('%s -o data/%s.mp4 %s > /dev/null 2>&1' % (args.ytdl, tid, url))
        if not os.path.exists('%s' % outname) and os.path.exists('data/%s.mp4' % tid):
            os.symlink('../../data/%s.mp4' % tid, outname)

    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:

        for i in twat['images']:

            if '?format=' in i:
                i = i.split('&')[0]
                fmt = i.split('=')[1]
                i = '%s.%s' % (i.split('?')[0], fmt)

            url_components = _split_url(i)
            _mirror_file(url_components, user, twat['id'], args)

    ## deal with emojis
    if 'e' in args.mirror:
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                src = src.encode('utf-8') if isinstance(src, unicode) else src

                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3: len(split) - 1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)

                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)

                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=args.proxy, user_agent="curl/7.74.0")
                    while not http.connect():
                        # FIXME : what should happen on connect error ?
                        pass
                    hdr, res = http.get('/%s' % uri)
                    res = res.encode('utf-8') if isinstance(res, unicode) else res
                    retry_write('%s/%s' % (emodir, filename), res)
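# Invocation sketch: args normally comes from the scraper's argparse setup; the
# Namespace below is only illustrative and lists the attributes mirror_twat()
# actually touches (mirror flags, --ext filter, size cap, proxy settings, the
# youtube-dl binary):
#   import argparse
#   fake_args = argparse.Namespace(mirror='fiv', ext='jpg,png', mirror_size=0,
#                                  proxy=None, rawproxy=None, ytdl='youtube-dl')
#   mirror_twat({'id': '123', 'user': 'someuser', 'text': '<p>hi</p>'}, args=fake_args)
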
def add_tweet(id, user, time, text):
    print "%s (%s) -> %s" % (user, time, id)
    print text

# twat_id looks like: '/username/status/id'
def get_twat_timestamp(twat_id):
    host = 'twitter.com'
    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True, follow_redirects=True, auto_set_cookies=True, user_agent="curl/7.74.0")
    while not http.connect():
        # FIXME : what should happen on connect error ?
        pass
    hdr, res = http.get(twat_id)
    soup = soupify(res)
    for small in soup.body.find_all('small', attrs={'class':'time'}):
        if small.find('a').attrs["href"] == twat_id:
            for span in small.find_all('span'):
                if 'data-time' in span.attrs:
                    return int(span.attrs['data-time'])
    return 0

def get_twats_mobile(user, proxies=None):
    host = 'mobile.twitter.com'
    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent="curl/7.74.0")
    # http.debugreq = True
    while not http.connect():
        # FIXME : what should happen on connect error ?
        pass
    hdr, res = http.get("/" + user)

    twats = []

    soup = soupify(res)
    tweet_id = 0
    tweet_user = None
    tweet_time = None
    tweet_text = None

    for tbl in soup.body.find_all('table'): # , attrs={'class':'tweet '}):
        if not "class" in tbl.attrs: continue
        if not "tweet" in repr(tbl.attrs["class"]): continue
        for td in tbl.find_all('td'):
            cls = td.attrs["class"][0]
            #print "." + repr(cls) + "."
            if cls == "user-info":
                tweet_user = td.find('div', attrs={'class':'username'}).text.strip()
            elif cls == 'timestamp':
                a = td.find('a')
                tweet_time = a.text
                # drop the trailing "?p=p" query string, keeping only the status path
                tweet_id = a.attrs["href"].split('?')[0]
            elif cls == 'tweet-content':
                tweet_text = td.find('div', attrs={'class':'tweet-text'}).text.strip()
        if tweet_user != None and tweet_id:
            twats.append({'id':tweet_id, 'user':tweet_user, 'time':tweet_time, 'text':tweet_text})

    return twats

def strify_tag_arr(tag_arr):
    pass

def get_style_tag(tag, styles):
    sta = [x.strip() for x in styles.split(';')]
    for st in sta:
        # skip empty chunks (e.g. after a trailing ';') that would break the unpack below
        if not ':' in st: continue
        tg, s = st.split(':', 1)
        if tg.strip() == tag: return s.strip()
    return None
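# Example: pull a single property out of an inline style attribute.
#   get_style_tag('background-image', 'background-image: url(/pic.jpg); width: 100%')
#     -> 'url(/pic.jpg)'
#   get_style_tag('color', 'width: 100%')
#     -> None
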
def fetch_nitter_picture(user, proxies, res=None, twhttp=None, nitters={}, user_agent='curl/7.74.0'):
    pic_path = paths.get_profile_pic(user)
    if os.path.isfile(pic_path): return

    if not res:
        while not twhttp:
            twhttp, host, nitters = nitter_connect(nitters, proxies)
            # no avail. instance, pic will be scraped another time
            if not twhttp: return

        try: hdr, res = twhttp.get("/%s" % user)
        # user does not exist
        except UnicodeDecodeError: return None

    soup = soupify(res)
    for a in soup.find_all('a', attrs={'class': 'profile-card-avatar'}):
        pic_url = a.get('href') if '://' in a.get('href') else 'https://%s%s' % (get_nitter_instance(nitters, False), a.get('href'))
        url_components = _split_url(pic_url)
        http = RsHttp(host=url_components['host'], port=url_components['port'], timeout=30, ssl=url_components['ssl'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent="curl/7.74.0")

        # if connection fails, the profile picture
        # will be fetched another time
        if not http.connect(): return

        hdr, res = http.get(url_components['uri'])
        if res == '' and hdr != "":
            print('error fetching profile picture: %s' % url_components)
        else:
            res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
            retry_write(pic_path, res_bytes)
        return
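# Usage sketch: the caller can hand over an already fetched profile page via
# res=..., otherwise the function connects to a nitter instance on its own and
# silently gives up (to retry on a later run) when none is reachable:
#   fetch_nitter_picture('someuser', proxies=None)
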
def extract_twats(html, item, twats, timestamp, checkfn, nitters, blacklist, whitelist):
    def find_div_end(html):
        level = 0
        for i in xrange(len(html)):
            if html[i] == '<' and html[i+1] == 'd' and html[i+2] == 'i' and html[i+3] == 'v':
                level += 1
            if html[i] == '<' and html[i+1] == '/' and html[i+2] == 'd' and html[i+3] == 'i' and html[i+4] == 'v':
                level -= 1
                if level == 0:
                    return i + len('</div>')

    regex = re.compile(r'<div.*class.*[" ]timeline.item[" ]')
    nfetched = 0
    cursor = None
    for a in soupify(html).body.find_all('a'):
        href = a.get('href')
        if href and href.find('cursor=') != -1:
            cursor = a.get('href')
            break

    while 1:
        match = regex.search(html)
        if not match:
            return twats, cursor
        html = html[match.start():]
        div_end = find_div_end(html)
        slice = html[:div_end]
        html = html[div_end:]
        # parse only the timeline-item div that was just sliced out
        twats = extract_twat(slice, twats, timestamp, nitters, blacklist, whitelist)
        nfetched += 1
        # if the first two (the very first could be pinned) tweets are already known
        # do not waste cpu processing more html
        if nfetched == 2 and checkfn and not checkfn(item, twats):
            return twats, cursor
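# Pagination sketch: extract_twats() returns the twats found on one page plus
# the 'cursor=' link nitter embeds for the next page, so a caller loops roughly
# like this (argument values are illustrative):
#   twats, cursor = extract_twats(res, item, [], int(time.time()), None, {}, {}, {})
#   next_query = '/%s%s' % (item, cursor) if cursor else None
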
def nitter_time_to_timegm(nt):
    """ this function might require some love """
    nt = nt.encode('utf-8') if isinstance(nt, unicode) else nt
    # new date format
    if nt.find('/') == -1:
        months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12 }
        ampm = nt.split(' ')[5]
        mon = months[nt.split(' ')[0]]
        day = nt.split(' ')[1].strip(',')
        yea = nt.split(' ')[2]
        hou = int(nt.split(' ')[4].split(':')[0])
        min = nt.split(' ')[4].split(':')[1]
        # %I (12-hour clock) is needed here: strptime ignores %p when the hour
        # is parsed with %H, which would leave PM times 12 hours early
        dtdt = datetime.datetime.strptime('%s-%s-%s %s:%s:00 %s' % (int(yea), int(mon), int(day), int(hou), int(min), ampm), '%Y-%m-%d %I:%M:%S %p')
    # old time format
    else:
        nt = nt.split(',')
        d = nt[0].split('/')
        t = nt[1].strip().split(':')
        dtdt = datetime.datetime(int(d[2]), int(d[1]), int(d[0]), int(t[0]), int(t[1]))
    return calendar.timegm(dtdt.timetuple())
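# Both nitter date styles are accepted; the sample strings below are only
# illustrative of the two shapes handled above:
#   nitter_time_to_timegm('Dec 25, 2020 · 7:21 PM UTC')   # new format, 12-hour clock
#   nitter_time_to_timegm('25/12/2020, 19:21')            # old format, day/month/year
# both return the matching UTC unix timestamp via calendar.timegm().
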
def extract_twat(html, twats, timestamp, nitters={}, blacklist={}, whitelist={}):
    soup = soupify(html)
    for div in soup.body.find_all('div', attrs={'class':'timeline-item'}):

        tweet_id = 0
        tweet_user = None
        tweet_time = None
        tweet_text = None
        retweet_id = 0
        retweet_user = None
        card_url = None
        card_title = None
        card_description = None
        card_destination = None
        images = None
        quote_tweet = None
        video = False

        pinned = ('pinned' in div.attrs["class"])

        tweet_id = div.find('a', attrs={'class': 'tweet-link'}).get('href').split('/')[3].split('#')[0]
        tweet_user = div.find('a', attrs={'class': 'username'}).get('title').lstrip('@').lower()
        if tweet_user in blacklist or (len(whitelist) and not tweet_user in whitelist): continue

        tt = ''.join( [ i.string for i in div.find('div', attrs={'class': 'tweet-content'}).contents ] )
        tweet_text = tt.encode('utf-8') if isinstance(tt, unicode) else tt
        tweet_time = nitter_time_to_timegm( div.find('span', attrs={'class': 'tweet-date'}).find('a').get('title') )

        # it's a retweet
        rt = div.find('div', attrs={'class': 'retweet-header'})
        if rt is not None:
            retweet_user = div.find('a', attrs={'class':'username'}).get('title').lstrip('@').lower()
            if retweet_user != tweet_user: retweet_id = tweet_id
            else: retweet_user = None

        # user quotes someone else
        qdiv = div.find('div', attrs={'class': 'quote-big'})
        if qdiv:
            quoted = qdiv.find('div', attrs={'class':'quote-text'})
            if quoted:
                quote_link = qdiv.find('a', attrs={'class': 'quote-link'}).get('href')
                quser = quote_link.split('/')[1]
                if quser in blacklist: continue
                qtext = quoted.get_text()
                if isinstance(qtext, unicode): qtext = qtext.encode('utf-8')
                qid = quote_link.split('/')[3].split('#')[0]
                qtime = qdiv.find('span', attrs={'class': 'tweet-date'}).find('a').get('title')
                if qtime: qtime = nitter_time_to_timegm( qtime )
                quote_tweet = {
                    'user': quser.lower(),
                    'id': qid,
                    'text': qtext,
                    'time': qtime
                }
        # find attachments
        attachments_div = div.find('div', attrs={'class': 'attachments'})
        if attachments_div:
            images = []
            for img in attachments_div.find_all('img'):
                images.append('https://%s%s' % (get_nitter_instance(nitters, False), img.get('src')))

            for vid in attachments_div.find_all('video'):
                video = True
                bg = vid.get('poster')
                images.append('https://%s%s' % (get_nitter_instance(nitters, False), bg))

        # card div..
        card_div = div.find('div', attrs={'class': 'card'})
        if card_div:
            # card url (OK)
            for a in card_div.find_all('a'):
                if 'class' in a.attrs and 'card-container' in a.attrs['class']:
                    card_url = a.get('href')
                    break
            # card title (OK)
            for h2 in card_div.find_all('h2'):
                if 'class' in h2.attrs and 'card-title' in h2.attrs['class']:
                    card_title = h2.get_text()
                    break
            # card description
            for p in card_div.find_all('p'):
                if 'class' in p.attrs and 'card_description' in p.attrs['class']:
                    print('got card description')
                    card_description = p.get_text()
                    break
            # card destination (OK)
            for span in card_div.find_all('span'):
                if 'class' in span.attrs and 'card-destination' in span.attrs['class']:
                    card_destination = span.get_text()
                    break

        if tweet_user != None and tweet_id:
            vals = {'id':tweet_id, 'user':tweet_user, 'time':tweet_time, 'text':tweet_text, 'fetched':timestamp}
            if retweet_id: vals['rid'] = retweet_id
            if card_url: vals['curl'] = card_url
            if card_title: vals['ctitle'] = card_title
            if card_description: vals['cdesc'] = card_description
            if card_destination: vals['cdest'] = card_destination
            if images: vals['images'] = images
            if quote_tweet: vals['quote'] = quote_tweet
            if pinned: vals['pinned'] = 1
            if video: vals['video'] = 1
            # save order of timeline by storing id of next twat
            # next is equivalent to the next-newer twat.
            if len(twats) and not 'pinned' in twats[-1]:
                next_twat = twats[-1]
                if len(next_twat):
                    vals['next'] = next_twat['id']
                    if retweet_id:
                        pr_time = 0
                        if 'rid' in next_twat:
                            if 'rid_time' in next_twat:
                                pr_time = next_twat['rid_time'] - 1
                            else:
                                pr_time = next_twat['time'] - 1
                        if pr_time != 0: vals['rid_time'] = pr_time

            if not vals in twats: twats.append(vals)
    return twats
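# Shape of one extracted twat (optional keys only appear when present; all the
# values here are made up):
#   {'id': '1337', 'user': 'someuser', 'time': 1608924060, 'fetched': 1608930000,
#    'text': 'hello', 'rid': '1336', 'images': ['https://nitter.example/pic/abc.jpg'],
#    'pinned': 1, 'video': 1, 'next': '1338'}
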
# count: specify the number of twats that shall be fetched.
# the actual number delivered could be slightly more than specified.
# if 0 is specified, only the most recent page (containing typically 20 tweets)
# is harvested. if -1 is specified, the entire timeline will be harvested back
# to the very first tweet.
# if checkfn is passed, it'll be called with the username and current list of
# received twats, and can decide whether fetching will be continued or not,
# by returning True (continue) or False.
def get_twats(item, proxies=None, count=0, http=None, checkfn=None, nitters={}, host=None, search=False, user_agent="curl/7.74.0", blacklist={}, whitelist={}, maxpage=1000):
    query = '/search?f=tweets&q=%s' % item.strip('#') if search else '/%s' % item

    page = 1
    elapsed_time = time.time()

    hdr, res, http, host, nitters = nitter_get(query, http, host, nitters, proxies, user_agent)

    # make sure all tweets fetched in a single invocation get the same timestamp,
    # otherwise ordering might become messed up, once we sort them
    timestamp = int(time.time())

    known_cursors = []
    twats = []
    break_loop = False

    while True:
        twats, cursor = extract_twats(res, item, twats, timestamp, checkfn, nitters, blacklist, whitelist)
        sys.stdout.write('\r[%s] %s: scraping... p:%d ' % (misc.get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), item, page))
        sys.stdout.flush()
        if count == 0 or (not len(twats) and not cursor) or break_loop or (count != -1 and len(twats) >= count): break
        if checkfn and not checkfn(item, twats): break

        # fetch additional tweets that are not in the initial set of 20:
        if len(twats): last_id = get_effective_twat_id(twats[-1])

        # we scraped everything
        if not cursor or (maxpage > 0 and page >= maxpage) or cursor in known_cursors: break
        known_cursors.append(cursor)
        query = '/search?f=tweets&q=%s%s' % (item.strip('#'), cursor) if search else '/%s%s' % (item, cursor)
        print('cursor: %s, query: %s' % (cursor, query))
        hdr, res, http, host, nitters = nitter_get(query, http, host, nitters, proxies, user_agent)
        page = page + 1

    return twats, nitters, host, http, page
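# Call sketch (illustrative): fetch roughly 100 twats from a user timeline,
# letting the nitter helpers pick an instance and reuse the connection:
#   twats, nitters, host, http, pages = get_twats('someuser', count=100)
# or walk a hashtag search instead of a timeline:
#   twats, nitters, host, http, pages = get_twats('#something', count=-1, search=True)
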
if __name__ == '__main__':
    print repr( get_twats('realdonaldtrump') )
    # print repr( get_twats('FLOTUS') )
    # get_twat_timestamp('/2runtherace/status/1015320873044234240')