# twatbot.py: add interest option
# [rofl0r-twatscrape.git] / mastodon.py
# blob 2a093669cc9f0dedc9820f4b40112225aea75b5e
1 from http2 import RsHttp, _parse_url
2 from soup_parser import soupify
3 import time, datetime, calendar
4 import json
5 import re
def time_to_timegm(nt):
    """Convert an ISO-8601 style timestamp into seconds since the Unix epoch.

    Accepted form: ``YYYY-MM-DDTHH:MM:SS``, optionally followed by
    fractional seconds (discarded) and by a zone designator, which may be
    ``Z`` or a numeric offset such as ``+02:00``, ``-0500`` or ``+01``.
    A numeric offset is applied, so the result is always relative to UTC;
    a timestamp without a designator is treated as UTC.

    nt may be a text string or UTF-8 encoded bytes.

    Raises ValueError when the string does not match the accepted form or
    holds an out-of-range date/time field.
    """
    if isinstance(nt, bytes):
        nt = nt.decode('utf-8')

    # new date format
    match = re.match(
        r'(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})'
        r'(?:\.\d+)?'
        r'(?:Z|([+-])(\d{2})(?::?(\d{2}))?)?$',
        nt.strip())
    if match is None:
        raise ValueError('unsupported timestamp format: %r' % (nt,))

    fields = match.groups()
    year, month, day, hour, minute, second = [int(f) for f in fields[:6]]
    moment = datetime.datetime(year, month, day, hour, minute, second)
    epoch = calendar.timegm(moment.timetuple())

    sign, off_hours, off_minutes = fields[6:]
    if sign:
        # Local time = UTC + offset, so an east-of-UTC (+) offset has to be
        # subtracted to get back to UTC, and a west-of-UTC (-) one added.
        offset = int(off_hours) * 3600 + int(off_minutes or 0) * 60
        epoch += -offset if sign == '+' else offset
    return epoch
def extract_props(item):
    """Decode the JSON stored in an element's ``data-props`` attribute.

    item is any object exposing ``get(name)`` for attribute lookup (the
    callers in this file pass elements found in the parsed page).

    Returns the decoded JSON value.  When the attribute is missing or
    empty an empty dict is returned, instead of passing None to the JSON
    decoder.  Malformed JSON still raises ValueError from ``json.loads``.
    """
    props = item.get('data-props')
    if not props:
        return {}
    # json.loads accepts text as well as UTF-8 encoded bytes, so no manual
    # re-encoding is needed before decoding.
    return json.loads(props)
def extract_toots(html, item, toots, timestamp, checkfn, ignore=None):
    """Parse one rendered page and append its public statuses to *toots*.

    html      -- page markup, handed to soupify().
    item      -- account string being scraped; a toot whose account string
                 differs from it gets an additional 'rid' key.
    toots     -- list that receives one dict per status (modified in place).
    timestamp -- accepted for interface compatibility; not used here.
    checkfn   -- accepted for interface compatibility; not used here.
    ignore    -- container of account strings whose toots are skipped.

    Returns (toots, cursor) where cursor is the href of the first link
    containing '?max_id=' (the link to the next page), or None when the
    page has no such link.
    """
    if ignore is None:
        ignore = ()

    # Parse the document once and reuse it for both the cursor lookup and
    # the status scan.
    body = soupify(html).body
    if body is None:
        return toots, None

    cursor = None
    for anchor in body.find_all('a'):
        href = anchor.get('href')
        # Anchors without an href attribute yield None and are skipped.
        if href and '?max_id=' in href:
            cursor = href
            break

    elements = [
        div for div in body.find_all('div')
        if ('class' in div.attrs and 'status-public' in div.attrs['class'])
    ]

    for element in elements:
        images = []

        pinned = any(
            span.get_text() == 'Pinned post'
            for span in element.find_all('span')
        )

        infodiv = element.find('div', attrs={'class': 'status__info'})
        if infodiv is None:
            continue  # should not happen

        # XXX some toot_id are in format dead-beef-0123
        # also, usernames could appear ?
        # NOTE(review): the id is kept as the string taken from the link.
        # The previous int() conversion was guarded by isinstance(..., int),
        # which is never true for a string, so it never ran.
        toot_id = infodiv.find('a', attrs={'class': 'status__relative-time'}).get('href').split('/')[4]
        toot_time = time_to_timegm(infodiv.find('data', attrs={'class': 'dt-published'}).get('value'))
        toot_displayname = infodiv.find('strong', attrs={'class': 'display-name__html'}).get_text()
        toot_account = infodiv.find('span', attrs={'class': 'display-name__account'}).contents[0].strip()
        if toot_account in ignore:
            continue

        # FIXME: toot_text has weird formatting upon scraping, but displays fine
        # once twatbot is restarted... needs to investigate this.
        toot_text = str(element.find('div', attrs={'class': 'e-content'}))
        #toot_avatar = infodiv.find('img', attrs={'class':'account__avatar'}).get('src')

        card = element.find('div', attrs={'data-component': 'Card'})
        if card:
            card = extract_props(card)

        video = element.find('div', attrs={'data-component': 'Video'})
        if video:
            video = extract_props(video)
            images.extend(v['preview_url'] for v in (video.get('media') or ()))

        gallery = element.find('div', attrs={'data-component': 'MediaGallery'})
        if gallery:
            gallery = extract_props(gallery)
            media = gallery.get('media') or ()
            # Only the first gallery entry is recorded; an empty media list
            # is tolerated instead of raising IndexError.
            if media:
                images.append(media[0]['url'])

        toot = {
            'owner': toot_account,
            'fetched': int(time.time()),
            'time': toot_time,
            'id': toot_id,
            'user': toot_account,
            'displayname': toot_displayname,
            'account': toot_account,
            'text': toot_text,
        }

        if item != toot_account:
            toot['rid'] = toot_id
        if pinned:
            toot['pinned'] = 1
        if images:
            toot['images'] = images
        if video:
            toot['video'] = 1

        if card:
            toot['curl'] = card['card']['url']
            toot['ctitle'] = card['card']['title']
            toot['cdesc'] = card['card']['description']

        toots.append(toot)
        # print(toot)

    return toots, cursor
def mastodon_get(req, http, host, proxies, user_agent='curl/7.60.0'):
    """Issue a GET for *req*, creating the HTTPS client first if needed.

    req        -- request path passed to the client's get().
    http       -- existing client to reuse, or None to build a new RsHttp
                  for *host* on port 443.
    host       -- host name; returned unchanged as the last tuple member.
    proxies    -- forwarded to RsHttp when a new client is built.
    user_agent -- forwarded to RsHttp when a new client is built.

    Returns (hdr, res, http, host).  When connect() reports failure the
    first three members are None.  When the header does not contain
    '200 OK' the header and body are still returned, but the client slot
    is None.
    """
    if http is None:
        http = RsHttp(
            host=host, port=443, timeout=30, ssl=True, keep_alive=True,
            follow_redirects=True, auto_set_cookies=True,
            proxies=proxies, user_agent=user_agent
        )

    if not http.connect():
        return None, None, None, host

    hdr, res = http.get(req)
    client = http if '200 OK' in hdr else None
    return hdr, res, client, host
def get_toots(item, proxies=None, count=0, http=None, checkfn=None, user_agent='curl/7.60.0', ignore=None):
    """Collect toots for one account, following pagination links.

    item       -- account in '@user@host' form; it is split on '@' into
                  exactly three parts.
    proxies    -- forwarded to mastodon_get().
    count      -- 0: stop after the first page; -1: no limit; any other
                  value: stop once at least that many toots are collected.
    http       -- existing client to reuse, or None.
    checkfn    -- optional callable(item, toots); paging stops when it
                  returns a false value.
    user_agent -- forwarded to mastodon_get().
    ignore     -- container of account strings to skip, forwarded to
                  extract_toots().

    Returns (toots, http): the collected toot dicts and the client slot as
    last returned by mastodon_get() (None after a failed connect or a
    non-200 response).
    """
    if ignore is None:
        ignore = {}
    toots = []
    _, user, host = item.split('@')

    _, res, http, host = mastodon_get('/@%s' % user, http, host, proxies, user_agent)

    timestamp = int(time.time())

    # mastodon_get() yields res=None when the connection could not be
    # established; stop then rather than passing None to the parser.
    while res is not None:
        toots, cursor = extract_toots(res, item, toots, timestamp, checkfn, ignore)

        if count == 0 or not toots or (count != -1 and len(toots) >= count):
            break
        if checkfn and not checkfn(item, toots):
            break
        if not cursor:
            break

        #last_id = get_effective_toot_id( toots[ len(toots) - 1])
        # Reduce the link to the path after the host.  Absolute links
        # ('scheme://host/path') lose scheme and host; relative links are
        # used as they are, whatever their number of path segments.
        if '://' in cursor:
            cursor = cursor.split('://', 1)[1].partition('/')[2]
        cursor = cursor.lstrip('/')
        _, res, http, host = mastodon_get('/%s' % cursor, http, host, proxies, user_agent)

    return toots, http
if __name__ == '__main__':
    # Manual run: fetch every page (count=-1) for one sample account,
    # without proxies, without a pre-existing client and without a check
    # callback.  The result is discarded.
    sample_account = '@Decentralize_today@mastodon.social'
    get_toots(sample_account, proxies=None, count=-1, http=None, checkfn=None)