# fix picture fetching
# [rofl0r-twatscrape.git] / mastodon.py
# blob ad14f91594b2ffaa7397e6dc021b1b4f2bd68c70
1 from http2 import RsHttp, _parse_url
2 from soup_parser import soupify
3 import time, datetime, calendar
4 import json
5 import re
6 import paths
def fetch_mastodon_picture(user, proxies, res=None, twhttp=None, nitters={}, user_agent='curl/7.74.0'):
    """Placeholder for profile-picture fetching.

    Every argument is accepted and ignored; nothing is fetched and the
    function always returns None.
    """
    return None
def time_to_timegm(nt):
    """Convert an ISO-8601 style timestamp to seconds since the Unix epoch.

    Accepted shape: ``YYYY-MM-DDTHH:MM:SS`` optionally followed by
    fractional seconds (discarded) and by a zone designator, either ``Z``
    or a numeric offset such as ``+00:00``, ``-0500`` or ``+02``.  A
    timestamp without a designator is taken to be UTC.

    nt may be text or utf-8 encoded bytes.

    Returns the epoch time as an int.
    Raises ValueError when the string does not have the shape above or
    holds an out-of-range date/time field.
    """
    # Normalise to text so the same code path serves byte and text input.
    if isinstance(nt, bytes):
        nt = nt.decode('utf-8')

    m = re.match(
        r'^\s*(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{1,2}):(\d{1,2})'
        r'(?:\.\d+)?'                                # fractional seconds
        r'(?:Z|([+-])(\d{1,2})(?::?(\d{2}))?)?\s*$',  # zone designator
        nt)
    if m is None:
        raise ValueError('unsupported timestamp format: %r' % (nt,))

    year, month, day, hour, minute, second = [int(g) for g in m.groups()[:6]]
    wall_clock = datetime.datetime(year, month, day, hour, minute, second)
    stamp = calendar.timegm(wall_clock.timetuple())

    # A "+hh:mm" offset means the wall clock is ahead of UTC, so the offset
    # has to be subtracted to reach UTC (and added for "-hh:mm").
    sign, off_hours, off_minutes = m.group(7, 8, 9)
    if sign:
        offset = int(off_hours) * 3600 + int(off_minutes or 0) * 60
        stamp += -offset if sign == '+' else offset
    return stamp
def extract_props(item):
    """Decode the JSON document stored in an element's ``data-props`` attribute.

    item -- any object offering ``get(name)`` (the callers in this file
            pass parsed markup elements).

    Returns whatever the JSON decodes to.
    Raises TypeError when the attribute is absent (``get`` returned None)
    and ValueError when its content is not valid JSON.
    """
    # json.loads takes text as well as byte strings, so the attribute value
    # is handed over untouched; the former utf-8 re-encode was redundant and
    # tied this function to the Python-2-only ``unicode`` builtin.
    return json.loads(item.get('data-props'))
def extract_toots(html, item, toots, timestamp, checkfn, ignore={}):
    """Scrape the public statuses out of one rendered profile page.

    Parameters:
      html      -- page markup, handed to soupify().
      item      -- account string being scraped; a toot whose account
                   string differs from it additionally gets a 'rid' key.
      toots     -- list the scraped toots are appended to (mutated in place).
      timestamp -- unused here; kept so existing callers keep working.
      checkfn   -- unused here; kept so existing callers keep working.
      ignore    -- container of account strings whose toots are skipped.
                   It is only membership-tested, never modified, so the
                   shared default object is harmless.

    Returns (toots, cursor): the list passed in and the href of the first
    link containing '?max_id=', or None when the page has no such link.

    Each toot is a dict with the keys 'owner', 'fetched', 'time', 'id',
    'user', 'displayname', 'account' and 'text', plus 'rid', 'pinned',
    'images', 'video', 'curl', 'ctitle' and 'cdesc' when applicable.
    """
    # Parse once; the same tree serves the cursor lookup and the statuses.
    body = soupify(html).body
    if body is None:
        return toots, None

    # Anchors without an href make .get() return None, so test the value
    # before searching it instead of calling None.find().
    cursor = None
    for anchor in body.find_all('a'):
        href = anchor.get('href')
        if href and '?max_id=' in href:
            cursor = href
            break

    for element in body.find_all('div'):
        if not ('class' in element.attrs and 'status-public' in element.attrs['class']):
            continue

        pinned = any(span.get_text() == 'Pinned post' for span in element.find_all('span'))

        infodiv = element.find('div', attrs={'class': 'status__info'})
        if infodiv is None:
            continue  # should not happen

        # XXX some toot_id are in format dead-beef-0123
        # also, usernames could appear ?
        # NOTE(review): the id stays the string taken from the URL. The
        # earlier int() conversion was guarded by isinstance(toot_id, int),
        # which is never true for a str, so it never ran; switching the
        # type now would change what callers receive.
        toot_id = infodiv.find('a', attrs={'class': 'status__relative-time'}).get('href').split('/')[4]
        toot_time = time_to_timegm(infodiv.find('data', attrs={'class': 'dt-published'}).get('value'))
        toot_displayname = infodiv.find('strong', attrs={'class': 'display-name__html'}).get_text()
        toot_account = infodiv.find('span', attrs={'class': 'display-name__account'}).contents[0].strip()
        if toot_account in ignore:
            continue

        # FIXME: toot_text has weird formatting upon scraping, but displays fine
        # once twatbot is restarted... needs to investigate this.
        toot_text = str(element.find('div', attrs={'class': 'e-content'}))

        card = element.find('div', attrs={'data-component': 'Card'})
        if card:
            card = extract_props(card)

        images = []
        video = element.find('div', attrs={'data-component': 'Video'})
        if video:
            video = extract_props(video)
            images.extend(v['preview_url'] for v in video['media'])

        gallery = element.find('div', attrs={'data-component': 'MediaGallery'})
        if gallery:
            media = extract_props(gallery)['media']
            # Only the first gallery entry is recorded; an empty media list
            # is skipped rather than raising IndexError.
            if media:
                images.append(media[0]['url'])

        toot = {
            'owner': toot_account,
            'fetched': int(time.time()),
            'time': toot_time,
            'id': toot_id,
            'user': toot_account,
            'displayname': toot_displayname,
            'account': toot_account,
            'text': toot_text,
        }

        if item != toot_account:
            toot['rid'] = toot_id
        if pinned:
            toot['pinned'] = 1
        if images:
            toot['images'] = images
        if video:
            toot['video'] = 1

        if card:
            toot['curl'] = card['card']['url']
            toot['ctitle'] = card['card']['title']
            toot['cdesc'] = card['card']['description']

        toots.append(toot)

    return toots, cursor
def mastodon_get(req, http, host, proxies, user_agent='curl/7.74.0'):
    """Issue a GET for *req*, creating the connection object when none is given.

    Parameters:
      req        -- request path handed to http.get().
      http       -- connection object to reuse, or None to build a new
                    RsHttp for *host* on port 443 with SSL.
      host       -- host name; returned unchanged as the last tuple item.
      proxies    -- forwarded to RsHttp when a new object is built.
      user_agent -- forwarded to RsHttp when a new object is built.

    Returns (hdr, res, http, host).  The http slot is None when the
    response header lacks '200 OK', so the caller will not reuse that
    object.  When connect() fails the result is (None, None, None, host).
    """
    if http is None:
        http = RsHttp(
            host=host, port=443, timeout=30, ssl=True, keep_alive=True,
            follow_redirects=True, auto_set_cookies=True,
            proxies=proxies, user_agent=user_agent)

    # Guard clause: without a connection there is nothing to request.
    if not http.connect():
        return None, None, None, host

    hdr, res = http.get(req)
    reusable = http if '200 OK' in hdr else None
    return hdr, res, reusable, host
def _cursor_to_request(cursor):
    """Reduce a pagination href (absolute or host-relative) to a request path."""
    if '://' in cursor:
        # Drop "scheme://host", keep everything from the first '/' onwards.
        remainder = cursor.split('://', 1)[1]
        slash = remainder.find('/')
        cursor = remainder[slash:] if slash != -1 else '/'
    if not cursor.startswith('/'):
        cursor = '/' + cursor
    return cursor


def get_toots(item, proxies=None, count=0, http=None, checkfn=None, user_agent='curl/7.74.0', ignore={}):
    """Collect toots for *item*, following pagination links as requested.

    Parameters:
      item       -- account string of the form '@user@host'.
      proxies    -- forwarded to mastodon_get().
      count      -- 0: read the first page only; -1: keep paging while a
                    cursor is found; any other value: stop once at least
                    that many toots have been collected.
      http       -- connection object to reuse, or None.
      checkfn    -- optional callable(item, toots); a false result stops
                    the paging.
      user_agent -- forwarded to mastodon_get().
      ignore     -- forwarded to extract_toots(); never modified here.

    Returns (toots, http): the collected list and the connection object
    from the last request (None when that request did not succeed).
    """
    toots = []
    _, user, host = item.split('@')

    _hdr, res, http, host = mastodon_get('/@%s' % user, http, host, proxies, user_agent)
    timestamp = int(time.time())

    # mastodon_get() hands back res=None when no connection could be made;
    # there is nothing to parse then, so stop instead of feeding None to
    # the parser.
    while res is not None:
        toots, cursor = extract_toots(res, item, toots, timestamp, checkfn, ignore)

        if count == 0 or not toots or (count != -1 and len(toots) >= count):
            break
        if checkfn and not checkfn(item, toots):
            break
        if not cursor:
            break

        # The cursor may be a full URL or a host-relative path; both are
        # reduced to the request path.
        _hdr, res, http, host = mastodon_get(_cursor_to_request(cursor), http, host, proxies, user_agent)

    return toots, http
if __name__ == '__main__':
    # Manual smoke run: page through the whole public timeline of one
    # account (count=-1); proxies, http and checkfn stay at their None
    # defaults and the result is discarded.
    get_toots('@Decentralize_today@mastodon.social', count=-1)