1 from http2
import RsHttp
, _parse_url
2 from soup_parser
import soupify
3 import time
, datetime
, calendar
# fetch_mastodon_picture(user, proxies, res, twhttp, nitters, user_agent)
# NOTE(review): only the signature of this function is visible in this view;
# its body is not shown, so its behaviour cannot be documented from here.
# NOTE(review): `nitters={}` is a mutable default argument and is therefore
# shared between calls - confirm that the body never mutates it.
8 def fetch_mastodon_picture(user
, proxies
, res
=None, twhttp
=None, nitters
={}, user_agent
='curl/7.74.0'):
def time_to_timegm(nt):
    """Convert an ISO-8601 style timestamp into seconds since the Unix epoch.

    Args:
        nt: Timestamp shaped like ``2021-03-05T12:34:56+00:00``, given either
            as text or as UTF-8 encoded bytes. A trailing ``Z``, fractional
            seconds or a negative offset after the seconds are tolerated.

    Returns:
        The integer produced by ``calendar.timegm`` for the date and
        wall-clock time found in ``nt``.

    Raises:
        ValueError: If ``nt`` does not split into a date and a time around a
            single ``T``, or if a date/time field is missing or not numeric.

    NOTE: the UTC offset is discarded, not applied - the wall-clock time is
    interpreted as UTC, exactly as the previous implementation did for
    ``+hh:mm`` offsets.
    """
    # Work on text on both Python 2 and Python 3: only genuine Python 3
    # ``bytes`` need decoding (on Python 2 ``bytes`` is ``str`` and splits
    # fine as it is), so the Python 2-only ``unicode`` name is not needed.
    if isinstance(nt, bytes) and not isinstance(nt, str):
        nt = nt.decode('utf-8')

    date_part, clock_part = nt.split('+')[0].split('T')
    year, month, day = date_part.split('-')

    # Keep only hour/minute/second; a fourth field appears when a negative
    # offset such as ``-05:00`` follows the seconds.
    hour, minute, second = clock_part.split(':')[:3]
    # Seconds are two digits; drop suffixes like ``Z``, ``.250Z`` or ``-05``.
    second = second[:2]

    moment = datetime.datetime(
        int(year), int(month), int(day),
        int(hour), int(minute), int(second),
    )
    return calendar.timegm(moment.timetuple())
def extract_props(item):
    """Decode the JSON document stored in an element's ``data-props`` attribute.

    Args:
        item: Any object exposing ``get(name)`` (the callers in this file pass
            the result of ``element.find(...)``) whose ``data-props`` value
            is JSON, as text or as UTF-8 encoded bytes.

    Returns:
        The Python object produced by ``json.loads`` for that attribute.

    Raises:
        TypeError: If the attribute is absent (``json.loads`` rejects None).
        ValueError: If the attribute does not contain valid JSON.
    """
    props = item.get('data-props')
    # json.loads accepts text on every Python version, so no encoding step
    # (and no Python 2-only ``unicode`` check) is required; only genuine
    # Python 3 ``bytes`` are decoded first for the benefit of older 3.x.
    if isinstance(props, bytes) and not isinstance(props, str):
        props = props.decode('utf-8')
    return json.loads(props)
# extract_toots(html, item, toots, timestamp, checkfn, ignore)
#
# Scrapes toots out of a Mastodon profile page given as `html`.
# NOTE(review): several lines of this function (among them the creation of
# `pinned`, `images` and the `toot` dict, and the return statement) are not
# visible in this view; the description below covers only what is shown.
#
# Visible steps:
#  * `cursor` becomes the first anchor href containing '?max_id=', or None
#    when no such link exists.
#  * `elements` are the <div>s whose class list contains 'status-public'.
#  * For each element the spans are scanned for the text 'Pinned post'.
#  * Elements without a 'status__info' div are skipped.
#  * toot_id is component [4] of the 'status__relative-time' link href split
#    on '/'; toot_time is the 'dt-published' value run through
#    time_to_timegm; toot_author is component [3] of the
#    'status__display-name' href, lower-cased; toot_displayname and
#    toot_account come from the 'display-name__html' / 'display-name__account'
#    nodes.
#  * Toots whose toot_account is in `ignore` are skipped.
#  * toot_text is str() of the 'e-content' div.
#  * Card, Video and MediaGallery props are decoded with extract_props;
#    each video media 'preview_url' and the first gallery media 'url' are
#    appended to `images`.
#  * Visible toot keys: 'owner', 'fetched', 'displayname', 'account', and
#    conditionally 'rid', 'pinned', 'images', 'video', 'curl', 'ctitle',
#    'cdesc'.
#
# The caller get_toots unpacks the result as `toots, cursor`.
#
# NOTE(review): `isinstance(toot_id, int)` is applied to a value produced by
# str.split(), so it looks like it can never be true and the int() conversion
# never runs - possibly a digit check was intended; confirm.
# NOTE(review): `a.get('href').find(...)` presumably fails for anchors that
# have no href (assuming soupify yields BeautifulSoup-style tags whose get()
# returns None) - confirm.
# NOTE(review): the page is parsed twice via soupify(html).
# NOTE(review): `ignore={}` is a mutable default; it is only read in the
# visible code.
26 def extract_toots(html
, item
, toots
, timestamp
, checkfn
, ignore
={}):
27 cursor
= [ a
.get('href') for a
in soupify(html
).body
.find_all('a') if a
.get('href').find('?max_id=') != -1 ]
28 cursor
= cursor
[0] if len(cursor
) else None
33 elements
= [ div
for div
in soupify(html
).body
.find_all('div') if ('class' in div
.attrs
and 'status-public' in div
.attrs
['class']) ]
35 for element
in elements
:
45 for span
in element
.find_all('span'):
46 if span
.get_text() == 'Pinned post':
50 infodiv
= element
.find('div', attrs
={'class':'status__info'})
51 if infodiv
is None: continue # should not happen
52 toot_id
= infodiv
.find('a', attrs
={'class':'status__relative-time'}).get('href').split('/')[4]
53 # XXX some toot_id are in format dead-beef-0123
54 # also, usernames could appear ?
55 toot_id
= int(toot_id
) if isinstance(toot_id
, int) else toot_id
56 toot_time
= time_to_timegm( infodiv
.find('data', attrs
={'class':'dt-published'}).get('value') )
57 toot_author
= infodiv
.find('a', attrs
={'class':'status__display-name'}).get('href').split('/')[3].lower()
58 toot_displayname
= infodiv
.find('strong', attrs
={'class':'display-name__html'}).get_text()
59 toot_account
= infodiv
.find('span', attrs
={'class':'display-name__account'}).contents
[0].strip()
60 if toot_account
in ignore
: continue
61 # FIXME: toot_text has weird formatting upon scraping, but displays fine
62 # once twatbot is restarted... needs to investigate this.
63 toot_text
= str(element
.find('div', attrs
={'class':'e-content'}))
64 toot_text
= toot_text
.encode('utf-8') if isinstance(toot_text
, unicode) else toot_text
65 #toot_avatar = infodiv.find('img', attrs={'class':'account__avatar'}).get('src')
67 card
= element
.find('div', attrs
={'data-component':'Card'})
69 card
= extract_props(card
)
71 video
= element
.find('div', attrs
={'data-component':'Video'})
73 video
= extract_props(video
)
74 for v
in video
['media']:
75 images
.append( v
['preview_url'] )
77 gallery
= element
.find('div', attrs
={'data-component':'MediaGallery'})
79 gallery
= extract_props(gallery
)
80 images
.append(gallery
['media'][0]['url'])
83 'owner': toot_account
,
84 'fetched': int(time
.time()),
88 'displayname': toot_displayname
,
89 'account': toot_account
,
93 if item
!= toot_account
: toot
['rid'] = toot_id
94 if pinned
: toot
['pinned'] = 1
95 if len(images
): toot
['images'] = images
96 if video
: toot
['video'] = 1
99 toot
['curl'] = card
['card']['url']
100 toot
['ctitle'] = card
['card']['title']
101 toot
['cdesc'] = card
['card']['description']
# mastodon_get(req, http, host, proxies, user_agent)
#
# Issues a GET for the path `req` through an RsHttp connection to `host`.
# The connection is created for port 443 with ssl, keep-alive, redirect
# following and automatic cookies enabled, a timeout of 30, and the supplied
# `proxies` and `user_agent`.
#
# Visible return values: (hdr, res, http, host) after the request, and
# (None, None, None, host) otherwise.
# NOTE(review): the lines that decide when a new RsHttp is created, what
# happens when '200 OK' is not in the response header, and which condition
# selects between the two returns are not visible in this view - confirm
# against the full file.
108 def mastodon_get(req
, http
, host
, proxies
, user_agent
='curl/7.74.0'):
111 http
= RsHttp(host
=host
,port
=443,timeout
=30,ssl
=True,keep_alive
=True,follow_redirects
=True,auto_set_cookies
=True,proxies
=proxies
,user_agent
=user_agent
)
114 hdr
, res
= http
.get(req
)
115 if not '200 OK' in hdr
:
118 return hdr
, res
, http
, host
119 return None, None, None, host
# get_toots(item, proxies, count, http, checkfn, user_agent, ignore)
#
# Fetches toots for the account named by `item`.
# NOTE(review): the loop header, the initialisation of `toots` and
# `break_loop`, and the return statement are not visible in this view; the
# description below covers only what is shown.
#
# Visible steps:
#  * `item` is split on '@' into exactly three parts, so it must look like
#    '@user@host'; any other number of '@' raises ValueError on unpacking.
#  * The profile page '/@<user>' is requested through mastodon_get.
#  * `timestamp` is the current time in whole seconds.
#  * extract_toots parses the response and yields `toots` and `cursor`.
#  * Paging stops when count == 0, when no toots were found, when
#    `break_loop` is set, or when count != -1 and at least `count` toots were
#    collected; count == -1 therefore disables the count limit.
#  * Paging also stops when `checkfn` is given and checkfn(item, toots) is
#    falsy.
#  * Otherwise `cursor` is split on '/' into exactly four parts and the last
#    part is requested as '/<part>' for the next page.
#
# NOTE(review): extract_toots sets cursor to None when the page has no
# '?max_id=' link; no guard before `cursor.split('/')` is visible here -
# confirm one exists in the hidden lines.
# NOTE(review): `ignore={}` is a mutable default; it is only passed on to
# extract_toots in the visible code.
121 def get_toots(item
, proxies
=None, count
=0, http
=None, checkfn
=None, user_agent
='curl/7.74.0', ignore
={}):
123 _
, user
, host
= item
.split('@')
125 hdr
, res
, http
, host
= mastodon_get('/@%s' %user, http
, host
, proxies
, user_agent
)
127 timestamp
= int(time
.time())
131 toots
, cursor
= extract_toots(res
, item
, toots
, timestamp
, checkfn
, ignore
)
133 if count
== 0 or len(toots
) == 0 or break_loop
or (count
!= -1 and len(toots
) >= count
): break
134 if checkfn
and not checkfn(item
, toots
): break
137 #last_id = get_effective_toot_id( toots[ len(toots) - 1])
138 _
, _
, _
, cursor
= cursor
.split('/')
139 hdr
, res
, http
, host
= mastodon_get('/%s' %cursor, http
, host
, proxies
, user_agent
)
if __name__ == '__main__':
    # Manual run against a sample account: no proxies, no pre-existing
    # connection, no check callback; count=-1 disables the toot-count limit.
    sample_account = '@Decentralize_today@mastodon.social'
    get_toots(sample_account, None, -1, None, None)