1 from http2
import RsHttp
, _parse_url
2 from soup_parser
import soupify
3 import time
, datetime
, calendar
def time_to_timegm(nt):
    """Convert an ISO 8601 timestamp to seconds since the Unix epoch.

    Accepts values such as ``2019-05-23T10:41:26+00:00`` and also the
    variants ``...Z``, ``...+0200``, ``...-05:30`` and fractional seconds
    (which are truncated). A timestamp without a zone designator is treated
    as UTC. A non-zero UTC offset is applied to the result rather than being
    silently discarded.

    Args:
        nt: the timestamp, as text or as UTF-8 encoded bytes.

    Returns:
        Integer number of seconds since 1970-01-01T00:00:00 UTC.

    Raises:
        ValueError: if the value is not of the form
            ``YYYY-MM-DDTHH:MM:SS[.fff][zone]``.
    """
    # Work on text regardless of the input type; this avoids referring to the
    # Python 2-only ``unicode`` builtin and behaves the same on 2 and 3.
    if isinstance(nt, bytes):
        nt = nt.decode('utf-8')

    # Exactly one 'T' separator is required; anything else raises ValueError.
    date_part, clock_part = nt.strip().split('T')

    # Peel the zone designator off the clock part. The date has already been
    # removed, so a '-' here can only introduce a negative UTC offset.
    offset_seconds = 0
    if clock_part[-1:] in ('Z', 'z'):
        clock_part = clock_part[:-1]
    else:
        for marker, sign in (('+', 1), ('-', -1)):
            clock_part, found, zone = clock_part.partition(marker)
            if found:
                digits = zone.replace(':', '')
                zone_hours = int(digits[:2])
                zone_minutes = int(digits[2:4] or 0)
                offset_seconds = sign * (zone_hours * 3600 + zone_minutes * 60)
                break

    year, month, day = date_part.split('-')
    hour, minute, second = clock_part.split(':')
    second = second.split('.')[0]  # fractional seconds are truncated

    moment = datetime.datetime(
        int(year), int(month), int(day),
        int(hour), int(minute), int(second),
    )
    # The wall-clock reading is ahead of UTC by the offset, so subtract it.
    return calendar.timegm(moment.timetuple()) - offset_seconds
def extract_props(item):
    """Decode the JSON document stored in an element's ``data-props`` attribute.

    Args:
        item: object exposing ``get(name)`` for attribute lookup; within this
            file it is called with the result of ``element.find(...)``.

    Returns:
        The deserialised JSON value.

    Raises:
        TypeError: if the attribute is absent (``get`` returned ``None``).
        ValueError: if the attribute does not contain valid JSON.
    """
    raw = item.get('data-props')
    # Give the JSON decoder text rather than bytes. This replaces the former
    # ``unicode`` check, which raised NameError on Python 3, and yields the
    # same decoded value on Python 2.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    return json.loads(raw)
# extract_toots: scrape one HTML page of a Mastodon profile.
#
# Visible behaviour:
#   * `cursor` is the href of the first <a> in the parsed body whose href
#     contains '?max_id=', or None when no such link exists.
#   * Every <div> whose class list contains 'status-public' is treated as one
#     toot. Its 'status__info' child supplies:
#       - toot_id:   5th '/'-separated component of the
#                    'status__relative-time' link href
#       - toot_time: the 'dt-published' <data> value, via time_to_timegm()
#       - toot_author, toot_displayname, toot_account
#     Toots whose account is contained in `ignore` are skipped.
#   * toot_text is str() of the 'e-content' div.
#   * The data-props JSON of the Card, Video and MediaGallery components is
#     read with extract_props(). Every video preview_url is appended to
#     `images`, but only the first gallery item's url is.
#   * Keys written to the toot dict in the visible lines: owner, fetched,
#     displayname, account, rid (only when `item` differs from the toot's
#     account), pinned, images, video, curl, ctitle, cdesc.
#
# NOTE(review): several statements of this function are not visible in this
#   view (e.g. where `images`, `pinned` and `toot` are created, the guards
#   around the card/video/gallery lookups, and the return statement). The
#   caller unpacks the result as `toots, cursor` -- confirm against the
#   complete file.
# NOTE(review): `html` is parsed twice with soupify(); presumably one parse
#   could be shared.
# NOTE(review): `a.get('href').find(...)` presumably raises AttributeError for
#   an <a> without an href -- confirm what soupify elements return from get().
# NOTE(review): toot_id is produced by str.split(), so
#   `isinstance(toot_id, int)` looks like it can never be true and the int()
#   conversion never runs; a digit check may have been intended -- confirm.
# NOTE(review): `ignore={}` is a mutable default argument; only a membership
#   test on it is visible here.
22 def extract_toots(html
, item
, toots
, timestamp
, checkfn
, ignore
={}):
23 cursor
= [ a
.get('href') for a
in soupify(html
).body
.find_all('a') if a
.get('href').find('?max_id=') != -1 ]
24 cursor
= cursor
[0] if len(cursor
) else None
29 elements
= [ div
for div
in soupify(html
).body
.find_all('div') if ('class' in div
.attrs
and 'status-public' in div
.attrs
['class']) ]
31 for element
in elements
:
41 for span
in element
.find_all('span'):
42 if span
.get_text() == 'Pinned post':
46 infodiv
= element
.find('div', attrs
={'class':'status__info'})
47 if infodiv
is None: continue # should not happen
48 toot_id
= infodiv
.find('a', attrs
={'class':'status__relative-time'}).get('href').split('/')[4]
49 # XXX some toot_id are in format dead-beef-0123
50 # also, usernames could appear ?
51 toot_id
= int(toot_id
) if isinstance(toot_id
, int) else toot_id
52 toot_time
= time_to_timegm( infodiv
.find('data', attrs
={'class':'dt-published'}).get('value') )
53 toot_author
= infodiv
.find('a', attrs
={'class':'status__display-name'}).get('href').split('/')[3].lower()
54 toot_displayname
= infodiv
.find('strong', attrs
={'class':'display-name__html'}).get_text()
55 toot_account
= infodiv
.find('span', attrs
={'class':'display-name__account'}).contents
[0].strip()
56 if toot_account
in ignore
: continue
57 # FIXME: toot_text has weird formatting upon scraping, but displays fine
58 # once twatbot is restarted... needs to investigate this.
59 toot_text
= str(element
.find('div', attrs
={'class':'e-content'}))
60 toot_text
= toot_text
.encode('utf-8') if isinstance(toot_text
, unicode) else toot_text
61 #toot_avatar = infodiv.find('img', attrs={'class':'account__avatar'}).get('src')
63 card
= element
.find('div', attrs
={'data-component':'Card'})
65 card
= extract_props(card
)
67 video
= element
.find('div', attrs
={'data-component':'Video'})
69 video
= extract_props(video
)
70 for v
in video
['media']:
71 images
.append( v
['preview_url'] )
73 gallery
= element
.find('div', attrs
={'data-component':'MediaGallery'})
75 gallery
= extract_props(gallery
)
76 images
.append(gallery
['media'][0]['url'])
79 'owner': toot_account
,
80 'fetched': int(time
.time()),
84 'displayname': toot_displayname
,
85 'account': toot_account
,
89 if item
!= toot_account
: toot
['rid'] = toot_id
90 if pinned
: toot
['pinned'] = 1
91 if len(images
): toot
['images'] = images
92 if video
: toot
['video'] = 1
95 toot
['curl'] = card
['card']['url']
96 toot
['ctitle'] = card
['card']['title']
97 toot
['cdesc'] = card
['card']['description']
# mastodon_get: perform one GET request for path `req` against `host`.
#
# Visible behaviour:
#   * An RsHttp client is created for `host` on port 443 with ssl,
#     keep-alive, redirect following and automatic cookie handling enabled,
#     a 30 second timeout, and the given `proxies` and `user_agent`.
#   * `http.get(req)` yields a (hdr, res) pair, and `hdr` is checked for the
#     substring '200 OK'.
#   * Two return shapes exist, both 4-tuples: (hdr, res, http, host) and
#     (None, None, None, host).
#
# NOTE(review): the statements that decide when a new client is created, what
#   happens on a non-200 response and which of the two returns is taken are
#   not visible in this view -- confirm against the complete file.
104 def mastodon_get(req
, http
, host
, proxies
, user_agent
='curl/7.60.0'):
107 http
= RsHttp(host
=host
,port
=443,timeout
=30,ssl
=True,keep_alive
=True,follow_redirects
=True,auto_set_cookies
=True,proxies
=proxies
,user_agent
=user_agent
)
110 hdr
, res
= http
.get(req
)
111 if not '200 OK' in hdr
:
114 return hdr
, res
, http
, host
115 return None, None, None, host
# get_toots: fetch the toots of one account, following pagination links.
#
# Visible behaviour:
#   * `item` is split on '@' into exactly three parts and the last two are
#     used as user and host, i.e. the '@user@host' form passed by the
#     __main__ guard.
#   * The first page is requested as '/@<user>' through mastodon_get(), and
#     each page is handed to extract_toots(), which yields (toots, cursor).
#   * Paging stops when count == 0, when no toots were collected, when
#     `break_loop` is set, when count != -1 and at least `count` toots have
#     been collected, or when `checkfn(item, toots)` returns a falsy value.
#     count == -1 therefore disables the count-based limit.
#   * Otherwise the cursor is split on '/' into exactly four parts and the
#     last one is requested as the next page.
#
# NOTE(review): the loop header, the initialisation of `toots` and
#   `break_loop`, and the return statement are not visible in this view.
# NOTE(review): extract_toots() yields cursor None when the page has no
#   '?max_id=' link; `cursor.split('/')` would then raise AttributeError
#   unless a line not visible here guards against it -- confirm.
# NOTE(review): `ignore={}` is a mutable default argument; here it is only
#   forwarded to extract_toots().
117 def get_toots(item
, proxies
=None, count
=0, http
=None, checkfn
=None, user_agent
='curl/7.60.0', ignore
={}):
119 _
, user
, host
= item
.split('@')
121 hdr
, res
, http
, host
= mastodon_get('/@%s' %user, http
, host
, proxies
, user_agent
)
123 timestamp
= int(time
.time())
127 toots
, cursor
= extract_toots(res
, item
, toots
, timestamp
, checkfn
, ignore
)
129 if count
== 0 or len(toots
) == 0 or break_loop
or (count
!= -1 and len(toots
) >= count
): break
130 if checkfn
and not checkfn(item
, toots
): break
133 #last_id = get_effective_toot_id( toots[ len(toots) - 1])
134 _
, _
, _
, cursor
= cursor
.split('/')
135 hdr
, res
, http
, host
= mastodon_get('/%s' %cursor, http
, host
, proxies
, user_agent
)
if __name__ == '__main__':
    # Manual smoke run against one public account; count=-1 switches off the
    # count-based stop condition inside get_toots().
    get_toots(
        '@Decentralize_today@mastodon.social',
        proxies=None,
        count=-1,
        http=None,
        checkfn=None,
    )