mastodon.py

   1 from http2 import RsHttp, _parse_url
   2 from soup_parser import soupify
   3 import time, datetime, calendar
   4 import json
   5 import re
   6
   7 def time_to_timegm(nt):
   8         nt = nt.encode('utf-8') if isinstance(nt, unicode) else nt
   9         # new date format
  10
  11         dd, tt = nt.split('+')[0].split('T')
  12         yea, mon, day = dd.split('-')
  13         hou, min, sec = tt.split(':')
  14         dtdt = datetime.datetime(int(yea), int(mon), int(day), int(hou), int(min), int(sec))
  15         return calendar.timegm(dtdt.timetuple())
  16
  17 def extract_props(item):
  18         props = item.get('data-props')
  19         props = props.encode('utf-8') if isinstance(props, unicode) else props
  20         return json.loads(props)
  21
  22 def extract_toots(html, item, toots, timestamp, checkfn, ignore={}):
  23         cursor = [ a.get('href') for a in soupify(html).body.find_all('a') if a.get('href').find('?max_id=') != -1 ]
  24         cursor = cursor[0] if len(cursor) else None
  25         quote_toot = None
  26         images = []
  27         toot = dict()
  28
  29         elements = [ div for div in soupify(html).body.find_all('div') if ('class' in div.attrs and 'status-public' in div.attrs['class']) ]
  30
  31         for element in elements:
  32                 video = None
  33                 card = None
  34                 images = list()
  35                 toot_text = None
  36                 toot_boosted = False
  37                 pinned = False
  38                 toot_author = None
  39                 toot_time = None
  40
  41                 for span in element.find_all('span'):
  42                         if span.get_text() == 'Pinned post':
  43                                 pinned = True
  44                                 break
  45
  46                 infodiv = element.find('div', attrs={'class':'status__info'})
  47                 if infodiv is None: continue # should not happen
  48                 toot_id = infodiv.find('a', attrs={'class':'status__relative-time'}).get('href').split('/')[4]
  49                 # XXX some toot_id are in format dead-beef-0123
  50                 # also, usernames could appear ?
  51                 toot_id = int(toot_id) if isinstance(toot_id, int) else toot_id
  52                 toot_time = time_to_timegm( infodiv.find('data', attrs={'class':'dt-published'}).get('value') )
  53                 toot_author = infodiv.find('a', attrs={'class':'status__display-name'}).get('href').split('/')[3].lower()
  54                 toot_displayname = infodiv.find('strong', attrs={'class':'display-name__html'}).get_text()
  55                 toot_account = infodiv.find('span', attrs={'class':'display-name__account'}).contents[0].strip()
  56                 if toot_account in ignore: continue
  57                 # FIXME: toot_text has weird formatting upon scraping, but displays fine
  58                 # once twatbot is restarted... needs to investigate this.
  59                 toot_text = str(element.find('div', attrs={'class':'e-content'}))
  60                 toot_text = toot_text.encode('utf-8') if isinstance(toot_text, unicode) else toot_text
  61                 #toot_avatar = infodiv.find('img', attrs={'class':'account__avatar'}).get('src')
  62
  63                 card = element.find('div', attrs={'data-component':'Card'})
  64                 if card:
  65                         card = extract_props(card)
  66
  67                 video = element.find('div', attrs={'data-component':'Video'})
  68                 if video:
  69                         video = extract_props(video)
  70                         for v in video['media']:
  71                                 images.append( v['preview_url'] )
  72
  73                 gallery = element.find('div', attrs={'data-component':'MediaGallery'})
  74                 if gallery:
  75                         gallery = extract_props(gallery)
  76                         images.append(gallery['media'][0]['url'])
  77
  78                 toot = {
  79                         'owner': toot_account,
  80                         'fetched': int(time.time()),
  81                         'time': toot_time,
  82                         'id': toot_id,
  83                         'user': toot_account,
  84                         'displayname': toot_displayname,
  85                         'account': toot_account,
  86                         'text': toot_text,
  87                 }
  88
  89                 if item != toot_account: toot['rid'] = toot_id
  90                 if pinned: toot['pinned'] = 1
  91                 if len(images): toot['images'] = images
  92                 if video: toot['video'] = 1
  93
  94                 if card:
  95                         toot['curl'] = card['card']['url']
  96                         toot['ctitle'] = card['card']['title']
  97                         toot['cdesc'] = card['card']['description']
  98
  99                 toots.append(toot)
 100 #               print(toot)
 101
 102         return toots, cursor
 103
 104 def mastodon_get(req, http, host, proxies, user_agent='curl/7.60.0'):
 105
 106         if http is None:
 107                 http = RsHttp(host=host,port=443,timeout=30,ssl=True,keep_alive=True,follow_redirects=True,auto_set_cookies=True,proxies=proxies,user_agent=user_agent)
 108
 109         if http.connect():
 110                 hdr, res = http.get(req)
 111                 if not '200 OK' in hdr:
 112                         http = None
 113
 114                 return hdr, res, http, host
 115         return None, None, None, host
 116
 117 def get_toots(item, proxies=None, count=0, http=None, checkfn=None, user_agent='curl/7.60.0', ignore={}):
 118         toots = []
 119         _, user, host = item.split('@')
 120
 121         hdr, res, http, host = mastodon_get('/@%s' %user, http, host, proxies, user_agent)
 122
 123         timestamp = int(time.time())
 124         break_loop = False
 125
 126         while True:
 127                 toots, cursor = extract_toots(res, item, toots, timestamp, checkfn, ignore)
 128
 129                 if count == 0 or len(toots) == 0 or break_loop or (count != -1 and len(toots) >= count): break
 130                 if checkfn and not checkfn(item, toots): break
 131                 if not cursor: break
 132
 133                 #last_id = get_effective_toot_id( toots[ len(toots) - 1])
 134                 _, _, _, cursor = cursor.split('/')
 135                 hdr, res, http, host = mastodon_get('/%s' %cursor, http, host, proxies, user_agent)
 136
 137         return toots, http
 138
 139 if __name__ == '__main__':
 140         get_toots('@Decentralize_today@mastodon.social', proxies=None, count=-1, http=None, checkfn=None)