mastodon.py

   1 from http2 import RsHttp, _parse_url
   2 from soup_parser import soupify
   3 import time, datetime, calendar
   4 import json
   5 import re
   6 import paths
   7
   8 def fetch_mastodon_picture(user, proxies, res=None, twhttp=None, nitters={}, user_agent='curl/7.74.0'):
   9         return
  10
  11 def time_to_timegm(nt):
  12         nt = nt.encode('utf-8') if isinstance(nt, unicode) else nt
  13         # new date format
  14
  15         dd, tt = nt.split('+')[0].split('T')
  16         yea, mon, day = dd.split('-')
  17         hou, min, sec = tt.split(':')
  18         dtdt = datetime.datetime(int(yea), int(mon), int(day), int(hou), int(min), int(sec))
  19         return calendar.timegm(dtdt.timetuple())
  20
  21 def extract_props(item):
  22         props = item.get('data-props')
  23         props = props.encode('utf-8') if isinstance(props, unicode) else props
  24         return json.loads(props)
  25
  26 def extract_toots(html, item, toots, timestamp, checkfn, ignore={}):
  27         cursor = [ a.get('href') for a in soupify(html).body.find_all('a') if a.get('href').find('?max_id=') != -1 ]
  28         cursor = cursor[0] if len(cursor) else None
  29         quote_toot = None
  30         images = []
  31         toot = dict()
  32
  33         elements = [ div for div in soupify(html).body.find_all('div') if ('class' in div.attrs and 'status-public' in div.attrs['class']) ]
  34
  35         for element in elements:
  36                 video = None
  37                 card = None
  38                 images = list()
  39                 toot_text = None
  40                 toot_boosted = False
  41                 pinned = False
  42                 toot_author = None
  43                 toot_time = None
  44
  45                 for span in element.find_all('span'):
  46                         if span.get_text() == 'Pinned post':
  47                                 pinned = True
  48                                 break
  49
  50                 infodiv = element.find('div', attrs={'class':'status__info'})
  51                 if infodiv is None: continue # should not happen
  52                 toot_id = infodiv.find('a', attrs={'class':'status__relative-time'}).get('href').split('/')[4]
  53                 # XXX some toot_id are in format dead-beef-0123
  54                 # also, usernames could appear ?
  55                 toot_id = int(toot_id) if isinstance(toot_id, int) else toot_id
  56                 toot_time = time_to_timegm( infodiv.find('data', attrs={'class':'dt-published'}).get('value') )
  57                 toot_author = infodiv.find('a', attrs={'class':'status__display-name'}).get('href').split('/')[3].lower()
  58                 toot_displayname = infodiv.find('strong', attrs={'class':'display-name__html'}).get_text()
  59                 toot_account = infodiv.find('span', attrs={'class':'display-name__account'}).contents[0].strip()
  60                 if toot_account in ignore: continue
  61                 # FIXME: toot_text has weird formatting upon scraping, but displays fine
  62                 # once twatbot is restarted... needs to investigate this.
  63                 toot_text = str(element.find('div', attrs={'class':'e-content'}))
  64                 toot_text = toot_text.encode('utf-8') if isinstance(toot_text, unicode) else toot_text
  65                 #toot_avatar = infodiv.find('img', attrs={'class':'account__avatar'}).get('src')
  66
  67                 card = element.find('div', attrs={'data-component':'Card'})
  68                 if card:
  69                         card = extract_props(card)
  70
  71                 video = element.find('div', attrs={'data-component':'Video'})
  72                 if video:
  73                         video = extract_props(video)
  74                         for v in video['media']:
  75                                 images.append( v['preview_url'] )
  76
  77                 gallery = element.find('div', attrs={'data-component':'MediaGallery'})
  78                 if gallery:
  79                         gallery = extract_props(gallery)
  80                         images.append(gallery['media'][0]['url'])
  81
  82                 toot = {
  83                         'owner': toot_account,
  84                         'fetched': int(time.time()),
  85                         'time': toot_time,
  86                         'id': toot_id,
  87                         'user': toot_account,
  88                         'displayname': toot_displayname,
  89                         'account': toot_account,
  90                         'text': toot_text,
  91                 }
  92
  93                 if item != toot_account: toot['rid'] = toot_id
  94                 if pinned: toot['pinned'] = 1
  95                 if len(images): toot['images'] = images
  96                 if video: toot['video'] = 1
  97
  98                 if card:
  99                         toot['curl'] = card['card']['url']
 100                         toot['ctitle'] = card['card']['title']
 101                         toot['cdesc'] = card['card']['description']
 102
 103                 toots.append(toot)
 104 #               print(toot)
 105
 106         return toots, cursor
 107
 108 def mastodon_get(req, http, host, proxies, user_agent='curl/7.74.0'):
 109
 110         if http is None:
 111                 http = RsHttp(host=host,port=443,timeout=30,ssl=True,keep_alive=True,follow_redirects=True,auto_set_cookies=True,proxies=proxies,user_agent=user_agent)
 112
 113         if http.connect():
 114                 hdr, res = http.get(req)
 115                 if not '200 OK' in hdr:
 116                         http = None
 117
 118                 return hdr, res, http, host
 119         return None, None, None, host
 120
 121 def get_toots(item, proxies=None, count=0, http=None, checkfn=None, user_agent='curl/7.74.0', ignore={}):
 122         toots = []
 123         _, user, host = item.split('@')
 124
 125         hdr, res, http, host = mastodon_get('/@%s' %user, http, host, proxies, user_agent)
 126
 127         timestamp = int(time.time())
 128         break_loop = False
 129
 130         while True:
 131                 toots, cursor = extract_toots(res, item, toots, timestamp, checkfn, ignore)
 132
 133                 if count == 0 or len(toots) == 0 or break_loop or (count != -1 and len(toots) >= count): break
 134                 if checkfn and not checkfn(item, toots): break
 135                 if not cursor: break
 136
 137                 #last_id = get_effective_toot_id( toots[ len(toots) - 1])
 138                 _, _, _, cursor = cursor.split('/')
 139                 hdr, res, http, host = mastodon_get('/%s' %cursor, http, host, proxies, user_agent)
 140
 141         return toots, http
 142
 143 if __name__ == '__main__':
 144         get_toots('@Decentralize_today@mastodon.social', proxies=None, count=-1, http=None, checkfn=None)