fix picture fetching
[rofl0r-twatscrape.git] / twat.py
blob 4b5dd298c6612184fd591dc40e4e2f38188ecb86
from http2 import RsHttp, _parse_url
from soup_parser import soupify
from nitter import nitter_get, nitter_connect, get_nitter_instance, set_invalid_nitter
from mastodon import mastodon_get
import time, datetime, calendar
import json
import os.path
import hashlib
import re
import random
import paths
import misc
import sys
import rsparse
from utils import retry_write, retry_makedirs

# the effective id of a twat is the retweet id, if it's a retweet
def get_effective_twat_id(twat):
    if 'rid' in twat: return twat['rid']
    return twat['id']
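# Example (ids are made up): a plain twat {'id': '123'} has effective id '123',
# while a retweet {'id': '456', 'rid': '123'} resolves to the retweeted id '123',
# so both copies of the same content collapse onto one id.
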
def _split_url(url):
    url = url.encode('utf-8') if isinstance(url, unicode) else url
    host, port, ssl, uri = _parse_url(url)
    result = {'host':host, 'port':port, 'ssl':ssl, 'uri':uri}
    aa = uri.split('#')
    if len(aa) > 1:
        result['anchor'] = aa[1]
    else:
        aa = uri.split('/')
        if aa[-1] != "" and '.' in aa[-1]:
            result['filename'] = aa[-1]
    return result
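# Rough sketch of the returned dict, assuming _parse_url() splits the URL the
# usual way (example values only):
#   _split_url('https://example.com/media/pic.jpg')
#     -> {'host': 'example.com', 'port': 443, 'ssl': True,
#         'uri': '/media/pic.jpg', 'filename': 'pic.jpg'}
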
def _hash(str):
    value = str.encode('utf-8') if isinstance(str, unicode) else str
    return hashlib.md5(value).hexdigest()

def _get_real_location(url, proxies=None):
    url_components = _split_url(url)

    http = RsHttp(url_components['host'], ssl=url_components['ssl'], port=url_components['port'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent="curl/7.74.0")

    if not http.connect(): return url
    hdr = http.head(url_components['uri'])

    for line in hdr.split('\n'):
        # split only on the first ': ' so target urls containing ': ' survive intact
        if line.lower().startswith('location: '): return line.split(': ', 1)[1].strip()

    return url
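# Usage sketch: resolve a shortened link to its redirect target, falling back
# to the original URL when the HEAD request fails or no Location header shows
# up ('t.co' is only an illustrative shortener host):
#   real = _get_real_location('https://t.co/abcdef')
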
def _mirror_file(url_components, user, tid, args=None, content_type=None, force=False):
    if 'filename' in url_components:
        outname = paths.get_user(user) + '/%s-%s' % (tid, url_components['filename'])
        ext = url_components['filename'].split('.')[-1]
    else:
        outname = paths.get_user(user) + '/%s-%s' % (tid, url_components['uri'].split('/')[3])
        ext = None
    if not force and os.path.exists(outname):
        return

    http = RsHttp(url_components['host'], ssl=url_components['ssl'], port=url_components['port'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=args.proxy, user_agent="curl/7.74.0")

    ## do nothing if we cannot connect
    if not http.connect(): return None

    if content_type:

        if ext is not None and args.ext: filtre = str(args.ext).split(',')
        else: filtre = []

        hdr = http.head(url_components['uri'])

        ## max mirror size
        if args.mirror_size:
            # extract second part of the Content-Length: line
            value = [ str(i.split(':')[1]).strip() for i in hdr.split('\n') if i.lower().startswith('content-length:') ]
            if not len(value) or int(value[0]) > args.mirror_size: return

        # extract second part of the Content-Type: line
        value = [ str(i.split(':')[1]).strip() for i in hdr.split('\n') if i.lower().startswith('content-type:') ]

        ## server does not provide Content-Type info
        if not len(value): return
        # content type contains ';' (usually when html)
        elif ';' in value[0]: value[0] = value[0].split(';')[0]
        value = value[0].split('/')

        ## when filtering extensions (--ext)
        ## if unset, everything is mirrored
        if len(filtre):
            ## values don't match anything
            if len(value) < 2 or (not value[0] in filtre and not value[1] in filtre): return

        # XXX : mirror html files
        ## we actually don't save html files
        ## what about making automated save
        ## thru the wayback machine ?
        if 'html' in value: return

    ## previous http object cannot be re-used
    http = RsHttp(url_components['host'], ssl=url_components['ssl'], port=url_components['port'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=args.proxy, user_agent="curl/7.74.0")

    ## do nothing if we cannot connect
    if not http.connect(): return

    extras = []
    if 'filename' in url_components and url_components['filename'] == 'card.html' and 'twitter.com' in url_components['host']:
        extras.append("Referer: https://twitter.com/")

    hdr, res = http.get(url_components['uri'], extras=extras)
    if res == '' and hdr != "":
        # print http error code when things go wrong
        print "%s%s : %s" % (url_components['host'], url_components['uri'], hdr.split('\n')[0])
        return

    res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
    filehash = _hash(res_bytes)
    out_fn = 'data/%s.%s' % (filehash, ext)
    if not os.path.exists(out_fn):
        retry_write(out_fn, res_bytes)

    if os.path.lexists(outname): os.unlink(outname)
    os.symlink('../../data/%s.%s' % (filehash, ext), outname)
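# Storage layout sketch (paths illustrative): the payload is written once,
# content-addressed by its md5 under data/, and each twat gets a per-user
# symlink pointing back at it, so attachments shared by several twats are
# only downloaded once:
#   data/0123abcd<...>.jpg
#   <userdir>/<tid>-pic.jpg -> ../../data/0123abcd<...>.jpg
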
def unshorten_urls(twat, proxies=None, shorteners={}):
    soup = soupify(twat["text"])
    for a in soup.body.find_all('a'):
        if not 'href' in a.attrs: continue
        href = a.attrs['href']
        comp = _split_url(href)
        if comp['host'] in shorteners:
            try: twat['text'] = twat['text'].decode('utf8').replace( href, _get_real_location(href, proxies=proxies))
            except: pass

    return twat
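# Usage sketch (hypothetical shortener table): rewrite every shortened href in
# the twat's text with its resolved target before the twat is stored.
#   twat = unshorten_urls(twat, proxies=None, shorteners={'t.co': 1, 'bit.ly': 1})
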
def mirror_twat(twat, args=None):

    if 'owner' in twat:
        user = twat['owner'].lower()
    else:
        user = twat['user'].lower()

    if not os.path.isdir('data'): retry_makedirs('data')

    ## soupify user's text
    soup = soupify(twat["text"])

    ## try to automatically mirror links posted by the user,
    ## if it matches the extension list.

    if 'c' in args.mirror and 'curl' in twat:
        url = twat['curl']
        # XXX: unsupported nitter feature
        # this displays fine when loading from twitter in a regular browser,
        # which is probably converted using some js code
        # TODO: check if nitter handles card:// stuff..
        unsupported_schemes = ['card://']
        # only mirror the card when its url does not use an unsupported scheme
        if not any(url.startswith(scheme) for scheme in unsupported_schemes):
            url_components = _split_url(url)
            url_components['filename'] = 'card.html' #% twat['id']
            _mirror_file(url_components, user, twat['id'], args)

    if 'f' in args.mirror:
        for a in soup.body.find_all('a'):
            if 'data-expanded-url' in a.attrs:
                url_components = _split_url(a.attrs['data-expanded-url'])

                if 'filename' in url_components:
                    _mirror_file(url_components, user, twat['id'], args, content_type=True)

    ## mirror videos
    if 'v' in args.mirror and 'video' in twat:
        tid = str(twat['id'])
        url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
        outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
        if not os.path.exists('data/%s.mp4' % tid):
            if args.proxy:
                os.system('%s --proxy %s -o data/%s.mp4 %s > /dev/null 2>&1' % (args.ytdl, args.rawproxy, tid, url))
            else:
                os.system('%s -o data/%s.mp4 %s > /dev/null 2>&1' % (args.ytdl, tid, url))
        if not os.path.exists('%s' % outname) and os.path.exists('data/%s.mp4' % tid):
            os.symlink('../../data/%s.mp4' % tid, outname)

    ## mirror posted pictures
    if 'images' in twat and 'i' in args.mirror:

        for i in twat['images']:

            if '?format=' in i:
                i = i.split('&')[0]
                fmt = i.split('=')[1]
                i = '%s.%s' % (i.split('?')[0], fmt)

            url_components = _split_url(i)
            _mirror_file(url_components, user, twat['id'], args)

    ## deal with emojis
    if 'e' in args.mirror:
        for img in soup.body.find_all('img'):
            if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
                src = img.attrs['src']
                src = src.encode('utf-8') if isinstance(src, unicode) else src

                split = src.split('/')
                host = split[2]
                emodir = '/'.join(split[3: len(split) - 1])
                filename = split[-1]
                uri = '%s/%s' % (emodir, filename)

                if not os.path.isdir(emodir):
                    retry_makedirs(emodir)

                if not os.path.exists('%s/%s' % (emodir, filename)):
                    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=args.proxy, user_agent="curl/7.74.0")
                    while not http.connect():
                        # FIXME : what should happen on connect error ?
                        pass
                    hdr, res = http.get('/%s' % uri)
                    res = res.encode('utf-8') if isinstance(res, unicode) else res
                    retry_write('%s/%s' % (emodir, filename), res)
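# Invocation sketch: args normally comes from the scraper's argparse setup; the
# Namespace below is only illustrative and lists the attributes mirror_twat()
# actually touches (mirror flags, --ext filter, size cap, proxy settings, the
# youtube-dl binary):
#   import argparse
#   fake_args = argparse.Namespace(mirror='fiv', ext='jpg,png', mirror_size=0,
#                                  proxy=None, rawproxy=None, ytdl='youtube-dl')
#   mirror_twat({'id': '123', 'user': 'someuser', 'text': '<p>hi</p>'}, args=fake_args)
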
def add_tweet(id, user, time, text):
    print "%s (%s) -> %s" % (user, time, id)
    print text

# twat_id looks like: '/username/status/id'
def get_twat_timestamp(twat_id):
    host = 'twitter.com'
    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True, follow_redirects=True, auto_set_cookies=True, user_agent="curl/7.74.0")
    while not http.connect():
        # FIXME : what should happen on connect error ?
        pass
    hdr, res = http.get(twat_id)
    soup = soupify(res)
    for small in soup.body.find_all('small', attrs={'class':'time'}):
        if small.find('a').attrs["href"] == twat_id:
            for span in small.find_all('span'):
                if 'data-time' in span.attrs:
                    return int(span.attrs['data-time'])
    return 0

def get_twats_mobile(user, proxies=None):
    host = 'mobile.twitter.com'
    http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent="curl/7.74.0")
    # http.debugreq = True
    while not http.connect():
        # FIXME : what should happen on connect error ?
        pass
    hdr, res = http.get("/" + user)

    twats = []

    soup = soupify(res)
    tweet_id = 0
    tweet_user = None
    tweet_time = None
    tweet_text = None

    for tbl in soup.body.find_all('table'): # , attrs={'class':'tweet '}):
        if not "class" in tbl.attrs: continue
        if not "tweet" in repr(tbl.attrs["class"]): continue
        for td in tbl.find_all('td'):
            cls = td.attrs["class"][0]
            #print "." + repr(cls) + "."
            if cls == "user-info":
                tweet_user = td.find('div', attrs={'class':'username'}).text.strip()
            elif cls == 'timestamp':
                a = td.find('a')
                tweet_time = a.text
                # drop the trailing "?p=p" query string, keeping only the status path
                tweet_id = a.attrs["href"].split('?')[0]
            elif cls == 'tweet-content':
                tweet_text = td.find('div', attrs={'class':'tweet-text'}).text.strip()
        if tweet_user != None and tweet_id:
            twats.append({'id':tweet_id, 'user':tweet_user, 'time':tweet_time, 'text':tweet_text})

    return twats

def strify_tag_arr(tag_arr):
    pass

def get_style_tag(tag, styles):
    sta = [x.strip() for x in styles.split(';')]
    for st in sta:
        # skip empty chunks (e.g. after a trailing ';') that would break the unpack below
        if not ':' in st: continue
        tg, s = st.split(':', 1)
        if tg.strip() == tag: return s.strip()
    return None
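# Example: pull a single property out of an inline style attribute.
#   get_style_tag('background-image', 'background-image: url(/pic.jpg); width: 100%')
#     -> 'url(/pic.jpg)'
#   get_style_tag('color', 'width: 100%')
#     -> None
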
def fetch_nitter_picture(user, proxies, res=None, twhttp=None, nitters={}, user_agent='curl/7.74.0'):
    pic_path = paths.get_profile_pic(user)
    if os.path.isfile(pic_path): return

    if not res:
        while not twhttp:
            twhttp, host, nitters = nitter_connect(nitters, proxies)
            # no avail. instance, pic will be scraped another time
            if not twhttp: return

        try: hdr, res = twhttp.get("/%s" % user)
        # user does not exist
        except UnicodeDecodeError: return None

    soup = soupify(res)
    for a in soup.find_all('a', attrs={'class': 'profile-card-avatar'}):
        pic_url = a.get('href') if '://' in a.get('href') else 'https://%s%s' % (get_nitter_instance(nitters, False), a.get('href'))
        url_components = _split_url(pic_url)
        http = RsHttp(host=url_components['host'], port=url_components['port'], timeout=30, ssl=url_components['ssl'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent="curl/7.74.0")

        # if connection fails, the profile picture
        # will be fetched another time
        if not http.connect(): return

        hdr, res = http.get(url_components['uri'])
        if res == '' and hdr != "":
            print('error fetching profile picture: %s' % url_components)
        else:
            res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
            retry_write(pic_path, res_bytes)
        return
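# Usage sketch: the caller can hand over an already fetched profile page via
# res=..., otherwise the function connects to a nitter instance on its own and
# silently gives up (to retry on a later run) when none is reachable:
#   fetch_nitter_picture('someuser', proxies=None)
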
def extract_twats(html, item, twats, timestamp, checkfn, nitters, blacklist, whitelist):
    def find_div_end(html):
        level = 0
        for i in xrange(len(html)):
            if html[i] == '<' and html[i+1] == 'd' and html[i+2] == 'i' and html[i+3] == 'v':
                level += 1
            if html[i] == '<' and html[i+1] == '/' and html[i+2] == 'd' and html[i+3] == 'i' and html[i+4] == 'v':
                level -= 1
                if level == 0:
                    return i + len('</div>')

    regex = re.compile(r'<div.*class.*[" ]timeline.item[" ]')
    nfetched = 0
    cursor = None
    for a in soupify(html).body.find_all('a'):
        href = a.get('href')
        if href and href.find('cursor=') != -1:
            cursor = a.get('href')
            break

    while 1:
        match = regex.search(html)
        if not match:
            return twats, cursor
        html = html[match.start():]
        div_end = find_div_end(html)
        slice = html[:div_end]
        html = html[div_end:]
        # parse only the timeline-item div that was just sliced out
        twats = extract_twat(slice, twats, timestamp, nitters, blacklist, whitelist)
        nfetched += 1
        # if the first two (the very first could be pinned) tweets are already known
        # do not waste cpu processing more html
        if nfetched == 2 and checkfn and not checkfn(item, twats):
            return twats, cursor
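# Pagination sketch: extract_twats() returns the twats found on one page plus
# the 'cursor=' link nitter embeds for the next page, so a caller loops roughly
# like this (argument values are illustrative):
#   twats, cursor = extract_twats(res, item, [], int(time.time()), None, {}, {}, {})
#   next_query = '/%s%s' % (item, cursor) if cursor else None
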
def nitter_time_to_timegm(nt):
    """ this function might require some love """
    nt = nt.encode('utf-8') if isinstance(nt, unicode) else nt
    # new date format
    if nt.find('/') == -1:
        months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12 }
        ampm = nt.split(' ')[5]
        mon = months[nt.split(' ')[0]]
        day = nt.split(' ')[1].strip(',')
        yea = nt.split(' ')[2]
        hou = int(nt.split(' ')[4].split(':')[0])
        min = nt.split(' ')[4].split(':')[1]
        # %I (12-hour clock) is needed here: strptime ignores %p when the hour
        # is parsed with %H, which would leave PM times 12 hours early
        dtdt = datetime.datetime.strptime('%s-%s-%s %s:%s:00 %s' % (int(yea), int(mon), int(day), int(hou), int(min), ampm), '%Y-%m-%d %I:%M:%S %p')
    # old time format
    else:
        nt = nt.split(',')
        d = nt[0].split('/')
        t = nt[1].strip().split(':')
        dtdt = datetime.datetime(int(d[2]), int(d[1]), int(d[0]), int(t[0]), int(t[1]))
    return calendar.timegm(dtdt.timetuple())
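# Both nitter date styles are accepted; the sample strings below are only
# illustrative of the two shapes handled above:
#   nitter_time_to_timegm('Dec 25, 2020 · 7:21 PM UTC')   # new format, 12-hour clock
#   nitter_time_to_timegm('25/12/2020, 19:21')            # old format, day/month/year
# both return the matching UTC unix timestamp via calendar.timegm().
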
def extract_twat(html, twats, timestamp, nitters={}, blacklist={}, whitelist={}):
    soup = soupify(html)
    for div in soup.body.find_all('div', attrs={'class':'timeline-item'}):

        tweet_id = 0
        tweet_user = None
        tweet_time = None
        tweet_text = None
        retweet_id = 0
        retweet_user = None
        card_url = None
        card_title = None
        card_description = None
        card_destination = None
        images = None
        quote_tweet = None
        video = False

        pinned = ('pinned' in div.attrs["class"])

        tweet_id = div.find('a', attrs={'class': 'tweet-link'}).get('href').split('/')[3].split('#')[0]
        tweet_user = div.find('a', attrs={'class': 'username'}).get('title').lstrip('@').lower()
        if tweet_user in blacklist or (len(whitelist) and not tweet_user in whitelist): continue

        tt = ''.join( [ i.string for i in div.find('div', attrs={'class': 'tweet-content'}).contents ] )
        tweet_text = tt.encode('utf-8') if isinstance(tt, unicode) else tt
        tweet_time = nitter_time_to_timegm( div.find('span', attrs={'class': 'tweet-date'}).find('a').get('title') )

        # it's a retweet
        rt = div.find('div', attrs={'class': 'retweet-header'})
        if rt is not None:
            retweet_user = div.find('a', attrs={'class':'username'}).get('title').lstrip('@').lower()
            if retweet_user != tweet_user: retweet_id = tweet_id
            else: retweet_user = None

        # user quotes someone else
        qdiv = div.find('div', attrs={'class': 'quote-big'})
        if qdiv:
            quoted = qdiv.find('div', attrs={'class':'quote-text'})
            if quoted:
                quote_link = qdiv.find('a', attrs={'class': 'quote-link'}).get('href')
                quser = quote_link.split('/')[1]
                if quser in blacklist: continue
                qtext = quoted.get_text()
                if isinstance(qtext, unicode): qtext = qtext.encode('utf-8')
                qid = quote_link.split('/')[3].split('#')[0]
                qtime = qdiv.find('span', attrs={'class': 'tweet-date'}).find('a').get('title')
                if qtime: qtime = nitter_time_to_timegm( qtime )
                quote_tweet = {
                    'user': quser.lower(),
                    'id': qid,
                    'text': qtext,
                    'time': qtime
                }
        # find attachments
        attachments_div = div.find('div', attrs={'class': 'attachments'})
        if attachments_div:
            images = []
            for img in attachments_div.find_all('img'):
                images.append('https://%s%s' % (get_nitter_instance(nitters, False), img.get('src')))

            for vid in attachments_div.find_all('video'):
                video = True
                bg = vid.get('poster')
                images.append('https://%s%s' % (get_nitter_instance(nitters, False), bg))

        # card div..
        card_div = div.find('div', attrs={'class': 'card'})
        if card_div:
            # card url (OK)
            for a in card_div.find_all('a'):
                if 'class' in a.attrs and 'card-container' in a.attrs['class']:
                    card_url = a.get('href')
                    break
            # card title (OK)
            for h2 in card_div.find_all('h2'):
                if 'class' in h2.attrs and 'card-title' in h2.attrs['class']:
                    card_title = h2.get_text()
                    break
            # card description
            for p in card_div.find_all('p'):
                if 'class' in p.attrs and 'card_description' in p.attrs['class']:
                    print('got card description')
                    card_description = p.get_text()
                    break
            # card destination (OK)
            for span in card_div.find_all('span'):
                if 'class' in span.attrs and 'card-destination' in span.attrs['class']:
                    card_destination = span.get_text()
                    break

        if tweet_user != None and tweet_id:
            vals = {'id':tweet_id, 'user':tweet_user, 'time':tweet_time, 'text':tweet_text, 'fetched':timestamp}
            if retweet_id: vals['rid'] = retweet_id
            if card_url: vals['curl'] = card_url
            if card_title: vals['ctitle'] = card_title
            if card_description: vals['cdesc'] = card_description
            if card_destination: vals['cdest'] = card_destination
            if images: vals['images'] = images
            if quote_tweet: vals['quote'] = quote_tweet
            if pinned: vals['pinned'] = 1
            if video: vals['video'] = 1
            # save order of timeline by storing id of next twat
            # next is equivalent to the next-newer twat.
            if len(twats) and not 'pinned' in twats[-1]:
                next_twat = twats[-1]
                if len(next_twat):
                    vals['next'] = next_twat['id']
                    if retweet_id:
                        pr_time = 0
                        if 'rid' in next_twat:
                            if 'rid_time' in next_twat:
                                pr_time = next_twat['rid_time'] - 1
                            else:
                                pr_time = next_twat['time'] - 1
                        if pr_time != 0: vals['rid_time'] = pr_time

            if not vals in twats: twats.append(vals)
    return twats
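# Shape of one extracted twat (optional keys only appear when present; all the
# values here are made up):
#   {'id': '1337', 'user': 'someuser', 'time': 1608924060, 'fetched': 1608930000,
#    'text': 'hello', 'rid': '1336', 'images': ['https://nitter.example/pic/abc.jpg'],
#    'pinned': 1, 'video': 1, 'next': '1338'}
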
# count: specify the number of twats that shall be fetched.
# the actual number delivered could be slightly more than specified.
# if 0 is specified, only the most recent page (containing typically 20 tweets)
# is harvested. if -1 is specified, the entire timeline will be harvested back
# to the very first tweet.
# if checkfn is passed, it'll be called with the username and current list of
# received twats, and can decide whether fetching will be continued or not,
# by returning True (continue) or False.
def get_twats(item, proxies=None, count=0, http=None, checkfn=None, nitters={}, host=None, search=False, user_agent="curl/7.74.0", blacklist={}, whitelist={}, maxpage=1000):
    query = '/search?f=tweets&q=%s' % item.strip('#') if search else '/%s' % item

    page = 1
    elapsed_time = time.time()

    hdr, res, http, host, nitters = nitter_get(query, http, host, nitters, proxies, user_agent)

    # make sure all tweets fetched in a single invocation get the same timestamp,
    # otherwise ordering might become messed up, once we sort them
    timestamp = int(time.time())

    known_cursors = []
    twats = []
    break_loop = False

    while True:
        twats, cursor = extract_twats(res, item, twats, timestamp, checkfn, nitters, blacklist, whitelist)
        sys.stdout.write('\r[%s] %s: scraping... p:%d ' % (misc.get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), item, page))
        sys.stdout.flush()
        if count == 0 or (not len(twats) and not cursor) or break_loop or (count != -1 and len(twats) >= count): break
        if checkfn and not checkfn(item, twats): break

        # fetch additional tweets that are not in the initial set of 20:
        if len(twats): last_id = get_effective_twat_id(twats[-1])

        # we scraped everything
        if not cursor or (maxpage > 0 and page >= maxpage) or cursor in known_cursors: break
        known_cursors.append(cursor)
        query = '/search?f=tweets&q=%s%s' % (item.strip('#'), cursor) if search else '/%s%s' % (item, cursor)
        print('cursor: %s, query: %s' % (cursor, query))
        hdr, res, http, host, nitters = nitter_get(query, http, host, nitters, proxies, user_agent)
        page = page + 1

    return twats, nitters, host, http, page
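# Call sketch (illustrative): fetch roughly 100 twats from a user timeline,
# letting the nitter helpers pick an instance and reuse the connection:
#   twats, nitters, host, http, pages = get_twats('someuser', count=100)
# or walk a hashtag search instead of a timeline:
#   twats, nitters, host, http, pages = get_twats('#something', count=-1, search=True)
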
if __name__ == '__main__':
    print repr( get_twats('realdonaldtrump') )
    # print repr( get_twats('FLOTUS') )
    # get_twat_timestamp('/2runtherace/status/1015320873044234240')