from http2 import RsHttp, _parse_url
from soup_parser import soupify
from nitter import nitter_get, nitter_connect, get_nitter_instance, set_invalid_nitter
from mastodon import mastodon_get
import time, datetime, calendar
# stdlib modules and project-local helpers referenced throughout this file
import os, re, sys, hashlib
import paths, misc
from utils import retry_write, retry_makedirs
# the effective id of a twat is the retweet id, if it's a retweet
def get_effective_twat_id(twat):
	if 'rid' in twat: return twat['rid']
	return twat['id']
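
# Illustrative only (hypothetical data): a retweet stores the original
# tweet's id under 'rid', which takes precedence over its own 'id'.
#   get_effective_twat_id({'id': '1002', 'rid': '1001'})  # -> '1001'
#   get_effective_twat_id({'id': '1002'})                 # -> '1002'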
def _split_url(url):
	url = url.encode('utf-8') if isinstance(url, unicode) else url
	host, port, ssl, uri = _parse_url(url)
	result = {'host':host, 'port':port, 'ssl':ssl, 'uri':uri}
	aa = uri.split('#')
	if len(aa) > 1:
		result['anchor'] = aa[1]
	aa = aa[0].split('?')[0].split('/')
	if aa[-1] != "" and '.' in aa[-1]:
		result['filename'] = aa[-1]
	return result
def _hash(data):
	value = data.encode('utf-8') if isinstance(data, unicode) else data
	return hashlib.md5(value).hexdigest()
def _get_real_location(url, proxies=None):
	url_components = _split_url(url)
	http = RsHttp(url_components['host'], ssl=url_components['ssl'], port=url_components['port'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent="curl/7.74.0")
	if not http.connect(): return url
	hdr = http.head(url_components['uri'])
	for line in hdr.split('\n'):
		if line.lower().startswith('location: '): return line.split(': ')[1].strip()
	return url
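
# Illustrative only (hypothetical short URL, requires network access):
# resolves one level of redirection via the Location: response header,
# returning the input URL unchanged if the host is unreachable or no
# redirect is sent.
#   _get_real_location('https://t.co/abcdef')  # -> e.g. 'https://example.com/page'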
def _mirror_file(url_components, user, tid, args=None, content_type=None, force=False):
	if 'filename' in url_components:
		outname = paths.get_user(user) + '/%s-%s' % (tid, url_components['filename'])
		ext = url_components['filename'].split('.')[-1]
	else:
		ext = None
		outname = paths.get_user(user) + '/%s-%s' % (tid, url_components['uri'].split('/')[3])

	if not force and os.path.exists(outname):
		return

	http = RsHttp(url_components['host'], ssl=url_components['ssl'], port=url_components['port'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=args.proxy, user_agent="curl/7.74.0")
	## do nothing if we cannot connect
	if not http.connect(): return None

	filtre = None
	if ext is not None and args.ext: filtre = str(args.ext).split(',')

	hdr = http.head(url_components['uri'])

	# extract second part of the Content-Length: line
	value = [ str(i.split(':')[1]).strip() for i in hdr.split('\n') if i.lower().startswith('content-length:') ]
	if not len(value) or int(value[0]) > args.mirror_size: return

	# extract second part of the Content-Type: line
	value = [ str(i.split(':')[1]).strip() for i in hdr.split('\n') if i.lower().startswith('content-type:') ]
	## server does not provide Content-Type info
	if not len(value): return
	# content type contains ';' (usually when html)
	elif ';' in value[0]: value[0] = value[0].split(';')[0]
	value = value[0].split('/')

	## when filtering extensions (--ext)
	## if unset, everything is mirrored
	if filtre is not None:
		## values don't match anything
		if len(value) < 2 or (not value[0] in filtre and not value[1] in filtre): return

	# XXX : mirror html files
	## we actually don't save html files
	## what about making automated saves
	## thru the wayback machine ?
	if 'html' in value: return

	## previous http object cannot be re-used
	http = RsHttp(url_components['host'], ssl=url_components['ssl'], port=url_components['port'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=args.proxy, user_agent="curl/7.74.0")
	## do nothing if we cannot connect
	if not http.connect(): return

	extras = []
	if 'filename' in url_components and url_components['filename'] == 'card.html' and 'twitter.com' in url_components['host']:
		extras.append("Referer: https://twitter.com/")

	hdr, res = http.get(url_components['uri'], extras=extras)
	if res == '' and hdr != "":
		# print http error code when things go wrong
		print "%s%s : %s" % (url_components['host'], url_components['uri'], hdr.split('\n')[0])
		return

	res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
	filehash = _hash(res_bytes)
	out_fn = 'data/%s.%s' % (filehash, ext)
	if not os.path.exists(out_fn):
		retry_write(out_fn, res_bytes)

	if os.path.lexists(outname): os.unlink(outname)
	os.symlink('../../data/%s.%s' % (filehash, ext), outname)
def unshorten_urls(twat, proxies=None, shorteners={}):
	soup = soupify(twat["text"])
	for a in soup.body.find_all('a'):
		if not 'href' in a.attrs: continue
		href = a.attrs['href']
		comp = _split_url(href)
		if comp['host'] in shorteners:
			try: twat['text'] = twat['text'].decode('utf8').replace( href, _get_real_location(href, proxies=proxies) )
			except: pass
	return twat
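
# Illustrative only (hypothetical twat and shortener table): rewrites each
# shortened href inside the twat's html text with its resolved target.
#   twat = {'text': '<a href="https://t.co/abcdef">https://t.co/abcdef</a>'}
#   unshorten_urls(twat, shorteners={'t.co': True})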
def mirror_twat(twat, args=None):
	if 'owner' in twat:
		user = twat['owner'].lower()
	else:
		user = twat['user'].lower()

	if not os.path.isdir('data'): retry_makedirs('data')

	## soupify user's text
	soup = soupify(twat["text"])

	## try to automatically mirror links posted by the user,
	## if it matches the extension list.

	if 'c' in args.mirror and 'curl' in twat:
		url = twat['curl']
		# XXX: unsupported nitter feature
		# this displays fine when loading from twitter in a regular browser,
		# which is probably converted using some js code
		# TODO: check if nitter handles card:// stuff..
		unsupported_schemes = ['card://']
		for _us in unsupported_schemes:
			if url.startswith(_us): continue
			url_components = _split_url(url)
			url_components['filename'] = 'card.html'
			_mirror_file(url_components, user, twat['id'], args)

	if 'f' in args.mirror:
		for a in soup.body.find_all('a'):
			if 'data-expanded-url' in a.attrs:
				url_components = _split_url(a.attrs['data-expanded-url'])
				if 'filename' in url_components:
					_mirror_file(url_components, user, twat['id'], args, content_type=True)

	if 'v' in args.mirror and 'video' in twat:
		tid = str(twat['id'])
		url = 'https://twitter.com/%s/status/%s' % (twat['user'], tid)
		outname = paths.get_user(twat['user']) + '/%s.mp4' % tid
		if not os.path.exists('data/%s.mp4' % tid):
			if args.rawproxy:
				os.system('%s --proxy %s -o data/%s.mp4 %s > /dev/null 2>&1' % (args.ytdl, args.rawproxy, tid, url))
			else:
				os.system('%s -o data/%s.mp4 %s > /dev/null 2>&1' % (args.ytdl, tid, url))
		if not os.path.exists('%s' % outname) and os.path.exists('data/%s.mp4' % tid):
			os.symlink('../../data/%s.mp4' % tid, outname)

	## mirror posted pictures
	if 'images' in twat and 'i' in args.mirror:
		for x in xrange(0, len(twat['images'])):
			i = twat['images'][x]
			if '?format=' in i:
				fmt = i.split('=')[1]
				i = '%s.%s' % (i.split('?')[0], fmt)
			url_components = _split_url(i)
			_mirror_file(url_components, user, twat['id'], args)

	if 'e' in args.mirror:
		for img in soup.body.find_all('img'):
			if 'class' in img.attrs and 'Emoji' in img.attrs['class']:
				src = img.attrs['src']
				src = src.encode('utf-8') if isinstance(src, unicode) else src
				split = src.split('/')
				host = split[2]
				filename = split[-1]
				emodir = '/'.join(split[3: len(split) - 1])
				uri = '%s/%s' % (emodir, filename)
				if not os.path.isdir(emodir):
					retry_makedirs(emodir)
				if not os.path.exists('%s/%s' % (emodir, filename)):
					http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=args.proxy, user_agent="curl/7.74.0")
					while not http.connect():
						# FIXME : what should happen on connect error ?
						pass
					hdr, res = http.get('/%s' % uri)
					res = res.encode('utf-8') if isinstance(res, unicode) else res
					retry_write('%s/%s' % (emodir, filename), res)
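
# Illustrative only: mirror_twat() expects args to provide at least the
# attributes used above (mirror flags, ext filter, size cap, proxies and
# the youtube-dl binary). Normally args comes from the program's option
# parser; the Namespace below is a hypothetical stand-in.
#   from argparse import Namespace
#   args = Namespace(mirror='cfvie', ext='jpg,png,mp4', mirror_size=16*1024*1024,
#                    proxy=None, rawproxy=None, ytdl='youtube-dl')
#   mirror_twat(twat, args=args)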
def add_tweet(id, user, time, text):
	print "%s (%s) -> %s" % (user, time, id)
# twat_id looks like: '/username/status/id'
def get_twat_timestamp(twat_id):
	host = 'twitter.com'  # assumed; the original host assignment is not in this excerpt
	http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True, follow_redirects=True, auto_set_cookies=True, user_agent="curl/7.74.0")
	while not http.connect():
		# FIXME : what should happen on connect error ?
		pass
	hdr, res = http.get(twat_id)
	soup = soupify(res)
	for small in soup.body.find_all('small', attrs={'class':'time'}):
		if small.find('a').attrs["href"] == twat_id:
			for span in small.find_all('span'):
				if 'data-time' in span.attrs:
					return int(span.attrs['data-time'])
def get_twats_mobile(user, proxies=None):
	host = 'mobile.twitter.com'
	http = RsHttp(host=host, port=443, timeout=30, ssl=True, keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent="curl/7.74.0")
	# http.debugreq = True
	while not http.connect():
		# FIXME : what should happen on connect error ?
		pass
	hdr, res = http.get("/" + user)

	twats = []
	soup = soupify(res)
	tweet_id = None
	tweet_user = None
	tweet_time = None
	tweet_text = None
	for tbl in soup.body.find_all('table'): # , attrs={'class':'tweet '}):
		if not "class" in tbl.attrs: continue
		if not "tweet" in repr(tbl.attrs["class"]): continue
		for td in tbl.find_all('td'):
			cls = td.attrs["class"][0]
			#print "." + repr(cls) + "."
			if cls == "user-info":
				tweet_user = td.find('div', attrs={'class':'username'}).text.strip()
			elif cls == 'timestamp':
				a = td.find('a')
				tweet_time = a.text.strip()
				# strip the '?p=p' query string off the status link
				tweet_id = a.attrs["href"].split('?')[0]
			elif cls == 'tweet-content':
				tweet_text = td.find('div', attrs={'class':'tweet-text'}).text.strip()
		if tweet_user != None and tweet_id:
			twats.append({'id':tweet_id, 'user':tweet_user, 'time':tweet_time, 'text':tweet_text})
	return twats
def strify_tag_arr(tag_arr):
	# assumed behavior: join a tag array into a single string
	# (the original body is not in this excerpt)
	return ' '.join([str(x) for x in tag_arr])
def get_style_tag(tag, styles):
	sta = [x.strip() for x in styles.split(';')]
	for st in sta:
		if not ':' in st: continue
		tg, s = st.split(':', 1)
		if tg.strip() == tag: return s.strip()
	return None
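
# Illustrative only: extracts one property value from an inline css style string.
#   get_style_tag('background-color', 'background-color: #fff; width: 32px')  # -> '#fff'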
def fetch_nitter_picture(user, proxies, res=None, twhttp=None, nitters={}, user_agent='curl/7.74.0'):
	pic_path = paths.get_profile_pic(user)
	if os.path.isfile(pic_path): return

	if not twhttp:
		twhttp, host, nitters = nitter_connect(nitters, proxies)
		# no avail. instance, pic will be scraped another time
		if not twhttp: return

	try: hdr, res = twhttp.get("/%s" % user)
	# user does not exist
	except UnicodeDecodeError: return None

	soup = soupify(res)
	for a in soup.find_all('a', attrs={'class': 'profile-card-avatar'}):
		pic_url = a.get('href') if '://' in a.get('href') else 'https://%s%s' % (get_nitter_instance(nitters, False), a.get('href'))
		url_components = _split_url(pic_url)
		http = RsHttp(host=url_components['host'], port=url_components['port'], timeout=30, ssl=url_components['ssl'], keep_alive=True, follow_redirects=True, auto_set_cookies=True, proxies=proxies, user_agent="curl/7.74.0")
		# if connection fails, the profile picture
		# will be fetched another time
		if not http.connect(): return
		hdr, res = http.get(url_components['uri'])
		if res == '' and hdr != "":
			print('error fetching profile picture: %s' % url_components)
			return
		res_bytes = res.encode('utf-8') if isinstance(res, unicode) else res
		retry_write(pic_path, res_bytes)
def extract_twats(html, item, twats, timestamp, checkfn, nitters, blacklist, whitelist):
	# scan for the end of the <div> opened at the start of html,
	# keeping track of nesting depth
	def find_div_end(html):
		level = 0
		for i in xrange(len(html)):
			if html[i] == '<' and html[i+1] == 'd' and html[i+2] == 'i' and html[i+3] == 'v':
				level += 1
			if html[i] == '<' and html[i+1] == '/' and html[i+2] == 'd' and html[i+3] == 'i' and html[i+4] == 'v':
				level -= 1
				if level == 0:
					return i + len('</div>')
		return -1

	regex = re.compile(r'<div.*class.*[" ]timeline.item[" ]')
	cursor = None
	for a in soupify(html).body.find_all('a'):
		href = a.get('href')
		if href and href.find('cursor=') != -1:
			cursor = a.get('href')
			break

	nfetched = 0
	while 1:
		match = regex.search(html)
		if not match: break
		html = html[match.start():]
		div_end = find_div_end(html)
		chunk = html[:div_end]
		html = html[div_end:]
		twats = extract_twat(chunk, twats, timestamp, nitters, blacklist, whitelist)
		nfetched += 1
		# if the first two (the very first could be pinned) tweets are already known
		# do not waste cpu processing more html
		if nfetched == 2 and checkfn and not checkfn(item, twats):
			break
	return twats, cursor
361 """ this function might require some love """
362 def nitter_time_to_timegm(nt
):
363 nt
= nt
.encode('utf-8') if isinstance(nt
, unicode) else nt
365 if nt
.find('/') == -1:
366 months
= {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12 }
367 ampm
= nt
.split(' ')[5]
368 mon
= months
[nt
.split(' ')[0]]
369 day
= nt
.split(' ')[1].strip(',')
370 yea
= nt
.split(' ')[2]
371 hou
= int(nt
.split(' ')[4].split(':')[0])
372 min = nt
.split(' ')[4].split(':')[1]
373 strp
= datetime
.datetime
.strptime('%s-%s-%s %s:%s:00 %s' % (int(yea
), int(mon
), int(day
), int(hou
), int(min), ampm
), '%Y-%m-%d %H:%M:%S %p')
374 dd
, tt
= str(strp
).split(' ')
375 yea
, mon
, day
= dd
.split('-')
376 hou
, min, sec
= tt
.split(':')
378 dtdt
= datetime
.datetime(int(yea
), int(mon
), int(day
), int(hou
), int(min), int(sec
))
383 t
= nt
[1].strip().split(':')
384 dtdt
= datetime
.datetime(int(d
[2]), int(d
[1]), int(d
[0]), int(t
[0]), int(t
[1]))
385 return calendar
.timegm(dtdt
.timetuple())
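
# Illustrative only: the two input shapes this parser accepts, inferred from
# the code above (exact nitter output may differ between versions):
#   nitter_time_to_timegm('Dec 25, 2020 · 10:17 PM UTC')  # absolute style, no '/'
#   nitter_time_to_timegm('25/12/2020, 22:17:05')         # dd/mm/yyyy style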
def extract_twat(html, twats, timestamp, nitters={}, blacklist={}, whitelist={}):
	soup = soupify(html)
	for div in soup.body.find_all('div', attrs={'class':'timeline-item'}):
		images = []
		video = 0
		quote_tweet = None
		retweet_id = None
		retweet_user = None
		card_url = None
		card_title = None
		card_description = None
		card_destination = None

		pinned = ('pinned' in div.attrs["class"])

		tweet_id = div.find('a', attrs={'class': 'tweet-link'}).get('href').split('/')[3].split('#')[0]
		tweet_user = div.find('a', attrs={'class': 'username'}).get('title').lstrip('@').lower()
		if tweet_user in blacklist or (len(whitelist) and not tweet_user in whitelist): continue

		tt = ''.join( [ i.string for i in div.find('div', attrs={'class': 'tweet-content'}).contents ] )
		tweet_text = tt.encode('utf-8') if isinstance(tt, unicode) else tt
		tweet_time = nitter_time_to_timegm( div.find('span', attrs={'class': 'tweet-date'}).find('a').get('title') )

		rt = div.find('div', attrs={'class': 'retweet-header'})
		if rt:
			retweet_user = div.find('a', attrs={'class':'username'}).get('title').lstrip('@').lower()
			if retweet_user != tweet_user: retweet_id = tweet_id
			else: retweet_user = None

		# user quotes someone else
		qdiv = div.find('div', attrs={'class': 'quote-big'})
		if qdiv:
			quoted = qdiv.find('div', attrs={'class':'quote-text'})
			if quoted:
				quote_link = qdiv.find('a', attrs={'class': 'quote-link'}).get('href')
				quser = quote_link.split('/')[1]
				if quser in blacklist: continue
				qtext = quoted.get_text()
				if isinstance(qtext, unicode): qtext = qtext.encode('utf-8')
				qid = quote_link.split('/')[3].split('#')[0]
				qtime = qdiv.find('span', attrs={'class': 'tweet-date'}).find('a').get('title')
				if qtime: qtime = nitter_time_to_timegm( qtime )
				quote_tweet = {
					'id': qid,
					'user': quser.lower(),
					'text': qtext,
					'time': qtime,
				}

		attachments_div = div.find('div', attrs={'class': 'attachments'})
		if attachments_div:
			for img in attachments_div.find_all('img'):
				images.append('https://%s%s' % (get_nitter_instance(nitters, False), img.get('src')))
			for vid in attachments_div.find_all('video'):
				video = 1
				bg = vid.get('poster')
				images.append('https://%s%s' % (get_nitter_instance(nitters, False), bg))

		card_div = div.find('div', attrs={'class': 'card'})
		if card_div:
			for a in card_div.find_all('a'):
				if 'class' in a.attrs and 'card-container' in a.attrs['class']:
					card_url = a.get('href')
			for h2 in card_div.find_all('h2'):
				if 'class' in h2.attrs and 'card-title' in h2.attrs['class']:
					card_title = h2.get_text()
			for p in card_div.find_all('p'):
				if 'class' in p.attrs and 'card-description' in p.attrs['class']:
					card_description = p.get_text()
			# card destination (OK)
			for span in card_div.find_all('span'):
				if 'class' in span.attrs and 'card-destination' in span.attrs['class']:
					card_destination = span.get_text()

		if tweet_user != None and tweet_id:
			vals = {'id':tweet_id, 'user':tweet_user, 'time':tweet_time, 'text':tweet_text, 'fetched':timestamp}
			if retweet_id: vals['rid'] = retweet_id
			if card_url: vals['curl'] = card_url
			if card_title: vals['ctitle'] = card_title
			if card_description: vals['cdesc'] = card_description
			if card_destination: vals['cdest'] = card_destination
			if images: vals['images'] = images
			if quote_tweet: vals['quote'] = quote_tweet
			if pinned: vals['pinned'] = 1
			if video: vals['video'] = 1
			# save order of timeline by storing id of next twat
			# next is equivalent to the next-newer twat.
			if len(twats) and not 'pinned' in twats[len(twats)-1]:
				next_twat = twats[len(twats)-1]
				vals['next'] = next_twat['id']
				if 'rid' in next_twat:
					if 'rid_time' in next_twat:
						pr_time = next_twat['rid_time'] - 1
					else:
						pr_time = next_twat['time'] - 1
					if pr_time != 0: vals['rid_time'] = pr_time
			if not vals in twats: twats.append(vals)
	return twats
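
# Shape of a harvested twat as appended above (illustrative values only;
# optional keys appear only when present in the scraped html):
#   {'id': '1015320873044234240', 'user': 'someuser', 'time': 1531100000,
#    'text': 'hello world', 'fetched': 1531100100, 'rid': '...', 'curl': '...',
#    'images': ['https://...'], 'pinned': 1, 'video': 1, 'next': '...'}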
# count: specify the number of twats that shall be fetched.
# the actual number delivered could be slightly more than specified.
# if 0 is specified, only the most recent page (containing typically 20 tweets)
# is harvested. if -1 is specified, the entire timeline will be harvested back
# to the very first tweet.
# if checkfn is passed, it'll be called with the username and current list of
# received twats, and can decide whether fetching will be continued or not,
# by returning True (continue) or False (stop).
# (a usage sketch follows after this function.)
def get_twats(item, proxies=None, count=0, http=None, checkfn=None, nitters={}, host=None, search=False, user_agent="curl/7.74.0", blacklist={}, whitelist={}, maxpage=1000):
	query = '/search?f=tweets&q=%s' % item.strip('#') if search else '/%s' % item
	twats = []
	known_cursors = []
	page = 1
	break_loop = False
	elapsed_time = time.time()

	hdr, res, http, host, nitters = nitter_get(query, http, host, nitters, proxies, user_agent)

	# make sure all tweets fetched in a single invocation get the same timestamp,
	# otherwise ordering might become messed up, once we sort them
	timestamp = int(time.time())

	while True:
		twats, cursor = extract_twats(res, item, twats, timestamp, checkfn, nitters, blacklist, whitelist)
		sys.stdout.write('\r[%s] %s: scraping... p:%d ' % (misc.get_timestamp("%Y-%m-%d %H:%M:%S", elapsed_time), item, page))
		sys.stdout.flush()
		if count == 0 or (not len(twats) and not cursor) or break_loop or (count != -1 and len(twats) >= count): break
		if checkfn and not checkfn(item, twats): break

		# fetch additional tweets that are not in the initial set of 20:
		if len(twats): last_id = get_effective_twat_id(twats[len(twats)-1])

		# we scraped everything
		if not cursor or (maxpage > 0 and page >= maxpage) or cursor in known_cursors: break
		known_cursors.append(cursor)
		query = '/search?f=tweets&q=%s%s' % (item.strip('#'), cursor) if search else '/%s%s' % (item, cursor)
		print('cursor: %s, query: %s' % (cursor, query))
		hdr, res, http, host, nitters = nitter_get(query, http, host, nitters, proxies, user_agent)
		page += 1

	return twats, nitters, host, http, page
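
# Usage sketch (illustrative; 'seen' is a hypothetical set of already-known
# tweet ids maintained by the caller). checkfn receives the item and the
# twats collected so far, and returns True to keep fetching, False to stop:
#
#   seen = set()
#   def _check(item, twats):
#       return any(t['id'] not in seen for t in twats)
#
#   twats, nitters, host, http, pages = get_twats('some_user', count=-1, checkfn=_check, maxpage=10)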
if __name__ == '__main__':
	print repr( get_twats('realdonaldtrump') )
	# print repr( get_twats('FLOTUS') )
	# get_twat_timestamp('/2runtherace/status/1015320873044234240')