fix picture fetching
[rofl0r-twatscrape.git] / http2.py
blob0b26b005d365c0e33cdba4ca7c1a2efd942570ee
1 # -*- coding: utf-8 -*-
3 from rocksock import Rocksock, RocksockException
4 import rocksock
5 import urllib, zlib
6 import ssl, socket
7 import time, sys
def _parse_errorcode(line):
    """Split an HTTP status line into version, numeric code and message.

    `line` is expected to look like ``HTTP/1.1 302 Found``.

    Returns a ``(version, code, message)`` tuple:
      - `version` is the text before the first space (the whole line when
        there is no space at all),
      - `code` is the status code as an int, or -1 when it is missing or
        not numeric,
      - `message` is everything after the code, or '' when absent.
    """
    ver, sep, rest = line.partition(' ')
    if not sep:
        return line, -1, ''
    code_text, _, msg = rest.partition(' ')
    try:
        err = int(code_text)
    except ValueError:
        # malformed status line: report it the same way as a line without
        # any status code instead of blowing up in the caller
        err = -1
    return ver, err, msg
def _parse_url(url):
    """Split a URL (or a redirect target) into connection parameters.

    Accepted forms (the scheme is matched case-insensitively):
      - ``https://host[:port][/path]``  default port 443, ssl True
      - ``http://host[:port][/path]``   default port 80, ssl False
      - ``//host[:port][/path]``        scheme-relative redirect; port is -1
        ("keep the current port/ssl") unless a port is given explicitly
      - ``/path``                       host-relative redirect; port is 0
        and host is ''

    Returns a ``(host, port, ssl, path)`` tuple.  `path` always starts with
    '/' so it can be used directly as a request target; a URL without a
    path yields '/', and ``host?query`` yields ``/?query``.

    Raises ValueError for any other input.
    """
    url_l = url.lower()
    if url_l.startswith('https://'):
        use_ssl, port, rest = True, 443, url[8:]
    elif url_l.startswith('http://'):
        use_ssl, port, rest = False, 80, url[7:]
    elif url_l.startswith('//'):
        # can happen with a redirect
        use_ssl, port, rest = False, -1, url[2:]
    elif url_l.startswith('/'):
        # can happen with a redirect: same host, keep the path untouched
        return "", 0, False, url
    else:
        raise ValueError('unsupported url: %r' % (url,))

    # the authority part ends at the first '/' or '?', whichever comes first
    cut = len(rest)
    for sep in ('/', '?'):
        pos = rest.find(sep)
        if pos != -1:
            cut = min(cut, pos)
    authority, path = rest[:cut], rest[cut:]
    if not path.startswith('/'):
        # covers both "no path at all" and "host?query" style redirects
        path = '/' + path

    host, colon, port_text = authority.rpartition(':')
    if not colon:
        host = authority
    elif port_text.isdigit():
        port = int(port_text)
    elif port_text:
        # the last ':' is not a port separator; keep the authority intact
        host = authority
    return host, port, use_ssl, path
def _parse_content_type(line):
    """Extract media type and charset from a Content-Type header value.

    `line` is the header value, e.g. ``text/html; charset=utf-8``.

    Returns a ``(content_type, charset)`` tuple.  `content_type` is the
    first ';'-separated part that is not a charset parameter; `charset` is
    the value of the ``charset=`` parameter with surrounding quotes
    removed.  Either element is '' when absent.  Case is preserved.
    """
    ct = ''
    cs = ''
    for part in line.split(';'):
        # parameters are usually written as "; name=value", so the leading
        # blank has to go before the prefix can be recognised
        part = part.strip()
        if part.lower().startswith('charset='):
            cs = part[len('charset='):].strip().strip('"\'')
        elif not ct:
            # only the first non-charset part is the media type; later ones
            # (boundary=..., etc.) must not overwrite it
            ct = part
    return ct, cs
# media types whose body is treated as text
TEXTUAL_CONTENT_TYPES_LIST = ['text/html', 'text/plain']


def _is_textual_content_type(ct):
    """Return True when `ct`, lower-cased, is listed in TEXTUAL_CONTENT_TYPES_LIST."""
    return ct.lower() in TEXTUAL_CONTENT_TYPES_LIST
class RsHttp():
    """Small HTTP/1.1 client that talks over a Rocksock connection.

    Requests are assembled by hand as strings, cookies are kept in
    `self.cookies`, and failed sends/receives are retried up to `max_tries`
    times after reconnecting.  `get` may retarget the instance (host, port,
    ssl) when it follows a redirect.
    """

    # upper bound on consecutive redirects followed by get()
    MAX_REDIRECTS = 16

    def __init__(self, host, port=80, ssl=False, follow_redirects=False,
                 auto_set_cookies=False, keep_alive=False, timeout=60,
                 user_agent=None, proxies=None, max_tries=10, log_errors=True,
                 verify_cert=False,
                 **kwargs):
        """Store the connection settings; no network activity happens here.

        host, port, ssl, timeout, proxies, verify_cert -- handed to Rocksock
            when (re)connecting.
        follow_redirects -- let get() follow Location headers.
        auto_set_cookies -- store cookies from Set-Cookie response headers.
        keep_alive -- send "Connection: keep-alive" instead of "close".
        user_agent -- User-Agent header value; a browser string is used when
            None or empty.
        max_tries -- number of attempts for connecting and for each request.
        log_errors -- write connection errors to stderr.
        kwargs -- accepted and ignored.
        """
        self.host = host
        self.port = port
        self.use_ssl = ssl
        self.debugreq = False
        self.follow_redirects = follow_redirects
        self.redirect_counter = 0
        self.auto_set_cookies = auto_set_cookies
        self.keep_alive = keep_alive
        self.timeout = timeout
        self.user_agent = user_agent if user_agent else 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
        self.proxies = proxies
        self.cookies = dict()
        self.max_tries = max_tries
        self.log_errors = log_errors
        self.last_rs_exception = None
        self.verify_cert = verify_cert
        self.headers = []
        # _send_raw() tests "self.conn is None", so the attribute has to
        # exist even when connect() was never called
        self.conn = None

    def get_last_rocksock_exception(self):
        """Return the most recent RocksockException seen, or None."""
        return self.last_rs_exception

    def _err_log(self, s):
        """Write `s` plus a newline to stderr when error logging is on."""
        if self.log_errors:
            sys.stderr.write(s + '\n')

    def _disconnect(self):
        """Disconnect the current connection, if there is one."""
        if self.conn is not None:
            self.conn.disconnect()

    def connect(self):
        """Open the connection; same as reconnect()."""
        return self.reconnect()

    def _key_match(self, want, got):
        """Compare two header names case-insensitively."""
        return want.lower() == got.lower()

    def _make_request(self, typ, url, extras=None):
        """Build the raw request text for method `typ` and target `url`.

        `extras` is an optional list of additional header lines.  An entry
        starting with 'p0$tD4ta:' is not a header: the text after that
        marker becomes the request body.

        Returns the complete request as a string.
        """
        extras = extras if extras else []
        lines = [typ + ' ' + url + ' HTTP/1.1']
        # the port is only spelled out when it is not one of the defaults
        if self.port != 80 and self.port != 443:
            lines.append('Host: %s:%d' % (self.host, self.port))
        else:
            lines.append('Host: %s' % (self.host))
        if self.keep_alive:
            lines.append('Connection: keep-alive')
        else:
            lines.append('Connection: close')
        lines.append('Accept: */*')
        lines.append('Accept-Encoding: gzip, deflate')
        lines.append('User-Agent: %s' % self.user_agent)
        lines.append('DNT: 1')
        lines.extend(self.headers)

        cs = ''
        for c in self.cookies:
            if cs != '':
                cs += '; '
            # a cookie without value is sent as the bare name
            if self.cookies[c] != '':
                cs += c + '=' + self.cookies[c]
            else:
                cs += c
        if cs != '':
            lines.append('Cookie: ' + cs)

        postdata = ''
        for i in extras:
            if i.startswith('p0$tD4ta:'):
                postdata = i[9:]
            else:
                lines.append(i)

        # every header line is CRLF-terminated, then a blank line, then body
        s = '\r\n'.join(lines) + '\r\n\r\n' + postdata
        if self.debugreq:
            # single-argument form: same output on Python 2, parses on 3
            print(">>>\n" + s)
        return s

    def _make_head_request(self, url, extras=None):
        """Build a HEAD request for `url`."""
        return self._make_request('HEAD', url, extras)

    def _make_get_request(self, url, extras=None):
        """Build a GET request for `url`."""
        return self._make_request('GET', url, extras)

    def _make_post_request_raw(self, url, data, extras=None):
        """Build a form-encoded POST request whose body is `data` verbatim.

        `extras` is left unmodified; the POST-specific headers are added to
        a private copy.
        """
        x = list(extras) if extras else []
        x.append('Content-Type: application/x-www-form-urlencoded')
        x.append('Content-Length: ' + str(len(data)))
        x.append('p0$tD4ta:' + data)
        return self._make_request('POST', url, x)

    def _make_post_request(self, url, values, extras=None):
        """Build a POST request with `values` url-encoded as the body."""
        data = urllib.urlencode(values)
        return self._make_post_request_raw(url, data, extras)

    def _try_gunzip(self, data):
        """Try to decompress gzip `data`.

        Returns (0, decompressed) on success, (-1, '') when zlib reports the
        stream as incomplete, and (-2, '') on any other zlib error.
        """
        try:
            res = zlib.decompress(data, 16 + zlib.MAX_WBITS)
            return 0, res
        except zlib.error as e:
            # str(e) instead of the Python-2-only e.message attribute
            if 'incomplete' in str(e):
                return -1, ''
            return -2, ''

    def _get_response(self):
        """Read one HTTP response from the connection.

        Returns a ``(header_text, body, redirect)`` tuple.  `header_text`
        holds the status line and headers, one per line, '\\n'-terminated.
        `body` is de-chunked and decompressed (gzip/deflate) and decoded
        when a textual content type declares charset utf-8.  `redirect` is
        the Location header value, or ''.

        Raises zlib.error when a gzip body cannot be decompressed.
        """
        def parse_header_fields(line):
            if ':' not in line:
                return line.rstrip(' '), ""
            if ': ' not in line:
                return line.split(':', 1)
            return line.split(': ', 1)

        chunked = False
        unzip = ''
        redirect = ''
        charset = ''
        # some sites don't set content-length, -1 will cause to fetch as much as possible
        q = -1
        res = ''

        # skip anything that precedes the status line, e.g.
        # 'HTTP/1.1 302 Found\r\n'
        l = ''
        while not l.startswith('HTTP/'):
            l = self.conn.recvline().strip()
        s = l + '\n'
        foo, code, msg = _parse_errorcode(l)

        while True:
            l = self.conn.recvline().strip()
            s += l + '\n'
            if l == '':
                break
            key, val = parse_header_fields(l)
            if self._key_match(key, 'Transfer-Encoding') and 'chunked' in val:
                chunked = True
            elif self._key_match(key, 'Set-Cookie') and self.auto_set_cookies:
                self.set_cookie(l)
            elif self._key_match(key, 'Location'):
                redirect = val
            elif self._key_match(key, 'Content-Type'):
                ct, cs = _parse_content_type(val)
                if cs.lower() == 'utf-8':
                    if _is_textual_content_type(ct):
                        charset = 'utf-8'
            elif self._key_match(key, 'Content-Encoding'):
                # coding names are compared case-insensitively
                enc = val.strip().lower()
                if enc in ('gzip', 'deflate'):
                    unzip = enc
            elif self._key_match(key, 'Content-Length'):
                q = int(val)

        # error response without a declared length: don't wait for a body
        if q == -1 and code >= 400 and code < 600:
            return (s, res, redirect)

        if not chunked:
            res = self.conn.recv(q)
        else:
            while True:
                # chunk header: hex size, optionally followed by ";extension"
                l = self.conn.recvline().strip().split(';', 1)
                if l[0] == '':
                    break
                q = int(l[0], 16)
                data = self.conn.recv(q)
                # NOTE(review): these asserts vanish under "python -O"
                assert(len(data) == q)
                res += data
                crlf = self.conn.recv(2)
                assert(crlf == '\r\n')
                if q == 0:
                    break

        if len(res) != 0:
            if unzip == 'gzip':
                ec, extr = self._try_gunzip(res)
                # keep reading while zlib says the stream is incomplete
                while ec == -1:
                    res += self.conn.recv(-1)
                    ec, extr = self._try_gunzip(res)
                if ec == -2:
                    raise zlib.error
                res = extr
            elif unzip == 'deflate':
                try:
                    res = zlib.decompress(res)
                except zlib.error:
                    # retry as a raw deflate stream without zlib header
                    res = zlib.decompress(res, -zlib.MAX_WBITS)

        if charset != '':
            res = res.decode(charset)

        if self.debugreq:
            # s always ends with '\n'; emits "<<<\n" + s + res + "\n" exactly
            # like the former print statement
            sys.stdout.write("<<<\n" + s)
            print(res)

        return (s, res, redirect)

    def reconnect(self):
        """Create a fresh Rocksock connection, trying up to max_tries times.

        Returns True once connected.  Returns False right away when the
        host name does not resolve, and False after the last failed try.
        """
        tries = 0
        while tries < self.max_tries:
            tries += 1
            try:
                self.conn = Rocksock(host=self.host, port=self.port, proxies=self.proxies, ssl=self.use_ssl, timeout=self.timeout, verifycert=self.verify_cert)
                self.conn.connect()
                return True
            except RocksockException as e:
                self.last_rs_exception = e
                if e.errortype == rocksock.RS_ET_GAI and e.error == -2:
                    # -2: Name does not resolve
                    self.conn.disconnect()
                    self.conn = None
                    return False
                self._err_log(e.get_errormessage())
                time.sleep(0.05)
            except socket.gaierror:
                self._err_log("gaie")
                time.sleep(0.05)
            except ssl.SSLError as e:
                # e.reason can be None, which must not break the log line
                self._err_log("ssle" + str(e.reason))
                time.sleep(0.05)
        return False

    def _send_and_recv_i(self, req):
        """Send `req` and read the response; ("", "", "") if sending fails."""
        if self._send_raw(req):
            return self._get_response()
        return "", "", ""

    def _send_and_recv(self, req):
        """Send `req` and read the response, retrying up to max_tries times.

        Returns the (header_text, body, redirect) tuple of _get_response(),
        or ("", "", "") when every try failed.
        """
        tries = 0
        while tries < self.max_tries:
            tries += 1
            a = self._catch(self._send_and_recv_i, None, req)
            if a is not None:
                return a
        return "", "", ""

    def _catch(self, func, failret, *args):
        """Call func(*args), recovering from connection-level errors.

        On RocksockException, IOError, EOFError or ssl.SSLError the
        connection is dropped and re-established.  Returns the result of
        `func` on success, `failret` when reconnecting fails, and None when
        the reconnect worked (so the caller can try again).
        """
        try:
            return func(*args)
        except RocksockException as e:
            self.last_rs_exception = e
        except (IOError, EOFError, ssl.SSLError):
            pass
        self._disconnect()
        if not self.reconnect():
            return failret
        return None

    def _send_raw(self, req):
        """Send `req`, connecting first when needed; True on success."""
        if self.conn is None:
            if not self.reconnect():
                return False
        res = self.conn.send(req)
        return res is not False

    def get(self, url, extras=None):
        """Fetch `url` with GET.

        `extras` is an optional list of additional header lines.  With
        follow_redirects enabled, up to MAX_REDIRECTS Location headers are
        followed, reconnecting when the redirect names another host.

        Returns ``(header_text, body)``, or ``('', '')`` when the redirect
        limit is exceeded.
        """
        req = self._make_get_request(url, extras)
        hdr, res, redirect = self._send_and_recv(req)

        if redirect != '' and self.follow_redirects:
            self.redirect_counter += 1
            if self.redirect_counter > self.MAX_REDIRECTS:
                # start from scratch next time, otherwise every later get()
                # on this instance would fail at once
                self.redirect_counter = 0
                return '', ''

            host, port, use_ssl, url = _parse_url(redirect)
            if not url.startswith('/'):
                # a request target has to start with '/'
                url = '/' + url
            if port != 0:
                # port 0 means "same host"; anything else needs a new
                # connection
                self.host = host
                if port != -1:  # -1: use existing port/ssl
                    self.port = port
                    self.use_ssl = use_ssl
                self._disconnect()
                self.conn = None
                self.reconnect()
            return self.get(url, extras)
        else:
            self.redirect_counter = 0

        return hdr, res

    def _head_i(self, url, extras=None):
        """Send a HEAD request and return the response header text.

        Returns "" when the request could not be sent.
        """
        req = self._make_head_request(url, extras)
        if not self._send_raw(req):
            return ""
        # status line first, e.g. 'HTTP/1.1 302 Found\r\n'
        l = self.conn.recvline().strip()
        s = l + '\n'
        while True:
            l = self.conn.recvline().strip()
            s += l + '\n'
            if l == '':
                break
        if self.debugreq:
            print("<<<\n" + s)
        return s

    def head(self, url, extras=None):
        """Issue a HEAD request for `url`, retrying up to max_tries times.

        Returns the response header text, or "" when every try failed.
        """
        tries = 0
        while tries < self.max_tries:
            tries += 1
            res = self._catch(self._head_i, None, url, extras)
            if res is not None:
                return res
        return ""

    def post_raw(self, url, data, extras=None):
        """POST the string `data` verbatim; returns (header_text, body)."""
        req = self._make_post_request_raw(url, data, extras)
        hdr, res, redirect = self._send_and_recv(req)
        return hdr, res

    def post(self, url, values, extras=None):
        """POST `values` url-encoded; returns (header_text, body)."""
        req = self._make_post_request(url, values, extras)
        hdr, res, redirect = self._send_and_recv(req)
        return hdr, res

    def xhr_get(self, url):
        """GET `url` with an X-Requested-With: XMLHttpRequest header."""
        return self.get(url, ['X-Requested-With: XMLHttpRequest'])

    def xhr_post(self, url, values=None):
        """POST `values` with an X-Requested-With: XMLHttpRequest header."""
        if values is None:
            values = {}
        return self.post(url, values, ['X-Requested-With: XMLHttpRequest'])

    def add_header(self, s):
        # copy a header verbatim into each request, example:
        # http.add_header("Referer: http://bbc.com")
        self.headers.append(s)

    def add_headers(self, lines):
        # copy a multi-line header chunk verbatim into each request;
        # empty lines are skipped
        for line in lines.split('\n'):
            line = line.rstrip('\r')
            if len(line):
                self.headers.append(line)

    def set_cookie(self, c):
        """Store one cookie in self.cookies.

        `c` is either a full ``Set-Cookie:`` header line or just its value.
        Only the leading ``name=value`` pair is kept; attributes after the
        first ';' are dropped.  A cookie without '=' gets the value ''.
        """
        prefix = 'set-cookie:'
        if c.lower().startswith(prefix):
            # the blank after the colon is optional in a header line
            c = c[len(prefix):].lstrip(' \t')
        c = c.split(';', 1)[0]
        name, _, value = c.partition('=')
        self.cookies[name] = value
if __name__ == '__main__':
    # manual smoke test: HEAD then GET a fixed URL with request/response
    # tracing switched on
    url = 'https://www.openssl.org/news/secadv/20170126.txt'
    host, port, use_ssl, uri = _parse_url(url)
    http = RsHttp(host=host, port=port, timeout=15, ssl=use_ssl,
                  follow_redirects=True, auto_set_cookies=True)
    http.debugreq = True
    if http.connect():
        hdr = http.head(uri)
        hdr, res = http.get(uri)
    else:
        print("sorry, couldn't connect")