fix picture fetching
[rofl0r-twatscrape.git] / rsparse.py
blob48bedc49b0d83b0811f9f72d61d4344de446497a
# generator
def find_all_tags(content, tag):
    """Yield each ``<tag ...>...</tag>`` substring of *content*, in order.

    An opening tag is recognised where ``'<' + tag`` is immediately followed
    by a space, tab, newline, ``/`` or ``>``; this keeps ``<a`` from matching
    ``<abbr>``.  An occurrence at the very end of *content*, with no
    following character, is not treated as an opening tag.  Each yielded
    span runs from that ``<`` up to and including the first ``</tag>`` found
    after the tag name.

    Matching is a plain, case-sensitive string comparison.  Nesting is not
    tracked: the first closing tag after an opening tag ends the span.
    Scanning resumes right after each yielded span, so spans never overlap.
    Iteration stops as soon as no further opening tag exists, or an opening
    tag has no closing tag after it.

    content -- the text to scan (a string)
    tag     -- the tag name without angle brackets, e.g. ``'a'``
    """
    # Characters accepted directly after the tag name in an opening tag.
    tag_end = ' \t\n/>'
    opener = '<' + tag
    closer = '</%s>' % tag

    def find_next_tag_start(start):
        """Return the index of the next opening tag at or after *start*, or -1."""
        pos = content.find(opener, start)
        while pos != -1:
            after = pos + len(opener)
            # Require a delimiter after the name so longer tag names that
            # merely share this prefix are skipped.
            if after < len(content) and content[after] in tag_end:
                return pos
            pos = content.find(opener, pos + 1)
        return -1

    start = 0
    while True:
        s = find_next_tag_start(start)
        if s == -1:
            return
        # Look for the closing tag only past the opening tag's name.
        close_at = content.find(closer, s + len(opener))
        if close_at == -1:
            # No closing tag after this opener means none after any later
            # opener either, so there is nothing more to yield.
            return
        e = close_at + len(closer)
        yield content[s:e]
        start = e
if __name__ == '__main__':
    # Command-line use: print every <a ...>...</a> span found in the file
    # named by the first argument, one span per print call.
    import sys
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: %s FILE' % sys.argv[0])
    with open(sys.argv[1], "r") as h:
        s = h.read()
    for a in find_all_tags(s, 'a'):
        # Parenthesised single-argument form prints the same text on
        # Python 2 and is also valid on Python 3.
        print(a)