Don't search addresses that have already been searched, even if they appear
[ottawa-travel-planner.git] / stopMashupLeech.py
blobf15a65eeb6c6714179a5134dd83414afaff9c437
1 #!/usr/bin/python
2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
4 import sys
5 import time
6 import random
7 import re
8 import pickle
9 import signal
10 import getopt
11 import tempfile
12 import os
13 from xml.etree import ElementTree
15 import LatLongTools
16 import RectTools
17 import BusStopMashup
class SearchedAddress(object):
    """One address submitted to the stop-search web application.

    Attributes:
        referrer: what led to this address -- the seed string itself for a
            seed search, or the stop whose name was turned into `address`.
        address: the address text handed to the search.
        location: centre location reported by the search; None until a
            search records one.
    """

    def __init__(self, referrer, address):
        self.referrer = referrer
        self.address = address
        # Filled in by Leech.searchAddress once the search reports a centre.
        self.location = None

    def __repr__(self):
        fields = (self.referrer, self.address, self.location)
        return "searchaddr (referrer %s, addr %s, location %s)" % fields
class Leech(object):
    """Crawls the bus-stop web application outward from seed addresses.

    Every search returns the stops near an address.  Names of those stops
    are converted back into addresses and searched in turn, skipping
    addresses already searched and areas already covered by earlier
    searches.
    """

    def __init__(self, sleepRange):
        """Create a crawler.

        sleepRange is a (low, high) pair of seconds; a random delay drawn
        from it with random.uniform is slept between requests.
        """
        self.sleepRange = sleepRange

        # All stops we've found so far, indexed by code.
        self.allStops = {}

        # Every SearchedAddress we've fed into the web application, in the
        # order searched.  Rejected addresses stay in the list with
        # location None.
        self.searched = []

        # Stops we weren't able to parse properly.
        # Each entry is a tuple: (UnknownStopType, SearchedAddress)
        self.unparsedStops = []

        # Stops whose names we weren't able to convert to addresses.
        self.unparsedStopNames = []

        self.client = BusStopMashup.Client()

        # Called as progressListener(searchaddr, leech) after each search
        # that the web application accepted.
        self.progressListener = lambda searchaddr, leech: True
        # True until the first request, so that one is not delayed.
        self.firstTime = True
        # Set by bailEarly(); checked before each follow-up search.
        self.bail = False

    @staticmethod
    def _log(message):
        """Write one progress line to stderr."""
        sys.stderr.write(message + "\n")

    def bailEarly(self):
        """Request that the crawl stop before starting another search."""
        self.bail = True

    def followSeed(self, seed):
        """Crawl outward from the address `seed` and print a summary."""
        self._log("SEED %s" % seed)

        oall = len(self.allStops)
        osearch = len(self.searched)

        self.recursiveSearch(SearchedAddress(seed, seed))
        sys.stdout.write(
            "END SEED %s: searched %d addresses, found %d new stops" % (
                seed, len(self.searched) - osearch,
                len(self.allStops) - oall)
            + "\n")

    def recursiveSearch(self, searchaddr):
        """Search `searchaddr`, then recurse into the stops it returned.

        Returns False when the search recorded no centre location (the
        address was rejected), True otherwise -- including when the crawl
        was cut short by bailEarly().
        """
        candidates = self.searchAddress(searchaddr)

        # By now searchaddr.location should be filled in with the center
        # location.  Drop out if the address was invalid.
        if searchaddr.location is None:
            return False

        # NOTE(review): findCorners is consumed as an iterable of groups of
        # stops; how it groups and orders them is defined in LatLongTools.
        corners = LatLongTools.findCorners(
            searchaddr.location,
            [(stop.location, stop) for stop in candidates])

        for corner in corners:
            for stop in corner:
                if self.bail:
                    return True

                addr = self.stopToAddress(stop)
                if addr is None:
                    continue

                # Never repeat an address, whatever stop it came from.
                if any(addr == sa.address for sa in self.searched):
                    continue

                visited = [s.location for s in self.searched
                           if s is not searchaddr and s.location is not None]
                if radiusCompletelyCoveredByOtherPoints(
                        stop.location, visited, 0.01, 0.01):
                    # Already been close to here. Skip.
                    continue

                if self.recursiveSearch(SearchedAddress(stop, addr)):
                    # Valid address. Stop looking in this corner.
                    break

        return True

    def stopToAddress(self, stop):
        """Turn a stop's name into a searchable address.

        Returns None for anything that is not a regular bus stop, for
        Transitway station stops, for Tulipfest stops, and for names that
        match neither known pattern (those stops are also recorded in
        unparsedStopNames).
        """
        if not isinstance(stop, BusStopMashup.Stop):
            # Only regular bus stops tend to have names that we can use as
            # addresses.
            return None
        if _station_stop_rx.match(stop.name) is not None:
            # Ignore Transitway station stops.
            return None
        if _tulip_rx.search(stop.name) is not None:
            # Ignore Tulipfest stops (ARRET TULIPES / TULIP STOP / ...)
            return None

        # RIVERSIDE / AD. 2865 -> 2865 RIVERSIDE
        m = _addr_rx.match(stop.name)
        if m is not None:
            return "%s %s" % (m.group("number"), m.group("street1"))

        # BANK / RIVERSIDE -> BANK & RIVERSIDE
        m = _intersection_rx.match(stop.name)
        if m is not None:
            return "%s & %s" % (m.group("street1"), m.group("street2"))

        self.unparsedStopNames.append(stop)
        return None

    def searchAddress(self, searchaddr):
        """Run one search and fold its results into the crawl state.

        Appends `searchaddr` to self.searched, records its centre location,
        stores newly seen stops in self.allStops and notifies the progress
        listener.  Returns the stops worth following up; an empty list if
        the address was rejected (the listener is not called then).
        """
        self.pause()

        self.searched.append(searchaddr)
        try:
            stops = list(self.client.findStops(searchaddr.address))
        except BusStopMashup.InvalidAddressException:
            self._log("\tINVALID ADDR %s (referrer %s)" % (
                searchaddr.address, searchaddr.referrer))
            return []

        newstops = 0
        followCandidates = []
        for stop in stops:
            if isinstance(stop, BusStopMashup.HomeLocation):
                # Record the actual center location, which is probably
                # slightly off from the location of the stop we fed in
                # since we're looking at the intersection and not the stop.
                searchaddr.location = stop.location
            elif isinstance(stop, BusStopMashup.UnknownStopType):
                self.unparsedStops.append((stop, searchaddr))
            else:
                # Even if we've seen a stop before, we should consider it as
                # a possible search location.
                followCandidates.append(stop)
                if stop.code not in self.allStops:
                    newstops += 1
                    self.allStops[stop.code] = stop

        self._log("\t%s (referrer %s): %d stops, %d new" % (
            searchaddr.address, searchaddr.referrer,
            len(followCandidates), newstops))

        self.progressListener(searchaddr, self)

        return followCandidates

    def pause(self):
        """Sleep a random time within sleepRange, except before the first request."""
        if self.firstTime:
            self.firstTime = False
        else:
            time.sleep(random.uniform(*self.sleepRange))
173 # Matches a location like 'RIVERSIDE / AD. 2865'
174 _addr_re = (r'^(?P<street1>[^/]+?)\s*/\s*AD\.\s*(?P<number>\d+)$')
175 _addr_rx = re.compile(_addr_re)
177 # Matches a regular intersection, 'STREET1 / STREET2'
178 _intersection_re = (r'^(?P<street1>[^/]+?)\s*/\s*(?P<street2>[^/]+?)\s*$')
179 _intersection_rx = re.compile(_intersection_re)
181 # Matches a transitway stop. We'll have to ignore these.
182 _station_stop_re = (r'.*STOP\s*/ ARR.*')
183 _station_stop_rx = re.compile(_station_stop_re)
185 # Tulipfest. Formatting is not consistent; ignore
186 _tulip_re = (r'TULIPES')
187 _tulip_rx = re.compile(_tulip_re)
def radiusCompletelyCoveredByOtherPoints(location, locations, width, height):
    """Report whether the box around `location` is already covered.

    A `width` x `height` rectangle is placed around `location`, and an
    equally sized rectangle around every entry of `locations` is subtracted
    from it.  Returns True once the remaining area falls below a small
    threshold, False if some of the box is still uncovered after all
    entries were subtracted.

    `location` and the entries of `locations` must expose `longitude` and
    `latitude` attributes.
    NOTE(review): assumes RectTools.Rectangle takes (x, y, width, height)
    with (x, y) being the low corner -- confirm against RectTools.
    """
    # Divide by 2.0 so integer sizes are not truncated on Python 2.
    halfWidth = width / 2.0
    halfHeight = height / 2.0

    remaining = RectTools.PolyRect([RectTools.Rectangle(
        location.longitude - halfWidth,
        location.latitude - halfHeight,
        width, height)])

    min_area = 0.000001
    for other in locations:
        # The latitude offset is half the *height*, matching the rectangle
        # built for `location` above (it used to be half the width, which
        # shifted the subtracted boxes whenever width != height).
        remaining.subtract(RectTools.Rectangle(
            other.longitude - halfWidth,
            other.latitude - halfHeight,
            width, height))

        if remaining.area() < min_area:
            return True

    return remaining.area() < min_area
def usage():
    """Write the command-line synopsis to stderr and exit with status 1."""
    text = """Usage: ./stopMashupLeech.py [--fast] seed1 [seed2 [...]]

Don't use --fast except for brief test runs."""
    sys.stderr.write(text + "\n")
    sys.exit(1)
def doBail(leech):
    """Announce the shutdown on stderr and ask `leech` to stop early."""
    notice = "Exiting after current search completes."
    sys.stderr.write(notice + "\n")
    leech.bailEarly()
def progressListener(searchaddr, leech):
    """Progress callback installed by main(): checkpoint the crawl to disk.

    `searchaddr` is the address that was just searched; it is not used.
    """
    saveAll(leech)
def saveAll(leech):
    """Dump the leech's stops (pickle and XML) and searched addresses (XML).

    All three files are written under the grabs/ directory.
    """
    stops = leech.allStops
    dumpPickle("grabs/allstops.pickle", stops)
    dumpXML("grabs/allstops.xml", stops)
    dumpSearchedXML("grabs/allsearched.xml", leech.searched)
def main(argv):
    """Command-line entry point: crawl from each seed address and report.

    argv is the full argument vector (argv[0] is skipped).  Options:
    -f/--fast shortens the delay between requests from 38-66 seconds to
    3-10 seconds; -h/--help prints the usage text.  The remaining arguments
    are the seed addresses; at least one is required.

    SIGINT and SIGTERM ask the crawl to stop after the search in progress.
    A summary is printed to stdout and the results are saved at the end.
    """
    sleepRange = (38, 66)

    try:
        opts, seeds = getopt.getopt(argv[1:], "fh", ["fast", "help"])
    except getopt.GetoptError as e:
        # Diagnostics belong on stderr, next to the usage text.
        sys.stderr.write("%s\n" % e)
        usage()

    for o, _ in opts:
        if o in ('-f', '--fast'):
            sleepRange = (3, 10)
        elif o in ('-h', '--help'):
            usage()
        else:
            usage()

    if not seeds:
        usage()

    leech = Leech(sleepRange)
    leech.progressListener = progressListener

    def sighandler(sig, frame):
        doBail(leech)

    signal.signal(signal.SIGINT, sighandler)
    signal.signal(signal.SIGTERM, sighandler)

    for seed in seeds:
        # bail is only ever set, never cleared, so stop at the first hit.
        if leech.bail:
            break
        leech.followSeed(seed)

    def report(line):
        sys.stdout.write(line + "\n")

    report("Locations searched: %d" % len(leech.searched))
    report("Stops loaded: %d" % len(leech.allStops))

    report("\nUnparsed stops: %d" % len(leech.unparsedStops))
    for s in leech.unparsedStops:
        report("\t%s; returned in search for %s" % (s[0], s[1].address))

    report("\nUnparsed stop names: %d" % len(leech.unparsedStopNames))
    for s in leech.unparsedStopNames:
        report("\t%s" % (s,))

    # Searches that never got a centre location were rejected addresses.
    inval = [sa for sa in leech.searched if sa.location is None]
    report("\nInvalid locations searched: %d" % len(inval))
    for i in inval:
        report("\t%s" % (i,))

    saveAll(leech)
def dumpPickle(filename, data):
    """Pickle `data` into `filename`, going through safeWrite."""
    safeWrite(filename, data, dumpfunc=pickle.dump)
def safeWrite(filename, data, dumpfunc):
    """Write `data` to `filename` without ever exposing a partial file.

    `dumpfunc(data, fileobj)` writes into a temporary file created next to
    `filename`; only after it has finished and the file is closed is the
    temporary file moved over `filename`.  The previous contents therefore
    stay in place until the new file is complete.

    The file object is opened in binary mode, which is what the writers in
    this script (pickle.dump and ElementTree.write with an encoding) emit.

    If `dumpfunc` or the final move raises, the temporary file is removed
    and the exception propagates; `filename` is left untouched.
    """
    directory, basename = os.path.split(filename)
    # Same directory as the target, so the final rename stays on one
    # filesystem.
    fd, tmpname = tempfile.mkstemp(dir=directory or '.', prefix=basename)
    try:
        f = os.fdopen(fd, "wb")
        try:
            dumpfunc(data, f)
        finally:
            f.close()

        # os.replace (Python 3.3+) overwrites on every platform; os.rename
        # is the fallback and overwrites atomically on POSIX.
        replace = getattr(os, "replace", os.rename)
        replace(tmpname, filename)
    except BaseException:
        # Don't leave stray temp files behind; then let the error through.
        try:
            os.unlink(tmpname)
        except OSError:
            pass
        raise
def dumpXML(filename, data):
    """Write the stop mapping `data` to `filename` as XML, via safeWrite."""
    safeWrite(filename, data, dumpfunc=xmlify)
def dumpSearchedXML(filename, data):
    """Write the searched addresses `data` to `filename` as XML, via safeWrite."""
    safeWrite(filename, data, dumpfunc=xmlifySearched)
def xmlify(data, f):
    """Serialise a {code: stop} mapping to `f` as a <stops> document.

    Each stop becomes a <stop> element carrying its code, plus number, name
    and latitude/longitude when present, and a `type` attribute derived
    from its BusStopMashup class.  Regular stops also get a <routes> child
    with one <route> per route.  The tree is written UTF-8 encoded.
    """
    root = ElementTree.Element("stops")
    root.tail = '\n'

    for code in data:
        stop = data[code]
        attribs = {'code': code}

        if stop.number is not None:
            attribs['number'] = str(stop.number)
        if stop.name is not None:
            attribs['name'] = stop.name
        where = stop.location
        if where is not None:
            attribs['latitude'] = str(where.latitude)
            attribs['longitude'] = str(where.longitude)

        # The order of these checks is significant if the classes are
        # related by inheritance; keep Stop first.
        isRegularStop = isinstance(stop, BusStopMashup.Stop)
        if isRegularStop:
            attribs['type'] = 'stop'
        elif isinstance(stop, BusStopMashup.Station):
            attribs['type'] = 'station'
        elif isinstance(stop, BusStopMashup.HomeLocation):
            attribs['type'] = 'home'
            attribs['requested'] = stop.requestedAddress
            attribs['responded'] = stop.respondedAddress
        elif isinstance(stop, BusStopMashup.UnknownStopType):
            attribs['type'] = 'unknown'
            attribs['html'] = stop.html

        stopel = ElementTree.SubElement(root, "stop", attribs)
        stopel.tail = '\n'

        if isRegularStop:
            routeroot = ElementTree.SubElement(stopel, "routes")
            for route in stop.routes:
                routeAttribs = {'number': str(route.number),
                                'direction': str(route.direction)}
                ElementTree.SubElement(routeroot, "route", routeAttribs)

    ElementTree.ElementTree(root).write(f, "utf-8")
def xmlifySearched(data, f):
    """Serialise searched addresses to `f` as a <searched> document.

    Entries without a location are left out.  Every other entry becomes an
    <address> element with latitude and longitude, plus address and
    referrer when they are not None.  The tree is written UTF-8 encoded.
    """
    root = ElementTree.Element("searched")
    root.tail = '\n'

    located = [sa for sa in data if sa.location is not None]
    for sa in located:
        attribs = {}
        attribs["latitude"] = str(sa.location.latitude)
        attribs["longitude"] = str(sa.location.longitude)

        if sa.address is not None:
            attribs['address'] = sa.address

        if sa.referrer is not None:
            attribs['referrer'] = str(sa.referrer)

        element = ElementTree.SubElement(root, "address", attribs)
        element.tail = '\n'

    ElementTree.ElementTree(root).write(f, "utf-8")
# Script entry point: main()'s return value becomes the exit status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv))