Save invalid locations, too.
[ottawa-travel-planner.git] / stopMashupLeech.py
blob572cc244036e5e788fc441b897549345d4af9fce
1 #!/usr/bin/python
2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
4 import sys
5 import time
6 import random
7 import re
8 import pickle
9 import signal
10 import getopt
11 import tempfile
12 import os
13 from xml.etree import ElementTree
15 import LatLongTools
16 import RectTools
17 import BusStopMashup
class SearchedAddress(object):
    """An address handed to the stop finder, together with its origin.

    Attributes:
        referrer: whatever led to this search; Leech.followSeed passes the
            seed string and Leech.recursiveSearch passes the stop object.
        address: the address text that is searched for.
        location: None on construction; Leech.searchAddress assigns the
            centre location when the search results include one.
    """

    def __init__(self, referrer, address):
        self.referrer, self.address = referrer, address
        # Left as None until a search reports a location for this address.
        self.location = None

    def __repr__(self):
        template = "searchaddr (referrer %s, addr %s, location %s)"
        return template % (self.referrer, self.address, self.location)
class Leech(object):
    """Harvests bus stops by repeatedly searching addresses via the mashup.

    Starting from a seed address, each search returns nearby stops; stop
    names are converted back into addresses and searched in turn, until
    the surroundings have already been covered or a bail-out is requested.
    """

    # Width and height of the box drawn around a stop when deciding whether
    # the area around it has already been searched.  The value is in the
    # same units as the location's longitude/latitude attributes.
    coverageBoxSize = 0.019

    def __init__(self, sleepRange):
        """Create a leech.

        sleepRange is a (low, high) pair handed to random.uniform to pick
        the number of seconds to sleep between consecutive searches.
        """
        self.sleepRange = sleepRange

        # All stops we've found so far, indexed by code.
        self.allStops = {}

        # Stops whose addresses we've fed back into the web application.
        self.searched = []

        # Stops we weren't able to parse properly.
        # Each entry is a tuple: (UnknownStopType, SearchedAddress)
        self.unparsedStops = []

        # Stops whose names we weren't able to convert to addresses.
        self.unparsedStopNames = []

        self.client = BusStopMashup.Client()

        # Called as progressListener(searchaddr, leech) after each search
        # that completed without an exception.
        self.progressListener = lambda searchaddr, leech: True
        self.firstTime = True
        self.bail = False

    def bailEarly(self):
        """Ask the crawl to stop before starting any further search."""
        self.bail = True

    def followSeed(self, seed):
        """Crawl outward from one seed address and report what was found."""
        sys.stderr.write("SEED %s\n" % seed)

        oall = len(self.allStops)
        osearch = len(self.searched)

        sa = SearchedAddress(seed, seed)
        self.recursiveSearch(sa)
        sys.stdout.write(
            "END SEED %s: searched %d addresses, found %d new stops\n" % (
                seed, len(self.searched) - osearch,
                len(self.allStops) - oall))

    def recursiveSearch(self, searchaddr):
        """Search searchaddr, then recurse into the stops around it.

        Returns False when the search yielded no centre location (invalid
        address or failed request), True otherwise -- including when the
        crawl stopped early because bail was set.
        """
        candidates = self.searchAddress(searchaddr)

        # By now searchaddr.location should be filled in with the center
        # location.  Drop out if the address was invalid.
        if searchaddr.location is None:
            return False

        # NOTE(review): findCorners is defined elsewhere; it is used here
        # as returning an iterable of groups ("corners"), each an iterable
        # of stops -- confirm ordering against LatLongTools.
        corners = LatLongTools.findCorners(
            searchaddr.location,
            [(stop.location, stop) for stop in candidates])

        boxSize = self.coverageBoxSize
        for corner in corners:
            for stop in corner:
                if self.bail:
                    return True

                addr = self.stopToAddress(stop)
                if addr is None:
                    continue

                if any(sa.address == addr for sa in self.searched):
                    # Already been here. Skip this corner.
                    break

                otherLocations = [
                    s.location for s in self.searched
                    if s is not searchaddr and s.location is not None]
                if radiusCompletelyCoveredByOtherPoints(
                        stop.location, otherLocations, boxSize, boxSize):
                    # Already been close to here. Skip this corner.
                    break

                newsa = SearchedAddress(stop, addr)
                if self.recursiveSearch(newsa):
                    # Valid address. Stop looking in this corner.
                    break

        return True

    def stopToAddress(self, stop):
        """Turn a stop's name into a searchable address.

        Returns the address string, or None when the stop should not be
        followed.  Stops whose names match none of the known patterns are
        also recorded in unparsedStopNames.
        """
        # Check the type first: only regular bus stops tend to have names
        # that we can use as addresses, so there is no point anglicizing
        # (and possibly failing on) the name of anything else.
        if not isinstance(stop, BusStopMashup.Stop):
            return None

        # The mashup doesn't like accented characters, so try a simple
        # replacement.
        name = anglicize(stop.name)

        if _station_stop_rx.match(name) is not None:
            # Ignore Transitway station stops.
            return None
        if _tulip_rx.search(name) is not None:
            # Ignore Tulipfest stops (ARRET TULIPES / TULIP STOP / ...)
            return None

        # RIVERSIDE / AD. 2865 -> 2865 RIVERSIDE
        m = _addr_rx.match(name)
        if m is not None:
            return "%s %s" % (m.group("number"), m.group("street1"))

        # BANK / RIVERSIDE -> BANK & RIVERSIDE
        m = _intersection_rx.match(name)
        if m is not None:
            return "%s & %s" % (m.group("street1"), m.group("street2"))

        self.unparsedStopNames.append(stop)
        return None

    def searchAddress(self, searchaddr):
        """Run one search and fold its results into the leech's state.

        searchaddr is appended to self.searched before the request is made,
        so invalid and failed searches are recorded too.  Returns the list
        of stops worth following from here; the list is empty when the
        address was rejected or the request failed, in which case
        progressListener is not called.
        """
        self.pause()

        self.searched.append(searchaddr)
        try:
            stops = list(self.client.findStops(searchaddr.address))
        except BusStopMashup.InvalidAddressException:
            sys.stderr.write("\tINVALID ADDR %s (referrer %s)\n" % (
                searchaddr.address, searchaddr.referrer))
            return []
        except IOError as e:
            sys.stderr.write("\tFAILED SEARCH %s: %s (referrer %s)\n" % (
                searchaddr.address, e, searchaddr.referrer))
            return []

        oldstops = 0
        newstops = 0
        followCandidates = []
        for stop in stops:
            if isinstance(stop, BusStopMashup.HomeLocation):
                # Record the actual center location, which is probably
                # slightly off from the location of the stop we fed in
                # since we're looking at the intersection and not the stop.
                searchaddr.location = stop.location
            elif isinstance(stop, BusStopMashup.UnknownStopType):
                self.unparsedStops.append((stop, searchaddr))
            else:
                # Even if we've seen a stop before, we should consider it as
                # a possible search location.
                followCandidates.append(stop)
                if stop.code in self.allStops:
                    oldstops += 1
                else:
                    newstops += 1
                    self.allStops[stop.code] = stop

        sys.stderr.write("\t%s (referrer %s): %d stops, %d new\n" % (
            searchaddr.address, searchaddr.referrer,
            oldstops + newstops, newstops))

        self.progressListener(searchaddr, self)

        return followCandidates

    def pause(self):
        """Sleep a random time within sleepRange, except on the first call."""
        if self.firstTime:
            self.firstTime = False
        else:
            time.sleep(random.uniform(*self.sleepRange))
# Matches a location like 'RIVERSIDE / AD. 2865'.  Trailing whitespace is
# tolerated, as it is for intersections below; otherwise a name such as
# 'RIVERSIDE / AD. 2865 ' would fall through to the intersection pattern
# and be turned into the bogus address 'RIVERSIDE & AD. 2865'.
_addr_re = (r'^(?P<street1>[^/]+?)\s*/\s*AD\.\s*(?P<number>\d+)\s*$')
_addr_rx = re.compile(_addr_re)

# Matches a regular intersection, 'STREET1 / STREET2'
_intersection_re = (r'^(?P<street1>[^/]+?)\s*/\s*(?P<street2>[^/]+?)\s*$')
_intersection_rx = re.compile(_intersection_re)

# Matches a transitway stop. We'll have to ignore these.
_station_stop_re = (r'.*STOP\s*/ ARR.*')
_station_stop_rx = re.compile(_station_stop_re)

# Tulipfest. Formatting is not consistent; ignore
_tulip_re = (r'TULIPES')
_tulip_rx = re.compile(_tulip_re)
# Maps the code points of accented Latin-1 letters to unaccented ASCII
# replacements.  Built once at import time rather than on every call.
_ANGLICIZE_TABLE = {
    0xc0: u'A', 0xc1: u'A', 0xc2: u'A', 0xc3: u'A', 0xc4: u'A', 0xc5: u'A',
    0xc6: u'AE', 0xc7: u'C',
    0xc8: u'E', 0xc9: u'E', 0xca: u'E', 0xcb: u'E',
    0xcc: u'I', 0xcd: u'I', 0xce: u'I', 0xcf: u'I',
    0xd1: u'N',
    0xd2: u'O', 0xd3: u'O', 0xd4: u'O', 0xd5: u'O', 0xd6: u'O', 0xd8: u'O',
    0xd9: u'U', 0xda: u'U', 0xdb: u'U', 0xdc: u'U',
    0xdd: u'Y',
    0xe0: u'a', 0xe1: u'a', 0xe2: u'a', 0xe3: u'a', 0xe4: u'a', 0xe5: u'a',
    0xe6: u'ae', 0xe7: u'c',
    0xe8: u'e', 0xe9: u'e', 0xea: u'e', 0xeb: u'e',
    0xec: u'i', 0xed: u'i', 0xee: u'i', 0xef: u'i',
    0xf1: u'n',
    0xf2: u'o', 0xf3: u'o', 0xf4: u'o', 0xf5: u'o', 0xf6: u'o', 0xf8: u'o',
    0xf9: u'u', 0xfa: u'u', 0xfb: u'u', 0xfc: u'u',
    0xfd: u'y', 0xff: u'y',
}


def anglicize(ucodestr):
    """Replace accented letters in a unicode string and encode it as ASCII.

    ucodestr must be a unicode string.  The result is the ASCII-encoded
    byte string.

    Raises UnicodeEncodeError if a non-ASCII character remains after the
    replacement, i.e. the character has no entry in the table.
    """
    return ucodestr.translate(_ANGLICIZE_TABLE).encode("ascii")
def radiusCompletelyCoveredByOtherPoints(location, locations, width, height):
    """Tell whether the box around location is already covered by others.

    A width x height box is centred on location, and an equally sized box
    centred on each entry of locations is subtracted from it.  Returns
    True when less than 2% of the original box area is left uncovered.

    location and every entry of locations must expose longitude and
    latitude attributes.
    NOTE(review): Rectangle is called as (longitude, latitude, width,
    height) of the box's low corner -- confirm against RectTools.
    """
    # Float division so that integer sizes are not truncated on Python 2.
    halfWidth = width / 2.0
    halfHeight = height / 2.0

    p = RectTools.PolyRect([RectTools.Rectangle(location.longitude - halfWidth,
                                                location.latitude - halfHeight,
                                                width, height)])

    # Require at least 2% new coverage.
    min_area = 0.02 * width * height
    for i in locations:
        # The latitude offset is half the *height*; the original used half
        # the width here, which is only correct for square boxes.
        p = p.subtract(RectTools.Rectangle(i.longitude - halfWidth,
                                           i.latitude - halfHeight,
                                           width, height))

        if p.area() < min_area:
            # Nothing more to learn once the remainder is small enough.
            break

    return p.area() < min_area
def usage():
    """Write the command-line synopsis to stderr and exit with status 1."""
    message = (
        "Usage: ./stopMashupLeech.py [--fast] seed1 [seed2 [...]]\n"
        "\n"
        "Don't use --fast except for brief test runs."
    )
    sys.stderr.write(message + "\n")
    sys.exit(1)
def doBail(leech):
    """Announce the pending exit on stderr and tell the leech to bail."""
    notice = "Exiting after current search completes."
    sys.stderr.write(notice + "\n")
    leech.bailEarly()
def progressListener(searchaddr, leech):
    """Progress callback for Leech: save everything collected so far.

    searchaddr is accepted because the leech passes it to its listener,
    but it is not used here.
    """
    saveAll(leech)
def saveAll(leech):
    """Write the leech's state to the three files under grabs/.

    The whole leech is pickled, its stops are written as XML, and its
    searched addresses are written as XML, in that order.
    """
    outputs = (
        (dumpPickle, "grabs/allstops.pickle", leech),
        (dumpXML, "grabs/allstops.xml", leech.allStops),
        (dumpSearchedXML, "grabs/allsearched.xml", leech.searched),
    )
    for dump, path, payload in outputs:
        dump(path, payload)
def main(argv):
    """Command-line entry point: crawl from each seed, then report and save.

    argv is the full argument vector including the program name.  Options
    -f/--fast shorten the pause between searches; -h/--help, an unknown
    option, or a missing seed print the usage text and exit.  Returns None.
    """
    sleepRange = (38, 66)

    try:
        opts, seeds = getopt.getopt(argv[1:], "fh", ["fast", "help"])
        for option, _value in opts:
            if option in ('-f', '--fast'):
                sleepRange = (3, 10)
            else:
                # -h/--help and anything unexpected both end in usage().
                usage()
    except getopt.GetoptError as e:
        sys.stdout.write("%s\n" % e)
        usage()

    if not seeds:
        usage()

    leech = Leech(sleepRange)
    leech.progressListener = progressListener

    # SIGINT/SIGTERM only request a bail-out; the search in progress is
    # allowed to finish.
    def sighandler(sig, frame):
        doBail(leech)

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, sighandler)

    for seed in seeds:
        if leech.bail:
            continue
        leech.followSeed(seed)

    out = sys.stdout.write

    out("Locations searched: %d\n" % len(leech.searched))
    out("Stops loaded: %d\n" % len(leech.allStops))

    out("\nUnparsed stops: %d\n" % len(leech.unparsedStops))
    for (stop, searchaddr) in leech.unparsedStops:
        out("\t%s; returned in search for %s\n" % (stop, searchaddr.address))

    out("\nUnparsed stop names: %d\n" % len(leech.unparsedStopNames))
    for stop in leech.unparsedStopNames:
        out("\t%s\n" % repr(stop))

    inval = [sa for sa in leech.searched if sa.location is None]
    out("\nInvalid locations searched: %d\n" % len(inval))
    for sa in inval:
        out("\t%s\n" % repr(sa))

    saveAll(leech)
def dumpPickle(filename, data):
    """Pickle data into filename by way of safeWrite."""
    safeWrite(filename, data, dumpfunc=pickle.dump)
def safeWrite(filename, data, dumpfunc):
    """Write data to filename via a temporary file in the same directory.

    dumpfunc is called as dumpfunc(data, f) with f a file object opened
    for binary writing on the temporary file.  Once it returns, the
    temporary file is renamed over filename, so a previous copy is never
    left missing or half-written.

    If dumpfunc or the rename raises, the temporary file is removed, any
    existing filename is left untouched, and the exception propagates.
    """
    # Create the temporary file next to the target so that the rename
    # stays on one filesystem.
    directory = os.path.dirname(filename) or '.'
    (fd, tmpname) = tempfile.mkstemp(dir=directory,
                                     prefix=os.path.basename(filename))
    try:
        f = os.fdopen(fd, "wb")
        try:
            dumpfunc(data, f)
        finally:
            f.close()

        # On POSIX, rename replaces an existing target atomically; this
        # removes the window the old unlink-then-link sequence had.
        os.rename(tmpname, filename)
    except BaseException:
        # Don't leave stray temporary files behind on failure.
        try:
            os.unlink(tmpname)
        except OSError:
            pass
        raise
def dumpXML(filename, data):
    """Write the stops mapping to filename as XML by way of safeWrite."""
    safeWrite(filename, data, dumpfunc=xmlify)
def dumpSearchedXML(filename, data):
    """Write the searched addresses to filename as XML via safeWrite."""
    safeWrite(filename, data, dumpfunc=xmlifySearched)
def xmlify(data, f):
    """Serialize a {code: stop} mapping to f as a <stops> XML document.

    Each regular stop or station becomes a <stop> element carrying its
    code and whichever of number, name, latitude/longitude and
    searchLocation are not None.  Regular stops additionally get a
    <routes> child with one <route> per entry of stop.routes.  Objects
    that are neither a Stop nor a Station are left out.
    """
    root = ElementTree.Element("stops")
    root.tail = '\n'

    for (code, stop) in data.items():
        attribs = {'code': code}

        number = stop.number
        if number is not None:
            attribs['number'] = str(number)

        name = stop.name
        if name is not None:
            attribs['name'] = name

        location = stop.location
        if location is not None:
            attribs['latitude'] = str(location.latitude)
            attribs['longitude'] = str(location.longitude)

        requested = stop.requestedAddress
        if requested is not None:
            attribs['searchLocation'] = requested

        isRegularStop = isinstance(stop, BusStopMashup.Stop)
        if isRegularStop:
            attribs['type'] = 'stop'
        elif isinstance(stop, BusStopMashup.Station):
            attribs['type'] = 'station'
        else:
            # Not something we know how to describe; leave it out.
            continue

        stopel = ElementTree.SubElement(root, "stop", attribs)
        stopel.tail = '\n'

        if not isRegularStop:
            continue

        # Only regular stops get a list of routes.
        routeroot = ElementTree.SubElement(stopel, "routes")
        for route in stop.routes:
            routeAttribs = {'number': str(route.number),
                            'direction': str(route.direction)}
            ElementTree.SubElement(routeroot, "route", routeAttribs)

    ElementTree.ElementTree(root).write(f, "utf-8")
def xmlifySearched(data, f):
    """Serialize searched addresses to f as a <searched> XML document.

    data is an iterable of objects with location, address and referrer
    attributes.  Each becomes an <address> element: entries without a
    location are marked invalid="1", the others carry latitude and
    longitude.  address and referrer are added when they are not None.
    """
    root = ElementTree.Element("searched")
    root.tail = '\n'

    for sa in data:
        location = sa.location
        if location is not None:
            attribs = {"latitude": str(location.latitude),
                       "longitude": str(location.longitude)}
        else:
            attribs = {"invalid": "1"}

        address = sa.address
        if address is not None:
            attribs['address'] = address

        referrer = sa.referrer
        if referrer is not None:
            attribs['referrer'] = str(referrer)

        ElementTree.SubElement(root, "address", attribs).tail = '\n'

    ElementTree.ElementTree(root).write(f, "utf-8")
# Run the crawler when executed as a script; main's return value becomes
# the process exit status.
if __name__ == '__main__':
    exitStatus = main(sys.argv)
    sys.exit(exitStatus)