ignore .pyc
[ottawa-travel-planner.git] / stopMashupLeech.py
blobc59ceb9711ecaa2adb10d8cab38c61aefbba66e9
1 #!/usr/bin/python
2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
4 import sys
5 import time
6 import random
7 import re
8 import pickle
9 import signal
10 import getopt
11 import tempfile
12 import os
13 from xml.etree import ElementTree
15 import LatLongTools
16 import RectTools
17 import BusStopMashup
class SearchedAddress(object):
    """One address handed to the stop search, plus where it came from.

    Attributes:
        referrer: whatever produced this address (elsewhere in this file
            that is a seed string, a stop, or a string read back from XML).
        address: the address text submitted to the search.
        location: None on construction; set later once a search reports
            the centre location for the address.
    """

    def __init__(self, referrer, address):
        self.location = None
        self.address = address
        self.referrer = referrer

    def __repr__(self):
        fields = (self.referrer, self.address, self.location)
        return "searchaddr (referrer %s, addr %s, location %s)" % fields
class Leech(object):
    """Crawls the bus stop mashup outward from seed addresses.

    Each search yields the stops around an address.  The names of those
    stops are turned back into addresses and searched in turn, skipping
    addresses already searched and areas already covered.
    """

    def __init__(self, sleepRange):
        # (low, high) bounds, in seconds, of the random delay pause()
        # inserts between consecutive searches.
        self.sleepRange = sleepRange

        # Every stop found so far, keyed by stop code.
        self.allStops = {}

        # SearchedAddress objects for every address submitted so far,
        # whether or not the search succeeded.
        self.searched = []

        # Results that could not be parsed properly, stored as
        # (UnknownStopType, SearchedAddress) tuples.
        self.unparsedStops = []

        # Stops whose names could not be converted to addresses.
        self.unparsedStopNames = []

        self.client = BusStopMashup.Client()

        # Called as progressListener(searchaddr, leech) at the end of every
        # search that returned results.  The default does nothing.
        self.progressListener = lambda searchaddr, leech: True
        self.firstTime = True
        self.bail = False

    def bailEarly(self):
        """Ask the crawl to stop before following any further stop."""
        self.bail = True

    def followSeed(self, seed):
        """Crawl outward from one seed address, logging totals to stderr."""
        sys.stderr.write("SEED %s\n" % seed)

        stopsBefore = len(self.allStops)
        searchedBefore = len(self.searched)

        self.recursiveSearch(SearchedAddress(seed, seed))

        searchedDelta = len(self.searched) - searchedBefore
        stopsDelta = len(self.allStops) - stopsBefore
        sys.stderr.write(
            "END SEED %s: searched %d addresses, found %d new stops\n" % (
                seed, searchedDelta, stopsDelta))

    def recursiveSearch(self, searchaddr):
        """Search searchaddr, then recurse into stops found around it.

        Returns False when the address turned out to be invalid (the search
        left searchaddr.location as None) and True otherwise, including
        when the crawl was cut short by bailEarly().
        """
        candidates = self.searchAddress(searchaddr)

        # The search fills in the centre location; without one the address
        # was invalid and there is nothing to expand from.
        if searchaddr.location is None:
            return False

        located = [(stop.location, stop) for stop in candidates]
        corners = LatLongTools.findCorners(searchaddr.location, located)

        for corner in corners:
            for stop in corner:
                if self.bail:
                    return True

                addr = self.stopToAddress(stop)
                if addr is None:
                    continue

                # Already been here, or close to here: skip this corner.
                if self._alreadyCovered(stop, addr, searchaddr):
                    break

                # A valid address ends the hunt in this corner; an invalid
                # one lets the next stop in the corner have a go.
                if self.recursiveSearch(SearchedAddress(stop, addr)):
                    break

        return True

    def _alreadyCovered(self, stop, addr, searchaddr):
        """True if addr was searched before or stop's area is covered."""
        if any(sa.address == addr for sa in self.searched):
            return True

        others = [sa.location for sa in self.searched
                  if sa is not searchaddr and sa.location is not None]
        return radiusCompletelyCoveredByOtherPoints(
            stop.location, others, 0.019, 0.019)

    def stopToAddress(self, stop):
        """Turn a stop's name into a searchable address, or return None.

        None is returned for anything that is not a BusStopMashup.Stop, for
        Transitway station stops and Tulipfest stops, and for names that
        match neither address pattern (those stops are also recorded in
        unparsedStopNames).
        """
        # The mashup doesn't like accented characters, so try a simple
        # replacement.
        name = anglicize(stop.name)

        # Only regular bus stops tend to have names usable as addresses.
        if not isinstance(stop, BusStopMashup.Stop):
            return None

        # Ignore Transitway station stops and Tulipfest stops
        # (ARRET TULIPES / TULIP STOP / ...).
        if _station_stop_rx.match(name) or _tulip_rx.search(name):
            return None

        # RIVERSIDE / AD. 2865 -> 2865 RIVERSIDE
        numbered = _addr_rx.match(name)
        if numbered:
            return "%s %s" % numbered.group("number", "street1")

        # BANK / RIVERSIDE -> BANK & RIVERSIDE
        crossing = _intersection_rx.match(name)
        if crossing:
            return "%s & %s" % crossing.group("street1", "street2")

        self.unparsedStopNames.append(stop)
        return None

    def searchAddress(self, searchaddr):
        """Run one search and fold its results into the crawl state.

        Records searchaddr in self.searched, sets searchaddr.location from
        the HomeLocation result if there is one, adds unseen stops to
        allStops and finally calls progressListener.  Returns the stops
        worth following up; an empty list when the address is rejected or
        the request fails with IOError (in which case progressListener is
        not called).
        """
        self.pause()

        self.searched.append(searchaddr)
        try:
            stops = list(self.client.findStops(searchaddr.address))
        except BusStopMashup.InvalidAddressException:
            sys.stderr.write("\tINVALID ADDR %s (referrer %s)\n" % (
                searchaddr.address, searchaddr.referrer))
            return []
        except IOError as e:
            sys.stderr.write("\tFAILED SEARCH %s: %s (referrer %s)\n" % (
                searchaddr.address, e, searchaddr.referrer))
            return []

        brandNew = 0
        followCandidates = []
        for stop in stops:
            if isinstance(stop, BusStopMashup.HomeLocation):
                # Record the actual centre location, which is probably
                # slightly off from the stop we fed in since we searched
                # for the intersection and not the stop.
                searchaddr.location = stop.location
                continue

            if isinstance(stop, BusStopMashup.UnknownStopType):
                self.unparsedStops.append((stop, searchaddr))
                continue

            # Even a stop we've seen before is a possible search location.
            followCandidates.append(stop)
            if stop.code not in self.allStops:
                brandNew += 1
                self.allStops[stop.code] = stop

        sys.stderr.write("\t%s (referrer %s): %d stops, %d new\n" % (
            searchaddr.address, searchaddr.referrer,
            len(followCandidates), brandNew))

        self.progressListener(searchaddr, self)

        return followCandidates

    def pause(self):
        """Sleep a random time within sleepRange, except on the first call."""
        if self.firstTime:
            self.firstTime = False
            return
        time.sleep(random.uniform(*self.sleepRange))
183 # Matches a location like 'RIVERSIDE / AD. 2865'
184 _addr_re = (r'^(?P<street1>[^/]+?)\s*/\s*AD\.\s*(?P<number>\d+)$')
185 _addr_rx = re.compile(_addr_re)
187 # Matches a regular intersection, 'STREET1 / STREET2'
188 _intersection_re = (r'^(?P<street1>[^/]+?)\s*/\s*(?P<street2>[^/]+?)\s*$')
189 _intersection_rx = re.compile(_intersection_re)
191 # Matches a transitway stop. We'll have to ignore these.
192 _station_stop_re = (r'.*STOP\s*/ ARR.*')
193 _station_stop_rx = re.compile(_station_stop_re)
195 # Tulipfest. Formatting is not consistent; ignore
196 _tulip_re = (r'TULIPES')
197 _tulip_rx = re.compile(_tulip_re)
# Translation table for anglicize(): accented Latin code points mapped to
# their closest unaccented ASCII spelling.  Built once at import time
# rather than on every call.
_ANGLICIZE_TABLE = {
    0xc0: u'A', 0xc1: u'A', 0xc2: u'A', 0xc3: u'A', 0xc4: u'A', 0xc5: u'A',
    0xc6: u'AE', 0xc7: u'C',
    0xc8: u'E', 0xc9: u'E', 0xca: u'E', 0xcb: u'E',
    0xcc: u'I', 0xcd: u'I', 0xce: u'I', 0xcf: u'I',
    0xd1: u'N',
    0xd2: u'O', 0xd3: u'O', 0xd4: u'O', 0xd5: u'O', 0xd6: u'O', 0xd8: u'O',
    0xd9: u'U', 0xda: u'U', 0xdb: u'U', 0xdc: u'U',
    0xdd: u'Y',
    0xe0: u'a', 0xe1: u'a', 0xe2: u'a', 0xe3: u'a', 0xe4: u'a', 0xe5: u'a',
    0xe6: u'ae', 0xe7: u'c',
    0xe8: u'e', 0xe9: u'e', 0xea: u'e', 0xeb: u'e',
    0xec: u'i', 0xed: u'i', 0xee: u'i', 0xef: u'i',
    0xf1: u'n',
    0xf2: u'o', 0xf3: u'o', 0xf4: u'o', 0xf5: u'o', 0xf6: u'o', 0xf8: u'o',
    0xf9: u'u', 0xfa: u'u', 0xfb: u'u', 0xfc: u'u',
    0xfd: u'y', 0xff: u'y',
    0x152: u'OE', 0x153: u'oe',
}


def anglicize(ucodestr):
    """Replace accented letters with plain ASCII ones and encode as ASCII.

    Args:
        ucodestr: a unicode string (translate() is called with a mapping
            keyed by code point).

    Returns:
        The ASCII-encoded result of the replacement.

    Raises:
        UnicodeEncodeError: if a non-ASCII character that is not in the
            table remains after translation.
    """
    return ucodestr.translate(_ANGLICIZE_TABLE).encode("ascii")
def radiusCompletelyCoveredByOtherPoints(location, locations, width, height):
    """Tell whether the box around location is already covered by others.

    A width x height rectangle is centred on location, and an equally
    sized rectangle centred on each entry of locations is subtracted from
    it.  The box counts as covered once less than 2% of its area is left.

    Args:
        location: object with numeric longitude and latitude attributes.
        locations: iterable of such objects.
        width: box extent along the longitude axis.
        height: box extent along the latitude axis.

    Returns:
        True if the remaining area is below 2% of width * height.

    NOTE(review): assumes RectTools.Rectangle takes (x, y, width, height)
    with (x, y) the low corner, as the calls below imply -- confirm
    against RectTools.
    """
    # Float division so integer sizes are not truncated on Python 2.
    halfWidth = width / 2.0
    halfHeight = height / 2.0

    p = RectTools.PolyRect([RectTools.Rectangle(location.longitude - halfWidth,
                                                location.latitude - halfHeight,
                                                width, height)])

    # Require at least 2% new coverage.
    min_area = 0.02 * width * height
    for other in locations:
        # The latitude offset uses the height; it used to use the width,
        # which only went unnoticed because callers pass a square box.
        p = p.subtract(RectTools.Rectangle(other.longitude - halfWidth,
                                           other.latitude - halfHeight,
                                           width, height))

        # Stop early once too little of the box is left.
        if p.area() < min_area:
            return True

    return p.area() < min_area
def usage():
    """Write the command-line synopsis to stderr and exit with status 1."""
    sys.stderr.write(
        "Usage: ./stopMashupLeech.py [--fast] seed1 [seed2 [...]]\n"
        "\n"
        "Don't use --fast except for brief test runs.\n")
    sys.exit(1)
def doBail(leech):
    """Announce the pending exit on stderr, then tell leech to bail early.

    Installed by main() as the SIGINT/SIGTERM handler body.
    """
    sys.stderr.write("Exiting after current search completes.\n")
    leech.bailEarly()
def progressListener(searchaddr, leech):
    """Progress callback for Leech: save everything after each search.

    searchaddr is accepted to fit the callback signature but is not used.
    """
    saveAll(leech)
def saveAll(leech):
    """Write leech's stops and searched addresses to their XML files."""
    dumpXML(filename=STOPS_XML, data=leech.allStops)
    dumpSearchedXML(filename=SEARCHADDR_XML, data=leech.searched)
def tryResume(leech):
    """Reload state saved by an earlier run into leech, if there is any.

    Reads STOPS_XML into leech.allStops and then SEARCHADDR_XML into
    leech.searched, reporting the counts on stderr.  A missing file simply
    means there is nothing (more) to resume.  Any other IOError is
    reported on stderr, and the run continues with whatever was loaded.

    XML parser exceptions are not caught here and will end the program.
    """
    import errno

    try:
        stopf = open(STOPS_XML, "r")
        try:
            leech.allStops = unxmlifyStops(stopf)
        finally:
            # Close the file even when parsing raises.
            stopf.close()
        sys.stderr.write("Loaded %d previous stops.\n" % len(leech.allStops))

        searchf = open(SEARCHADDR_XML, "r")
        try:
            leech.searched = unxmlifySearched(searchf)
        finally:
            searchf.close()
        sys.stderr.write("Loaded %d previously searched addresses.\n" % (
            len(leech.searched)))
    except IOError as e:
        # No saved state is the normal first-run case.  Anything else
        # (permissions, I/O failure) should not pass unnoticed.
        if e.errno != errno.ENOENT:
            sys.stderr.write("Could not resume from saved state: %s\n" % e)
def main(argv):
    """Command-line entry point: crawl from each seed and print a summary.

    argv is the full argument vector including the program name.  Bad
    options or a missing seed end the program through usage().  Returns
    None.
    """
    sleepRange = (38, 66)

    try:
        opts, seeds = getopt.getopt(argv[1:], "fh", ["fast", "help"])
    except getopt.GetoptError as e:
        print(e)
        usage()
    else:
        for flag, _value in opts:
            if flag in ('-f', '--fast'):
                sleepRange = (3, 10)
            else:
                # -h/--help (the only other option getopt accepts here).
                usage()

    if not seeds:
        usage()

    leech = Leech(sleepRange)
    leech.progressListener = progressListener

    tryResume(leech)

    def sighandler(sig, frame):
        doBail(leech)

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, sighandler)

    for seed in seeds:
        # Once asked to bail, no further seed is followed.
        if leech.bail:
            break
        leech.followSeed(seed)

    print("Locations searched: %d" % len(leech.searched))
    print("Stops loaded: %d" % len(leech.allStops))

    print("\nUnparsed stops: %d" % len(leech.unparsedStops))
    for stop, searchaddr in leech.unparsedStops:
        print("\t%s; returned in search for %s" % (stop, searchaddr.address))

    print("\nUnparsed stop names: %d" % len(leech.unparsedStopNames))
    for stop in leech.unparsedStopNames:
        print("\t%r" % (stop,))

    invalid = [sa for sa in leech.searched if sa.location is None]
    print("\nInvalid locations searched: %d" % len(invalid))
    for sa in invalid:
        print("\t%r" % (sa,))

    saveAll(leech)
def safeWrite(filename, data, dumpfunc):
    """Write data to filename without ever exposing a partial file.

    dumpfunc(data, f) is called with a temporary file opened for writing
    in the same directory as filename.  Once it has returned and the file
    is closed, the temporary file is renamed over filename.

    Args:
        filename: path of the file to (re)place.
        data: passed through to dumpfunc untouched.
        dumpfunc: callable taking (data, fileobj) that writes the content.

    Raises:
        Whatever dumpfunc or the underlying file operations raise.  In
        that case filename is left as it was and the temporary file is
        removed.
    """
    # Same directory as the target so the final rename stays on one
    # filesystem.
    tmpdir = os.path.dirname(filename) or '.'
    (fd, tmpname) = tempfile.mkstemp(dir=tmpdir,
                                     prefix=os.path.basename(filename))
    replaced = False
    try:
        f = os.fdopen(fd, "w")
        try:
            dumpfunc(data, f)
        finally:
            f.close()

        # rename() replaces an existing target in a single step on POSIX,
        # so there is no moment at which filename does not exist.
        os.rename(tmpname, filename)
        replaced = True
    finally:
        if not replaced:
            # Don't litter the directory with half-written temp files.
            try:
                os.unlink(tmpname)
            except OSError:
                pass
def dumpXML(filename, data):
    """Save the stops in data to filename via safeWrite and xmlifyStops."""
    safeWrite(filename, data, dumpfunc=xmlifyStops)
def dumpSearchedXML(filename, data):
    """Save the searched addresses in data to filename via safeWrite."""
    safeWrite(filename, data, dumpfunc=xmlifySearched)
def xmlifyStops(data, f):
    """Serialise a {code: stop} mapping as a UTF-8 <stops> document on f.

    Each BusStopMashup.Stop or BusStopMashup.Station becomes one <stop>
    element.  Attributes are written only for values that are not None.
    Stops additionally get a <routes> child with one <route> per entry of
    stop.routes.  Objects of any other type are left out.
    """
    root = ElementTree.Element("stops")
    root.text = '\n'
    root.tail = '\n'

    for code, stop in data.items():
        attribs = {'code': code}

        if stop.number is not None:
            attribs['number'] = str(stop.number)
        if stop.name is not None:
            attribs['name'] = stop.name

        location = stop.location
        if location is not None:
            attribs['latitude'] = str(location.latitude)
            attribs['longitude'] = str(location.longitude)

        if stop.requestedAddress is not None:
            attribs['searchLocation'] = stop.requestedAddress

        isRegularStop = isinstance(stop, BusStopMashup.Stop)
        if isRegularStop:
            attribs['type'] = 'stop'
        elif isinstance(stop, BusStopMashup.Station):
            attribs['type'] = 'station'
        else:
            # Nothing else can be represented in the file.
            continue

        stopel = ElementTree.SubElement(root, "stop", attribs)
        stopel.tail = '\n'

        if isRegularStop:
            routeroot = ElementTree.SubElement(stopel, "routes")
            for route in stop.routes:
                ElementTree.SubElement(routeroot, "route",
                                       number=str(route.number),
                                       direction=str(route.direction))

    ElementTree.ElementTree(root).write(f, "utf-8")
def unxmlifyStops(f):
    """Parse a <stops> document from f back into a {code: stop} dict.

    Args:
        f: file name or file object accepted by ElementTree.parse.

    Returns:
        Dict mapping each stop's code (as unicode) to a
        BusStopMashup.Stop or BusStopMashup.Station.  <stop> elements
        with no code, or with a type other than "stop" or "station",
        are skipped.

    Raises:
        Whatever ElementTree.parse raises for unreadable or malformed XML.
    """
    stops = {}

    etree = ElementTree.parse(f)
    for stopel in etree.findall("stop"):
        code = stopel.get("code")
        if code is None:
            continue

        kind = stopel.get("type")
        if kind == "station":
            stop = BusStopMashup.Station()
        elif kind == "stop":
            stop = BusStopMashup.Stop()
        else:
            # Unknown or missing type.  Skip it rather than fall through
            # with the previous element's stop object (or none at all).
            continue

        stop.code = code
        stop.number = stopel.get("number")
        stop.name = stopel.get("name")

        lat = stopel.get("latitude")
        lng = stopel.get("longitude")
        if lat and lng:
            # NOTE(review): the coordinates are handed over as the strings
            # stored in the XML; presumably StopLocation converts them --
            # confirm against BusStopMashup.
            stop.location = BusStopMashup.StopLocation(lat, lng)

        stop.requestedAddress = stopel.get("searchLocation")

        if isinstance(stop, BusStopMashup.Stop):
            # Tolerate a stop without a <routes> child.
            routesel = stopel.find("routes")
            if routesel is not None:
                for routeel in routesel.findall("route"):
                    stop.routes.append(
                        BusStopMashup.StopRoute(
                            routeel.get("number"), routeel.get("direction")))

        # For consistency, make all strings unicode if they aren't already.
        if stop.code:
            stop.code = unicode(stop.code)
        if stop.name:
            stop.name = unicode(stop.name)

        stops[stop.code] = stop

    return stops
def xmlifySearched(data, f):
    """Serialise searched addresses as a UTF-8 <searched> document on f.

    Every entry of data becomes one <address> element.  It carries
    latitude and longitude when the entry has a location and invalid="1"
    when it has none.  The address and str(referrer) are added when they
    are not None.
    """
    root = ElementTree.Element("searched")
    root.tail = '\n'

    for sa in data:
        location = sa.location
        if location is not None:
            attribs = {"latitude": str(location.latitude),
                       "longitude": str(location.longitude)}
        else:
            attribs = {"invalid": "1"}

        if sa.address is not None:
            attribs['address'] = sa.address

        if sa.referrer is not None:
            attribs['referrer'] = str(sa.referrer)

        ElementTree.SubElement(root, "address", attribs).tail = '\n'

    ElementTree.ElementTree(root).write(f, "utf-8")
def unxmlifySearched(f):
    """Parse a <searched> document from f into SearchedAddress objects.

    Returns a list with one SearchedAddress per <address> element, in
    document order.  The location is set only when both latitude and
    longitude attributes are present and non-empty.
    """
    def fromElement(el):
        sa = SearchedAddress(el.get("referrer"), el.get("address"))
        lat, lng = el.get("latitude"), el.get("longitude")
        if lat and lng:
            sa.location = BusStopMashup.StopLocation(lat, lng)
        return sa

    return [fromElement(el)
            for el in ElementTree.parse(f).findall("address")]
# Files the crawl state is saved to and resumed from; the paths are
# relative to the current working directory.
STOPS_XML = "grabs/allstops.xml"
SEARCHADDR_XML = "grabs/allsearched.xml"

# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    sys.exit(main(sys.argv))