Script to grab stop codes from allstops.xml and sort
[ottawa-travel-planner.git] / ocdata / stopMashupLeech.py
blob25170fd1f6ac49fafa9afb13a4b7a4e7a31a2977
1 #!/usr/bin/python
2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
4 import sys
5 import time
6 import random
7 import re
8 import pickle
9 import signal
10 import getopt
11 import tempfile
12 import os
13 from xml.etree import ElementTree
15 import sys
16 sys.path.append("..")
18 import LatLongTools
19 import RectTools
20 import BusStopMashup
class SearchedAddress(object):
    """Record of one address submitted to the stop search.

    Attributes:
        referrer: whatever led us to this address (the seed string for a
            seed search, otherwise the stop the address was derived from).
        address: the address text that is searched for.
        location: starts as None; filled in later by the code that runs
            the search once a centre location is known.
    """

    def __init__(self, referrer, address):
        self.referrer, self.address = referrer, address
        self.location = None

    def __repr__(self):
        fields = (self.referrer, self.address, self.location)
        return "searchaddr (referrer %s, addr %s, location %s)" % fields
class Leech(object):
    """Crawler that discovers bus stops by repeatedly searching addresses.

    Each search asks the BusStopMashup client for the stops around an
    address, records every stop returned, and then turns some of those
    stops' names back into addresses to search next.
    """

    # Width and height of the box drawn around each searched location when
    # deciding whether a candidate has already been covered.  It is added to
    # and subtracted from longitude/latitude values, so it is expressed in
    # the same units as the location coordinates.
    _COVERAGE_SPAN = 0.019

    def __init__(self, sleepRange):
        """Create a crawler.

        sleepRange is unpacked into random.uniform() by pause(), so it must
        be a (shortest, longest) pair of seconds to wait between searches.
        """
        self.sleepRange = sleepRange

        # All stops we've found so far, indexed by code.
        self.allStops = {}

        # SearchedAddress records for every address we've fed back into the
        # web application (including ones that turned out to be invalid).
        self.searched = []

        # Stops we weren't able to parse properly.
        # Each entry is a tuple: (UnknownStopType, SearchedAddress)
        self.unparsedStops = []

        # Stops whose names we weren't able to convert to addresses.
        self.unparsedStopNames = []

        self.client = BusStopMashup.Client()

        # Called as progressListener(searchaddr, leech) after each search
        # that did not fail outright.
        self.progressListener = lambda searchaddr, leech: True
        self.firstTime = True
        self.bail = False

    def bailEarly(self):
        """Ask the crawl to stop; recursiveSearch checks this flag."""
        self.bail = True

    def followSeed(self, seed):
        """Crawl outward from the address 'seed', logging totals to stderr."""
        sys.stderr.write("SEED %s\n" % seed)

        oall = len(self.allStops)
        osearch = len(self.searched)

        # A seed is its own referrer.
        sa = SearchedAddress(seed, seed)
        self.recursiveSearch(sa)
        sys.stderr.write(
            "END SEED %s: searched %d addresses, found %d new stops\n" % (
                seed, len(self.searched) - osearch,
                len(self.allStops) - oall))

    def recursiveSearch(self, searchaddr):
        """Search 'searchaddr', then recurse into addresses of nearby stops.

        Returns False when the search produced no centre location (invalid
        address or failed request), True otherwise -- including when the
        crawl was cut short by bailEarly().
        """
        candidates = self.searchAddress(searchaddr)

        # By now searchaddr.location should be filled in with the center
        # location.  Drop out if the address was invalid.
        if searchaddr.location is None:
            return False

        # NOTE(review): findCorners is defined in LatLongTools; from the use
        # below it yields groups of stops, one group per "corner" -- confirm
        # the grouping and ordering there.
        corners = LatLongTools.findCorners(
            searchaddr.location,
            [(stop.location, stop) for stop in candidates])

        span = self._COVERAGE_SPAN
        for corner in corners:
            for stop in corner:
                if self.bail:
                    return True

                addr = self.stopToAddress(stop)
                if addr is None:
                    continue

                if any(sa.address == addr for sa in self.searched):
                    # Already been here. Skip this corner.
                    break

                elsewhere = [s.location for s in self.searched
                             if s is not searchaddr
                             and s.location is not None]
                if radiusCompletelyCoveredByOtherPoints(
                        stop.location, elsewhere, span, span):
                    # Already been close to here. Skip this corner.
                    break

                if self.recursiveSearch(SearchedAddress(stop, addr)):
                    # Valid address. Stop looking in this corner.
                    break

        return True

    def stopToAddress(self, stop):
        """Turn a stop's name into a searchable address, or return None.

        None is returned for anything that is not a regular bus stop, for
        Transitway station stops and for Tulipfest stops.  A regular stop
        whose name matches neither known pattern is also recorded in
        unparsedStopNames before None is returned.
        """
        # Check the type before touching the name: only regular bus stops
        # tend to have names that we can use as addresses, so there is no
        # point risking a conversion error on anything else.
        if not isinstance(stop, BusStopMashup.Stop):
            return None

        # The mashup doesn't like accented characters, so try a simple
        # replacement.
        name = anglicize(stop.name)

        if _station_stop_rx.match(name) is not None:
            # Ignore Transitway station stops.
            return None
        if _tulip_rx.search(name) is not None:
            # Ignore Tulipfest stops (ARRET TULIPES / TULIP STOP / ...)
            return None

        # RIVERSIDE / AD. 2865 -> 2865 RIVERSIDE
        m = _addr_rx.match(name)
        if m is not None:
            return "%s %s" % (m.group("number"), m.group("street1"))

        # BANK / RIVERSIDE -> BANK & RIVERSIDE
        m = _intersection_rx.match(name)
        if m is not None:
            return "%s & %s" % (m.group("street1"), m.group("street2"))

        self.unparsedStopNames.append(stop)
        return None

    def searchAddress(self, searchaddr):
        """Run one search and record its results.

        The address is appended to self.searched before the request is made,
        so failed and invalid searches are remembered too.  Returns the list
        of stops that are candidates for further searching; the list is
        empty when the request failed or the address was rejected.
        """
        self.pause()

        self.searched.append(searchaddr)
        try:
            stops = list(self.client.findStops(searchaddr.address))
        except BusStopMashup.InvalidAddressException:
            sys.stderr.write("\tINVALID ADDR %s (referrer %s)\n" % (
                searchaddr.address, searchaddr.referrer))
            return []
        except IOError as e:
            sys.stderr.write("\tFAILED SEARCH %s: %s (referrer %s)\n" % (
                searchaddr.address, e, searchaddr.referrer))
            return []

        oldstops = 0
        newstops = 0
        followCandidates = []
        for stop in stops:
            if isinstance(stop, BusStopMashup.HomeLocation):
                # Record the actual center location, which is probably
                # slightly off from the location of the stop we fed in
                # since we're looking at the intersection and not the stop.
                searchaddr.location = stop.location
            elif isinstance(stop, BusStopMashup.UnknownStopType):
                self.unparsedStops.append((stop, searchaddr))
            else:
                # Even if we've seen a stop before, we should consider it as
                # a possible search location.
                followCandidates.append(stop)
                if stop.code in self.allStops:
                    oldstops += 1
                else:
                    newstops += 1
                    self.allStops[stop.code] = stop

        sys.stderr.write("\t%s (referrer %s): %d stops, %d new\n" % (
            searchaddr.address, searchaddr.referrer,
            oldstops + newstops, newstops))

        self.progressListener(searchaddr, self)

        return followCandidates

    def pause(self):
        """Sleep a random time within sleepRange, except on the first call."""
        if self.firstTime:
            self.firstTime = False
        else:
            time.sleep(random.uniform(*self.sleepRange))
# Stop-name patterns used when turning stop names into addresses.  Each
# pattern is kept as a string (_xxx_re) next to its compiled form (_xxx_rx).

# A street plus a civic number, e.g. 'RIVERSIDE / AD. 2865'.
_addr_re = r'^(?P<street1>[^/]+?)\s*/\s*AD\.\s*(?P<number>\d+)$'
_addr_rx = re.compile(_addr_re)

# An ordinary intersection of two streets, 'STREET1 / STREET2'.
_intersection_re = r'^(?P<street1>[^/]+?)\s*/\s*(?P<street2>[^/]+?)\s*$'
_intersection_rx = re.compile(_intersection_re)

# A transitway stop.  These have to be ignored.
_station_stop_re = r'.*STOP\s*/ ARR.*'
_station_stop_rx = re.compile(_station_stop_re)

# Tulipfest stops.  Their formatting is not consistent, so they are ignored.
_tulip_re = r'TULIPES'
_tulip_rx = re.compile(_tulip_re)
# Code point -> unaccented replacement, for the accented letters that turn
# up in stop names.  Built once at import time rather than on every call.
_ANGLICIZE_TABLE = {
    0xc0: u'A', 0xc1: u'A', 0xc2: u'A', 0xc3: u'A', 0xc4: u'A', 0xc5: u'A',
    0xc6: u'AE', 0xc7: u'C',
    0xc8: u'E', 0xc9: u'E', 0xca: u'E', 0xcb: u'E',
    0xcc: u'I', 0xcd: u'I', 0xce: u'I', 0xcf: u'I',
    0xd1: u'N',
    0xd2: u'O', 0xd3: u'O', 0xd4: u'O', 0xd5: u'O', 0xd6: u'O', 0xd8: u'O',
    0xd9: u'U', 0xda: u'U', 0xdb: u'U', 0xdc: u'U',
    0xdd: u'Y',
    0xe0: u'a', 0xe1: u'a', 0xe2: u'a', 0xe3: u'a', 0xe4: u'a', 0xe5: u'a',
    0xe6: u'ae', 0xe7: u'c',
    0xe8: u'e', 0xe9: u'e', 0xea: u'e', 0xeb: u'e',
    0xec: u'i', 0xed: u'i', 0xee: u'i', 0xef: u'i',
    0xf1: u'n',
    0xf2: u'o', 0xf3: u'o', 0xf4: u'o', 0xf5: u'o', 0xf6: u'o', 0xf8: u'o',
    0xf9: u'u', 0xfa: u'u', 0xfb: u'u', 0xfc: u'u',
    0xfd: u'y', 0xff: u'y',
    0x152: u'OE', 0x153: u'oe',
}


def anglicize(ucodestr):
    """Return 'ucodestr' with accented letters replaced, encoded as ASCII.

    ucodestr must be a unicode string (the dict form of translate() is only
    available on unicode objects under Python 2).  The result is the
    ASCII-encoded byte string.

    Raises UnicodeEncodeError if the string still contains a non-ASCII
    character that the replacement table does not cover.
    """
    return ucodestr.translate(_ANGLICIZE_TABLE).encode("ascii")
def radiusCompletelyCoveredByOtherPoints(location, locations, width, height):
    """Tell whether the box around 'location' is already (almost) covered.

    A width x height rectangle is centred on 'location'; an equally sized
    rectangle centred on each entry of 'locations' is subtracted from it.
    Returns True when less than 2% of the original rectangle's area is left
    uncovered, False otherwise.

    'location' and every entry of 'locations' must expose numeric
    'longitude' and 'latitude' attributes.
    """
    half_w = width / 2.0
    half_h = height / 2.0

    remaining = RectTools.PolyRect([
        RectTools.Rectangle(location.longitude - half_w,
                            location.latitude - half_h,
                            width, height)])

    # Require at least 2% new coverage.
    min_area = 0.02 * width * height
    for other in locations:
        # The latitude offset must use the height, not the width, or the
        # subtracted box is misplaced whenever width != height.
        remaining = remaining.subtract(
            RectTools.Rectangle(other.longitude - half_w,
                                other.latitude - half_h,
                                width, height))

        if remaining.area() < min_area:
            # Nothing worthwhile is left; no need to subtract the rest.
            return True

    return remaining.area() < min_area
def usage():
    """Write the command-line synopsis to stderr and exit with status 1."""
    synopsis = """Usage: ./stopMashupLeech.py [--fast] seed1 [seed2 [...]]

Don't use --fast except for brief test runs."""
    sys.stderr.write(synopsis + "\n")
    sys.exit(1)
def doBail(leech):
    """Announce on stderr that we are stopping, then flag 'leech' to bail."""
    notice = "Exiting after current search completes."
    sys.stderr.write(notice + "\n")
    leech.bailEarly()
def progressListener(searchaddr, leech):
    """Progress callback: save everything 'leech' has gathered so far.

    'searchaddr' (the address that was just searched) is accepted to fit
    the callback signature but is not used.
    """
    saveAll(leech)
def saveAll(leech):
    """Write the stops and the searched addresses of 'leech' to disk.

    Stops go to STOPS_XML and searched addresses to SEARCHADDR_XML.
    """
    stops, searched = leech.allStops, leech.searched
    dumpXML(STOPS_XML, stops)
    dumpSearchedXML(SEARCHADDR_XML, searched)
def tryResume(leech):
    """Reload state saved by an earlier run into 'leech', if there is any.

    Stops are read from STOPS_XML first, then searched addresses from
    SEARCHADDR_XML.  An IOError (typically a file that does not exist yet)
    silently ends the attempt, keeping whatever was loaded before it.
    XML parser exceptions are not caught and will cause a program exit.
    """
    try:
        # 'with' makes sure each file is closed even when parsing fails.
        with open(STOPS_XML, "r") as stopf:
            leech.allStops = unxmlifyStops(stopf)
        sys.stderr.write(
            "Loaded %d previous stops.\n" % len(leech.allStops))

        with open(SEARCHADDR_XML, "r") as searchf:
            leech.searched = unxmlifySearched(searchf)
        sys.stderr.write("Loaded %d previously searched addresses.\n" % (
            len(leech.searched)))
    except IOError:
        # Nothing (more) to resume from; carry on with what we have.
        pass
def main(argv):
    """Command-line entry point.

    Parses the options, resumes any saved state, crawls from every seed
    given on the command line (until asked to bail by SIGINT/SIGTERM),
    prints a summary to stdout and saves the results.
    """
    def say(line):
        # One line of report output on stdout.
        sys.stdout.write(line + "\n")

    sleepRange = (38, 66)

    try:
        opts, seeds = getopt.getopt(argv[1:], "fh", ["fast", "help"])
        for flag, _unused in opts:
            if flag in ('-f', '--fast'):
                sleepRange = (3, 10)
            else:
                # --help, or anything unexpected: show usage and exit.
                usage()
    except getopt.GetoptError as e:
        say(str(e))
        usage()

    if not seeds:
        usage()

    leech = Leech(sleepRange)
    leech.progressListener = progressListener

    tryResume(leech)

    def onSignal(sig, frame):
        # Finish the search in progress, then stop.
        doBail(leech)

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, onSignal)

    for seed in seeds:
        if leech.bail:
            break
        leech.followSeed(seed)

    say("Locations searched: %d" % len(leech.searched))
    say("Stops loaded: %d" % len(leech.allStops))

    say("\nUnparsed stops: %d" % len(leech.unparsedStops))
    for unparsed, origin in leech.unparsedStops:
        say("\t%s; returned in search for %s" % (unparsed, origin.address))

    say("\nUnparsed stop names: %d" % len(leech.unparsedStopNames))
    for stop in leech.unparsedStopNames:
        say("\t%s" % repr(stop))

    inval = [sa for sa in leech.searched if sa.location is None]
    say("\nInvalid locations searched: %d" % len(inval))
    for sa in inval:
        say("\t%s" % repr(sa))

    saveAll(leech)
def safeWrite(filename, data, dumpfunc):
    """Write 'data' to 'filename' without ever exposing a partial file.

    dumpfunc(data, f) is called with a temporary file opened for writing in
    the same directory as 'filename'; once it has finished and the file is
    closed, the temporary file is renamed over 'filename'.

    If anything fails, the temporary file is removed, the previous contents
    of 'filename' (if any) are left untouched, and the exception propagates.
    """
    # The temporary file has to live next to the target so that the final
    # rename stays on one filesystem.
    directory = os.path.dirname(filename) or '.'
    (fd, tmpname) = tempfile.mkstemp(dir=directory,
                                     prefix=os.path.basename(filename))
    committed = False
    try:
        f = os.fdopen(fd, "w")
        try:
            dumpfunc(data, f)
        finally:
            f.close()

        # rename() replaces an existing target in one step on POSIX, so
        # there is no moment at which 'filename' is missing.
        os.rename(tmpname, filename)
        committed = True
    finally:
        if not committed:
            try:
                os.unlink(tmpname)
            except OSError:
                # Best effort only; the original error matters more.
                pass
def dumpXML(filename, data):
    """Save the stops in 'data' to 'filename' via safeWrite/xmlifyStops."""
    safeWrite(filename, data, dumpfunc=xmlifyStops)
def dumpSearchedXML(filename, data):
    """Save the searched addresses in 'data' to 'filename'.

    The file is written through safeWrite using xmlifySearched.
    """
    safeWrite(filename, data, dumpfunc=xmlifySearched)
def xmlifyStops(data, f):
    """Serialize the stops in 'data' to the file object 'f' as UTF-8 XML.

    'data' maps stop codes to stop objects.  The output is a <stops> root
    with one <stop> child per entry that is a BusStopMashup.Stop or
    BusStopMashup.Station; entries of any other type are left out.  Regular
    stops additionally get a <routes> child listing their routes.
    """
    root = ElementTree.Element("stops")
    root.text = '\n'
    root.tail = '\n'

    for code, stop in data.items():
        attribs = {'code': code}

        # Only attributes that actually have a value are written.
        if stop.number is not None:
            attribs['number'] = str(stop.number)
        if stop.name is not None:
            attribs['name'] = stop.name
        where = stop.location
        if where is not None:
            attribs['latitude'] = str(where.latitude)
            attribs['longitude'] = str(where.longitude)
        if stop.requestedAddress is not None:
            attribs['searchLocation'] = stop.requestedAddress

        isRegularStop = isinstance(stop, BusStopMashup.Stop)
        if isRegularStop:
            attribs['type'] = 'stop'
        elif isinstance(stop, BusStopMashup.Station):
            attribs['type'] = 'station'
        else:
            continue

        stopel = ElementTree.SubElement(root, "stop", attribs)
        stopel.tail = '\n'

        if isRegularStop:
            routeroot = ElementTree.SubElement(stopel, "routes")
            for route in stop.routes:
                routeAttribs = {'number': str(route.number),
                                'direction': str(route.direction)}
                ElementTree.SubElement(routeroot, "route", routeAttribs)

    ElementTree.ElementTree(root).write(f, "utf-8")
def unxmlifyStops(f):
    """Parse the XML in file object 'f' back into a dict of stops.

    Returns a dict mapping stop code to a BusStopMashup.Station or
    BusStopMashup.Stop, depending on each <stop> element's 'type'
    attribute.  Elements with a missing or unrecognized type, or without a
    'code' attribute, are skipped.  Attribute values are stored as read
    from the XML, i.e. as strings.
    """
    stops = {}

    etree = ElementTree.parse(f)
    for stopel in etree.findall("stop"):
        kind = stopel.get("type")
        if kind == "station":
            stop = BusStopMashup.Station()
        elif kind == "stop":
            stop = BusStopMashup.Stop()
        else:
            # Without this, 'stop' would be unbound for the first element,
            # or would still refer to the previous element's object, which
            # would then be overwritten with this element's data.
            continue

        stop.code = stopel.get("code")
        if stop.code is None:
            continue

        stop.number = stopel.get("number")
        stop.name = stopel.get("name")

        lat = stopel.get("latitude")
        lng = stopel.get("longitude")
        if lat and lng:
            stop.location = BusStopMashup.StopLocation(lat, lng)

        stop.requestedAddress = stopel.get("searchLocation")

        if isinstance(stop, BusStopMashup.Stop):
            # Tolerate a <stop> that has no <routes> child at all.
            routesel = stopel.find("routes")
            if routesel is not None:
                for routeel in routesel.findall("route"):
                    stop.routes.append(
                        BusStopMashup.StopRoute(
                            routeel.get("number"),
                            routeel.get("direction")))

        # For consistency, make all strings unicode if they aren't already.
        if stop.code:
            stop.code = unicode(stop.code)
        if stop.name:
            stop.name = unicode(stop.name)

        stops[stop.code] = stop

    return stops
def xmlifySearched(data, f):
    """Serialize the searched addresses in 'data' to 'f' as UTF-8 XML.

    The output is a <searched> root with one <address> child per entry.
    An entry with a location gets 'latitude' and 'longitude' attributes;
    one without gets invalid="1" instead.  'address' and 'referrer' are
    written when they are not None.
    """
    root = ElementTree.Element("searched")
    root.tail = '\n'

    for sa in data:
        where = sa.location
        if where is not None:
            attribs = {"latitude": str(where.latitude),
                       "longitude": str(where.longitude)}
        else:
            attribs = {"invalid": "1"}

        if sa.address is not None:
            attribs['address'] = sa.address

        if sa.referrer is not None:
            attribs['referrer'] = str(sa.referrer)

        ElementTree.SubElement(root, "address", attribs).tail = '\n'

    ElementTree.ElementTree(root).write(f, "utf-8")
def unxmlifySearched(f):
    """Parse the XML in file object 'f' back into SearchedAddress objects.

    Returns a list with one SearchedAddress per <address> element, in
    document order.  The location is only set when both 'latitude' and
    'longitude' are present and non-empty; otherwise it stays None.
    """
    searched = []

    for searchel in ElementTree.parse(f).findall("address"):
        entry = SearchedAddress(searchel.get("referrer"),
                                searchel.get("address"))

        lat, lng = searchel.get("latitude"), searchel.get("longitude")
        if lat and lng:
            entry.location = BusStopMashup.StopLocation(lat, lng)

        searched.append(entry)

    return searched
# Files the crawl state is saved to and resumed from.  The paths are
# relative, so they resolve against the current working directory.
SEARCHADDR_XML = "out/allsearched.xml"
STOPS_XML = "out/allstops.xml"

if __name__ == '__main__':
    # Run as a script: use main()'s return value as the exit status.
    raise SystemExit(main(sys.argv))