Accept --fast option to turn down the delay.
[ottawa-travel-planner.git] / stopMashupLeech.py
blob882facfaba084badc347be700c59a1fcb8abe0cb
1 #!/usr/bin/python
2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
import getopt
import os
import pickle
import random
import re
import signal
import sys
import time

import BusStopMashup
import LatLongTools
class SearchedAddress(object):
    """An address submitted to the stop search, together with its origin.

    Attributes:
        referrer: whatever led us to this address (the seed string, or the
            stop whose name was converted into the address).
        address: the address text fed to the search.
        location: starts as None; filled in by the caller once the search
            reports a location for the address.
    """

    def __init__(self, referrer, address):
        # Not known until a search has actually been run for this address.
        self.location = None
        self.address = address
        self.referrer = referrer

    def __repr__(self):
        fields = (self.referrer, self.address, self.location)
        return "searchaddr (referrer %s, addr %s, location %s)" % fields
class Leech(object):
    """Crawl the bus stop mashup outward from seed addresses.

    Each search returns nearby stops; the names of some of those stops are
    converted back into addresses and searched in turn, so the crawl spreads
    out from every seed.  A random delay is inserted between searches.

    Written to run unchanged on Python 2.6+ and Python 3 (no print
    statements, no ``dict.has_key``, no comma-style ``except``).
    """

    def __init__(self, sleepRange):
        """Create a crawler.

        Args:
            sleepRange: (low, high) pair passed to ``random.uniform`` to pick
                the number of seconds to sleep between searches.
        """
        self.sleepRange = sleepRange

        # All stops we've found so far, indexed by code.
        self.allStops = {}

        # SearchedAddress objects for every address we've fed into the web
        # application, valid or not.
        self.searched = []

        # Stops we weren't able to parse properly.
        # Each entry is a tuple: (UnknownStopType, SearchedAddress)
        self.unparsedStops = []

        # Stops whose names we weren't able to convert to addresses.
        self.unparsedStopNames = []

        self.client = BusStopMashup.Client()

        # True until the first search, so that pause() skips the initial delay.
        self.firstTime = True
        # Set by bailEarly(); makes the recursion unwind without new searches.
        self.bail = False

    def bailEarly(self):
        """Ask the crawl to stop; honoured the next time the flag is checked."""
        self.bail = True

    def followSeed(self, seed):
        """Crawl outward from one seed address and report what it yielded.

        Progress goes to stderr; the per-seed summary line goes to stdout.
        """
        sys.stderr.write("SEED %s\n" % seed)

        # Remember the totals so the summary can report this seed's deltas.
        oall = len(self.allStops)
        osearch = len(self.searched)

        # A seed is its own referrer.
        sa = SearchedAddress(seed, seed)
        self.recursiveSearch(sa)
        print("END SEED %s: searched %d addresses, found %d new stops" % (
            seed, len(self.searched) - osearch, len(self.allStops) - oall))

    def recursiveSearch(self, searchaddr):
        """Search an address, then recurse into stops found around it.

        Returns:
            False if the address turned out to be invalid (no location was
            reported for it); True otherwise, including when the crawl is
            bailing out early.
        """
        candidates = self.searchAddress(searchaddr)

        # By now searchaddr.location should be filled in with the center
        # location.  Drop out if the address was invalid.
        if searchaddr.location is None:
            return False

        # NOTE(review): findCorners presumably groups the candidate stops by
        # direction around the centre, one sequence per corner -- confirm
        # against LatLongTools.
        corners = LatLongTools.findCorners(
            searchaddr.location,
            [(stop.location, stop) for stop in candidates])

        for c in corners:
            for stop in c:
                if self.bail:
                    # Returning True makes every caller up the stack break
                    # out of its corner loop, so the recursion unwinds fast.
                    return True

                addr = self.stopToAddress(stop)
                if addr is None:
                    continue

                # XXX - this is no good. Need
                # radiusCompletedCoveredByOtherPoints.
                # The list is rebuilt for every stop on purpose: the
                # recursive calls below keep appending to self.searched.
                # NOTE(review): the two 0.01 radii are presumably degrees of
                # latitude/longitude -- confirm against LatLongTools.
                if LatLongTools.pointWithinRadiusOfPoints(
                        stop.location, 0.01, 0.01,
                        [s.location for s in self.searched
                         if s is not searchaddr and s.location is not None]):
                    # Already been close to here. Skip.
                    continue

                newsa = SearchedAddress(stop, addr)
                if self.recursiveSearch(newsa):
                    # Valid address. Stop looking in this corner.
                    break

        return True

    def stopToAddress(self, stop):
        """Turn a stop's name into a searchable address string.

        Returns:
            ``"<number> <street>"`` for names like ``RIVERSIDE / AD. 2865``,
            ``"<street1> & <street2>"`` for names like ``BANK / RIVERSIDE``,
            or None when the stop is not a regular stop, is deliberately
            ignored, or has a name that matches neither pattern (in which
            case the stop is also recorded in ``unparsedStopNames``).
        """
        if not isinstance(stop, BusStopMashup.Stop):
            # Only regular bus stops tend to have names that we can use as
            # addresses.
            return None
        if _station_stop_rx.match(stop.name) is not None:
            # Ignore Transitway station stops.
            return None
        if _tulip_rx.search(stop.name) is not None:
            # Ignore Tulipfest stops (ARRET TULIPES / TULIP STOP / ...)
            return None

        # RIVERSIDE / AD. 2865 -> 2865 RIVERSIDE
        m = _addr_rx.match(stop.name)
        if m is not None:
            return "%s %s" % (m.group("number"), m.group("street1"))

        # BANK / RIVERSIDE -> BANK & RIVERSIDE
        m = _intersection_rx.match(stop.name)
        if m is not None:
            return "%s & %s" % (m.group("street1"), m.group("street2"))

        # Keep the stop so the final report can list unparseable names.
        self.unparsedStopNames.append(stop)
        return None

    def searchAddress(self, searchaddr):
        """Run one search and fold its results into the crawl state.

        Side effects: sleeps (except before the very first search), appends
        ``searchaddr`` to ``searched``, sets ``searchaddr.location`` when the
        result contains a HomeLocation, records unknown stop types, and adds
        newly seen stops to ``allStops``.

        Returns:
            The list of stops worth following up on; empty when the address
            was rejected as invalid.
        """
        self.pause()

        # Recorded before the request so invalid addresses are listed too.
        self.searched.append(searchaddr)
        try:
            stops = list(self.client.findStops(searchaddr.address))
        except BusStopMashup.InvalidAddressException:
            sys.stderr.write("\tINVALID ADDR %s (referrer %s)\n" % (
                searchaddr.address, searchaddr.referrer))
            return []

        oldstops = 0
        newstops = 0
        followCandidates = []
        for stop in stops:
            if isinstance(stop, BusStopMashup.HomeLocation):
                # Record the actual center location, which is probably
                # slightly off from the location of the stop we fed in
                # since we're looking at the intersection and not the stop.
                searchaddr.location = stop.location
            elif isinstance(stop, BusStopMashup.UnknownStopType):
                self.unparsedStops.append((stop, searchaddr))
            else:
                # Even if we've seen a stop before, we should consider it as
                # a possible search location.
                followCandidates.append(stop)
                if stop.code in self.allStops:
                    oldstops += 1
                else:
                    newstops += 1
                    self.allStops[stop.code] = stop

        sys.stderr.write("\t%s (referrer %s): %d stops, %d new\n" % (
            searchaddr.address, searchaddr.referrer,
            oldstops + newstops, newstops))

        return followCandidates

    def pause(self):
        """Sleep a random time within ``sleepRange``; skipped the first time."""
        if self.firstTime:
            self.firstTime = False
        else:
            time.sleep(random.uniform(*self.sleepRange))
# Matches a location like 'RIVERSIDE / AD. 2865'.
# Trailing whitespace is tolerated, as in the intersection pattern below;
# otherwise 'RIVERSIDE / AD. 2865 ' would fall through to the intersection
# pattern and become the bogus intersection 'RIVERSIDE & AD. 2865'.
_addr_re = (r'^(?P<street1>[^/]+?)\s*/\s*AD\.\s*(?P<number>\d+)\s*$')
_addr_rx = re.compile(_addr_re)

# Matches a regular intersection, 'STREET1 / STREET2'
_intersection_re = (r'^(?P<street1>[^/]+?)\s*/\s*(?P<street2>[^/]+?)\s*$')
_intersection_rx = re.compile(_intersection_re)

# Matches a transitway stop. We'll have to ignore these.
_station_stop_re = (r'.*STOP\s*/ ARR.*')
_station_stop_rx = re.compile(_station_stop_re)

# Tulipfest. Formatting is not consistent; ignore
_tulip_re = (r'TULIPES')
_tulip_rx = re.compile(_tulip_re)
def usage():
    """Write the command-line usage summary to stderr and exit.

    Uses ``sys.stderr.write`` rather than the Python-2-only
    ``print >> sys.stderr`` statement so the module also compiles on
    Python 3; the emitted text is unchanged.

    Raises:
        SystemExit: always, with exit status 1.
    """
    sys.stderr.write(
"""Usage: ./stopMashupLeech.py [--fast] seed1 [seed2 [...]]

Don't use --fast except for brief test runs.
""")
    sys.exit(1)
def doBail(leech):
    """Announce on stderr that we are stopping, then ask the crawler to bail.

    Only calls ``leech.bailEarly()``; nothing is interrupted here, so the
    search already in progress is left to finish.

    Uses ``sys.stderr.write`` rather than the Python-2-only
    ``print >> sys.stderr`` statement; the emitted text is unchanged.

    Args:
        leech: object exposing a ``bailEarly()`` method.
    """
    sys.stderr.write("Exiting after current search completes.\n")
    leech.bailEarly()
def main(argv):
    """Parse the command line, crawl from every seed, then save and report.

    Recognised options are ``-f``/``--fast`` (shorter delay between
    searches) and ``-h``/``--help``.  At least one seed address is required;
    otherwise the usage text is shown and the process exits with status 1.

    SIGINT and SIGTERM are routed to doBail(), so an interrupted run still
    reaches the save-and-report steps below.

    Args:
        argv: the full argument vector, program name included at index 0.

    Returns:
        None.
    """
    # Bounds, in seconds, for the random delay between two searches.
    sleepRange = (38, 66)

    try:
        opts, seeds = getopt.getopt(argv[1:], "fh", ["fast", "help"])
        for o, _value in opts:
            if o in ('-f', '--fast'):
                sleepRange = (3, 10)
            elif o in ('-h', '--help'):
                usage()
            else:
                usage()
    except getopt.GetoptError as e:
        # Diagnostics belong on stderr, next to the usage text that follows.
        sys.stderr.write("%s\n" % e)
        usage()

    if not seeds:
        usage()

    leech = Leech(sleepRange)

    sighandler = lambda sig, frame: doBail(leech)
    signal.signal(signal.SIGINT, sighandler)
    signal.signal(signal.SIGTERM, sighandler)

    for seed in seeds:
        # Once a bail has been requested, skip the remaining seeds.
        if not leech.bail:
            leech.followSeed(seed)

    dumpPickle("grabs/allstops.pickle", leech.allStops)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Locations searched: %d" % len(leech.searched))
    print("Stops loaded: %d" % len(leech.allStops))

    print("\nUnparsed stops: %d" % len(leech.unparsedStops))
    for s in leech.unparsedStops:
        print("\t%s; returned in search for %s" % (s[0], s[1].address))

    print("\nUnparsed stop names: %d" % len(leech.unparsedStopNames))
    for s in leech.unparsedStopNames:
        print("\t%s" % s)

    # A searched address whose location was never filled in was invalid.
    inval = [sa for sa in leech.searched if sa.location is None]
    print("\nInvalid locations searched: %d" % len(inval))
    for i in inval:
        print("\t%s" % i)
def dumpPickle(filename, data):
    """Pickle ``data`` into ``filename``, replacing any existing file.

    The file is opened in binary mode, which pickle requires on Python 3 and
    which avoids newline translation on Windows.  The handle is closed even
    if pickling fails.  A missing parent directory is created first, so a
    long crawl is not lost at the very end just because the output
    directory does not exist yet.

    Args:
        filename: path of the file to write.
        data: any picklable object.

    Raises:
        IOError/OSError: if the directory or file cannot be created.
        pickle.PicklingError: if ``data`` cannot be pickled.
    """
    parent = os.path.dirname(filename)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)

    with open(filename, "wb") as f:
        pickle.dump(data, f)
# Script entry point: run the crawler and hand main()'s return value to
# sys.exit() as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)