2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
15 class SearchedAddress(object):
16 def __init__(self
, referrer
, address
):
17 self
.referrer
= referrer
18 self
.address
= address
22 return "searchaddr (referrer %s, addr %s, location %s)" % (
23 self
.referrer
, self
.address
, self
.location
)
26 def __init__(self
, sleepRange
):
27 self
.sleepRange
= sleepRange
29 # All stops we've found so far, indexed by code.
32 # Stops whose addresses we've fed back into the web application.
35 # Stops we weren't able to parse properly.
36 # Each entry is a tuple: (UnknownStopType, SearchedAddress)
37 self
.unparsedStops
= []
39 # Stops whose names we weren't able to convert to addresses.
40 self
.unparsedStopNames
= []
42 self
.client
= BusStopMashup
.Client()
def followSeed(self, seed):
    """Crawl outward from one seed address and report what the crawl added.

    Wraps ``seed`` in a SearchedAddress that is its own referrer, passes it
    to ``self.recursiveSearch``, and then prints how many entries
    ``self.searched`` and ``self.allStops`` gained during that call.

    seed -- the address to start searching from.
    """
    # stream.write() and single-argument print() produce the same output on
    # Python 2 and Python 3; the "print >> stream" statement is Python 2 only.
    sys.stderr.write("SEED %s\n" % (seed,))

    # Snapshot the totals so the summary below covers only this seed.
    stopsBefore = len(self.allStops)
    searchedBefore = len(self.searched)

    self.recursiveSearch(SearchedAddress(seed, seed))

    print("END SEED %s: searched %d addresses, found %d new stops" % (
        seed,
        len(self.searched) - searchedBefore,
        len(self.allStops) - stopsBefore,
    ))
61 def recursiveSearch(self
, searchaddr
):
62 candidates
= self
.searchAddress(searchaddr
)
64 # By now sa.location should be filled in with the center location.
65 # Drop out if the address was invalid.
66 if searchaddr
.location
is None:
69 corners
= LatLongTools
.findCorners(searchaddr
.location
,
70 [(stop
.location
, stop
) for stop
in candidates
])
77 addr
= self
.stopToAddress(stop
)
81 # XXX - this is no good. Need
82 # radiusCompletedCoveredByOtherPoints.
83 if LatLongTools
.pointWithinRadiusOfPoints(
84 stop
.location
, 0.01, 0.01,
85 [s
.location
for s
in self
.searched
86 if s
is not searchaddr
and s
.location
is not None]):
87 # Already been close to here. Skip.
89 newsa
= SearchedAddress(stop
, addr
)
90 if self
.recursiveSearch(newsa
):
91 # Valid address. Stop looking in this corner.
96 def stopToAddress(self
, stop
):
97 if not isinstance(stop
, BusStopMashup
.Stop
):
98 # Only regular bus stops tend to have names that we can use as
101 if _station_stop_rx
.match(stop
.name
) is not None:
102 # Ignore Transitway station stops.
104 if _tulip_rx
.search(stop
.name
) is not None:
105 # Ignore Tulipfest stops (ARRET TULIPES / TULIP STOP / ...)
108 # RIVERSIDE / AD. 2865 -> 2865 RIVERSIDE
109 m
= _addr_rx
.match(stop
.name
)
111 return "%s %s" % (m
.group("number"), m
.group("street1"))
113 # BANK / RIVERSIDE -> BANK & RIVERSIDE
114 m
= _intersection_rx
.match(stop
.name
)
116 return "%s & %s" % (m
.group("street1"), m
.group("street2"))
118 self
.unparsedStopNames
.append(stop
)
120 def searchAddress(self
, searchaddr
):
123 self
.searched
.append(searchaddr
)
125 stops
= list(self
.client
.findStops(searchaddr
.address
))
126 except BusStopMashup
.InvalidAddressException
, e
:
127 print >> sys
.stderr
, "\tINVALID ADDR %s (referrer %s)" % (
128 searchaddr
.address
, searchaddr
.referrer
)
133 followCandidates
= []
135 if isinstance(stop
, BusStopMashup
.HomeLocation
):
136 # Record the actual center location, which is probably
137 # slightly off from the location of the stop we fed in
138 # since we're looking at the intersection and not the stop.
139 searchaddr
.location
= stop
.location
140 elif isinstance(stop
, BusStopMashup
.UnknownStopType
):
141 self
.unparsedStops
.append((stop
, searchaddr
))
143 # Even if we've seen a stop before, we should consider it as
144 # a possible search location.
145 followCandidates
.append(stop
)
146 if self
.allStops
.has_key(stop
.code
):
150 self
.allStops
[stop
.code
] = stop
152 print >> sys
.stderr
, "\t%s (referrer %s): %d stops, %d new" % (
153 searchaddr
.address
, searchaddr
.referrer
,
154 oldstops
+ newstops
, newstops
)
156 return followCandidates
160 self
.firstTime
= False
162 time
.sleep(random
.uniform(*self
.sleepRange
))
# Matches a location like 'RIVERSIDE / AD. 2865'.  Trailing whitespace is
# accepted (as it is in _intersection_re) so that such a name cannot fall
# through to the intersection pattern with 'AD. 2865' read as a street name.
_addr_re = r'^(?P<street1>[^/]+?)\s*/\s*AD\.\s*(?P<number>\d+)\s*$'
_addr_rx = re.compile(_addr_re)

# Matches a regular intersection, 'STREET1 / STREET2'.
_intersection_re = r'^(?P<street1>[^/]+?)\s*/\s*(?P<street2>[^/]+?)\s*$'
_intersection_rx = re.compile(_intersection_re)

# Matches a transitway stop.  We'll have to ignore these.
_station_stop_re = r'.*STOP\s*/ ARR.*'
_station_stop_rx = re.compile(_station_stop_re)

# Tulipfest.  Formatting is not consistent, so just look for the keyword
# anywhere in the name and ignore those stops.
_tulip_re = r'TULIPES'
_tulip_rx = re.compile(_tulip_re)
181 print >> sys
.stderr
, \
182 """Usage: ./stopMashupLeech.py [--fast] seed1 [seed2 [...]]
184 Don't use --fast except for brief test runs."""
188 print >> sys
.stderr
, "Exiting after current search completes."
192 sleepRange
= (38, 66)
195 opts
, seeds
= getopt
.getopt(argv
[1:], "fh", ["fast", "help"])
197 if o
in ('-f', '--fast'):
199 elif o
in ('-h', '--help'):
203 except getopt
.GetoptError
, e
:
210 leech
= Leech(sleepRange
)
212 sighandler
= lambda sig
, frame
: doBail(leech
)
213 signal
.signal(signal
.SIGINT
, sighandler
)
214 signal
.signal(signal
.SIGTERM
, sighandler
)
218 leech
.followSeed(seed
)
220 dumpPickle("grabs/allstops.pickle", leech
.allStops
)
222 print "Locations searched: %d" % len(leech
.searched
)
223 print "Stops loaded: %d" % len(leech
.allStops
)
225 print "\nUnparsed stops: %d" % len(leech
.unparsedStops
)
226 for s
in leech
.unparsedStops
:
227 print "\t%s; returned in search for %s" % (s
[0], s
[1].address
)
229 print "\nUnparsed stop names: %d" % len(leech
.unparsedStopNames
)
230 for s
in leech
.unparsedStopNames
:
233 inval
= [sa
for sa
in leech
.searched
if sa
.location
is None]
234 print "\nInvalid locations searched: %d" % len(inval
)
def dumpPickle(filename, data):
    """Write ``data`` to ``filename`` as a pickle.

    filename -- path of the file to create or overwrite.
    data     -- any picklable object.

    NOTE(review): only the open() call of the original body is visible in
    this view; the pickle.dump() step is inferred from the function name and
    its call site -- confirm against the full file.
    """
    # Imported locally so this function does not depend on which pickle
    # implementation the rest of the module imports.
    import pickle

    # Pickle streams are binary data, so open in "wb" rather than "w"; the
    # with-block closes the handle even if dump() raises.
    with open(filename, "wb") as f:
        pickle.dump(data, f)
# Run main() only when executed as a script; whatever main() returns is
# passed to sys.exit() as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))