2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
13 from xml
.etree
import ElementTree
19 class SearchedAddress(object):
20 def __init__(self
, referrer
, address
):
21 self
.referrer
= referrer
22 self
.address
= address
26 return "searchaddr (referrer %s, addr %s, location %s)" % (
27 self
.referrer
, self
.address
, self
.location
)
30 def __init__(self
, sleepRange
):
31 self
.sleepRange
= sleepRange
33 # All stops we've found so far, indexed by code.
36 # Stops whose addresses we've fed back into the web application.
39 # Stops we weren't able to parse properly.
40 # Each entry is a tuple: (UnknownStopType, SearchedAddress)
41 self
.unparsedStops
= []
43 # Stops whose names we weren't able to convert to addresses.
44 self
.unparsedStopNames
= []
46 self
.client
= BusStopMashup
.Client()
48 self
.progressListener
= lambda searchaddr
, leech
: True
55 def followSeed(self
, seed
):
56 print >> sys
.stderr
, "SEED %s" % seed
58 oall
= len(self
.allStops
)
59 osearch
= len(self
.searched
)
61 sa
= SearchedAddress(seed
, seed
)
62 self
.recursiveSearch(sa
)
63 print "END SEED %s: searched %d addresses, found %d new stops" % (
64 seed
, len(self
.searched
) - osearch
, len(self
.allStops
) - oall
)
66 def recursiveSearch(self
, searchaddr
):
67 candidates
= self
.searchAddress(searchaddr
)
69 # By now sa.location should be filled in with the center location.
70 # Drop out if the address was invalid.
71 if searchaddr
.location
is None:
74 corners
= LatLongTools
.findCorners(searchaddr
.location
,
75 [(stop
.location
, stop
) for stop
in candidates
])
82 addr
= self
.stopToAddress(stop
)
86 if addr
in [sa
.address
for sa
in self
.searched
]:
89 if radiusCompletelyCoveredByOtherPoints(stop
.location
,
90 [s
.location
for s
in self
.searched
91 if s
is not searchaddr
and s
.location
is not None],
93 # Already been close to here. Skip.
96 newsa
= SearchedAddress(stop
, addr
)
97 if self
.recursiveSearch(newsa
):
98 # Valid address. Stop looking in this corner.
103 def stopToAddress(self
, stop
):
104 if not isinstance(stop
, BusStopMashup
.Stop
):
105 # Only regular bus stops tend to have names that we can use as
108 if _station_stop_rx
.match(stop
.name
) is not None:
109 # Ignore Transitway station stops.
111 if _tulip_rx
.search(stop
.name
) is not None:
112 # Ignore Tulipfest stops (ARRET TULIPES / TULIP STOP / ...)
115 # RIVERSIDE / AD. 2865 -> 2865 RIVERSIDE
116 m
= _addr_rx
.match(stop
.name
)
118 return "%s %s" % (m
.group("number"), m
.group("street1"))
120 # BANK / RIVERSIDE -> BANK & RIVERSIDE
121 m
= _intersection_rx
.match(stop
.name
)
123 return "%s & %s" % (m
.group("street1"), m
.group("street2"))
125 self
.unparsedStopNames
.append(stop
)
127 def searchAddress(self
, searchaddr
):
130 self
.searched
.append(searchaddr
)
132 stops
= list(self
.client
.findStops(searchaddr
.address
))
133 except BusStopMashup
.InvalidAddressException
, e
:
134 print >> sys
.stderr
, "\tINVALID ADDR %s (referrer %s)" % (
135 searchaddr
.address
, searchaddr
.referrer
)
140 followCandidates
= []
142 if isinstance(stop
, BusStopMashup
.HomeLocation
):
143 # Record the actual center location, which is probably
144 # slightly off from the location of the stop we fed in
145 # since we're looking at the intersection and not the stop.
146 searchaddr
.location
= stop
.location
147 elif isinstance(stop
, BusStopMashup
.UnknownStopType
):
148 self
.unparsedStops
.append((stop
, searchaddr
))
150 # Even if we've seen a stop before, we should consider it as
151 # a possible search location.
152 followCandidates
.append(stop
)
153 if self
.allStops
.has_key(stop
.code
):
157 self
.allStops
[stop
.code
] = stop
159 print >> sys
.stderr
, "\t%s (referrer %s): %d stops, %d new" % (
160 searchaddr
.address
, searchaddr
.referrer
,
161 oldstops
+ newstops
, newstops
)
163 self
.progressListener(searchaddr
, self
)
165 return followCandidates
169 self
.firstTime
= False
171 time
.sleep(random
.uniform(*self
.sleepRange
))
# Patterns applied to stop names.  Each one is kept both as pattern text
# (_*_re) and in compiled form (_*_rx).

# Matches a street-number location like 'RIVERSIDE / AD. 2865'.
# Trailing whitespace is accepted, as it is in _intersection_re; otherwise a
# padded name would fail here and then match the intersection pattern with
# 'AD. 2865' captured as the second street.
_addr_re = r'^(?P<street1>[^/]+?)\s*/\s*AD\.\s*(?P<number>\d+)\s*$'
_addr_rx = re.compile(_addr_re)

# Matches a regular intersection, 'STREET1 / STREET2'.
_intersection_re = r'^(?P<street1>[^/]+?)\s*/\s*(?P<street2>[^/]+?)\s*$'
_intersection_rx = re.compile(_intersection_re)

# Matches a Transitway station stop.  These have to be ignored.
_station_stop_re = r'.*STOP\s*/ ARR.*'
_station_stop_rx = re.compile(_station_stop_re)

# Tulipfest stops.  Their formatting is not consistent, so they are ignored.
_tulip_re = r'TULIPES'
_tulip_rx = re.compile(_tulip_re)
190 def radiusCompletelyCoveredByOtherPoints(location
, locations
, width
, height
):
191 p
= RectTools
.PolyRect([RectTools
.Rectangle(location
.longitude
- width
/2,
192 location
.latitude
- height
/2,
197 p
.subtract(RectTools
.Rectangle(i
.longitude
- width
/2,
198 i
.latitude
- width
/2,
201 if p
.area() < min_area
:
204 return p
.area() < min_area
207 print >> sys
.stderr
, \
208 """Usage: ./stopMashupLeech.py [--fast] seed1 [seed2 [...]]
210 Don't use --fast except for brief test runs."""
214 print >> sys
.stderr
, "Exiting after current search completes."
217 def progressListener(searchaddr
, leech
):
221 dumpPickle("grabs/allstops.pickle", leech
.allStops
)
222 dumpXML("grabs/allstops.xml", leech
.allStops
)
223 dumpSearchedXML("grabs/allsearched.xml", leech
.searched
)
227 sleepRange
= (38, 66)
230 opts
, seeds
= getopt
.getopt(argv
[1:], "fh", ["fast", "help"])
232 if o
in ('-f', '--fast'):
234 elif o
in ('-h', '--help'):
238 except getopt
.GetoptError
, e
:
245 leech
= Leech(sleepRange
)
246 leech
.progressListener
= progressListener
248 sighandler
= lambda sig
, frame
: doBail(leech
)
249 signal
.signal(signal
.SIGINT
, sighandler
)
250 signal
.signal(signal
.SIGTERM
, sighandler
)
254 leech
.followSeed(seed
)
256 print "Locations searched: %d" % len(leech
.searched
)
257 print "Stops loaded: %d" % len(leech
.allStops
)
259 print "\nUnparsed stops: %d" % len(leech
.unparsedStops
)
260 for s
in leech
.unparsedStops
:
261 print "\t%s; returned in search for %s" % (s
[0], s
[1].address
)
263 print "\nUnparsed stop names: %d" % len(leech
.unparsedStopNames
)
264 for s
in leech
.unparsedStopNames
:
267 inval
= [sa
for sa
in leech
.searched
if sa
.location
is None]
268 print "\nInvalid locations searched: %d" % len(inval
)
def dumpPickle(filename, data):
    """Write data to filename as a pickle.

    The actual file handling is delegated to safeWrite, with pickle.dump
    supplied as the serialiser.
    """
    safeWrite(filename, data, dumpfunc=pickle.dump)
277 def safeWrite(filename
, data
, dumpfunc
):
278 (fd
, n
) = tempfile
.mkstemp(dir='.', prefix
=filename
)
279 f
= os
.fdopen(fd
, "w")
def dumpXML(filename, data):
    """Write the stops in data to filename as XML.

    The actual file handling is delegated to safeWrite, with xmlify
    supplied as the serialiser.
    """
    safeWrite(filename, data, dumpfunc=xmlify)
def dumpSearchedXML(filename, data):
    """Write the searched addresses in data to filename as XML.

    The actual file handling is delegated to safeWrite, with xmlifySearched
    supplied as the serialiser.
    """
    safeWrite(filename, data, dumpfunc=xmlifySearched)
297 root
= ElementTree
.Element("stops")
299 for (code
, stop
) in data
.iteritems():
301 attribs
['code'] = code
303 if stop
.number
is not None:
304 attribs
['number'] = str(stop
.number
)
305 if stop
.name
is not None:
306 attribs
['name'] = stop
.name
307 if stop
.location
is not None:
308 attribs
['latitude'] = str(stop
.location
.latitude
)
309 attribs
['longitude'] = str(stop
.location
.longitude
)
311 if isinstance(stop
, BusStopMashup
.Stop
):
312 attribs
['type'] = 'stop'
313 elif isinstance(stop
, BusStopMashup
.Station
):
314 attribs
['type'] = 'station'
315 elif isinstance(stop
, BusStopMashup
.HomeLocation
):
316 attribs
['type'] = 'home'
317 attribs
['requested'] = stop
.requestedAddress
318 attribs
['responded'] = stop
.respondedAddress
319 elif isinstance(stop
, BusStopMashup
.UnknownStopType
):
320 attribs
['type'] = 'unknown'
321 attribs
['html'] = stop
.html
323 stopel
= ElementTree
.SubElement(root
, "stop", attribs
)
326 if isinstance(stop
, BusStopMashup
.Stop
):
327 routeroot
= ElementTree
.SubElement(stopel
, "routes")
328 for route
in stop
.routes
:
329 ElementTree
.SubElement(routeroot
, "route",
330 { 'number': str(route
.number
),
331 'direction': str(route
.direction
) })
333 tree
= ElementTree
.ElementTree(root
)
334 tree
.write(f
, "utf-8")
336 def xmlifySearched(data
, f
):
337 root
= ElementTree
.Element("searched")
340 if sa
.location
is None:
342 attribs
= { "latitude": str(sa
.location
.latitude
),
343 "longitude": str(sa
.location
.longitude
) }
345 if sa
.address
is not None:
346 attribs
['address'] = sa
.address
348 if sa
.referrer
is not None:
349 attribs
['referrer'] = str(sa
.referrer
)
351 sel
= ElementTree
.SubElement(root
, "address", attribs
)
354 tree
= ElementTree
.ElementTree(root
)
355 tree
.write(f
, "utf-8")
if __name__ == '__main__':
    # Script entry point: whatever main returns becomes the exit status.
    raise SystemExit(main(sys.argv))