2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
13 from xml
.etree
import ElementTree
22 class SearchedAddress(object):
23 def __init__(self
, referrer
, address
):
24 self
.referrer
= referrer
25 self
.address
= address
29 return "searchaddr (referrer %s, addr %s, location %s)" % (
30 self
.referrer
, self
.address
, self
.location
)
33 def __init__(self
, sleepRange
):
34 self
.sleepRange
= sleepRange
36 # All stops we've found so far, indexed by code.
39 # Stops whose addresses we've fed back into the web application.
42 # Stops we weren't able to parse properly.
43 # Each entry is a tuple: (UnknownStopType, SearchedAddress)
44 self
.unparsedStops
= []
46 # Stops whose names we weren't able to convert to addresses.
47 self
.unparsedStopNames
= []
49 self
.client
= BusStopMashup
.Client()
51 self
.progressListener
= lambda searchaddr
, leech
: True
def followSeed(self, seed):
    """Run a recursive search rooted at `seed`, logging progress to stderr.

    The seed is used as both the referrer and the address of the initial
    SearchedAddress.  Once the search returns, the number of addresses
    searched and stops found since the call began is reported.
    """
    print >> sys.stderr, "SEED %s" % seed

    # Snapshot the current totals so the summary reports only the growth
    # caused by this seed.
    stopsBefore = len(self.allStops)
    searchedBefore = len(self.searched)

    self.recursiveSearch(SearchedAddress(seed, seed))

    searchedDelta = len(self.searched) - searchedBefore
    stopsDelta = len(self.allStops) - stopsBefore
    print >> sys.stderr, \
        "END SEED %s: searched %d addresses, found %d new stops" % (
            seed, searchedDelta, stopsDelta)
70 def recursiveSearch(self
, searchaddr
):
71 candidates
= self
.searchAddress(searchaddr
)
73 # By now sa.location should be filled in with the center location.
74 # Drop out if the address was invalid.
75 if searchaddr
.location
is None:
78 corners
= LatLongTools
.findCorners(searchaddr
.location
,
79 [(stop
.location
, stop
) for stop
in candidates
])
86 addr
= self
.stopToAddress(stop
)
90 if addr
in [sa
.address
for sa
in self
.searched
]:
91 # Already been here. Skip this corner.
94 if radiusCompletelyCoveredByOtherPoints(stop
.location
,
95 [s
.location
for s
in self
.searched
96 if s
is not searchaddr
and s
.location
is not None],
98 # Already been close to here. Skip this corner.
101 newsa
= SearchedAddress(stop
, addr
)
102 if self
.recursiveSearch(newsa
):
103 # Valid address. Stop looking in this corner.
108 def stopToAddress(self
, stop
):
109 # The mashup doesn't like accented characters, so try a simple
111 name
= anglicize(stop
.name
)
113 if not isinstance(stop
, BusStopMashup
.Stop
):
114 # Only regular bus stops tend to have names that we can use as
117 if _station_stop_rx
.match(name
) is not None:
118 # Ignore Transitway station stops.
120 if _tulip_rx
.search(name
) is not None:
121 # Ignore Tulipfest stops (ARRET TULIPES / TULIP STOP / ...)
124 # RIVERSIDE / AD. 2865 -> 2865 RIVERSIDE
125 m
= _addr_rx
.match(name
)
127 return "%s %s" % (m
.group("number"), m
.group("street1"))
129 # BANK / RIVERSIDE -> BANK & RIVERSIDE
130 m
= _intersection_rx
.match(name
)
132 return "%s & %s" % (m
.group("street1"), m
.group("street2"))
134 self
.unparsedStopNames
.append(stop
)
136 def searchAddress(self
, searchaddr
):
139 self
.searched
.append(searchaddr
)
141 stops
= list(self
.client
.findStops(searchaddr
.address
))
142 except BusStopMashup
.InvalidAddressException
, e
:
143 print >> sys
.stderr
, "\tINVALID ADDR %s (referrer %s)" % (
144 searchaddr
.address
, searchaddr
.referrer
)
147 print >> sys
.stderr
, "\tFAILED SEARCH %s: %s (referrer %s)" % (
148 searchaddr
.address
, e
, searchaddr
.referrer
)
153 followCandidates
= []
155 if isinstance(stop
, BusStopMashup
.HomeLocation
):
156 # Record the actual center location, which is probably
157 # slightly off from the location of the stop we fed in
158 # since we're looking at the intersection and not the stop.
159 searchaddr
.location
= stop
.location
160 elif isinstance(stop
, BusStopMashup
.UnknownStopType
):
161 self
.unparsedStops
.append((stop
, searchaddr
))
163 # Even if we've seen a stop before, we should consider it as
164 # a possible search location.
165 followCandidates
.append(stop
)
166 if self
.allStops
.has_key(stop
.code
):
170 self
.allStops
[stop
.code
] = stop
172 print >> sys
.stderr
, "\t%s (referrer %s): %d stops, %d new" % (
173 searchaddr
.address
, searchaddr
.referrer
,
174 oldstops
+ newstops
, newstops
)
176 self
.progressListener(searchaddr
, self
)
178 return followCandidates
182 self
.firstTime
= False
184 time
.sleep(random
.uniform(*self
.sleepRange
))
186 # Matches a location like 'RIVERSIDE / AD. 2865'
187 _addr_re
= (r
'^(?P<street1>[^/]+?)\s*/\s*AD\.\s*(?P<number>\d+)$')
188 _addr_rx
= re
.compile(_addr_re
)
190 # Matches a regular intersection, 'STREET1 / STREET2'
191 _intersection_re
= (r
'^(?P<street1>[^/]+?)\s*/\s*(?P<street2>[^/]+?)\s*$')
192 _intersection_rx
= re
.compile(_intersection_re
)
194 # Matches a transitway stop. We'll have to ignore these.
195 _station_stop_re
= (r
'.*STOP\s*/ ARR.*')
196 _station_stop_rx
= re
.compile(_station_stop_re
)
198 # Tulipfest. Formatting is not consistent; ignore
199 _tulip_re
= (r
'TULIPES')
200 _tulip_rx
= re
.compile(_tulip_re
)
# Translation table used by anglicize(): Latin-1 accented code points mapped
# to unaccented ASCII replacements.  Built once at import time instead of on
# every call.
_ANGLICIZE_TABLE = {
    # Upper case.
    0xc0: u'A', 0xc1: u'A', 0xc2: u'A', 0xc3: u'A', 0xc4: u'A', 0xc5: u'A',
    0xc6: u'AE', 0xc7: u'C',
    0xc8: u'E', 0xc9: u'E', 0xca: u'E', 0xcb: u'E',
    0xcc: u'I', 0xcd: u'I', 0xce: u'I', 0xcf: u'I',
    0xd1: u'N',
    0xd2: u'O', 0xd3: u'O', 0xd4: u'O', 0xd5: u'O', 0xd6: u'O', 0xd8: u'O',
    0xd9: u'U', 0xda: u'U', 0xdb: u'U', 0xdc: u'U',
    0xdd: u'Y',

    # Lower case, mirroring the upper-case rows above.
    0xe0: u'a', 0xe1: u'a', 0xe2: u'a', 0xe3: u'a', 0xe4: u'a', 0xe5: u'a',
    0xe6: u'ae', 0xe7: u'c',
    0xe8: u'e', 0xe9: u'e', 0xea: u'e', 0xeb: u'e',
    0xec: u'i', 0xed: u'i', 0xee: u'i', 0xef: u'i',
    0xf1: u'n',
    0xf2: u'o', 0xf3: u'o', 0xf4: u'o', 0xf5: u'o', 0xf6: u'o', 0xf8: u'o',
    0xf9: u'u', 0xfa: u'u', 0xfb: u'u', 0xfc: u'u',
    # 0xfd (y with acute) maps to 'y' only; it must not also appear in the
    # 'u' row, where a duplicate key would be silently overridden.
    0xfd: u'y', 0xff: u'y',
}


def anglicize(ucodestr):
    """Replace accented Latin-1 letters with plain ASCII and encode as ASCII.

    Parameters:
        ucodestr: the unicode string to convert.

    Returns:
        The converted text encoded as ASCII (a byte string).

    Raises:
        UnicodeEncodeError: if the text still contains a non-ASCII character
            after translation, i.e. one with no entry in the table.
    """
    return ucodestr.translate(_ANGLICIZE_TABLE).encode("ascii")
220 def radiusCompletelyCoveredByOtherPoints(location
, locations
, width
, height
):
221 p
= RectTools
.PolyRect([RectTools
.Rectangle(location
.longitude
- width
/2,
222 location
.latitude
- height
/2,
225 # Require at least 2% new coverage.
226 min_area
= 0.02 * width
* height
228 p
= p
.subtract(RectTools
.Rectangle(i
.longitude
- width
/2,
229 i
.latitude
- width
/2,
232 if p
.area() < min_area
:
235 return p
.area() < min_area
238 print >> sys
.stderr
, \
239 """Usage: ./stopMashupLeech.py [--fast] seed1 [seed2 [...]]
241 Don't use --fast except for brief test runs."""
245 print >> sys
.stderr
, "Exiting after current search completes."
248 def progressListener(searchaddr
, leech
):
252 dumpXML(STOPS_XML
, leech
.allStops
)
253 dumpSearchedXML(SEARCHADDR_XML
, leech
.searched
)
255 def tryResume(leech
):
257 stopf
= open(STOPS_XML
, "r")
258 leech
.allStops
= unxmlifyStops(stopf
)
260 print >> sys
.stderr
, "Loaded %d previous stops." % len(leech
.allStops
)
262 searchf
= open(SEARCHADDR_XML
, "r")
263 leech
.searched
= unxmlifySearched(searchf
)
265 print >> sys
.stderr
, "Loaded %d previously searched addresses." % (
269 # XML parser exceptions will cause a program exit
272 sleepRange
= (38, 66)
275 opts
, seeds
= getopt
.getopt(argv
[1:], "fh", ["fast", "help"])
277 if o
in ('-f', '--fast'):
279 elif o
in ('-h', '--help'):
283 except getopt
.GetoptError
, e
:
290 leech
= Leech(sleepRange
)
291 leech
.progressListener
= progressListener
295 sighandler
= lambda sig
, frame
: doBail(leech
)
296 signal
.signal(signal
.SIGINT
, sighandler
)
297 signal
.signal(signal
.SIGTERM
, sighandler
)
301 leech
.followSeed(seed
)
303 print "Locations searched: %d" % len(leech
.searched
)
304 print "Stops loaded: %d" % len(leech
.allStops
)
306 print "\nUnparsed stops: %d" % len(leech
.unparsedStops
)
307 for s
in leech
.unparsedStops
:
308 print "\t%s; returned in search for %s" % (s
[0], s
[1].address
)
310 print "\nUnparsed stop names: %d" % len(leech
.unparsedStopNames
)
311 for s
in leech
.unparsedStopNames
:
312 print "\t%s" % repr(s
)
314 inval
= [sa
for sa
in leech
.searched
if sa
.location
is None]
315 print "\nInvalid locations searched: %d" % len(inval
)
317 print "\t%s" % repr(i
)
321 def safeWrite(filename
, data
, dumpfunc
):
322 (fd
, n
) = tempfile
.mkstemp(dir='.', prefix
=filename
)
323 f
= os
.fdopen(fd
, "w")
def dumpXML(filename, data):
    """Write `data` to `filename` through safeWrite, serialized by xmlifyStops."""
    safeWrite(filename, data, dumpfunc=xmlifyStops)
def dumpSearchedXML(filename, data):
    """Write `data` to `filename` through safeWrite, serialized by xmlifySearched."""
    safeWrite(filename, data, dumpfunc=xmlifySearched)
340 def xmlifyStops(data
, f
):
341 root
= ElementTree
.Element("stops")
344 for (code
, stop
) in data
.iteritems():
346 attribs
['code'] = code
348 if stop
.number
is not None:
349 attribs
['number'] = str(stop
.number
)
350 if stop
.name
is not None:
351 attribs
['name'] = stop
.name
352 if stop
.location
is not None:
353 attribs
['latitude'] = str(stop
.location
.latitude
)
354 attribs
['longitude'] = str(stop
.location
.longitude
)
355 if stop
.requestedAddress
is not None:
356 attribs
['searchLocation'] = stop
.requestedAddress
358 if isinstance(stop
, BusStopMashup
.Stop
):
359 attribs
['type'] = 'stop'
360 elif isinstance(stop
, BusStopMashup
.Station
):
361 attribs
['type'] = 'station'
365 stopel
= ElementTree
.SubElement(root
, "stop", attribs
)
368 if isinstance(stop
, BusStopMashup
.Stop
):
369 routeroot
= ElementTree
.SubElement(stopel
, "routes")
370 for route
in stop
.routes
:
371 ElementTree
.SubElement(routeroot
, "route",
372 { 'number': str(route
.number
),
373 'direction': str(route
.direction
) })
375 tree
= ElementTree
.ElementTree(root
)
376 tree
.write(f
, "utf-8")
378 def unxmlifyStops(f
):
381 etree
= ElementTree
.parse(f
)
382 for stopel
in etree
.findall("stop"):
383 type = stopel
.get("type")
384 if type == "station":
385 stop
= BusStopMashup
.Station()
387 stop
= BusStopMashup
.Stop()
389 stop
.code
= stopel
.get("code")
390 if stop
.code
is None:
393 stop
.number
= stopel
.get("number")
394 stop
.name
= stopel
.get("name")
396 lat
= stopel
.get("latitude")
397 lng
= stopel
.get("longitude")
399 stop
.location
= BusStopMashup
.StopLocation(lat
, lng
)
401 stop
.requestedAddress
= stopel
.get("searchLocation")
403 if isinstance(stop
, BusStopMashup
.Stop
):
404 for routeel
in stopel
.find("routes").findall("route"):
406 BusStopMashup
.StopRoute(
407 routeel
.get("number"), routeel
.get("direction")))
409 # For consistency, make all strings unicode if they aren't already.
411 stop
.code
= unicode(stop
.code
)
413 stop
.name
= unicode(stop
.name
)
415 stops
[stop
.code
] = stop
420 def xmlifySearched(data
, f
):
421 root
= ElementTree
.Element("searched")
424 if sa
.location
is None:
425 attribs
= { "invalid": "1" }
427 attribs
= { "latitude": str(sa
.location
.latitude
),
428 "longitude": str(sa
.location
.longitude
) }
430 if sa
.address
is not None:
431 attribs
['address'] = sa
.address
433 if sa
.referrer
is not None:
434 attribs
['referrer'] = str(sa
.referrer
)
436 sel
= ElementTree
.SubElement(root
, "address", attribs
)
439 tree
= ElementTree
.ElementTree(root
)
440 tree
.write(f
, "utf-8")
442 def unxmlifySearched(f
):
445 etree
= ElementTree
.parse(f
)
446 for searchel
in etree
.findall("address"):
447 sa
= SearchedAddress(searchel
.get("referrer"), searchel
.get("address"))
449 lat
= searchel
.get("latitude")
450 lng
= searchel
.get("longitude")
452 sa
.location
= BusStopMashup
.StopLocation(lat
, lng
)
# Output files, given as paths relative to the current working directory.
# STOPS_XML holds the stops found so far; SEARCHADDR_XML holds the addresses
# that have been searched.
STOPS_XML = "out/allstops.xml"
SEARCHADDR_XML = "out/allsearched.xml"

# Script entry point: main()'s return value becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv))