2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
13 from xml
.etree
import ElementTree
19 class SearchedAddress(object):
20 def __init__(self
, referrer
, address
):
21 self
.referrer
= referrer
22 self
.address
= address
26 return "searchaddr (referrer %s, addr %s, location %s)" % (
27 self
.referrer
, self
.address
, self
.location
)
30 def __init__(self
, sleepRange
):
31 self
.sleepRange
= sleepRange
33 # All stops we've found so far, indexed by code.
36 # Stops whose addresses we've fed back into the web application.
39 # Stops we weren't able to parse properly.
40 # Each entry is a tuple: (UnknownStopType, SearchedAddress)
41 self
.unparsedStops
= []
43 # Stops whose names we weren't able to convert to addresses.
44 self
.unparsedStopNames
= []
46 self
.client
= BusStopMashup
.Client()
48 self
.progressListener
= lambda searchaddr
, leech
: True
def followSeed(self, seed):
    """Run a full recursive search starting from one seed address.

    The seed is wrapped in a SearchedAddress that uses the seed as both
    its referrer and its address, and is handed to recursiveSearch().
    Progress is reported on stderr: a "SEED" line before the search and
    an "END SEED" line afterwards giving how many addresses were
    searched and how many stops were added while following this seed.
    """
    print >> sys.stderr, "SEED %s" % seed

    # Snapshot the current totals so that only this seed's contribution
    # is reported at the end.
    stops_before = len(self.allStops)
    searched_before = len(self.searched)

    self.recursiveSearch(SearchedAddress(seed, seed))

    searched_delta = len(self.searched) - searched_before
    stops_delta = len(self.allStops) - stops_before
    print >> sys.stderr, (
        "END SEED %s: searched %d addresses, found %d new stops"
        % (seed, searched_delta, stops_delta))
67 def recursiveSearch(self
, searchaddr
):
68 candidates
= self
.searchAddress(searchaddr
)
70 # By now sa.location should be filled in with the center location.
71 # Drop out if the address was invalid.
72 if searchaddr
.location
is None:
75 corners
= LatLongTools
.findCorners(searchaddr
.location
,
76 [(stop
.location
, stop
) for stop
in candidates
])
83 addr
= self
.stopToAddress(stop
)
87 if addr
in [sa
.address
for sa
in self
.searched
]:
88 # Already been here. Skip this corner.
91 if radiusCompletelyCoveredByOtherPoints(stop
.location
,
92 [s
.location
for s
in self
.searched
93 if s
is not searchaddr
and s
.location
is not None],
95 # Already been close to here. Skip this corner.
98 newsa
= SearchedAddress(stop
, addr
)
99 if self
.recursiveSearch(newsa
):
100 # Valid address. Stop looking in this corner.
105 def stopToAddress(self
, stop
):
106 # The mashup doesn't like accented characters, so try a simple
108 name
= anglicize(stop
.name
)
110 if not isinstance(stop
, BusStopMashup
.Stop
):
111 # Only regular bus stops tend to have names that we can use as
114 if _station_stop_rx
.match(name
) is not None:
115 # Ignore Transitway station stops.
117 if _tulip_rx
.search(name
) is not None:
118 # Ignore Tulipfest stops (ARRET TULIPES / TULIP STOP / ...)
121 # RIVERSIDE / AD. 2865 -> 2865 RIVERSIDE
122 m
= _addr_rx
.match(name
)
124 return "%s %s" % (m
.group("number"), m
.group("street1"))
126 # BANK / RIVERSIDE -> BANK & RIVERSIDE
127 m
= _intersection_rx
.match(name
)
129 return "%s & %s" % (m
.group("street1"), m
.group("street2"))
131 self
.unparsedStopNames
.append(stop
)
133 def searchAddress(self
, searchaddr
):
136 self
.searched
.append(searchaddr
)
138 stops
= list(self
.client
.findStops(searchaddr
.address
))
139 except BusStopMashup
.InvalidAddressException
, e
:
140 print >> sys
.stderr
, "\tINVALID ADDR %s (referrer %s)" % (
141 searchaddr
.address
, searchaddr
.referrer
)
144 print >> sys
.stderr
, "\tFAILED SEARCH %s: %s (referrer %s)" % (
145 searchaddr
.address
, e
, searchaddr
.referrer
)
150 followCandidates
= []
152 if isinstance(stop
, BusStopMashup
.HomeLocation
):
153 # Record the actual center location, which is probably
154 # slightly off from the location of the stop we fed in
155 # since we're looking at the intersection and not the stop.
156 searchaddr
.location
= stop
.location
157 elif isinstance(stop
, BusStopMashup
.UnknownStopType
):
158 self
.unparsedStops
.append((stop
, searchaddr
))
160 # Even if we've seen a stop before, we should consider it as
161 # a possible search location.
162 followCandidates
.append(stop
)
163 if self
.allStops
.has_key(stop
.code
):
167 self
.allStops
[stop
.code
] = stop
169 print >> sys
.stderr
, "\t%s (referrer %s): %d stops, %d new" % (
170 searchaddr
.address
, searchaddr
.referrer
,
171 oldstops
+ newstops
, newstops
)
173 self
.progressListener(searchaddr
, self
)
175 return followCandidates
179 self
.firstTime
= False
181 time
.sleep(random
.uniform(*self
.sleepRange
))
# Matches a location like 'RIVERSIDE / AD. 2865'.
# Trailing whitespace is tolerated, exactly as in _intersection_re below;
# otherwise a name such as 'RIVERSIDE / AD. 2865 ' would miss this pattern
# and be picked up by the intersection pattern as 'RIVERSIDE & AD. 2865'.
_addr_re = r'^(?P<street1>[^/]+?)\s*/\s*AD\.\s*(?P<number>\d+)\s*$'
_addr_rx = re.compile(_addr_re)

# Matches a regular intersection, 'STREET1 / STREET2'.
_intersection_re = r'^(?P<street1>[^/]+?)\s*/\s*(?P<street2>[^/]+?)\s*$'
_intersection_rx = re.compile(_intersection_re)

# Matches a transitway stop.  We'll have to ignore these.
_station_stop_re = r'.*STOP\s*/ ARR.*'
_station_stop_rx = re.compile(_station_stop_re)

# Tulipfest.  Formatting is not consistent, so these stops are ignored;
# this pattern is meant to be used with search(), not match().
_tulip_re = r'TULIPES'
_tulip_rx = re.compile(_tulip_re)
# Translation table for anglicize(): maps the code points of accented
# Latin-1 letters and ligatures to an unaccented ASCII spelling.  Built
# once at import time rather than on every call.
_anglicize_table = {
    0xc0: u'A', 0xc1: u'A', 0xc2: u'A', 0xc3: u'A', 0xc4: u'A', 0xc5: u'A',
    0xc6: u'AE', 0xc7: u'C',
    0xc8: u'E', 0xc9: u'E', 0xca: u'E', 0xcb: u'E',
    0xcc: u'I', 0xcd: u'I', 0xce: u'I', 0xcf: u'I',
    0xd2: u'O', 0xd3: u'O', 0xd4: u'O', 0xd5: u'O', 0xd6: u'O', 0xd8: u'O',
    0xd9: u'U', 0xda: u'U', 0xdb: u'U', 0xdc: u'U',

    0xe0: u'a', 0xe1: u'a', 0xe2: u'a', 0xe3: u'a', 0xe4: u'a', 0xe5: u'a',
    # Lowercase counterparts of the 0xc6/0xc7 entries above.
    0xe6: u'ae', 0xe7: u'c',
    0xe8: u'e', 0xe9: u'e', 0xea: u'e', 0xeb: u'e',
    0xec: u'i', 0xed: u'i', 0xee: u'i', 0xef: u'i',
    0xf2: u'o', 0xf3: u'o', 0xf4: u'o', 0xf5: u'o', 0xf6: u'o', 0xf8: u'o',
    0xf9: u'u', 0xfa: u'u', 0xfb: u'u', 0xfc: u'u',
    # 0xfd is y-acute.  It used to be listed twice (once as 'u', once as
    # 'y'); only the later 'y' entry ever took effect, so that one is kept.
    0xfd: u'y',
}


def anglicize(ucodestr):
    """Return *ucodestr* with accents stripped, encoded as ASCII.

    Each character listed in _anglicize_table is replaced by its
    unaccented spelling (ligatures expand to two letters); every other
    character is left untouched.  The result is then encoded with the
    "ascii" codec.

    ucodestr -- a Unicode string (the table is keyed by code point).

    Raises UnicodeEncodeError if a non-ASCII character that is not in
    the table remains after translation.
    """
    return ucodestr.translate(_anglicize_table).encode("ascii")
217 def radiusCompletelyCoveredByOtherPoints(location
, locations
, width
, height
):
218 p
= RectTools
.PolyRect([RectTools
.Rectangle(location
.longitude
- width
/2,
219 location
.latitude
- height
/2,
222 # Require at least 2% new coverage.
223 min_area
= 0.02 * width
* height
225 p
= p
.subtract(RectTools
.Rectangle(i
.longitude
- width
/2,
226 i
.latitude
- width
/2,
229 if p
.area() < min_area
:
232 return p
.area() < min_area
235 print >> sys
.stderr
, \
236 """Usage: ./stopMashupLeech.py [--fast] seed1 [seed2 [...]]
238 Don't use --fast except for brief test runs."""
242 print >> sys
.stderr
, "Exiting after current search completes."
245 def progressListener(searchaddr
, leech
):
249 dumpXML(STOPS_XML
, leech
.allStops
)
250 dumpSearchedXML(SEARCHADDR_XML
, leech
.searched
)
252 def tryResume(leech
):
254 stopf
= open(STOPS_XML
, "r")
255 leech
.allStops
= unxmlifyStops(stopf
)
257 print >> sys
.stderr
, "Loaded %d previous stops." % len(leech
.allStops
)
259 searchf
= open(SEARCHADDR_XML
, "r")
260 leech
.searched
= unxmlifySearched(searchf
)
262 print >> sys
.stderr
, "Loaded %d previously searched addresses." % (
266 # XML parser exceptions will cause a program exit
269 sleepRange
= (38, 66)
272 opts
, seeds
= getopt
.getopt(argv
[1:], "fh", ["fast", "help"])
274 if o
in ('-f', '--fast'):
276 elif o
in ('-h', '--help'):
280 except getopt
.GetoptError
, e
:
287 leech
= Leech(sleepRange
)
288 leech
.progressListener
= progressListener
292 sighandler
= lambda sig
, frame
: doBail(leech
)
293 signal
.signal(signal
.SIGINT
, sighandler
)
294 signal
.signal(signal
.SIGTERM
, sighandler
)
298 leech
.followSeed(seed
)
300 print "Locations searched: %d" % len(leech
.searched
)
301 print "Stops loaded: %d" % len(leech
.allStops
)
303 print "\nUnparsed stops: %d" % len(leech
.unparsedStops
)
304 for s
in leech
.unparsedStops
:
305 print "\t%s; returned in search for %s" % (s
[0], s
[1].address
)
307 print "\nUnparsed stop names: %d" % len(leech
.unparsedStopNames
)
308 for s
in leech
.unparsedStopNames
:
309 print "\t%s" % repr(s
)
311 inval
= [sa
for sa
in leech
.searched
if sa
.location
is None]
312 print "\nInvalid locations searched: %d" % len(inval
)
314 print "\t%s" % repr(i
)
318 def safeWrite(filename
, data
, dumpfunc
):
319 (fd
, n
) = tempfile
.mkstemp(dir='.', prefix
=filename
)
320 f
= os
.fdopen(fd
, "w")
def dumpXML(filename, data):
    """Save the stops in *data* to *filename* as XML.

    filename -- path of the file to write.
    data     -- the stops to serialize; passed unchanged to xmlifyStops(),
                which iterates it as (code, stop) pairs.

    The actual writing is delegated to safeWrite(), with xmlifyStops()
    supplied as the serializer.
    """
    safeWrite(filename, data, dumpfunc=xmlifyStops)
def dumpSearchedXML(filename, data):
    """Save the searched addresses in *data* to *filename* as XML.

    filename -- path of the file to write.
    data     -- the searched addresses to serialize; passed unchanged to
                xmlifySearched().

    The actual writing is delegated to safeWrite(), with xmlifySearched()
    supplied as the serializer.
    """
    safeWrite(filename, data, dumpfunc=xmlifySearched)
337 def xmlifyStops(data
, f
):
338 root
= ElementTree
.Element("stops")
341 for (code
, stop
) in data
.iteritems():
343 attribs
['code'] = code
345 if stop
.number
is not None:
346 attribs
['number'] = str(stop
.number
)
347 if stop
.name
is not None:
348 attribs
['name'] = stop
.name
349 if stop
.location
is not None:
350 attribs
['latitude'] = str(stop
.location
.latitude
)
351 attribs
['longitude'] = str(stop
.location
.longitude
)
352 if stop
.requestedAddress
is not None:
353 attribs
['searchLocation'] = stop
.requestedAddress
355 if isinstance(stop
, BusStopMashup
.Stop
):
356 attribs
['type'] = 'stop'
357 elif isinstance(stop
, BusStopMashup
.Station
):
358 attribs
['type'] = 'station'
362 stopel
= ElementTree
.SubElement(root
, "stop", attribs
)
365 if isinstance(stop
, BusStopMashup
.Stop
):
366 routeroot
= ElementTree
.SubElement(stopel
, "routes")
367 for route
in stop
.routes
:
368 ElementTree
.SubElement(routeroot
, "route",
369 { 'number': str(route
.number
),
370 'direction': str(route
.direction
) })
372 tree
= ElementTree
.ElementTree(root
)
373 tree
.write(f
, "utf-8")
375 def unxmlifyStops(f
):
378 etree
= ElementTree
.parse(f
)
379 for stopel
in etree
.findall("stop"):
380 type = stopel
.get("type")
381 if type == "station":
382 stop
= BusStopMashup
.Station()
384 stop
= BusStopMashup
.Stop()
386 stop
.code
= stopel
.get("code")
387 if stop
.code
is None:
390 stop
.number
= stopel
.get("number")
391 stop
.name
= stopel
.get("name")
393 lat
= stopel
.get("latitude")
394 lng
= stopel
.get("longitude")
396 stop
.location
= BusStopMashup
.StopLocation(lat
, lng
)
398 stop
.requestedAddress
= stopel
.get("searchLocation")
400 if isinstance(stop
, BusStopMashup
.Stop
):
401 for routeel
in stopel
.find("routes").findall("route"):
403 BusStopMashup
.StopRoute(
404 routeel
.get("number"), routeel
.get("direction")))
406 # For consistency, make all strings unicode if they aren't already.
408 stop
.code
= unicode(stop
.code
)
410 stop
.name
= unicode(stop
.name
)
412 stops
[stop
.code
] = stop
417 def xmlifySearched(data
, f
):
418 root
= ElementTree
.Element("searched")
421 if sa
.location
is None:
422 attribs
= { "invalid": "1" }
424 attribs
= { "latitude": str(sa
.location
.latitude
),
425 "longitude": str(sa
.location
.longitude
) }
427 if sa
.address
is not None:
428 attribs
['address'] = sa
.address
430 if sa
.referrer
is not None:
431 attribs
['referrer'] = str(sa
.referrer
)
433 sel
= ElementTree
.SubElement(root
, "address", attribs
)
436 tree
= ElementTree
.ElementTree(root
)
437 tree
.write(f
, "utf-8")
439 def unxmlifySearched(f
):
442 etree
= ElementTree
.parse(f
)
443 for searchel
in etree
.findall("address"):
444 sa
= SearchedAddress(searchel
.get("referrer"), searchel
.get("address"))
446 lat
= searchel
.get("latitude")
447 lng
= searchel
.get("longitude")
449 sa
.location
= BusStopMashup
.StopLocation(lat
, lng
)
# Files used to persist progress between runs: the progress listener dumps
# to them and tryResume() reads them back.
SEARCHADDR_XML = "grabs/allsearched.xml"   # every address searched so far
STOPS_XML = "grabs/allstops.xml"           # every stop found so far

if __name__ == '__main__':
    # Script entry point: main()'s return value becomes the exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)