# vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
import re
from HTMLParser import HTMLParser, HTMLParseError

from PlannerExceptions import *
def __init__(self, start, end, time, html):
    """Parse a planner result page into a list of itinerary entries.

    start, end, time: part of the constructor signature; the statements
        visible in this block do not use them.
    html: markup of the result page.  The itinerary table is cut out of
        it with self._extractItin() and fed to an ItineraryParser.

    On return, self.entries refers to the parser's list of entries.
    Nothing is caught here, so exceptions from _extractItin() or from the
    parser propagate to the caller.
    """
    itinParser = ItineraryParser()

    # Extract the HTML chunk for the actual itinerary, and feed it to
    # the parser.
    itinParser.feed(self._extractItin(html))

    self.entries = itinParser.entries
def _extractItin(self, html):
    """Return the itinerary table markup found in html.

    The module-level pattern _itin_rx is searched for in html and the text
    of its "itin" group is returned.

    Raises ItineraryParseException if the pattern does not match.
    """
    match = _itin_rx.search(html)
    if match is None:
        # Only a failed search is an error; raising unconditionally would
        # make the return below unreachable.
        raise ItineraryParseException("Failed to extract itinerary")
    return match.group("itin")
34 # Matches the entire itinerary, pulling the table into group "itin"
35 _itin_re
= ('<!-- begin graphical itin.*?>\s*'
36 # ?s: DOTALL: . matches \n
37 '(?s)(?P<itin><table.*</table>)\s*'
38 '<!-- end graphical itin')
39 _itin_rx
= sre
.compile(_itin_re
)
class ItineraryParser(HTMLParser):
    """HTML parser that turns the itinerary table into ItinEntry objects.

    The parser is a small state machine driven by the STATE_* constants:
    text inside <td class="itinText..."> becomes one ItinEntry, and a stop
    number found in a following <span class="itinBusStop"> is attached to
    the most recent entry.  Results accumulate in self.entries.

    NOTE(review): several statements of this class were not visible in the
    reviewed text and have been filled in from how the attributes are used
    by the other methods; each such place is marked below -- confirm them
    against the upstream file.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.state = STATE_NOTHING

        # Text collected by handle_data() for the element being read.
        self.data = ""
        # ItinEntry objects in document order.
        self.entries = []
        # Most recent ItinEntry, to which a bus stop number may be added
        self.lastIE = None

    def handle_starttag(self, tag, attrs):
        """Enter a collecting state when an itinerary element opens."""
        # The directions are contained in <td class="itinText...">.
        if tag == "td" and attrStarts(attrs, "class", "itinText"):
            # NOTE(review): reset inferred -- _saveTextEntry() consumes
            # self.data as the text of exactly one entry.
            self.data = ""
            self.state = STATE_IN_ITIN_TEXT

        # The stop numbers are inside a block like this:
        # <span class="itinBusStop">O-TRAIN CARLETON S.</span>
        # <span class="itinBusStop"><a href="#" onClick="MM_openBrWindow('get.stop.timetable.oci?sptDate=2005-06-05&stop560=3062&stopLabel=CG995&stopName=O-TRAIN%2520CARLETON%2520S.','_stopWindow','scrollbars=yes,resizable=yes,width=800,height=600')">(3062)</a></span>
        # We'll save the data in the <a> tag.
        elif (self.state == STATE_AFTER_ITIN_TEXT
              and tag == "span"
              and attrStarts(attrs, "class", "itinBusStop")):
            # NOTE(review): reset inferred, as above.
            self.data = ""
            self.state = STATE_IN_BUS_STOP

        elif self.state == STATE_IN_ITIN_TEXT and tag == "br":
            # NOTE(review): the body of this branch was not visible.  A
            # line break is presumably kept as a newline in the collected
            # text (the entry patterns tolerate newlines) -- confirm.
            self.data += "\n"

    def handle_endtag(self, tag):
        """Finish the element being collected and advance the state."""
        if self.state == STATE_IN_ITIN_TEXT and tag == "td":
            # NOTE(review): call inferred -- nothing else in view invokes
            # _saveTextEntry(), and this is where an entry's text is
            # complete.
            self._saveTextEntry()
            self.state = STATE_AFTER_ITIN_TEXT

        elif self.state == STATE_IN_BUS_STOP and tag == "span":
            if self._saveBusStop():
                self.state = STATE_NOTHING
            else:
                # Usually there are two itinBusStop spans in a row, and
                # this was probably the first one.
                self.state = STATE_AFTER_ITIN_TEXT

    def handle_data(self, data):
        """Accumulate character data while inside a collecting state."""
        if self.state in (STATE_IN_ITIN_TEXT, STATE_IN_BUS_STOP):
            # NOTE(review): body inferred -- self.data is what
            # _saveTextEntry() and _saveBusStop() later read.
            self.data += data

    def _saveTextEntry(self):
        """Build an ItinEntry from the collected text and record it."""
        ie = self._buildItinEntry(self.data.strip())
        self.entries.append(ie)
        # NOTE(review): inferred -- _saveBusStop() writes to self.lastIE,
        # described above as the most recent ItinEntry.
        self.lastIE = ie

    def _saveBusStop(self):
        """Attach a stop number from the collected text to the last entry.

        Returns True if a stop number was found and stored, False
        otherwise (handle_endtag() uses the result to pick the next state).
        """
        match = _bus_stop_rx.search(self.data)
        if match is None or self.lastIE is None:
            return False
        self.lastIE.busStop = match.group("stopnum")
        return True

    def _buildItinEntry(self, text):
        """Return an ItinEntry for text, classified by the entry patterns.

        The first pattern that matches sets the entry's type, and every
        named group of that match is copied onto the entry as an attribute
        of the same name.  If no pattern matches, the entry keeps the type
        ItinEntry's constructor gave it.
        """
        ie = ItinEntry(text)

        # Match each regexp in turn.
        matchTypes = [
            (TYPE_DEPART_TIME, _depart_rx),
            (TYPE_WAIT, _wait_rx),
            (TYPE_WALK_TO_STOP, _walk_to_stop_rx),
            (TYPE_WALK_TO_TRANSFER, _walk_transfer_stop_rx),
            (TYPE_TAKE_BUS, _take_bus_rx),
            (TYPE_WALK_TO_DEST, _walk_to_dest_rx),
        ]
        for entryType, rx in matchTypes:
            match = rx.search(text)
            if match is None:
                continue
            ie.type = entryType
            # Extract fields, if any.
            for field, value in match.groupdict().items():
                setattr(ie, field, value)
            # NOTE(review): stopping at the first match is inferred from
            # "in turn" -- confirm.
            break
        return ie
# ItineraryParser is an FSM.  The values are only ever compared with one
# another, so all that matters is that they are distinct.
# Idle state: the parser starts here and returns here once a stop number
# has been saved.
STATE_NOTHING = 0
# Inside a <td class="itinText...">.
STATE_IN_ITIN_TEXT = 1
# That <td> has closed; an itinBusStop span may follow.
STATE_AFTER_ITIN_TEXT = 2
# Inside a <span class="itinBusStop">.
STATE_IN_BUS_STOP = 3
def attrStarts(attrs, name, val):
    """Searches for attribute "name" in the attrs list.  If it finds it,
    returns true if its value starts with "val".  Case insensitive.

    attrs: list of (name, value) pairs, as handed to handle_starttag().
    name: attribute name to look for.
    val: prefix the attribute's value must start with.

    Returns False when the attribute is absent, has no value, or has a
    value that does not start with val."""
    # The attribute name is already lowercased by HTMLParser;
    # make the value lowercase, too.  The requested name and prefix are
    # lowercased as well, so mixed-case arguments such as "itinText" can
    # match.
    name = name.lower()
    val = val.lower()
    for key, value in attrs:
        # An attribute written without a value is reported as None.
        if key == name and value is not None \
                and value.lower().startswith(val):
            return True
    return False
151 _bus_stop_re
= '\((?P<stopnum>\d{4})\)'
152 _bus_stop_rx
= sre
.compile(_bus_stop_re
)
155 # Expressions we'll use to narrow an ItinEntry down to a specific type.
156 # We'll also pick out data fields where possible.
157 _ie_time_re
= '[\d: APM]+'
159 _depart_re
= '^Depart at (?P<startTime>' + _ie_time_re
+ ')$'
160 _depart_rx
= sre
.compile(_depart_re
)
162 _walk_to_stop_re
= ('^At (?P<startTime>' + _ie_time_re
163 + '),\s*walk to (?:stop|station)\s*(?P<destination>.*?)\s*'
164 '\((?P<duration>[\d]+)\s*min')
165 _walk_to_stop_rx
= sre
.compile(_walk_to_stop_re
)
167 # If the trip includes a transfer through a nearby stop, you'll get this.
168 _walk_transfer_stop_re
= ('(?i)^Walk to (?:stop|station)\s*'
169 '(?P<destination>.*?)\s*'
171 _walk_transfer_stop_rx
= sre
.compile(_walk_transfer_stop_re
)
173 # Flag 's' (DOTALL) is set on this one so we can skip over newlines in
174 # the description after "get off at stop <destination>". This description
175 # could be of the form "1 station(s) further" or
176 # "street PRESTON following street GLADSTONE.\nLast intersections: ... AVE."
177 _take_bus_re
= ('(?is)^At (?P<startTime>' + _ie_time_re
178 + '),\s*take (?:train|bus)\s+route\s+(?P<route>.*?)\s*direction'
179 '\s*(?P<direction>.*?)\s*and get off at (?:stop|station)\s*'
180 '(?P<destination>.*?)(?:\s*, .*)?\.\s*Arrive at'
181 '\s*(?P<endTime>' + _ie_time_re
+ ')\.$')
182 _take_bus_rx
= sre
.compile(_take_bus_re
)
184 _wait_re
= ('(?i)^Wait (?P<duration>\d+) min')
185 _wait_rx
= sre
.compile(_wait_re
)
187 _walk_to_dest_re
= ('(?i)^Walk to (?P<destination>.*)\.\s+Arrive at'
188 '\s*(?P<endTime>' + _ie_time_re
+ ')\s+'
189 '\((?P<duration>\d+) min')
190 _walk_to_dest_rx
= sre
.compile(_walk_to_dest_re
)
# NOTE(review): the class statement was not visible in the reviewed text;
# the name is taken from the surrounding comments, which refer to these
# objects as "ItinEntry" -- confirm against the upstream file.
class ItinEntry(object):
    """One step in the plan.

    Attributes:
    type: One of the TYPE_* constants; TYPE_UNKNOWN until classified.
    text: Original text from the planner.
    duration: Duration of the step in minutes.
    startTime: Start time as a string.
    endTime: End time as a string.
    route: Bus or train route.
    direction: Bus or train direction.
    destination: A string. Usually a bus stop at the end of a step.
    busStop: Bus stop number associated with the destination."""

    def __init__(self, text):
        """Create an unclassified entry holding the planner text."""
        self.type = TYPE_UNKNOWN
        self.text = text
        # Every documented field starts out as None so that reading it is
        # safe before (or without) a pattern filling it in.
        self.duration = None
        self.startTime = None
        self.endTime = None
        self.route = None
        self.direction = None
        self.destination = None
        self.busStop = None

    def __str__(self):
        """Return "<T<type> <text>>", with the stop number appended in
        parentheses when one is set."""
        if self.busStop:
            return "<T%d %s (%s)>" % (self.type, self.text, self.busStop)
        return "<T%d %s>" % (self.type, self.text)

    def __repr__(self):
        return self.__str__()
# Values for ItinEntry.type.  Only the constants visible in this part of
# the file are listed here; the other TYPE_* names used by the parser are
# expected to be defined alongside them.
# Entry matched by _walk_to_stop_rx.
TYPE_WALK_TO_STOP = 2
# Entry matched by _walk_to_dest_rx.
TYPE_WALK_TO_DEST = 5
# Entry matched by _walk_transfer_stop_rx.
TYPE_WALK_TO_TRANSFER = 6