Script to grab stop codes from allstops.xml and sort
[ottawa-travel-planner.git] / Itinerary.py
blob d52d8580656e095b3b9fd4cb83be23c684ee3418
2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
4 """mcPlan"""
6 import re
7 from HTMLParser import HTMLParser, HTMLParseError
9 import Planner
10 from PlannerExceptions import *
class Itinerary:
    """A trip request together with the steps parsed from the planner's HTML.

    The constructor stores its arguments as start, end, wantTime and html,
    then parses the itinerary table found in the HTML into self.entries
    (the entry list built by ItineraryParser).
    """

    def __init__(self, start, end, time, html):
        self.start, self.end = start, end
        self.wantTime = time
        self.html = html

        parser = ItineraryParser()

        # Only the itinerary table itself is handed to the sub-parser,
        # not the whole page.
        itinChunk = self._extractItin(html)
        parser.feed(itinChunk)
        parser.close()
        self.entries = parser.entries

    def anyUnparsed(self):
        """Return True if at least one entry is still of type TYPE_UNKNOWN."""
        return TYPE_UNKNOWN in [entry.type for entry in self.entries]

    def _extractItin(self, html):
        """Return the "itin" group matched by _itin_rx in html.

        Raises ItineraryParseException when the pattern is not found.
        """
        found = _itin_rx.search(html)
        if found:
            return found.group("itin")
        raise ItineraryParseException("Failed to extract itinerary")
# Matches the entire itinerary, pulling the table into group "itin".
#
# (?s) is DOTALL: "." also matches "\n". A global inline flag applies to the
# whole pattern wherever it is written, so it is placed at the very start;
# newer versions of the re module reject global flags that appear anywhere
# else. Raw strings keep the regex escape \s out of Python's own string
# escape processing.
_itin_re = (r'(?s)<!-- begin graphical itin.*?>\s*'
            r'(?P<itin><table.*</table>)\s*'
            r'<!-- end graphical itin')
_itin_rx = re.compile(_itin_re)
class ItineraryParser(HTMLParser):
    """HTML parser that turns the itinerary table into a list of ItinEntry.

    The parser is a small state machine driven by self.state:

    STATE_NOTHING          not collecting anything.
    STATE_IN_ITIN_TEXT     inside a <td class="itinText...">; text is being
                           accumulated in self.data.
    STATE_AFTER_ITIN_TEXT  a text entry was just saved; an itinBusStop span
                           may follow and supply its stop number.
    STATE_IN_BUS_STOP      inside an itinBusStop span; text is being
                           accumulated in self.data.

    After feeding, self.entries holds the ItinEntry objects in document
    order.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.state = STATE_NOTHING

        # Text collected for the element currently being captured. It is
        # reset whenever a capture starts; initialised here so the attribute
        # always exists.
        self.data = ""

        # List of ItinEntry
        self.entries = []

        # Most recent ItinEntry, to which a bus stop number may be added
        self.lastIE = None

    def handle_starttag(self, tag, attrs):
        """Start capturing on itinerary text cells and bus stop spans."""
        # The directions are contained in <td class="itinText...">.
        if tag == "td" and attrStarts(attrs, "class", "itinText"):

            # start saving text
            self.data = ""
            self.state = STATE_IN_ITIN_TEXT

        # The stop numbers are inside a block like this:
        # <span class="itinBusStop">O-TRAIN CARLETON S.</span>
        # <span class="itinBusStop"><a href="#" onClick="MM_openBrWindow('get.stop.timetable.oci?sptDate=2005-06-05&stop560=3062&stopLabel=CG995&stopName=O-TRAIN%2520CARLETON%2520S.','_stopWindow','scrollbars=yes,resizable=yes,width=800,height=600')">(3062)</a></span>

        # We'll save the data in the <a> tag.
        elif self.state == STATE_AFTER_ITIN_TEXT \
                and tag == "span" and attrStarts(attrs, "class", "itinBusStop"):

            self.data = ""
            self.state = STATE_IN_BUS_STOP

        # A <br> inside the directions becomes a newline in the saved text.
        elif self.state == STATE_IN_ITIN_TEXT and tag == "br":
            self.data += "\n"

    def handle_endtag(self, tag):
        """Finish the current capture when its closing tag arrives."""
        if self.state == STATE_IN_ITIN_TEXT and tag == "td":

            self.state = STATE_AFTER_ITIN_TEXT
            self._saveTextEntry()

        elif self.state == STATE_IN_BUS_STOP and tag == "span":
            if self._saveBusStop():
                self.state = STATE_NOTHING
            else:
                # Usually there are two itinBusStop spans in a row, and
                # this was probably the first one.
                self.state = STATE_AFTER_ITIN_TEXT

    def handle_data(self, data):
        """Accumulate text while inside a captured element."""
        if self.state in (STATE_IN_ITIN_TEXT, STATE_IN_BUS_STOP):
            self.data += data

    def _saveTextEntry(self):
        """Build an ItinEntry from the captured text and append it."""
        ie = self._buildItinEntry(self.data.strip())
        self.entries.append(ie)
        self.lastIE = ie

    def _saveBusStop(self):
        """Attach a stop number found in the captured text to the last entry.

        Returns True if the captured text contained a stop number, False
        otherwise.
        """
        match = _bus_stop_rx.search(self.data)
        if match:
            self.lastIE.busStop = match.group("stopnum")
        return bool(match)

    def _buildItinEntry(self, text):
        """Create an ItinEntry for text and classify it.

        The first pattern that matches decides the entry type; its named
        groups are copied onto the entry as attributes. If nothing matches,
        the entry keeps the type assigned by ItinEntry itself.
        """
        ie = ItinEntry(text)

        # Match each regexp in turn.
        matchTypes = (
            (TYPE_DEPART_TIME, _depart_rx),
            (TYPE_WAIT, _wait_rx),
            (TYPE_WALK_TO_STOP, _walk_to_stop_rx),
            (TYPE_WALK_TO_TRANSFER, _walk_transfer_stop_rx),
            (TYPE_TAKE_BUS, _take_bus_rx),
            (TYPE_WALK_TO_DEST, _walk_to_dest_rx),
            )

        for entryType, rx in matchTypes:
            match = rx.search(text)
            if match:
                ie.type = entryType

                # Extract fields, if any. items() is used rather than
                # iteritems() because it exists on every Python version.
                for field, value in match.groupdict().items():
                    setattr(ie, field, value)

                break
        return ie
# States of the ItineraryParser finite-state machine, numbered from 0 in
# the order listed.
(STATE_NOTHING,
 STATE_IN_ITIN_TEXT,
 STATE_AFTER_ITIN_TEXT,
 STATE_IN_BUS_STOP) = range(4)
def attrStarts(attrs, name, val):
    """Searches for attribute "name" in the attrs list. If it finds it,
    returns true if its value starts with "val". Case insensitive.

    attrs is a list of (name, value) pairs as passed to
    HTMLParser.handle_starttag. An attribute written without a value
    (e.g. <td class>) has the value None; such an attribute never
    matches."""

    name = name.lower()
    val = val.lower()

    for attrName, attrValue in attrs:
        # The attribute name is already lowercased by HTMLParser.
        if attrName != name:
            continue
        # A valueless attribute has no text to compare against; skip it
        # instead of calling .lower() on None.
        if attrValue is None:
            continue
        # Make the value lowercase, too.
        if attrValue.lower().startswith(val):
            return True
    return False
# A four-digit stop number in parentheses, e.g. "(3062)"; the digits are
# captured in group "stopnum". Raw string so the regex escapes \( and \d are
# not treated as (invalid) Python string escapes.
_bus_stop_re = r'\((?P<stopnum>\d{4})\)'
_bus_stop_rx = re.compile(_bus_stop_re)
# Expressions we'll use to narrow an ItinEntry down to a specific type.
# We'll also pick out data fields where possible.
#
# All patterns are raw strings so that regex escapes such as \s, \d, \( and
# \. reach the re module untouched instead of being read as (invalid) Python
# string escapes. Inline flags such as (?i) stay at the very start of each
# pattern.
_ie_time_re = r'[\d: APM]+'

_depart_re = r'^Depart at (?P<startTime>' + _ie_time_re + r')$'
_depart_rx = re.compile(_depart_re)

_walk_to_stop_re = (r'^At (?P<startTime>' + _ie_time_re
        + r'),\s*walk to (?:stop|station)\s*(?P<destination>.*?)\s*'
        r'\((?P<duration>[\d]+)\s*min')
_walk_to_stop_rx = re.compile(_walk_to_stop_re)

# If the trip includes a transfer through a nearby stop, you'll get this.
_walk_transfer_stop_re = (r'(?i)^Walk to (?:stop|station)\s*'
        r'(?P<destination>.*?)\s*'
        r'for transfer\.$')
_walk_transfer_stop_rx = re.compile(_walk_transfer_stop_re)

# Flag 's' (DOTALL) is set on this one so we can skip over newlines in
# the description after "get off at stop <destination>". This description
# could be of the form "1 station(s) further" or
# "street PRESTON following street GLADSTONE.\nLast intersections: ... AVE."
_take_bus_re = (r'(?is)^At (?P<startTime>' + _ie_time_re
        + r'),\s*take (?:train|bus)\s+route\s+(?P<route>.*?)\s*direction'
        r'\s*(?P<direction>.*?)\s*and get off at (?:stop|station)\s*'
        r'(?P<destination>.*?)(?:\s*, .*)?\.\s*Arrive at'
        r'\s*(?P<endTime>' + _ie_time_re + r')\.')
_take_bus_rx = re.compile(_take_bus_re)

_wait_re = (r'(?i)^Wait (?P<duration>\d+) min')
_wait_rx = re.compile(_wait_re)

# If Chris asks to go from 916 to 918 meadowlands, it returns one step that
# begins with "At 1:45 PM, ...". Otherwise, the "At ... " clause isn't there.
_walk_to_dest_re = (r'(?i)^(?:At (?P<startTime>' + _ie_time_re + r'),\s*)?'
        r'Walk to (?P<destination>.*)\.\s+Arrive at'
        r'\s*(?P<endTime>' + _ie_time_re + r')\s+'
        r'\((?P<duration>\d+) min')
_walk_to_dest_rx = re.compile(_walk_to_dest_re)
class ItinEntry:
    """A single step of an itinerary.

    Attributes:

    text:        The step's text exactly as given to the constructor.
    type:        One of the TYPE_* constants; TYPE_UNKNOWN until classified.
    duration:    Duration of the step in minutes.
    startTime:   Start time as a string.
    endTime:     End time as a string.
    route:       Bus or train route.
    direction:   Bus or train direction.
    destination: A string. Usually a bus stop at the end of a step.
    busStop:     Bus stop number associated with the destination.

    Everything except text and type starts out as None.
    """

    def __init__(self, text):
        self.text = text
        self.type = TYPE_UNKNOWN

        # Optional details, filled in later when they can be determined.
        self.startTime = None
        self.endTime = None
        self.duration = None
        self.route = None
        self.direction = None
        self.destination = None
        self.busStop = None

    def __str__(self):
        # The stop number is shown only when one is set.
        if not self.busStop:
            return "<T%d %s>" % (self.type, self.text)
        return "<T%d %s (%s)>" % (self.type, self.text, self.busStop)

    def __repr__(self):
        # Same rendering as __str__.
        return self.__str__()
# Kinds of ItinEntry, numbered from 0 in the order listed.
(TYPE_UNKNOWN,
 TYPE_DEPART_TIME,
 TYPE_WALK_TO_STOP,
 TYPE_TAKE_BUS,
 TYPE_WAIT,
 TYPE_WALK_TO_DEST,
 TYPE_WALK_TO_TRANSFER) = range(7)