Full location support with partial match smartitty
[ottawa-travel-planner.git] / Itinerary.py
blobf8847e97d289b17f3ab62528b594672b247a2fd7
2 # vi: set softtabstop=4 shiftwidth=4 tabstop=8 expandtab:
4 """mcPlan"""
import re
import sre
from HTMLParser import HTMLParser, HTMLParseError

import Planner
from PlannerExceptions import *
class Itinerary:
    """Parsed result of one trip-planner response page.

    The constructor keeps the request details it was given (start, end,
    time as ``wantTime``, and the raw html), pulls the itinerary table out
    of the page and runs it through an ItineraryParser.  The parser's
    resulting list is exposed as ``entries``.
    """

    def __init__(self, start, end, time, html):
        self.start = start
        self.end = end
        self.wantTime = time
        self.html = html

        # Only the itinerary table is handed to the sub-parser, not the
        # whole page.
        parser = ItineraryParser()
        parser.feed(self._extractItin(html))
        parser.close()
        self.entries = parser.entries

    def _extractItin(self, html):
        """Return the itinerary chunk of html (group "itin" of _itin_rx).

        Raises ItineraryParseException when the page does not match.
        """
        found = _itin_rx.search(html)
        if found:
            return found.group("itin")
        raise ItineraryParseException("Failed to extract itinerary")
34 # Matches the entire itinerary, pulling the table into group "itin"
35 _itin_re = ('<!-- begin graphical itin.*?>\s*'
36 # ?s: DOTALL: . matches \n
37 '(?s)(?P<itin><table.*</table>)\s*'
38 '<!-- end graphical itin')
39 _itin_rx = sre.compile(_itin_re)
class ItineraryParser(HTMLParser):
    """HTML sub-parser that turns itinerary markup into ItinEntry objects.

    The parser is a small state machine driven by the STATE_* constants.
    Text is accumulated while inside a <td class="itinText..."> cell and,
    directly after such a cell, while inside a
    <span class="itinBusStop..."> block.  Each finished text cell becomes
    one ItinEntry appended to ``entries``.
    """

    def __init__(self):
        HTMLParser.__init__(self)

        # List of ItinEntry, in the order the cells were closed.
        self.entries = []

        # The ItinEntry appended last; a bus stop number seen afterwards
        # is attached to it.
        self.lastIE = None

        self.state = STATE_NOTHING

    def _startCapture(self, newState):
        """Reset the text buffer and switch to newState."""
        self.data = ""
        self.state = newState

    def handle_starttag(self, tag, attrs):
        # The directions live in <td class="itinText...">; start saving
        # text no matter which state we were in.
        if tag == "td" and attrStarts(attrs, "class", "itinText"):
            self._startCapture(STATE_IN_ITIN_TEXT)
            return

        # The stop numbers come in a block such as
        #   <span class="itinBusStop">O-TRAIN CARLETON S.</span>
        #   <span class="itinBusStop"><a href="#" ...>(3062)</a></span>
        # so after a directions cell we capture the text of every
        # itinBusStop span, including the data inside its <a> tag.
        if self.state == STATE_AFTER_ITIN_TEXT:
            if tag == "span" and attrStarts(attrs, "class", "itinBusStop"):
                self._startCapture(STATE_IN_BUS_STOP)
            return

        # A <br> inside the directions is kept as a newline.
        if self.state == STATE_IN_ITIN_TEXT and tag == "br":
            self.data += "\n"

    def handle_endtag(self, tag):
        current = self.state

        if current == STATE_IN_ITIN_TEXT:
            if tag == "td":
                self.state = STATE_AFTER_ITIN_TEXT
                self._saveTextEntry()

        elif current == STATE_IN_BUS_STOP:
            if tag == "span":
                if self._saveBusStop():
                    self.state = STATE_NOTHING
                else:
                    # Usually there are two itinBusStop spans in a row,
                    # and this was probably the first one: keep looking.
                    self.state = STATE_AFTER_ITIN_TEXT

    def handle_data(self, data):
        capturing = (self.state == STATE_IN_ITIN_TEXT
                     or self.state == STATE_IN_BUS_STOP)
        if capturing:
            self.data += data

    def _saveTextEntry(self):
        """Build an ItinEntry from the buffered text and record it."""
        entry = self._buildItinEntry(self.data.strip())
        self.lastIE = entry
        self.entries.append(entry)

    def _saveBusStop(self):
        """Attach a stop number found in the buffer to the last entry.

        Returns True when the buffer contained a stop number, else False.
        """
        found = _bus_stop_rx.search(self.data)
        if not found:
            return False
        self.lastIE.busStop = found.group("stopnum")
        return True

    def _buildItinEntry(self, text):
        """Create an ItinEntry for text and classify it.

        The expressions are tried in a fixed order; the first one that
        matches decides the entry type, and its named groups are copied
        onto the entry as attributes.  If none matches, the entry keeps
        the type ItinEntry gave it.
        """
        ie = ItinEntry(text)

        # Order matters: the first matching expression wins.
        candidates = (
            (TYPE_DEPART_TIME, _depart_rx),
            (TYPE_WAIT, _wait_rx),
            (TYPE_WALK_TO_STOP, _walk_to_stop_rx),
            (TYPE_WALK_TO_TRANSFER, _walk_transfer_stop_rx),
            (TYPE_TAKE_BUS, _take_bus_rx),
            (TYPE_WALK_TO_DEST, _walk_to_dest_rx),
        )

        for entryType, rx in candidates:
            found = rx.search(text)
            if not found:
                continue

            ie.type = entryType

            # Extract fields, if any.
            for field, value in found.groupdict().items():
                setattr(ie, field, value)
            break

        return ie
# ItineraryParser is an FSM; these are its states, numbered 0 through 3
# in the order listed.
(STATE_NOTHING,
 STATE_IN_ITIN_TEXT,
 STATE_AFTER_ITIN_TEXT,
 STATE_IN_BUS_STOP) = range(4)
def attrStarts(attrs, name, val):
    """Tell whether attribute "name" in attrs has a value starting with "val".

    attrs is a list of (name, value) pairs as HTMLParser passes to
    handle_starttag.  The comparison is case insensitive.  Returns True as
    soon as one matching attribute is found, otherwise False.  Attributes
    written without a value (reported with a value of None) never match.
    """
    name = name.lower()
    val = val.lower()

    for attrName, attrValue in attrs:
        # The attribute name is already lowercased by HTMLParser.
        if attrName != name:
            continue
        # A valueless attribute has no text to compare against; skip it
        # instead of failing on None.lower().
        if attrValue is None:
            continue
        # Make the value lowercase, too, before comparing.
        if attrValue.lower().startswith(val):
            return True
    return False
151 _bus_stop_re = '\((?P<stopnum>\d{4})\)'
152 _bus_stop_rx = sre.compile(_bus_stop_re)
155 # Expressions we'll use to narrow an ItinEntry down to a specific type.
156 # We'll also pick out data fields where possible.
157 _ie_time_re = '[\d: APM]+'
159 _depart_re = '^Depart at (?P<startTime>' + _ie_time_re + ')$'
160 _depart_rx = sre.compile(_depart_re)
162 _walk_to_stop_re = ('^At (?P<startTime>' + _ie_time_re
163 + '),\s*walk to (?:stop|station)\s*(?P<destination>.*?)\s*'
164 '\((?P<duration>[\d]+)\s*min')
165 _walk_to_stop_rx = sre.compile(_walk_to_stop_re)
167 # If the trip includes a transfer through a nearby stop, you'll get this.
168 _walk_transfer_stop_re = ('(?i)^Walk to (?:stop|station)\s*'
169 '(?P<destination>.*?)\s*'
170 'for transfer\.$')
171 _walk_transfer_stop_rx = sre.compile(_walk_transfer_stop_re)
173 # Flag 's' (DOTALL) is set on this one so we can skip over newlines in
174 # the description after "get off at stop <destination>". This description
175 # could be of the form "1 station(s) further" or
176 # "street PRESTON following street GLADSTONE.\nLast intersections: ... AVE."
177 _take_bus_re = ('(?is)^At (?P<startTime>' + _ie_time_re
178 + '),\s*take (?:train|bus)\s+route\s+(?P<route>.*?)\s*direction'
179 '\s*(?P<direction>.*?)\s*and get off at (?:stop|station)\s*'
180 '(?P<destination>.*?)(?:\s*, .*)?\.\s*Arrive at'
181 '\s*(?P<endTime>' + _ie_time_re + ')\.$')
182 _take_bus_rx = sre.compile(_take_bus_re)
184 _wait_re = ('(?i)^Wait (?P<duration>\d+) min')
185 _wait_rx = sre.compile(_wait_re)
187 _walk_to_dest_re = ('(?i)^Walk to (?P<destination>.*)\.\s+Arrive at'
188 '\s*(?P<endTime>' + _ie_time_re + ')\s+'
189 '\((?P<duration>\d+) min')
190 _walk_to_dest_rx = sre.compile(_walk_to_dest_re)
class ItinEntry:
    """A single step of a planned trip.

    Attributes:

    text: Original text from the planner.
    type: One of the TYPE_* constants; TYPE_UNKNOWN until classified.
    duration: Duration of the step in minutes.
    startTime: Start time as a string.
    endTime: End time as a string.
    route: Bus or train route.
    direction: Bus or train direction.
    destination: A string. Usually a bus stop at the end of a step.
    busStop: Bus stop number associated with the destination.

    Everything except text and type starts out as None.
    """

    def __init__(self, text):
        # Optional fields are filled in later, by whoever classifies
        # the entry.
        self.duration = None
        self.startTime = None
        self.endTime = None
        self.route = None
        self.direction = None
        self.destination = None
        self.busStop = None

        self.type = TYPE_UNKNOWN
        self.text = text

    def __str__(self):
        # The stop number is shown only when one has been set.
        if not self.busStop:
            return "<T%d %s>" % (self.type, self.text)
        return "<T%d %s (%s)>" % (self.type, self.text, self.busStop)

    def __repr__(self):
        # Same rendering as str().
        return self.__str__()
# ItinEntry types, numbered 0 through 6 in the order listed.
(TYPE_UNKNOWN,
 TYPE_DEPART_TIME,
 TYPE_WALK_TO_STOP,
 TYPE_TAKE_BUS,
 TYPE_WAIT,
 TYPE_WALK_TO_DEST,
 TYPE_WALK_TO_TRANSFER) = range(7)