Fix :width: option for the table directives.
[docutils.git] / docutils / docutils / parsers / rst / tableparser.py
blobe1a1717e54f52ae92c1f0e9372f52ef274670462
1 # $Id$
2 # Author: David Goodger <goodger@python.org>
3 # Copyright: This module has been placed in the public domain.
5 """
6 This module defines table parser classes,which parse plaintext-graphic tables
7 and produce a well-formed data structure suitable for building a CALS table.
9 :Classes:
10 - `GridTableParser`: Parse fully-formed tables represented with a grid.
11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
12 borders.
14 :Exception class: `TableMarkupError`
16 :Function:
17 `update_dict_of_lists()`: Merge two dictionaries containing list values.
18 """
20 __docformat__ = 'reStructuredText'
23 import re
24 import sys
25 from docutils import DataError
26 from docutils.utils import strip_combining_chars
29 class TableMarkupError(DataError):
31 """
32 Raise if there is any problem with table markup.
34 The keyword argument `offset` denotes the offset of the problem
35 from the table's start line.
36 """
38 def __init__(self, *args, **kwargs):
39 self.offset = kwargs.pop('offset', 0)
40 DataError.__init__(self, *args)
43 class TableParser:
45 """
46 Abstract superclass for the common parts of the syntax-specific parsers.
47 """
49 head_body_separator_pat = None
50 """Matches the row separator between head rows and body rows."""
52 double_width_pad_char = '\x00'
53 """Padding character for East Asian double-width text."""
55 def parse(self, block):
56 """
57 Analyze the text `block` and return a table data structure.
59 Given a plaintext-graphic table in `block` (list of lines of text; no
60 whitespace padding), parse the table, construct and return the data
61 necessary to construct a CALS table or equivalent.
63 Raise `TableMarkupError` if there is any problem with the markup.
64 """
65 self.setup(block)
66 self.find_head_body_sep()
67 self.parse_table()
68 structure = self.structure_from_cells()
69 return structure
71 def find_head_body_sep(self):
72 """Look for a head/body row separator line; store the line index."""
73 for i in range(len(self.block)):
74 line = self.block[i]
75 if self.head_body_separator_pat.match(line):
76 if self.head_body_sep:
77 raise TableMarkupError(
78 'Multiple head/body row separators '
79 '(table lines %s and %s); only one allowed.'
80 % (self.head_body_sep+1, i+1), offset=i)
81 else:
82 self.head_body_sep = i
83 self.block[i] = line.replace('=', '-')
84 if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
85 - 1):
86 raise TableMarkupError('The head/body row separator may not be '
87 'the first or last line of the table.',
88 offset=i)
91 class GridTableParser(TableParser):
93 """
94 Parse a grid table using `parse()`.
96 Here's an example of a grid table::
98 +------------------------+------------+----------+----------+
99 | Header row, column 1 | Header 2 | Header 3 | Header 4 |
100 +========================+============+==========+==========+
101 | body row 1, column 1 | column 2 | column 3 | column 4 |
102 +------------------------+------------+----------+----------+
103 | body row 2 | Cells may span columns. |
104 +------------------------+------------+---------------------+
105 | body row 3 | Cells may | - Table cells |
106 +------------------------+ span rows. | - contain |
107 | body row 4 | | - body elements. |
108 +------------------------+------------+---------------------+
110 Intersections use '+', row separators use '-' (except for one optional
111 head/body row separator, which uses '='), and column separators use '|'.
113 Passing the above table to the `parse()` method will result in the
114 following data structure::
116 ([24, 12, 10, 10],
117 [[(0, 0, 1, ['Header row, column 1']),
118 (0, 0, 1, ['Header 2']),
119 (0, 0, 1, ['Header 3']),
120 (0, 0, 1, ['Header 4'])]],
121 [[(0, 0, 3, ['body row 1, column 1']),
122 (0, 0, 3, ['column 2']),
123 (0, 0, 3, ['column 3']),
124 (0, 0, 3, ['column 4'])],
125 [(0, 0, 5, ['body row 2']),
126 (0, 2, 5, ['Cells may span columns.']),
127 None,
128 None],
129 [(0, 0, 7, ['body row 3']),
130 (1, 0, 7, ['Cells may', 'span rows.', '']),
131 (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
132 None],
133 [(0, 0, 9, ['body row 4']), None, None, None]])
135 The first item is a list containing column widths (colspecs). The second
136 item is a list of head rows, and the third is a list of body rows. Each
137 row contains a list of cells. Each cell is either None (for a cell unused
138 because of another cell's span), or a tuple. A cell tuple contains four
139 items: the number of extra rows used by the cell in a vertical span
140 (morerows); the number of extra columns used by the cell in a horizontal
141 span (morecols); the line offset of the first line of the cell contents;
142 and the cell contents, a list of lines of text.
145 head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')
147 def setup(self, block):
148 self.block = block[:] # make a copy; it may be modified
149 self.block.disconnect() # don't propagate changes to parent
150 self.bottom = len(block) - 1
151 self.right = len(block[0]) - 1
152 self.head_body_sep = None
153 self.done = [-1] * len(block[0])
154 self.cells = []
155 self.rowseps = {0: [0]}
156 self.colseps = {0: [0]}
158 def parse_table(self):
160 Start with a queue of upper-left corners, containing the upper-left
161 corner of the table itself. Trace out one rectangular cell, remember
162 it, and add its upper-right and lower-left corners to the queue of
163 potential upper-left corners of further cells. Process the queue in
164 top-to-bottom order, keeping track of how much of each text column has
165 been seen.
167 We'll end up knowing all the row and column boundaries, cell positions
168 and their dimensions.
170 corners = [(0, 0)]
171 while corners:
172 top, left = corners.pop(0)
173 if top == self.bottom or left == self.right \
174 or top <= self.done[left]:
175 continue
176 result = self.scan_cell(top, left)
177 if not result:
178 continue
179 bottom, right, rowseps, colseps = result
180 update_dict_of_lists(self.rowseps, rowseps)
181 update_dict_of_lists(self.colseps, colseps)
182 self.mark_done(top, left, bottom, right)
183 cellblock = self.block.get_2D_block(top + 1, left + 1,
184 bottom, right)
185 cellblock.disconnect() # lines in cell can't sync with parent
186 cellblock.replace(self.double_width_pad_char, '')
187 self.cells.append((top, left, bottom, right, cellblock))
188 corners.extend([(top, right), (bottom, left)])
189 corners.sort()
190 if not self.check_parse_complete():
191 raise TableMarkupError('Malformed table; parse incomplete.')
193 def mark_done(self, top, left, bottom, right):
194 """For keeping track of how much of each text column has been seen."""
195 before = top - 1
196 after = bottom - 1
197 for col in range(left, right):
198 assert self.done[col] == before
199 self.done[col] = after
201 def check_parse_complete(self):
202 """Each text column should have been completely seen."""
203 last = self.bottom - 1
204 for col in range(self.right):
205 if self.done[col] != last:
206 return False
207 return True
209 def scan_cell(self, top, left):
210 """Starting at the top-left corner, start tracing out a cell."""
211 assert self.block[top][left] == '+'
212 result = self.scan_right(top, left)
213 return result
215 def scan_right(self, top, left):
217 Look for the top-right corner of the cell, and make note of all column
218 boundaries ('+').
220 colseps = {}
221 line = self.block[top]
222 for i in range(left + 1, self.right + 1):
223 if line[i] == '+':
224 colseps[i] = [top]
225 result = self.scan_down(top, left, i)
226 if result:
227 bottom, rowseps, newcolseps = result
228 update_dict_of_lists(colseps, newcolseps)
229 return bottom, i, rowseps, colseps
230 elif line[i] != '-':
231 return None
232 return None
234 def scan_down(self, top, left, right):
236 Look for the bottom-right corner of the cell, making note of all row
237 boundaries.
239 rowseps = {}
240 for i in range(top + 1, self.bottom + 1):
241 if self.block[i][right] == '+':
242 rowseps[i] = [right]
243 result = self.scan_left(top, left, i, right)
244 if result:
245 newrowseps, colseps = result
246 update_dict_of_lists(rowseps, newrowseps)
247 return i, rowseps, colseps
248 elif self.block[i][right] != '|':
249 return None
250 return None
252 def scan_left(self, top, left, bottom, right):
254 Noting column boundaries, look for the bottom-left corner of the cell.
255 It must line up with the starting point.
257 colseps = {}
258 line = self.block[bottom]
259 for i in range(right - 1, left, -1):
260 if line[i] == '+':
261 colseps[i] = [bottom]
262 elif line[i] != '-':
263 return None
264 if line[left] != '+':
265 return None
266 result = self.scan_up(top, left, bottom, right)
267 if result is not None:
268 rowseps = result
269 return rowseps, colseps
270 return None
272 def scan_up(self, top, left, bottom, right):
274 Noting row boundaries, see if we can return to the starting point.
276 rowseps = {}
277 for i in range(bottom - 1, top, -1):
278 if self.block[i][left] == '+':
279 rowseps[i] = [left]
280 elif self.block[i][left] != '|':
281 return None
282 return rowseps
284 def structure_from_cells(self):
286 From the data collected by `scan_cell()`, convert to the final data
287 structure.
289 rowseps = self.rowseps.keys() # list of row boundaries
290 rowseps.sort()
291 rowindex = {}
292 for i in range(len(rowseps)):
293 rowindex[rowseps[i]] = i # row boundary -> row number mapping
294 colseps = self.colseps.keys() # list of column boundaries
295 colseps.sort()
296 colindex = {}
297 for i in range(len(colseps)):
298 colindex[colseps[i]] = i # column boundary -> col number map
299 colspecs = [(colseps[i] - colseps[i - 1] - 1)
300 for i in range(1, len(colseps))] # list of column widths
301 # prepare an empty table with the correct number of rows & columns
302 onerow = [None for i in range(len(colseps) - 1)]
303 rows = [onerow[:] for i in range(len(rowseps) - 1)]
304 # keep track of # of cells remaining; should reduce to zero
305 remaining = (len(rowseps) - 1) * (len(colseps) - 1)
306 for top, left, bottom, right, block in self.cells:
307 rownum = rowindex[top]
308 colnum = colindex[left]
309 assert rows[rownum][colnum] is None, (
310 'Cell (row %s, column %s) already used.'
311 % (rownum + 1, colnum + 1))
312 morerows = rowindex[bottom] - rownum - 1
313 morecols = colindex[right] - colnum - 1
314 remaining -= (morerows + 1) * (morecols + 1)
315 # write the cell into the table
316 rows[rownum][colnum] = (morerows, morecols, top + 1, block)
317 assert remaining == 0, 'Unused cells remaining.'
318 if self.head_body_sep: # separate head rows from body rows
319 numheadrows = rowindex[self.head_body_sep]
320 headrows = rows[:numheadrows]
321 bodyrows = rows[numheadrows:]
322 else:
323 headrows = []
324 bodyrows = rows
325 return (colspecs, headrows, bodyrows)
328 class SimpleTableParser(TableParser):
331 Parse a simple table using `parse()`.
333 Here's an example of a simple table::
335 ===== =====
336 col 1 col 2
337 ===== =====
338 1 Second column of row 1.
339 2 Second column of row 2.
340 Second line of paragraph.
341 3 - Second column of row 3.
343 - Second item in bullet
344 list (row 3, column 2).
345 4 is a span
346 ------------
348 ===== =====
350 Top and bottom borders use '=', column span underlines use '-', column
351 separation is indicated with spaces.
353 Passing the above table to the `parse()` method will result in the
354 following data structure, whose interpretation is the same as for
355 `GridTableParser`::
357 ([5, 25],
358 [[(0, 0, 1, ['col 1']),
359 (0, 0, 1, ['col 2'])]],
360 [[(0, 0, 3, ['1']),
361 (0, 0, 3, ['Second column of row 1.'])],
362 [(0, 0, 4, ['2']),
363 (0, 0, 4, ['Second column of row 2.',
364 'Second line of paragraph.'])],
365 [(0, 0, 6, ['3']),
366 (0, 0, 6, ['- Second column of row 3.',
368 '- Second item in bullet',
369 ' list (row 3, column 2).'])],
370 [(0, 1, 10, ['4 is a span'])],
371 [(0, 0, 12, ['5']),
372 (0, 0, 12, [''])]])
375 head_body_separator_pat = re.compile('=[ =]*$')
376 span_pat = re.compile('-[ -]*$')
378 def setup(self, block):
379 self.block = block[:] # make a copy; it will be modified
380 self.block.disconnect() # don't propagate changes to parent
381 # Convert top & bottom borders to column span underlines:
382 self.block[0] = self.block[0].replace('=', '-')
383 self.block[-1] = self.block[-1].replace('=', '-')
384 self.head_body_sep = None
385 self.columns = []
386 self.border_end = None
387 self.table = []
388 self.done = [-1] * len(block[0])
389 self.rowseps = {0: [0]}
390 self.colseps = {0: [0]}
392 def parse_table(self):
394 First determine the column boundaries from the top border, then
395 process rows. Each row may consist of multiple lines; accumulate
396 lines until a row is complete. Call `self.parse_row` to finish the
397 job.
399 # Top border must fully describe all table columns.
400 self.columns = self.parse_columns(self.block[0], 0)
401 self.border_end = self.columns[-1][1]
402 firststart, firstend = self.columns[0]
403 offset = 1 # skip top border
404 start = 1
405 text_found = None
406 while offset < len(self.block):
407 line = self.block[offset]
408 if self.span_pat.match(line):
409 # Column span underline or border; row is complete.
410 self.parse_row(self.block[start:offset], start,
411 (line.rstrip(), offset))
412 start = offset + 1
413 text_found = None
414 elif line[firststart:firstend].strip():
415 # First column not blank, therefore it's a new row.
416 if text_found and offset != start:
417 self.parse_row(self.block[start:offset], start)
418 start = offset
419 text_found = 1
420 elif not text_found:
421 start = offset + 1
422 offset += 1
424 def parse_columns(self, line, offset):
426 Given a column span underline, return a list of (begin, end) pairs.
428 cols = []
429 end = 0
430 while True:
431 begin = line.find('-', end)
432 end = line.find(' ', begin)
433 if begin < 0:
434 break
435 if end < 0:
436 end = len(line)
437 cols.append((begin, end))
438 if self.columns:
439 if cols[-1][1] != self.border_end:
440 raise TableMarkupError('Column span incomplete in table '
441 'line %s.' % (offset+1),
442 offset=offset)
443 # Allow for an unbounded rightmost column:
444 cols[-1] = (cols[-1][0], self.columns[-1][1])
445 return cols
447 def init_row(self, colspec, offset):
448 i = 0
449 cells = []
450 for start, end in colspec:
451 morecols = 0
452 try:
453 assert start == self.columns[i][0]
454 while end != self.columns[i][1]:
455 i += 1
456 morecols += 1
457 except (AssertionError, IndexError):
458 raise TableMarkupError('Column span alignment problem '
459 'in table line %s.' % (offset+2),
460 offset=offset+1)
461 cells.append([0, morecols, offset, []])
462 i += 1
463 return cells
465 def parse_row(self, lines, start, spanline=None):
467 Given the text `lines` of a row, parse it and append to `self.table`.
469 The row is parsed according to the current column spec (either
470 `spanline` if provided or `self.columns`). For each column, extract
471 text from each line, and check for text in column margins. Finally,
472 adjust for insignificant whitespace.
474 if not (lines or spanline):
475 # No new row, just blank lines.
476 return
477 if spanline:
478 columns = self.parse_columns(*spanline)
479 span_offset = spanline[1]
480 else:
481 columns = self.columns[:]
482 span_offset = start
483 self.check_columns(lines, start, columns)
484 row = self.init_row(columns, start)
485 for i in range(len(columns)):
486 start, end = columns[i]
487 cellblock = lines.get_2D_block(0, start, len(lines), end)
488 cellblock.disconnect() # lines in cell can't sync with parent
489 cellblock.replace(self.double_width_pad_char, '')
490 row[i][3] = cellblock
491 self.table.append(row)
493 def check_columns(self, lines, first_line, columns):
495 Check for text in column margins and text overflow in the last column.
496 Raise TableMarkupError if anything but whitespace is in column margins.
497 Adjust the end value for the last column if there is text overflow.
499 # "Infinite" value for a dummy last column's beginning, used to
500 # check for text overflow:
501 columns.append((sys.maxint, None))
502 lastcol = len(columns) - 2
503 # combining characters do not contribute to the column width
504 lines = [strip_combining_chars(line) for line in lines]
506 for i in range(len(columns) - 1):
507 start, end = columns[i]
508 nextstart = columns[i+1][0]
509 offset = 0
510 for line in lines:
511 if i == lastcol and line[end:].strip():
512 text = line[start:].rstrip()
513 new_end = start + len(text)
514 main_start, main_end = self.columns[-1]
515 columns[i] = (start, max(main_end, new_end))
516 if new_end > main_end:
517 self.columns[-1] = (main_start, new_end)
518 elif line[end:nextstart].strip():
519 raise TableMarkupError('Text in column margin '
520 'in table line %s.' % (first_line+offset+1),
521 offset=first_line+offset)
522 offset += 1
523 columns.pop()
525 def structure_from_cells(self):
526 colspecs = [end - start for start, end in self.columns]
527 first_body_row = 0
528 if self.head_body_sep:
529 for i in range(len(self.table)):
530 if self.table[i][0][2] > self.head_body_sep:
531 first_body_row = i
532 break
533 return (colspecs, self.table[:first_body_row],
534 self.table[first_body_row:])
537 def update_dict_of_lists(master, newdata):
539 Extend the list values of `master` with those from `newdata`.
541 Both parameters must be dictionaries containing list values.
543 for key, values in newdata.items():
544 master.setdefault(key, []).extend(values)