docutils/docutils/parsers/rst/tableparser.py

   1 # $Id$
   2 # Author: David Goodger <goodger@python.org>
   3 # Copyright: This module has been placed in the public domain.
   4
   5 """
   6 This module defines table parser classes,which parse plaintext-graphic tables
   7 and produce a well-formed data structure suitable for building a CALS table.
   8
   9 :Classes:
  10     - `GridTableParser`: Parse fully-formed tables represented with a grid.
  11     - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
  12       borders.
  13
  14 :Exception class: `TableMarkupError`
  15
  16 :Function:
  17     `update_dict_of_lists()`: Merge two dictionaries containing list values.
  18 """
  19
  20 __docformat__ = 'reStructuredText'
  21
  22
  23 import re
  24 import sys
  25 from docutils import DataError
  26 from docutils.utils import strip_combining_chars
  27
  28
  29 class TableMarkupError(DataError):
  30
  31     """
  32     Raise if there is any problem with table markup.
  33
  34     The keyword argument `offset` denotes the offset of the problem
  35     from the table's start line.
  36     """
  37
  38     def __init__(self, *args, **kwargs):
  39             self.offset = kwargs.pop('offset', 0)
  40             DataError.__init__(self, *args)
  41
  42
  43 class TableParser:
  44
  45     """
  46     Abstract superclass for the common parts of the syntax-specific parsers.
  47     """
  48
  49     head_body_separator_pat = None
  50     """Matches the row separator between head rows and body rows."""
  51
  52     double_width_pad_char = '\x00'
  53     """Padding character for East Asian double-width text."""
  54
  55     def parse(self, block):
  56         """
  57         Analyze the text `block` and return a table data structure.
  58
  59         Given a plaintext-graphic table in `block` (list of lines of text; no
  60         whitespace padding), parse the table, construct and return the data
  61         necessary to construct a CALS table or equivalent.
  62
  63         Raise `TableMarkupError` if there is any problem with the markup.
  64         """
  65         self.setup(block)
  66         self.find_head_body_sep()
  67         self.parse_table()
  68         structure = self.structure_from_cells()
  69         return structure
  70
  71     def find_head_body_sep(self):
  72         """Look for a head/body row separator line; store the line index."""
  73         for i in range(len(self.block)):
  74             line = self.block[i]
  75             if self.head_body_separator_pat.match(line):
  76                 if self.head_body_sep:
  77                     raise TableMarkupError(
  78                         'Multiple head/body row separators '
  79                         '(table lines %s and %s); only one allowed.'
  80                         % (self.head_body_sep+1, i+1), offset=i)
  81                 else:
  82                     self.head_body_sep = i
  83                     self.block[i] = line.replace('=', '-')
  84         if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
  85                                                              - 1):
  86             raise TableMarkupError('The head/body row separator may not be '
  87                                    'the first or last line of the table.',
  88                                    offset=i)
  89
  90
  91 class GridTableParser(TableParser):
  92
  93     """
  94     Parse a grid table using `parse()`.
  95
  96     Here's an example of a grid table::
  97
  98         +------------------------+------------+----------+----------+
  99         | Header row, column 1   | Header 2   | Header 3 | Header 4 |
 100         +========================+============+==========+==========+
 101         | body row 1, column 1   | column 2   | column 3 | column 4 |
 102         +------------------------+------------+----------+----------+
 103         | body row 2             | Cells may span columns.          |
 104         +------------------------+------------+---------------------+
 105         | body row 3             | Cells may  | - Table cells       |
 106         +------------------------+ span rows. | - contain           |
 107         | body row 4             |            | - body elements.    |
 108         +------------------------+------------+---------------------+
 109
 110     Intersections use '+', row separators use '-' (except for one optional
 111     head/body row separator, which uses '='), and column separators use '|'.
 112
 113     Passing the above table to the `parse()` method will result in the
 114     following data structure::
 115
 116         ([24, 12, 10, 10],
 117          [[(0, 0, 1, ['Header row, column 1']),
 118            (0, 0, 1, ['Header 2']),
 119            (0, 0, 1, ['Header 3']),
 120            (0, 0, 1, ['Header 4'])]],
 121          [[(0, 0, 3, ['body row 1, column 1']),
 122            (0, 0, 3, ['column 2']),
 123            (0, 0, 3, ['column 3']),
 124            (0, 0, 3, ['column 4'])],
 125           [(0, 0, 5, ['body row 2']),
 126            (0, 2, 5, ['Cells may span columns.']),
 127            None,
 128            None],
 129           [(0, 0, 7, ['body row 3']),
 130            (1, 0, 7, ['Cells may', 'span rows.', '']),
 131            (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
 132            None],
 133           [(0, 0, 9, ['body row 4']), None, None, None]])
 134
 135     The first item is a list containing column widths (colspecs). The second
 136     item is a list of head rows, and the third is a list of body rows. Each
 137     row contains a list of cells. Each cell is either None (for a cell unused
 138     because of another cell's span), or a tuple. A cell tuple contains four
 139     items: the number of extra rows used by the cell in a vertical span
 140     (morerows); the number of extra columns used by the cell in a horizontal
 141     span (morecols); the line offset of the first line of the cell contents;
 142     and the cell contents, a list of lines of text.
 143     """
 144
 145     head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')
 146
 147     def setup(self, block):
 148         self.block = block[:]           # make a copy; it may be modified
 149         self.block.disconnect()         # don't propagate changes to parent
 150         self.bottom = len(block) - 1
 151         self.right = len(block[0]) - 1
 152         self.head_body_sep = None
 153         self.done = [-1] * len(block[0])
 154         self.cells = []
 155         self.rowseps = {0: [0]}
 156         self.colseps = {0: [0]}
 157
 158     def parse_table(self):
 159         """
 160         Start with a queue of upper-left corners, containing the upper-left
 161         corner of the table itself. Trace out one rectangular cell, remember
 162         it, and add its upper-right and lower-left corners to the queue of
 163         potential upper-left corners of further cells. Process the queue in
 164         top-to-bottom order, keeping track of how much of each text column has
 165         been seen.
 166
 167         We'll end up knowing all the row and column boundaries, cell positions
 168         and their dimensions.
 169         """
 170         corners = [(0, 0)]
 171         while corners:
 172             top, left = corners.pop(0)
 173             if top == self.bottom or left == self.right \
 174                   or top <= self.done[left]:
 175                 continue
 176             result = self.scan_cell(top, left)
 177             if not result:
 178                 continue
 179             bottom, right, rowseps, colseps = result
 180             update_dict_of_lists(self.rowseps, rowseps)
 181             update_dict_of_lists(self.colseps, colseps)
 182             self.mark_done(top, left, bottom, right)
 183             cellblock = self.block.get_2D_block(top + 1, left + 1,
 184                                                 bottom, right)
 185             cellblock.disconnect()      # lines in cell can't sync with parent
 186             cellblock.replace(self.double_width_pad_char, '')
 187             self.cells.append((top, left, bottom, right, cellblock))
 188             corners.extend([(top, right), (bottom, left)])
 189             corners.sort()
 190         if not self.check_parse_complete():
 191             raise TableMarkupError('Malformed table; parse incomplete.')
 192
 193     def mark_done(self, top, left, bottom, right):
 194         """For keeping track of how much of each text column has been seen."""
 195         before = top - 1
 196         after = bottom - 1
 197         for col in range(left, right):
 198             assert self.done[col] == before
 199             self.done[col] = after
 200
 201     def check_parse_complete(self):
 202         """Each text column should have been completely seen."""
 203         last = self.bottom - 1
 204         for col in range(self.right):
 205             if self.done[col] != last:
 206                 return False
 207         return True
 208
 209     def scan_cell(self, top, left):
 210         """Starting at the top-left corner, start tracing out a cell."""
 211         assert self.block[top][left] == '+'
 212         result = self.scan_right(top, left)
 213         return result
 214
 215     def scan_right(self, top, left):
 216         """
 217         Look for the top-right corner of the cell, and make note of all column
 218         boundaries ('+').
 219         """
 220         colseps = {}
 221         line = self.block[top]
 222         for i in range(left + 1, self.right + 1):
 223             if line[i] == '+':
 224                 colseps[i] = [top]
 225                 result = self.scan_down(top, left, i)
 226                 if result:
 227                     bottom, rowseps, newcolseps = result
 228                     update_dict_of_lists(colseps, newcolseps)
 229                     return bottom, i, rowseps, colseps
 230             elif line[i] != '-':
 231                 return None
 232         return None
 233
 234     def scan_down(self, top, left, right):
 235         """
 236         Look for the bottom-right corner of the cell, making note of all row
 237         boundaries.
 238         """
 239         rowseps = {}
 240         for i in range(top + 1, self.bottom + 1):
 241             if self.block[i][right] == '+':
 242                 rowseps[i] = [right]
 243                 result = self.scan_left(top, left, i, right)
 244                 if result:
 245                     newrowseps, colseps = result
 246                     update_dict_of_lists(rowseps, newrowseps)
 247                     return i, rowseps, colseps
 248             elif self.block[i][right] != '|':
 249                 return None
 250         return None
 251
 252     def scan_left(self, top, left, bottom, right):
 253         """
 254         Noting column boundaries, look for the bottom-left corner of the cell.
 255         It must line up with the starting point.
 256         """
 257         colseps = {}
 258         line = self.block[bottom]
 259         for i in range(right - 1, left, -1):
 260             if line[i] == '+':
 261                 colseps[i] = [bottom]
 262             elif line[i] != '-':
 263                 return None
 264         if line[left] != '+':
 265             return None
 266         result = self.scan_up(top, left, bottom, right)
 267         if result is not None:
 268             rowseps = result
 269             return rowseps, colseps
 270         return None
 271
 272     def scan_up(self, top, left, bottom, right):
 273         """
 274         Noting row boundaries, see if we can return to the starting point.
 275         """
 276         rowseps = {}
 277         for i in range(bottom - 1, top, -1):
 278             if self.block[i][left] == '+':
 279                 rowseps[i] = [left]
 280             elif self.block[i][left] != '|':
 281                 return None
 282         return rowseps
 283
 284     def structure_from_cells(self):
 285         """
 286         From the data collected by `scan_cell()`, convert to the final data
 287         structure.
 288         """
 289         rowseps = self.rowseps.keys()   # list of row boundaries
 290         rowseps.sort()
 291         rowindex = {}
 292         for i in range(len(rowseps)):
 293             rowindex[rowseps[i]] = i    # row boundary -> row number mapping
 294         colseps = self.colseps.keys()   # list of column boundaries
 295         colseps.sort()
 296         colindex = {}
 297         for i in range(len(colseps)):
 298             colindex[colseps[i]] = i    # column boundary -> col number map
 299         colspecs = [(colseps[i] - colseps[i - 1] - 1)
 300                     for i in range(1, len(colseps))] # list of column widths
 301         # prepare an empty table with the correct number of rows & columns
 302         onerow = [None for i in range(len(colseps) - 1)]
 303         rows = [onerow[:] for i in range(len(rowseps) - 1)]
 304         # keep track of # of cells remaining; should reduce to zero
 305         remaining = (len(rowseps) - 1) * (len(colseps) - 1)
 306         for top, left, bottom, right, block in self.cells:
 307             rownum = rowindex[top]
 308             colnum = colindex[left]
 309             assert rows[rownum][colnum] is None, (
 310                   'Cell (row %s, column %s) already used.'
 311                   % (rownum + 1, colnum + 1))
 312             morerows = rowindex[bottom] - rownum - 1
 313             morecols = colindex[right] - colnum - 1
 314             remaining -= (morerows + 1) * (morecols + 1)
 315             # write the cell into the table
 316             rows[rownum][colnum] = (morerows, morecols, top + 1, block)
 317         assert remaining == 0, 'Unused cells remaining.'
 318         if self.head_body_sep:          # separate head rows from body rows
 319             numheadrows = rowindex[self.head_body_sep]
 320             headrows = rows[:numheadrows]
 321             bodyrows = rows[numheadrows:]
 322         else:
 323             headrows = []
 324             bodyrows = rows
 325         return (colspecs, headrows, bodyrows)
 326
 327
 328 class SimpleTableParser(TableParser):
 329
 330     """
 331     Parse a simple table using `parse()`.
 332
 333     Here's an example of a simple table::
 334
 335         =====  =====
 336         col 1  col 2
 337         =====  =====
 338         1      Second column of row 1.
 339         2      Second column of row 2.
 340                Second line of paragraph.
 341         3      - Second column of row 3.
 342
 343                - Second item in bullet
 344                  list (row 3, column 2).
 345         4 is a span
 346         ------------
 347         5
 348         =====  =====
 349
 350     Top and bottom borders use '=', column span underlines use '-', column
 351     separation is indicated with spaces.
 352
 353     Passing the above table to the `parse()` method will result in the
 354     following data structure, whose interpretation is the same as for
 355     `GridTableParser`::
 356
 357         ([5, 25],
 358          [[(0, 0, 1, ['col 1']),
 359            (0, 0, 1, ['col 2'])]],
 360          [[(0, 0, 3, ['1']),
 361            (0, 0, 3, ['Second column of row 1.'])],
 362           [(0, 0, 4, ['2']),
 363            (0, 0, 4, ['Second column of row 2.',
 364                       'Second line of paragraph.'])],
 365           [(0, 0, 6, ['3']),
 366            (0, 0, 6, ['- Second column of row 3.',
 367                       '',
 368                       '- Second item in bullet',
 369                       '  list (row 3, column 2).'])],
 370           [(0, 1, 10, ['4 is a span'])],
 371           [(0, 0, 12, ['5']),
 372            (0, 0, 12, [''])]])
 373     """
 374
 375     head_body_separator_pat = re.compile('=[ =]*$')
 376     span_pat = re.compile('-[ -]*$')
 377
 378     def setup(self, block):
 379         self.block = block[:]           # make a copy; it will be modified
 380         self.block.disconnect()         # don't propagate changes to parent
 381         # Convert top & bottom borders to column span underlines:
 382         self.block[0] = self.block[0].replace('=', '-')
 383         self.block[-1] = self.block[-1].replace('=', '-')
 384         self.head_body_sep = None
 385         self.columns = []
 386         self.border_end = None
 387         self.table = []
 388         self.done = [-1] * len(block[0])
 389         self.rowseps = {0: [0]}
 390         self.colseps = {0: [0]}
 391
 392     def parse_table(self):
 393         """
 394         First determine the column boundaries from the top border, then
 395         process rows.  Each row may consist of multiple lines; accumulate
 396         lines until a row is complete.  Call `self.parse_row` to finish the
 397         job.
 398         """
 399         # Top border must fully describe all table columns.
 400         self.columns = self.parse_columns(self.block[0], 0)
 401         self.border_end = self.columns[-1][1]
 402         firststart, firstend = self.columns[0]
 403         offset = 1                      # skip top border
 404         start = 1
 405         text_found = None
 406         while offset < len(self.block):
 407             line = self.block[offset]
 408             if self.span_pat.match(line):
 409                 # Column span underline or border; row is complete.
 410                 self.parse_row(self.block[start:offset], start,
 411                                (line.rstrip(), offset))
 412                 start = offset + 1
 413                 text_found = None
 414             elif line[firststart:firstend].strip():
 415                 # First column not blank, therefore it's a new row.
 416                 if text_found and offset != start:
 417                     self.parse_row(self.block[start:offset], start)
 418                 start = offset
 419                 text_found = 1
 420             elif not text_found:
 421                 start = offset + 1
 422             offset += 1
 423
 424     def parse_columns(self, line, offset):
 425         """
 426         Given a column span underline, return a list of (begin, end) pairs.
 427         """
 428         cols = []
 429         end = 0
 430         while True:
 431             begin = line.find('-', end)
 432             end = line.find(' ', begin)
 433             if begin < 0:
 434                 break
 435             if end < 0:
 436                 end = len(line)
 437             cols.append((begin, end))
 438         if self.columns:
 439             if cols[-1][1] != self.border_end:
 440                 raise TableMarkupError('Column span incomplete in table '
 441                                        'line %s.' % (offset+1),
 442                                        offset=offset)
 443             # Allow for an unbounded rightmost column:
 444             cols[-1] = (cols[-1][0], self.columns[-1][1])
 445         return cols
 446
 447     def init_row(self, colspec, offset):
 448         i = 0
 449         cells = []
 450         for start, end in colspec:
 451             morecols = 0
 452             try:
 453                 assert start == self.columns[i][0]
 454                 while end != self.columns[i][1]:
 455                     i += 1
 456                     morecols += 1
 457             except (AssertionError, IndexError):
 458                 raise TableMarkupError('Column span alignment problem '
 459                                        'in table line %s.' % (offset+2),
 460                                        offset=offset+1)
 461             cells.append([0, morecols, offset, []])
 462             i += 1
 463         return cells
 464
 465     def parse_row(self, lines, start, spanline=None):
 466         """
 467         Given the text `lines` of a row, parse it and append to `self.table`.
 468
 469         The row is parsed according to the current column spec (either
 470         `spanline` if provided or `self.columns`).  For each column, extract
 471         text from each line, and check for text in column margins.  Finally,
 472         adjust for insignificant whitespace.
 473         """
 474         if not (lines or spanline):
 475             # No new row, just blank lines.
 476             return
 477         if spanline:
 478             columns = self.parse_columns(*spanline)
 479             span_offset = spanline[1]
 480         else:
 481             columns = self.columns[:]
 482             span_offset = start
 483         self.check_columns(lines, start, columns)
 484         row = self.init_row(columns, start)
 485         for i in range(len(columns)):
 486             start, end = columns[i]
 487             cellblock = lines.get_2D_block(0, start, len(lines), end)
 488             cellblock.disconnect()      # lines in cell can't sync with parent
 489             cellblock.replace(self.double_width_pad_char, '')
 490             row[i][3] = cellblock
 491         self.table.append(row)
 492
 493     def check_columns(self, lines, first_line, columns):
 494         """
 495         Check for text in column margins and text overflow in the last column.
 496         Raise TableMarkupError if anything but whitespace is in column margins.
 497         Adjust the end value for the last column if there is text overflow.
 498         """
 499         # "Infinite" value for a dummy last column's beginning, used to
 500         # check for text overflow:
 501         columns.append((sys.maxint, None))
 502         lastcol = len(columns) - 2
 503         # combining characters do not contribute to the column width
 504         lines = [strip_combining_chars(line) for line in lines]
 505
 506         for i in range(len(columns) - 1):
 507             start, end = columns[i]
 508             nextstart = columns[i+1][0]
 509             offset = 0
 510             for line in lines:
 511                 if i == lastcol and line[end:].strip():
 512                     text = line[start:].rstrip()
 513                     new_end = start + len(text)
 514                     main_start, main_end = self.columns[-1]
 515                     columns[i] = (start, max(main_end, new_end))
 516                     if new_end > main_end:
 517                         self.columns[-1] = (main_start, new_end)
 518                 elif line[end:nextstart].strip():
 519                     raise TableMarkupError('Text in column margin '
 520                         'in table line %s.' % (first_line+offset+1),
 521                         offset=first_line+offset)
 522                 offset += 1
 523         columns.pop()
 524
 525     def structure_from_cells(self):
 526         colspecs = [end - start for start, end in self.columns]
 527         first_body_row = 0
 528         if self.head_body_sep:
 529             for i in range(len(self.table)):
 530                 if self.table[i][0][2] > self.head_body_sep:
 531                     first_body_row = i
 532                     break
 533         return (colspecs, self.table[:first_body_row],
 534                 self.table[first_body_row:])
 535
 536
 537 def update_dict_of_lists(master, newdata):
 538     """
 539     Extend the list values of `master` with those from `newdata`.
 540
 541     Both parameters must be dictionaries containing list values.
 542     """
 543     for key, values in newdata.items():
 544         master.setdefault(key, []).extend(values)