cola/textwrap.py

   1 """Text wrapping and filling.
   2 """
   3
   4 # Copyright (C) 1999-2001 Gregory P. Ward.
   5 # Copyright (C) 2002, 2003 Python Software Foundation.
   6 # Copyright (C) 2013, David Aguilar
   7 # Written by Greg Ward <gward@python.net>
   8 # Simplified for git-cola by David Aguilar <davvid@gmail.com>
   9
  10 import re
  11
  12
  13 class TextWrapper(object):
  14     """
  15     Object for wrapping/filling text.  The public interface consists of
  16     the wrap() and fill() methods; the other methods are just there for
  17     subclasses to override in order to tweak the default behaviour.
  18     If you want to completely replace the main wrapping algorithm,
  19     you'll probably have to override _wrap_chunks().
  20
  21     Several instance attributes control various aspects of wrapping:
  22       width (default: 70)
  23         The preferred width of wrapped lines.
  24       tabwidth (default: 8)
  25         The width of a tab used when calculating line length.
  26       break_on_hyphens (default: true)
  27         Allow breaking hyphenated words. If true, wrapping will occur
  28         preferably on whitespaces and right after hyphens part of
  29         compound words.
  30       drop_whitespace (default: true)
  31         Drop leading and trailing whitespace from lines.
  32     """
  33
  34     # This funky little regex is just the trick for splitting
  35     # text up into word-wrappable chunks.  E.g.
  36     #   "Hello there -- you goof-ball, use the -b option!"
  37     # splits into
  38     #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
  39     # (after stripping out empty strings).
  40     wordsep_re = re.compile(
  41         r'(\s+|'                                  # any whitespace
  42         r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
  43         r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash
  44
  45     # This less funky little regex just split on recognized spaces. E.g.
  46     #   "Hello there -- you goof-ball, use the -b option!"
  47     # splits into
  48     #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
  49     wordsep_simple_re = re.compile(r'(\s+)')
  50
  51     def __init__(self,
  52                  width=70,
  53                  tabwidth=8,
  54                  break_on_hyphens=True,
  55                  drop_whitespace=True):
  56         self.width = width
  57         self.tabwidth = tabwidth
  58         self.break_on_hyphens = break_on_hyphens
  59         self.drop_whitespace = drop_whitespace
  60
  61         # recompile the regexes for Unicode mode -- done in this clumsy way for
  62         # backwards compatibility because it's rather common to monkey-patch
  63         # the TextWrapper class' wordsep_re attribute.
  64         self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
  65         self.wordsep_simple_re_uni = re.compile(
  66             self.wordsep_simple_re.pattern, re.U)
  67
  68     def _split(self, text):
  69         """_split(text : string) -> [string]
  70
  71         Split the text to wrap into indivisible chunks.  Chunks are
  72         not quite the same as words; see _wrap_chunks() for full
  73         details.  As an example, the text
  74           Look, goof-ball -- use the -b option!
  75         breaks into the following chunks:
  76           'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
  77           'use', ' ', 'the', ' ', '-b', ' ', 'option!'
  78         if break_on_hyphens is True, or in:
  79           'Look,', ' ', 'goof-ball', ' ', '--', ' ',
  80           'use', ' ', 'the', ' ', '-b', ' ', option!'
  81         otherwise.
  82         """
  83         if isinstance(text, unicode):
  84             if self.break_on_hyphens:
  85                 pat = self.wordsep_re_uni
  86             else:
  87                 pat = self.wordsep_simple_re_uni
  88         else:
  89             if self.break_on_hyphens:
  90                 pat = self.wordsep_re
  91             else:
  92                 pat = self.wordsep_simple_re
  93         chunks = pat.split(text)
  94         chunks = filter(None, chunks)  # remove empty chunks
  95         return chunks
  96
  97     def _wrap_chunks(self, chunks):
  98         """_wrap_chunks(chunks : [string]) -> [string]
  99
 100         Wrap a sequence of text chunks and return a list of lines of length
 101         'self.width' or less.  Some lines may be longer than this.  Chunks
 102         correspond roughly to words and the whitespace between them: each
 103         chunk is indivisible, but a line break can come between any two
 104         chunks.  Chunks should not have internal whitespace; ie. a chunk is
 105         either all whitespace or a "word".  Whitespace chunks will be removed
 106         from the beginning and end of lines, but apart from that whitespace is
 107         preserved.
 108         """
 109         lines = []
 110
 111         # Arrange in reverse order so items can be efficiently popped
 112         # from a stack of chucks.
 113         chunks.reverse()
 114
 115         while chunks:
 116
 117             # Start the list of chunks that will make up the current line.
 118             # cur_len is just the length of all the chunks in cur_line.
 119             cur_line = []
 120             cur_len = 0
 121
 122             # Maximum width for this line.
 123             width = self.width
 124
 125             # First chunk on line is a space -- drop it, unless this
 126             # is the very beginning of the text (ie. no lines started yet).
 127             if self.drop_whitespace and chunks[-1] == ' ' and lines:
 128                 chunks.pop()
 129
 130             while chunks:
 131                 l = self.chunklen(chunks[-1])
 132
 133                 # Can at least squeeze this chunk onto the current line.
 134                 if cur_len + l <= width:
 135                     cur_line.append(chunks.pop())
 136                     cur_len += l
 137                 # Nope, this line is full.
 138                 else:
 139                     break
 140
 141             # The current line is full, and the next chunk is too big to
 142             # fit on *any* line (not just this one).
 143             if chunks and self.chunklen(chunks[-1]) > width:
 144                 if not cur_line:
 145                     cur_line.append(chunks.pop())
 146
 147             # If the last chunk on this line is all a space, drop it.
 148             if self.drop_whitespace and cur_line and cur_line[-1] == ' ':
 149                 cur_line.pop()
 150
 151             # Avoid whitespace at the beginining of the line.
 152             if (self.drop_whitespace and cur_line and
 153                     cur_line[0] in (' ', '  ')):
 154                 cur_line.pop(0)
 155
 156             # Convert current line back to a string and store it in list
 157             # of all lines (return value).
 158             if cur_line:
 159                 lines.append(''.join(cur_line))
 160
 161         return lines
 162
 163     def chunklen(self, word):
 164         """Return length of a word taking tabs into account
 165
 166         >>> w = TextWrapper(tabwidth=8)
 167         >>> w.chunklen("\\t\\t\\t\\tX")
 168         33
 169
 170         """
 171         return len(word.replace('\t', '')) + word.count('\t') * self.tabwidth
 172
 173
 174     # -- Public interface ----------------------------------------------
 175
 176     def wrap(self, text):
 177         """wrap(text : string) -> [string]
 178
 179         Reformat the single paragraph in 'text' so it fits in lines of
 180         no more than 'self.width' columns, and return a list of wrapped
 181         lines.  Tabs in 'text' are expanded with string.expandtabs(),
 182         and all other whitespace characters (including newline) are
 183         converted to space.
 184         """
 185         chunks = self._split(text)
 186         return self._wrap_chunks(chunks)
 187
 188     def fill(self, text):
 189         """fill(text : string) -> string
 190
 191         Reformat the single paragraph in 'text' to fit in lines of no
 192         more than 'self.width' columns, and return a new string
 193         containing the entire wrapped paragraph.
 194         """
 195         return "\n".join(self.wrap(text))
 196
 197
 198 def word_wrap(text, tabwidth, limit):
 199     r"""Wrap long lines to the specified limit
 200
 201     >>> text = 'a bb ccc dddd\neeeee'
 202     >>> word_wrap(text, 8, 2)
 203     'a\nbb\nccc\ndddd\neeeee'
 204
 205     >>> word_wrap(text, 8, 4)
 206     'a bb\nccc\ndddd\neeeee'
 207
 208     >>> text = 'a bb ccc dddd\n\teeeee'
 209     >>> word_wrap(text, 8, 4)
 210     'a bb\nccc\ndddd\n\t\neeeee'
 211
 212     """
 213
 214     lines = []
 215
 216     # Acked-by:, Signed-off-by:, Helped-by:, etc.
 217     special_tag_rgx = re.compile('^[a-zA-Z_-]+:')
 218
 219     w = TextWrapper(width=limit,
 220                     tabwidth=tabwidth,
 221                     break_on_hyphens=True,
 222                     drop_whitespace=True)
 223
 224     for line in text.split('\n'):
 225         if special_tag_rgx.match(line):
 226             lines.append(line)
 227         else:
 228             lines.append(w.fill(line))
 229
 230     return '\n'.join(lines)