Lib/textwrap.py

   1 """Text wrapping and filling.
   2 """
   3
   4 # Copyright (C) 1999-2001 Gregory P. Ward.
   5 # Copyright (C) 2002, 2003 Python Software Foundation.
   6 # Written by Greg Ward <gward@python.net>
   7
   8 __revision__ = "$Id$"
   9
  10 import string, re
  11
  12 # Do the right thing with boolean values for all known Python versions
  13 # (so this module can be copied to projects that don't depend on Python
  14 # 2.3, e.g. Optik and Docutils).
  15 try:
  16     True, False
  17 except NameError:
  18     (True, False) = (1, 0)
  19
  20 __all__ = ['TextWrapper', 'wrap', 'fill']
  21
  22 # Hardcode the recognized whitespace characters to the US-ASCII
  23 # whitespace characters.  The main reason for doing this is that in
  24 # ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
  25 # that character winds up in string.whitespace.  Respecting
  26 # string.whitespace in those cases would 1) make textwrap treat 0xa0 the
  27 # same as any other whitespace char, which is clearly wrong (it's a
  28 # *non-breaking* space), 2) possibly cause problems with Unicode,
  29 # since 0xa0 is not in range(128).
  30 _whitespace = '\t\n\x0b\x0c\r '
  31
  32 class TextWrapper:
  33     """
  34     Object for wrapping/filling text.  The public interface consists of
  35     the wrap() and fill() methods; the other methods are just there for
  36     subclasses to override in order to tweak the default behaviour.
  37     If you want to completely replace the main wrapping algorithm,
  38     you'll probably have to override _wrap_chunks().
  39
  40     Several instance attributes control various aspects of wrapping:
  41       width (default: 70)
  42         the maximum width of wrapped lines (unless break_long_words
  43         is false)
  44       initial_indent (default: "")
  45         string that will be prepended to the first line of wrapped
  46         output.  Counts towards the line's width.
  47       subsequent_indent (default: "")
  48         string that will be prepended to all lines save the first
  49         of wrapped output; also counts towards each line's width.
  50       expand_tabs (default: true)
  51         Expand tabs in input text to spaces before further processing.
  52         Each tab will become 1 .. 8 spaces, depending on its position in
  53         its line.  If false, each tab is treated as a single character.
  54       replace_whitespace (default: true)
  55         Replace all whitespace characters in the input text by spaces
  56         after tab expansion.  Note that if expand_tabs is false and
  57         replace_whitespace is true, every tab will be converted to a
  58         single space!
  59       fix_sentence_endings (default: false)
  60         Ensure that sentence-ending punctuation is always followed
  61         by two spaces.  Off by default because the algorithm is
  62         (unavoidably) imperfect.
  63       break_long_words (default: true)
  64         Break words longer than 'width'.  If false, those words will not
  65         be broken, and some lines might be longer than 'width'.
  66       drop_whitespace (default: true)
  67         Drop leading and trailing whitespace from lines.
  68     """
  69
  70     whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))
  71
  72     unicode_whitespace_trans = {}
  73     uspace = ord(u' ')
  74     for x in map(ord, _whitespace):
  75         unicode_whitespace_trans[x] = uspace
  76
  77     # This funky little regex is just the trick for splitting
  78     # text up into word-wrappable chunks.  E.g.
  79     #   "Hello there -- you goof-ball, use the -b option!"
  80     # splits into
  81     #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
  82     # (after stripping out empty strings).
  83     wordsep_re = re.compile(
  84         r'(\s+|'                                  # any whitespace
  85         r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'   # hyphenated words
  86         r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash
  87
  88     # XXX this is not locale- or charset-aware -- string.lowercase
  89     # is US-ASCII only (and therefore English-only)
  90     sentence_end_re = re.compile(r'[%s]'              # lowercase letter
  91                                  r'[\.\!\?]'          # sentence-ending punct.
  92                                  r'[\"\']?'           # optional end-of-quote
  93                                  % string.lowercase)
  94
  95
  96     def __init__(self,
  97                  width=70,
  98                  initial_indent="",
  99                  subsequent_indent="",
 100                  expand_tabs=True,
 101                  replace_whitespace=True,
 102                  fix_sentence_endings=False,
 103                  break_long_words=True,
 104                  drop_whitespace=True):
 105         self.width = width
 106         self.initial_indent = initial_indent
 107         self.subsequent_indent = subsequent_indent
 108         self.expand_tabs = expand_tabs
 109         self.replace_whitespace = replace_whitespace
 110         self.fix_sentence_endings = fix_sentence_endings
 111         self.break_long_words = break_long_words
 112         self.drop_whitespace = drop_whitespace
 113
 114
 115     # -- Private methods -----------------------------------------------
 116     # (possibly useful for subclasses to override)
 117
 118     def _munge_whitespace(self, text):
 119         """_munge_whitespace(text : string) -> string
 120
 121         Munge whitespace in text: expand tabs and convert all other
 122         whitespace characters to spaces.  Eg. " foo\tbar\n\nbaz"
 123         becomes " foo    bar  baz".
 124         """
 125         if self.expand_tabs:
 126             text = text.expandtabs()
 127         if self.replace_whitespace:
 128             if isinstance(text, str):
 129                 text = text.translate(self.whitespace_trans)
 130             elif isinstance(text, unicode):
 131                 text = text.translate(self.unicode_whitespace_trans)
 132         return text
 133
 134
 135     def _split(self, text):
 136         """_split(text : string) -> [string]
 137
 138         Split the text to wrap into indivisible chunks.  Chunks are
 139         not quite the same as words; see wrap_chunks() for full
 140         details.  As an example, the text
 141           Look, goof-ball -- use the -b option!
 142         breaks into the following chunks:
 143           'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
 144           'use', ' ', 'the', ' ', '-b', ' ', 'option!'
 145         """
 146         chunks = self.wordsep_re.split(text)
 147         chunks = filter(None, chunks)  # remove empty chunks
 148         return chunks
 149
 150     def _fix_sentence_endings(self, chunks):
 151         """_fix_sentence_endings(chunks : [string])
 152
 153         Correct for sentence endings buried in 'chunks'.  Eg. when the
 154         original text contains "... foo.\nBar ...", munge_whitespace()
 155         and split() will convert that to [..., "foo.", " ", "Bar", ...]
 156         which has one too few spaces; this method simply changes the one
 157         space to two.
 158         """
 159         i = 0
 160         pat = self.sentence_end_re
 161         while i < len(chunks)-1:
 162             if chunks[i+1] == " " and pat.search(chunks[i]):
 163                 chunks[i+1] = "  "
 164                 i += 2
 165             else:
 166                 i += 1
 167
 168     def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
 169         """_handle_long_word(chunks : [string],
 170                              cur_line : [string],
 171                              cur_len : int, width : int)
 172
 173         Handle a chunk of text (most likely a word, not whitespace) that
 174         is too long to fit in any line.
 175         """
 176         space_left = max(width - cur_len, 1)
 177
 178         # If we're allowed to break long words, then do so: put as much
 179         # of the next chunk onto the current line as will fit.
 180         if self.break_long_words:
 181             cur_line.append(reversed_chunks[-1][:space_left])
 182             reversed_chunks[-1] = reversed_chunks[-1][space_left:]
 183
 184         # Otherwise, we have to preserve the long word intact.  Only add
 185         # it to the current line if there's nothing already there --
 186         # that minimizes how much we violate the width constraint.
 187         elif not cur_line:
 188             cur_line.append(reversed_chunks.pop())
 189
 190         # If we're not allowed to break long words, and there's already
 191         # text on the current line, do nothing.  Next time through the
 192         # main loop of _wrap_chunks(), we'll wind up here again, but
 193         # cur_len will be zero, so the next line will be entirely
 194         # devoted to the long word that we can't handle right now.
 195
 196     def _wrap_chunks(self, chunks):
 197         """_wrap_chunks(chunks : [string]) -> [string]
 198
 199         Wrap a sequence of text chunks and return a list of lines of
 200         length 'self.width' or less.  (If 'break_long_words' is false,
 201         some lines may be longer than this.)  Chunks correspond roughly
 202         to words and the whitespace between them: each chunk is
 203         indivisible (modulo 'break_long_words'), but a line break can
 204         come between any two chunks.  Chunks should not have internal
 205         whitespace; ie. a chunk is either all whitespace or a "word".
 206         Whitespace chunks will be removed from the beginning and end of
 207         lines, but apart from that whitespace is preserved.
 208         """
 209         lines = []
 210         if self.width <= 0:
 211             raise ValueError("invalid width %r (must be > 0)" % self.width)
 212
 213         # Arrange in reverse order so items can be efficiently popped
 214         # from a stack of chucks.
 215         chunks.reverse()
 216
 217         while chunks:
 218
 219             # Start the list of chunks that will make up the current line.
 220             # cur_len is just the length of all the chunks in cur_line.
 221             cur_line = []
 222             cur_len = 0
 223
 224             # Figure out which static string will prefix this line.
 225             if lines:
 226                 indent = self.subsequent_indent
 227             else:
 228                 indent = self.initial_indent
 229
 230             # Maximum width for this line.
 231             width = self.width - len(indent)
 232
 233             # First chunk on line is whitespace -- drop it, unless this
 234             # is the very beginning of the text (ie. no lines started yet).
 235             if self.drop_whitespace and chunks[-1].strip() == '' and lines:
 236                 del chunks[-1]
 237
 238             while chunks:
 239                 l = len(chunks[-1])
 240
 241                 # Can at least squeeze this chunk onto the current line.
 242                 if cur_len + l <= width:
 243                     cur_line.append(chunks.pop())
 244                     cur_len += l
 245
 246                 # Nope, this line is full.
 247                 else:
 248                     break
 249
 250             # The current line is full, and the next chunk is too big to
 251             # fit on *any* line (not just this one).
 252             if chunks and len(chunks[-1]) > width:
 253                 self._handle_long_word(chunks, cur_line, cur_len, width)
 254
 255             # If the last chunk on this line is all whitespace, drop it.
 256             if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
 257                 del cur_line[-1]
 258
 259             # Convert current line back to a string and store it in list
 260             # of all lines (return value).
 261             if cur_line:
 262                 lines.append(indent + ''.join(cur_line))
 263
 264         return lines
 265
 266
 267     # -- Public interface ----------------------------------------------
 268
 269     def wrap(self, text):
 270         """wrap(text : string) -> [string]
 271
 272         Reformat the single paragraph in 'text' so it fits in lines of
 273         no more than 'self.width' columns, and return a list of wrapped
 274         lines.  Tabs in 'text' are expanded with string.expandtabs(),
 275         and all other whitespace characters (including newline) are
 276         converted to space.
 277         """
 278         text = self._munge_whitespace(text)
 279         chunks = self._split(text)
 280         if self.fix_sentence_endings:
 281             self._fix_sentence_endings(chunks)
 282         return self._wrap_chunks(chunks)
 283
 284     def fill(self, text):
 285         """fill(text : string) -> string
 286
 287         Reformat the single paragraph in 'text' to fit in lines of no
 288         more than 'self.width' columns, and return a new string
 289         containing the entire wrapped paragraph.
 290         """
 291         return "\n".join(self.wrap(text))
 292
 293
 294 # -- Convenience interface ---------------------------------------------
 295
 296 def wrap(text, width=70, **kwargs):
 297     """Wrap a single paragraph of text, returning a list of wrapped lines.
 298
 299     Reformat the single paragraph in 'text' so it fits in lines of no
 300     more than 'width' columns, and return a list of wrapped lines.  By
 301     default, tabs in 'text' are expanded with string.expandtabs(), and
 302     all other whitespace characters (including newline) are converted to
 303     space.  See TextWrapper class for available keyword args to customize
 304     wrapping behaviour.
 305     """
 306     w = TextWrapper(width=width, **kwargs)
 307     return w.wrap(text)
 308
 309 def fill(text, width=70, **kwargs):
 310     """Fill a single paragraph of text, returning a new string.
 311
 312     Reformat the single paragraph in 'text' to fit in lines of no more
 313     than 'width' columns, and return a new string containing the entire
 314     wrapped paragraph.  As with wrap(), tabs are expanded and other
 315     whitespace characters converted to space.  See TextWrapper class for
 316     available keyword args to customize wrapping behaviour.
 317     """
 318     w = TextWrapper(width=width, **kwargs)
 319     return w.fill(text)
 320
 321
 322 # -- Loosely related functionality -------------------------------------
 323
 324 _whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
 325 _leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
 326
 327 def dedent(text):
 328     """Remove any common leading whitespace from every line in `text`.
 329
 330     This can be used to make triple-quoted strings line up with the left
 331     edge of the display, while still presenting them in the source code
 332     in indented form.
 333
 334     Note that tabs and spaces are both treated as whitespace, but they
 335     are not equal: the lines "  hello" and "\thello" are
 336     considered to have no common leading whitespace.  (This behaviour is
 337     new in Python 2.5; older versions of this module incorrectly
 338     expanded tabs before searching for common leading whitespace.)
 339     """
 340     # Look for the longest leading string of spaces and tabs common to
 341     # all lines.
 342     margin = None
 343     text = _whitespace_only_re.sub('', text)
 344     indents = _leading_whitespace_re.findall(text)
 345     for indent in indents:
 346         if margin is None:
 347             margin = indent
 348
 349         # Current line more deeply indented than previous winner:
 350         # no change (previous winner is still on top).
 351         elif indent.startswith(margin):
 352             pass
 353
 354         # Current line consistent with and no deeper than previous winner:
 355         # it's the new winner.
 356         elif margin.startswith(indent):
 357             margin = indent
 358
 359         # Current line and previous winner have no common whitespace:
 360         # there is no margin.
 361         else:
 362             margin = ""
 363             break
 364
 365     # sanity check (testing/debugging only)
 366     if 0 and margin:
 367         for line in text.split("\n"):
 368             assert not line or line.startswith(margin), \
 369                    "line = %r, margin = %r" % (line, margin)
 370
 371     if margin:
 372         text = re.sub(r'(?m)^' + margin, '', text)
 373     return text
 374
 375 if __name__ == "__main__":
 376     #print dedent("\tfoo\n\tbar")
 377     #print dedent("  \thello there\n  \t  how are you?")
 378     print dedent("Hello there.\n  This is indented.")