Lib/textwrap.py

   1 """Text wrapping and filling.
   2 """
   3
   4 # Copyright (C) 1999-2001 Gregory P. Ward.
   5 # Copyright (C) 2002, 2003 Python Software Foundation.
   6 # Written by Greg Ward <gward@python.net>
   7
   8 __revision__ = "$Id$"
   9
  10 import string, re
  11
  12 __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']
  13
  14 # Hardcode the recognized whitespace characters to the US-ASCII
  15 # whitespace characters.  The main reason for doing this is that in
  16 # ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
  17 # that character winds up in string.whitespace.  Respecting
  18 # string.whitespace in those cases would 1) make textwrap treat 0xa0 the
  19 # same as any other whitespace char, which is clearly wrong (it's a
  20 # *non-breaking* space), 2) possibly cause problems with Unicode,
  21 # since 0xa0 is not in range(128).
  22 _whitespace = '\t\n\x0b\x0c\r '
  23
  24 class TextWrapper:
  25     """
  26     Object for wrapping/filling text.  The public interface consists of
  27     the wrap() and fill() methods; the other methods are just there for
  28     subclasses to override in order to tweak the default behaviour.
  29     If you want to completely replace the main wrapping algorithm,
  30     you'll probably have to override _wrap_chunks().
  31
  32     Several instance attributes control various aspects of wrapping:
  33       width (default: 70)
  34         the maximum width of wrapped lines (unless break_long_words
  35         is false)
  36       initial_indent (default: "")
  37         string that will be prepended to the first line of wrapped
  38         output.  Counts towards the line's width.
  39       subsequent_indent (default: "")
  40         string that will be prepended to all lines save the first
  41         of wrapped output; also counts towards each line's width.
  42       expand_tabs (default: true)
  43         Expand tabs in input text to spaces before further processing.
  44         Each tab will become 1 .. 8 spaces, depending on its position in
  45         its line.  If false, each tab is treated as a single character.
  46       replace_whitespace (default: true)
  47         Replace all whitespace characters in the input text by spaces
  48         after tab expansion.  Note that if expand_tabs is false and
  49         replace_whitespace is true, every tab will be converted to a
  50         single space!
  51       fix_sentence_endings (default: false)
  52         Ensure that sentence-ending punctuation is always followed
  53         by two spaces.  Off by default because the algorithm is
  54         (unavoidably) imperfect.
  55       break_long_words (default: true)
  56         Break words longer than 'width'.  If false, those words will not
  57         be broken, and some lines might be longer than 'width'.
  58       break_on_hyphens (default: true)
  59         Allow breaking hyphenated words. If true, wrapping will occur
  60         preferably on whitespaces and right after hyphens part of
  61         compound words.
  62       drop_whitespace (default: true)
  63         Drop leading and trailing whitespace from lines.
  64     """
  65
  66     unicode_whitespace_trans = {}
  67     uspace = ord(' ')
  68     for x in _whitespace:
  69         unicode_whitespace_trans[ord(x)] = uspace
  70
  71     # This funky little regex is just the trick for splitting
  72     # text up into word-wrappable chunks.  E.g.
  73     #   "Hello there -- you goof-ball, use the -b option!"
  74     # splits into
  75     #   Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
  76     # (after stripping out empty strings).
  77     wordsep_re = re.compile(
  78         r'(\s+|'                                  # any whitespace
  79         r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|'   # hyphenated words
  80         r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))')   # em-dash
  81
  82     # This less funky little regex just split on recognized spaces. E.g.
  83     #   "Hello there -- you goof-ball, use the -b option!"
  84     # splits into
  85     #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
  86     wordsep_simple_re = re.compile(r'(\s+)')
  87
  88     # XXX this is not locale- or charset-aware -- string.lowercase
  89     # is US-ASCII only (and therefore English-only)
  90     sentence_end_re = re.compile(r'[a-z]'             # lowercase letter
  91                                  r'[\.\!\?]'          # sentence-ending punct.
  92                                  r'[\"\']?'           # optional end-of-quote
  93                                  r'\Z')               # end of chunk
  94
  95
  96     def __init__(self,
  97                  width=70,
  98                  initial_indent="",
  99                  subsequent_indent="",
 100                  expand_tabs=True,
 101                  replace_whitespace=True,
 102                  fix_sentence_endings=False,
 103                  break_long_words=True,
 104                  drop_whitespace=True,
 105                  break_on_hyphens=True):
 106         self.width = width
 107         self.initial_indent = initial_indent
 108         self.subsequent_indent = subsequent_indent
 109         self.expand_tabs = expand_tabs
 110         self.replace_whitespace = replace_whitespace
 111         self.fix_sentence_endings = fix_sentence_endings
 112         self.break_long_words = break_long_words
 113         self.drop_whitespace = drop_whitespace
 114         self.break_on_hyphens = break_on_hyphens
 115
 116
 117     # -- Private methods -----------------------------------------------
 118     # (possibly useful for subclasses to override)
 119
 120     def _munge_whitespace(self, text):
 121         """_munge_whitespace(text : string) -> string
 122
 123         Munge whitespace in text: expand tabs and convert all other
 124         whitespace characters to spaces.  Eg. " foo\tbar\n\nbaz"
 125         becomes " foo    bar  baz".
 126         """
 127         if self.expand_tabs:
 128             text = text.expandtabs()
 129         if self.replace_whitespace:
 130             text = text.translate(self.unicode_whitespace_trans)
 131         return text
 132
 133
 134     def _split(self, text):
 135         """_split(text : string) -> [string]
 136
 137         Split the text to wrap into indivisible chunks.  Chunks are
 138         not quite the same as words; see _wrap_chunks() for full
 139         details.  As an example, the text
 140           Look, goof-ball -- use the -b option!
 141         breaks into the following chunks:
 142           'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
 143           'use', ' ', 'the', ' ', '-b', ' ', 'option!'
 144         if break_on_hyphens is True, or in:
 145           'Look,', ' ', 'goof-ball', ' ', '--', ' ',
 146           'use', ' ', 'the', ' ', '-b', ' ', option!'
 147         otherwise.
 148         """
 149         if self.break_on_hyphens is True:
 150             chunks = self.wordsep_re.split(text)
 151         else:
 152             chunks = self.wordsep_simple_re.split(text)
 153         chunks = [c for c in chunks if c]
 154         return chunks
 155
 156     def _fix_sentence_endings(self, chunks):
 157         """_fix_sentence_endings(chunks : [string])
 158
 159         Correct for sentence endings buried in 'chunks'.  Eg. when the
 160         original text contains "... foo.\nBar ...", munge_whitespace()
 161         and split() will convert that to [..., "foo.", " ", "Bar", ...]
 162         which has one too few spaces; this method simply changes the one
 163         space to two.
 164         """
 165         i = 0
 166         patsearch = self.sentence_end_re.search
 167         while i < len(chunks)-1:
 168             if chunks[i+1] == " " and patsearch(chunks[i]):
 169                 chunks[i+1] = "  "
 170                 i += 2
 171             else:
 172                 i += 1
 173
 174     def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
 175         """_handle_long_word(chunks : [string],
 176                              cur_line : [string],
 177                              cur_len : int, width : int)
 178
 179         Handle a chunk of text (most likely a word, not whitespace) that
 180         is too long to fit in any line.
 181         """
 182         # Figure out when indent is larger than the specified width, and make
 183         # sure at least one character is stripped off on every pass
 184         if width < 1:
 185             space_left = 1
 186         else:
 187             space_left = width - cur_len
 188
 189         # If we're allowed to break long words, then do so: put as much
 190         # of the next chunk onto the current line as will fit.
 191         if self.break_long_words:
 192             cur_line.append(reversed_chunks[-1][:space_left])
 193             reversed_chunks[-1] = reversed_chunks[-1][space_left:]
 194
 195         # Otherwise, we have to preserve the long word intact.  Only add
 196         # it to the current line if there's nothing already there --
 197         # that minimizes how much we violate the width constraint.
 198         elif not cur_line:
 199             cur_line.append(reversed_chunks.pop())
 200
 201         # If we're not allowed to break long words, and there's already
 202         # text on the current line, do nothing.  Next time through the
 203         # main loop of _wrap_chunks(), we'll wind up here again, but
 204         # cur_len will be zero, so the next line will be entirely
 205         # devoted to the long word that we can't handle right now.
 206
 207     def _wrap_chunks(self, chunks):
 208         """_wrap_chunks(chunks : [string]) -> [string]
 209
 210         Wrap a sequence of text chunks and return a list of lines of
 211         length 'self.width' or less.  (If 'break_long_words' is false,
 212         some lines may be longer than this.)  Chunks correspond roughly
 213         to words and the whitespace between them: each chunk is
 214         indivisible (modulo 'break_long_words'), but a line break can
 215         come between any two chunks.  Chunks should not have internal
 216         whitespace; ie. a chunk is either all whitespace or a "word".
 217         Whitespace chunks will be removed from the beginning and end of
 218         lines, but apart from that whitespace is preserved.
 219         """
 220         lines = []
 221         if self.width <= 0:
 222             raise ValueError("invalid width %r (must be > 0)" % self.width)
 223
 224         # Arrange in reverse order so items can be efficiently popped
 225         # from a stack of chucks.
 226         chunks.reverse()
 227
 228         while chunks:
 229
 230             # Start the list of chunks that will make up the current line.
 231             # cur_len is just the length of all the chunks in cur_line.
 232             cur_line = []
 233             cur_len = 0
 234
 235             # Figure out which static string will prefix this line.
 236             if lines:
 237                 indent = self.subsequent_indent
 238             else:
 239                 indent = self.initial_indent
 240
 241             # Maximum width for this line.
 242             width = self.width - len(indent)
 243
 244             # First chunk on line is whitespace -- drop it, unless this
 245             # is the very beginning of the text (ie. no lines started yet).
 246             if self.drop_whitespace and chunks[-1].strip() == '' and lines:
 247                 del chunks[-1]
 248
 249             while chunks:
 250                 l = len(chunks[-1])
 251
 252                 # Can at least squeeze this chunk onto the current line.
 253                 if cur_len + l <= width:
 254                     cur_line.append(chunks.pop())
 255                     cur_len += l
 256
 257                 # Nope, this line is full.
 258                 else:
 259                     break
 260
 261             # The current line is full, and the next chunk is too big to
 262             # fit on *any* line (not just this one).
 263             if chunks and len(chunks[-1]) > width:
 264                 self._handle_long_word(chunks, cur_line, cur_len, width)
 265
 266             # If the last chunk on this line is all whitespace, drop it.
 267             if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
 268                 del cur_line[-1]
 269
 270             # Convert current line back to a string and store it in list
 271             # of all lines (return value).
 272             if cur_line:
 273                 lines.append(indent + ''.join(cur_line))
 274
 275         return lines
 276
 277
 278     # -- Public interface ----------------------------------------------
 279
 280     def wrap(self, text):
 281         """wrap(text : string) -> [string]
 282
 283         Reformat the single paragraph in 'text' so it fits in lines of
 284         no more than 'self.width' columns, and return a list of wrapped
 285         lines.  Tabs in 'text' are expanded with string.expandtabs(),
 286         and all other whitespace characters (including newline) are
 287         converted to space.
 288         """
 289         text = self._munge_whitespace(text)
 290         chunks = self._split(text)
 291         if self.fix_sentence_endings:
 292             self._fix_sentence_endings(chunks)
 293         return self._wrap_chunks(chunks)
 294
 295     def fill(self, text):
 296         """fill(text : string) -> string
 297
 298         Reformat the single paragraph in 'text' to fit in lines of no
 299         more than 'self.width' columns, and return a new string
 300         containing the entire wrapped paragraph.
 301         """
 302         return "\n".join(self.wrap(text))
 303
 304
 305 # -- Convenience interface ---------------------------------------------
 306
 307 def wrap(text, width=70, **kwargs):
 308     """Wrap a single paragraph of text, returning a list of wrapped lines.
 309
 310     Reformat the single paragraph in 'text' so it fits in lines of no
 311     more than 'width' columns, and return a list of wrapped lines.  By
 312     default, tabs in 'text' are expanded with string.expandtabs(), and
 313     all other whitespace characters (including newline) are converted to
 314     space.  See TextWrapper class for available keyword args to customize
 315     wrapping behaviour.
 316     """
 317     w = TextWrapper(width=width, **kwargs)
 318     return w.wrap(text)
 319
 320 def fill(text, width=70, **kwargs):
 321     """Fill a single paragraph of text, returning a new string.
 322
 323     Reformat the single paragraph in 'text' to fit in lines of no more
 324     than 'width' columns, and return a new string containing the entire
 325     wrapped paragraph.  As with wrap(), tabs are expanded and other
 326     whitespace characters converted to space.  See TextWrapper class for
 327     available keyword args to customize wrapping behaviour.
 328     """
 329     w = TextWrapper(width=width, **kwargs)
 330     return w.fill(text)
 331
 332
 333 # -- Loosely related functionality -------------------------------------
 334
 335 _whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
 336 _leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
 337
 338 def dedent(text):
 339     """Remove any common leading whitespace from every line in `text`.
 340
 341     This can be used to make triple-quoted strings line up with the left
 342     edge of the display, while still presenting them in the source code
 343     in indented form.
 344
 345     Note that tabs and spaces are both treated as whitespace, but they
 346     are not equal: the lines "  hello" and "\thello" are
 347     considered to have no common leading whitespace.  (This behaviour is
 348     new in Python 2.5; older versions of this module incorrectly
 349     expanded tabs before searching for common leading whitespace.)
 350     """
 351     # Look for the longest leading string of spaces and tabs common to
 352     # all lines.
 353     margin = None
 354     text = _whitespace_only_re.sub('', text)
 355     indents = _leading_whitespace_re.findall(text)
 356     for indent in indents:
 357         if margin is None:
 358             margin = indent
 359
 360         # Current line more deeply indented than previous winner:
 361         # no change (previous winner is still on top).
 362         elif indent.startswith(margin):
 363             pass
 364
 365         # Current line consistent with and no deeper than previous winner:
 366         # it's the new winner.
 367         elif margin.startswith(indent):
 368             margin = indent
 369
 370         # Current line and previous winner have no common whitespace:
 371         # there is no margin.
 372         else:
 373             margin = ""
 374             break
 375
 376     # sanity check (testing/debugging only)
 377     if 0 and margin:
 378         for line in text.split("\n"):
 379             assert not line or line.startswith(margin), \
 380                    "line = %r, margin = %r" % (line, margin)
 381
 382     if margin:
 383         text = re.sub(r'(?m)^' + margin, '', text)
 384     return text
 385
if __name__ == "__main__":
    # Manual smoke test, run only when this file is executed as a
    # script.  The first line of the sample has no indentation, so
    # dedent() finds no common margin and the text prints unchanged.
    # The two commented-out calls are older examples kept for reference
    # (they use the Python 2 print statement).
    #print dedent("\tfoo\n\tbar")
    #print dedent("  \thello there\n  \t  how are you?")
    print(dedent("Hello there.\n  This is indented."))