1 # Pod::PlainText -- Convert POD data to formatted ASCII text.
2 # $Id: Text.pm,v 2.1 1999/09/20 11:53:33 eagle Exp $
4 # Copyright 1999-2000 by Russ Allbery <rra@stanford.edu>
6 # This program is free software; you can redistribute it and/or modify it
7 # under the same terms as Perl itself.
9 # This module is intended to be a replacement for Pod::Text, and attempts to
10 # match its output except for some specific circumstances where other
11 # decisions seemed to produce better output. It uses Pod::Parser and is
12 # designed to be very easy to subclass.
14 ############################################################################
15 # Modules and declarations
16 ############################################################################
18 package Pod
::PlainText
;
23 use Carp
qw(carp croak);
26 use vars
qw(@ISA %ESCAPES $VERSION);
28 # We inherit from Pod::Select instead of Pod::Parser so that we can be used
30 @ISA = qw(Pod::Select);
41 ############################################################################
42 # Table of supported E<> escapes
43 ############################################################################
45 # This table is taken near verbatim from Pod::PlainText in Pod::Parser,
46 # which got it near verbatim from the original Pod::Text. It is therefore
47 # credited to Tom Christiansen, and I'm glad I didn't have to write it. :)
49 'amp' => '&', # ampersand
50 'lt' => '<', # left chevron, less-than
51 'gt' => '>', # right chevron, greater-than
52 'quot' => '"', # double quote
54 "Aacute" => "\xC1", # capital A, acute accent
55 "aacute" => "\xE1", # small a, acute accent
56 "Acirc" => "\xC2", # capital A, circumflex accent
57 "acirc" => "\xE2", # small a, circumflex accent
58 "AElig" => "\xC6", # capital AE diphthong (ligature)
59 "aelig" => "\xE6", # small ae diphthong (ligature)
60 "Agrave" => "\xC0", # capital A, grave accent
61 "agrave" => "\xE0", # small a, grave accent
62 "Aring" => "\xC5", # capital A, ring
63 "aring" => "\xE5", # small a, ring
64 "Atilde" => "\xC3", # capital A, tilde
65 "atilde" => "\xE3", # small a, tilde
66 "Auml" => "\xC4", # capital A, dieresis or umlaut mark
67 "auml" => "\xE4", # small a, dieresis or umlaut mark
68 "Ccedil" => "\xC7", # capital C, cedilla
69 "ccedil" => "\xE7", # small c, cedilla
70 "Eacute" => "\xC9", # capital E, acute accent
71 "eacute" => "\xE9", # small e, acute accent
72 "Ecirc" => "\xCA", # capital E, circumflex accent
73 "ecirc" => "\xEA", # small e, circumflex accent
74 "Egrave" => "\xC8", # capital E, grave accent
75 "egrave" => "\xE8", # small e, grave accent
76 "ETH" => "\xD0", # capital Eth, Icelandic
77 "eth" => "\xF0", # small eth, Icelandic
78 "Euml" => "\xCB", # capital E, dieresis or umlaut mark
79 "euml" => "\xEB", # small e, dieresis or umlaut mark
80 "Iacute" => "\xCD", # capital I, acute accent
81 "iacute" => "\xED", # small i, acute accent
82 "Icirc" => "\xCE", # capital I, circumflex accent
83 "icirc" => "\xEE", # small i, circumflex accent
84 "Igrave" => "\xCC", # capital I, grave accent
85 "igrave" => "\xEC", # small i, grave accent
86 "Iuml" => "\xCF", # capital I, dieresis or umlaut mark
87 "iuml" => "\xEF", # small i, dieresis or umlaut mark
88 "Ntilde" => "\xD1", # capital N, tilde
89 "ntilde" => "\xF1", # small n, tilde
90 "Oacute" => "\xD3", # capital O, acute accent
91 "oacute" => "\xF3", # small o, acute accent
92 "Ocirc" => "\xD4", # capital O, circumflex accent
93 "ocirc" => "\xF4", # small o, circumflex accent
94 "Ograve" => "\xD2", # capital O, grave accent
95 "ograve" => "\xF2", # small o, grave accent
96 "Oslash" => "\xD8", # capital O, slash
97 "oslash" => "\xF8", # small o, slash
98 "Otilde" => "\xD5", # capital O, tilde
99 "otilde" => "\xF5", # small o, tilde
100 "Ouml" => "\xD6", # capital O, dieresis or umlaut mark
101 "ouml" => "\xF6", # small o, dieresis or umlaut mark
102 "szlig" => "\xDF", # small sharp s, German (sz ligature)
103 "THORN" => "\xDE", # capital THORN, Icelandic
104 "thorn" => "\xFE", # small thorn, Icelandic
105 "Uacute" => "\xDA", # capital U, acute accent
106 "uacute" => "\xFA", # small u, acute accent
107 "Ucirc" => "\xDB", # capital U, circumflex accent
108 "ucirc" => "\xFB", # small u, circumflex accent
109 "Ugrave" => "\xD9", # capital U, grave accent
110 "ugrave" => "\xF9", # small u, grave accent
111 "Uuml" => "\xDC", # capital U, dieresis or umlaut mark
112 "uuml" => "\xFC", # small u, dieresis or umlaut mark
113 "Yacute" => "\xDD", # capital Y, acute accent
114 "yacute" => "\xFD", # small y, acute accent
115 "yuml" => "\xFF", # small y, dieresis or umlaut mark
117 "lchevron" => "\xAB", # left chevron (double less than)
118 "rchevron" => "\xBB", # right chevron (double greater than)
122 ############################################################################
124 ############################################################################
126 # Initialize the object. Must be sure to call our parent initializer.
130 $$self{alt
} = 0 unless defined $$self{alt
};
131 $$self{indent
} = 4 unless defined $$self{indent
};
132 $$self{loose
} = 0 unless defined $$self{loose
};
133 $$self{sentence
} = 0 unless defined $$self{sentence
};
134 $$self{width
} = 76 unless defined $$self{width
};
136 $$self{INDENTS
} = []; # Stack of indentations.
137 $$self{MARGIN
} = $$self{indent
}; # Current left margin in spaces.
139 return $self->SUPER::initialize
;
143 ############################################################################
145 ############################################################################
147 # Called for each command paragraph. Gets the command, the associated
148 # paragraph, the line number, and a Pod::Paragraph object. Just dispatches
149 # the command to a method named the same as the command. =cut is handled
150 # internally by Pod::Parser.
154 return if $command eq 'pod';
155 return if ($$self{EXCLUDE
} && $command ne 'end');
156 if (defined $$self{ITEM
}) {
159 $self->output($_) if($command eq 'back');
161 $command = 'cmd_' . $command;
162 return $self->$command (@_);
165 # Called for a verbatim paragraph. Gets the paragraph, the line number, and
166 # a Pod::Paragraph object. Just output it verbatim, but with tabs converted
170 return if $$self{EXCLUDE
};
171 $self->item if defined $$self{ITEM
};
174 s/^(\s*\S+)/(' ' x $$self{MARGIN}) . $1/gme;
175 return $self->output($_);
178 # Called for a regular text block. Gets the paragraph, the line number, and
179 # a Pod::Paragraph object. Perform interpolation and output the results.
182 return if $$self{EXCLUDE
};
183 if($$self{VERBATIM
}) {
184 $self->output($_[0]);
190 # Perform a little magic to collapse multiple L<> references. This is
191 # here mostly for backwards-compatibility. We'll just rewrite the whole
192 # thing into actual text at this part, bypassing the whole internal
193 # sequence parsing thing.
196 L
< # A link of the form L</something>.
199 [:\w
]+ # The item has to be a simple word...
200 (\
(\
))?
# ...or simple function.
204 ,?\s
+(and\s
+)?
# Allow lots of them, conjuncted.
217 my @items = split /(?:,?\s+(?:and\s+)?)/;
220 for ($i = 0; $i < @items; $i++) {
221 $string .= $items[$i];
222 $string .= ", " if @items > 2 && $i != $#items;
223 $string .= " and " if ($i == $#items - 1);
225 $string .= " entries elsewhere in this document";
229 # Now actually interpolate and output the paragraph.
230 $_ = $self->interpolate ($_, $line);
232 if (defined $$self{ITEM
}) {
233 $self->item ($_ . "\n");
235 $self->output ($self->reformat ($_ . "\n"));
239 # Called for an interior sequence. Gets the command, argument, and a
240 # Pod::InteriorSequence object and is expected to return the resulting text.
241 # Calls code, bold, italic, file, and link to handle those types of
242 # sequences, and handles S<>, E<>, X<>, and Z<> directly.
243 sub interior_sequence
{
247 return '' if ($command eq 'X' || $command eq 'Z');
249 # Expand escapes into the actual character now, carping if invalid.
250 if ($command eq 'E') {
251 return $ESCAPES{$_} if defined $ESCAPES{$_};
252 carp
"Unknown escape: E<$_>";
256 # For all the other sequences, empty content produces no output.
259 # For S<>, compress all internal whitespace and then map spaces to \01.
260 # When we output the text, we'll map this back.
261 if ($command eq 'S') {
267 # Anything else needs to get dispatched to another method.
268 if ($command eq 'B') { return $self->seq_b ($_) }
269 elsif ($command eq 'C') { return $self->seq_c ($_) }
270 elsif ($command eq 'F') { return $self->seq_f ($_) }
271 elsif ($command eq 'I') { return $self->seq_i ($_) }
272 elsif ($command eq 'L') { return $self->seq_l ($_) }
273 else { carp
"Unknown sequence $command<$_>" }
276 # Called for each paragraph that's actually part of the POD. We take
277 # advantage of this opportunity to untabify the input.
278 sub preprocess_paragraph
{
281 1 while s/^(.*?)(\t+)/$1 . ' ' x (length ($2) * 8 - length ($1) % 8)/me;
286 ############################################################################
288 ############################################################################
290 # All command paragraphs take the paragraph and the line number.
292 # First level heading.
297 $_ = $self->interpolate ($_, shift);
299 $self->output ("\n==== $_ ====\n\n");
301 $_ .= "\n" if $$self{loose
};
302 $self->output ($_ . "\n");
306 # Second level heading.
311 $_ = $self->interpolate ($_, shift);
313 $self->output ("\n== $_ ==\n\n");
315 $_ .= "\n" if $$self{loose
};
316 $self->output (' ' x
($$self{indent
} / 2) . $_ . "\n");
320 # third level heading - not strictly perlpodspec compliant
325 $_ = $self->interpolate ($_, shift);
327 $self->output ("\n= $_ =\n");
329 $_ .= "\n" if $$self{loose
};
330 $self->output (' ' x
($$self{indent
}) . $_ . "\n");
334 # fourth level heading - not strictly perlpodspec compliant
# Install cmd_head4 as a second name for cmd_head3 by assigning the code
# reference to the glob, so both names run the very same subroutine.
336 *cmd_head4
= \
&cmd_head3
;
342 unless (/^[-+]?\d+\s+$/) { $_ = $$self{indent
} }
343 push (@
{ $$self{INDENTS
} }, $$self{MARGIN
});
344 $$self{MARGIN
} += ($_ + 0);
350 $$self{MARGIN
} = pop @
{ $$self{INDENTS
} };
351 unless (defined $$self{MARGIN
}) {
352 carp
'Unmatched =back';
353 $$self{MARGIN
} = $$self{indent
};
357 # An individual list item.
360 if (defined $$self{ITEM
}) { $self->item }
363 $$self{ITEM
} = $self->interpolate ($_);
366 # Begin a block for a particular translator. Setting VERBATIM triggers
367 # special handling in textblock().
371 my ($kind) = /^(\S+)/ or return;
372 if ($kind eq 'text') {
373 $$self{VERBATIM
} = 1;
379 # End a block for a particular translator. We assume that all =begin/=end
380 # pairs are properly closed.
384 $$self{VERBATIM
} = 0;
387 # One paragraph for a particular translator. Ignore it unless it's intended
388 # for text, in which case we treat it as a verbatim text block.
393 return unless s/^text\b[ \t]*\r?\n?//;
394 $self->verbatim ($_, $line);
398 ############################################################################
400 ############################################################################
402 # The simple formatting ones. These are here mostly so that subclasses can
403 # override them and do more complicated things.
# Format bold text.  Receives the object and the text; when the object's alt
# option is true the text is wrapped in ``...'', otherwise it is returned
# unchanged.
sub seq_b {
    my ($self, $text) = @_;
    if ($self->{alt}) {
        return "``${text}''";
    }
    return $text;
}
# Format code text.  Receives the object and the text; the text is always
# quoted, with ``...'' when the object's alt option is true and `...'
# otherwise.
sub seq_c {
    my ($self, $text) = @_;
    my ($open, $close) = $self->{alt} ? ('``', "''") : ('`', "'");
    return $open . $text . $close;
}
# Format a file name.  Receives the object and the text; when the object's
# alt option is true the text is wrapped in double quotes, otherwise it is
# returned unchanged.
sub seq_f {
    my ($self, $text) = @_;
    return $text unless $self->{alt};
    return '"' . $text . '"';
}
# Format italic text.  Receives the object and the text and returns the text
# surrounded by asterisks, regardless of any option.
sub seq_i {
    my (undef, $text) = @_;
    return "*${text}*";
}
409 # The complicated one. Handle links. Since this is plain text, we can't
410 # actually make any real links, so this is all to figure out what text we
416 # Smash whitespace in case we were split across multiple lines.
419 # If we were given any explicit text, just output it.
420 if (/^([^|]+)\|/) { return $1 }
422 # Okay, leading and trailing whitespace isn't important; get rid of it.
426 # Default to using the whole content of the link entry as a section
427 # name. Note that L<manpage/> forces a manpage interpretation, as does
428 # something looking like L<manpage(section)>. The latter is an
429 # enhancement over the original Pod::Text.
430 my ($manpage, $section) = ('', $_);
431 if (/^(?:https?|ftp|news):/) {
434 } elsif (/^"\s*(.*?)\s*"$/) {
435 $section = '"' . $1 . '"';
436 } elsif (m/^[-:.\w]+(?:\(\S+\))?$/) {
437 ($manpage, $section) = ($_, '');
439 ($manpage, $section) = split (/\s*\/\s
*/, $_, 2);
443 # Now build the actual output text.
444 if (!length $section) {
445 $text = "the $manpage manpage" if length $manpage;
446 } elsif ($section =~ /^[:\w]+(?:\(\))?/) {
447 $text .= 'the ' . $section . ' entry';
448 $text .= (length $manpage) ?
" in the $manpage manpage"
449 : ' elsewhere in this document';
451 $section =~ s/^\"\s*//;
452 $section =~ s/\s*\"$//;
453 $text .= 'the section on "' . $section . '"';
454 $text .= " in the $manpage manpage" if length $manpage;
460 ############################################################################
462 ############################################################################
464 # This method is called whenever an =item command is complete (in other
465 # words, we've seen its associated paragraph or know for certain that it
466 # doesn't have one). It gets the paragraph associated with the item as an
467 # argument. If that argument is empty, just output the item tag; if it
468 # contains a newline, output the item tag followed by the newline.
469 # Otherwise, see if there's enough room for us to output the item tag in the
470 # margin of the text or if we have to put it on a separate line.
474 my $tag = $$self{ITEM
};
475 unless (defined $tag) {
476 carp
'item called without tag';
480 my $indent = $$self{INDENTS
}[-1];
481 unless (defined $indent) { $indent = $$self{indent
} }
482 my $space = ' ' x
$indent;
483 $space =~ s/^ /:/ if $$self{alt
};
484 if (!$_ || /^\s+$/ || ($$self{MARGIN
} - $indent < length ($tag) + 1)) {
485 my $margin = $$self{MARGIN
};
486 $$self{MARGIN
} = $indent;
487 my $output = $self->reformat ($tag);
488 $output =~ s/[\r\n]*$/\n/;
489 $self->output ($output);
490 $$self{MARGIN
} = $margin;
491 $self->output ($self->reformat ($_)) if /\S/;
493 $_ = $self->reformat ($_);
494 s/^ /:/ if ($$self{alt
} && $indent > 0);
495 my $tagspace = ' ' x
length $tag;
496 s/^($space)$tagspace/$1$tag/ or carp
'Bizarre space in item';
502 ############################################################################
504 ############################################################################
506 # Wrap a line, indenting by the current left margin. We can't use
507 # Text::Wrap because it plays games with tabs. We can't use formline, even
508 # though we'd really like to, because it screws up non-printing characters.
509 # So we have to do the wrapping ourselves.
514 my $spaces = ' ' x
$$self{MARGIN
};
515 my $width = $$self{width
} - $$self{MARGIN
};
516 while (length > $width) {
517 if (s/^([^\r\n]{0,$width})\s+// || s/^([^\r\n]{$width})//) {
518 $output .= $spaces . $1 . "\n";
523 $output .= $spaces . $_;
524 $output =~ s/\s+$/\n\n/;
528 # Reformat a paragraph of text for the current margin. Takes the text to
529 # reformat and returns the formatted text.
534 # If we're trying to preserve two spaces after sentences, do some
535 # munging to support that. Otherwise, smash all repeated whitespace.
536 if ($$self{sentence
}) {
544 return $self->wrap($_);
547 # Output text to the output device.
# Write TEXT to the handle returned by the object's output_handle method,
# first turning every \01 character (the stand-in that S<> processing puts
# in place of spaces) back into a real space.  Returns whatever print
# returns.
#
# The translation is applied to a private copy of the argument.  Running
# tr/// on $_[1] itself would rewrite the caller's own variable through the
# @_ alias and would die with "Modification of a read-only value attempted"
# when the caller passes a string literal.
sub output {
    my ($self, $text) = @_;
    $text =~ tr/\01/ /;
    return print { $self->output_handle } $text;
}
551 ############################################################################
552 # Backwards compatibility
553 ############################################################################
555 # The old Pod::Text module did everything in a pod2text() function. This
556 # tries to provide the same interface for legacy applications.
560 # This is really ugly; I hate doing option parsing in the middle of a
561 # module. But the old Pod::Text module supported passing flags to its
562 # entry function, so handle -a and -<number>.
563 while ($_[0] =~ /^-/) {
565 if ($flag eq '-a') { push (@args, alt
=> 1) }
566 elsif ($flag =~ /^-(\d+)$/) { push (@args, width
=> $1) }
573 # Now that we know what arguments we're using, create the parser.
574 my $parser = Pod
::PlainText
->new (@args);
576 # If two arguments were given, the second argument is going to be a file
577 # handle. That means we want to call parse_from_filehandle(), which
578 # means we need to turn the first argument into a file handle. Magic
579 # open will handle the <&STDIN case automagically.
585 unless (open ($infh, $_[0])) {
586 croak
("Can't open $_[0] for reading: $!\n");
589 return $parser->parse_from_filehandle (@_);
591 return $parser->parse_from_file (@_);
596 ############################################################################
597 # Module return value and documentation
598 ############################################################################
605 Pod::PlainText - Convert POD data to formatted ASCII text
610 my $parser = Pod::PlainText->new (sentence => 0, width => 78);
612 # Read POD from STDIN and write to STDOUT.
613 $parser->parse_from_filehandle;
615 # Read POD from file.pod and write to file.txt.
616 $parser->parse_from_file ('file.pod', 'file.txt');
620 Pod::PlainText is a module that can convert documentation in the POD format (the
621 preferred language for documenting Perl) into formatted ASCII. It uses no
622 special formatting controls or codes whatsoever, and its output is therefore
623 suitable for nearly any device.
625 As a derived class from Pod::Parser, Pod::PlainText supports the same methods and
626 interfaces. See L<Pod::Parser> for all the details; briefly, one creates a
627 new parser with C<Pod::PlainText-E<gt>new()> and then calls either
628 parse_from_filehandle() or parse_from_file().
630 new() can take options, in the form of key/value pairs, that control the
631 behavior of the parser. The currently recognized options are:
637 If set to a true value, selects an alternate output format that, among other
638 things, uses a different heading style and marks C<=item> entries with a
639 colon in the left margin. Defaults to false.
643 The number of spaces to indent regular text, and the default indentation for
644 C<=over> blocks. Defaults to 4.
648 If set to a true value, a blank line is printed after C<=headN> headings.
649 If set to false (the default), no blank line is printed after C<=headN>.
650 This is the default because it's the expected formatting for manual pages;
651 if you're formatting arbitrary text documents, setting this to true may
652 result in more pleasing output.
656 If set to a true value, Pod::PlainText will assume that each sentence ends in two
657 spaces, and will try to preserve that spacing. If set to false, all
658 consecutive whitespace in non-verbatim paragraphs is compressed into a
659 single space. Defaults to false.
663 The column at which to wrap text on the right-hand side. Defaults to 76.
667 The standard Pod::Parser method parse_from_filehandle() takes up to two
668 arguments, the first being the file handle to read POD from and the second
669 being the file handle to write the formatted output to. The first defaults
670 to STDIN if not given, and the second defaults to STDOUT. The method
671 parse_from_file() is almost identical, except that its two arguments are the
672 input and output disk files instead. See L<Pod::Parser> for the specific
details.
679 =item Bizarre space in item
681 (W) Something has gone wrong in internal C<=item> processing. This message
682 indicates a bug in Pod::PlainText; you should never see it.
684 =item Can't open %s for reading: %s
686 (F) Pod::PlainText was invoked via the compatibility mode pod2text() interface
687 and the input file it was given could not be opened.
689 =item Unknown escape: %s
691 (W) The POD source contained an C<EE<lt>E<gt>> escape that Pod::PlainText didn't
know about.
694 =item Unknown sequence %s
696 (W) The POD source contained a non-standard internal sequence (something of
697 the form C<XE<lt>E<gt>>) that Pod::PlainText didn't know about.
699 =item Unmatched =back
701 (W) Pod::PlainText encountered a C<=back> command that didn't correspond to an
C<=over> command.
708 Embedded Ctrl-As (octal 001) in the input will be mapped to spaces on
709 output, due to an internal implementation detail.
713 This is a replacement for an earlier Pod::Text module written by Tom
714 Christiansen. It has a revamped interface, since it now uses Pod::Parser,
715 but an interface roughly compatible with the old Pod::Text::pod2text()
716 function is still available. Please change to the new calling convention,
719 The original Pod::Text contained code to do formatting via termcap
720 sequences, although it wasn't turned on by default and it was problematic to
721 get it to work at all. This rewrite doesn't even try to do that, but a
722 subclass of it does. Look for L<Pod::Text::Termcap|Pod::Text::Termcap>.
726 B<Pod::PlainText> is part of the L<Pod::Parser> distribution.
728 L<Pod::Parser|Pod::Parser>, L<Pod::Text::Termcap|Pod::Text::Termcap>,
733 Please report bugs using L<http://rt.cpan.org>.
735 Russ Allbery E<lt>rra@stanford.eduE<gt>, based I<very> heavily on the
736 original Pod::Text by Tom Christiansen E<lt>tchrist@mox.perl.comE<gt> and
737 its conversion to Pod::Parser by Brad Appleton
738 E<lt>bradapp@enteract.comE<gt>.