1 #####################################################################
3 # Grutatxt - A text to HTML (and other things) converter
5 # Copyright (C) 2000/2004 Angel Ortega <angel@triptico.com>
7 # This program is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU General Public License
9 # as published by the Free Software Foundation; either version 2
10 # of the License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 # http://www.triptico.com
23 #####################################################################
35 Grutatxt - Text to HTML (and other formats) converter
41 # create a new Grutatxt converter object
42 $grutatxt=new Grutatxt();
44 # process a Grutatxt format string
45 @output=$grutatxt->process($text);
48 @output2=$grutatxt->process_file($file);
52 Grutatxt is a module to process text documents in
53 a special markup format (also called Grutatxt), very
54 similar to plain ASCII text. These documents can be
55 converted to HTML, troff or man page format.
57 The markup is designed to be fairly intuitive and
58 straightforward and can include headings, bold and italic
59 text effects, bulleted, numbered and definition lists, URLs,
60 function and variable names, preformatted text, horizontal
61 separators and tables. Special marks can be inserted in the
62 text and a heading-based structural index can be obtained
65 A comprehensive description of the markup is defined in
66 the README file, included with the Grutatxt package (it is
67 written in Grutatxt format itself, so it can be converted
68 using the I<grutatxt> tool to any of the supported formats).
69 The latest version (and more information) can be retrieved
70 from the Grutatxt home page at:
72 http://www.triptico.com/software/grutatxt.html
74 =head1 FUNCTIONS AND METHODS
78 $grutatxt=new Grutatxt([ "mode" => $mode, ]
79 [ "title" => \$title, ]
80 [ "marks" => \@marks, ]
81 [ "index" => \@index, ]
82 [ "abstract" => \$abstract, ]
83 [ "strip-parens" => $bool, ]
84 [ "strip-dollars" => $bool, ]
85 [ %driver_specific_arguments ] );
87 Creates a new Grutatxt object instance. All parameters are
94 Output format. Can be HTML, troff or man. HTML is used if not specified.
98 If I<title> is specified as a reference to scalar, the first
99 level 1 heading found in the text is stored inside it.
103 Marks in the Grutatxt markup are created by inserting the
104 string <-> alone in a line. If I<marks> is specified as a
105 reference to array, it will be filled with the subscripts
106 (relative to the output array) of the lines where the marks
107 are found in the text.
111 If I<index> is specified as a reference to array, it will
112 be filled with strings in the format
116 This information can be used to build a table of contents
117 of the processed text.
119 =item I<strip-parens>
121 Function names in the Grutatxt markup are strings of
122 alphanumeric characters immediately followed by a pair
123 of open and close parentheses. If this boolean value is
124 set, function names found in the processed text will have
125 their parentheses deleted.
127 =item I<strip-dollars>
129 Variable names in the Grutatxt markup are strings of
130 alphanumeric characters preceded by a dollar sign.
131 If this boolean value is set, variable names found in
132 the processed text will have the dollar sign deleted.
136 The I<abstract> of a Grutatxt document is the fragment of text
137 from the beginning of the document to the end of the first
138 paragraph after the title. If I<abstract> is specified as a
139 reference to scalar, it will contain (after each call to the
140 B<process()> method) the subscript of the element of the output
141 array that marks the end of the abstract.
149 my ($class,%args) = @_;
152 $args{'mode'} ||= 'HTML';
154 $class .= "::" . $args{'mode'};
156 $gh = new
$class(%args);
164 @output=$grutatxt->process($text);
166 Processes a text in Grutatxt format. The result is returned
167 as an array of lines.
173 my ($gh,$content) = @_;
179 # clean title and paragraph numbers
180 $gh->{'-title'} = "";
184 @
{$gh->{'marks'}} = () if ref($gh->{'marks'});
187 @
{$gh->{'index'}} = () if ref($gh->{'index'});
189 # reset abstract line
190 ${$gh->{'abstract'}} = 0 if ref($gh->{'abstract'});
195 $gh->{'-mode'} = undef;
197 foreach my $l (split(/\n/,$content))
199 # inline data (passthrough)
200 if($l =~ /^<<$/ .. $l =~ /^>>$/)
207 if($l =~ /^\s*<\->\s*$/)
209 push(@
{$gh->{'marks'}},scalar(@
{$gh->{'o'}}))
210 if ref($gh->{'marks'});
215 # escape possibly dangerous characters
216 $l = $gh->_escape($l);
220 if($l =~ s/^$/$gh->_empty_line()/ge)
222 # mark the abstract end
227 # mark abstract if it's the
228 # second paragraph from the title
229 ${$gh->{'abstract'}} = scalar(@
{$gh->{'o'}})-1
234 if($gh->{'-process-urls'})
236 # URLs followed by a parenthesized phrase
237 $l =~ s/(https?:\/\/\S
+)\s
+\
(([^\
)]+)\
)/$gh->_url($1,$2)/ge
;
238 $l =~ s/(ftps?:\/\/\S
+)\s
+\
(([^\
)]+)\
)/$gh->_url($1,$2)/ge
;
239 $l =~ s/(file:\/?\S+)\s+\(([^\)]+)\)/$gh->_url($1,$2)/ge;
240 $l =~ s
|(\s
+)\
./(\S
+)\s
+\
(([^\
)]+)\
)|$1.$gh->_url($2,$3)|ge;
241 $l =~ s
|^\
./(\S
+)\s
+\
(([^\
)]+)\
)|$gh->_url($1,$2)|ge;
243 # URLs without phrase
244 $l =~ s/([^=][^\"])(https?:\/\/\S
+)/$1.$gh->_url($2)/ge
;
245 $l =~ s/([^=][^\"])(ftps?:\/\/\S
+)/$1.$gh->_url($2)/ge
;
246 $l =~ s/([^=][^\"])(file:\/?\S+)/$1.$gh->_url($2)/ge;
247 $l =~ s
|(\s
+)\
./(\S
+)|$1.$gh->_url($2)|ge;
249 $l =~ s/^(https?:\/\/\S
+)/$gh->_url($1)/ge
;
250 $l =~ s/^(ftps?:\/\/\S
+)/$gh->_url($1)/ge
;
251 $l =~ s/^(file:\/?\S+)/$gh->_url($1)/ge;
252 $l =~ s
|^\
./(\S
+)|$gh->_url($1)|ge;
255 # change '''text''' and *text* into strong emphasis
256 $l =~ s/\'\'\'([^\'][^\'][^\']*)\'\'\'/$gh->_strong($1)/ge;
257 $l =~ s/\*(\S[^\*]+\S)\*/$gh->_strong($1)/ge;
258 $l =~ s/\*(\S+)\*/$gh->_strong($1)/ge;
260 # change ''text'' and _text_ into emphasis
261 $l =~ s/\'\'([^\'][^\']*)\'\'/$gh->_em($1)/ge;
262 $l =~ s/\b_(\S[^_]*\S)_\b/$gh->_em($1)/ge;
263 $l =~ s/\b_(\S+)_\b/$gh->_em($1)/ge;
265 # enclose function names
266 if($gh->{'strip-parens'})
268 $l =~ s/(\w+)\(\)/$gh->_funcname($1)/ge;
272 $l =~ s/(\w+)\(\)/$gh->_funcname($1."()")/ge;
275 # enclose variable names
276 if($gh->{'strip-dollars'})
278 $l =~ s/\$([\w_\.]+)/$gh->_varname($1)/ge;
282 $l =~ s/(\$[\w_\.]+)/$gh->_varname($1)/ge;
290 if($l =~ s/^\s\*\s+([\w\s\-\(\)]+)\:\s+/$gh->_dl($1)/e)
295 elsif($gh->{'-mode'} ne "pre" and
296 ($l =~ s/^(\s+)\*\s+/$gh->_unsorted_list($1)/e or
297 $l =~ s/^(\s+)\-\s+/$gh->_unsorted_list($1)/e))
302 elsif($gh->{'-mode'} ne "pre" and
303 ($l =~ s/^(\s+)\#\s+/$gh->_ordered_list($1)/e or
304 $l =~ s/^(\s+)1\s+/$gh->_ordered_list($1)/e))
309 elsif($l =~ s/^\s\"/$gh->_blockquote()/e)
314 elsif($l =~ s/^\s*\|(.*)\|\s*$/$gh->_table_row($1)/e)
318 # table heading / end of row
319 elsif($l =~ s/^\s*(\+[-\+\|]+\+)\s*$/$gh->_table($1)/e)
324 elsif($l =~ s/^(\s.*)$/$gh->_pre($1)/e)
331 # back to normal mode
332 $gh->_new_mode(undef);
336 $l =~ s/^(=+)\s*$/$gh->_process_heading(1,$1)/e;
339 $l =~ s/^(-+)\s*$/$gh->_process_heading(2,$1)/e;
342 $l =~ s/^(~+)\s*$/$gh->_process_heading(3,$1)/e;
344 # change ------ into hr
345 $l =~ s/^----*$/$gh->_hr()/e;
348 $gh->_push($l) if $l;
352 $gh->_new_mode(undef);
358 ${$gh->{'title'}} = $gh->{'-title'} if ref($gh->{'title'});
360 # set abstract, if not set
361 ${$gh->{'abstract'}} = scalar(@
{$gh->{'o'}})
362 if ref($gh->{'abstract'}) and not ${$gh->{'abstract'}};
364 return(@
{$gh->{'o'}});
368 =head2 B<process_file>
370 @output=$grutatxt->process_file($filename);
372 Processes a file in Grutatxt format.
380 open F
, $file or return(undef);
382 my ($content) = join('',<F
>);
385 return($gh->process($content));
393 push(@
{$gh->{'o'}},$l);
399 my ($gh,$level,$hd) = @_;
402 $l = pop(@
{$gh->{'o'}});
404 if($l eq $gh->_empty_line())
411 $gh->{'-title'} = $l if $level == 1 and not $gh->{'-title'};
414 if(ref($gh->{'index'}))
416 push(@
{$gh->{'index'}},"$level,$l");
419 return($gh->_heading($level,$l));
428 # strip first + and all -
432 my ($t) = 1; @spans = ();
433 for(my $n = 0;$n < length($l);$n++)
435 if(substr($l,$n,1) eq '+')
442 # it's a colspan mark:
456 my @s = split(/\|/,$str);
458 for(my $n = 0;$n < scalar(@s);$n++)
460 ${$gh->{'-table'}}[$n] .= ' ' . $s[$n];
463 push(@
{$gh->{'-table-raw'}}, $str);
473 # if any other mode is active, add to it
474 if($gh->{'-mode'} and $gh->{'-mode'} ne "pre")
478 my ($a) = pop(@
{$gh->{'o'}})." ".$l;
484 $gh->_new_mode("pre");
493 my ($gh, $str, $ind) = @_;
502 # if last level is less indented, increase
509 # if last level is more indented, decrease
510 # levels until the same is found (or back to
511 # the beginning if not)
515 last if $l[-1] == $ind;
529 return($gh->_ul($gh->_multilevel_list('-ul-levels', $ind)));
537 return($gh->_ol($gh->_multilevel_list('-ol-levels', $ind)));
# Empty stubs: fallback versions of the driver hooks, reached when a
# call falls through to this (super)class.

# Hooks that hand their text argument straight back.

sub _inline
{
    my ($self, $text) = @_;
    return $text;
}

sub _escape
{
    my ($self, $text) = @_;
    return $text;
}

sub _strong
{
    my ($self, $text) = @_;
    return $text;
}

sub _em
{
    my ($self, $text) = @_;
    return $text;
}

sub _funcname
{
    my ($self, $text) = @_;
    return $text;
}

sub _varname
{
    my ($self, $text) = @_;
    return $text;
}

sub _dl
{
    my ($self, $text) = @_;
    return $text;
}

sub _blockquote
{
    my ($self, $text) = @_;
    return $text;
}

sub _table
{
    my ($self, $text) = @_;
    return $text;
}

# The heading stub ignores the level and returns the heading text.
sub _heading
{
    my ($self, $depth, $text) = @_;
    return $text;
}

# Hooks that produce no output at all (empty string).

sub _empty_line
{
    my ($self) = @_;
    return "";
}

sub _url
{
    my ($self, $target, $caption) = @_;
    return "";
}

sub _ul
{
    my ($self, $depth) = @_;
    return "";
}

sub _ol
{
    my ($self, $depth) = @_;
    return "";
}

sub _hr
{
    my ($self) = @_;
    return "";
}

# Hooks with no explicit result: the argument unpacking is deliberately
# left as the last statement, so the sub's value is whatever that
# assignment yields.

sub _new_mode
{
    my ($self, $new_mode) = @_;
}

sub _prefix
{
    my ($self) = @_;
}

sub _postfix
{
    my ($self) = @_;
}
562 ###########################################################
564 =head1 DRIVER SPECIFIC INFORMATION
568 ###########################################################
571 package Grutatxt
::HTML
;
577 The additional parameters for a new Grutatxt object are:
581 =item I<table-headers>
583 If this boolean value is set, the first row in tables
584 is assumed to be the heading and rendered using <th>
585 instead of <td> tags.
587 =item I<center-tables>
589 If this boolean value is set, tables are centered.
591 =item I<expand-tables>
593 If this boolean value is set, tables are expanded (width 100%).
597 If this boolean value is set, definition lists will be
598 rendered using <dl>, <dt> and <dd> instead of tables.
600 =item I<header-offset>
602 Offset to be added to the heading level when rendering
603 <h?> tags (default is 0).
605 =item I<class-oddeven>
607 If this boolean value is set, tables will be rendered
608 with an "oddeven" CSS class, and rows alternately classed
609 as "even" or "odd". If it's not set, no CSS class info
618 my ($class,%args) = @_;
621 bless(\
%args,$class);
624 $gh->{'-process-urls'} = 1;
634 # accept unnamed and HTML inlines
635 if($l =~ /^<<$/ or $l =~ /^<<\s*html$/i)
637 $gh->{'-inline'} = "HTML";
643 delete $gh->{'-inline'};
647 if($gh->{'-inline'} eq "HTML")
676 my ($gh,$url,$label) = @_;
678 $label = $url unless $label;
680 return("<a href=\"$url\">$label</a>");
687 return("<strong class=strong>$str</strong>");
694 return("<em class=em>$str</em>");
701 return("<code class=funcname>$str</code>");
708 return("<code class=var>$str</code>");
714 my ($gh,$mode,$params) = @_;
716 if($mode ne $gh->{'-mode'})
720 # flush previous mode
723 if($gh->{'-mode'} eq "ul")
725 $gh->_push("</ul>" x
scalar(@
{$gh->{'-ul-levels'}}));
727 elsif($gh->{'-mode'} eq "ol")
729 $gh->_push("</ol>" x
scalar(@
{$gh->{'-ol-levels'}}));
731 elsif($gh->{'-mode'})
733 $gh->_push("</$gh->{'-mode'}>");
737 $tag = $params ?
"<$mode $params>" : "<$mode>";
738 $gh->_push($tag) if $mode;
740 $gh->{'-mode'} = $mode;
742 # clean previous lists
743 $gh->{'-ul-levels'} = undef;
744 $gh->{'-ol-levels'} = undef;
753 if($gh->{'dl-as-dl'})
755 $gh->_new_mode("dl");
756 return("<dt><strong class=term>$str</strong><dd>");
760 $gh->_new_mode("table");
761 return("<tr><td valign=top><strong class=term>$1</strong> </td><td valign=top>");
768 my ($gh, $levels) = @_;
779 $ret = "</ul>" x
abs($levels);
782 $gh->{'-mode'} = "ul";
792 my ($gh, $levels) = @_;
803 $ret = "</ol>" x
abs($levels);
806 $gh->{'-mode'} = "ol";
818 $gh->_new_mode("blockquote");
827 return("<hr size=1 noshade>");
833 my ($gh,$level,$l) = @_;
835 # creates a valid anchor
842 $l = sprintf("<a name=\"%s\"></a>\n<h%d class=level$level>%s</h%d>",
843 $a, $level+$gh->{'header-offset'},
844 $l, $level+$gh->{'header-offset'});
854 if($gh->{'-mode'} eq "table")
857 my (@spans) = $gh->_calc_col_span($str);
859 # calculate CSS class, if any
860 if($gh->{'class-oddeven'})
862 $class = ($gh->{'-tbl-row'} & 1) ?
"odd" : "even";
865 $str = "<tr $class>";
868 for(my $n = 0;$n < scalar(@
{$gh->{'-table'}});$n++)
872 $i = ${$gh->{'-table'}}[$n];
873 $i = " " if $i =~ /^\s*$/;
875 $s = " colspan=$spans[$n]" if $spans[$n] > 1;
877 if($gh->{'table-headers'} and $gh->{'-tbl-row'} == 1)
879 $str .= "<th $class $s>$i</th>";
883 $str .= "<td $class $s>$i</td>";
887 @
{$gh->{'-table'}} = ();
895 $params = "border=1";
896 $params .= " width='100\%'" if $gh->{'expand-tables'};
897 $params .= " align=center" if $gh->{'center-tables'};
898 $params .= " class=oddeven" if $gh->{'class-oddeven'};
900 $gh->_new_mode("table", $params);
902 @
{$gh->{'-table'}} = ();
903 $gh->{'-tbl-row'} = 1;
911 ###########################################################
914 package Grutatxt
::troff
;
920 The troff driver uses the B<-me> macros and B<tbl>. A
921 good way to post-process this output (to PostScript in
922 the example) could be by using
926 The additional parameters for a new Grutatxt object are:
932 The point size of normal text. The default is 10.
934 =item I<heading-sizes>
936 This argument must be a reference to an array containing
937 the size in points of the 3 different heading levels. By
938 default, level sizes are [ 20, 18, 15 ].
942 The type of table to be rendered by B<tbl>. Can be
943 I<allbox> (all lines rendered; this is the default value),
944 I<box> (only outlined) or I<doublebox> (only outlined by
953 my ($class,%args) = @_;
956 bless(\
%args,$class);
959 $gh->{'-process-urls'} = 0;
961 $gh->{'heading-sizes'} ||= [ 20, 18, 15 ];
962 $gh->{'normal-size'} ||= 10;
963 $gh->{'table-type'} ||= "allbox"; # box, allbox, doublebox
973 $gh->_push(".nr pp $gh->{'normal-size'}");
982 # accept only troff inlines
983 if($l =~ /^<<\s*troff$/i)
985 $gh->{'-inline'} = "troff";
991 delete $gh->{'-inline'};
995 if($gh->{'-inline'} eq "troff")
1024 return("\\fB$str\\fP");
1031 return("\\fI$str\\fP");
1038 return("\\fB$str\\fP");
1045 return("\\fI$str\\fP");
1051 my ($gh,$mode,$params) = @_;
1053 if($mode ne $gh->{'-mode'})
1057 # flush previous list
1058 if($gh->{'-mode'} eq "pre")
1062 elsif($gh->{'-mode'} eq "table")
1064 chomp($gh->{'-table-head'});
1065 $gh->{'-table-head'} =~ s/\s+$//;
1066 $gh->_push($gh->{'-table-head'} . ".");
1067 $gh->_push($gh->{'-table-body'} . ".TE\n.sp 0.6");
1069 elsif($gh->{'-mode'} eq "blockquote")
1077 $gh->_push(".(l L");
1079 elsif($mode eq "blockquote")
1084 $gh->{'-mode'} = $mode;
1093 $gh->_new_mode("dl");
1094 return(".ip \"$str\"\n");
1102 $gh->_new_mode("ul");
1111 $gh->_new_mode("ol");
1120 $gh->_new_mode("blockquote");
1135 my ($gh,$level,$l) = @_;
1137 $l = ".sz " . ${$gh->{'heading-sizes'}}[$level - 1] . "\n$l\n.sp 0.6";
1147 if($gh->{'-mode'} eq "table")
1150 my (@spans) = $gh->_calc_col_span($str);
1155 for(my $n = 0;$n < scalar(@
{$gh->{'-table'}});$n++)
1159 if($gh->{'table-headers'} and $gh->{'-tbl-row'} == 1)
1169 $h .= "s " x
($spans[$n] - 1) if $spans[$n] > 1;
1173 $i = ${$gh->{'-table'}}[$n];
1181 $b .= "\n_" if $gh->{'table-headers'} and
1182 $gh->{'-tbl-row'} == 1 and
1183 $gh->{'table-type'} ne "allbox";
1185 $gh->{'-table-head'} .= "$h\n";
1186 $gh->{'-table-body'} .= "$b\n";
1188 @
{$gh->{'-table'}} = ();
1189 $gh->{'-tbl-row'}++;
1194 $gh->_new_mode("table");
1196 @
{$gh->{'-table'}} = ();
1197 $gh->{'-tbl-row'} = 1;
1199 $gh->{'-table-head'} = ".TS\n$gh->{'table-type'} tab (#);\n";
1200 $gh->{'-table-body'} = "";
1212 # add to top headings and footers
1213 unshift(@
{$gh->{'o'}},".ef '\%' ''");
1214 unshift(@
{$gh->{'o'}},".of '' '\%'");
1215 unshift(@
{$gh->{'o'}},".eh '$gh->{'-title'}' ''");
1216 unshift(@
{$gh->{'o'}},".oh '' '$gh->{'-title'}'");
1220 ###########################################################
1223 package Grutatxt
::man
;
1225 @ISA = ("Grutatxt::troff", "Grutatxt");
1229 The man driver is used to generate Unix-like man pages. Note that
1230 all headings have the same level with this output driver.
1232 The additional parameters for a new Grutatxt object are:
1238 The man page section (see man documentation). The default is 1.
1242 The name of the page. This is usually the name of the program
1243 or function the man page is documenting and will be shown in the
1244 page header. The default is the empty string.
1252 my ($class,%args) = @_;
1255 bless(\
%args,$class);
1258 $gh->{'-process-urls'} = 0;
1260 $gh->{'section'} ||= 1;
1261 $gh->{'page-name'} ||= "";
1271 $gh->_push(".TH \"$gh->{'page-name'}\" \"$gh->{'section'}\" \"" . localtime() . "\"");
1279 # accept only man markup inlines
1280 if($l =~ /^<<\s*man$/i)
1282 $gh->{'-inline'} = "man";
1288 delete $gh->{'-inline'};
1292 if($gh->{'-inline'} eq "man")
1309 my ($gh,$mode,$params) = @_;
1311 if($mode ne $gh->{'-mode'})
1315 # flush previous list
1316 if($gh->{'-mode'} eq "pre" or
1317 $gh->{'-mode'} eq "table")
1322 if($gh->{'-mode'} eq "blockquote")
1327 if($gh->{'-mode'} eq "ul")
1329 $gh->_push(".RE\n" x
scalar(@
{$gh->{'-ul-levels'}}));
1332 if($gh->{'-mode'} eq "ol")
1334 $gh->_push(".RE\n" x
scalar(@
{$gh->{'-ol-levels'}}));
1338 if($mode eq "pre" or
1344 if($mode eq "blockquote")
1346 $gh->_push(".RS 4");
1349 $gh->{'-mode'} = $mode;
1358 $gh->_new_mode("dl");
1359 return(".TP\n.B \"$str\"\n");
1365 my ($gh, $levels) = @_;
1374 $ret = ".RE\n" x
abs($levels);
1377 $gh->_new_mode("ul");
1378 return($ret . ".TP 4\n\\(bu\n");
1384 my ($gh, $levels) = @_;
1385 my $l = @
{$gh->{'-ol-levels'}};
1388 $gh->{'-ol-level'} += $levels;
1394 $l[$gh->{'-ol-level'}] = 1;
1398 $ret = ".RE\n" x
abs($levels);
1401 $gh->_new_mode("ol");
1402 $ret .= ".TP 4\n" . $l[$gh->{'-ol-level'}]++ . ".\n";
1418 my ($gh,$level,$l) = @_;
1420 # all headers are the same depth in man pages
1421 return(".SH \"" . uc($l) . "\"");
1429 if($gh->{'-mode'} eq "table")
1431 foreach my $r (@
{$gh->{'-table-raw'}})
1438 $gh->_new_mode("table");
1441 @
{$gh->{'-table'}} = ();
1442 @
{$gh->{'-table-raw'}} = ();
1457 Angel Ortega angel@triptico.com