1 #####################################################################
3 # Grutatxt - A text to HTML (and other things) converter
5 # Copyright (C) 2000/2003 Angel Ortega <angel@triptico.com>
7 # This program is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU General Public License
9 # as published by the Free Software Foundation; either version 2
10 # of the License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 # http://www.triptico.com
23 #####################################################################
35 Grutatxt - Text to HTML (and other formats) converter
41 # create a new Grutatxt converter object
42 $grutatxt=new Grutatxt();
44 # process a Grutatxt format string
45 @output=$grutatxt->process($text);
48 @output2=$grutatxt->process_file($file);
52 Grutatxt is a module to process text documents in
53 a special markup format (also called Grutatxt), very
54 similar to plain ASCII text. These documents can be
55 converted to HTML or troff.
57 The markup is designed to be fairly intuitive and
58 straightforward and can include headings, bold and italic
59 text effects, bulleted, numbered and definition lists, URLs,
60 function and variable names, preformatted text, horizontal
61 separators and tables. Special marks can be inserted in the
62 text and a heading-based structural index can be obtained
65 A comprehensive description of the markup is defined in
66 the README file, included with the Grutatxt package (it is
67 written in Grutatxt format itself, so it can be converted
68 using the I<grutatxt> tool to any of the supported formats).
69 The latest version (and more information) can be retrieved
70 from the Grutatxt home page at:
72 http://www.triptico.com/software/grutatxt.html
74 =head1 FUNCTIONS AND METHODS
78 $grutatxt=new Grutatxt([ "mode" => $mode, ]
79 [ "title" => \$title, ]
80 [ "marks" => \@marks, ]
81 [ "index" => \@index, ]
82 [ "abstract" => \$abstract, ]
83 [ "strip-parens" => $bool, ]
84 [ "strip-dollars" => $bool, ]
85 [ %driver_specific_arguments ] );
87 Creates a new Grutatxt object instance. All parameters are
94 Output format. Can be HTML or troff. HTML is used if not specified.
If I<title> is specified as a reference to a scalar, the first
level 1 heading found in the text is stored in it.
Marks in the Grutatxt markup are created by inserting the
string <-> on a line by itself. If I<marks> is specified as a
reference to an array, it will be filled with the subscripts
(relative to the output array) of the lines where the marks
are found in the text.
111 If I<index> is specified as a reference to array, it will
112 be filled with strings in the format
116 This information can be used to build a table of contents
117 of the processed text.
119 =item I<strip-parens>
121 Function names in the Grutatxt markup are strings of
122 alphanumeric characters immediately followed by a pair
123 of open and close parentheses. If this boolean value is
124 set, function names found in the processed text will have
125 their parentheses deleted.
127 =item I<strip-dollars>
129 Variable names in the Grutatxt markup are strings of
130 alphanumeric characters preceded by a dollar sign.
131 If this boolean value is set, variable names found in
132 the processed text will have the dollar sign deleted.
The I<abstract> of a Grutatxt document is the fragment of text
from the beginning of the document to the end of the first
paragraph after the title. If I<abstract> is specified as a
reference to a scalar, it will contain (after each call to the
B<process()> method) the subscript of the element of the output
array that marks the end of the abstract.
# Constructor dispatch: picks the output driver from the "mode"
# argument and delegates object creation to that driver's own new().
# All arguments are passed through to the driver untouched.
my ($class,%args) = @_;

# fall back to the HTML driver when no output mode was requested
$args{'mode'} ||= 'HTML';

# the driver package is this class name plus the mode
# (for example Grutatxt::HTML when called on Grutatxt)
$class .= "::" . $args{'mode'};

# direct method call; the indirect-object form "new $class(...)" is
# parsed heuristically and is disabled altogether under "use v5.36"
$gh = $class->new(%args);
164 @output=$grutatxt->process($text);
166 Processes a text in Grutatxt format. The result is returned
167 as an array of lines.
173 my ($gh,$content) = @_;
179 # clean title and paragraph numbers
180 $gh->{'-title'} = "";
184 @
{$gh->{'marks'}} = () if ref($gh->{'marks'});
187 @
{$gh->{'index'}} = () if ref($gh->{'index'});
189 # reset abstract line
190 ${$gh->{'abstract'}} = 0 if ref($gh->{'abstract'});
195 $gh->{'-mode'} = undef;
197 foreach my $l (split(/\n/,$content))
199 # inline data (passthrough)
200 if($l =~ /^<<$/ .. $l =~ /^>>$/)
207 if($l =~ /^\s*<\->\s*$/)
209 push(@
{$gh->{'marks'}},scalar(@
{$gh->{'o'}}))
210 if ref($gh->{'marks'});
215 # escape possibly dangerous characters
216 $l = $gh->_escape($l);
220 if($l =~ s/^$/$gh->_empty_line()/ge)
222 # mark the abstract end
227 # mark abstract if it's the
228 # second paragraph from the title
229 ${$gh->{'abstract'}} = scalar(@
{$gh->{'o'}})-1
234 if($gh->{'-process-urls'})
236 # URLs followed by a parenthesized phrase
237 $l =~ s/(https?:\/\/\S
+)\s
+\
(([^\
)]+)\
)/$gh->_url($1,$2)/ge
;
238 $l =~ s/(ftps?:\/\/\S
+)\s
+\
(([^\
)]+)\
)/$gh->_url($1,$2)/ge
;
239 $l =~ s/(file:\/?\S+)\s+\(([^\)]+)\)/$gh->_url($1,$2)/ge;
241 # URLs without phrase
242 $l =~ s/([^=][^\"])(https?:\/\/\S
+)/$1.$gh->_url($2)/ge
;
243 $l =~ s/([^=][^\"])(ftps?:\/\/\S
+)/$1.$gh->_url($2)/ge
;
244 $l =~ s/([^=][^\"])(file:\/?\S+)/$1.$gh->_url($2)/ge;
245 $l =~ s/^(https?:\/\/\S
+)/$gh->_url($1)/ge
;
246 $l =~ s/^(ftps?:\/\/\S
+)/$gh->_url($1)/ge
;
247 $l =~ s/^(file:\/?\S+)/$gh->_url($1)/ge;
250 # change '''text''' and *text* into strong emphasis
251 $l =~ s/\'\'\'([^\'][^\'][^\']*)\'\'\'/$gh->_strong($1)/ge;
252 $l =~ s/\*(\S[^\*]+\S)\*/$gh->_strong($1)/ge;
253 $l =~ s/\*(\S+)\*/$gh->_strong($1)/ge;
255 # change ''text'' and _text_ into emphasis
256 $l =~ s/\'\'([^\'][^\']*)\'\'/$gh->_em($1)/ge;
257 $l =~ s/\b_(\S[^_]*\S)_\b/$gh->_em($1)/ge;
258 $l =~ s/\b_(\S+)_\b/$gh->_em($1)/ge;
260 # enclose function names
261 if($gh->{'strip-parens'})
263 $l =~ s/(\w+)\(\)/$gh->_funcname($1)/ge;
267 $l =~ s/(\w+)\(\)/$gh->_funcname($1."()")/ge;
270 # enclose variable names
271 if($gh->{'strip-dollars'})
273 $l =~ s/\$([\w_\.]+)/$gh->_varname($1)/ge;
277 $l =~ s/(\$[\w_\.]+)/$gh->_varname($1)/ge;
285 if($l =~ s/^\s\*\s+([\w\s\-\(\)]+)\:\s+/$gh->_dl($1)/e)
290 elsif($gh->{'-mode'} ne "pre" and
291 ($l =~ s/^(\s+)\*\s+/$gh->_unsorted_list($1)/e or
292 $l =~ s/^(\s+)\-\s+/$gh->_unsorted_list($1)/e))
297 elsif($gh->{'-mode'} ne "pre" and
298 ($l =~ s/^(\s+)\#\s+/$gh->_ordered_list($1)/e or
299 $l =~ s/^(\s+)1\s+/$gh->_ordered_list($1)/e))
304 elsif($l =~ s/^\s\"/$gh->_blockquote()/e)
309 elsif($l =~ s/^\s*\|(.*)\|\s*$/$gh->_table_row($1)/e)
313 # table heading / end of row
314 elsif($l =~ s/^\s*(\+[-\+\|]+\+)\s*$/$gh->_table($1)/e)
319 elsif($l =~ s/^(\s.*)$/$gh->_pre($1)/e)
326 # back to normal mode
327 $gh->_new_mode(undef);
331 $l =~ s/^(=+)\s*$/$gh->_process_heading(1,$1)/e;
334 $l =~ s/^(-+)\s*$/$gh->_process_heading(2,$1)/e;
337 $l =~ s/^(~+)\s*$/$gh->_process_heading(3,$1)/e;
339 # change ------ into hr
340 $l =~ s/^----*$/$gh->_hr()/e;
343 $gh->_push($l) if $l;
347 $gh->_new_mode(undef);
353 ${$gh->{'title'}} = $gh->{'-title'} if ref($gh->{'title'});
355 # set abstract, if not set
356 ${$gh->{'abstract'}} = scalar(@
{$gh->{'o'}})
357 if ref($gh->{'abstract'}) and not ${$gh->{'abstract'}};
359 return(@
{$gh->{'o'}});
363 =head2 B<process_file>
365 @output=$grutatxt->process_file($filename);
367 Processes a file in Grutatxt format.
# Read the whole file named by $file and hand its text to process().
# Returns whatever process() returns, or undef when the file cannot
# be opened.
#
# The three-argument open with an explicit '<' mode treats $file
# strictly as a path; the two-argument form would interpret leading or
# trailing '<', '>' and '|' characters in the name as an open mode or
# as a command to run.
open(my $fh, '<', $file) or return(undef);

# slurp every line into one string; process() splits it on newlines itself
my ($content) = join('',<$fh>);

# release the handle before the conversion starts
close($fh);

return($gh->process($content));
388 push(@
{$gh->{'o'}},$l);
394 my ($gh,$level,$hd) = @_;
397 $l = pop(@
{$gh->{'o'}});
399 if($l eq $gh->_empty_line())
406 $gh->{'-title'} = $l if $level == 1 and not $gh->{'-title'};
409 if(ref($gh->{'index'}))
411 push(@
{$gh->{'index'}},"$level,$l");
414 return($gh->_heading($level,$l));
423 # strip first + and all -
427 my ($t) = 1; @spans = ();
428 for(my $n = 0;$n < length($l);$n++)
430 if(substr($l,$n,1) eq '+')
437 # it's a colspan mark:
451 my @s = split(/\|/,$str);
453 for(my $n = 0;$n < scalar(@s);$n++)
455 ${$gh->{'-table'}}[$n] .= ' ' . $s[$n];
466 # if any other mode is active, add to it
467 if($gh->{'-mode'} and $gh->{'-mode'} ne "pre")
471 my ($a) = pop(@
{$gh->{'o'}})." ".$l;
477 $gh->_new_mode("pre");
486 my ($gh, $str, $ind) = @_;
495 # if last level is less indented, increase
502 # if last level is more indented, decrease
503 # levels until the same is found (or back to
504 # the beginning if not)
508 last if $l[-1] == $ind;
522 return($gh->_ul($gh->_multilevel_list('-ul-levels', $ind)));
530 return($gh->_ol($gh->_multilevel_list('-ol-levels', $ind)));
534 # empty stubs for falling through the superclass
# Base-class stub for inline (passthrough) lines: hands the line back
# untouched.  Output drivers override it.
# NOTE(review): presumably called by process() for lines inside a
# "<<" ... ">>" section -- the call site is not visible here; confirm.
sub _inline
{
    my ($self, $line) = @_;

    return $line;
}
# Base-class stub for character escaping: performs no escaping and
# returns the line exactly as received.  process() routes every
# ordinary line through this hook so that a driver can neutralise
# characters that are special in its output format.
sub _escape
{
    my ($self, $line) = @_;

    return $line;
}
# Base-class stub giving the text that stands in for a blank input
# line.  The default is the empty string; drivers override it with
# their own paragraph separator.
sub _empty_line
{
    my ($self) = @_;

    return "";
}
# Base-class stub for URL rendering.  Receives the URL and an optional
# label and returns the empty string, so a matched URL is replaced by
# nothing unless the active driver overrides this method.
sub _url
{
    my ($self, $url, $label) = @_;

    return "";
}
# Base-class stub for strong emphasis: returns the emphasised text
# with no decoration.  Drivers override it to add their own markup.
sub _strong
{
    my ($self, $text) = @_;

    return $text;
}
# Base-class stub for emphasis: returns the emphasised text with no
# decoration.  Drivers override it to add their own markup.
sub _em
{
    my ($self, $text) = @_;

    return $text;
}
# Base-class stub for function names: returns the name as given.
# process() passes the name with or without its trailing "()"
# depending on the strip-parens option; drivers override this to
# wrap the name in their own markup.
sub _funcname
{
    my ($self, $name) = @_;

    return $name;
}
# Base-class stub for variable names: returns the name as given.
# process() passes the name with or without its leading dollar sign
# depending on the strip-dollars option; drivers override this to
# wrap the name in their own markup.
sub _varname
{
    my ($self, $name) = @_;

    return $name;
}
# Base-class stub for mode switches.  Callers pass the name of the
# mode being entered, or undef to go back to normal mode; the default
# implementation does nothing.  Drivers override it to open and close
# their block-level constructs.
sub _new_mode
{
    my ($self, $mode) = @_;

    # The bare return stops the stub from handing back its own argument
    # list, which is what a trailing list assignment evaluates to.
    return;
}
# Base-class stub for definition-list entries: returns the term text
# unchanged.  process() substitutes the result for the "term:" prefix
# of the line; drivers override this to emit their own list markup.
sub _dl
{
    my ($self, $term) = @_;

    return $term;
}
# Base-class stub for unordered-list items.  Receives the level value
# computed by _multilevel_list() and returns the empty string; drivers
# override it to emit their own bullet markup.
sub _ul
{
    my ($self, $level) = @_;

    return "";
}
# Base-class stub for ordered-list items.  Receives the level value
# computed by _multilevel_list() and returns the empty string; drivers
# override it to emit their own numbering markup.
sub _ol
{
    my ($self, $level) = @_;

    return "";
}
# Base-class stub for block quotes: returns its argument unchanged.
# Drivers override it to open a quotation block in their own format.
sub _blockquote
{
    my ($self, $text) = @_;

    return $text;
}
# Base-class stub for horizontal separators.  Returns the empty
# string, so a separator line produces no output unless the active
# driver overrides this method.
sub _hr
{
    my ($self) = @_;

    return "";
}
# Base-class stub for headings.  Receives the heading level and the
# heading text and returns the text unchanged; drivers override it to
# wrap the text in their own heading markup.
sub _heading
{
    my ($self, $level, $text) = @_;

    return $text;
}
# Base-class stub for table separator lines: returns the matched
# "+---+---+" text unchanged.  Drivers override it to start a table
# or close a table row in their own format.
sub _table
{
    my ($self, $rule) = @_;

    return $rule;
}
# Base-class stub for driver-specific leading output; does nothing.
# NOTE(review): presumably invoked once before the converted lines are
# generated -- the call site is not visible here; confirm.
sub _prefix
{
    my ($self) = @_;

    # The bare return stops the stub from handing back its own argument
    # list, which is what a trailing list assignment evaluates to.
    return;
}
# Base-class stub for driver-specific trailing output; does nothing.
# NOTE(review): presumably invoked once after all lines have been
# converted -- the call site is not visible here; confirm.
sub _postfix
{
    my ($self) = @_;

    # The bare return stops the stub from handing back its own argument
    # list, which is what a trailing list assignment evaluates to.
    return;
}
555 ###########################################################
557 =head1 DRIVER SPECIFIC INFORMATION
561 ###########################################################
564 package Grutatxt
::HTML
;
570 The additional parameters for a new Grutatxt object are:
574 =item I<table-headers>
576 If this boolean value is set, the first row in tables
577 is assumed to be the heading and rendered using <th>
578 instead of <td> tags.
580 =item I<center-tables>
582 If this boolean value is set, tables are centered.
584 =item I<expand-tables>
586 If this boolean value is set, tables are expanded (width 100%).
590 If this boolean value is set, definition lists will be
591 rendered using <dl>, <dt> and <dd> instead of tables.
593 =item I<header-offset>
Offset to be added to the heading level when rendering
<h?> tags (default is 0).
598 =item I<class-oddeven>
600 If this boolean value is set, tables will be rendered
601 with an "oddeven" CSS class, and rows alternately classed
602 as "even" or "odd". If it's not set, no CSS class info
611 my ($class,%args) = @_;
614 bless(\
%args,$class);
617 $gh->{'-process-urls'} = 1;
627 # accept unnamed and HTML inlines
628 if($l =~ /^<<$/ or $l =~ /^<<\s*html$/i)
630 $gh->{'-inline'} = "HTML";
636 delete $gh->{'-inline'};
640 if($gh->{'-inline'} eq "HTML")
669 my ($gh,$url,$label) = @_;
671 $label = $url unless $label;
673 return("<a href=\"$url\">$label</a>");
680 return("<strong class=strong>$str</strong>");
687 return("<em class=em>$str</em>");
694 return("<code class=funcname>$str</code>");
701 return("<code class=var>$str</code>");
707 my ($gh,$mode,$params) = @_;
709 if($mode ne $gh->{'-mode'})
713 # flush previous mode
716 if($gh->{'-mode'} eq "ul")
718 $gh->_push("</ul>" x
scalar(@
{$gh->{'-ul-levels'}}));
720 elsif($gh->{'-mode'} eq "ol")
722 $gh->_push("</ol>" x
scalar(@
{$gh->{'-ol-levels'}}));
724 elsif($gh->{'-mode'})
726 $gh->_push("</$gh->{'-mode'}>");
730 $tag = $params ?
"<$mode $params>" : "<$mode>";
731 $gh->_push($tag) if $mode;
733 $gh->{'-mode'} = $mode;
735 # clean previous lists
736 $gh->{'-ul-levels'} = undef;
737 $gh->{'-ol-levels'} = undef;
746 if($gh->{'dl-as-dl'})
748 $gh->_new_mode("dl");
749 return("<dt><strong class=term>$str</strong><dd>");
753 $gh->_new_mode("table");
754 return("<tr><td valign=top><strong class=term>$1</strong> </td><td valign=top>");
761 my ($gh, $levels) = @_;
772 $ret = "</ul>" x
abs($levels);
775 $gh->{'-mode'} = "ul";
785 my ($gh, $levels) = @_;
796 $ret = "</ol>" x
abs($levels);
799 $gh->{'-mode'} = "ol";
811 $gh->_new_mode("blockquote");
820 return("<hr size=1 noshade>");
826 my ($gh,$level,$l) = @_;
828 # substitute anchor spaces with underscores
829 my ($a) = lc($l); $a =~ s/\s/_/g;
831 $l = sprintf("<a name=\"%s\"></a>\n<h%d class=level$level>%s</h%d>",
832 $a, $level+$gh->{'header-offset'},
833 $l, $level+$gh->{'header-offset'});
843 if($gh->{'-mode'} eq "table")
846 my (@spans) = $gh->_calc_col_span($str);
848 # calculate CSS class, if any
849 if($gh->{'class-oddeven'})
851 $class = ($gh->{'-tbl-row'} & 1) ?
"odd" : "even";
854 $str = "<tr $class>";
857 for(my $n = 0;$n < scalar(@
{$gh->{'-table'}});$n++)
861 $i = ${$gh->{'-table'}}[$n];
862 $i = " " if $i =~ /^\s*$/;
864 $s = " colspan=$spans[$n]" if $spans[$n] > 1;
866 if($gh->{'table-headers'} and $gh->{'-tbl-row'} == 1)
868 $str .= "<th $class $s>$i</th>";
872 $str .= "<td $class $s>$i</td>";
876 @
{$gh->{'-table'}} = ();
884 $params = "border=1";
885 $params .= " width='100\%'" if $gh->{'expand-tables'};
886 $params .= " align=center" if $gh->{'center-tables'};
887 $params .= " class=oddeven" if $gh->{'class-oddeven'};
889 $gh->_new_mode("table", $params);
891 @
{$gh->{'-table'}} = ();
892 $gh->{'-tbl-row'} = 1;
900 ###########################################################
903 package Grutatxt
::troff
;
909 The troff driver uses the B<-me> macros and B<tbl>. A
910 good way to post-process this output (to PostScript in
911 the example) could be by using
915 The additional parameters for a new Grutatxt object are:
The point size of normal text. The default is 10.
923 =item I<heading-sizes>
925 This argument must be a reference to an array containing
926 the size in points of the 3 different heading levels. By
927 default, level sizes are [ 20, 18, 15 ].
931 The type of table to be rendered by B<tbl>. Can be
932 I<allbox> (all lines rendered; this is the default value),
933 I<box> (only outlined) or I<doublebox> (only outlined by
942 my ($class,%args) = @_;
945 bless(\
%args,$class);
948 $gh->{'-process-urls'} = 0;
950 $gh->{'heading-sizes'} ||= [ 20, 18, 15 ];
951 $gh->{'normal-size'} ||= 10;
952 $gh->{'table-type'} ||= "allbox"; # box, allbox, doublebox
962 $gh->_push(".nr pp $gh->{'normal-size'}");
971 # accept only troff inlines
972 if($l =~ /^<<\s*troff$/i)
974 $gh->{'-inline'} = "troff";
980 delete $gh->{'-inline'};
984 if($gh->{'-inline'} eq "troff")
1013 return("\\fB$str\\fP");
1020 return("\\fI$str\\fP");
1027 return("\\fB$str\\fP");
1034 return("\\fI$str\\fP");
1040 my ($gh,$mode,$params) = @_;
1042 if($mode ne $gh->{'-mode'})
1046 # flush previous list
1047 if($gh->{'-mode'} eq "pre")
1051 elsif($gh->{'-mode'} eq "table")
1053 chomp($gh->{'-table-head'});
1054 $gh->{'-table-head'} =~ s/\s+$//;
1055 $gh->_push($gh->{'-table-head'} . ".");
1056 $gh->_push($gh->{'-table-body'} . ".TE\n.sp 0.6");
1058 elsif($gh->{'-mode'} eq "blockquote")
1066 $gh->_push(".(l L");
1068 elsif($mode eq "blockquote")
1073 $gh->{'-mode'} = $mode;
1082 $gh->_new_mode("dl");
1083 return(".ip \"$str\"\n");
1091 $gh->_new_mode("ul");
1100 $gh->_new_mode("ol");
1109 $gh->_new_mode("blockquote");
1124 my ($gh,$level,$l) = @_;
1126 $l = ".sz " . ${$gh->{'heading-sizes'}}[$level - 1] . "\n$l\n.sp 0.6";
1136 if($gh->{'-mode'} eq "table")
1139 my (@spans) = $gh->_calc_col_span($str);
1144 for(my $n = 0;$n < scalar(@
{$gh->{'-table'}});$n++)
1148 if($gh->{'table-headers'} and $gh->{'-tbl-row'} == 1)
1158 $h .= "s " x
($spans[$n] - 1) if $spans[$n] > 1;
1162 $i = ${$gh->{'-table'}}[$n];
1170 $b .= "\n_" if $gh->{'table-headers'} and
1171 $gh->{'-tbl-row'} == 1 and
1172 $gh->{'table-type'} ne "allbox";
1174 $gh->{'-table-head'} .= "$h\n";
1175 $gh->{'-table-body'} .= "$b\n";
1177 @
{$gh->{'-table'}} = ();
1178 $gh->{'-tbl-row'}++;
1183 $gh->_new_mode("table");
1185 @
{$gh->{'-table'}} = ();
1186 $gh->{'-tbl-row'} = 1;
1188 $gh->{'-table-head'} = ".TS\n$gh->{'table-type'} tab (#);\n";
1189 $gh->{'-table-body'} = "";
1201 # add to top headings and footers
1202 unshift(@
{$gh->{'o'}},".ef '\%' ''");
1203 unshift(@
{$gh->{'o'}},".of '' '\%'");
1204 unshift(@
{$gh->{'o'}},".eh '$gh->{'-title'}' ''");
1205 unshift(@
{$gh->{'o'}},".oh '' '$gh->{'-title'}'");
1211 Angel Ortega angel@triptico.com