- Fixed some minor bugs to conform with the w3c HTML standards.
[grutatxt.git] / Grutatxt.pm
blobf8934b8bae2ff04c1992fde6870f1156b1f954a3
1 #####################################################################
3 # Grutatxt - A text to HTML (and other things) converter
5 # Copyright (C) 2000/2002 Angel Ortega <angel@triptico.com>
7 # This program is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU General Public License
9 # as published by the Free Software Foundation; either version 2
10 # of the License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 # http://www.triptico.com
23 #####################################################################
25 package Grutatxt;
27 $VERSION='2.0.1';
29 =pod
31 =head1 NAME
33 Grutatxt - Text to HTML (and other formats) converter
35 =head1 SYNOPSIS
37 use Grutatxt;
39 # create a new Grutatxt converter object
40 $grutatxt=new Grutatxt();
42 # process a Grutatxt format string
43 @output=$grutatxt->process($text);
45 # idem for a file
46 @output2=$grutatxt->process_file($file);
48 =head1 DESCRIPTION
50 Grutatxt is a module to process text documents in
51 a special markup format (also called Grutatxt), very
52 similar to plain ASCII text. These documents can be
53 converted to HTML or troff.
55 The markup is designed to be fairly intuitive and
56 straightforward and can include headings, bold and italic
57 text effects, bulleted, numbered and definition lists, URLs,
58 function and variable names, preformatted text, horizontal
59 separators and tables. Special marks can be inserted in the
60 text and a heading-based structural index can be obtained
61 from it.
63 A comprehensive description of the markup is defined in
64 the README file, included with the Grutatxt package (it is
65 written in Grutatxt format itself, so it can be converted
66 using the I<grutatxt> tool to any of the supported formats).
67 The latest version (and more information) can be retrieved
68 from the Grutatxt home page at:
70 http://www.triptico.com/software/grutatxt.html
72 =head1 FUNCTIONS AND METHODS
74 =head2 B<new>
76 $grutatxt=new Grutatxt([ "mode" => $mode, ]
77 [ "title" => \$title, ]
78 [ "marks" => \@marks, ]
79 [ "index" => \@index, ]
80 [ "abstract" => \$abstract, ]
81 [ "strip-parens" => $bool, ]
82 [ "strip-dollars" => $bool, ]
83 [ %driver_specific_arguments ] );
85 Creates a new Grutatxt object instance. All parameters are
86 optional.
88 =over 4
90 =item I<mode>
92 Output format. Can be HTML or troff. HTML is used if not specified.
94 =item I<title>
96 If I<title> is specified as a reference to scalar, the first
97 level 1 heading found in the text is stored inside it.
99 =item I<marks>
101 Marks in the Grutatxt markup are created by inserting the
102 string <-> alone in a line. If I<marks> is specified as a
103 reference to array, it will be filled with the subscripts
104 (relative to the output array) of the lines where the marks
105 are found in the text.
107 =item I<index>
109 If I<index> is specified as a reference to array, it will
110 be filled with strings in the format
112 level,heading
114 This information can be used to build a table of contents
115 of the processed text.
117 =item I<strip-parens>
119 Function names in the Grutatxt markup are strings of
120 alphanumeric characters immediately followed by a pair
121 of open and close parentheses. If this boolean value is
122 set, function names found in the processed text will have
123 their parentheses deleted.
125 =item I<strip-dollars>
127 Variable names in the Grutatxt markup are strings of
128 alphanumeric characters preceded by a dollar sign.
129 If this boolean value is set, variable names found in
130 the processed text will have the dollar sign deleted.
132 =item I<abstract>
134 The I<abstract> of a Grutatxt document is the fragment of text
135 from the beginning of the document to the end of the first
136 paragraph after the title. If I<abstract> is specified as a
137 reference to scalar, it will contain (after each call to the
138 B<process()> method) the subscript of the element of the output
139 array that marks the end of the subject.
141 =back
143 =cut
145 sub new
147 my ($class,%args)=@_;
148 my ($gh);
150 $args{'mode'}||='HTML';
152 $class.="::".$args{'mode'};
154 $gh=new $class(%args);
156 return($gh);
160 =head2 B<process>
162 @output=$grutatxt->process($text);
164 Processes a text in Grutatxt format. The result is returned
165 as an array of lines.
167 =cut
169 sub process
171 my ($gh,$content)=@_;
172 my ($p);
174 # clean output
175 @{$gh->{'o'}}=();
177 # clean title and paragraph numbers
178 $gh->{'-title'}="";
179 $gh->{'-p'}=0;
181 # clean marks
182 @{$gh->{'marks'}}=() if ref($gh->{'marks'});
184 # clean index
185 @{$gh->{'index'}}=() if ref($gh->{'index'});
187 # reset abstract line
188 ${$gh->{'abstract'}}=0 if ref($gh->{'abstract'});
190 # insert prefix
191 $gh->_prefix();
193 $gh->{'-mode'}=undef;
195 foreach my $l (split(/\n/,$content))
197 # inline data (passthrough)
198 if($l =~ /^<<$/ .. $l =~ /^>>$/)
200 $gh->_inline($l);
201 next;
204 # marks
205 if($l =~ /^\s*<\->\s*$/)
207 push(@{$gh->{'marks'}},scalar(@{$gh->{'o'}}))
208 if ref($gh->{'marks'});
210 next;
213 # escape possibly dangerous characters
214 $l=$gh->_escape($l);
216 # empty lines
217 $l =~ s/^\r$//ge;
218 if($l =~ s/^$/$gh->_empty_line()/ge)
220 # mark the abstract end
221 if($gh->{'-title'})
223 $gh->{'-p'}++;
225 # mark abstract if it's the
226 # second paragraph from the title
227 ${$gh->{'abstract'}}=scalar(@{$gh->{'o'}})-1
228 if $gh->{'-p'}==2;
232 if($gh->{'-process-urls'})
234 # URLs followed by a parenthesized phrase
235 $l =~ s/(http:\/\/[\w\/\.\?\&\=\-\%\;]*)\s*\(([^\)]+)\)/$gh->_url($1,$2)/ge;
237 # URLs without phrase
238 $l =~ s/([^=][^\"])(http:\/\/[\w\/\.\?\&\=\-\%\;]*)/$1.$gh->_url($2,$2)/ge;
239 $l =~ s/^(http:\/\/[\w\/\.\?\&\=\-\%\;]*)/$gh->_url($1,$1)/ge;
242 # change '''text''' and *text* into strong emphasis
243 $l =~ s/\'\'\'([^\'][^\'][^\']*)\'\'\'/$gh->_strong($1)/ge;
244 $l =~ s/\*(\S[^\*]+\S)\*/$gh->_strong($1)/ge;
245 $l =~ s/\*(\S+)\*/$gh->_strong($1)/ge;
247 # change ''text'' and _text_ into emphasis
248 $l =~ s/\'\'([^\'][^\']*)\'\'/$gh->_em($1)/ge;
249 $l =~ s/\b_(\S[^_]*\S)_\b/$gh->_em($1)/ge;
250 $l =~ s/\b_(\S+)_\b/$gh->_em($1)/ge;
252 # enclose function names
253 if($gh->{'strip-parens'})
255 $l =~ s/(\w+)\(\)/$gh->_funcname($1)/ge;
257 else
259 $l =~ s/(\w+)\(\)/$gh->_funcname($1."()")/ge;
262 # enclose variable names
263 if($gh->{'strip-dollars'})
265 $l =~ s/\$([\w_\.]+)/$gh->_varname($1)/ge;
267 else
269 $l =~ s/(\$[\w_\.]+)/$gh->_varname($1)/ge;
272 # main switch
274 # definition list
275 if($l =~ s/^\s\*\s+([\w\s\-]+)\:\s+/$gh->_dl($1)/e)
279 # unsorted list
280 elsif($l =~ s/^\s\*\s+/$gh->_ul()/e or
281 $l =~ s/^\s\-\s+/$gh->_ul()/e)
285 # sorted list
286 elsif($l =~ s/^\s\#\s+/$gh->_ol()/e or
287 $l =~ s/^\s1\s+/$gh->_ol()/e)
291 # table rows
292 elsif($l =~ s/^\s*\|(.*)\|\s*$/$gh->_table_row($1)/e)
296 # table heading / end of row
297 elsif($l =~ s/^\s*(\+[-\+\|]+\+)\s*$/$gh->_table($1)/e)
301 # preformatted text
302 elsif($l =~ s/^(\s.*)$/$gh->_pre($1)/e)
306 # anything else
307 else
309 # back to normal mode
310 $gh->_new_mode(undef);
313 # 1 level heading
314 $l =~ s/^(=+)\s*$/$gh->_process_heading(1,$1)/e;
316 # 2 level heading
317 $l =~ s/^(-+)\s*$/$gh->_process_heading(2,$1)/e;
319 # 3 level heading
320 $l =~ s/^(~+)\s*$/$gh->_process_heading(3,$1)/e;
322 # change ------ into hr
323 $l =~ s/^----*$/$gh->_hr()/e;
325 # push finally
326 $gh->_push($l) if $l;
329 # flush
330 $gh->_new_mode(undef);
332 # postfix
333 $gh->_postfix();
335 # set title
336 ${$gh->{'title'}}=$gh->{'-title'} if ref($gh->{'title'});
338 # set abstract, if not set
339 ${$gh->{'abstract'}}=scalar(@{$gh->{'o'}})
340 if ref($gh->{'abstract'}) and not ${$gh->{'abstract'}};
342 return(@{$gh->{'o'}});
346 =head2 B<process_file>
348 @output=$grutatxt->process_file($filename);
350 Processes a file in Grutatxt format.
352 =cut
354 sub process_file
356 my ($gh,$file)=@_;
358 open F, $file or return(undef);
360 my ($content)=join('',<F>);
361 close F;
363 return($gh->process($content));
367 sub _push
369 my ($gh,$l)=@_;
371 push(@{$gh->{'o'}},$l);
375 sub _process_heading
377 my ($gh,$level,$hd)=@_;
378 my ($l);
380 $l=pop(@{$gh->{'o'}});
382 if($l eq $gh->_empty_line())
384 $gh->_push($l);
385 return($hd);
388 # store title
389 $gh->{'-title'}=$l if $level==1 and not $gh->{'-title'};
391 # store index
392 if(ref($gh->{'index'}))
394 push(@{$gh->{'index'}},"$level,$l");
397 return($gh->_heading($level,$l));
401 sub _calc_col_span
403 my ($gh,$l)=@_;
404 my (@spans);
406 # strip first + and all -
407 $l =~ s/^\+//;
408 $l =~ s/-//g;
410 my ($t)=1; @spans=();
411 for(my $n=0;$n < length($l);$n++)
413 if(substr($l,$n,1) eq '+')
415 push(@spans,$t);
416 $t=1;
418 else
420 # it's a colspan mark:
421 # increment
422 $t++;
426 return(@spans);
430 sub _table_row
432 my ($gh,$str)=@_;
434 my @s=split(/\|/,$str);
436 for(my $n=0;$n < scalar(@s);$n++)
438 ${$gh->{'-table'}}[$n].=' '.$s[$n];
441 return("");
445 sub _pre
447 my ($gh,$l)=@_;
449 # if any other mode is active, add to it
450 if($gh->{'-mode'} and $gh->{'-mode'} ne "pre")
452 $l =~ s/^\s+//;
454 my ($a)=pop(@{$gh->{'o'}})." ".$l;
455 $gh->_push($a);
456 $l="";
458 else
460 $gh->_new_mode("pre");
463 return($l);
466 # empty stubs for falling through the superclass
468 sub _inline { my ($gh,$l)=@_; $l; }
469 sub _escape { my ($gh,$l)=@_; $l; }
470 sub _empty_line { my ($gh)=@_; ""; }
471 sub _url { my ($gh,$url,$label)=@_; ""; }
472 sub _strong { my ($gh,$str)=@_; $str; }
473 sub _em { my ($gh,$str)=@_; $str; }
474 sub _funcname { my ($gh,$str)=@_; $str; }
475 sub _varname { my ($gh,$str)=@_; $str; }
476 sub _new_mode { my ($gh,$mode)=@; }
477 sub _dl { my ($gh,$str)=@_; $str; }
478 sub _ul { my ($gh,$str)=@_; $str; }
479 sub _ol { my ($gh,$str)=@_; $str; }
480 sub _hr { my ($gh)=@_; "" }
481 sub _heading { my ($gh,$level,$l)=@_; $l; }
482 sub _table { my ($gh,$str)=@_; $str; }
483 sub _prefix { my ($gh)=@_; }
484 sub _postfix { my ($gh)=@_; }
486 ###########################################################
488 =head1 DRIVER SPECIFIC INFORMATION
490 =cut
492 ###########################################################
493 # HTML Driver
495 package Grutatxt::HTML;
497 @ISA=("Grutatxt");
499 =head2 HTML Driver
501 The additional parameters for a new Grutatxt object are:
503 =over 4
505 =item I<table-headers>
507 If this boolean value is set, the first row in tables
508 is assumed to be the heading and rendered using <th>
509 instead of <td> tags.
511 =item I<center-tables>
513 If this boolean value is set, tables are centered.
515 =item I<expand-tables>
517 If this boolean value is set, tables are expanded (width 100%).
519 =item I<dl-as-dl>
521 If this boolean value is set, definition lists will be
522 rendered using <dl>, <dt> and <dd> instead of tables.
524 =item I<header-offset>
526 Offset to be summed to the heading level when rendering
527 <h?> tags (default is 0).
529 =item I<class-oddeven>
531 If this boolean value is set, tables will be rendered
532 with an "oddeven" CSS class, and rows alternately classed
533 as "even" or "odd". If it's not set, no CSS class info
534 is added to tables.
536 =back
538 =cut
540 sub new
542 my ($class,%args)=@_;
543 my ($gh);
545 bless(\%args,$class);
546 $gh=\%args;
548 $gh->{'-process-urls'}=1;
550 return($gh);
554 sub _inline
556 my ($gh,$l)=@_;
558 # accept unnamed and HTML inlines
559 if($l =~ /^<<$/ or $l =~ /^<<\s*html$/i)
561 $gh->{'-inline'}="HTML";
562 return;
565 if($l =~ /^>>$/)
567 delete $gh->{'-inline'};
568 return;
571 if($gh->{'-inline'} eq "HTML")
573 $gh->_push($l);
578 sub _escape
580 my ($gh,$l)=@_;
582 $l =~ s/&/&amp;/g;
583 $l =~ s/</&lt;/g;
584 $l =~ s/>/&gt;/g;
586 return($l);
590 sub _empty_line
592 my ($gh)=@_;
594 return("<p>");
598 sub _url
600 my ($gh,$url,$label)=@_;
602 $label=$url unless $label;
604 return("<a href=\"$url\">$label</a>");
608 sub _strong
610 my ($gh,$str)=@_;
611 return("<strong class=strong>$str</strong>");
615 sub _em
617 my ($gh,$str)=@_;
618 return("<em class=em>$str</em>");
622 sub _funcname
624 my ($gh,$str)=@_;
625 return("<code class=funcname>$str</code>");
629 sub _varname
631 my ($gh,$str)=@_;
632 return("<code class=var>$str</code>");
636 sub _new_mode
638 my ($gh,$mode,$params)=@_;
640 if($mode ne $gh->{'-mode'})
642 my $tag;
644 # flush previous list
645 $gh->_push("</$gh->{'-mode'}>")
646 if $gh->{'-mode'};
648 # send new one
649 $tag=$params ? "<$mode $params>" : "<$mode>";
650 $gh->_push($tag) if $mode;
652 $gh->{'-mode'}=$mode;
657 sub _dl
659 my ($gh,$str)=@_;
661 if($gh->{'dl-as-dl'})
663 $gh->_new_mode("dl");
664 return("<dt><strong class=term>$str</strong><dd>");
666 else
668 $gh->_new_mode("table");
669 return("<tr><td valign=top><strong class=term>$1</strong>&nbsp;&nbsp;</td><td valign=top>");
674 sub _ul
676 my ($gh)=@_;
678 $gh->_new_mode("ul");
679 return("<li>");
683 sub _ol
685 my ($gh)=@_;
687 $gh->_new_mode("ol");
688 return("<li>");
692 sub _hr
694 my ($gh)=@_;
696 return("<hr size=1 noshade>");
700 sub _heading
702 my ($gh,$level,$l)=@_;
704 # substitute anchor spaces with underscores
705 my ($a)=lc($l); $a =~ s/\s/_/g;
707 $l=sprintf("<a name=\"$a\"></a>\n<h%d class=level$level>$l</h%d>",
708 $level+$gh->{'header-offset'},
709 $level+$gh->{'header-offset'});
711 return($l);
715 sub _table
717 my ($gh,$str)=@_;
719 if($gh->{'-mode'} eq "table")
721 my ($class)="";
722 my (@spans)=$gh->_calc_col_span($str);
724 # calculate CSS class, if any
725 if($gh->{'class-oddeven'})
727 $class=($gh->{'-tbl-row'} & 1) ? "odd" : "even";
730 $str="<tr $class>";
732 # build columns
733 for(my $n=0;$n < scalar(@{$gh->{'-table'}});$n++)
735 my ($i,$s);
737 $i=${$gh->{'-table'}}[$n];
738 $i="&nbsp;" if $i =~ /^\s*$/;
740 $s=" colspan=$spans[$n]" if $spans[$n] > 1;
742 if($gh->{'table-headers'} and $gh->{'-tbl-row'}==1)
744 $str.="<th $class $s>$i</th>";
746 else
748 $str.="<td $class $s>$i</td>";
752 @{$gh->{'-table'}}=();
753 $gh->{'-tbl-row'}++;
755 else
757 # new table
758 my ($params);
760 $params="border=1";
761 $params.=" width='100\%'" if $gh->{'expand-tables'};
762 $params.=" align=center" if $gh->{'center-tables'};
763 $params.=" class=oddeven" if $gh->{'class-oddeven'};
765 $gh->_new_mode("table", $params);
767 @{$gh->{'-table'}}=();
768 $gh->{'-tbl-row'}=1;
769 $str="";
772 return($str);
776 ###########################################################
777 # troff Driver
779 package Grutatxt::troff;
781 @ISA=("Grutatxt");
783 =head2 troff Driver
785 The troff driver uses the B<-me> macros and B<tbl>. A
786 good way to post-process this output (to PostScript in
787 the example) could be by using
789 groff -t -me -Tps
791 The additional parameters for a new Grutatxt object are:
793 =over 4
795 =item I<normal-size>
797 The point size of normal text. By default is 10.
799 =item I<heading-sizes>
801 This argument must be a reference to an array containing
802 the size in points of the 3 different heading levels. By
803 default, level sizes are [ 20, 18, 15 ].
805 =item I<table-type>
807 The type of table to be rendered by B<tbl>. Can be
808 I<allbox> (all lines rendered; this is the default value),
809 I<box> (only outlined) or I<doublebox> (only outlined by
810 a double line).
812 =back
814 =cut
816 sub new
818 my ($class,%args)=@_;
819 my ($gh);
821 bless(\%args,$class);
822 $gh=\%args;
824 $gh->{'-process-urls'}=0;
826 $gh->{'heading-sizes'}||=[ 20, 18, 15 ];
827 $gh->{'normal-size'}||=10;
828 $gh->{'table-type'}||="allbox"; # box, allbox, doublebox
830 return($gh);
834 sub _prefix
836 my ($gh)=@_;
838 $gh->_push(".nr pp $gh->{'normal-size'}");
839 $gh->_push(".nh");
843 sub _inline
845 my ($gh,$l)=@_;
847 # accept only troff inlines
848 if($l =~ /^<<\s*troff$/i)
850 $gh->{'-inline'}="troff";
851 return;
854 if($l =~ /^>>$/)
856 delete $gh->{'-inline'};
857 return;
860 if($gh->{'-inline'} eq "troff")
862 $gh->_push($l);
867 sub _escape
869 my ($gh,$l)=@_;
871 $l =~ s/\\/\\\\/g;
872 $l =~ s/^'/\\&'/;
874 return($l);
878 sub _empty_line
880 my ($gh)=@_;
882 return(".lp");
886 sub _strong
888 my ($gh,$str)=@_;
889 return("\\fB$str\\fP");
893 sub _em
895 my ($gh,$str)=@_;
896 return("\\fI$str\\fP");
900 sub _funcname
902 my ($gh,$str)=@_;
903 return("\\fB$str\\fP");
907 sub _varname
909 my ($gh,$str)=@_;
910 return("\\fI$str\\fP");
914 sub _new_mode
916 my ($gh,$mode,$params)=@_;
918 if($mode ne $gh->{'-mode'})
920 my $tag;
922 # flush previous list
923 if($gh->{'-mode'} eq "pre")
925 $gh->_push(".)l");
927 elsif($gh->{'-mode'} eq "table")
929 chomp($gh->{'-table-head'});
930 $gh->{'-table-head'} =~ s/\s+$//;
931 $gh->_push($gh->{'-table-head'}.".");
932 $gh->_push($gh->{'-table-body'}.".TE\n.sp 0.6");
935 # send new one
936 if($mode eq "pre")
938 $gh->_push(".(l L");
941 $gh->{'-mode'}=$mode;
946 sub _dl
948 my ($gh,$str)=@_;
950 $gh->_new_mode("dl");
951 return(".ip \"$str\"\n");
955 sub _ul
957 my ($gh)=@_;
959 $gh->_new_mode("ul");
960 return(".bu\n");
964 sub _ol
966 my ($gh)=@_;
968 $gh->_new_mode("ol");
969 return(".np\n");
973 sub _hr
975 my ($gh)=@_;
977 return(".hl");
981 sub _heading
983 my ($gh,$level,$l)=@_;
985 $l=".sz ".${$gh->{'heading-sizes'}}[$level - 1]."\n$l\n.sp 0.6";
987 return($l);
991 sub _table
993 my ($gh,$str)=@_;
995 if($gh->{'-mode'} eq "table")
997 my ($h,$b);
998 my (@spans)=$gh->_calc_col_span($str);
1000 # build columns
1001 $h="";
1002 $b="";
1003 for(my $n=0;$n < scalar(@{$gh->{'-table'}});$n++)
1005 my ($i);
1007 if($gh->{'table-headers'} and $gh->{'-tbl-row'}==1)
1009 $h.="cB ";
1011 else
1013 $h.="l ";
1016 # add span columns
1017 $h.="s " x ($spans[$n] - 1) if $spans[$n] > 1;
1019 $b.="#" if $n;
1021 $i=${$gh->{'-table'}}[$n];
1022 $i =~ s/^\s+//;
1023 $i =~ s/\s+$//;
1024 $i =~ s/(\s)+/$1/g;
1025 $b.=$i;
1028 # add a separator
1029 $b.="\n_" if $gh->{'table-headers'} and
1030 $gh->{'-tbl-row'}==1 and
1031 $gh->{'table-type'} ne "allbox";
1033 $gh->{'-table-head'}.="$h\n";
1034 $gh->{'-table-body'}.="$b\n";
1036 @{$gh->{'-table'}}=();
1037 $gh->{'-tbl-row'}++;
1039 else
1041 # new table
1042 $gh->_new_mode("table");
1044 @{$gh->{'-table'}}=();
1045 $gh->{'-tbl-row'}=1;
1047 $gh->{'-table-head'}=".TS\n$gh->{'table-type'} tab (#);\n";
1048 $gh->{'-table-body'}="";
1051 $str="";
1052 return($str);
1056 sub _postfix
1058 my ($gh)=@_;
1060 # add to top headings and footers
1061 unshift(@{$gh->{'o'}},".ef '\%' ''");
1062 unshift(@{$gh->{'o'}},".of '' '\%'");
1063 unshift(@{$gh->{'o'}},".eh '$gh->{'-title'}' ''");
1064 unshift(@{$gh->{'o'}},".oh '' '$gh->{'-title'}'");
1068 =head1 AUTHOR
1070 Angel Ortega angel@triptico.com
1072 =cut