- Functionality has been moved to a Perl Module.
[grutatxt.git] / Grutatxt.pm
blob261df8e926b43d02d2e7285b5f71ca544415666f
1 #####################################################################
3 # Grutatxt - A text to HTML (and other things) converter
5 # Copyright (C) 2000/2002 Angel Ortega <angel@triptico.com>
7 # This program is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU General Public License
9 # as published by the Free Software Foundation; either version 2
10 # of the License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 # http://www.triptico.com
23 #####################################################################
25 package Grutatxt;
27 $VERSION='2.0.0';
29 =pod
31 =head1 NAME
33 Grutatxt - Text to HTML (and other formats) converter
35 =head1 SYNOPSIS
37 use Grutatxt;
39 # create a new Grutatxt converter object
40 $grutatxt=new Grutatxt();
42 # process a Grutatxt format string
43 @output=$grutatxt->process($text);
45 # idem for a file
46 @output2=$grutatxt->process_file($file);
48 =head1 DESCRIPTION
50 Grutatxt is a module to process text documents in
51 a special markup format (also called Grutatxt), very
52 similar to plain ASCII text. These documents can be
53 converted to HTML or troff.
55 The markup is designed to be fairly intuitive and
56 straightforward and can include headings, bold and italic
57 text effects, bulleted, numbered and definition lists, URLs,
58 function and variable names, preformatted text, horizontal
59 separators and tables. Special marks can be inserted in the
60 text and a heading-based structural index can be obtained
61 from it.
63 A comprehensive description of the markup is defined in
64 the README file, included with the Grutatxt package (it is
65 written in Grutatxt format itself, so it can be converted
66 using the I<grutatxt> tool to any of the supported formats).
67 The latest version (and more information) can be retrieved
68 from the Grutatxt home page at:
70 http://www.triptico.com/software/grutatxt.html
72 =head1 FUNCTIONS AND METHODS
74 =head2 B<new>
76 $grutatxt=new Grutatxt([ "mode" => $mode, ]
77 [ "title" => \$title, ]
78 [ "marks" => \@marks, ]
79 [ "index" => \@index, ]
80 [ "abstract" => \$abstract, ]
81 [ "strip-parens" => $bool, ]
82 [ "strip-dollars" => $bool, ]
83 [ %driver_specific_arguments ] );
85 Creates a new Grutatxt object instance. All parameters are
86 optional.
88 =over 4
90 =item I<mode>
92 Output format. Can be HTML or troff. HTML is used if not specified.
94 =item I<title>
96 If I<title> is specified as a reference to scalar, the first
97 level 1 heading found in the text is stored inside it.
99 =item I<marks>
101 Marks in the Grutatxt markup are created by inserting the
102 string <-> alone in a line. If I<marks> is specified as a
103 reference to array, it will be filled with the subscripts
104 (relative to the output array) of the lines where the marks
105 are found in the text.
107 =item I<index>
109 If I<index> is specified as a reference to array, it will
110 be filled with strings in the format
112 level,heading
114 This information can be used to build a table of contents
115 of the processed text.
117 =item I<strip-parens>
119 Function names in the Grutatxt markup are strings of
120 alphanumeric characters immediately followed by a pair
121 of open and close parentheses. If this boolean value is
122 set, function names found in the processed text will have
123 their parentheses deleted.
125 =item I<strip-dollars>
127 Variable names in the Grutatxt markup are strings of
128 alphanumeric characters preceded by a dollar sign.
129 If this boolean value is set, variable names found in
130 the processed text will have the dollar sign deleted.
132 =item I<abstract>
134 The I<abstract> of a Grutatxt document is the fragment of text
135 from the beginning of the document to the end of the first
136 paragraph after the title. If I<abstract> is specified as a
137 reference to scalar, it will contain (after each call to the
138 B<process()> method) the subscript of the element of the output
139 array that marks the end of the subject.
141 =back
143 =cut
145 sub new
147 my ($class,%args)=@_;
148 my ($gh);
150 $args{'mode'}||='HTML';
152 $class.="::".$args{'mode'};
154 $gh=new $class(%args);
156 return($gh);
160 =head2 B<process>
162 @output=$grutatxt->process($text);
164 Processes a text in Grutatxt format. The result is returned
165 as an array of lines.
167 =cut
169 sub process
171 my ($gh,$content)=@_;
172 my ($p);
174 # clean output
175 @{$gh->{'o'}}=();
177 # clean title and paragraph numbers
178 $gh->{'-title'}="";
179 $gh->{'-p'}=0;
181 # clean marks
182 @{$gh->{'marks'}}=() if ref($gh->{'marks'});
184 # clean index
185 @{$gh->{'index'}}=() if ref($gh->{'index'});
187 # reset abstract line
188 ${$gh->{'abstract'}}=0 if ref($gh->{'abstract'});
190 # insert prefix
191 $gh->_prefix();
193 foreach my $l (split(/\n/,$content))
195 # inline data (passthrough)
196 if($l =~ /^<<$/ .. $l =~ /^>>$/)
198 $gh->_inline($l);
199 next;
202 # marks
203 if($l =~ /^\s*<\->\s*$/)
205 push(@{$gh->{'marks'}},scalar(@{$gh->{'o'}}))
206 if ref($gh->{'marks'});
208 next;
211 # escape possibly dangerous characters
212 $l=$gh->_escape($l);
214 # empty lines
215 $l =~ s/^\r$//ge;
216 if($l =~ s/^$/$gh->_empty_line()/ge)
218 # mark the abstract end
219 if($gh->{'-title'})
221 $gh->{'-p'}++;
223 # mark abstract if it's the
224 # second paragraph from the title
225 ${$gh->{'abstract'}}=scalar(@{$gh->{'o'}})-1
226 if $gh->{'-p'}==2;
230 if($gh->{'-process-urls'})
232 # URLs followed by a parenthesized phrase
233 $l =~ s/(http:\/\/[\w\/\.\?\&\=\-\%\;]*)\s*\(([^\)]+)\)/$gh->_url($1,$2)/ge;
235 # URLs without phrase
236 $l =~ s/([^=][^\"])(http:\/\/[\w\/\.\?\&\=\-\%\;]*)/$1.$gh->_url($2,$2)/ge;
237 $l =~ s/^(http:\/\/[\w\/\.\?\&\=\-\%\;]*)/$gh->_url($1,$1)/ge;
240 # change '''text''' and *text* into strong emphasis
241 $l =~ s/\'\'\'([^\'][^\'][^\']*)\'\'\'/$gh->_strong($1)/ge;
242 $l =~ s/\*(\S[^\*]+\S)\*/$gh->_strong($1)/ge;
243 $l =~ s/\*(\S+)\*/$gh->_strong($1)/ge;
245 # change ''text'' and _text_ into emphasis
246 $l =~ s/\'\'([^\'][^\']*)\'\'/$gh->_em($1)/ge;
247 $l =~ s/\b_(\S[^_]*\S)_\b/$gh->_em($1)/ge;
248 $l =~ s/\b_(\S+)_\b/$gh->_em($1)/ge;
250 # enclose function names
251 if($gh->{'strip-parens'})
253 $l =~ s/(\w+)\(\)/$gh->_funcname($1)/ge;
255 else
257 $l =~ s/(\w+)\(\)/$gh->_funcname($1."()")/ge;
260 # enclose variable names
261 if($gh->{'strip-dollars'})
263 $l =~ s/\$([\w_\.]+)/$gh->_varname($1)/ge;
265 else
267 $l =~ s/(\$[\w_\.]+)/$gh->_varname($1)/ge;
270 # main switch
272 # definition list
273 if($l =~ s/^\s\*\s+([\w\s\-]+)\:\s+/$gh->_dl($1)/e)
277 # unsorted list
278 elsif($l =~ s/^\s\*\s+/$gh->_ul()/e or
279 $l =~ s/^\s\-\s+/$gh->_ul()/e)
283 # sorted list
284 elsif($l =~ s/^\s\#\s+/$gh->_ol()/e or
285 $l =~ s/^\s1\s+/$gh->_ol()/e)
289 # table rows
290 elsif($l =~ s/^\s*\|(.*)\|\s*$/$gh->_table_row($1)/e)
294 # table heading / end of row
295 elsif($l =~ s/^\s*(\+[-\+\|]+\+)\s*$/$gh->_table($1)/e)
299 # preformatted text
300 elsif($l =~ s/^(\s.*)$/$gh->_pre($1)/e)
304 # anything else
305 else
307 # back to normal mode
308 $gh->_new_mode(undef);
311 # 1 level heading
312 $l =~ s/^(=+)\s*$/$gh->_process_heading(1,$1)/e;
314 # 2 level heading
315 $l =~ s/^(-+)\s*$/$gh->_process_heading(2,$1)/e;
317 # 3 level heading
318 $l =~ s/^(~+)\s*$/$gh->_process_heading(3,$1)/e;
320 # change ------ into hr
321 $l =~ s/^----*$/$gh->_hr()/e;
323 # push finally
324 $gh->_push($l) if $l;
327 # flush
328 $gh->_new_mode(undef);
330 # postfix
331 $gh->_postfix();
333 # set title
334 ${$gh->{'title'}}=$gh->{'-title'} if ref($gh->{'title'});
336 # set abstract, if not set
337 ${$gh->{'abstract'}}=scalar(@{$gh->{'o'}})
338 if ref($gh->{'abstract'}) and not ${$gh->{'abstract'}};
340 return(@{$gh->{'o'}});
344 =head2 B<process_file>
346 @output=$grutatxt->process_file($filename);
348 Processes a file in Grutatxt format.
350 =cut
352 sub process_file
354 my ($gh,$file)=@_;
356 open F, $file or return(undef);
358 my ($content)=join('',<F>);
359 close F;
361 return($gh->process($content));
365 sub _push
367 my ($gh,$l)=@_;
369 push(@{$gh->{'o'}},$l);
373 sub _process_heading
375 my ($gh,$level,$hd)=@_;
376 my ($l);
378 $l=pop(@{$gh->{'o'}});
380 if($l eq $gh->_empty_line())
382 $gh->_push($l);
383 return($hd);
386 # store title
387 $gh->{'-title'}=$l if $level==1 and not $gh->{'-title'};
389 # store index
390 if(ref($gh->{'index'}))
392 push(@{$gh->{'index'}},"$level,$l");
395 return($gh->_heading($level,$l));
399 sub _calc_col_span
401 my ($gh,$l)=@_;
402 my (@spans);
404 # strip first + and all -
405 $l =~ s/^\+//;
406 $l =~ s/-//g;
408 my ($t)=1; @spans=();
409 for(my $n=0;$n < length($l);$n++)
411 if(substr($l,$n,1) eq '+')
413 push(@spans,$t);
414 $t=1;
416 else
418 # it's a colspan mark:
419 # increment
420 $t++;
424 return(@spans);
428 sub _table_row
430 my ($gh,$str)=@_;
432 my @s=split(/\|/,$str);
434 for(my $n=0;$n < scalar(@s);$n++)
436 ${$gh->{'-table'}}[$n].=' '.$s[$n];
439 return("");
443 sub _pre
445 my ($gh,$l)=@_;
447 # if any other mode is active, add to it
448 if($gh->{'mode'} and $gh->{'mode'} ne "pre")
450 $l =~ s/^\s+//;
452 my ($a)=pop(@{$gh->{'o'}})." ".$l;
453 $gh->_push($a);
454 $l="";
456 else
458 $gh->_new_mode("pre");
461 return($l);
464 # empty stubs for falling through the superclass
466 sub _inline { my ($gh,$l)=@_; $l; }
467 sub _escape { my ($gh,$l)=@_; $l; }
468 sub _empty_line { my ($gh)=@_; ""; }
469 sub _url { my ($gh,$url,$label)=@_; ""; }
470 sub _strong { my ($gh,$str)=@_; $str; }
471 sub _em { my ($gh,$str)=@_; $str; }
472 sub _funcname { my ($gh,$str)=@_; $str; }
473 sub _varname { my ($gh,$str)=@_; $str; }
474 sub _new_mode { my ($gh,$mode)=@; }
475 sub _dl { my ($gh,$str)=@_; $str; }
476 sub _ul { my ($gh,$str)=@_; $str; }
477 sub _ol { my ($gh,$str)=@_; $str; }
478 sub _hr { my ($gh)=@_; "" }
479 sub _heading { my ($gh,$level,$l)=@_; $l; }
480 sub _table { my ($gh,$str)=@_; $str; }
481 sub _prefix { my ($gh)=@_; }
482 sub _postfix { my ($gh)=@_; }
484 ###########################################################
486 =head1 DRIVER SPECIFIC INFORMATION
488 =cut
490 ###########################################################
491 # HTML Driver
493 package Grutatxt::HTML;
495 @ISA=("Grutatxt");
497 =head2 HTML Driver
499 The additional parameters for a new Grutatxt object are:
501 =over 4
503 =item I<table-headers>
505 If this boolean value is set, the first row in tables
506 is assumed to be the heading and rendered using <th>
507 instead of <td> tags.
509 =item I<center-tables>
511 If this boolean value is set, tables are centered.
513 =item I<expand-tables>
515 If this boolean value is set, tables are expanded (width 100%).
517 =item I<dl-as-dl>
519 If this boolean value is set, definition lists will be
520 rendered using <dl>, <dt> and <dd> instead of tables.
522 =item I<header-offset>
524 Offset to be summed to the heading level when rendering
525 <h?> tags (default is 0).
527 =item I<class-oddeven>
529 If this boolean value is set, tables will be rendered
530 with an "oddeven" CSS class, and rows alternately classed
531 as "even" or "odd". If it's not set, no CSS class info
532 is added to tables.
534 =back
536 =cut
538 sub new
540 my ($class,%args)=@_;
541 my ($gh);
543 bless(\%args,$class);
544 $gh=\%args;
546 $gh->{'-process-urls'}=1;
548 return($gh);
552 sub _inline
554 my ($gh,$l)=@_;
556 # accept unnamed and HTML inlines
557 if($l =~ /^<<$/ or $l =~ /^<<\s*html$/i)
559 $gh->{'-inline'}="HTML";
560 return;
563 if($l =~ /^>>$/)
565 delete $gh->{'-inline'};
566 return;
569 if($gh->{'-inline'} eq "HTML")
571 $gh->_push($l);
576 sub _escape
578 my ($gh,$l)=@_;
580 $l =~ s/&/&amp;/g;
581 $l =~ s/</&lt;/g;
582 $l =~ s/>/&gt;/g;
584 return($l);
588 sub _empty_line
590 my ($gh)=@_;
592 return("<p>");
596 sub _url
598 my ($gh,$url,$label)=@_;
600 $label=$url unless $label;
602 return("<a href=\"$url\">$label</a>");
606 sub _strong
608 my ($gh,$str)=@_;
609 return("<strong class=strong>$str</strong>");
613 sub _em
615 my ($gh,$str)=@_;
616 return("<em class=em>$str</em>");
620 sub _funcname
622 my ($gh,$str)=@_;
623 return("<code class=funcname>$str</code>");
627 sub _varname
629 my ($gh,$str)=@_;
630 return("<code class=var>$str</code>");
634 sub _new_mode
636 my ($gh,$mode,$params)=@_;
638 if($mode ne $gh->{'mode'})
640 my $tag;
642 # flush previous list
643 $gh->_push("</$gh->{'mode'}>")
644 if $gh->{'mode'};
646 # send new one
647 $tag=$params ? "<$mode $params>" : "<$mode>";
648 $gh->_push($tag) if $mode;
650 $gh->{'mode'}=$mode;
655 sub _dl
657 my ($gh,$str)=@_;
659 if($gh->{'dl-as-dl'})
661 $gh->_new_mode("dl");
662 return("<dt><strong class=term>$str</strong><dd>");
664 else
666 $gh->_new_mode("table");
667 return("<tr><td valign=top><strong class=term>$1</strong class=strong>&nbsp;&nbsp;</td><td valign=top>");
672 sub _ul
674 my ($gh)=@_;
676 $gh->_new_mode("ul");
677 return("<li>");
681 sub _ol
683 my ($gh)=@_;
685 $gh->_new_mode("ol");
686 return("<li>");
690 sub _hr
692 my ($gh)=@_;
694 return("<hr size=1 noshade>");
698 sub _heading
700 my ($gh,$level,$l)=@_;
702 # substitute anchor spaces with underscores
703 my ($a)=lc($l); $a =~ s/\s/_/g;
705 $l=sprintf("<a name=\"$a\"></a>\n<h%d class=level$level>$l</h%d>",
706 $level+$gh->{'header-offset'},
707 $level+$gh->{'header-offset'});
709 return($l);
713 sub _table
715 my ($gh,$str)=@_;
717 if($gh->{'mode'} eq "table")
719 my ($class)="";
720 my (@spans)=$gh->_calc_col_span($str);
722 # calculate CSS class, if any
723 if($gh->{'class-oddeven'})
725 $class=($gh->{'-tbl-row'} & 1) ? "odd" : "even";
728 $str="<tr $class>";
730 # build columns
731 for(my $n=0;$n < scalar(@{$gh->{'-table'}});$n++)
733 my ($i,$s);
735 $i=${$gh->{'-table'}}[$n];
736 $i="&nbsp;" if $i =~ /^\s*$/;
738 $s=" colspan=$spans[$n]" if $spans[$n] > 1;
740 if($gh->{'table-headers'} and $gh->{'-tbl-row'}==1)
742 $str.="<th $class $s>$i</th>";
744 else
746 $str.="<td $class $s>$i</td>";
750 @{$gh->{'-table'}}=();
751 $gh->{'-tbl-row'}++;
753 else
755 # new table
756 my ($params);
758 $params="border=1";
759 $params.=" width='100\%'" if $gh->{'expand-tables'};
760 $params.=" align=center" if $gh->{'center-tables'};
761 $params.=" class=oddeven" if $gh->{'class-oddeven'};
763 $gh->_new_mode("table", $params);
765 @{$gh->{'-table'}}=();
766 $gh->{'-tbl-row'}=1;
767 $str="";
770 return($str);
774 ###########################################################
775 # troff Driver
777 package Grutatxt::troff;
779 @ISA=("Grutatxt");
781 =head2 troff Driver
783 The troff driver uses the B<-me> macros and B<tbl>. A
784 good way to post-process this output (to PostScript in
785 the example) could be by using
787 groff -t -me -Tps
789 The additional parameters for a new Grutatxt object are:
791 =over 4
793 =item I<normal-size>
795 The point size of normal text. By default is 10.
797 =item I<heading-sizes>
799 This argument must be a reference to an array containing
800 the size in points of the 3 different heading levels. By
801 default, level sizes are [ 20, 18, 15 ].
803 =item I<table-type>
805 The type of table to be rendered by B<tbl>. Can be
806 I<allbox> (all lines rendered; this is the default value),
807 I<box> (only outlined) or I<doublebox> (only outlined by
808 a double line).
810 =back
812 =cut
814 sub new
816 my ($class,%args)=@_;
817 my ($gh);
819 bless(\%args,$class);
820 $gh=\%args;
822 $gh->{'-process-urls'}=0;
824 $gh->{'heading-sizes'}||=[ 20, 18, 15 ];
825 $gh->{'normal-size'}||=10;
826 $gh->{'table-type'}||="allbox"; # box, allbox, doublebox
828 return($gh);
832 sub _prefix
834 my ($gh)=@_;
836 $gh->_push(".nr pp $gh->{'normal-size'}");
837 $gh->_push(".nh");
841 sub _inline
843 my ($gh,$l)=@_;
845 # accept only troff inlines
846 if($l =~ /^<<\s*troff$/i)
848 $gh->{'-inline'}="troff";
849 return;
852 if($l =~ /^>>$/)
854 delete $gh->{'-inline'};
855 return;
858 if($gh->{'-inline'} eq "troff")
860 $gh->_push($l);
865 sub _escape
867 my ($gh,$l)=@_;
869 $l =~ s/\\/\\\\/g;
870 $l =~ s/^'/\\&'/;
872 return($l);
876 sub _empty_line
878 my ($gh)=@_;
880 return(".lp");
884 sub _strong
886 my ($gh,$str)=@_;
887 return("\\fB$str\\fP");
891 sub _em
893 my ($gh,$str)=@_;
894 return("\\fI$str\\fP");
898 sub _funcname
900 my ($gh,$str)=@_;
901 return("\\fB$str\\fP");
905 sub _varname
907 my ($gh,$str)=@_;
908 return("\\fI$str\\fP");
912 sub _new_mode
914 my ($gh,$mode,$params)=@_;
916 if($mode ne $gh->{'mode'})
918 my $tag;
920 # flush previous list
921 if($gh->{'mode'} eq "pre")
923 $gh->_push(".)l");
925 elsif($gh->{'mode'} eq "table")
927 chomp($gh->{'-table-head'});
928 $gh->{'-table-head'} =~ s/\s+$//;
929 $gh->_push($gh->{'-table-head'}.".");
930 $gh->_push($gh->{'-table-body'}.".TE\n.sp 0.6");
933 # send new one
934 if($mode eq "pre")
936 $gh->_push(".(l L");
939 $gh->{'mode'}=$mode;
944 sub _dl
946 my ($gh,$str)=@_;
948 $gh->_new_mode("dl");
949 return(".ip \"$str\"\n");
953 sub _ul
955 my ($gh)=@_;
957 $gh->_new_mode("ul");
958 return(".bu\n");
962 sub _ol
964 my ($gh)=@_;
966 $gh->_new_mode("ol");
967 return(".np\n");
971 sub _hr
973 my ($gh)=@_;
975 return(".hl");
979 sub _heading
981 my ($gh,$level,$l)=@_;
983 $l=".sz ".${$gh->{'heading-sizes'}}[$level - 1]."\n$l\n.sp 0.6";
985 return($l);
989 sub _table
991 my ($gh,$str)=@_;
993 if($gh->{'mode'} eq "table")
995 my ($h,$b);
996 my (@spans)=$gh->_calc_col_span($str);
998 # build columns
999 $h="";
1000 $b="";
1001 for(my $n=0;$n < scalar(@{$gh->{'-table'}});$n++)
1003 my ($i);
1005 if($gh->{'table-headers'} and $gh->{'-tbl-row'}==1)
1007 $h.="cB ";
1009 else
1011 $h.="l ";
1014 # add span columns
1015 $h.="s " x ($spans[$n] - 1) if $spans[$n] > 1;
1017 $b.="#" if $n;
1019 $i=${$gh->{'-table'}}[$n];
1020 $i =~ s/^\s+//;
1021 $i =~ s/\s+$//;
1022 $i =~ s/(\s)+/$1/g;
1023 $b.=$i;
1026 # add a separator
1027 $b.="\n_" if $gh->{'table-headers'} and
1028 $gh->{'-tbl-row'}==1 and
1029 $gh->{'table-type'} ne "allbox";
1031 $gh->{'-table-head'}.="$h\n";
1032 $gh->{'-table-body'}.="$b\n";
1034 @{$gh->{'-table'}}=();
1035 $gh->{'-tbl-row'}++;
1037 else
1039 # new table
1040 $gh->_new_mode("table");
1042 @{$gh->{'-table'}}=();
1043 $gh->{'-tbl-row'}=1;
1045 $gh->{'-table-head'}=".TS\n$gh->{'table-type'} tab (#);\n";
1046 $gh->{'-table-body'}="";
1049 $str="";
1050 return($str);
1054 sub _postfix
1056 my ($gh)=@_;
1058 # add to top headings and footers
1059 unshift(@{$gh->{'o'}},".ef '\%' ''");
1060 unshift(@{$gh->{'o'}},".of '' '\%'");
1061 unshift(@{$gh->{'o'}},".eh '$gh->{'-title'}' ''");
1062 unshift(@{$gh->{'o'}},".oh '' '$gh->{'-title'}'");
1066 =head1 AUTHOR
1068 Angel Ortega angel@triptico.com
1070 =cut