- New markup for quoted blocks.
[grutatxt.git] / Grutatxt.pm
blob45083b99c8f939f46641d0208b33cb545b2856ce
1 #####################################################################
3 # Grutatxt - A text to HTML (and other things) converter
5 # Copyright (C) 2000/2002 Angel Ortega <angel@triptico.com>
7 # This program is free software; you can redistribute it and/or
8 # modify it under the terms of the GNU General Public License
9 # as published by the Free Software Foundation; either version 2
10 # of the License, or (at your option) any later version.
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 # http://www.triptico.com
23 #####################################################################
25 package Grutatxt;
27 use locale;
29 $VERSION='2.0.3';
31 =pod
33 =head1 NAME
35 Grutatxt - Text to HTML (and other formats) converter
37 =head1 SYNOPSIS
39 use Grutatxt;
41 # create a new Grutatxt converter object
42 $grutatxt=new Grutatxt();
44 # process a Grutatxt format string
45 @output=$grutatxt->process($text);
47 # idem for a file
48 @output2=$grutatxt->process_file($file);
50 =head1 DESCRIPTION
52 Grutatxt is a module to process text documents in
53 a special markup format (also called Grutatxt), very
54 similar to plain ASCII text. These documents can be
55 converted to HTML or troff.
57 The markup is designed to be fairly intuitive and
58 straightforward and can include headings, bold and italic
59 text effects, bulleted, numbered and definition lists, URLs,
60 function and variable names, preformatted text, horizontal
61 separators and tables. Special marks can be inserted in the
62 text and a heading-based structural index can be obtained
63 from it.
65 A comprehensive description of the markup is defined in
66 the README file, included with the Grutatxt package (it is
67 written in Grutatxt format itself, so it can be converted
68 using the I<grutatxt> tool to any of the supported formats).
69 The latest version (and more information) can be retrieved
70 from the Grutatxt home page at:
72 http://www.triptico.com/software/grutatxt.html
74 =head1 FUNCTIONS AND METHODS
76 =head2 B<new>
78 $grutatxt=new Grutatxt([ "mode" => $mode, ]
79 [ "title" => \$title, ]
80 [ "marks" => \@marks, ]
81 [ "index" => \@index, ]
82 [ "abstract" => \$abstract, ]
83 [ "strip-parens" => $bool, ]
84 [ "strip-dollars" => $bool, ]
85 [ %driver_specific_arguments ] );
87 Creates a new Grutatxt object instance. All parameters are
88 optional.
90 =over 4
92 =item I<mode>
94 Output format. Can be HTML or troff. HTML is used if not specified.
96 =item I<title>
98 If I<title> is specified as a reference to scalar, the first
99 level 1 heading found in the text is stored inside it.
101 =item I<marks>
103 Marks in the Grutatxt markup are created by inserting the
104 string <-> alone in a line. If I<marks> is specified as a
105 reference to array, it will be filled with the subscripts
106 (relative to the output array) of the lines where the marks
107 are found in the text.
109 =item I<index>
111 If I<index> is specified as a reference to array, it will
112 be filled with strings in the format
114 level,heading
116 This information can be used to build a table of contents
117 of the processed text.
119 =item I<strip-parens>
121 Function names in the Grutatxt markup are strings of
122 alphanumeric characters immediately followed by a pair
123 of open and close parentheses. If this boolean value is
124 set, function names found in the processed text will have
125 their parentheses deleted.
127 =item I<strip-dollars>
129 Variable names in the Grutatxt markup are strings of
130 alphanumeric characters preceded by a dollar sign.
131 If this boolean value is set, variable names found in
132 the processed text will have the dollar sign deleted.
134 =item I<abstract>
136 The I<abstract> of a Grutatxt document is the fragment of text
137 from the beginning of the document to the end of the first
138 paragraph after the title. If I<abstract> is specified as a
139 reference to scalar, it will contain (after each call to the
140 B<process()> method) the subscript of the element of the output
141 array that marks the end of the subject.
143 =back
145 =cut
147 sub new
149 my ($class,%args)=@_;
150 my ($gh);
152 $args{'mode'}||='HTML';
154 $class.="::".$args{'mode'};
156 $gh=new $class(%args);
158 return($gh);
162 =head2 B<process>
164 @output=$grutatxt->process($text);
166 Processes a text in Grutatxt format. The result is returned
167 as an array of lines.
169 =cut
171 sub process
173 my ($gh,$content)=@_;
174 my ($p);
176 # clean output
177 @{$gh->{'o'}}=();
179 # clean title and paragraph numbers
180 $gh->{'-title'}="";
181 $gh->{'-p'}=0;
183 # clean marks
184 @{$gh->{'marks'}}=() if ref($gh->{'marks'});
186 # clean index
187 @{$gh->{'index'}}=() if ref($gh->{'index'});
189 # reset abstract line
190 ${$gh->{'abstract'}}=0 if ref($gh->{'abstract'});
192 # insert prefix
193 $gh->_prefix();
195 $gh->{'-mode'}=undef;
197 foreach my $l (split(/\n/,$content))
199 # inline data (passthrough)
200 if($l =~ /^<<$/ .. $l =~ /^>>$/)
202 $gh->_inline($l);
203 next;
206 # marks
207 if($l =~ /^\s*<\->\s*$/)
209 push(@{$gh->{'marks'}},scalar(@{$gh->{'o'}}))
210 if ref($gh->{'marks'});
212 next;
215 # escape possibly dangerous characters
216 $l=$gh->_escape($l);
218 # empty lines
219 $l =~ s/^\r$//ge;
220 if($l =~ s/^$/$gh->_empty_line()/ge)
222 # mark the abstract end
223 if($gh->{'-title'})
225 $gh->{'-p'}++;
227 # mark abstract if it's the
228 # second paragraph from the title
229 ${$gh->{'abstract'}}=scalar(@{$gh->{'o'}})-1
230 if $gh->{'-p'}==2;
234 if($gh->{'-process-urls'})
236 # URLs followed by a parenthesized phrase
237 $l =~ s/(http:\/\/\S+)\s+\(([^\)]+)\)/$gh->_url($1,$2)/ge;
239 # URLs without phrase
240 $l =~ s/([^=][^\"])(http:\/\/\S+)/$1.$gh->_url($2)/ge;
241 $l =~ s/^(http:\/\/\S+)/$gh->_url($1)/ge;
244 # change '''text''' and *text* into strong emphasis
245 $l =~ s/\'\'\'([^\'][^\'][^\']*)\'\'\'/$gh->_strong($1)/ge;
246 $l =~ s/\*(\S[^\*]+\S)\*/$gh->_strong($1)/ge;
247 $l =~ s/\*(\S+)\*/$gh->_strong($1)/ge;
249 # change ''text'' and _text_ into emphasis
250 $l =~ s/\'\'([^\'][^\']*)\'\'/$gh->_em($1)/ge;
251 $l =~ s/\b_(\S[^_]*\S)_\b/$gh->_em($1)/ge;
252 $l =~ s/\b_(\S+)_\b/$gh->_em($1)/ge;
254 # enclose function names
255 if($gh->{'strip-parens'})
257 $l =~ s/(\w+)\(\)/$gh->_funcname($1)/ge;
259 else
261 $l =~ s/(\w+)\(\)/$gh->_funcname($1."()")/ge;
264 # enclose variable names
265 if($gh->{'strip-dollars'})
267 $l =~ s/\$([\w_\.]+)/$gh->_varname($1)/ge;
269 else
271 $l =~ s/(\$[\w_\.]+)/$gh->_varname($1)/ge;
274 # main switch
276 # definition list
277 if($l =~ s/^\s\*\s+([\w\s\-]+)\:\s+/$gh->_dl($1)/e)
281 # unsorted list
282 elsif($l =~ s/^\s\*\s+/$gh->_ul()/e or
283 $l =~ s/^\s\-\s+/$gh->_ul()/e)
287 # sorted list
288 elsif($l =~ s/^\s\#\s+/$gh->_ol()/e or
289 $l =~ s/^\s1\s+/$gh->_ol()/e)
293 # quoted block
294 elsif($l =~ s/^\s\"/$gh->_blockquote()/e)
298 # table rows
299 elsif($l =~ s/^\s*\|(.*)\|\s*$/$gh->_table_row($1)/e)
303 # table heading / end of row
304 elsif($l =~ s/^\s*(\+[-\+\|]+\+)\s*$/$gh->_table($1)/e)
308 # preformatted text
309 elsif($l =~ s/^(\s.*)$/$gh->_pre($1)/e)
313 # anything else
314 else
316 # back to normal mode
317 $gh->_new_mode(undef);
320 # 1 level heading
321 $l =~ s/^(=+)\s*$/$gh->_process_heading(1,$1)/e;
323 # 2 level heading
324 $l =~ s/^(-+)\s*$/$gh->_process_heading(2,$1)/e;
326 # 3 level heading
327 $l =~ s/^(~+)\s*$/$gh->_process_heading(3,$1)/e;
329 # change ------ into hr
330 $l =~ s/^----*$/$gh->_hr()/e;
332 # push finally
333 $gh->_push($l) if $l;
336 # flush
337 $gh->_new_mode(undef);
339 # postfix
340 $gh->_postfix();
342 # set title
343 ${$gh->{'title'}}=$gh->{'-title'} if ref($gh->{'title'});
345 # set abstract, if not set
346 ${$gh->{'abstract'}}=scalar(@{$gh->{'o'}})
347 if ref($gh->{'abstract'}) and not ${$gh->{'abstract'}};
349 return(@{$gh->{'o'}});
353 =head2 B<process_file>
355 @output=$grutatxt->process_file($filename);
357 Processes a file in Grutatxt format.
359 =cut
361 sub process_file
363 my ($gh,$file)=@_;
365 open F, $file or return(undef);
367 my ($content)=join('',<F>);
368 close F;
370 return($gh->process($content));
374 sub _push
376 my ($gh,$l)=@_;
378 push(@{$gh->{'o'}},$l);
382 sub _process_heading
384 my ($gh,$level,$hd)=@_;
385 my ($l);
387 $l=pop(@{$gh->{'o'}});
389 if($l eq $gh->_empty_line())
391 $gh->_push($l);
392 return($hd);
395 # store title
396 $gh->{'-title'}=$l if $level==1 and not $gh->{'-title'};
398 # store index
399 if(ref($gh->{'index'}))
401 push(@{$gh->{'index'}},"$level,$l");
404 return($gh->_heading($level,$l));
408 sub _calc_col_span
410 my ($gh,$l)=@_;
411 my (@spans);
413 # strip first + and all -
414 $l =~ s/^\+//;
415 $l =~ s/-//g;
417 my ($t)=1; @spans=();
418 for(my $n=0;$n < length($l);$n++)
420 if(substr($l,$n,1) eq '+')
422 push(@spans,$t);
423 $t=1;
425 else
427 # it's a colspan mark:
428 # increment
429 $t++;
433 return(@spans);
437 sub _table_row
439 my ($gh,$str)=@_;
441 my @s=split(/\|/,$str);
443 for(my $n=0;$n < scalar(@s);$n++)
445 ${$gh->{'-table'}}[$n].=' '.$s[$n];
448 return("");
452 sub _pre
454 my ($gh,$l)=@_;
456 # if any other mode is active, add to it
457 if($gh->{'-mode'} and $gh->{'-mode'} ne "pre")
459 $l =~ s/^\s+//;
461 my ($a)=pop(@{$gh->{'o'}})." ".$l;
462 $gh->_push($a);
463 $l="";
465 else
467 $gh->_new_mode("pre");
470 return($l);
473 # empty stubs for falling through the superclass
475 sub _inline { my ($gh,$l)=@_; $l; }
476 sub _escape { my ($gh,$l)=@_; $l; }
477 sub _empty_line { my ($gh)=@_; ""; }
478 sub _url { my ($gh,$url,$label)=@_; ""; }
479 sub _strong { my ($gh,$str)=@_; $str; }
480 sub _em { my ($gh,$str)=@_; $str; }
481 sub _funcname { my ($gh,$str)=@_; $str; }
482 sub _varname { my ($gh,$str)=@_; $str; }
483 sub _new_mode { my ($gh,$mode)=@; }
484 sub _dl { my ($gh,$str)=@_; $str; }
485 sub _ul { my ($gh,$str)=@_; $str; }
486 sub _ol { my ($gh,$str)=@_; $str; }
487 sub _blockquote { my ($gh,$str)=@_; $str; }
488 sub _hr { my ($gh)=@_; "" }
489 sub _heading { my ($gh,$level,$l)=@_; $l; }
490 sub _table { my ($gh,$str)=@_; $str; }
491 sub _prefix { my ($gh)=@_; }
492 sub _postfix { my ($gh)=@_; }
494 ###########################################################
496 =head1 DRIVER SPECIFIC INFORMATION
498 =cut
500 ###########################################################
501 # HTML Driver
503 package Grutatxt::HTML;
505 @ISA=("Grutatxt");
507 =head2 HTML Driver
509 The additional parameters for a new Grutatxt object are:
511 =over 4
513 =item I<table-headers>
515 If this boolean value is set, the first row in tables
516 is assumed to be the heading and rendered using <th>
517 instead of <td> tags.
519 =item I<center-tables>
521 If this boolean value is set, tables are centered.
523 =item I<expand-tables>
525 If this boolean value is set, tables are expanded (width 100%).
527 =item I<dl-as-dl>
529 If this boolean value is set, definition lists will be
530 rendered using <dl>, <dt> and <dd> instead of tables.
532 =item I<header-offset>
534 Offset to be summed to the heading level when rendering
535 <h?> tags (default is 0).
537 =item I<class-oddeven>
539 If this boolean value is set, tables will be rendered
540 with an "oddeven" CSS class, and rows alternately classed
541 as "even" or "odd". If it's not set, no CSS class info
542 is added to tables.
544 =back
546 =cut
548 sub new
550 my ($class,%args)=@_;
551 my ($gh);
553 bless(\%args,$class);
554 $gh=\%args;
556 $gh->{'-process-urls'}=1;
558 return($gh);
562 sub _inline
564 my ($gh,$l)=@_;
566 # accept unnamed and HTML inlines
567 if($l =~ /^<<$/ or $l =~ /^<<\s*html$/i)
569 $gh->{'-inline'}="HTML";
570 return;
573 if($l =~ /^>>$/)
575 delete $gh->{'-inline'};
576 return;
579 if($gh->{'-inline'} eq "HTML")
581 $gh->_push($l);
586 sub _escape
588 my ($gh,$l)=@_;
590 $l =~ s/&/&amp;/g;
591 $l =~ s/</&lt;/g;
592 $l =~ s/>/&gt;/g;
594 return($l);
598 sub _empty_line
600 my ($gh)=@_;
602 return("<p>");
606 sub _url
608 my ($gh,$url,$label)=@_;
610 $label=$url unless $label;
612 return("<a href=\"$url\">$label</a>");
616 sub _strong
618 my ($gh,$str)=@_;
619 return("<strong class=strong>$str</strong>");
623 sub _em
625 my ($gh,$str)=@_;
626 return("<em class=em>$str</em>");
630 sub _funcname
632 my ($gh,$str)=@_;
633 return("<code class=funcname>$str</code>");
637 sub _varname
639 my ($gh,$str)=@_;
640 return("<code class=var>$str</code>");
644 sub _new_mode
646 my ($gh,$mode,$params)=@_;
648 if($mode ne $gh->{'-mode'})
650 my $tag;
652 # flush previous list
653 $gh->_push("</$gh->{'-mode'}>")
654 if $gh->{'-mode'};
656 # send new one
657 $tag=$params ? "<$mode $params>" : "<$mode>";
658 $gh->_push($tag) if $mode;
660 $gh->{'-mode'}=$mode;
665 sub _dl
667 my ($gh,$str)=@_;
669 if($gh->{'dl-as-dl'})
671 $gh->_new_mode("dl");
672 return("<dt><strong class=term>$str</strong><dd>");
674 else
676 $gh->_new_mode("table");
677 return("<tr><td valign=top><strong class=term>$1</strong>&nbsp;&nbsp;</td><td valign=top>");
682 sub _ul
684 my ($gh)=@_;
686 $gh->_new_mode("ul");
687 return("<li>");
691 sub _ol
693 my ($gh)=@_;
695 $gh->_new_mode("ol");
696 return("<li>");
700 sub _blockquote
702 my ($gh)=@_;
704 $gh->_new_mode("blockquote");
705 return("\"");
709 sub _hr
711 my ($gh)=@_;
713 return("<hr size=1 noshade>");
717 sub _heading
719 my ($gh,$level,$l)=@_;
721 # substitute anchor spaces with underscores
722 my ($a)=lc($l); $a =~ s/\s/_/g;
724 $l=sprintf("<a name=\"%s\"></a>\n<h%d class=level$level>%s</h%d>",
725 $a, $level+$gh->{'header-offset'},
726 $l, $level+$gh->{'header-offset'});
728 return($l);
732 sub _table
734 my ($gh,$str)=@_;
736 if($gh->{'-mode'} eq "table")
738 my ($class)="";
739 my (@spans)=$gh->_calc_col_span($str);
741 # calculate CSS class, if any
742 if($gh->{'class-oddeven'})
744 $class=($gh->{'-tbl-row'} & 1) ? "odd" : "even";
747 $str="<tr $class>";
749 # build columns
750 for(my $n=0;$n < scalar(@{$gh->{'-table'}});$n++)
752 my ($i,$s);
754 $i=${$gh->{'-table'}}[$n];
755 $i="&nbsp;" if $i =~ /^\s*$/;
757 $s=" colspan=$spans[$n]" if $spans[$n] > 1;
759 if($gh->{'table-headers'} and $gh->{'-tbl-row'}==1)
761 $str.="<th $class $s>$i</th>";
763 else
765 $str.="<td $class $s>$i</td>";
769 @{$gh->{'-table'}}=();
770 $gh->{'-tbl-row'}++;
772 else
774 # new table
775 my ($params);
777 $params="border=1";
778 $params.=" width='100\%'" if $gh->{'expand-tables'};
779 $params.=" align=center" if $gh->{'center-tables'};
780 $params.=" class=oddeven" if $gh->{'class-oddeven'};
782 $gh->_new_mode("table", $params);
784 @{$gh->{'-table'}}=();
785 $gh->{'-tbl-row'}=1;
786 $str="";
789 return($str);
793 ###########################################################
794 # troff Driver
796 package Grutatxt::troff;
798 @ISA=("Grutatxt");
800 =head2 troff Driver
802 The troff driver uses the B<-me> macros and B<tbl>. A
803 good way to post-process this output (to PostScript in
804 the example) could be by using
806 groff -t -me -Tps
808 The additional parameters for a new Grutatxt object are:
810 =over 4
812 =item I<normal-size>
814 The point size of normal text. By default is 10.
816 =item I<heading-sizes>
818 This argument must be a reference to an array containing
819 the size in points of the 3 different heading levels. By
820 default, level sizes are [ 20, 18, 15 ].
822 =item I<table-type>
824 The type of table to be rendered by B<tbl>. Can be
825 I<allbox> (all lines rendered; this is the default value),
826 I<box> (only outlined) or I<doublebox> (only outlined by
827 a double line).
829 =back
831 =cut
833 sub new
835 my ($class,%args)=@_;
836 my ($gh);
838 bless(\%args,$class);
839 $gh=\%args;
841 $gh->{'-process-urls'}=0;
843 $gh->{'heading-sizes'}||=[ 20, 18, 15 ];
844 $gh->{'normal-size'}||=10;
845 $gh->{'table-type'}||="allbox"; # box, allbox, doublebox
847 return($gh);
851 sub _prefix
853 my ($gh)=@_;
855 $gh->_push(".nr pp $gh->{'normal-size'}");
856 $gh->_push(".nh");
860 sub _inline
862 my ($gh,$l)=@_;
864 # accept only troff inlines
865 if($l =~ /^<<\s*troff$/i)
867 $gh->{'-inline'}="troff";
868 return;
871 if($l =~ /^>>$/)
873 delete $gh->{'-inline'};
874 return;
877 if($gh->{'-inline'} eq "troff")
879 $gh->_push($l);
884 sub _escape
886 my ($gh,$l)=@_;
888 $l =~ s/\\/\\\\/g;
889 $l =~ s/^'/\\&'/;
891 return($l);
895 sub _empty_line
897 my ($gh)=@_;
899 return(".lp");
903 sub _strong
905 my ($gh,$str)=@_;
906 return("\\fB$str\\fP");
910 sub _em
912 my ($gh,$str)=@_;
913 return("\\fI$str\\fP");
917 sub _funcname
919 my ($gh,$str)=@_;
920 return("\\fB$str\\fP");
924 sub _varname
926 my ($gh,$str)=@_;
927 return("\\fI$str\\fP");
931 sub _new_mode
933 my ($gh,$mode,$params)=@_;
935 if($mode ne $gh->{'-mode'})
937 my $tag;
939 # flush previous list
940 if($gh->{'-mode'} eq "pre")
942 $gh->_push(".)l");
944 elsif($gh->{'-mode'} eq "table")
946 chomp($gh->{'-table-head'});
947 $gh->{'-table-head'} =~ s/\s+$//;
948 $gh->_push($gh->{'-table-head'}.".");
949 $gh->_push($gh->{'-table-body'}.".TE\n.sp 0.6");
951 elsif($gh->{'-mode'} eq "blockquote")
953 $gh->_push(".)q");
956 # send new one
957 if($mode eq "pre")
959 $gh->_push(".(l L");
961 elsif($mode eq "blockquote")
963 $gh->_push(".(q");
966 $gh->{'-mode'}=$mode;
971 sub _dl
973 my ($gh,$str)=@_;
975 $gh->_new_mode("dl");
976 return(".ip \"$str\"\n");
980 sub _ul
982 my ($gh)=@_;
984 $gh->_new_mode("ul");
985 return(".bu\n");
989 sub _ol
991 my ($gh)=@_;
993 $gh->_new_mode("ol");
994 return(".np\n");
998 sub _blockquote
1000 my ($gh)=@_;
1002 $gh->_new_mode("blockquote");
1003 return("\"");
1007 sub _hr
1009 my ($gh)=@_;
1011 return(".hl");
1015 sub _heading
1017 my ($gh,$level,$l)=@_;
1019 $l=".sz ".${$gh->{'heading-sizes'}}[$level - 1]."\n$l\n.sp 0.6";
1021 return($l);
1025 sub _table
1027 my ($gh,$str)=@_;
1029 if($gh->{'-mode'} eq "table")
1031 my ($h,$b);
1032 my (@spans)=$gh->_calc_col_span($str);
1034 # build columns
1035 $h="";
1036 $b="";
1037 for(my $n=0;$n < scalar(@{$gh->{'-table'}});$n++)
1039 my ($i);
1041 if($gh->{'table-headers'} and $gh->{'-tbl-row'}==1)
1043 $h.="cB ";
1045 else
1047 $h.="l ";
1050 # add span columns
1051 $h.="s " x ($spans[$n] - 1) if $spans[$n] > 1;
1053 $b.="#" if $n;
1055 $i=${$gh->{'-table'}}[$n];
1056 $i =~ s/^\s+//;
1057 $i =~ s/\s+$//;
1058 $i =~ s/(\s)+/$1/g;
1059 $b.=$i;
1062 # add a separator
1063 $b.="\n_" if $gh->{'table-headers'} and
1064 $gh->{'-tbl-row'}==1 and
1065 $gh->{'table-type'} ne "allbox";
1067 $gh->{'-table-head'}.="$h\n";
1068 $gh->{'-table-body'}.="$b\n";
1070 @{$gh->{'-table'}}=();
1071 $gh->{'-tbl-row'}++;
1073 else
1075 # new table
1076 $gh->_new_mode("table");
1078 @{$gh->{'-table'}}=();
1079 $gh->{'-tbl-row'}=1;
1081 $gh->{'-table-head'}=".TS\n$gh->{'table-type'} tab (#);\n";
1082 $gh->{'-table-body'}="";
1085 $str="";
1086 return($str);
1090 sub _postfix
1092 my ($gh)=@_;
1094 # add to top headings and footers
1095 unshift(@{$gh->{'o'}},".ef '\%' ''");
1096 unshift(@{$gh->{'o'}},".of '' '\%'");
1097 unshift(@{$gh->{'o'}},".eh '$gh->{'-title'}' ''");
1098 unshift(@{$gh->{'o'}},".oh '' '$gh->{'-title'}'");
1102 =head1 AUTHOR
1104 Angel Ortega angel@triptico.com
1106 =cut