Markdown API: eliminate problematic xmlcheck == 1
[markdown.git] / Markdown.pl
blob65ded296e9b033a1c04b687b759c1873213a95d0
1 #!/usr/bin/env perl
4 # Markdown -- A text-to-HTML conversion tool for web writers
6 # Copyright (C) 2004 John Gruber
7 # Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
8 # All rights reserved.
9 # License is Modified BSD (aka 3-clause BSD) License\n";
10 # See LICENSE file (or <https://opensource.org/licenses/BSD-3-Clause>)
13 package Markdown;
15 use 5.008;
16 use strict;
17 use warnings;
19 use Encode;
21 use vars qw($COPYRIGHT $VERSION @ISA @EXPORT_OK);
23 BEGIN {*COPYRIGHT =
24 \"Copyright (C) 2004 John Gruber
25 Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
26 All rights reserved.
28 *VERSION = \"1.1.11-PRE"
31 use Exporter ();
32 use Digest::MD5 qw(md5 md5_hex);
33 use File::Basename qw(basename);
34 use Scalar::Util qw(refaddr looks_like_number);
35 my ($hasxml, $hasxml_err); BEGIN { ($hasxml, $hasxml_err) = (0, "") }
36 my ($hasxmlp, $hasxmlp_err); BEGIN { ($hasxmlp, $hasxmlp_err) = (0, "") }
37 BEGIN {
38 @ISA = qw(Exporter);
39 @EXPORT_OK = qw(Markdown ProcessRaw GenerateStyleSheet SetWikiOpts SplitURL
40 escapeXML unescapeXML ResolveFragment);
41 $INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'};
44 close(DATA) if fileno(DATA);
45 exit(&_main(@ARGV)||0) unless caller;
47 sub fauxdie($) {
48 my $msg = join(" ", @_);
49 $msg =~ s/\s+$//os;
50 printf STDERR "%s: fatal: %s\n", basename($0), $msg;
51 exit 1;
54 my $encoder;
55 BEGIN {
56 $encoder = Encode::find_encoding('Windows-1252') ||
57 Encode::find_encoding('ISO-8859-1') or
58 die "failed to load ISO-8859-1 encoder\n";
62 # Global default settings:
64 my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width,
65 $g_start_p, $g_close_p);
66 BEGIN {
67 $g_style_prefix = "_markdown-"; # Prefix for markdown css class styles
68 $g_empty_element_suffix = " />"; # Change to ">" for HTML output
69 $g_indent_width = 4; # Number of spaces considered new level
70 $g_start_p = "<p>"; # _FormParagraphs open paragraph tag
71 $g_close_p = "</p>"; # _FormParagraphs close paragraph tag
76 # Globals:
79 # Style sheet template
80 my $g_style_sheet;
82 # Permanent block id table
83 my %g_perm_block_ids;
85 # Global hashes, used by various utility routines
86 my %g_urls;
87 my %g_titles;
88 my %g_anchors;
89 my %g_anchors_id;
90 my %g_block_ids;
91 my %g_code_block_ids;
92 my %g_html_blocks;
93 my %g_code_blocks;
94 my @g_xml_comments;
95 my %opt;
96 my @autonum;
98 # Return a "block id" to use to identify the block that does not contain
99 # any characters that could be misinterpreted by the rest of the code
100 # Originally this used md5_hex but that's unnecessarily slow
101 # Instead just use the refaddr of the scalar ref of the entry for that
102 # key in either the global or, if the optional second argument is true,
103 # permanent table. To avoid the result being confused with anything
104 # else, it's prefixed with a control character and suffixed with another
105 # both of which are not allowed by the XML standard or Unicode.
106 sub block_id {
107 $_[1] or return "\5".refaddr(\$g_block_ids{$_[0]})."\6";
108 $_[1] == 1 and return "\2".refaddr(\$g_perm_block_ids{$_[0]})."\3";
109 $_[1] == 2 and return "\25".refaddr(\$g_code_block_ids{$_[0]})."\26";
110 die "programmer error: bad block_id type $_[1]";
113 # Regex to match balanced [brackets]. See Friedl's
114 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
115 my $g_nested_brackets;
116 BEGIN {
117 $g_nested_brackets = qr{
118 (?> # Atomic matching
119 [^\[\]]+ # Anything other than brackets
122 (??{ $g_nested_brackets }) # Recursive set of nested brackets
128 # Regex to match balanced (parentheses)
129 my $g_nested_parens;
130 BEGIN {
131 $g_nested_parens = qr{
132 (?> # Atomic matching
133 [^\(\)]+ # Anything other than parentheses
136 (??{ $g_nested_parens }) # Recursive set of nested parentheses
142 # Table of hash values for escaped characters:
143 my %g_escape_table;
144 BEGIN {
145 $g_escape_table{""} = "\2\3";
146 foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:<") {
147 $g_escape_table{$char} = block_id($char,1);
151 # Used to track when we're inside an ordered or unordered list
152 # (see _ProcessListItems() for details):
153 my $g_list_level;
154 BEGIN {
155 $g_list_level = 0;
159 #### Blosxom plug-in interface ##########################################
160 my $_haveBX;
161 BEGIN {
162 no warnings 'once';
163 $_haveBX = defined($blosxom::version);
166 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
167 # which posts Markdown should process, using a "meta-markup: markdown"
168 # header. If it's set to 0 (the default), Markdown will process all
169 # entries.
170 my $g_blosxom_use_meta;
171 BEGIN {
172 $g_blosxom_use_meta = 0;
175 sub start { 1; }
176 sub story {
177 my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
179 if ((! $g_blosxom_use_meta) or
180 (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
182 $$body_ref = Markdown($$body_ref);
188 #### Movable Type plug-in interface #####################################
189 my $_haveMT = eval {require MT; 1;}; # Test to see if we're running in MT
190 my $_haveMT3 = $_haveMT && eval {require MT::Plugin; 1;}; # and MT >= MT 3.0.
192 if ($_haveMT) {
193 require MT;
194 import MT;
195 require MT::Template::Context;
196 import MT::Template::Context;
198 if ($_haveMT3) {
199 require MT::Plugin;
200 import MT::Plugin;
201 my $plugin = new MT::Plugin({
202 name => "Markdown",
203 description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
204 doc_link => 'https://daringfireball.net/projects/markdown/'
206 MT->add_plugin( $plugin );
209 MT::Template::Context->add_container_tag(MarkdownOptions => sub {
210 my $ctx = shift;
211 my $args = shift;
212 my $builder = $ctx->stash('builder');
213 my $tokens = $ctx->stash('tokens');
215 if (defined ($args->{'output'}) ) {
216 $ctx->stash('markdown_output', lc $args->{'output'});
219 defined (my $str = $builder->build($ctx, $tokens) )
220 or return $ctx->error($builder->errstr);
221 $str; # return value
224 MT->add_text_filter('markdown' => {
225 label => 'Markdown',
226 docs => 'https://daringfireball.net/projects/markdown/',
227 on_format => sub {
228 my $text = shift;
229 my $ctx = shift;
230 my $raw = 0;
231 if (defined $ctx) {
232 my $output = $ctx->stash('markdown_output');
233 if (defined $output && $output =~ m/^html/i) {
234 $g_empty_element_suffix = ">";
235 $ctx->stash('markdown_output', '');
237 elsif (defined $output && $output eq 'raw') {
238 $raw = 1;
239 $ctx->stash('markdown_output', '');
241 else {
242 $raw = 0;
243 $g_empty_element_suffix = " />";
246 $text = $raw ? $text : Markdown($text);
247 $text;
251 # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
252 my $smartypants;
255 no warnings "once";
256 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
259 if ($smartypants) {
260 MT->add_text_filter('markdown_with_smartypants' => {
261 label => 'Markdown With SmartyPants',
262 docs => 'https://daringfireball.net/projects/markdown/',
263 on_format => sub {
264 my $text = shift;
265 my $ctx = shift;
266 if (defined $ctx) {
267 my $output = $ctx->stash('markdown_output');
268 if (defined $output && $output eq 'html') {
269 $g_empty_element_suffix = ">";
271 else {
272 $g_empty_element_suffix = " />";
275 $text = Markdown($text);
276 $text = $smartypants->($text, '1');
282 sub _tabDefault {
283 return $_haveBX || $_haveMT ? 4 : 8;
286 sub _strip {
287 my $str = shift;
288 defined($str) or return undef;
289 $str =~ s/^\s+//;
290 $str =~ s/\s+$//;
291 $str =~ s/\s+/ /g;
292 $str;
295 my %_yamlmode;
296 BEGIN {%_yamlmode = (
297 disable => 0,
298 reveal => 1,
299 enable => 1,
300 conceal => 1,
301 show => -1,
302 unknown => -1,
303 strip => -1
305 my %_yamlvis;
306 BEGIN {%_yamlvis = (
307 disable => 0,
308 reveal => 1,
309 enable => -1,
310 conceal => 0,
311 show => 1,
312 unknown => -1,
313 strip => 0
316 sub _require_pod_usage() {
317 require Pod::Usage;
318 eval 'require Pod::Text::Termcap; 1;' and
319 @Pod::Usage::ISA = (qw( Pod::Text::Termcap ));
322 #### BBEdit/command-line text filter interface ##########################
323 sub _main {
324 local *ARGV = \@_;
327 #### Check for command-line switches: #################
328 my %options = ();
329 my %cli_opts;
330 my $raw = 0;
331 use Getopt::Long;
332 Getopt::Long::Configure(qw(bundling require_order pass_through));
333 GetOptions(
334 'help' => sub {
335 _require_pod_usage;
336 Pod::Usage::pod2usage(-verbose => 2, -exitval => 0)},
337 'h' => sub {
338 _require_pod_usage;
339 Pod::Usage::pod2usage(-verbose => 0, -exitval => 0)},
340 'version|V' => sub { # Version info
341 print "\nThis is Markdown, version $VERSION.\n", $COPYRIGHT;
342 print "License is Modified BSD (aka 3-clause BSD) License\n";
343 print "<https://opensource.org/licenses/BSD-3-Clause>\n";
344 exit 0},
345 'shortversion|short-version|s' => sub { # Just the version number string
346 print $VERSION;
347 exit 0},
348 'html4tags' => \$cli_opts{'html4tags'},
349 'deprecated' => \$cli_opts{'deprecated'},
350 'sanitize' => \$cli_opts{'sanitize'},
351 'no-sanitize' => sub {$cli_opts{'sanitize'} = 0},
352 'validate-xml' => sub {$cli_opts{'validate-xml'} = 1},
353 'validate-xml-internal' => sub {$cli_opts{'validate-xml'} = 2},
354 'no-validate-xml' => sub {$cli_opts{'validate-xml'} = 0},
355 'stripcomments|strip-comments' => \$cli_opts{'stripcomments'},
356 'no-stripcomments|no-strip-comments' => sub {$cli_opts{'stripcomments'} = 0},
357 'keepabs|keep-abs|k' => \$cli_opts{'keepabs'},
358 'absroot|a=s' => \$cli_opts{'absroot'},
359 'base|b=s' => \$cli_opts{'base'},
360 'htmlroot|r=s' => \$cli_opts{'htmlroot'},
361 'imageroot|i=s' => \$cli_opts{'imageroot'},
362 'wiki|w:s' => \$cli_opts{'wiki'},
363 'tabwidth|tab-width=s' => \$cli_opts{'tabwidth'},
364 'autonumber|auto-number' => \$cli_opts{'autonumber'},
365 'raw' => sub { $cli_opts{'raw'} = 1 },
366 'raw-xml' => sub { $cli_opts{'raw'} = 1 },
367 'raw-html' => sub { $cli_opts{'raw'} = 2 },
368 'stylesheet|style-sheet' => \$cli_opts{'stylesheet'},
369 'no-stylesheet|no-style-sheet' => sub {$cli_opts{'stylesheet'} = 0},
370 'stub' => \$cli_opts{'stub'},
371 'yaml:s' => \$cli_opts{'yaml'},
373 defined($cli_opts{'raw'}) or $cli_opts{'raw'} = 0;
374 my $stub = 0;
375 if ($cli_opts{'stub'}) {
376 $stub = 1;
378 if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
379 $options{empty_element_suffix} = ">";
380 $stub = -$stub;
382 if ($cli_opts{'deprecated'}) { # Allow <dir> and <menu> tags to pass through
383 _SetAllowedTag("dir");
384 _SetAllowedTag("menu");
386 my $xmlcheck;
387 $options{sanitize} = 1; # sanitize by default
388 $options{sanitize} = $cli_opts{'sanitize'} if defined($cli_opts{'sanitize'});
389 $xmlcheck = $options{sanitize} ? 2 : 0;
390 $xmlcheck = $cli_opts{'validate-xml'} if defined($cli_opts{'validate-xml'});
391 $options{stripcomments} = $cli_opts{'stripcomments'} if defined($cli_opts{'stripcomments'});
392 die "--html4tags and --validate-xml are incompatible\n"
393 if $cli_opts{'html4tags'} && $xmlcheck == 1;
394 die "--no-sanitize and --validate-xml-internal are incompatible\n"
395 if !$options{'sanitize'} && $xmlcheck == 2;
396 die "--no-sanitize and --strip-comments are incompatible\n"
397 if !$options{'sanitize'} && $options{stripcomments};
398 die "--raw-html requires --validate-xml-internal\n"
399 if $cli_opts{'raw'} == 2 && $xmlcheck != 2;
400 if ($xmlcheck == 1) {
401 eval { require XML::Simple; 1 } and $hasxml = 1 or $hasxml_err = $@;
402 eval { require XML::Parser; 1 } and $hasxmlp = 1 or $hasxmlp_err = $@ unless $hasxml;
403 die "$hasxml_err$hasxmlp_err" unless $hasxml || $hasxmlp;
405 if ($cli_opts{'tabwidth'}) {
406 my $tw = $cli_opts{'tabwidth'};
407 die "invalid tab width (must be integer)\n" unless looks_like_number $tw;
408 die "invalid tab width (must be >= 2 and <= 32)\n" unless $tw >= 2 && $tw <= 32;
409 $options{tab_width} = int(0+$tw);
411 $options{auto_number} = 6 if $cli_opts{'autonumber'};
412 $options{keepabs} = $cli_opts{'keepabs'};
413 $options{abs_prefix} = ""; # no abs prefix by default
414 if ($cli_opts{'absroot'}) { # Use abs prefix for absolute path URLs
415 my $abs = $cli_opts{'absroot'};
416 $abs =~ s{/+$}{};
417 $options{abs_prefix} = $abs;
419 $options{base_prefix} = ""; # no base prefix by default
420 if ($cli_opts{'base'}) { # Use base prefix for fragment URLs
421 $options{base_prefix} = $cli_opts{'base'};
423 if ($cli_opts{'htmlroot'}) { # Use URL prefix
424 $options{url_prefix} = $cli_opts{'htmlroot'};
426 if ($cli_opts{'imageroot'}) { # Use image URL prefix
427 $options{img_prefix} = $cli_opts{'imageroot'};
429 SetWikiOpts(\%options, $cli_opts{'wiki'}); # Set wiki links options
430 if (ref($options{wikiopt}) eq 'HASH') {
431 my $o = $options{wikiopt};
432 $o->{"f"} && $o->{"%"} and
433 die "--wiki sub-options 'f' and '%' are mutually exclusive\n"
435 if ($cli_opts{'raw'}) {
436 $raw = 1;
437 $options{htmlauto} = 1 if $cli_opts{'raw'} == 2;
439 $options{show_styles} = $cli_opts{'stylesheet'} if defined($cli_opts{'stylesheet'});
440 $options{show_styles} = 1 if $stub && !defined($options{show_styles});
441 $options{tab_width} = 8 unless defined($options{tab_width});
442 my $ym = $cli_opts{'yaml'};
443 defined($ym) && $ym ne "" or $ym = "enable";
444 my $lcym = lc($ym);
445 exists($_yamlmode{$lcym}) or die "invalid --yaml= value '$ym'\n";
446 $options{yamlmode} = $_yamlmode{$lcym};
447 $options{yamlvis} = $_yamlvis{$lcym};
449 my $hdrf = sub {
450 my $out = "";
451 if ($stub > 0) {
452 $out .= <<'HTML5';
453 <!DOCTYPE html>
454 <html xmlns="http://www.w3.org/1999/xhtml">
455 <head>
456 <meta charset="utf-8" />
457 <meta http-equiv="content-type" content="text/html; charset=utf-8" />
458 HTML5
459 } elsif ($stub < 0) {
460 $out .= <<'HTML4';
461 <html>
462 <head>
463 <meta charset="utf-8">
464 <meta http-equiv="content-type" content="text/html; charset=utf-8">
465 HTML4
467 if ($stub && ($options{title} || $options{h1})) {
468 my $title = $options{title};
469 defined($title) && $title ne "" or $title = $options{h1};
470 if (defined($title) && $title ne "") {
471 $title =~ s/&/&amp;/g;
472 $title =~ s/</&lt;/g;
473 $out .= "<title>$title</title>\n";
476 $out .= GenerateStyleSheet($g_style_prefix) if $options{show_styles};
477 if ($stub) {
478 $out .= "</head>\n<body style=\"text-align:center\">\n" .
479 "<div style=\"display:inline-block;text-align:left;max-width:42pc\">\n";
481 $out;
484 #### Process incoming text: ###########################
485 my ($didhdr, $hdr, $result, $ftr) = (0, "", "", "");
486 @ARGV or push(@ARGV, "-");
487 foreach (@ARGV) {
488 my ($fh, $contents, $oneresult);
489 $_ eq "-" or open $fh, '<', $_ or fauxdie "could not open \"$_\": $!\n";
491 local $/; # Slurp the whole file
492 $_ eq "-" and $contents = <STDIN>;
493 $_ ne "-" and $contents = <$fh>;
495 defined($contents) or fauxdie "could not read \"$_\": $!\n";
496 $_ eq "-" or close($fh);
497 $options{xmlcheck} = ($xmlcheck == 2) ? 2 : 0;
498 $oneresult = $raw ? ProcessRaw($contents, \%options) : Markdown($contents, \%options);
499 $oneresult =~ s/\s+$//os;
500 if ($oneresult ne "") {
501 if (!$didhdr && !$raw) {
502 $hdr = &$hdrf();
503 $didhdr = 1;
505 $result .= $oneresult . "\n";
508 $hdr = &$hdrf() unless $didhdr || $raw;
509 $ftr = "</div>\n</body>\n</html>\n" if $stub && !$raw;
510 if ($xmlcheck == 1) {
511 my ($good, $errs);
512 if ($stub && !$raw) {
513 ($good, $errs) = _xmlcheck($hdr.$result.$ftr);
514 } else {
515 ($good, $errs) = _xmlcheck("<div>".$result."</div>");
517 $good or die $errs;
519 print $hdr, $result, $ftr;
521 exit 0;
525 # INPUT
526 # $1: HASH ref
527 # $2: value of --wiki= option (see docs) except
528 # that a value of undef turns off wiki links
529 # OUTPUT
530 # $1->{wikipat}
531 # $1->{wikiopt}
533 sub SetWikiOpts {
534 my ($o, $wpat) = @_;
535 ref($o) eq "HASH" or die "internal error: first arg to SetWikiOpts must be HASH ref";
536 delete $o->{wikipat};
537 delete $o->{wikiopt};
538 defined($wpat) or return;
539 # Parse wiki links option setting
540 my $wopt = "s(:md)";
541 if ($wpat =~ /^(.*?)%\{((?:[%0-9A-Za-z]|[Ss]\([^)]*\))*)\}(.*)$/) {
542 $o->{wikipat} = $1 . "%{}" . $3;
543 $wopt = $2;
544 } else {
545 $o->{wikipat} = $wpat . "%{}.html";
547 my $sval = 1;
548 while ($wopt =~ /^(.*?)s\(([^)]*)\)(.*)$/i) {
549 my $sarg = $2;
550 $wopt = $1 . "s" . $3;
551 $sarg =~ s/^\s+//; $sarg =~ s/\s+$//;
552 $sval = {} unless ref($sval) eq "HASH";
553 s/^\.//, $sval->{lc($_)}=1 foreach split(/(?:\s*,\s*)|(?:(?<!,)\s+(?!,))/, $sarg);
554 $sval = 1 unless scalar(keys(%$sval));
556 $o->{wikiopt} = { map({$_ => 1} split(//,lc($wopt))) };
557 if (ref($sval) eq "HASH" && $sval->{':md'}) {
558 delete $sval->{':md'};
559 $sval->{$_} = 1 foreach qw(md rmd mkd mkdn mdwn mdown markdown litcoffee);
561 $o->{wikiopt}->{'s'} = $sval if $o->{wikiopt}->{'s'};
565 # Return a copy of the fancy CSS style sheet that uses the
566 # passed in prefix as a prefix for the CSS style names.
567 # If no argument is passed in, use $g_style_prefix
568 # as the CSS style name prefix.
569 sub GenerateStyleSheet {
570 my $prefix = shift;
571 defined($prefix) or $prefix = $g_style_prefix;
572 my $stylesheet = $g_style_sheet;
573 $stylesheet =~ s/%\(base\)/$prefix/g;
574 return $stylesheet;
578 sub _xmlcheck {
579 my $text = shift;
580 my ($good, $errs);
581 ($hasxml ? eval { XML::Simple::XMLin($text, KeepRoot => 1) && 1 } :
582 eval {
583 my $p = XML::Parser->new(Style => 'Tree', ErrorContext => 1);
584 $p->parse($text) && 1;
585 }) and $good = 1 or $errs = _trimerr($@);
586 ($good, $errs);
590 sub _trimerr {
591 my $err = shift;
592 1 while $err =~ s{\s+at\s+\.?/[^,\s\n]+\sline\s+[0-9]+\.?(\n|$)}{$1}is;
593 $err =~ s/\s+$//os;
594 $err . "\n";
598 sub _PrepareInput {
599 my ($input,$parseyaml) = @_;
600 defined $input or $input = "";
602 use bytes;
603 $input =~ s/[\x00-\x08\x0B\x0E-\x1F\x7F]+//gso;
605 my $output;
606 if (Encode::is_utf8($input) || utf8::decode($input)) {
607 $output = $input;
608 } else {
609 $output = $encoder->decode($input, Encode::FB_DEFAULT);
611 # Standardize line endings:
612 $output =~ s{\r\n}{\n}g; # DOS to Unix
613 $output =~ s{\r}{\n}g; # Mac to Unix
615 # Extract YAML front matter if requested
616 my $yaml = undef;
617 if ($parseyaml) {
618 $yaml = {};
619 if ($output =~ /^---[ \t]*(?:\n|\z)/g) {
620 until ($output =~ /\G(?:(?:(?:---)|(?:\.\.\.))[ \t]*(?:\n|\z)|\z)/gc) {
621 next if $output =~ m"\G[ \t]*(?:#[^\n]*)?\n"gc; # skip comment lines
622 next if $output =~ m"\G[ \t]*(?:#[^\n]*)\z"gc; # skip final no EOL comment
623 last unless $output =~ /\G([^\n]+)(?:\n|\z)/gc;
624 my $yl = $1;
625 if ($yl =~ /^([A-Za-z_][A-Za-z_0-9.-]*):[ \t]+(.*)$/os) {
626 my ($k, $v) = ($1, $2);
627 $yaml->{lc($k)} = _YAMLvalue($2);
630 $output = substr($output, pos($output));
633 return wantarray ? ($output, $yaml) : $output;
637 sub _YAMLvalue {
638 my $v = shift;
639 $v =~ s/^\s+//;
640 if (substr($v, 0, 1) eq '"') {
641 # only $ and/or @ present issues, map them
642 $v =~ tr/\@\$/\036\037/;
643 eval '{$v='.$v."\n}1" or $v = undef;
644 $v =~ tr/\036\037/\@\$/ if defined($v);
645 } else {
646 $v =~ s"#.*$""os;
647 $v =~ s/\s+$//os;
648 $v ne "" or $v = undef;
650 return $v;
654 sub ProcessRaw {
655 my $text = _PrepareInput(shift);
657 # Any remaining arguments after the first are options; either a single
658 # hashref or a list of name, value pairs. See _SanitizeOpts comments.
660 %opt = (
661 empty_element_suffix => $g_empty_element_suffix,
663 my %args = ();
664 if (ref($_[0]) eq "HASH") {
665 %args = %{$_[0]};
666 } else {
667 %args = @_;
669 while (my ($k,$v) = each %args) {
670 $opt{$k} = $v;
672 _SanitizeOpts(\%opt);
674 # Sanitize all '<'...'>' tags if requested
675 $text = _SanitizeTags($text, $opt{xmlcheck}, $opt{htmlauto}) if $opt{sanitize};
677 utf8::encode($text);
678 return $text;
682 # $1: HASH ref with the following key value semantics
684 # sanitize => any-false-value (no action), any-true-value (sanitize).
685 # note that a true value of xmlcheck or a true value of
686 # stripcomments or a urlfunc value that is a CODE ref
687 # always forces sanitize to activate.
688 # tag attributes are sanitized by removing all "questionable"
689 # attributes (such as script attributes, unknown attributes
690 # and so forth) and normalizing the remaining ones (i.e.
691 # adding missing quotes and/or values etc.).
692 # effective for both ProcessRaw and Markdown.
693 # xmlcheck => 0 (no check), any-true-value (internal check).
694 # note that the default if xmlcheck is not set/valid is 2.
695 # note that a true value is effective for both ProcessRaw
696 # and Markdown. note that a true value automatically inserts
697 # the closing tag for auto-closing tags and converts empty tags
698 # to the correct format converting empty tags that shouldn't be
699 # to an open and close pair; since xmlcheck is a function of the
700 # sanitizer, tag attributes are also always sanitized whenever
701 # xmlcheck has a true value.
702 # note that a true xmlcheck value WILL call "die" with a
703 # detailed indication of the error(s) if xml validation fails
704 # in which case any line/column numbers refer to the text that
705 # would be produced by a sanitize=>0, xmlcheck=>0 call to
706 # either ProcessRaw or Markdown, NOT the original input text.
707 # htmlauto => any-false-value (no auto close), any-true-value (auto-close)
708 # only effective for ProcessRaw; always enabled for Markdown.
709 # when xmlcheck is set to 2 provide html automatic closing tag
710 # and optional closing tag semantics where closing tags are
711 # automatically inserted when encountering an opening tag that
712 # auto closes a currently open tag and tags with an optional
713 # closing tag that's missing have that inserted as appropriate.
714 # a true value may result in some texts being rejected that
715 # would be otherwise be accepted (e.g. "<p><pre></pre></p>"
716 # which gets turned into "<p></p><pre></pre></p>" which then
717 # no longer validates).
718 # stripcomments => any-false-value (no action), any-true-value (strip).
719 # since the strip comments mechanism is a function of the
720 # sanitizer, if stripcomments is set to any-true-value then
721 # tag attributes will also always be sanitized.
722 # effective for both ProcessRaw and Markdown.
723 # empty_element_suffix => " />" or ">"
724 # will be forced to " />" if not valid or defined.
725 # effective for both ProcessRaw and Markdown.
726 # urlfunc => if set to a CODE ref, the function will be called with
727 # seven arguments like so:
728 # $result = &$urlfunc($iresult, \%opts, $tag, $uhost, $uabs, $q, $f)
729 # where on input $iresult is the result that would be produced
730 # if no urlfunc was provided and on output $result will be
731 # used as url value. $tag is either "img" or "a" to indicate the
732 # source of the url. $uhost.$uabs is the result of stripping off
733 # any query string and/or fragment from $iresult. $q contains either
734 # an empty string or the stripped off query string and $f contains
735 # an empty string or the stripped off fragment if they were originally
736 # present where a non-empty $q always starts with '?' and a non-empty
737 # $f always starts with '#'. $uhost contains the scheme and
738 # host+port if present (it may be the empty string). $uabs contains
739 # the path portion which may or may not start with a "/" depending on
740 # whether or not it's a relative path. The $iresult value is related
741 # to the other arguments like so:
742 # $iresult = $uhost . $uabs . $q . $f;
743 # All values passed to the urlfunc function have already been HTML
744 # unescaped and the returned value will be automatically HTML escaped.
745 # Any provided urlfunc should treat the %opts HASH as
746 # read-only. Modifying the %opts HASH in urlfunc will
747 # likely result in unpredictable behavior! Don't do it!
748 # If urlfunc is set to a CODE ref then tags will also always be
749 # sanitized.
751 # The remaining key value pairs are ignored by ProcessRaw and are only
752 # effective when using Markdown or _main
754 # tab_width => 1..32 which is how many spaces tabs are expanded to.
755 # will be forced to 8 if not in range.
756 # indent_width => 1..32 how many spaces make a new "indent" level.
757 # will be forced to 4 if not in range.
758 # style_prefix => prefix to prepend to all CSS style names in the
759 # fancy CSS style sheet.
760 # defaults to $g_style_prefix if not defined.
761 # note that _main actually adds the style sheet (when
762 # requested); use GenerateStyleSheet to retrieve the
763 # fancy style sheet when calling Markdown directly.
764 # auto_number => <= 0 (default) no numbering, 1 number h1s,
765 # 2 number h1s, h2s, 3 number h1-h3s, ... >= 6 number h1-h6s
766 # anchors => existence of this key triggers return of anchors HASH
767 # yamlmode => 0 (no YAML processing), > 0 (YAML on), < 0 (YAML ignore)
768 # if 0, the YAML front matter processor is completely
769 # disabled and any YAML front matter that might be present
770 # will be treated as markup. if > 0 any YAML front matter
771 # will be processed and any recognized options applied.
772 # if < 0 any YAML front matter will be parsed but no
773 # options will be applied at all. When != 0 the parsed
774 # YAML front matter can be retrieved via the 'yaml' key.
775 # defaults to 1 if not defined or not a number.
776 # yamlvis => 0 (invisible), > 0 (visible), < 0 (vis if unknown)
777 # if yamlmode == 0 then yamlvis has no effect. if > 0
778 # then any parsed YAML front matter options will be shown
779 # in the formatted output. if 0 then NO YAML front
780 # matter options will be shown in the formatted output.
781 # if < 0 then YAML front matter options will be shown in
782 # the formatted output only if there are any unrecognized
783 # options present.
784 # defaults to -1 if not defined or not a number.
785 # keepabs => any-false-value (no action), any-true-value (keep)
786 # if true, any absolute path URLs remaining after applying
787 # any abs_prefix value will be kept and not be subject
788 # to modification by any url_prefix or img_prefix value.
789 # abs_prefix => value to prefix to absolute path URLs (i.e. start with /).
790 # note that this does NOT get prepended to //host/path URLs.
791 # url_prefix => value to prefix to non-absolute URLs.
792 # note that this does NOT get prepended to //host/path URLs.
793 # img_prefix => value to prefix to non-absolute image URLs.
794 # note that this does NOT get prepended to //host/path URLs.
795 # note that if img_prefix is undef or empty ("") then
796 # url_prefix will be prepended to image URLs.
797 # base_prefix => value to prefix to fragment-only URLs (i.e. start with #).
798 # note that fragment-only URLs are always left undisturbed
799 # if this is not set. Fragment-only URLs are NOT affected by
800 # any of abs_prefix, url_prefix or img_prefix.
801 # wikipat => non-empty pattern string to enable wiki links.
802 # best set with SetWikiOpts (see SetWikiOpts comments).
803 # wikiopt => HASH ref of options affecting wiki links processing.
804 # best set with SetWikiOpts (see SetWikiOpts comments).
805 # wikifunc => if set to a CODE ref, the function will be called with
806 # six arguments like so:
807 # $result = &$wikifunc($iresult, \%opts, $link, $wbase, $qf, $io)
808 # where on input $iresult is the result that would be produced
809 # if no wikifunc was provided and on output $result will be
810 # used as the wiki expansion. $link is the original wiki
811 # destination as specified in the source, $wbase is the result
812 # of stripping off any query string and/or fragment from $link
813 # and then transforming that according to the wikiopt HASH ref.
814 # $qf contains either an empty string or the stripped off
815 # query string and/or fragment if one was originally present.
816 # If $io is a HASH ref (otherwise it will be undef), then it's
817 # a wiki image link and %$io contains the options (if any) where
818 # the keys that might be present include "width", "height", "align"
819 # and "alt". If present, width and height are guaranteed to be
820 # positive integers and align is guaranteed to be "left", "right"
821 # or "center". The value for "alt" may be the empty string.
822 # The "imgflag" key has a SCALAR ref value and if the value it
823 # refers to is changed to any false value the result will become
824 # an A tag rather than an IMG tag.
825 # The $iresult value is related to the other arguments like so:
826 # $iresult = $opts->{wikipat};
827 # $iresult =~ s/%\{\}.+$/%{}/ if ref($io) eq "HASH";
828 # $iresult = s/%\{\}/$wbase/;
829 # $iresult .= $qf;
830 # Any provided wikifunc should treat the %opts HASH as
831 # read-only. Modifying the %opts HASH in wikifunc will
832 # likely result in unpredictable behavior! Don't do it!
834 # Special handling for abs_prefix, url_prefix, img_prefix and/or base_prefix
835 # may be activated by setting any subset (or all) of the values for these
836 # keys to a CODE ref. The single argument is the URL and the result must
837 # be the adjusted URL. For example, the equivalent CODE ref to setting
838 # url_prefix to $string is simply sub { $string.$_[0] }. By using a
839 # CODE ref, behavior other than simply performing a prepend operation
840 # can be realized when necessary for unusual situations.
842 # The following are OUTPUT values that can only be retrieved when
843 # Markdown is called with a HASH ref as the second argument
845 # anchors => if the 'anchors' key exists in the input HASH ref
846 # will be set to a HASH ref containing lookup keys
847 # for valid fragment ids in the document (only those
848 # created from Markdown markup) with the value the
849 # actual fragment link to use. Do not use this directly
850 # but pass it as the first argument to the ResolveFragment
851 # function to resolve a "fuzzy" fragment name to its
852 # actual fragment name in the generated output.
853 # NOTE: to activate return of anchors the 'anchors' key
854 # simply must exist in the input HASH ref passed to the
855 # Markdown function, its value will be replaced on output.
857 # h1 => will be set to the tag-stripped value of the first
858 # non-empty H1 generated by Markdown-style markup.
859 # note that literal <h1>...</h1> values are NOT picked up.
860 # will be left unchanged if no Markdown-style H1 detected.
861 # note that the value is NOT xml escaped but should be
862 # before embedding in an XHTML document. If yamlmode > 0
863 # and a 'title' value has been encountered, then this
864 # will be set to that 'title' value instead (and the
865 # 'title' key and value will still be present in %$yaml).
867 # yaml => if yamlmode is != 0 then this will be set to a HASH
868 # ref containing any parsed YAML front matter or left
869 # unchanged if no YAML front matter was found. If the
870 # parsed YAML front matter contains only whitespace and/or
871 # comments then this will be set to a HASH ref that has
872 # no keys or values.
874 sub _SanitizeOpts {
875 my $o = shift; # hashref
876 ref($o) eq "HASH" or return;
878 $o->{xmlcheck} = looks_like_number($o->{xmlcheck}) && $o->{xmlcheck} == 0 ? 0 : 2;
879 $o->{sanitize} = 1 if $o->{stripcomments} && !$o->{sanitize};
880 $o->{sanitize} = 1 if $o->{xmlcheck} && !$o->{sanitize};
881 $o->{sanitize} = 1 if ref($o->{urlfunc}) eq 'CODE' && !$o->{sanitize};
883 # this is gross, but having the globals avoids unnecessary slowdown
884 if ($o->{sanitize} && $o->{xmlcheck}) {
885 $g_start_p = "<\20>";
886 $g_close_p = "</\20>";
887 } else {
888 $g_start_p = "<p>";
889 $g_close_p = "</p>";
892 defined($o->{empty_element_suffix}) &&
893 ($o->{empty_element_suffix} eq " />" || $o->{empty_element_suffix} eq ">")
894 or $o->{empty_element_suffix} = " />";
896 $o->{tab_width} = 8 unless looks_like_number($o->{tab_width}) &&
897 1 <= $o->{tab_width} && $o->{tab_width} <= 32;
898 $o->{tab_width} = int($o->{tab_width});
900 $o->{indent_width} = 4 unless looks_like_number($o->{indent_width}) &&
901 1 <= $o->{indent_width} && $o->{indent_width} <= 32;
902 $o->{indent_width} = int($o->{indent_width});
904 defined($o->{auto_number}) or $o->{auto_number} = '';
905 $o->{auto_number} eq '' || looks_like_number($o->{auto_number})
906 or $o->{auto_number} = 6;
907 if ($o->{auto_number} ne '') {
908 $o->{auto_number} = int(0+$o->{auto_number});
909 $o->{auto_number} >= 0 or $o->{auto_number} = 0;
910 $o->{auto_number} <= 6 or $o->{auto_number} = 6;
913 defined($o->{style_prefix}) or $o->{style_prefix} = $g_style_prefix;
915 $o->{abs_prefix} = _MakePrefixCODERef($o->{abs_prefix}, 1)
916 unless ref($o->{abs_prefix}) eq 'CODE';
917 $o->{url_prefix} = _MakePrefixCODERef($o->{url_prefix}, 0)
918 unless ref($o->{url_prefix}) eq 'CODE';
919 $o->{img_prefix} = _MakePrefixCODERef($o->{img_prefix}, 0)
920 unless ref($o->{img_prefix}) eq 'CODE';
921 $o->{base_prefix} = _MakePrefixCODERef($o->{base_prefix}, -1)
922 unless ref($o->{base_prefix}) eq 'CODE';
924 ref($o->{wikiopt}) eq "HASH" or $o->{wikiopt} = {};
926 # Note that because Markdown makes a copy of the options
927 # before calling this function, this does not actually remove
928 # any "h1" key that might have been set by the caller of
929 # the Markdown function. However, by deleting it here,
930 # this guarantees that any found value will actually be
931 # picked up and stored (which will not happen if the key
932 # already exists).
933 delete $o->{h1};
935 # Default is to silently strip any known YAML front matter
936 # Same comment about "yaml" key as above for "h1" key
937 $o->{yamlmode} = 1 unless looks_like_number($o->{yamlmode});
938 $o->{yamlvis} = -1 unless looks_like_number($o->{yamlvis});
939 delete $o->{yaml};
941 # The anchors hash will only be returned if the key exists
942 # (the key's value doesn't matter), set the value to an empty
943 # HASH ref just in case to make sure it's always a HASH ref.
944 $o->{anchors} = {} if exists($o->{anchors});
947 my %_yamlopts;
948 BEGIN {%_yamlopts = map({$_ => 1} qw(
949 display_metadata
950 header_enum
951 title
955 sub _HasUnknownYAMLOptions {
956 do { return 1 unless exists($_yamlopts{$_}) } foreach keys(%{$_[0]});
957 return 0;
961 sub _ApplyYAMLOpts {
962 my ($yaml, $opt) = @_;
963 if (defined($yaml->{display_metadata}) && $opt->{yamlvis} < 0) {
964 # ignore display_metadata except in --yaml=enable mode
965 $opt->{yamlvis} = _YAMLTrueValue($yaml->{display_metadata}) ? 1 : 0;
967 $opt->{h1} = $yaml->{title} if defined($yaml->{title});
968 if (defined($yaml->{header_enum}) && $opt->{auto_number} eq '') {
969 $opt->{auto_number} = _YAMLTrueValue($yaml->{header_enum}) ? 6 : 0;
974 sub _YAMLTrueValue {
975 my $v = shift;
976 defined($v) or $v = "";
977 $v = lc($v);
978 return !($v eq "" || $v eq "0" || $v eq "false" || $v eq "disable" || $v eq "off" || $v eq "no");
982 # Actually returns an empty string rather than a CODE ref
983 # if an empty prefix is passed in. Trailing "/"s are trimmed
984 # off if the second argument is positive or the string does NOT
985 # consist of only "/"s. A trailing "/" is added unless the
986 # trimmed prefix already has one or the second argument is true.
987 # If the second argument is negative, the prefix is used as-is.
988 sub _MakePrefixCODERef {
989 my ($prefix, $mtok) = @_;
990 defined($prefix) or $prefix = "";
991 looks_like_number($mtok) or $mtok = $mtok ? 1 : 0;
992 if ($mtok > 0) {
993 $prefix =~ s,/+$,,;
994 } elsif (!$mtok) {
995 $prefix =~ s,//+$,/,;
997 $prefix ne "" or return "";
998 $prefix .= '/' if !$mtok && substr($prefix, -1, 1) ne '/';
999 return sub { $prefix . $_[0] };
1003 sub Markdown {
1005 # Primary function. The order in which other subs are called here is
1006 # essential. Link and image substitutions need to happen before
1007 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
1008 # and <img> tags get encoded.
1010 my $text = shift;
1012 # Any remaining arguments after the first are options; either a single
1013 # hashref or a list of name, value pairs. See _SanitizeOpts comments.
1014 %opt = (
1015 # set initial defaults
1016 style_prefix => $g_style_prefix,
1017 empty_element_suffix => $g_empty_element_suffix,
1018 tab_width => _tabDefault,
1019 indent_width => $g_indent_width,
1020 abs_prefix => "", # Prefixed to absolute path URLs
1021 url_prefix => "", # Prefixed to non-absolute URLs
1022 img_prefix => "", # Prefixed to non-absolute image URLs
1023 base_prefix => "", # Prefixed to fragment-only URLs
1025 @autonum = ();
1026 my %args = ();
1027 if (ref($_[0]) eq "HASH") {
1028 %args = %{$_[0]};
1029 } else {
1030 %args = @_;
1032 while (my ($k,$v) = each %args) {
1033 $opt{$k} = $v;
1035 _SanitizeOpts(\%opt);
1037 my $yaml;
1038 ($text, $yaml) = _PrepareInput($text, $opt{yamlmode});
1039 _ApplyYAMLOpts($yaml, \%opt) if ref($yaml) eq "HASH" && $opt{yamlmode} > 0;
1041 # Clear the globals. If we don't clear these, you get conflicts
1042 # from other articles when generating a page which contains more than
1043 # one article (e.g. an index page that shows the N most recent
1044 # articles):
1045 %g_urls = ();
1046 %g_titles = ();
1047 %g_anchors = ();
1048 %g_anchors_id = ();
1049 %g_block_ids = ();
1050 %g_code_block_ids = ();
1051 %g_html_blocks = ();
1052 %g_code_blocks = ();
1053 @g_xml_comments = ();
1054 $g_list_level = 0;
1056 # Make sure $text ends with a couple of newlines:
1057 $text .= "\n\n";
1059 # Handle backticks-delimited code blocks
1060 $text = _HashBTCodeBlocks($text);
1062 # Convert all tabs to spaces.
1063 $text = _DeTab($text);
1065 # Strip any lines consisting only of spaces.
1066 # This makes subsequent regexen easier to write, because we can
1067 # match consecutive blank lines with /\n+/ instead of something
1068 # contorted like / *\n+/ .
1069 $text =~ s/^ +$//mg;
1071 # Turn block-level HTML blocks into hash entries
1072 $text = _HashHTMLBlocks($text, 1);
1074 # Strip link definitions, store in hashes.
1075 $text = _StripLinkDefinitions($text);
1077 $text = _RunBlockGamut($text, 1);
1079 # Remove indentation markers
1080 $text =~ s/\027+//gs;
1082 # Expand auto number flags
1083 $text =~ s/\034([1-6])/_AutoHeaderNum(ord($1)&0x7)/gse
1084 if $opt{auto_number} ne '' && $opt{auto_number} > 0;
1086 # Unhashify code blocks
1087 $text =~ s/(\025\d+\026)/$g_code_blocks{$1}/g;
1089 $text = _UnescapeSpecialChars($text);
1091 $text .= "\n" unless $text eq "";
1093 # Sanitize all '<'...'>' tags if requested
1094 $text = _SanitizeTags($text, $opt{xmlcheck}, 1) if $opt{sanitize};
1096 utf8::encode($text);
1097 if (ref($_[0]) eq "HASH") {
1098 ${$_[0]}{anchors} = {%g_anchors_id} if exists(${$_[0]}{anchors});
1099 if (defined($opt{h1}) && $opt{h1}) {
1100 utf8::encode($opt{h1});
1101 ${$_[0]}{h1} = $opt{h1};
1103 ${$_[0]}{yaml} = $yaml if ref($yaml) eq "HASH";
1105 my $yamltable = "";
1106 if (ref($yaml) eq "HASH" && %$yaml && $opt{yamlmode} && $opt{yamlvis}) {
1107 if ($opt{yamlvis} > 0 || _HasUnknownYAMLOptions($yaml)) {
1108 my ($hrows, $drows) = ("", "");
1109 foreach (sort(keys(%$yaml))) {
1110 my $v = $yaml->{$_};
1111 my $rspn = '';
1112 if (defined($v)) {
1113 $v =~ s/&/&amp;/g;
1114 $v =~ s/</&lt;/g;
1115 utf8::encode($v);
1116 $drows .= "<td>" . $v . "</td>\n";
1117 } else {
1118 $rspn = " class=\"$opt{style_prefix}yaml-undef-value\" rowspan=\"2\" valign=\"top\"";
1120 $hrows .= "<th$rspn>" . $_ . "</th>\n";
1122 $yamltable = "<table class=\"$opt{style_prefix}yaml-table\" border=\"1\">\n" .
1123 "<tr>\n$hrows</tr>\n<tr>\n$drows</tr>\n</table>\n";
1126 return $yamltable.$text;
1130 sub _HashBTCodeBlocks {
1132 # Process Markdown backticks (```) delimited code blocks
1134 my $text = shift;
1135 my $less_than_indent = $opt{indent_width} - 1;
1137 $text =~ s{
1138 (?:(?<=\n)|\A)
1139 ([ ]{0,$less_than_indent})``(`+)[ \t]*(?:([\w.+-]+[#]?)(?:[ \t][ \t\w.+-]*)?)?\n
1140 ( # $4 = the code block -- one or more lines, starting with ```
1142 .*\n+
1145 # and ending with ``` or end of document
1146 (?:(?:[ ]{0,$less_than_indent}``\2[ \t]*(?:\n|\Z))|\Z)
1148 # $2 contains syntax highlighting to use if defined
1149 my $leadsp = length($1);
1150 my $codeblock = $4;
1151 $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
1152 $codeblock = _DeTab($codeblock, 8, $leadsp); # physical tab stops are always 8
1153 $codeblock =~ s/\A\n+//; # trim leading newlines
1154 $codeblock =~ s/\s+\z//; # trim trailing whitespace
1155 $codeblock = _EncodeCode($codeblock); # or run highlighter here
1156 $codeblock = "<div class=\"$opt{style_prefix}code-bt\"><pre style=\"display:none\"></pre><pre><code>"
1157 . $codeblock . "\n</code></pre></div>";
1159 my $key = block_id($codeblock);
1160 $g_html_blocks{$key} = $codeblock;
1161 "\n\n" . $key . "\n\n";
1162 }egmx;
1164 return $text;
1168 sub _StripLinkDefinitions {
1170 # Strips link definitions from text, stores the URLs and titles in
1171 # hash references.
1173 my $text = shift;
1174 my $less_than_indent = $opt{indent_width} - 1;
1176 # Link defs are in the form: ^[id]: url "optional title"
1177 while ($text =~ s{
1178 ^[ ]{0,$less_than_indent}\[(.+)\]: # id = $1
1179 [ ]*
1180 \n? # maybe *one* newline
1181 [ ]*
1182 <?((?:\S(?:\\\n\s*[^\s"(])?)+?)>? # url = $2
1183 [ ]*
1184 \n? # maybe one newline
1185 [ ]*
1187 (?<=\s) # lookbehind for whitespace
1188 (?:(['"])|(\()) # title quote char
1189 (.+?) # title = $5
1190 (?(4)\)|\3) # match same quote
1191 [ ]*
1192 )? # title is optional
1193 (?:\n+|\Z)
1195 {}mx) {
1196 my $id = _strip(lc $1); # Link IDs are case-insensitive
1197 my $url = $2;
1198 my $title = _strip($5);
1199 $url =~ s/\\\n\s*//gs;
1200 if ($id ne "") {
1201 # These values always get passed through _MakeATag or _MakeIMGTag later
1202 $g_urls{$id} = $url;
1203 if (defined($title) && $title ne "") {
1204 $g_titles{$id} = $title;
1209 return $text;
1212 my %ok_tag_name; # initialized later
1213 my ($block_tags_a, $block_tags_b, $block_tags_c);
1214 BEGIN {
1215 $block_tags_a = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/io;
1216 $block_tags_b = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1217 $block_tags_c = qr/div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1220 sub _HashHTMLBlocks {
1221 my ($text, $toplevel) = @_;
1222 my $less_than_indent = $opt{indent_width} - 1;
1223 my $idt = "\027" x $g_list_level;
1224 my $blkprc = $toplevel ?
1225 sub { return $ok_tag_name{$_[1]} ? _EncodeAmpsAndAngles($_[0]) : $_[0] } :
1226 sub { return $_[0] };
1228 # Hashify HTML blocks:
1229 # We only want to do this for block-level HTML tags, such as headers,
1230 # lists, and tables. That's because we still want to wrap <p>s around
1231 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
1232 # phrase emphasis, and spans. The list of tags we're looking for is
1233 # hard-coded:
1235 # First, look for nested blocks, e.g.:
1236 # <div>
1237 # <div>
1238 # tags for inner block must be indented.
1239 # </div>
1240 # </div>
1242 # The outermost tags must start at the left margin for this to match, and
1243 # the inner nested divs must be indented.
1244 # We need to do this before the next, more liberal match, because the next
1245 # match will start at the first `<div>` and stop at the first `</div>`.
1246 $text =~ s{
1247 ( # save in $1
1248 ^ # start of line (with /m)
1249 ((?:\Q$idt\E)?) # optional lead in = $2
1250 <($block_tags_a) # start tag = $3
1251 \b # word break
1252 (?:.*\n)*? # any number of lines, minimally matching
1253 \2</\3\s*> # the matching end tag
1254 [ ]* # trailing spaces
1255 (?=\n+|\Z) # followed by a newline or end of document
1258 my $blk = &$blkprc($1, $3);
1259 my $key = block_id($blk);
1260 $g_html_blocks{$key} = $blk;
1261 "\n\n" . $key . "\n\n";
1262 }eigmx;
1266 # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
1268 $text =~ s{
1269 ( # save in $1
1270 ^ # start of line (with /m)
1271 (?:\Q$idt\E)? # optional lead in
1272 <($block_tags_b) # start tag = $2
1273 \b # word break
1274 (?:.*\n)*? # any number of lines, minimally matching
1275 .*</\2\s*> # the matching end tag
1276 [ ]* # trailing spaces
1277 (?=\n+|\Z) # followed by a newline or end of document
1280 my $blk = &$blkprc($1, $2);
1281 my $key = block_id($blk);
1282 $g_html_blocks{$key} = $blk;
1283 "\n\n" . $key . "\n\n";
1284 }eigmx;
1287 # Now match any empty block tags that should have been paired
1289 $text =~ s{
1290 ( # save in $1
1291 ^ # start of line (with /m)
1292 (?:\Q$idt\E)? # optional lead in
1293 <($block_tags_c) # start tag = $2
1294 \b # word break
1295 (?:[^<>])*? #
1296 /?> # the matching end tag
1297 [ ]* # trailing spaces
1298 (?=\n+|\Z) # followed by a newline or end of document
1301 my $key = block_id($1);
1302 $g_html_blocks{$key} = $1;
1303 "\n\n" . $key . "\n\n";
1304 }eigmx;
1306 # Special case just for <hr />. It was easier to make a special case than
1307 # to make the other regex more complicated.
1308 $text =~ s{
1310 (?<=\n) # Starting after end of line
1311 | # or
1312 \A # the beginning of the doc
1314 ( # save in $1
1315 [ ]{0,$less_than_indent}
1316 <(?:hr) # start tag
1317 \b # word break
1318 (?:[^<>])*? #
1319 /?> # the matching end tag
1320 [ ]*
1321 (?=\n{1,}|\Z) # followed by end of line or end of document
1324 my $key = block_id($1);
1325 $g_html_blocks{$key} = $1;
1326 "\n\n" . $key . "\n\n";
1327 }eigx;
1329 # Special case for standalone XML comments:
1330 $text =~ s{
1332 (?<=\n\n) # Starting after a blank line
1333 | # or
1334 \A\n? # the beginning of the doc
1336 ( # save in $1
1337 [ ]{0,$less_than_indent}
1338 (?s:
1339 <!--
1340 (?:[^-]|(?:-(?!-)))*
1343 (?:[ \t]*\n[ \t]*)?
1344 <!--
1345 (?:[^-]|(?:-(?!-)))*
1349 [ ]*
1350 (?=\n{1,}|\Z) # followed by end of line or end of document
1353 my $key = block_id($1);
1354 push(@g_xml_comments, $key)
1355 if $opt{stripcomments} && !exists($g_html_blocks{$key});
1356 $g_html_blocks{$key} = $1;
1357 "\n\n" . $key . "\n\n";
1358 }egx;
1361 return $text;
1365 sub _RunBlockGamut {
1367 # These are all the transformations that form block-level
1368 # tags like paragraphs, headers, and list items.
1370 my ($text, $anchors) = @_;
1372 $text = _DoHeaders($text, $anchors);
1374 # Do Horizontal Rules:
1375 $text =~ s{^ {0,3}\*(?: {0,2}\*){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1376 $text =~ s{^ {0,3}\_(?: {0,2}\_){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1377 $text =~ s{^ {0,3}\-(?: {0,2}\-){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1379 $text = _DoListsAndBlocks($text);
1381 $text = _DoTables($text);
1383 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
1384 # was to escape raw HTML in the original Markdown source. This time,
1385 # we're escaping the markup we've just created, so that we don't wrap
1386 # <p> tags around block-level tags.
1387 $text = _HashHTMLBlocks($text);
1389 $text = _FormParagraphs($text, $anchors);
1391 return $text;
1395 sub _DoListBlocks {
1396 return _DoBlockQuotes(_DoCodeBlocks($_[0])) if $_[0] ne "";
1400 sub _RunSpanGamut {
1402 # These are all the transformations that occur *within* block-level
1403 # tags like paragraphs, headers, and list items.
1405 my $text = shift;
1407 $text = _DoCodeSpans($text);
1409 $text = _EscapeSpecialChars($text);
1411 # Process anchor and image tags. Images must come first,
1412 # because ![foo][f] looks like an anchor.
1413 $text = _DoImages($text);
1414 $text = _DoAnchors($text);
1416 # Make links out of things like `<http://example.com/>`
1417 # Must come after _DoAnchors(), because you can use < and >
1418 # delimiters in inline links like [this](<url>).
1419 $text = _DoAutoLinks($text);
1421 $text = _EncodeAmpsAndAngles($text);
1423 $text = _DoItalicsAndBoldAndStrike($text);
1425 # Do hard breaks:
1426 $text =~ s/ {3,}(\n|\z)/<br clear=\"all\"$opt{empty_element_suffix}$1/g;
1427 $text =~ s/ {2,}\n/<br$opt{empty_element_suffix}\n/g;
1428 $text =~ s/ {2,}\z//g;
1430 return $text;
1434 sub _EscapeSpecialChars {
1435 my $text = shift;
1436 my $tokens ||= _TokenizeHTML($text);
1438 $text = ''; # rebuild $text from the tokens
1439 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
1440 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
1442 foreach my $cur_token (@$tokens) {
1443 if ($cur_token->[0] eq "tag") {
1444 # Within tags, encode *, _ and ~ so they don't conflict
1445 # with their use in Markdown for italics and strong.
1446 # We're replacing each such character with its
1447 # corresponding block id value; this is likely
1448 # overkill, but it should prevent us from colliding
1449 # with the escape values by accident.
1450 $cur_token->[1] =~ s!([*_~])!$g_escape_table{$1}!g;
1451 $text .= $cur_token->[1];
1452 } else {
1453 my $t = $cur_token->[1];
1454 $t = _EncodeBackslashEscapes($t);
1455 $text .= $t;
1458 return $text;
1462 sub _ProcessWikiLink {
1463 my ($link_text, $link_loc) = @_;
1464 if (defined($link_loc) &&
1465 ($link_loc =~ m{^#\S*$} || $link_loc =~ m{^(?:http|ftp)s?://\S+$}i)) {
1466 # Return the new link
1467 return _MakeATag(_FindFragmentMatch($link_loc), $link_text);
1469 if (!defined($link_loc)) {
1470 $link_loc = _RunSpanGamut($link_text);
1471 $link_loc = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($link_loc))));
1472 $link_loc =~ m{^(?:http|ftp)s?://\S+$}i and
1473 # Return the new link
1474 return _MakeATag($link_loc, $link_text);
1476 return undef if $link_loc eq "" || $link_text eq "";
1477 if ($link_loc =~ /^[A-Za-z][A-Za-z0-9+.-]*:/os) {
1478 # Unrecognized scheme
1479 return undef;
1481 if ($opt{wikipat}) {
1482 my $o = $opt{wikiopt};
1483 my $img_link = _strip($link_text);
1484 my $img = 0;
1485 my $qsfrag = "";
1486 my $base;
1487 my $imgopts = undef;
1488 if ($img_link =~ /^[^#?\s]+\.(?:png|gif|jpe?g|svgz?)$/i) {
1489 $base = _wxform($img_link, 1);
1490 $img = 1;
1491 $imgopts = _ParseWikiImgOpts($link_loc);
1492 $imgopts->{imgflag} = \$img;
1493 } else {
1494 $base = $link_loc;
1495 if ($link_loc =~ /^(.*?)([?#].*)$/os) {
1496 ($base, $qsfrag) = ($1, $2);
1498 $base = _wxform($base);
1500 my $result = $opt{wikipat};
1501 $result =~ s/%\{\}.+$/%{}/os if $img;
1502 $result =~ s/%\{\}/$base/;
1503 if ($qsfrag =~ /^([^#]*)(#.+)$/os) {
1504 my ($q,$f) = ($1,$2);
1505 #$f = _wxform($f) if $f =~ / /;
1506 $qsfrag = $q . $f;
1508 $result .= $qsfrag;
1509 $result = &{$opt{wikifunc}}($result, \%opt, ($img?$img_link:$link_loc), $base, $qsfrag, $imgopts)
1510 if ref($opt{wikifunc}) eq 'CODE';
1512 use bytes;
1513 $result =~ s/%(?![0-9A-Fa-f]{2})/%25/sog;
1514 if ($o->{r}) {
1515 $result =~
1516 s/([\x00-\x1F <>"{}|\\^`x7F])/sprintf("%%%02X",ord($1))/soge;
1517 } else {
1518 $result =~
1519 s/([\x00-\x1F <>"{}|\\^`\x7F-\xFF])/sprintf("%%%02X",ord($1))/soge;
1521 $result =~ s/(%(?![0-9A-F]{2})[0-9A-Fa-f]{2})/uc($1)/soge;
1523 # Return the new link
1524 return $img ? _MakeIMGTag($result, undef, undef, $imgopts) : _MakeATag($result, $link_text);
1526 # leave it alone
1527 return undef;
1531 sub _ParseWikiImgOpts {
1532 my $alts = shift;
1533 my %o = ();
1534 # alt= consumes the rest of the line, do it first
1535 if ($alts =~ /(?:^|,)\s*alt\s*=\s*(.*)$/ios) {
1536 my $atext = $1;
1537 $alts = substr($alts, 0, $-[0]);
1538 $o{alt} = _strip($atext);
1540 foreach my $kv (split(/\s*,\s*/, lc($alts))) {
1541 if ($kv =~ /^\s*([^\s]+)\s*=\s*([^\s]+)\s*$/os) {
1542 my ($k, $v) = ($1, $2);
1543 if (($k eq "width" || $k eq "height") && $v =~ /^\d+$/) {
1544 $o{$k} = 0+$v if $v > 0;
1545 next;
1547 if ($k eq "align" && ($v eq "left" || $v eq "right" || $v eq "center")) {
1548 $o{$k} = $v;
1549 next;
1553 return \%o;
1557 sub _wxform {
1558 my ($w, $img) = @_;
1559 my $o = $opt{wikiopt};
1560 my $opt_s = $o->{s};
1561 if (!$img && $opt_s) {
1562 if (ref($opt_s)) {
1563 if ($w =~ m{^(.*)[.]([^./]*)$}) {
1564 my ($base, $ext) = ($1, $2);
1565 $w = $base if $opt_s->{lc($ext)};
1567 } else {
1568 $w =~ s{[.][^./]*$}{};
1571 $w = uc($w) if $o->{u};
1572 $w = lc($w) if $o->{l};
1573 $w =~ s{/+}{%252F}gos if $o->{"%"};
1574 $w =~ s/ +/%20/gos if $o->{b};
1575 $w =~ tr{/}{ } if $o->{f};
1576 $w =~ s{/+}{/}gos if !$o->{f} && !$o->{v};
1577 if ($o->{d}) {
1578 $w =~ tr{ }{-};
1579 $w =~ s/-+/-/gos unless $o->{v};
1580 } else {
1581 $w =~ tr{ }{_};
1582 $w =~ s/_+/_/gos unless $o->{v};
1584 return $w;
1588 # Return a suitably encoded <a...> tag string
1589 # On input NONE of $url, $text or $title should be xmlencoded
1590 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1591 sub _MakeATag {
1592 my ($url, $text, $title) = @_;
1593 defined($url) or $url="";
1594 defined($text) or $text="";
1595 defined($title) or $title="";
1597 $url =~ m"^#" && ref($opt{base_prefix}) eq 'CODE' and $url = &{$opt{base_prefix}}($url);
1598 my $result = $g_escape_table{'<'}."a href=\"" . _EncodeAttText($url) . "\"";
1599 $title = _strip($title);
1600 $text =~ s{<(/?a)}{&lt;$1}sogi;
1601 $text = _DoItalicsAndBoldAndStrike($text);
1602 # We've got to encode any of these remaining to avoid
1603 # conflicting with other italics, bold and strike through and links.
1604 $text =~ s!([]*_~[])!$g_escape_table{$1}!g;
1605 $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne "";
1606 return $result . $g_escape_table{'>'} .
1607 $text . $g_escape_table{'<'}."/a".$g_escape_table{'>'};
1611 sub _DoAnchors {
1613 # Turn Markdown link shortcuts into XHTML <a> tags.
1615 my $text = shift;
1618 # First, handle wiki-style links: [[wiki style link]]
1620 $text =~ s{
1621 ( # wrap whole match in $1
1622 \[\[
1623 ($g_nested_brackets) # link text and id = $2
1624 \]\]
1627 my $result;
1628 my $whole_match = $1;
1629 my $link_text = $2;
1630 my $link_loc = undef;
1632 if ($link_text =~ /^(.*)\|(.*)$/s) {
1633 $link_text = $1;
1634 $link_loc = _strip($2);
1637 $result = _ProcessWikiLink($link_text, $link_loc);
1638 defined($result) or $result = $whole_match;
1639 $result;
1640 }xsge;
1643 # Next, handle reference-style links: [link text] [id]
1645 $text =~ s{
1646 ( # wrap whole match in $1
1648 ($g_nested_brackets) # link text = $2
1651 [ ]? # one optional space
1652 (?:\n[ ]*)? # one optional newline followed by spaces
1655 ($g_nested_brackets) # id = $3
1659 my $result;
1660 my $whole_match = $1;
1661 my $link_text = $2;
1662 my $link_id = $3;
1664 if ($link_id eq "") {
1665 # for shortcut links like [this][].
1666 $link_id = _RunSpanGamut($link_text);
1667 $link_id = unescapeXML(_StripTags(_UnescapeSpecialChars($link_id)));
1669 $link_id = _strip(lc $link_id);
1671 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
1672 my $url = $g_urls{$link_id};
1673 defined($url) or $url = $g_anchors{$link_id};
1674 $url = _FindFragmentMatch($url);
1675 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1676 $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
1678 else {
1679 $result = $whole_match;
1681 $result;
1682 }xsge;
1685 # Subsequently, inline-style links: [link text](url "optional title")
1687 $text =~ s{
1688 ( # wrap whole match in $1
1690 ($g_nested_brackets) # link text = $2
1692 \( # literal paren
1693 ($g_nested_parens) # href and optional title = $3
1697 #my $result;
1698 my $whole_match = $1;
1699 my $link_text = $2;
1700 my ($url, $title) = _SplitUrlTitlePart($3);
1702 if (defined($url)) {
1703 $url = _FindFragmentMatch($url);
1704 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1705 _MakeATag(_PrefixURL($url), $link_text, $title);
1706 } else {
1707 # The href/title part didn't match the pattern
1708 $whole_match;
1710 }xsge;
1713 # Finally, handle reference-style implicit shortcut links: [link text]
1715 $text =~ s{
1716 ( # wrap whole match in $1
1718 ($g_nested_brackets) # link text = $2
1722 my $result;
1723 my $whole_match = $1;
1724 my $link_text = $2;
1725 my $link_id = _RunSpanGamut($2);
1726 $link_id = _strip(lc(unescapeXML(_StripTags(_UnescapeSpecialChars($link_id)))));
1728 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
1729 my $url = $g_urls{$link_id};
1730 defined($url) or $url = $g_anchors{$link_id};
1731 $url = _FindFragmentMatch($url);
1732 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1733 $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
1735 else {
1736 $result = $whole_match;
1738 $result;
1739 }xsge;
1741 return $text;
1745 sub _PeelWrapped {
1746 defined($_[0]) or return undef;
1747 if (substr($_[0],0,1) eq "(") {
1748 return substr($_[0], 1, length($_[0]) - (substr($_[0], -1, 1) eq ")" ? 2 : 1));
1750 return $_[0];
1754 sub _SplitUrlTitlePart {
1755 return ("", undef) if $_[0] =~ m{^\s*$}; # explicitly allowed
1756 my $u = $_[0];
1757 $u =~ s/^\s*(['\042])/# $1/;
1758 if ($u =~ m{
1759 ^ # match beginning
1760 \s*?
1761 <?([^\s'\042]\S*?)>? # URL = $1
1762 (?: # optional grouping
1763 \s+ # must be distinct from URL
1764 (['\042]?) # quote char = $2
1765 (.*?) # Title = $3
1766 \2? # matching quote
1767 )? # title is optional
1769 \z # match end
1770 }osx) {
1771 return (undef, undef) if $_[1] && ($1 eq "" || $1 eq "#");
1772 return (_PeelWrapped($1), $2 ? $3 : _PeelWrapped($3));
1773 } else {
1774 return (undef, undef);
1779 sub _FindFragmentMatchInternal {
1780 my ($anchors_id, $url, $undefifnomatch) = @_;
1781 if (defined($url) && $url =~ /^#\S/) {
1782 # try very hard to find a match
1783 my $idbase = _strip(lc(substr($url, 1)));
1784 my $idbase0 = $idbase;
1785 my $id = _MakeAnchorId($idbase);
1786 $undefifnomatch and $url = undef;
1787 if (defined($$anchors_id{$id})) {
1788 $url = $$anchors_id{$id};
1789 } else {
1790 $idbase =~ s/-/_/gs;
1791 $id = _MakeAnchorId($idbase);
1792 if (defined($$anchors_id{$id})) {
1793 $url = $$anchors_id{$id};
1794 } else {
1795 $id = _MakeAnchorId($idbase0, 1);
1796 if (defined($$anchors_id{$id})) {
1797 $url = $$anchors_id{$id};
1798 } else {
1799 $id = _MakeAnchorId($idbase, 1);
1800 if (defined($$anchors_id{$id})) {
1801 $url = $$anchors_id{$id};
1807 return $url;
1811 sub _FindFragmentMatch {
1812 return _FindFragmentMatchInternal(\%g_anchors_id, @_);
1816 sub _ToUTF8 {
1817 my $input = shift;
1818 my $output;
1819 if (Encode::is_utf8($input) || utf8::decode($input)) {
1820 $output = $input;
1821 } else {
1822 $output = $encoder->decode($input, Encode::FB_DEFAULT);
1824 return $output;
1828 # $_[0] -> HASH ref of anchors (e.g. the "anchors" OUTPUT from Markdown)
1829 # $_[1] -> fragment to resolve, may optionally start with '#'
1830 # An empty string ("") or hash ("#") is returned as-is.
1831 # returns undef if no match otherwise resolved fragment name
1832 # which will start with a '#' if $_[1] started with '#' otherwise will not.
1833 # This function can be used to connect up links to "implicit" anchors.
1834 # All Markdown-format H1-H6 headers have an implicit anchor added
1835 # based on the header item text. Passing that text to this function
1836 # will cough up the matching implicit anchor if there is one.
1837 sub ResolveFragment
1839 my ($anchors, $frag) = @_;
1840 defined($frag) or return undef;
1841 $frag eq "" || $frag eq "#" and return $frag;
1842 my $hadhash = ($frag =~ s/^#//);
1843 $frag =~ /^\S/ or return undef;
1844 ref($anchors) eq 'HASH' or return undef;
1845 my $ans = _FindFragmentMatchInternal($anchors, '#'._ToUTF8($frag), 1);
1846 $hadhash || !defined($ans) or $ans =~ s/^#//;
1847 defined($ans) and utf8::encode($ans);
1848 return $ans;
1852 # Return a suitably encoded <img...> tag string
1853 # On input NONE of $url, $alt or $title should be xmlencoded
1854 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1855 sub _MakeIMGTag {
1856 my ($url, $alt, $title, $iopts) = @_;
1857 defined($url) or $url="";
1858 defined($alt) or $alt="";
1859 defined($title) or $title="";
1860 ref($iopts) eq "HASH" or $iopts = {};
1861 return "" unless $url ne "";
1863 my ($w, $h, $lf, $rt) = (0, 0, '', '');
1864 ($alt, $title) = (_strip($alt), _strip($title));
1865 if ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
1866 ($title, $w, $h, $lf, $rt) = (_strip($1), $3, $4, $2, $5);
1867 } elsif ($title =~ /^(.*)\((<?)\?[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
1868 ($title, $h, $lf, $rt) = (_strip($1), $3, $2, $4);
1869 } elsif ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]\?(>?)\)$/os) {
1870 ($title, $w, $lf, $rt) = (_strip($1), $3, $2, $4);
1871 } elsif ($title =~ /^(.*)\((?!\))(<?)(>?)\)$/os) {
1872 ($title, $lf, $rt) = (_strip($1), $2, $3);
1874 $iopts->{align} = "center" if $lf && $rt;
1875 $iopts->{align} = "left" if $lf && !$rt;
1876 $iopts->{align} = "right" if !$lf && $rt;
1877 $iopts->{width} = $w if $w != 0;
1878 $iopts->{height} = $h if $h != 0;
1879 $iopts->{alt} = $alt if $alt ne "";
1880 $iopts->{title} = $title if $title ne "";
1881 my $iopt = sub { defined($iopts->{$_[0]}) ? $iopts->{$_[0]} : (@_ > 1 ? $_[1] : "") };
1882 my $result = '';
1883 $result .= $g_escape_table{'<'}."center".$g_escape_table{'>'}
1884 if &$iopt("align") eq "center";
1885 $result .= $g_escape_table{'<'}."img src=\"" . _EncodeAttText($url) . "\"";
1886 $result .= " align=\"left\"" if &$iopt("align") eq "left";
1887 $result .= " align=\"right\"" if &$iopt("align") eq "right";
1888 $result .= " alt=\"" . _EncodeAttText($iopts->{alt}) . "\"" if &$iopt("alt") ne "";
1889 $result .= " width=\"" . $iopts->{width} . "\"" if &$iopt("width",0) != 0;
1890 $result .= " height=\"" . $iopts->{height} . "\"" if &$iopt("height",0) != 0;
1891 $result .= " title=\"" . _EncodeAttText($iopts->{title}) . "\"" if &$iopt("title") ne "";
1892 $result .= " /" unless $opt{empty_element_suffix} eq ">";
1893 $result .= $g_escape_table{'>'};
1894 $result .= $g_escape_table{'<'}."/center".$g_escape_table{'>'}
1895 if &$iopt("align") eq "center";
1896 return $result;
1900 sub _DoImages {
1902 # Turn Markdown image shortcuts into <img> tags.
1904 my $text = shift;
1907 # First, handle reference-style labeled images: ![alt text][id]
1909 $text =~ s{
1910 ( # wrap whole match in $1
1912 ($g_nested_brackets) # alt text = $2
1915 [ ]? # one optional space
1916 (?:\n[ ]*)? # one optional newline followed by spaces
1919 ($g_nested_brackets) # id = $3
1924 my $result;
1925 my $whole_match = $1;
1926 my $alt_text = $2;
1927 my $link_id = $3;
1929 $link_id ne "" or $link_id = $alt_text; # for shortcut links like ![this][].
1930 $link_id = _strip(lc $link_id);
1932 if (defined $g_urls{$link_id}) {
1933 $result = _MakeIMGTag(
1934 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1936 else {
1937 # If there's no such link ID, leave intact:
1938 $result = $whole_match;
1941 $result;
1942 }xsge;
1945 # Next, handle inline images: ![alt text](url "optional title")
1946 # Don't forget: encode * and _
1948 $text =~ s{
1949 ( # wrap whole match in $1
1951 ($g_nested_brackets) # alt text = $2
1953 \( # literal paren
1954 ($g_nested_parens) # src and optional title = $3
1958 my $whole_match = $1;
1959 my $alt_text = $2;
1960 my ($url, $title) = _SplitUrlTitlePart($3, 1);
1961 defined($url) ? _MakeIMGTag(_PrefixURL($url), $alt_text, $title) : $whole_match;
1962 }xsge;
1965 # Finally, handle reference-style implicitly labeled links: ![alt text]
1967 $text =~ s{
1968 ( # wrap whole match in $1
1970 ($g_nested_brackets) # alt text = $2
1974 my $result;
1975 my $whole_match = $1;
1976 my $alt_text = $2;
1977 my $link_id = lc(_strip($alt_text));
1979 if (defined $g_urls{$link_id}) {
1980 $result = _MakeIMGTag(
1981 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1983 else {
1984 # If there's no such link ID, leave intact:
1985 $result = $whole_match;
1988 $result;
1989 }xsge;
1991 return $text;
1994 sub _EncodeAttText {
1995 my $text = shift;
1996 defined($text) or return undef;
1997 $text = escapeXML(_strip($text));
1998 # We've got to encode these to avoid conflicting
1999 # with italics, bold and strike through.
2000 $text =~ s!([*_~:])!$g_escape_table{$1}!g;
2001 return $text;
2005 sub _MakeAnchorId {
2006 use bytes;
2007 my ($link, $strip) = @_;
2008 $link = lc($link);
2009 if ($strip) {
2010 $link =~ s/\s+/_/gs;
2011 $link =~ tr/-a-z0-9_//cd;
2012 } else {
2013 $link =~ tr/-a-z0-9_/_/cs;
2015 return '' unless $link ne '';
2016 $link = "_".$link."_";
2017 $link =~ s/__+/_/gs;
2018 $link = "_".md5_hex($link)."_" if length($link) > 66;
2019 return $link;
2023 sub _GetNewAnchorId {
2024 my $link = _strip(lc(shift));
2025 return '' if $link eq "" || defined($g_anchors{$link});
2026 my $id = _MakeAnchorId($link);
2027 return '' unless $id;
2028 $g_anchors{$link} = '#'.$id;
2029 $g_anchors_id{$id} = $g_anchors{$link};
2030 if ($id =~ /-/) {
2031 my $id2 = $id;
2032 $id2 =~ s/-/_/gs;
2033 $id2 =~ s/__+/_/gs;
2034 defined($g_anchors_id{$id2}) or $g_anchors_id{$id2} = $g_anchors{$link};
2036 my $idd = _MakeAnchorId($link, 1);
2037 if ($idd) {
2038 defined($g_anchors_id{$idd}) or $g_anchors_id{$idd} = $g_anchors{$link};
2039 if ($idd =~ /-/) {
2040 my $idd2 = $idd;
2041 $idd2 =~ s/-/_/gs;
2042 $idd2 =~ s/__+/_/gs;
2043 defined($g_anchors_id{$idd2}) or $g_anchors_id{$idd2} = $g_anchors{$link};
2046 $id;
2050 sub _DoHeaders {
2051 my ($text, $anchors) = @_;
2052 my $h1;
2053 my $geth1 = $anchors && !defined($opt{h1}) ? sub {
2054 return unless !defined($h1);
2055 my $h = shift;
2056 $h1 = $h if $h ne "";
2057 } : sub {};
2059 # atx-style headers:
2060 # # Header 1
2061 # ## Header 2
2062 # ## Header 2 with closing hashes ##
2063 # ...
2064 # ###### Header 6
2066 $text =~ s{
2067 ^(\#{1,6}) # $1 = string of #'s
2068 [ ]*
2069 ((?:(?:(?<![#])[^\s]|[^#\s]).*?)?) # $2 = Header text
2070 [ ]*
2073 my $h_level = length($1);
2074 my $h = $2;
2075 $h =~ s/#+$//;
2076 $h =~ s/\s+$//;
2077 my $rsg = _RunSpanGamut($h);
2078 $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2079 my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2080 $id = " id=\"$id\"" if $id ne "";
2081 &$geth1($h) if $h_level == 1;
2082 "<h$h_level$id>" . _AutoHeaderFlag($h_level) . $rsg . "</h$h_level>\n\n";
2083 }egmx;
2085 # Setext-style headers:
2086 # Header 1
2087 # ========
2089 # Header 2
2090 # --------
2092 # Header 3
2093 # ~~~~~~~~
2095 $text =~ s{ ^(?:=+[ ]*\n)?[ ]*(.+?)[ ]*\n=+[ ]*\n+ }{
2096 my $h = $1;
2097 my $rsg = _RunSpanGamut($h);
2098 $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2099 my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2100 $id = " id=\"$id\"" if $id ne "";
2101 &$geth1($h);
2102 "<h1$id>" . _AutoHeaderFlag(1) . $rsg . "</h1>\n\n";
2103 }egmx;
2105 $text =~ s{ ^(?:-+[ ]*\n)?[ ]*(.+?)[ ]*\n-+[ ]*\n+ }{
2106 my $h = $1;
2107 my $rsg = _RunSpanGamut($h);
2108 $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2109 my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2110 $id = " id=\"$id\"" if $id ne "";
2111 "<h2$id>" . _AutoHeaderFlag(2) . $rsg . "</h2>\n\n";
2112 }egmx;
2114 $text =~ s{ ^(?:~+[ ]*\n)?[ ]*(.+?)[ ]*\n~+[ ]*\n+ }{
2115 my $h = $1;
2116 my $rsg = _RunSpanGamut($h);
2117 $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2118 my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2119 $id = " id=\"$id\"" if $id ne "";
2120 "<h3$id>" . _AutoHeaderFlag(3) . $rsg . "</h3>\n\n";
2121 }egmx;
2123 $opt{h1} = $h1 if defined($h1) && $h1 ne "";
2124 return $text;
2128 sub _AutoHeaderFlag {
2129 my $level = shift;
2130 my $auto = $opt{auto_number} || 0;
2131 return '' unless 1 <= $level && $level <= $auto;
2132 return "\34".chr(0x30+$level);
2136 sub _AutoHeaderNum {
2137 my $level = shift;
2138 my $auto = $opt{auto_number} || 0;
2139 return '' unless 1 <= $level && $level <= $auto;
2140 pop(@autonum) while @autonum > $level;
2141 push(@autonum, 1) while @autonum < $level - 1;
2142 $autonum[$level - 1] += 1;
2143 return join('.', @autonum).' ';
2147 my ($marker_ul, $marker_ol, $marker_any, $roman_numeral, $greek_lower);
2148 BEGIN {
2149 # Re-usable patterns to match list item bullets and number markers:
2150 $roman_numeral = qr/(?:
2151 [IiVvXx]|[Ii]{2,3}|[Ii][VvXx]|[VvXx][Ii]{1,3}|[Xx][Vv][Ii]{0,3}|
2152 [Xx][Ii][VvXx]|[Xx]{2}[Ii]{0,3}|[Xx]{2}[Ii]?[Vv]|[Xx]{2}[Vv][Ii]{1,2})/ox;
2153 $greek_lower = qr/(?:[\x{03b1}-\x{03c9}])/o;
2154 $marker_ul = qr/[*+-]/o;
2155 $marker_ol = qr/(?:\d+|[A-Za-z]|$roman_numeral|$greek_lower)[.\)]/o;
2156 $marker_any = qr/(?:$marker_ul|$marker_ol)/o;
2160 sub _GetListMarkerType {
2161 my ($list_type, $list_marker, $last_marker) = @_;
2162 return "" unless $list_type && $list_marker && lc($list_type) eq "ol";
2163 my $last_marker_type = '';
2164 $last_marker_type = _GetListMarkerType($list_type, $last_marker)
2165 if defined($last_marker) &&
2166 # these are roman unless $last_marker type case matches and is 'a' or 'A'
2167 $list_marker =~ /^[IiVvXx][.\)]?$/;
2168 return "I" if $list_marker =~ /^[IVX]/ && $last_marker_type ne 'A';
2169 return "i" if $list_marker =~ /^[ivx]/ && $last_marker_type ne 'a';
2170 return "A" if $list_marker =~ /^[A-Z]/;
2171 return "a" if $list_marker =~ /^[a-z]/ || $list_marker =~ /^$greek_lower/o;
2172 return "1";
2176 sub _GetListItemTypeClass {
2177 my ($list_type, $list_marker, $last_marker) = @_;
2178 my $list_marker_type = _GetListMarkerType($list_type, $list_marker, $last_marker);
2179 my $ans = &{sub{
2180 return "" unless length($list_marker) >= 2 && $list_marker_type =~ /^[IiAa1]$/;
2181 return "lower-greek" if $list_marker_type eq "a" && $list_marker =~ /^$greek_lower/o;
2182 return "" unless $list_marker =~ /\)$/;
2183 return "upper-roman" if $list_marker_type eq "I";
2184 return "lower-roman" if $list_marker_type eq "i";
2185 return "upper-alpha" if $list_marker_type eq "A";
2186 return "lower-alpha" if $list_marker_type eq "a";
2187 return "decimal";
2189 return ($list_marker_type, $ans);
2193 my %_roman_number_table;
2194 BEGIN {
2195 %_roman_number_table = (
2196 i => 1,
2197 ii => 2,
2198 iii => 3,
2199 iv => 4,
2200 v => 5,
2201 vi => 6,
2202 vii => 7,
2203 viii => 8,
2204 ix => 9,
2205 x => 10,
2206 xi => 11,
2207 xii => 12,
2208 xiii => 13,
2209 xiv => 14,
2210 xv => 15,
2211 xvi => 16,
2212 xvii => 17,
2213 xviii => 18,
2214 xix => 19,
2215 xx => 20,
2216 xxi => 21,
2217 xxii => 22,
2218 xxiii => 23,
2219 xxiv => 24,
2220 xxv => 25,
2221 xxvi => 26,
2222 xxvii => 27
2227 # Necessary because ς and σ are the same value grrr
2228 my %_greek_number_table;
2229 BEGIN {
2230 %_greek_number_table = (
2231 "\x{03b1}" => 1, # α
2232 "\x{03b2}" => 2, # β
2233 "\x{03b3}" => 3, # γ
2234 "\x{03b4}" => 4, # δ
2235 "\x{03b5}" => 5, # ε
2236 "\x{03b6}" => 6, # ζ
2237 "\x{03b7}" => 7, # η
2238 "\x{03b8}" => 8, # θ
2239 "\x{03b9}" => 9, # ι
2240 "\x{03ba}" => 10, # κ
2241 "\x{03bb}" => 11, # λ
2242 #"\x{00b5}"=> 12, # µ is "micro" not "mu"
2243 "\x{03bc}" => 12, # μ
2244 "\x{03bd}" => 13, # ν
2245 "\x{03be}" => 14, # ξ
2246 "\x{03bf}" => 15, # ο
2247 "\x{03c0}" => 16, # π
2248 "\x{03c1}" => 17, # ρ
2249 "\x{03c2}" => 18, # ς
2250 "\x{03c3}" => 18, # σ
2251 "\x{03c4}" => 19, # τ
2252 "\x{03c5}" => 20, # υ
2253 "\x{03c6}" => 21, # φ
2254 "\x{03c7}" => 22, # χ
2255 "\x{03c8}" => 23, # ψ
2256 "\x{03c9}" => 24 # ω
2261 sub _GetMarkerIntegerNum {
2262 my ($list_marker_type, $marker_val) = @_;
2263 my $ans = &{sub{
2264 return 0 + $marker_val if $list_marker_type eq "1";
2265 $list_marker_type = lc($list_marker_type);
2266 return $_greek_number_table{$marker_val}
2267 if $list_marker_type eq "a" &&
2268 defined($_greek_number_table{$marker_val});
2269 $marker_val = lc($marker_val);
2270 return ord($marker_val) - ord("a") + 1 if $list_marker_type eq "a";
2271 return 1 unless $list_marker_type eq "i";
2272 defined($_roman_number_table{$marker_val}) and
2273 return $_roman_number_table{$marker_val};
2274 return 1;
2276 return $ans if $ans == 0 && $list_marker_type eq "1";
2277 return $ans >= 1 ? $ans : 1;
2281 sub _IncrList {
2282 my ($from, $to, $extra) = @_;
2283 $extra = defined($extra) ? " $extra" : "";
2284 my $result = "";
2285 while ($from + 10 <= $to) {
2286 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-10\"></span>\n";
2287 $from += 10;
2289 while ($from + 5 <= $to) {
2290 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-5\"></span>\n";
2291 $from += 5;
2293 while ($from + 2 <= $to) {
2294 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-2\"></span>\n";
2295 $from += 2;
2297 while ($from < $to) {
2298 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr\"></span>\n";
2299 ++$from;
2301 return $result;
2305 sub _DoListsAndBlocks {
2307 # Form HTML ordered (numbered) and unordered (bulleted) lists.
2309 my $text = shift;
2310 my $indent = $opt{indent_width};
2311 my $less_than_indent = $indent - 1;
2312 my $less_than_double_indent = 2 * $indent - 1;
2314 # Re-usable pattern to match any entire ul or ol list:
2315 my $whole_list = qr{
2316 ( # $1 (or $_[0]) = whole list
2317 ( # $2 (or $_[1])
2318 (?:(?<=\n)|\A)
2319 [ ]{0,$less_than_indent}
2320 (${marker_any}) # $3 (or $_[2]) = first list item marker
2321 [ ]+
2323 (?s:.+?)
2324 ( # $4 (or $_[3])
2327 \n{2,}
2328 (?=\S)
2329 (?! # Negative lookahead for another list item marker
2330 ${marker_any}[ ]
2334 }mx;
2336 my $list_item_sub = sub {
2337 my $list = $_[0];
2338 my $list_type = ($_[2] =~ m/$marker_ul/) ? "ul" : "ol";
2339 my $list_att = "";
2340 my $list_class = "";
2341 my $list_incr = "";
2342 # Turn double returns into triple returns, so that we can make a
2343 # paragraph for the last item in a list, if necessary:
2344 $list =~ s/\n\n/\n\n\n/g;
2345 my ($result, $first_marker, $fancy) = _ProcessListItems($list_type, $list);
2346 defined($first_marker) or return $list;
2347 my $list_marker_type = _GetListMarkerType($list_type, $first_marker);
2348 if ($list_marker_type) {
2349 $first_marker =~ s/[.\)]$//;
2350 my $first_marker_num = _GetMarkerIntegerNum($list_marker_type, $first_marker);
2351 $list_att = $list_marker_type eq "1" ? "" : " type=\"$list_marker_type\"";
2352 if ($fancy) {
2353 $list_class = " class=\"$opt{style_prefix}ol\"";
2354 my $start = $first_marker_num;
2355 $start = 10 if $start > 10;
2356 $start = 5 if $start > 5 && $start < 10;
2357 $start = 1 if $start > 1 && $start < 5;
2358 $list_att .= " start=\"$start\"" unless $start == 1;
2359 $list_incr = _IncrList($start, $first_marker_num);
2360 } else {
2361 $list_class = " class=\"$opt{style_prefix}lc-greek\""
2362 if $list_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
2363 $list_att .= " start=\"$first_marker_num\"" unless $first_marker_num == 1;
2366 my $idt = "\027" x $g_list_level;
2367 $result = "$idt<$list_type$list_att$list_class>\n$list_incr" . $result . "$idt</$list_type>\n\n";
2368 $result;
2371 # We use a different prefix before nested lists than top-level lists.
2372 # See extended comment in _ProcessListItems().
2374 # Note: (jg) There's a bit of duplication here. My original implementation
2375 # created a scalar regex pattern as the conditional result of the test on
2376 # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
2377 # substitution once, using the scalar as the pattern. This worked,
2378 # everywhere except when running under MT on my hosting account at Pair
2379 # Networks. There, this caused all rebuilds to be killed by the reaper (or
2380 # perhaps they crashed, but that seems incredibly unlikely given that the
2381 # same script on the same server ran fine *except* under MT. I've spent
2382 # more time trying to figure out why this is happening than I'd like to
2383 # admit. My only guess, backed up by the fact that this workaround works,
2384 # is that Perl optimizes the substition when it can figure out that the
2385 # pattern will never change, and when this optimization isn't on, we run
2386 # afoul of the reaper. Thus, the slightly redundant code to that uses two
2387 # static s/// patterns rather than one conditional pattern.
2389 # Note: (kjm) With the addition of the two-of-the-same-kind-in-a-row-
2390 # starts-a-list-at-the-top-level rule the two patterns really are somewhat
2391 # different now, but the duplication has pretty much been eliminated via
2392 # use of a separate sub which has the side-effect of making the below
2393 # two cases much easier to grok all at once.
2395 if ($g_list_level) {
2396 my $parse = $text;
2397 $text = "";
2398 pos($parse) = 0;
2399 while ($parse =~ /\G(?s:.)*?^$whole_list/gmc) {
2400 my @captures = ($1, $2, $3, $4);
2401 if ($-[1] > $-[0]) {
2402 $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
2404 $text .= &$list_item_sub(@captures);
2406 $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
2408 else {
2409 my $parse = $text;
2410 $text = "";
2411 pos($parse) = 0;
2412 while ($parse =~ m{\G(?s:.)*?
2413 (?: (?<=\n\n) |
2414 \A\n? |
2415 (?<=:\n) |
2416 (?:(?<=\n) # a list starts with one unordered marker line
2417 (?=[ ]{0,$less_than_indent}$marker_ul[ ])) |
2418 (?:(?<=\n) # or two ordered marker lines in a row
2419 (?=[ ]{0,$less_than_indent}$marker_ol[ ].*\n\n?
2420 [ ]{0,$less_than_indent}$marker_ol[ ])) |
2421 (?:(?<=\n) # or any marker and a sublist marker
2422 (?=[ ]{0,$less_than_indent}$marker_any[ ].*\n\n?
2423 [ ]{$indent,$less_than_double_indent}$marker_any[ ]))
2425 $whole_list
2426 }gmcx) {
2427 my @captures = ($1, $2, $3, $4);
2428 if ($-[1] > $-[0]) {
2429 $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
2431 $text .= &$list_item_sub(@captures);
2433 $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
2436 return $text;
2440 sub _ProcessListItems {
2442 # Process the contents of a single ordered or unordered list, splitting it
2443 # into individual list items.
2446 my $list_type = shift;
2447 my $list_str = shift;
2449 # The $g_list_level global keeps track of when we're inside a list.
2450 # Each time we enter a list, we increment it; when we leave a list,
2451 # we decrement. If it's zero, we're not in a list anymore.
2453 # We do this because when we're not inside a list, we want to treat
2454 # something like this:
2456 # I recommend upgrading to version
2457 # 8. Oops, now this line is treated
2458 # as a sub-list.
2460 # As a single paragraph, despite the fact that the second line starts
2461 # with a digit-period-space sequence.
2463 # Whereas when we're inside a list (or sub-list), that line will be
2464 # treated as the start of a sub-list. What a kludge, huh? This is
2465 # an aspect of Markdown's syntax that's hard to parse perfectly
2466 # without resorting to mind-reading. Perhaps the solution is to
2467 # change the syntax rules such that sub-lists must start with a
2468 # starting cardinal number; e.g. "1." or "a.".
2470 $g_list_level++;
2471 my $idt = "\027" x $g_list_level;
2472 my $marker_kind = $list_type eq "ul" ? $marker_ul : $marker_ol;
2473 my $first_marker;
2474 my $first_marker_type;
2475 my $first_marker_num;
2476 my $last_marker;
2477 my $fancy;
2478 my $skipped;
2479 my $typechanged;
2480 my $next_num = 1;
2482 # trim trailing blank lines:
2483 $list_str =~ s/\n{2,}\z/\n/;
2485 my $result = "";
2486 my $oldpos = 0;
2487 pos($list_str) = 0;
2488 while ($list_str =~ m{\G # start where we left off
2489 (\n+)? # leading line = $1
2490 (^[ ]*) # leading whitespace = $2
2491 ($marker_any) [ ] ([ ]*) # list marker = $3 leading item space = $4
2492 }cgmx) {
2493 my $leading_line = $1;
2494 my $leading_space = $2;
2495 my $list_marker = $3;
2496 my $list_marker_len = length($list_marker);
2497 my $leading_item_space = $4;
2498 if ($-[0] > $oldpos) {
2499 $result .= substr($list_str, $oldpos, $-[0] - $oldpos); # Sort-of $`
2500 $oldpos = $-[0]; # point at start of this entire match
2502 if (!defined($first_marker)) {
2503 $first_marker = $list_marker;
2504 $first_marker_type = _GetListMarkerType($list_type, $first_marker);
2505 if ($first_marker_type) {
2506 (my $marker_val = $first_marker) =~ s/[.\)]$//;
2507 $first_marker_num = _GetMarkerIntegerNum($first_marker_type, $marker_val);
2508 $next_num = $first_marker_num;
2509 $skipped = 1 if $next_num != 1;
2511 } elsif ($list_marker !~ /$marker_kind/) {
2512 # Wrong marker kind, "fix up" the marker to a correct "lazy" marker
2513 # But keep the old length in $list_marker_len
2514 $list_marker = $last_marker;
2517 # Now grab the rest of this item's data upto but excluding the next
2518 # list marker at the SAME indent level, but sublists must be INCLUDED
2520 my $item = "";
2521 while ($list_str =~ m{\G
2522 ((?:.+?)(?:\n{1,2})) # list item text = $1
2523 (?= \n* (?: \z | # end of string OR
2524 (^[ ]*) # leading whitespace = $2
2525 ($marker_any) # next list marker = $3
2526 ([ ]+) )) # one or more spaces after marker = $4
2527 }cgmxs) {
2529 # If $3 has a left edge that is at the left edge of the previous
2530 # marker OR $3 has a right edge that is at the right edge of the
2531 # previous marker then we stop; otherwise we go on
2533 $item .= substr($list_str, $-[0], $+[0] - $-[0]); # $&
2534 last if !defined($4) || length($2) == length($leading_space) ||
2535 length($2) + length($3) == length($leading_space) + $list_marker_len;
2536 # move along, you're not the marker droid we're looking for...
2537 $item .= substr($list_str, $+[0], $+[4] - $+[0]);
2538 pos($list_str) = $+[4]; # ...move along over the marker droid
2540 # Remember where we parked
2541 $oldpos = pos($list_str);
2543 # Process the $list_marker $item
2545 my $liatt = '';
2546 my $checkbox = '';
2547 my $incr = '';
2549 if ($list_type eq "ul" && !$leading_item_space && $item =~ /^\[([ xX\xd7])\] +(.*)$/s) {
2550 my $checkmark = $1;
2551 $item = $2;
2552 my ($checkbox_class, $checkbox_val);
2553 if ($checkmark ne " ") {
2554 ($checkbox_class, $checkbox_val) = ("checkbox-on", "x");
2555 } else {
2556 ($checkbox_class, $checkbox_val) = ("checkbox-off", "&#160;");
2558 $liatt = " class=\"$opt{style_prefix}$checkbox_class\"";
2559 $checkbox = "<span><span></span></span><span></span><span>[<tt>$checkbox_val</tt>]&#160;</span>";
2560 } else {
2561 my $list_marker_type;
2562 ($list_marker_type, $liatt) = _GetListItemTypeClass($list_type, $list_marker, $last_marker);
2563 if ($list_type eq "ol" && defined($first_marker)) {
2564 my $styled = $fancy = 1 if $liatt && $list_marker =~ /\)$/;
2565 my ($sfx, $dash) = ("", "");
2566 ($sfx, $dash) = ("li", "-") if $styled;
2567 if ($liatt =~ /lower/) {
2568 $sfx .= "${dash}lc";
2569 } elsif ($liatt =~ /upper/) {
2570 $sfx .= "${dash}uc";
2572 $sfx .= "-greek" if $liatt =~ /greek/;
2573 $liatt = " class=\"$opt{style_prefix}$sfx\"" if $sfx;
2574 $typechanged = 1 if $list_marker_type ne $first_marker_type;
2575 (my $marker_val = $list_marker) =~ s/[.\)]$//;
2576 my $marker_num = _GetMarkerIntegerNum($list_marker_type, $marker_val);
2577 $marker_num = $next_num if $marker_num < $next_num;
2578 $skipped = 1 if $next_num < $marker_num;
2579 $incr = _IncrList($next_num, $marker_num, "incrlevel=$g_list_level");
2580 $liatt = " value=\"$marker_num\"$liatt" if $fancy || $skipped;
2581 $liatt = " type=\"$list_marker_type\"$liatt" if $styled || $typechanged;
2582 $next_num = $marker_num + 1;
2585 $last_marker = $list_marker;
2587 if ($leading_line or ($item =~ m/\n{2,}/)) {
2588 $item = _RunBlockGamut(_Outdent($item));
2589 $item =~ s{(</[OUou][Ll]>)\s*\z}{$1} and $item .= "\n$idt<span style=\"display:none\">&#160;</span>";
2591 else {
2592 # Recursion for sub-lists:
2593 $item = _DoListsAndBlocks(_Outdent($item));
2594 chomp $item;
2595 $item = _RunSpanGamut($item);
2598 # Append to $result
2599 $result .= "$incr$idt<li$liatt>" . $checkbox . $item . "$idt</li>\n";
2601 if ($fancy) {
2602 # remove "incrlevel=$g_list_level " parts
2603 $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr((?:-\d{1,2})?)">}
2604 {$idt<span class="$opt{style_prefix}ol-incr$1">}g;
2605 } else {
2606 # remove the $g_list_level incr spans entirely
2607 $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr(?:-\d{1,2})?"></span>\n}{}g;
2608 # remove the class="$opt{style_prefix}lc-greek" if first_marker is greek
2609 $result =~ s{(<li[^>]*?) class="$opt{style_prefix}lc-greek">}{$1>}g
2610 if defined($first_marker_type) && $first_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
2613 # Anything left over (similar to $') goes into result, but this should always be empty
2614 $result .= _RunBlockGamut(substr($list_str, pos($list_str))) if pos($list_str) < length($list_str);
2616 $g_list_level--;
2618 # After all that, if we only got an ordered list with a single item
2619 # and its first marker is a four-digit number >= 1492 and <= 2999
2620 # or an UPPERCASE letter, then pretend we didn't see any list at all.
2622 if ($first_marker_type && $first_marker_num + 1 == $next_num) {
2623 if (($first_marker_type eq "1" && $first_marker_num >= 1492 && $first_marker_num <= 2999) ||
2624 ($first_marker_type eq "A" && !$fancy)) {
2625 return (undef, undef, undef);
2629 return ($result, $first_marker, $fancy);
2633 sub _DoCodeBlocks {
2635 # Process Markdown `<pre><code>` blocks.
2638 my $text = shift;
2640 $text =~ s{
2641 (?:\n\n|\A\n?)
2642 ( # $1 = the code block -- one or more lines, starting with indent_width spaces
2644 (?:[ ]{$opt{indent_width}}) # Lines must start with indent_width of spaces
2645 .*\n+
2648 ((?=^[ ]{0,$opt{indent_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
2650 my $codeblock = $1;
2652 $codeblock =~ s/\n\n\n/\n\n/g; # undo "paragraph for last list item" change
2653 $codeblock = _EncodeCode(_Outdent($codeblock));
2654 $codeblock =~ s/\A\n+//; # trim leading newlines
2655 $codeblock =~ s/\s+\z//; # trim trailing whitespace
2657 my $result = "<div class=\"$opt{style_prefix}code\"><pre style=\"display:none\"></pre><pre><code>"
2658 . $codeblock . "\n</code></pre></div>";
2659 my $key = block_id($result, 2);
2660 $g_code_blocks{$key} = $result;
2661 "\n\n" . $key . "\n\n";
2662 }egmx;
2664 return $text;
2668 sub _DoCodeSpans {
2670 # * Backtick quotes are used for <code></code> spans.
2672 # * You can use multiple backticks as the delimiters if you want to
2673 # include literal backticks in the code span. So, this input:
2675 # Just type ``foo `bar` baz`` at the prompt.
2677 # Will translate to:
2679 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
2681 # There's no arbitrary limit to the number of backticks you
2682 # can use as delimters. If you need three consecutive backticks
2683 # in your code, use four for delimiters, etc.
2685 # * You can use spaces to get literal backticks at the edges:
2687 # ... type `` `bar` `` ...
2689 # Turns to:
2691 # ... type <code>`bar`</code> ...
2694 my $text = shift;
2696 $text =~ s@
2697 (`+) # $1 = Opening run of `
2698 (.+?) # $2 = The code block
2699 (?<!`)
2700 \1 # Matching closer
2701 (?!`)
2703 my $c = "$2";
2704 $c =~ s/^[ ]+//g; # leading whitespace
2705 $c =~ s/[ ]+$//g; # trailing whitespace
2706 $c = _EncodeCode($c);
2707 "<code>$c</code>";
2708 @egsx;
2710 return $text;
2714 sub _EncodeCode {
2716 # Encode/escape certain characters inside Markdown code runs.
2717 # The point is that in code, these characters are literals,
2718 # and lose their special Markdown meanings.
2720 local $_ = shift;
2722 # Encode all ampersands; HTML entities are not
2723 # entities within a Markdown code span.
2724 s/&/&amp;/g;
2726 # Encode $'s, but only if we're running under Blosxom.
2727 # (Blosxom interpolates Perl variables in article bodies.)
2728 s/\$/&#036;/g if $_haveBX;
2730 # Do the angle bracket song and dance:
2731 s! < !&lt;!gx;
2732 s! > !&gt;!gx;
2734 # Now, escape characters that are magic in Markdown:
2735 s!([*_~{}\[\]\\])!$g_escape_table{$1}!g;
2737 return $_;
2741 sub _DoItalicsAndBoldAndStrike {
2742 my $text = shift;
2744 my $doital1 = sub {
2745 my $text = shift;
2746 $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
2747 {<em>$1</em>}gsx;
2748 # We've got to encode any of these remaining to
2749 # avoid conflicting with other italics and bold.
2750 $text =~ s!([*])!$g_escape_table{$1}!g;
2751 $text;
2753 my $doital2 = sub {
2754 my $text = shift;
2755 $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
2756 {<em>$1</em>}gsx;
2757 # We've got to encode any of these remaining to
2758 # avoid conflicting with other italics and bold.
2759 $text =~ s!([_])!$g_escape_table{$1}!g;
2760 $text;
2763 # <strong> must go first:
2764 $text =~ s{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }
2765 {"<strong>".&$doital1($1)."</strong>"}gsex;
2766 $text =~ s{ (?<!\w) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\w) }
2767 {"<strong>".&$doital2($1)."</strong>"}gsex;
2769 $text =~ s{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ }
2770 {<strike>$1</strike>}gsx;
2772 $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
2773 {<em>$1</em>}gsx;
2774 $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
2775 {<em>$1</em>}gsx;
2777 return $text;
2781 sub _DoBlockQuotes {
2782 my $text = shift;
2784 $text =~ s{
2785 ( # Wrap whole match in $1
2787 ^[ ]*>[ ]? # '>' at the start of a line
2788 .*\n # rest of the first line
2789 (.+\n)* # subsequent consecutive lines
2790 \n* # blanks
2794 my $bq = $1;
2795 $bq =~ s/^[ ]*>[ ]?//gm; # trim one level of quoting
2796 $bq =~ s/^[ ]+$//mg; # trim whitespace-only lines
2797 $bq = _RunBlockGamut($bq); # recurse
2799 $bq =~ s/^/\027/mg;
2800 "<blockquote>\n$bq\n</blockquote>\n\n";
2801 }egmx;
2804 return $text;
2808 my ($LEAD, $TRAIL, $LEADBAR, $LEADSP, $COLPL, $SEP);
2809 BEGIN {
2810 $LEAD = qr/(?>[ ]*(?:\|[ ]*)?)/o;
2811 $TRAIL = qr/[ ]*(?<!\\)\|[ ]*/o;
2812 $LEADBAR = qr/(?>[ ]*\|[ ]*)/o;
2813 $LEADSP = qr/(?>[ ]*)/o;
2814 $COLPL = qr/(?:[^\n|\\]|\\(?:(?>[^\n])|(?=\n|$)))+/o;
2815 $SEP = qr/[ ]*:?-+:?[ ]*/o;
2818 sub _DoTables {
2819 my $text = shift;
2821 $text =~ s{
2822 ( # Wrap whole thing to avoid $&
2823 (?: (?<=\n\n) | \A\n? ) # Preceded by blank line or beginning of string
2824 ^( # Header line
2825 $LEADBAR \| [^\n]* |
2826 $LEADBAR $COLPL [^\n]* |
2827 $LEADSP $COLPL \| [^\n]*
2829 ( # Separator line
2830 $LEADBAR $SEP (?: \| $SEP )* (?: \| [ ]*)? |
2831 $SEP (?: \| $SEP )+ (?: \| [ ]*)? |
2832 $SEP \| [ ]*
2834 ((?: # Rows (0+)
2835 $LEADBAR \| [^\n]* \n |
2836 $LEADBAR $COLPL [^\n]* \n |
2837 $LEADSP $COLPL \| [^\n]* \n
2841 my ($w, $h, $s, $rows) = ($1, $2, $3, $4);
2842 my @heads = _SplitTableRow($h);
2843 my @seps = _SplitTableRow($s);
2844 if (@heads == @seps) {
2845 my @align = map {
2846 if (/^:-+:$/) {" align=\"center\""}
2847 elsif (/^:/) {" align=\"left\""}
2848 elsif (/:$/) {" align=\"right\""}
2849 else {""}
2850 } @seps;
2851 my $nohdr = "";
2852 $nohdr = " $opt{style_prefix}table-nohdr" if join("", @heads) eq "";
2853 my $tab ="\n<table border=\"1\" cellspacing=\"0\" cellpadding=\"2\" class=\"$opt{style_prefix}table$nohdr\">\n";
2854 $tab .=
2855 " <tr class=\"$opt{style_prefix}row-hdr\">" . _MakeTableRow("th", \@align, @heads) . "</tr>\n"
2856 unless $nohdr;
2857 my $cnt = 0;
2858 my @classes = ("class=\"$opt{style_prefix}row-even\"", "class=\"$opt{style_prefix}row-odd\"");
2859 $tab .= " <tr " . $classes[++$cnt % 2] . ">" . _MakeTableRow("td", \@align, @$_) . "</tr>\n"
2860 foreach (_SplitMergeRows($rows));
2861 $tab .= "</table>\n\n";
2862 } else {
2865 }egmx;
2867 return $text;
2871 sub _SplitMergeRows {
2872 my @rows = ();
2873 my ($mergeprev, $mergenext) = (0,0);
2874 foreach (split(/\n/, $_[0])) {
2875 $mergeprev = $mergenext;
2876 $mergenext = 0;
2877 my @cols = _SplitTableRow($_);
2878 if (_endswithbareslash($cols[$#cols])) {
2879 my $last = $cols[$#cols];
2880 substr($last, -1, 1) = "";
2881 $last =~ s/[ ]+$//;
2882 $cols[$#cols] = $last;
2883 $mergenext = 1;
2885 if ($mergeprev) {
2886 for (my $i = 0; $i <= $#cols; ++$i) {
2887 my $cell = $rows[$#rows]->[$i];
2888 defined($cell) or $cell = "";
2889 $rows[$#rows]->[$i] = _MergeCells($cell, $cols[$i]);
2891 } else {
2892 push(@rows, [@cols]);
2895 return @rows;
2899 sub _endswithbareslash {
2900 return 0 unless substr($_[0], -1, 1) eq "\\";
2901 my @parts = split(/\\\\/, $_[0], -1);
2902 return substr($parts[$#parts], -1, 1) eq "\\";
2906 sub _MergeCells {
2907 my ($c1, $c2) = @_;
2908 return $c1 if $c2 eq "";
2909 return $c2 if $c1 eq "";
2910 return $c1 . " " . $c2;
2914 sub _SplitTableRow {
2915 my $row = shift;
2916 $row =~ s/^$LEAD//;
2917 $row =~ s/$TRAIL$//;
2918 $row =~ s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
2919 $row =~ s!\\\|!$g_escape_table{'|'}!go; # Then do \|
2920 my @elems = map {
2921 s!$g_escape_table{'|'}!|!go;
2922 s!$g_escape_table{'\\'}!\\\\!go;
2923 s/^[ ]+//;
2924 s/[ ]+$//;
2926 } split(/[ ]*\|[ ]*/, $row, -1);
2927 @elems or push(@elems, "");
2928 return @elems;
2932 sub _MakeTableRow {
2933 my $etype = shift;
2934 my $align = shift;
2935 my $row = "";
2936 for (my $i = 0; $i < @$align; ++$i) {
2937 my $data = $_[$i];
2938 defined($data) or $data = "";
2939 $row .= "<" . $etype . $$align[$i] . ">" .
2940 _RunSpanGamut($data) . "</" . $etype . ">";
2942 return $row;
2946 sub _FormParagraphs {
2948 # Params:
2949 # $text - string to process with html <p> tags
2951 my ($text, $anchors) = @_;
2953 # Strip leading and trailing lines:
2954 $text =~ s/\A\n+//;
2955 $text =~ s/\n+\z//;
2957 my @grafs = split(/\n{2,}/, $text);
2960 # Wrap <p> tags.
2962 foreach (@grafs) {
2963 unless (defined($g_html_blocks{$_}) || defined($g_code_blocks{$_})) {
2964 $_ = _RunSpanGamut($_);
2965 s/^([ ]*)/$g_start_p/;
2966 $_ .= $g_close_p;
2971 # Strip standalone XML comments if requested
2973 if ($anchors && $opt{stripcomments} && @g_xml_comments) {
2974 my %xml_comment = ();
2975 $xml_comment{$_} = 1 foreach @g_xml_comments;
2976 my @grafs2 = ();
2977 do { push(@grafs2, $_) unless $xml_comment{$_} } foreach @grafs;
2978 @grafs = @grafs2;
2982 # Unhashify HTML blocks
2984 foreach (@grafs) {
2985 if (defined( $g_html_blocks{$_} )) {
2986 $_ = $g_html_blocks{$_};
2990 return join "\n\n", @grafs;
2994 # %ok_tag_name declared previously
2995 my $g_possible_tag_name;
2996 BEGIN {
2997 # note: length("blockquote") == 10
2998 $g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6]|\020)/o;
2999 %ok_tag_name = map({$_ => 1} "\20", qw(
3000 a abbr acronym address area
3001 b basefont bdo big blockquote br
3002 caption center cite code col colgroup
3003 dd del dfn div dl dt
3005 font
3006 h1 h2 h3 h4 h5 h6 hr
3007 i img ins
3012 p pre
3014 s samp small span strike strong sub sup
3015 table tbody td tfoot th thead tr tt
3016 u ul
3019 $ok_tag_name{$_} = 0 foreach (qw(
3020 dir menu
3025 sub _SetAllowedTag {
3026 my ($tag, $forbid) = @_;
3027 $ok_tag_name{$tag} = $forbid ? 0 : 1
3028 if defined($tag) && exists($ok_tag_name{$tag});
3032 # Encode leading '<' of any non-tags
3033 # However, "<?", "<!" and "<$" are passed through (legacy on that "<$" thing)
3034 sub _DoTag {
3035 my $tag = shift;
3036 return $tag if $tag =~ /^<[?\$!]/;
3037 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3038 $ok_tag_name{lc($1)}) {
3040 return _ProcessURLTag("href", $tag, 1) if $tag =~ /^<a\s/i;
3041 return _ProcessURLTag("src", $tag) if $tag =~ /^<img\s/i;
3042 return $tag;
3044 $tag =~ s/^</&lt;/;
3045 return $tag;
3048 # Strip out all tags that _DoTag would match
3049 sub _StripTags {
3050 my $text = shift;
3051 my $_StripTag = sub {
3052 my $tag = shift;
3053 return $tag if $tag =~ /^<[?\$!]/;
3054 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3055 $ok_tag_name{lc($1)}) {
3057 return ""; # strip it out
3059 return $tag;
3061 $text =~ s{(<[^>]*>)}{&$_StripTag($1)}ige;
3062 return $text;
3065 my %univatt; # universally allowed attribute names
3066 my %tagatt; # per-element allowed attribute names
3067 my %tagmt; # empty element tags
3068 my %tagocl; # non-empty elements with optional closing tag
3069 my %tagacl; # which %tagocl an opening %tagocl will close
3070 my %tagblk; # block elements
3071 my %taginl; # inline markup tags which trigger an auto <p> reopen
3072 my %taga1p; # open tags which require at least one attribute
3073 my %lcattval; # names of attribute values to lowercase
3074 my %impatt; # names of "implied" attributes
3075 BEGIN {
3076 %univatt = map({$_ => 1} qw(class dir id lang style title xml:lang));
3077 %tagatt = (
3078 'a' => { map({$_ => 1} qw(href name)) },
3079 'area' => { map({$_ => 1} qw(alt coords href nohref shape)) },
3080 'basefont' => { map({$_ => 1} qw(color face size)) },
3081 'br' => { map({$_ => 1} qw(clear)) },
3082 'caption' => { map({$_ => 1} qw(align)) },
3083 'col' => { map({$_ => 1} qw(align span width valign)) },
3084 'colgroup' => { map({$_ => 1} qw(align span width valign)) },
3085 'dir' => { map({$_ => 1} qw(compact)) },
3086 'div' => { map({$_ => 1} qw(align)) },
3087 'dl' => { map({$_ => 1} qw(compact)) },
3088 'font' => { map({$_ => 1} qw(color face size)) },
3089 'h1' => { map({$_ => 1} qw(align)) },
3090 'h2' => { map({$_ => 1} qw(align)) },
3091 'h3' => { map({$_ => 1} qw(align)) },
3092 'h4' => { map({$_ => 1} qw(align)) },
3093 'h5' => { map({$_ => 1} qw(align)) },
3094 'h6' => { map({$_ => 1} qw(align)) },
3095 'hr' => { map({$_ => 1} qw(align noshade size width)) },
3096 # NO server-side image maps, therefore NOT ismap !
3097 'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) },
3098 'li' => { map({$_ => 1} qw(compact type value)) },
3099 'map' => { map({$_ => 1} qw(name)) },
3100 'menu' => { map({$_ => 1} qw(compact)) },
3101 'ol' => { map({$_ => 1} qw(compact start type)) },
3102 'p' => { map({$_ => 1} qw(align)) },
3103 'pre' => { map({$_ => 1} qw(width)) },
3104 'table' => { map({$_ => 1} qw(align border cellpadding cellspacing summary width)) },
3105 'tbody' => { map({$_ => 1} qw(align valign)) },
3106 'tfoot' => { map({$_ => 1} qw(align valign)) },
3107 'thead' => { map({$_ => 1} qw(align valign)) },
3108 'td' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) },
3109 'th' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) },
3110 'tr' => { map({$_ => 1} qw(align valign)) },
3111 'ul' => { map({$_ => 1} qw(compact type)) }
3113 %tagmt = map({$_ => 1} qw(area basefont br col hr img));
3114 %tagocl = map({$_ => 1} qw(colgroup dd dt li p tbody td tfoot th thead tr));
3115 %tagacl = (
3116 'colgroup' => \%tagocl,
3117 'dd' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3118 'dt' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3119 'li' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3120 'tbody' => \%tagocl,
3121 'td' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3122 'tfoot' => \%tagocl,
3123 'th' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3124 'thead' => \%tagocl,
3125 'tr' => { map({$_ => 1} qw(colgroup dd dt li p td th tr)) },
3127 %tagblk = map({$_ => 1} qw(address blockquote center div dl h1 h2 h3 h4 h5 h6 hr ol p pre table ul));
3128 %taginl = map({$_ => 1} qw(a abbr acronym b basefont bdo big br cite code dfn em font i
3129 img kbd map q s samp small span strike strong sub sup tt u var));
3130 %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap));
3131 %lcattval = map({$_ => 1} qw(
3132 align border cellpadding cellspacing checked clear color colspan
3133 compact coords height hspace ismap nohref noshade nowrap rowspan size
3134 span shape valign vspace width
3136 %taga1p = map({$_ => 1} qw(a area bdo img map));
3140 # _SanitizeTags
3142 # Inspect all '<'...'>' tags in the input and HTML encode those things
3143 # that cannot possibly be tags and at the same time sanitize them.
3145 # $1 => text to process
3146 # <= sanitized text
3147 sub _SanitizeTags {
3148 my ($text, $validate, $htmlauto) = @_;
3149 $text =~ s/\s+$//;
3150 $text ne "" or return "";
3151 my @stack = ();
3152 my $ans = "";
3153 my $end = length($text);
3154 pos($text) = 0;
3155 my ($autoclose, $autoclopen);
3156 my $lastmt = "";
3157 my $reopenp = 0;
3158 $autoclose = $htmlauto ? sub {
3159 my $s = $_[0] || "";
3160 while (@stack &&
3161 ($stack[$#stack]->[0] ne $s || $_[1] && !$stack[$#stack]->[2]) &&
3162 $tagocl{$stack[$#stack]->[0]}) {
3163 $ans .= "</" . $stack[$#stack]->[0] . ">";
3164 pop(@stack);
3166 } : sub {} if $validate;
3167 $autoclopen = $htmlauto ? sub {
3168 my $s = $_[0] || "";
3169 my $c;
3170 if ($tagblk{$s}) {$c = {p=>1}}
3171 elsif ($tagocl{$s}) {$c = $tagacl{$s}}
3172 else {return}
3173 my $clp = 0;
3174 while (@stack && $c->{$stack[$#stack]->[0]}) {
3175 $clp = 0;
3176 if ($stack[$#stack]->[2] &&
3177 $stack[$#stack]->[1]+3 eq $_[1]) {
3178 $ans .= "</\20>";
3179 } else {
3180 $ans .= "</" . $stack[$#stack]->[0] . ">";
3182 if ($stack[$#stack]->[2]) {
3183 $stack[$#stack]->[0] = "\20";
3184 } else {
3185 $clp = $s ne "p" && $stack[$#stack]->[0] eq "p";
3186 pop(@stack);
3189 $clp;
3190 } : sub {} if $validate;
3191 while (pos($text) < $end) {
3192 if ($text =~ /\G(\s+)/gc) {
3193 $ans .= $1;
3194 next;
3196 if ($text =~ /\G([^<]+)/gc) {
3197 if ($validate && @stack && $stack[$#stack]->[0] eq "\20") {
3198 push(@stack,["p",pos($text)-length($1)]);
3199 $reopenp = 0;
3200 $ans .= "<p>";
3202 $reopenp && do {
3203 push(@stack,["p",pos($text)-length($1)]);
3204 $reopenp = 0;
3205 $ans .= "<p>";
3207 $ans .= _EncodeAmps($1);
3208 $lastmt = "";
3209 next;
3211 my $tstart = pos($text);
3212 if ($text =~ /\G(<!--(?:[^-]|(?:-(?!-)))*-->)/gc) {
3213 # pass "comments" through unless stripping them
3214 if ($opt{stripcomments}) {
3215 # strip any trailing whitespace + \n after comment if present
3216 $text =~ /\G[ \t]*\n/gc;
3217 } else {
3218 # pass the "comment" on through
3219 $ans .= $1;
3221 next;
3223 if ($text =~ /\G(<[^>]*>)/gc) {
3224 my $tag = $1;
3225 my $tt;
3226 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} ||
3227 $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3228 $ok_tag_name{$tt=lc($1)})
3230 my ($stag, $styp, $autocloseflag) = _Sanitize($tag);
3231 if ($styp == 2 && $lastmt eq $tt) {
3232 $lastmt = "";
3233 next;
3235 $lastmt = $styp == -3 ? $tt : "";
3236 $tt = "p" if $autocloseflag;
3237 if ($validate && $styp) {
3238 my $clp = &$autoclopen($tt, $tstart) if $styp != 2;
3239 if ($styp == 1) {
3240 $reopenp && $taginl{$tt} and do {
3241 push(@stack,["p",$tstart]);
3242 $ans .= "<p>";
3244 push(@stack,[$tt,$tstart,$autocloseflag,$clp]);
3245 $reopenp = 0;
3246 } elsif ($styp == 2) {
3247 $reopenp && ($tt eq "p" || $tt eq "\20") and do {
3248 $reopenp = 0;
3249 next;
3251 &$autoclose($tt, $autocloseflag);
3252 my $mtstkchk = sub {
3253 !@stack and _xmlfail("closing tag $tt without matching open at " .
3254 _linecol($tstart, $text));
3256 &$mtstkchk;
3257 if ($autocloseflag && $stack[$#stack]->[0] eq "\20") {
3258 pop(@stack);
3259 $stag = "";
3260 } elsif ($stack[$#stack]->[0] eq $tt) {
3261 $stack[$#stack]->[3] and $reopenp = 1;
3262 pop(@stack);
3263 } else {
3264 pop(@stack) while @stack && $stack[$#stack]->[0] eq "\20";
3265 &$mtstkchk;
3266 my @i = @{$stack[$#stack]};
3267 _xmlfail("opening tag $i[0] at " . _linecol($i[1], $text) .
3268 " mismatch with closing tag $tt at " . _linecol($tstart, $text));
3272 $ans .= $stag;
3273 next;
3274 } else {
3275 $tag =~ s/^</&lt;/;
3276 $ans .= _EncodeAmps($tag);
3277 $lastmt = "";
3278 next;
3281 # can only get here if "\G" char is an unmatched "<"
3282 pos($text) += 1;
3283 $ans .= "&lt;";
3284 $lastmt = "";
3286 &$autoclose if $validate;
3287 if ($validate && @stack) {
3288 my @errs;
3289 my $j;
3290 for ($j = 0; $j <= $#stack; ++$j) {
3291 my @i = @{$stack[$j]};
3292 next if $i[0] eq "\20";
3293 unshift(@errs, "opening tag $i[0] without matching close at " .
3294 _linecol($i[1], $text));
3296 _xmlfail(@errs) unless !@errs;
3298 # Remove any unwanted extra leading <p></p> sections
3299 $ans =~ s{<p></\020>}{}gs if $validate;
3301 return $ans."\n";
3305 sub _linecol {
3306 my ($pos, $txt) = @_;
3307 pos($txt) = 0;
3308 my ($l, $p);
3309 $l = 1;
3310 ++$l while ($p = pos($txt)), $txt =~ /\G[^\n]*\n/gc && pos($txt) <= $pos;
3311 return "line $l col " . (1 + ($pos - $p));
3315 sub _xmlfail {
3316 die join("", map("$_\n", @_));
3320 sub _Sanitize {
3321 my $tag = shift;
3322 my $seenatt = {};
3323 if ($tag =~ m{^</}) {
3324 my $autocloseflag = undef;
3325 $autocloseflag = 1, $tag="</p>" if $tag eq "</\20>";
3326 return (lc($tag),2,$autocloseflag);
3328 if ($tag =~ /^<([^\s<\/>]+)\s+/gs) {
3329 my $tt = lc($1);
3330 my $autocloseflag = undef;
3331 $autocloseflag = 1, $tt="p" if $tt eq "\20";
3332 my $out = "<" . $tt . " ";
3333 my $ok = $tagatt{$tt};
3334 ref($ok) eq "HASH" or $ok = {};
3335 my $atc = 0;
3336 while ($tag =~ /\G\s*([^\s\042\047<\/>=]+)((?>=)|\s*)/gcs) {
3337 my ($a,$s) = ($1, $2);
3338 if ($s eq "" && substr($tag, pos($tag), 1) =~ /^[\042\047]/) {
3339 # pretend the "=" sign wasn't overlooked
3340 $s = "=";
3342 if (substr($s,0,1) ne "=") {
3343 # it's one of "those" attributes (e.g. compact) or not
3344 # _SanitizeAtt will fix it up if it is
3345 $out .= _SanitizeAtt($a, '""', $ok, $seenatt, $tt);
3346 ++$atc;
3347 next;
3349 if ($tag =~ /\G([\042\047])((?:(?!\1)(?!<).)*)\1\s*/gcs) {
3350 $out .= _SanitizeAtt($a, $1.$2.$1, $ok, $seenatt, $tt);
3351 ++$atc;
3352 next;
3354 if ($tag =~ /\G([\042\047])((?:(?!\1)(?![<>])(?![\/][>]).)*)/gcs) {
3355 # what to do what to do what to do
3356 # trim trailing \s+ and magically add the missing quote
3357 my ($q, $v) = ($1, $2);
3358 $v =~ s/\s+$//;
3359 $out .= _SanitizeAtt($a, $q.$v.$q, $ok, $seenatt, $tt);
3360 ++$atc;
3361 next;
3363 if ($tag =~ /\G([^\s<\/>]+)\s*/gcs) {
3364 # auto quote it
3365 my $v = $1;
3366 $v =~ s/\042/&quot;/go;
3367 $out .= _SanitizeAtt($a, '"'.$v.'"', $ok, $seenatt, $tt);
3368 ++$atc;
3369 next;
3371 # give it an empty value
3372 $out .= _SanitizeAtt($a, '""', $ok, $seenatt, $tt);
3373 ++$atc;
3375 my $sfx = substr($tag, pos($tag));
3376 $out =~ s/\s+$//;
3377 my $typ = 1;
3378 if ($tagmt{$tt}) {
3379 $typ = ($tag =~ m,/>$,) ? 3 : -3;
3380 $out .= $opt{empty_element_suffix};
3381 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3382 } else {
3383 if ($tag =~ m,/>$,) {
3384 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3385 $typ = 3;
3386 } else {
3387 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3389 $out .= ">";
3390 $out .= "</$tt>" if $typ == 3;
3392 return ($out,$typ,$autocloseflag);
3393 } elsif ($tag =~ /^<([^\s<\/>]+)/s) {
3394 my $tt = lc($1);
3395 return ("&lt;" . substr($tag,1), 0) if $taga1p{$tt};
3396 if ($tagmt{$tt}) {
3397 return ("<" . $tt . $opt{empty_element_suffix}, 3);
3398 } elsif ($tag =~ m,/>$,) {
3399 return ("<" . $tt . "></" . $tt . ">", 3);
3400 } else {
3401 return ("<" . $tt . ">", 1) unless $tt eq "\20";
3402 return ("<p>", 1, 1);
3405 return (lc($tag),0);
3409 sub _SanitizeAtt {
3410 my $att = lc($_[0]);
3411 return "" unless $att =~ /^[_a-z:][_a-z:0-9.-]*$/; # no weirdo char att names
3412 return "" unless $univatt{$att} || $_[2]->{$att};
3413 return "" if $_[3]->{$att}; # no repeats
3414 $_[3]->{$att} = 1;
3415 $impatt{$att} and return $att."=".'"'.$att.'"';
3416 if ($lcattval{$att}) {
3417 return $att."="._SanitizeAttValue(lc($_[1]))." ";
3418 } else {
3419 my $satt = _SanitizeAttValue($_[1]);
3420 if (ref($opt{urlfunc}) eq 'CODE' &&
3421 (($_[4] eq "a" && $att eq "href") ||
3422 ($_[4] eq "img" && $att eq "src")) ) {
3423 my ($lq,$v,$rq);
3424 $lq = substr($satt, 0, 1);
3425 $rq = substr($satt, -1, 1);
3426 $v = unescapeXML(substr($satt, 1, length($satt)-2));
3427 my ($uhost, $upath, $uq, $uf) = SplitURL($v);
3428 $v = &{$opt{urlfunc}}($v, \%opt, $_[4], $uhost, $upath, $uq, $uf);
3429 $satt = $lq . escapeXML($v) . $rq;
3431 return $att."=".$satt." ";
3436 sub _SanitizeAttValue {
3437 my $v = shift;
3438 if ($v =~ /^([\042\047])(.*?)\1$/) {
3439 return $1.escapeXML($2).$1;
3440 } else {
3441 return '"'.escapeXML($v).'"';
3446 sub _ProcessURLTag {
3447 my ($att, $tag, $dofrag) = @_;
3449 $att = lc($att) . "=";
3450 if ($tag =~ /^(<[^\s>]+\s+)/g) {
3451 my $out = $1;
3452 while ($tag =~ /\G([^\s\042\047<\/>=]+=)([\042\047])((?:(?!\2)(?!<).)*)(\2\s*)/gcs) {
3453 my ($p, $q, $v, $s) = ($1, $2, $3, $4);
3454 if (lc($p) eq $att && $v ne "") {
3455 if ($dofrag && $v =~ m"^#") {
3456 $v = _FindFragmentMatch($v);
3457 my $bpcr;
3458 if (ref($bpcr = $opt{base_prefix}) eq 'CODE') {
3459 $v = "\2\3" . &$bpcr($v);
3461 } else {
3462 $v = _PrefixURL($v);
3464 $v = _EncodeAttText($v);
3466 $out .= $p . $q . $v . $s;
3468 $out .= substr($tag, pos($tag));
3469 substr($out,0,1) = $g_escape_table{'<'};
3470 substr($out,-1,1) = $g_escape_table{'>'};
3471 return $out;
3474 return $tag;
3478 my $oops_entities;
3479 BEGIN { $oops_entities = qr/(?:lt|gt|amp|quot|apos|nbsp)/io; }
3482 # $_[0] => the value to XML escape
3483 # returns the XML escaped value
3484 # Encodes the five required entites (amp,lt,gt,quot,apos)
3485 # while preserving any pre-existing entities which means that
3486 # calling this repeatedly on already-escaped text should return
3487 # it unchanged (i.e. it's idempotent).
3489 sub escapeXML {
3490 my $text = shift;
3492 # Treat these accidents as though they had the needed ';'
3493 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3495 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3496 # http://bumppo.net/projects/amputator/
3497 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
3499 # Remaining entities now
3500 $text =~ s/\042/&quot;/g;
3501 $text =~ s/\047/&#39;/g; # Some older browsers do not grok &apos;
3502 $text =~ s/</&lt;/g;
3503 $text =~ s/>/&gt;/g;
3505 return $text;
3509 # $_[0] => value to be unescaped
3510 # returns unescaped value
3511 # The five required XML entities (amp,lt,gt,quot,apos) plus nbsp
3512 # are decoded as well as decimal (#d+) and hexadecimal (#xh+).
3514 # While the escapeXML function tries to be idempotent when presented
3515 # with an already-escaped string, this function is NOT necessarily
3516 # idempotent when presented with an already decoded string unless it's
3517 # been decoded to the point there are no more recognizable entities left.
3518 # In other words given a string such as:
3520 # &amp;amp;amp;amp;
3522 # Each call will only decode one layer of escaping and it will take four
3523 # successive calls to finally end up with just "&".
3525 sub unescapeXML {
3526 my $text = shift;
3528 # Treat these accidents as though they had the needed ';'
3529 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3531 $text =~ s/&[qQ][uU][oO][tT];/\042/gso;
3532 $text =~ s/&[aA][pP][oO][sS];/\047/gso;
3533 $text =~ s/&[gG][tT];/>/gso;
3534 $text =~ s/&[lL][tT];/</gso;
3535 $text =~ s/&[nN][bB][sS][pP];/&#160;/gso;
3536 $text =~ s{&([aA][mM][pP]|\#\d+|\#x[0-9a-fA-F]+);}{
3537 local $_=$1;
3538 lc($_) eq 'amp' ? '&' :
3539 /^#(\d+)$/ ? chr($1) :
3540 /^#[xX](.*)$/ ? chr(hex($1)) :
3542 }gsex;
3544 return $text;
3548 # $_[0] => the input URL to split
3549 # $_[1] => if a true value, call unescapeXML before splitting
3550 # returns array:
3551 # [0] => scheme, name+password, host, port ("" if not present)
3552 # [1] => path in url (starts with "/" if absolute otherwise relative)
3553 # [2] => query string ("" if not present otherwise starts with "?")
3554 # [3] => fragment ("" if not present otherwise starts with "#")
3555 # The returned value recovers the (possibly unescapeXML'd) input
3556 # string by simply concatenating the returned array elements.
3558 sub SplitURL {
3559 my ($url, $unesc) = @_;
3560 $unesc and $url = unescapeXML($url);
3561 my ($sh, $p, $q, $f) = ("", "", "", "");
3562 if ($url =~ m{^([A-Za-z][A-Za-z0-9.+-]*:)(//.*)$}os) {
3563 $sh = $1;
3564 $url = $2;
3566 if ($url =~ m{^(//[^/?#]*)((?:[/?#].*)?)$}os) {
3567 $sh .= $1;
3568 $url = $2;
3570 ($p, $q, $f) = $url =~ m{^([^?#]*)((?:[?][^#]*)?)((?:[#].*)?)$}os;
3571 return ($sh, $p, $q, $f);
3575 sub _EncodeAmps {
3576 my $text = shift;
3578 # Treat these accidents as though they had the needed ';'
3579 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3581 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3582 # http://bumppo.net/projects/amputator/
3583 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
3585 return $text;
3589 sub _EncodeAmpsAndAngles {
3590 # Smart processing for ampersands and angle brackets that need to be encoded.
3592 my $text = shift;
3594 # Treat these accidents as though they had the needed ';'
3595 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3597 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3598 # http://bumppo.net/projects/amputator/
3599 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
3601 # Encode naked <'s
3602 $text =~ s{<(?![\020a-z/?\$!])}{&lt;}gi;
3603 $text =~ s{<(?=[^>]*$)}{&lt;}g;
3605 # Encode <'s that cannot possibly be a start or end tag
3606 $text =~ s{(<[^>]*>)}{_DoTag($1)}ige;
3608 return $text;
3612 sub _EncodeBackslashEscapes {
3614 # Parameter: String.
3615 # Returns: String after processing the following backslash escape sequences.
3617 local $_ = shift;
3619 s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
3620 s{\\([`*_~{}\[\]()>#+\-.!`])}{$g_escape_table{$1}}g;
3622 return $_;
3626 sub _DoAutoLinks {
3627 local $_ = shift;
3629 s{<((https?|ftps?):[^'\042>\s]+)>(?!\s*</a>)}{_MakeATag($1, "&lt;".$1."&gt;")}gise;
3631 # Email addresses: <address@domain.foo>
3634 (?:mailto:)?
3636 [-.\w]+
3638 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
3642 _EncodeEmailAddress(_UnescapeSpecialChars($1), "&#x3c;", "&#62;");
3643 }egix;
3645 # (kjm) I don't do "x" patterns
3646 s{(?:^|(?<=\s))((?:https?|ftps?)://(?:[-a-zA-Z0-9./?\&\%=_~!*;:\@+\$,\x23](?:(?<![.,:;])|(?=[^\s])))+)}
3647 {_MakeATag($1, $1)}soge;
3648 s{(?<![][])(?<!\] )\[RFC( ?)([0-9]{1,5})\](?![][])(?! \[)}
3649 {"["._MakeATag("https://tools.ietf.org/html/rfc$2", "RFC$1$2", "RFC $2")."]"}soge;
3651 return $_;
3655 sub _EncodeEmailAddress {
3657 # Input: an email address, e.g. "foo@example.com"
3659 # Output: the email address as a mailto link, with each character
3660 # of the address encoded as either a decimal or hex entity, in
3661 # the hopes of foiling most address harvesting spam bots. E.g.:
3663 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
3664 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
3665 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
3667 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
3668 # mailing list: <https://tinyurl.com/yu7ue>
3671 my ($addr, $prefix, $suffix) = @_;
3672 $prefix = "" unless defined($prefix);
3673 $suffix = "" unless defined($suffix);
3675 srand(unpack('N',md5($addr)));
3676 my @encode = (
3677 sub { '&#' . ord(shift) . ';' },
3678 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
3679 sub { shift },
3682 $addr = "mailto:" . $addr;
3684 $addr =~ s{(.)}{
3685 my $char = $1;
3686 if ( $char eq '@' ) {
3687 # this *must* be encoded. I insist.
3688 $char = $encode[int rand 1]->($char);
3689 } elsif ( $char ne ':' ) {
3690 # leave ':' alone (to spot mailto: later)
3691 my $r = rand;
3692 # roughly 10% raw, 45% hex, 45% dec
3693 $char = (
3694 $r > .9 ? $encode[2]->($char) :
3695 $r < .45 ? $encode[1]->($char) :
3696 $encode[0]->($char)
3699 $char;
3700 }gex;
3702 # strip the mailto: from the visible part
3703 (my $bareaddr = $addr) =~ s/^.+?://;
3704 $addr = _MakeATag("$addr", $prefix.$bareaddr.$suffix);
3706 return $addr;
3710 sub _UnescapeSpecialChars {
3712 # Swap back in all the special characters we've hidden.
3714 my $text = shift;
3716 while( my($char, $hash) = each(%g_escape_table) ) {
3717 $text =~ s/$hash/$char/g;
3719 return $text;
3723 sub _TokenizeHTML {
3725 # Parameter: String containing HTML markup.
3726 # Returns: Reference to an array of the tokens comprising the input
3727 # string. Each token is either a tag (possibly with nested,
3728 # tags contained therein, such as <a href="<MTFoo>">, or a
3729 # run of text between tags. Each element of the array is a
3730 # two-element array; the first is either 'tag' or 'text';
3731 # the second is the actual value.
3734 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
3735 # <https://web.archive.org/web/20041215155739/http://bradchoate.com/weblog/2002/07/27/mtregex>
3738 my $str = shift;
3739 my $pos = 0;
3740 my $len = length $str;
3741 my @tokens;
3743 my $depth = 6;
3744 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
3745 my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
3746 (?s: <\? .*? \?> ) | # processing instruction
3747 $nested_tags/iox; # nested tags
3749 while ($str =~ m/($match)/g) {
3750 my $whole_tag = $1;
3751 my $sec_start = pos $str;
3752 my $tag_start = $sec_start - length $whole_tag;
3753 if ($pos < $tag_start) {
3754 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
3756 push @tokens, ['tag', $whole_tag];
3757 $pos = pos $str;
3759 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
3760 \@tokens;
3764 sub _Outdent {
3766 # Remove one level of line-leading indent_width of spaces
3768 my $text = shift;
3770 $text =~ s/^ {1,$opt{indent_width}}//gm;
3771 return $text;
3775 # _DeTab
3777 # $1 => input text
3778 # $2 => optional tab width (default is $opt{tab_width})
3779 # $3 => leading spaces to strip off each line first (default is 0 aka none)
3780 # <= result with tabs expanded
3781 sub _DeTab {
3782 my $text = shift;
3783 my $ts = shift || $opt{tab_width};
3784 my $leadsp = shift || 0;
3785 my $spr = qr/^ {1,$leadsp}/ if $leadsp;
3786 pos($text) = 0;
3787 my $end = length($text);
3788 my $ans = "";
3789 while (pos($text) < $end) {
3790 my $line;
3791 if ($text =~ /\G(.*?\n)/gcs) {
3792 $line = $1;
3793 } else {
3794 $line = substr($text, pos($text));
3795 pos($text) = $end;
3797 $line =~ s/$spr// if $leadsp;
3798 # From the Perl camel book section "Fluent Perl" but modified a bit
3799 $line =~ s/(.*?)(\t+)/$1 . ' ' x (length($2) * $ts - length($1) % $ts)/ges;
3800 $ans .= $line;
3802 return $ans;
3806 sub _PrefixURL {
3808 # Add URL prefix if needed
3810 my $url = shift;
3811 $url =~ s/^\s+//;
3812 $url =~ s/\s+$//;
3813 $url = "#" unless $url ne "";
3815 return $url unless
3816 ref($opt{abs_prefix}) eq 'CODE' ||
3817 ref($opt{url_prefix}) eq 'CODE' ||
3818 ref($opt{img_prefix}) eq 'CODE' ;
3819 return $url if $url =~ m"^\002\003" || $url =~ m"^#" || $url =~ m,^//,;
3820 $url = &{$opt{abs_prefix}}($url) if $url =~ m,^/, && ref($opt{abs_prefix}) eq 'CODE';
3821 return $url if $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/ || $url =~ m,^//, ||
3822 ($opt{keepabs} && $url =~ m,^/,);
3823 my $cr = $opt{url_prefix};
3824 $cr = $opt{img_prefix}
3825 if ref($opt{img_prefix}) eq 'CODE' && $url =~ m"^[^#?]*\.(?:png|gif|jpe?g|svgz?)(?:[#?]|$)"i;
3826 return $url unless ref($cr) eq 'CODE';
3827 return "\2\3".&$cr(substr($url, 0, 1) eq '/' ? substr($url, 1) : $url);
3831 BEGIN {
3832 $g_style_sheet = <<'STYLESHEET';
3834 <style type="text/css">
3835 /* <![CDATA[ */
3837 /* Markdown.pl fancy style sheet
3838 ** Copyright (C) 2017,2018,2019,2020,2021 Kyle J. McKay.
3839 ** All rights reserved.
3841 ** Redistribution and use in source and binary forms, with or without
3842 ** modification, are permitted provided that the following conditions are met:
3844 ** 1. Redistributions of source code must retain the above copyright notice,
3845 ** this list of conditions and the following disclaimer.
3847 ** 2. Redistributions in binary form must reproduce the above copyright
3848 ** notice, this list of conditions and the following disclaimer in the
3849 ** documentation and/or other materials provided with the distribution.
3851 ** 3. Neither the name of the copyright holder nor the names of its
3852 ** contributors may be used to endorse or promote products derived from
3853 ** this software without specific prior written permission.
3855 ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
3856 ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3857 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
3858 ** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
3859 ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
3860 ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
3861 ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
3862 ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
3863 ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
3864 ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
3865 ** POSSIBILITY OF SUCH DAMAGE.
3868 div.%(base)code-bt > pre, div.%(base)code > pre {
3869 margin: 0;
3870 padding: 0;
3871 overflow: auto;
3874 div.%(base)code-bt > pre > code, div.%(base)code > pre > code {
3875 display: inline-block;
3876 margin: 0;
3877 padding: 0.5em 0;
3878 border-top: thin dotted;
3879 border-bottom: thin dotted;
3882 table.%(base)table {
3883 margin-bottom: 0.5em;
3885 table.%(base)table, table.%(base)table th, table.%(base)table td {
3886 border-collapse: collapse;
3887 border-spacing: 0;
3888 border: thin solid;
3890 table.%(base)yaml-table {
3891 border-collapse: collapse;
3893 table.%(base)yaml-table * {
3894 border: thin solid;
3896 table.%(base)yaml-table th {
3897 text-align: center;
3899 table.%(base)yaml-table th, table.%(base)yaml-table td {
3900 padding-left: 0.5ex;
3901 padding-right: 0.5ex;
3904 ol.%(base)ol {
3905 counter-reset: %(base)item;
3907 ol.%(base)ol[start="0"] {
3908 counter-reset: %(base)item -1;
3910 ol.%(base)ol[start="5"] {
3911 counter-reset: %(base)item 4;
3913 ol.%(base)ol[start="10"] {
3914 counter-reset: %(base)item 9;
3916 ol.%(base)ol > span.%(base)ol-incr {
3917 counter-increment: %(base)item;
3919 ol.%(base)ol > span.%(base)ol-incr-2 {
3920 counter-increment: %(base)item 2;
3922 ol.%(base)ol > span.%(base)ol-incr-5 {
3923 counter-increment: %(base)item 5;
3925 ol.%(base)ol > span.%(base)ol-incr-10 {
3926 counter-increment: %(base)item 10;
3928 ol.%(base)lc-greek, li.%(base)lc-greek {
3929 list-style-type: lower-greek;
3931 ol.%(base)ol > li {
3932 counter-increment: %(base)item;
3934 ol.%(base)ol > li.%(base)li,
3935 ol.%(base)ol > li.%(base)li-lc,
3936 ol.%(base)ol > li.%(base)li-lc-greek,
3937 ol.%(base)ol > li.%(base)li-uc {
3938 list-style-type: none;
3939 display: block;
3941 ol.%(base)ol > li.%(base)li:before,
3942 ol.%(base)ol > li.%(base)li-lc:before,
3943 ol.%(base)ol > li.%(base)li-lc-greek:before,
3944 ol.%(base)ol > li.%(base)li-uc:before {
3945 position: absolute;
3946 text-align: right;
3947 white-space: nowrap;
3948 margin-left: -9ex;
3949 width: 9ex;
3951 ol.%(base)ol > li.%(base)li[type="1"]:before {
3952 content: counter(%(base)item, decimal) ")\A0 \A0 ";
3954 ol.%(base)ol > li.%(base)li-lc[type="i"]:before,
3955 ol.%(base)ol > li.%(base)li-lc[type="I"]:before {
3956 content: counter(%(base)item, lower-roman) ")\A0 \A0 ";
3958 ol.%(base)ol > li.%(base)li-uc[type="I"]:before,
3959 ol.%(base)ol > li.%(base)li-uc[type="i"]:before {
3960 content: counter(%(base)item, upper-roman) ")\A0 \A0 ";
3962 ol.%(base)ol > li.%(base)li-lc[type="a"]:before,
3963 ol.%(base)ol > li.%(base)li-lc[type="A"]:before {
3964 content: counter(%(base)item, lower-alpha) ")\A0 \A0 ";
3966 ol.%(base)ol > li.%(base)li-lc-greek[type="a"]:before,
3967 ol.%(base)ol > li.%(base)li-lc-greek[type="A"]:before {
3968 content: counter(%(base)item, lower-greek) ")\A0 \A0 ";
3970 ol.%(base)ol > li.%(base)li-uc[type="A"]:before,
3971 ol.%(base)ol > li.%(base)li-uc[type="a"]:before {
3972 content: counter(%(base)item, upper-alpha) ")\A0 \A0 ";
3975 li.%(base)checkbox-on,
3976 li.%(base)checkbox-off {
3977 list-style-type: none;
3978 display: block;
3980 li.%(base)checkbox-on > span:first-child + span + span,
3981 li.%(base)checkbox-off > span:first-child + span + span {
3982 position: absolute;
3983 clip: rect(0,0,0,0);
3985 li.%(base)checkbox-on > span:first-child,
3986 li.%(base)checkbox-off > span:first-child,
3987 li.%(base)checkbox-on > span:first-child + span,
3988 li.%(base)checkbox-off > span:first-child + span {
3989 display: block;
3990 position: absolute;
3991 margin-left: -3ex;
3992 width: 1em;
3993 height: 1em;
3995 li.%(base)checkbox-on > span:first-child > span:first-child,
3996 li.%(base)checkbox-off > span:first-child > span:first-child {
3997 display: block;
3998 position: absolute;
3999 left: 0.75pt; top: 0.75pt; right: 0.75pt; bottom: 0.75pt;
4001 li.%(base)checkbox-on > span:first-child > span:first-child:before,
4002 li.%(base)checkbox-off > span:first-child > span:first-child:before {
4003 display: inline-block;
4004 position: relative;
4005 right: 1pt;
4006 width: 100%;
4007 height: 100%;
4008 border: 1pt solid;
4009 content: "";
4011 li.%(base)checkbox-on > span:first-child + span:before {
4012 position: relative;
4013 left: 2pt;
4014 bottom: 1pt;
4015 font-size: 125%;
4016 line-height: 80%;
4017 vertical-align: text-top;
4018 content: "\2713";
4021 /* ]]> */
4022 </style>
4024 STYLESHEET
4025 $g_style_sheet =~ s/^\s+//g;
4026 $g_style_sheet =~ s/\s+$//g;
4027 $g_style_sheet .= "\n";
4032 __DATA__
4034 =head1 NAME
4036 Markdown.pl - convert Markdown format text files to HTML
4038 =head1 SYNOPSIS
4040 B<Markdown.pl> [B<--help>] [B<--html4tags>] [B<--htmlroot>=I<prefix>]
4041 [B<--imageroot>=I<prefix>] [B<--version>] [B<--shortversion>]
4042 [B<--tabwidth>=I<num>] [B<--stylesheet>] [B<--stub>] [--]
4043 [I<file>...]
4045 Options:
4046 -h show short usage help
4047 --help show long detailed help
4048 --html4tags use <br> instead of <br />
4049 --deprecated allow <dir> and <menu> tags
4050 --sanitize sanitize tag attributes
4051 --no-sanitize do not sanitize tag attributes
4052 --validate-xml check if output is valid XML
4053 --validate-xml-internal fast basic check if output is valid XML
4054 --no-validate-xml do not check output for valid XML
4055 --strip-comments remove XML comments from output
4056 --no-strip-comments do not remove XML comments (default)
4057 --tabwidth=num expand tabs to num instead of 8
4058 --auto-number automatically number h1-h6 headers
4059 -k | --keep-abs keep abspath URLs despite -r/-i
4060 -a prefix | --absroot=prefix append abspath URLs to prefix
4061 -b prefix | --base=prefix prepend prefix to fragment-only URLs
4062 -r prefix | --htmlroot=prefix append relative non-img URLs to prefix
4063 -i prefix | --imageroot=prefix append relative img URLs to prefix
4064 -w [wikipat] | --wiki[=wikipat] activate wiki links using wikipat
4065 --yaml[=(enable|disable|strip|...)] select YAML front matter processing
4066 -V | --version show version, authors, license
4067 and copyright
4068 -s | --shortversion show just the version number
4069 --raw | --raw-xml input contains only raw xhtml
4070 --raw-html input contains only raw html
4071 --stylesheet output the fancy style sheet
4072 --no-stylesheet do not output fancy style sheet
4073 --stub wrap output in stub document
4074 implies --stylesheet
4075 -- end options and treat next
4076 argument as file
4078 =head1 DESCRIPTION
4080 Markdown is a text-to-HTML filter; it translates an easy-to-read /
4081 easy-to-write structured text format into HTML. Markdown's text format
4082 is most similar to that of plain text email, and supports features such
4083 as headers, *emphasis*, code blocks, blockquotes, and links.
4085 Markdown's syntax is designed not as a generic markup language, but
4086 specifically to serve as a front-end to (X)HTML. You can use span-level
4087 HTML tags anywhere in a Markdown document, and you can use block level
4088 HTML tags (like <div> and <table> as well).
4090 For more information about Markdown's syntax, see the F<basics.md>
4091 and F<syntax.md> files included with F<Markdown.pl>.
4093 Input (auto-detected) may be either ISO-8859-1 or UTF-8. Output is always
4094 converted to the UTF-8 character set.
4097 =head1 OPTIONS
4099 Use "--" to end switch parsing. For example, to open a file named "-z", use:
4101 Markdown.pl -- -z
4103 =over
4106 =item B<--html4tags>
4108 Use HTML 4 style for empty element tags, e.g.:
4110 <br>
4112 instead of Markdown's default XHTML style tags, e.g.:
4114 <br />
4116 This option is I<NOT compatible> with the B<--validate-xml> option
4117 and will produce an immediate error if both are given.
4120 =item B<--deprecated>
4122 Both "<dir>" and "<menu>" are normally taken as literal text and the leading
4123 "<" will be automatically escaped.
4125 If this option is used, they are recognized as valid tags and passed through
4126 without being escaped.
4128 When dealing with program argument descriptions "<dir>" can be particularly
4129 problematic therefore use of this option is not recommended.
4131 Other deprecated tags (such as "<font>" and "<center>" for example) continue
4132 to be recognized and passed through even without using this option.
4135 =item B<--sanitize>
4137 Removes troublesome tag attributes from embedded tags. Only a very strictly
4138 limited set of tag attributes will be permitted, other attributes will be
4139 silently discarded. The set of allowed attributes varies by tag.
4141 Splits empty minimized elements that are not one of the HTML allowed empty
4142 elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate begin
4143 and end tags. For example, C<< <p/> >> or C<< <p /> >> will be split into
4144 C<< <p></p> >>.
4146 Combines adjacent (whitespace separated only) opening and closing tags for
4147 the same HTML empty element into a single minimized tag. For example,
4148 C<< <br></br> >> will become C<< <br /> >>.
4150 Tags that require at least one attribute to be present to be meaningful
4151 (e.g. C<a>, C<area>, C<img>, C<map>) but have none will be treated as non-tags
4152 potentially creating unexpected errors. For example, the sequence
4153 C<< <a>text here</a> >> will be sanitized to C<< &lt;a>text here</a> >> since
4154 an C<a> tag without any attributes is meaningless, but then the trailing
4155 close tag C<< </a> >> will become an error because it has no matching open
4156 C<< <a ...> >> tag.
4158 The point of this check is not to cause undue frustration, but to allow
4159 such constructs to be used as text without the need for escaping since they
4160 are meaningless as tags. For example, C<< <a><c><e> >> works just fine
4161 as plain text and so does C<< <A><C><E> >> because the
4162 C<< <a> >>/C<< <A> >> will be treated as a non-tag automatically. In fact,
4163 they can even appear inside links too such as
4164 C<< <a href="#somewhere">Link to <a><c><e> article</a> >>.
4166 Problematic C<&> characters are fixed up such as standalone C<&>s (or those not
4167 part of a valid entity reference) are turned into C<&amp;>. Within attribute
4168 values, single and double quotes are turned into C<&> entity refs.
4170 This is enabled by default.
4173 =item B<--no-sanitize>
4175 Do not sanitize tag attributes. This option does not allow any tags that would
4176 not be allowed without this option, but it does completely suppress the
4177 attribute sanitation process. If this option is specified, no attributes will
4178 be removed from any tag (although C<img> and C<a> tags will still be affected
4179 by B<--imageroot>, B<--htmlroot>, B<--absroot> and/or B<--base> options).
4180 Use of this option is I<NOT RECOMMENDED>.
4183 =item B<--validate-xml>
4185 Perform XML validation on the output before it's output and die if
4186 it fails validation. This requires the C<XML::Simple> or C<XML::Parser>
4187 module be present (one is only required if this option is given).
4189 Any errors are reported to STDERR and the exit status will be
4190 non-zero on XML validation failure. Note that all line and column
4191 numbers in the error output refer to the entire output that would
4192 have been produced. Re-run with B<--no-validate-xml> to see what's
4193 actually present at those line and column positions.
4195 If the B<--stub> option has also been given, then the entire output is
4196 validated as-is. Without the B<--stub> option, the output will be wrapped
4197 in C<< <div>...</div> >> for validation purposes but that extra "div" added
4198 for validation will not be added to the final output.
4200 This option is I<NOT enabled by default>.
4202 This option is I<NOT compatible> with the B<--html4tags> option and will
4203 produce an immediate error if both are given.
4206 =item B<--validate-xml-internal>
4208 Perform XML validation on the output before it's output and die if
4209 it fails validation. This uses a simple internal consistency checker
4210 that finds unmatched and mismatched open/close tags.
4212 Non-empty elements that in HTML have optional closing tags (C<colgroup>
4213 C<dd> C<dt> C<li> C<p> C<tbody> C<td> C<tfoot> C<th> C<thead> C<tr>)
4214 will automatically have any omitted end tags inserted during the
4215 `--validate-xml-internal` process.
4217 Any errors are reported to STDERR and the exit status will be
4218 non-zero on XML validation failure. Note that all line and column
4219 numbers in the error output refer to the entire output that would
4220 have been produced before sanitization without any B<--stub> or
4221 B<--stylesheet> options. Re-run with B<--no-sanitize> and
4222 B<--no-validate-xml> and I<without> any B<--stub> or B<--stylesheet>
4223 options to see what's actually present at those line and column
4224 positions.
4226 This option validates the output I<prior to> adding any requested
4227 B<--stub> or B<--stylesheet>. As the built-in stub and stylesheet
4228 have already been validated that speeds things up. The output is
4229 I<NOT> wrapped (in a C<< <div>...</div> >>) for validation as that's
4230 not required for the internal checker.
4232 This option is I<IS enabled by default> unless B<--no-sanitize> is
4233 active.
4235 This option is I<IS compatible> with the B<--html4tags> option.
4237 This option requires the B<--sanitize> option and will produce an
4238 immediate error if both B<--no-sanitize> and B<--validate-xml-internal>
4239 are given.
4241 Note that B<--validate-xml-internal> is I<MUCH faster> than
4242 B<--validate-xml> and I<does NOT> require any extra XML modules to
4243 be present.
4246 =item B<--no-validate-xml>
4248 Do not perform XML validation on the output. Markdown.pl itself will
4249 normally generate valid XML sequences (unless B<--html4tags> has been
4250 used). However, any raw tags in the input (that are on the "approved"
4251 list), could potentially result in invalid XML output (i.e. mismatched
4252 start and end tags, missing start or end tag etc.).
4254 Markdown.pl will I<NOT check> for these issues itself. But with
4255 the B<--validate-xml> option will use C<XML::Simple> or C<XML::Parser>
4256 to do so.
4258 Note that B<--validate-xml-internal> is the default option unless
4259 B<--no-sanitize> is used in which case B<--no-validate-xml> is the
4260 default option.
4263 =item B<--strip-comments>
4265 Strip XML comments from the output. Any XML comments encountered will
4266 be omitted from the output if this option is given.
4268 This option requires the B<--sanitize> option to be used (which is
4269 the default).
4271 However, note that the XML standard section 2.5 specifically prohibits
4272 a C<--> sequence within an XML comment (i.e. C<--> cannot occur after
4273 the comment start tag C<< <!-- >> unless it is immediately followed
4274 by C<< > >> which makes it the comment end tag C<< --> >>).
4276 In other words, S<C<< <!-- --> >>>, S<C<< <!-- - --> >>>, S<C<< <!----> >>>,
4277 and S<C<< <!--- --> >>> are all valid XML comments, but S<C<< <!-----> >>>
4278 and S<C<< <!-- ---> >>> are not!
4280 As part of the "sanitation" process (triggered by the B<--sanitize>
4281 option), any invalid tags have their leading C<< < >> escaped (to
4282 C<< &#lt; >>) thus making them ordinary text and this I<includes>
4283 invalid XML comments.
4285 What this means is that the B<--strip-comments> option I<will not> remove
4286 invalid XML comments (such as S<C<< <!-----> >>>)!
4289 =item B<--no-strip-comments>
4291 Do not strip XML comments from the output. This is the default.
4294 =item B<--tabwidth>=I<num>
4296 Expand tabs to I<num> character wide tab stop positions instead of the default
4297 8. Don't use this; physical tabs should always be expanded to 8-character
4298 positions. This option does I<not> affect the number of spaces needed to
4299 start a new "indent level". That will always be 4 no matter what value is
4300 used (or implied by default) with this option. Also note that tabs inside
4301 backticks-delimited code blocks will always be expanded to 8-character tab
4302 stop positions no matter what value is used for this option.
4304 The value must be S<2 <= I<num> <= 32>.
4307 =item B<--auto-number>
4309 Automatically number all h1-h6 headings generated from Markdown markup.
4310 Explicit C<< <h1> >> ... C<< <h6> >> tag content remains unmolested.
4312 If this option is given, any YAML C<header_enum> setting will be ignored.
4315 =item B<-k>, B<--keep-abs>
4317 Normally any absolute path URLs (i.e. URLs without a scheme starting
4318 with "/" but not "//") are subject to modification by any
4319 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> option.
4321 If the B<-a>/B<--absroot> option is used and it transforms these
4322 absolute path URLs into a full absolute URL (i.e. starts with a
4323 scheme or "//") then any subsequent B<-r>/B<--htmlroot> or
4324 B<-i>/B<--imageroot> processing will be skipped because the URL is
4325 no longer relative.
4327 If the B<--keep-abs> option is given, then (after applying any
4328 B<-a>/B<--absroot> option if present) absolute path URLs will be
4329 kept as-is and will not be processed further by any B<-r>/B<--htmlroot>
4330 or B<-i>/B<--imageroot> option.
4332 Note that if the B<-a>/B<--absroot> option transforms an absolute
4333 path URL into a relative PATH URL it I<will> be subject to subsequent
4334 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> processing regardless
4335 of the B<-k>/B<--keep-abs> option.
4338 =item B<-a> I<prefix>, B<--absroot>=I<prefix>
4340 Any absolute path URLs (i.e. URLs without a scheme starting with "/" but not
4341 "//") have I<prefix> prepended which prevents them from being acted upon by the
4342 B<--htmlroot> and/or B<--imageroot> options provided the result is a full
4343 absolute URL. The default is to prepend nothing and leave them as absolute
4344 path URLs which will allow them to be processed by any B<--htmlroot> and/or
4345 B<--imageroot> options.
4347 This option can be helpful when documents are being formatted for display on a
4348 different system and the absolute path URLs need to be "fixed up".
4351 =item B<-b> I<prefix>, B<--base>=I<prefix>
4353 Any fragment-only URLs have I<prefix> prepended. The default is to prepend
4354 nothing and leave them as bare fragment URLs. Use of this option may be
4355 necessary when embedding the output of Markdown.pl into a document that makes
4356 use of the C<< <base> >> tag in order for intra-document fragment URL links to
4357 work properly in such a document.
4360 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
4362 Any non-absolute URLs have I<prefix> prepended.
4365 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
4367 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
4368 if any) but only if they end in an image suffix.
4371 =item B<-w> [I<wikipat>], B<--wiki>[=I<wikipat>]
4373 Activate wiki links. Any link enclosed in double brackets (e.g. "[[link]]") is
4374 considered a wiki link. By default only absolute URL and fragment links are
4375 allowed in the "wiki link style" format. Any other double-bracketed strings
4376 are left unmolested.
4378 If this option is given, all other wiki links are enabled as well. Any
4379 non-absolute URL or fragment links will be transformed into a link using
4380 I<wikipat> where the default I<wikipat> if none is given is C<%{s(:md)}.html>.
4382 If the given I<wikipat> does not contain a C<%{...}> placeholder sequence
4383 then it will automatically have C<%{s(:md)}.html> suffixed to it.
4385 The C<...> part of the C<%{...}> sequence specifies zero or more
4386 case-insensitive single-letter options with the following effects:
4388 =over
4390 =item B<b>
4392 Retain blanks (aka spaces) in the output. They will become C<%20>
4393 in the final URL. Because spaces are always trimmed before processing
4394 wiki links, runs of multiple spaces will be collapsed into a single
4395 space and any leading or trailing spaces will be removed.
4397 =item B<d>
4399 Convert spaces to dashes (ASCII 0x2D) instead of underscore (ASCII
4400 0x5F). Note that if this option is given then runs of multiple
4401 dashes will be converted to a single dash I<instead> but runs of
4402 multiple underscores will be left untouched.
4404 =item B<f>
4406 Flatten the resulting name by replacing forward slashes (ASCII 0x2F)
4407 as well. They will be converted to underscores unless the C<d>
4408 option is given (in which case they will be converted to dashes).
4409 This conversion takes place before applying the runs-of-multiple
4410 reduction. This option is incompatible with the B<%> option.
4412 =item B<%>
4414 Flatten the resulting name by replacing runs of one or more forward
4415 slashes (ASCII 0x2F) with C<%2F>. Note that when encoded into a
4416 URL the C<%2F> actually becomes C<%252F>. This option is incompatible
4417 with the B<f> option.
4419 =item B<l>
4421 Convert link target (excluding any query string and/or fragment) to lowercase.
4422 Takes precedence over any C<u> option, but specifically excludes C<%>-escapes
4423 which are always UPPERCASE hexadecimal.
4425 =item B<r>
4427 Leave raw UTF-8 characters in the result. Normally anything not allowed
4428 directly in a URL ends up URL-encoded. With this option, raw valid UTF-8
4429 sequences will be left untouched. Use with care.
4431 =item B<s> or B<s(>I<< <ext> >>[B<,>I<< <ext> >>]...B<)>
4433 After (temporarily) removing any query string and/or fragment, strip any final
4434 "dot" suffix so long as it occurs after the last slash (if any slash was
4435 present before applying the C<f> option). The "dot" (ASCII 0x2E) and all
4436 following characters (if any) are removed. If the optional C<< (<ext>,...) >>
4437 part is present then only strip the extension if it consists of a "dot"
4438 followed by one of the case-insensitive I<< <ext> >> values. As a special
4439 case, using the value C<:md> for one of the I<< <ext> >> values causes that
4440 value to be expanded to all known markdown extensions.
4442 When processing wiki image links, this option is ignored.
4444 =item B<u>
4446 Convert link target (excluding any query string and/or fragment) to UPPERCASE.
4448 =item B<v>
4450 Leave runs-of-multiple characters alone (aka "verbatim"). Does not affect
4451 any of the other options except by eliminating the runs-of-multple reduction
4452 step. Also does I<not> inhibit the initial whitespace trimming.
4454 Does not affect the runs-of-multiple "/" replacement performed by the B<%>
4455 option.
4457 =back
4459 The URL target of the wiki link is created by first trimming whitespace
4460 (starting and ending whitespace is removed and all other runs of consecutive
4461 whitespace are replaced with a single space) from the wiki link target,
4462 removing (temporarily) any query string and/or fragment, if no options are
4463 present, spaces are converted to underscores (C<_>) and runs of multiple
4464 consecutive underscores are replaced with a single underscore (ASCII 0x5F).
4465 Finally, the I<wikipat> string gets its first placeholder (the C<%{...}>
4466 sequence) replaced with this computed value and the original query string
4467 and/or fragment is re-appended (if any were originally present) and
4468 URL-encoding is applied as needed to produce the actual final target URL.
4470 Note that when processing wiki image links, no extension stripping ever takes
4471 place (i.e. the "s" option is ignored) and anything after the placeholder (the
4472 C<%{...}> sequence) in the pattern is omitted from the result.
4474 See above option descriptions for possible available modifications.
4476 One of the commonly used hosting platforms does something substantially similar
4477 to using C<%{dfv}> as the placeholder.
4479 One of the commonly used wiki platforms does something similar to using C<%{%}>
4480 as the placeholder.
4483 =item B<--yaml>[=I<yamlmode>]
4485 Select YAML front matter processing. The optional I<yamlmode> value
4486 must be one of the following:
4488 =over
4490 =item B<enable>
4492 Recognize any YAML front matter and apply any options specified
4493 therein. If any unrecognized options are present, the options will
4494 also be shown in the formatted output.
4496 This is the default I<yamlmode> if omitted.
4498 =item B<disable>
4500 No YAML front matter processing at all takes place. If YAML front
4501 matter is present, it will be treated as regular non-YAML markup
4502 text to be processed.
4504 =item B<strip>
4506 If YAML front matter is present, it will be stripped and completely
4507 ignored before beginning to process the rest of the input.
4509 In this mode, any options in the YAML front matter that would have
4510 otherwise been recognized will I<not have any effect!>
4512 =item B<show>
4514 If YAML front matter is present and contains anything other than
4515 comments, the non-comments parts will be shown in the formatted
4516 output.
4518 In this mode, any options in the YAML front matter that would have
4519 otherwise been recognized will I<not have any effect!>
4521 This is a show-only mode.
4523 =item B<reveal>
4525 This mode works just like the B<enable> mode except that if the
4526 YAML front matter contains anything other than comments, then
4527 I<all> of the non-comments parts will be shown in the formatted
4528 output.
4530 In this mode, any recognized options in the YAML front matter I<are>
4531 processed the same way they would be in the B<enable> mode except
4532 that any option to suppress the B<reveal> mode is ignored.
4534 =item B<conceal>
4536 This mode works just like the B<enable> mode except that no options
4537 are ever shown in the formatted output regardless of whether or not
4538 there are any unrecognized options present.
4540 In this mode, any recognized options in the YAML front matter I<are>
4541 processed the same way they would be in the B<enable> mode except
4542 that any option to suppress the B<conceal> mode is ignored.
4544 =item B<unknown>
4546 This mode works just like the B<show> mode if any unrecognized
4547 YAML front matter options are present. Otherwise it works like
4548 the B<strip> mode.
4550 In this mode, any options in the YAML front matter that would have
4551 otherwise been recognized will I<not have any effect!>
4553 =back
4555 If B<--raw>, B<--raw-xml> or B<--raw-html> has been specified then
4556 the default if no B<--yaml> option has been given is B<--yaml=disable>.
4558 Otherwise the default if no B<--yaml> option has been given is
4559 B<--yaml=enable>.
4561 Note that only a limited subset of YAML is recognized. Specifically
4562 only comments, and top-level single-line S<C<key: value>> items
4563 where key must be plain (i.e. non-quoted), start with a letter or
4564 underscore and contain only letters, underscores, hyphens (C<->),
4565 periods (C<.>) and digits. Keys are case-insensitive (i.e. converted
4566 to lowercase). As with YAML, at least one whitespace is required
4567 between the ":" and the value (unless it's the empty value).
4569 Values may be either plain or double-quoted (single-quoted is not
4570 recognized). The double-quoted style may use C-style character
4571 escape codes but may not extend past the end of the line.
4573 For YAML front matter to be recognized, the very first line of the
4574 document must be exactly three hyphens (C<--->). The YAML terminates
4575 when a line of three hyphens (C<--->) or a line of three periods
4576 (C<...>) or the end of the file is encountered. Of course the YAML
4577 mode must also be something I<other> than B<--yaml=disable>.
4580 =item B<-V>, B<--version>
4582 Display Markdown's version number and copyright information.
4585 =item B<-s>, B<--shortversion>
4587 Display the short-form version number.
4590 =item B<--raw>, B<--raw-xml>
4592 Input contains only raw XHTML. All options other than
4593 B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
4594 B<--strip-comments>, B<--validate-xml> and B<--validate-xml-internal>
4595 (and their B<--no-...> variants) are ignored.
4597 With this option, arbitrary XHTML input can be passed through
4598 the sanitizer and/or validator. If sanitation is requested (the
4599 default), input must only contain the contents of the "<body>"
4600 section (i.e. no "<head>" or "<html>"). Output I<will> be converted
4601 to UTF-8 regardless of the input encoding. All line endings will
4602 be normalized to C<\n> and input encodings other than UTF-8 or
4603 ISO-8859-1 or US-ASCII will end up mangled.
4605 Remember that any B<--stub> and/or B<--stylesheet> options are
4606 I<completely ignored> when B<--raw> is given.
4609 =item B<--raw-html>
4611 Input contains only raw HTML. All options other than
4612 B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
4613 B<--strip-comments>, and B<--validate-xml-internal>
4614 (and their B<--no-...> variants) are ignored.
4616 Requires the (possibly implicit) B<--validate-xml-internal> option.
4618 Works just like B<--raw-xml> except that HTML auto closing and
4619 optional closing tag semantics are activated during the validation
4620 causing missing closing tags to be inserted where required by the
4621 standard. Non-raw mode always enables these semantics.
4623 This will transform HTML into valid XHTML or fail with an error message.
4625 Unfortunately, it will also fail to accept some documents that
4626 the plain B<--raw-xml> option will.
4628 For example, this document:
4630 <dt><li>a</li></dt>
4632 Will be rejected because upon encountering the C<< <li> >> open
4633 tag a closing C<< </dt> >> will automatically be inserted resulting
4634 in this document:
4636 <dt></dt><li>a</li></dt>
4638 Which, of course, no longer validates. Since C<li> blocks cannot
4639 directly be nested within C<dt> blocks (according to the standard),
4640 the input document is not strictly correct.
4642 Remember that any B<--stub> and/or B<--stylesheet> options are
4643 I<completely ignored> when B<--raw-html> is given.
4646 =item B<--stylesheet>
4648 Include the fancy style sheet at the beginning of the output (or in the
4649 C<head> section with B<--stub>). This style sheet makes fancy checkboxes
4650 and makes a right parenthesis C<)> show instead of a C<.> for ordered lists
4651 that use them. Without it things will still look fine except that the
4652 fancy stuff won't be there.
4654 Use this option with no other arguments and redirect standard input to
4655 /dev/null to get just the style sheet and nothing else.
4658 =item B<--no-stylesheet>
4660 Overrides a previous B<--stylesheet> and disables implicit inclusion
4661 of the style sheet by the B<--stub> option.
4664 =item B<--stub>
4666 Wrap the output in a full document stub (i.e. has C<html>, C<head> and C<body>
4667 tags). The style sheet I<will> be included in the C<head> section unless the
4668 B<--no-stylesheet> option is also used.
4670 The C<< <title> >> value for a document produced with the B<--stub> option
4671 comes from the first markdown markup C<h1> that's generated unless YAML
4672 processing has been enabled (the default) and a C<title> YAML value has
4673 been set in which case that always takes precedence.
4676 =item B<-h>, B<--help>
4678 Display Markdown's help. With B<--help> full help is shown, with B<-h> only
4679 the usage and options are shown.
4682 =back
4685 =head1 PERL MODULE
4687 Markdown can be used as a Perl module and can be "use"d like so:
4689 use Markdown qw(...);
4691 Or like so:
4693 BEGIN {require "Markdown.pl" && Markdown->import(qw(...))}
4695 where the C<...> part is the list of desired imports.
4697 The Markdown module does not export any functions by default.
4699 The C<Markdown.pm> file is a symbolic link to C<Markdown.pl>.
4701 =head2 Markdown module functions
4703 Any of these functions may be imported, but none of them
4704 are imported by default.
4706 =over
4709 =item * $result = Markdown::Markdown($string[, options...])
4711 Converts Markdown-format C<$string> to UTF-8 encoded XHTML and
4712 returns it.
4714 The C<options...> may be either a single HASH ref or one or more
4715 pairs of C<< key => value >>.
4717 See the comments for the C<_SanitizeOpts> function for a list of
4718 possible option keys.
4721 =item * $result = Markdown::ProcessRaw($string[, options...])
4723 Converts raw XHTML in C<$string> to XHTML and returns it.
4725 The C<options...> may be either a single HASH ref or one or more
4726 pairs of C<< key => value >>.
4728 See the comments for the C<_SanitizeOpts> function for a list of
4729 possible option keys.
4731 This function provides the ability to apply the internal XML
4732 validation and sanitation functionality to arbitrary XHTML without
4733 performing any of the Markdown format interpretation.
4736 =item * $stylesheet = Markdown::GenerateStyleSheet([$prefix])
4738 Returns an XHTML style sheet that supports the fancy Markdown styles
4739 such as checkboxes and right parenthesis lists.
4741 All of the style class names have C<$prefix> prepended.
4743 If C<$prefix> is omitted or C<undef> then the default S<"_markdown-">
4744 prefix will be used which is the same default prefix that the
4745 C<Markdown> function uses.
4747 The returned string value consists of a C<< <style type="text/css"> >>
4748 tag, the contents of the style sheet and ends with a C<< </style> >> tag.
4751 =item * Markdown::SetWikiOpts($hashref, $wikioption)
4753 The value of C<$wikioption> should be the value of the C<wikipat> value
4754 from the B<--wiki> option. Use the empty string S<""> to enable wiki
4755 links using the defaults and use C<undef> to disable wiki links.
4757 The C<wikipat> and C<wikiopt> keys in C<$hashref> will both be
4758 affected by this call and they should be passed in to the Markdown
4759 function as options to enable processing of wiki links.
4761 The simplest way to do this is simply to pass a HASH ref as the
4762 second argument to the Markdown function after having used this
4763 function on it to properly set the C<wikipat> and C<wikiopt>
4764 keys and values.
4767 =back
4770 =head2 Example
4772 This rudimentary example approximates running
4773 S<C<Markdown.pl --stub --wiki>>
4774 on the input (files if given, standard input if not).
4776 use Markdown qw(Markdown SetWikiOpts GenerateStyleSheet escapeXML);
4778 my $string;
4779 {local $/; $string = <>;}
4780 my %opts = ( h1 => "default title" );
4781 SetWikiOpts(\%opts, ""); # enable default --wiki processing
4782 my $xhtml = Markdown($string, \%opts);
4783 print "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n",
4784 "<head>\n<title>".escapeXML($opts{h1})."</title>\n",
4785 GenerateStyleSheet(),"</head>\n",
4786 "<body style=\"text-align:center\">\n",
4787 "<div style=\"".
4788 "display:inline-block;text-align:left;max-width:42pc\">\n",
4789 $xhtml, "</div></body></html>\n";
4792 =head1 VERSION HISTORY
4794 Z<> See the F<README> file for detailed release notes for this version.
4796 =over
4798 =item Z<> 1.1.10 - 08 Jul 2020
4800 =item Z<> 1.1.9 - 15 Dec 2019
4802 =item Z<> 1.1.8 - 22 Nov 2019
4804 =item Z<> 1.1.7 - 14 Feb 2018
4806 =item Z<> 1.1.6 - 03 Jan 2018
4808 =item Z<> 1.1.5 - 07 Dec 2017
4810 =item Z<> 1.1.4 - 24 Jun 2017
4812 =item Z<> 1.1.3 - 13 Feb 2017
4814 =item Z<> 1.1.2 - 19 Jan 2017
4816 =item Z<> 1.1.1 - 12 Jan 2017
4818 =item Z<> 1.1.0 - 11 Jan 2017
4820 =item Z<> 1.0.4 - 05 Jun 2016
4822 =item Z<> 1.0.3 - 06 Sep 2015
4824 =item Z<> 1.0.2 - 03 Sep 2015
4826 =item Z<> 1.0.1 - 14 Dec 2004
4828 =item Z<> 1.0.0 - 28 Aug 2004
4830 =back
4832 =head1 AUTHORS
4834 =over
4836 =item John Gruber
4838 =item L<https://daringfireball.net>
4840 =item L<https://daringfireball.net/projects/markdown/>
4842 =item E<160>
4844 =back
4846 =over
4848 =item PHP port and other contributions by Michel Fortin
4850 =item L<https://michelf.ca>
4852 =item E<160>
4854 =back
4856 =over
4858 =item Additional enhancements and tweaks by Kyle J. McKay
4860 =item mackyle<at>gmail.com
4862 =back
4864 =head1 COPYRIGHT AND LICENSE
4866 =over
4868 =item Copyright (C) 2003-2004 John Gruber
4870 =item Copyright (C) 2015-2021 Kyle J. McKay
4872 =item All rights reserved.
4874 =back
4876 Redistribution and use in source and binary forms, with or without
4877 modification, are permitted provided that the following conditions are
4878 met:
4880 =over
4882 =item *
4884 Redistributions of source code must retain the above copyright
4885 notice, this list of conditions and the following disclaimer.
4887 =item *
4889 Redistributions in binary form must reproduce the above copyright
4890 notice, this list of conditions and the following disclaimer in the
4891 documentation and/or other materials provided with the distribution.
4893 =item *
4895 Neither the name "Markdown" nor the names of its contributors may
4896 be used to endorse or promote products derived from this software
4897 without specific prior written permission.
4899 =back
4901 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
4902 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
4903 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
4904 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
4905 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
4906 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
4907 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
4908 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
4909 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
4910 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4911 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4913 =cut