4 # Markdown -- A text-to-HTML conversion tool for web writers
6 # Copyright (C) 2004 John Gruber
7 # Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
9 # License is Modified BSD (aka 3-clause BSD) License\n";
10 # See LICENSE file (or <https://opensource.org/licenses/BSD-3-Clause>)
21 use vars
qw($COPYRIGHT $VERSION @ISA @EXPORT_OK);
24 \"Copyright (C) 2004 John Gruber
25 Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
28 *VERSION = \"1.1.11-PRE"
32 use Digest::MD5 qw(md5 md5_hex);
33 use File
::Basename
qw(basename);
34 use Scalar
::Util
qw(refaddr looks_like_number);
35 my ($hasxml, $hasxml_err); BEGIN { ($hasxml, $hasxml_err) = (0, "") }
36 my ($hasxmlp, $hasxmlp_err); BEGIN { ($hasxmlp, $hasxmlp_err) = (0, "") }
39 @EXPORT_OK = qw(Markdown ProcessRaw GenerateStyleSheet SetWikiOpts SplitURL
40 escapeXML unescapeXML ResolveFragment);
41 $INC{__PACKAGE__
.'.pm'} = $INC{basename
(__FILE__
)} unless exists $INC{__PACKAGE__
.'.pm'};
44 close(DATA
) if fileno(DATA
);
45 exit(&_main
(@ARGV)||0) unless caller;
48 my $msg = join(" ", @_);
50 printf STDERR
"%s: fatal: %s\n", basename
($0), $msg;
56 $encoder = Encode
::find_encoding
('Windows-1252') ||
57 Encode
::find_encoding
('ISO-8859-1') or
58 die "failed to load ISO-8859-1 encoder\n";
62 # Global default settings:
64 my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width,
65 $g_start_p, $g_close_p);
67 $g_style_prefix = "_markdown-"; # Prefix for markdown css class styles
68 $g_empty_element_suffix = " />"; # Change to ">" for HTML output
69 $g_indent_width = 4; # Number of spaces considered new level
70 $g_start_p = "<p>"; # _FormParagraphs open paragraph tag
71 $g_close_p = "</p>"; # _FormParagraphs close paragraph tag
79 # Style sheet template
82 # Permanent block id table
85 # Global hashes, used by various utility routines
98 # Return a "block id" to use to identify the block that does not contain
99 # any characters that could be misinterpreted by the rest of the code
100 # Originally this used md5_hex but that's unnecessarily slow
101 # Instead just use the refaddr of the scalar ref of the entry for that
102 # key in either the global or, if the optional second argument is true,
103 # permanent table. To avoid the result being confused with anything
104 # else, it's prefixed with a control character and suffixed with another
105 # both of which are not allowed by the XML standard or Unicode.
107 $_[1] or return "\5".refaddr
(\
$g_block_ids{$_[0]})."\6";
108 $_[1] == 1 and return "\2".refaddr
(\
$g_perm_block_ids{$_[0]})."\3";
109 $_[1] == 2 and return "\25".refaddr
(\
$g_code_block_ids{$_[0]})."\26";
110 die "programmer error: bad block_id type $_[1]";
113 # Regex to match balanced [brackets]. See Friedl's
114 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
115 my $g_nested_brackets;
117 $g_nested_brackets = qr{
118 (?
> # Atomic matching
119 [^\
[\
]]+ # Anything other than brackets
122 (??
{ $g_nested_brackets }) # Recursive set of nested brackets
128 # Regex to match balanced (parentheses)
131 $g_nested_parens = qr{
132 (?
> # Atomic matching
133 [^\
(\
)]+ # Anything other than parentheses
136 (??
{ $g_nested_parens }) # Recursive set of nested parentheses
142 # Table of hash values for escaped characters:
145 $g_escape_table{""} = "\2\3";
146 foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:<") {
147 $g_escape_table{$char} = block_id
($char,1);
151 # Used to track when we're inside an ordered or unordered list
152 # (see _ProcessListItems() for details):
159 #### Blosxom plug-in interface ##########################################
163 $_haveBX = defined($blosxom::version
);
166 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
167 # which posts Markdown should process, using a "meta-markup: markdown"
168 # header. If it's set to 0 (the default), Markdown will process all
170 my $g_blosxom_use_meta;
172 $g_blosxom_use_meta = 0;
177 my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
179 if ((! $g_blosxom_use_meta) or
180 (defined($meta::markup
) and ($meta::markup
=~ /^\s*markdown\s*$/i))
182 $$body_ref = Markdown
($$body_ref);
188 #### Movable Type plug-in interface #####################################
189 my $_haveMT = eval {require MT
; 1;}; # Test to see if we're running in MT
190 my $_haveMT3 = $_haveMT && eval {require MT
::Plugin
; 1;}; # and MT >= MT 3.0.
195 require MT
::Template
::Context
;
196 import MT
::Template
::Context
;
201 my $plugin = new MT
::Plugin
({
203 description
=> "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
204 doc_link
=> 'https://daringfireball.net/projects/markdown/'
206 MT
->add_plugin( $plugin );
209 MT
::Template
::Context
->add_container_tag(MarkdownOptions
=> sub {
212 my $builder = $ctx->stash('builder');
213 my $tokens = $ctx->stash('tokens');
215 if (defined ($args->{'output'}) ) {
216 $ctx->stash('markdown_output', lc $args->{'output'});
219 defined (my $str = $builder->build($ctx, $tokens) )
220 or return $ctx->error($builder->errstr);
224 MT
->add_text_filter('markdown' => {
226 docs
=> 'https://daringfireball.net/projects/markdown/',
232 my $output = $ctx->stash('markdown_output');
233 if (defined $output && $output =~ m/^html/i) {
234 $g_empty_element_suffix = ">";
235 $ctx->stash('markdown_output', '');
237 elsif (defined $output && $output eq 'raw') {
239 $ctx->stash('markdown_output', '');
243 $g_empty_element_suffix = " />";
246 $text = $raw ?
$text : Markdown
($text);
251 # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
256 $smartypants = $MT::Template
::Context
::Global_filters
{'smarty_pants'};
260 MT
->add_text_filter('markdown_with_smartypants' => {
261 label
=> 'Markdown With SmartyPants',
262 docs
=> 'https://daringfireball.net/projects/markdown/',
267 my $output = $ctx->stash('markdown_output');
268 if (defined $output && $output eq 'html') {
269 $g_empty_element_suffix = ">";
272 $g_empty_element_suffix = " />";
275 $text = Markdown
($text);
276 $text = $smartypants->($text, '1');
283 return $_haveBX || $_haveMT ?
4 : 8;
288 defined($str) or return undef;
296 BEGIN {%_yamlmode = (
316 sub _require_pod_usage
() {
318 eval 'require Pod::Text::Termcap; 1;' and
319 @Pod::Usage
::ISA
= (qw( Pod::Text::Termcap ));
322 #### BBEdit/command-line text filter interface ##########################
327 #### Check for command-line switches: #################
332 Getopt
::Long
::Configure
(qw(bundling require_order pass_through));
336 Pod
::Usage
::pod2usage
(-verbose
=> 2, -exitval
=> 0)},
339 Pod
::Usage
::pod2usage
(-verbose
=> 0, -exitval
=> 0)},
340 'version|V' => sub { # Version info
341 print "\nThis is Markdown, version $VERSION.\n", $COPYRIGHT;
342 print "License is Modified BSD (aka 3-clause BSD) License\n";
343 print "<https://opensource.org/licenses/BSD-3-Clause>\n";
345 'shortversion|short-version|s' => sub { # Just the version number string
348 'html4tags' => \
$cli_opts{'html4tags'},
349 'deprecated' => \
$cli_opts{'deprecated'},
350 'sanitize' => \
$cli_opts{'sanitize'},
351 'no-sanitize' => sub {$cli_opts{'sanitize'} = 0},
352 'validate-xml' => sub {$cli_opts{'validate-xml'} = 1},
353 'validate-xml-internal' => sub {$cli_opts{'validate-xml'} = 2},
354 'no-validate-xml' => sub {$cli_opts{'validate-xml'} = 0},
355 'stripcomments|strip-comments' => \
$cli_opts{'stripcomments'},
356 'no-stripcomments|no-strip-comments' => sub {$cli_opts{'stripcomments'} = 0},
357 'keepabs|keep-abs|k' => \
$cli_opts{'keepabs'},
358 'absroot|a=s' => \
$cli_opts{'absroot'},
359 'base|b=s' => \
$cli_opts{'base'},
360 'htmlroot|r=s' => \
$cli_opts{'htmlroot'},
361 'imageroot|i=s' => \
$cli_opts{'imageroot'},
362 'wiki|w:s' => \
$cli_opts{'wiki'},
363 'tabwidth|tab-width=s' => \
$cli_opts{'tabwidth'},
364 'autonumber|auto-number' => \
$cli_opts{'autonumber'},
365 'raw' => sub { $cli_opts{'raw'} = 1 },
366 'raw-xml' => sub { $cli_opts{'raw'} = 1 },
367 'raw-html' => sub { $cli_opts{'raw'} = 2 },
368 'stylesheet|style-sheet' => \
$cli_opts{'stylesheet'},
369 'no-stylesheet|no-style-sheet' => sub {$cli_opts{'stylesheet'} = 0},
370 'stub' => \
$cli_opts{'stub'},
371 'yaml:s' => \
$cli_opts{'yaml'},
373 defined($cli_opts{'raw'}) or $cli_opts{'raw'} = 0;
375 if ($cli_opts{'stub'}) {
378 if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
379 $options{empty_element_suffix
} = ">";
382 if ($cli_opts{'deprecated'}) { # Allow <dir> and <menu> tags to pass through
383 _SetAllowedTag
("dir");
384 _SetAllowedTag
("menu");
387 $options{sanitize
} = 1; # sanitize by default
388 $options{sanitize
} = $cli_opts{'sanitize'} if defined($cli_opts{'sanitize'});
389 $xmlcheck = $options{sanitize
} ?
2 : 0;
390 $xmlcheck = $cli_opts{'validate-xml'} if defined($cli_opts{'validate-xml'});
391 $options{stripcomments
} = $cli_opts{'stripcomments'} if defined($cli_opts{'stripcomments'});
392 die "--html4tags and --validate-xml are incompatible\n"
393 if $cli_opts{'html4tags'} && $xmlcheck == 1;
394 die "--no-sanitize and --validate-xml-internal are incompatible\n"
395 if !$options{'sanitize'} && $xmlcheck == 2;
396 die "--no-sanitize and --strip-comments are incompatible\n"
397 if !$options{'sanitize'} && $options{stripcomments
};
398 die "--raw-html requires --validate-xml-internal\n"
399 if $cli_opts{'raw'} == 2 && $xmlcheck != 2;
400 if ($xmlcheck == 1) {
401 eval { require XML
::Simple
; 1 } and $hasxml = 1 or $hasxml_err = $@
;
402 eval { require XML
::Parser
; 1 } and $hasxmlp = 1 or $hasxmlp_err = $@
unless $hasxml;
403 die "$hasxml_err$hasxmlp_err" unless $hasxml || $hasxmlp;
405 if ($cli_opts{'tabwidth'}) {
406 my $tw = $cli_opts{'tabwidth'};
407 die "invalid tab width (must be integer)\n" unless looks_like_number
$tw;
408 die "invalid tab width (must be >= 2 and <= 32)\n" unless $tw >= 2 && $tw <= 32;
409 $options{tab_width
} = int(0+$tw);
411 $options{auto_number
} = 6 if $cli_opts{'autonumber'};
412 $options{keepabs
} = $cli_opts{'keepabs'};
413 $options{abs_prefix
} = ""; # no abs prefix by default
414 if ($cli_opts{'absroot'}) { # Use abs prefix for absolute path URLs
415 my $abs = $cli_opts{'absroot'};
417 $options{abs_prefix
} = $abs;
419 $options{base_prefix
} = ""; # no base prefix by default
420 if ($cli_opts{'base'}) { # Use base prefix for fragment URLs
421 $options{base_prefix
} = $cli_opts{'base'};
423 if ($cli_opts{'htmlroot'}) { # Use URL prefix
424 $options{url_prefix
} = $cli_opts{'htmlroot'};
426 if ($cli_opts{'imageroot'}) { # Use image URL prefix
427 $options{img_prefix
} = $cli_opts{'imageroot'};
429 SetWikiOpts
(\
%options, $cli_opts{'wiki'}); # Set wiki links options
430 if (ref($options{wikiopt
}) eq 'HASH') {
431 my $o = $options{wikiopt
};
432 $o->{"f"} && $o->{"%"} and
433 die "--wiki sub-options 'f' and '%' are mutually exclusive\n"
435 if ($cli_opts{'raw'}) {
437 $options{htmlauto
} = 1 if $cli_opts{'raw'} == 2;
439 $options{show_styles
} = $cli_opts{'stylesheet'} if defined($cli_opts{'stylesheet'});
440 $options{show_styles
} = 1 if $stub && !defined($options{show_styles
});
441 $options{tab_width
} = 8 unless defined($options{tab_width
});
442 my $ym = $cli_opts{'yaml'};
443 defined($ym) && $ym ne "" or $ym = "enable";
445 exists($_yamlmode{$lcym}) or die "invalid --yaml= value '$ym'\n";
446 $options{yamlmode
} = $_yamlmode{$lcym};
447 $options{yamlvis
} = $_yamlvis{$lcym};
454 <html xmlns="http://www.w3.org/1999/xhtml">
456 <meta charset="utf-8" />
457 <meta http-equiv="content-type" content="text/html; charset=utf-8" />
459 } elsif ($stub < 0) {
463 <meta charset="utf-8">
464 <meta http-equiv="content-type" content="text/html; charset=utf-8">
467 if ($stub && ($options{title} || $options{h1})) {
468 my $title = $options{title};
469 defined($title) && $title ne "" or $title = $options{h1};
470 if (defined($title) && $title ne "") {
471 $title =~ s/&/&/g;
472 $title =~ s/</</g;
473 $out .= "<title>$title</title>\n";
476 $out .= GenerateStyleSheet($g_style_prefix) if $options{show_styles};
478 $out .= "</head>\n<body style=\"text-align:center\">\n" .
479 "<div style=\"display:inline-block;text-align:left;max-width:42pc\">\n";
484 #### Process incoming text: ###########################
485 my ($didhdr, $hdr, $result, $ftr) = (0, "", "", "");
486 @ARGV or push(@ARGV, "-");
488 my ($fh, $contents, $oneresult);
489 $_ eq "-" or open $fh, '<', $_ or fauxdie "could not open \"$_\": $!\n";
491 local $/; # Slurp the whole file
492 $_ eq "-" and $contents = <STDIN>;
493 $_ ne "-" and $contents = <$fh>;
495 defined($contents) or fauxdie "could not read \"$_\": $!\n";
496 $_ eq "-" or close($fh);
497 $options{xmlcheck} = ($xmlcheck == 2) ? 2 : 0;
498 $oneresult = $raw ? ProcessRaw($contents, \%options) : Markdown($contents, \%options);
499 $oneresult =~ s/\s+$//os;
500 if ($oneresult ne "") {
501 if (!$didhdr && !$raw) {
505 $result .= $oneresult . "\n";
508 $hdr = &$hdrf() unless $didhdr || $raw;
509 $ftr = "</div>\n</body>\n</html>\n" if $stub && !$raw;
510 if ($xmlcheck == 1) {
512 if ($stub && !$raw) {
513 ($good, $errs) = _xmlcheck($hdr.$result.$ftr);
515 ($good, $errs) = _xmlcheck("<div>".$result."</div>");
519 print $hdr, $result, $ftr;
527 # $2: value of --wiki= option (see docs) except
528 # that a value of undef turns off wiki links
535 ref($o) eq "HASH" or die "internal error: first arg to SetWikiOpts must be HASH ref";
536 delete $o->{wikipat};
537 delete $o->{wikiopt};
538 defined($wpat) or return;
539 # Parse wiki links option setting
541 if ($wpat =~ /^(.*?)%\{((?:[%0-9A-Za-z]|[Ss]\([^)]*\))*)\}(.*)$/) {
542 $o->{wikipat} = $1 . "%{}" . $3;
545 $o->{wikipat} = $wpat . "%{}.html";
548 while ($wopt =~ /^(.*?)s\(([^)]*)\)(.*)$/i) {
550 $wopt = $1 . "s" . $3;
551 $sarg =~ s/^\s+//; $sarg =~ s/\s+$//;
552 $sval = {} unless ref($sval) eq "HASH";
553 s/^\.//, $sval->{lc($_)}=1 foreach split(/(?:\s*,\s*)|(?:(?<!,)\s+(?!,))/, $sarg);
554 $sval = 1 unless scalar(keys(%$sval));
556 $o->{wikiopt} = { map({$_ => 1} split(//,lc($wopt))) };
557 if (ref($sval) eq "HASH" && $sval->{':md
'}) {
558 delete $sval->{':md
'};
559 $sval->{$_} = 1 foreach qw(md rmd mkd mkdn mdwn mdown markdown litcoffee);
561 $o->{wikiopt
}->{'s'} = $sval if $o->{wikiopt
}->{'s'};
565 # Return a copy of the fancy CSS style sheet that uses the
566 # passed in prefix as a prefix for the CSS style names.
567 # If no argument is passed in, use $g_style_prefix
568 # as the CSS style name prefix.
569 sub GenerateStyleSheet
{
571 defined($prefix) or $prefix = $g_style_prefix;
572 my $stylesheet = $g_style_sheet;
573 $stylesheet =~ s/%\(base\)/$prefix/g;
581 ($hasxml ?
eval { XML
::Simple
::XMLin
($text, KeepRoot
=> 1) && 1 } :
583 my $p = XML
::Parser
->new(Style
=> 'Tree', ErrorContext
=> 1);
584 $p->parse($text) && 1;
585 }) and $good = 1 or $errs = _trimerr
($@
);
592 1 while $err =~ s{\s+at\s+\.?/[^,\s\n]+\sline\s+[0-9]+\.?(\n|$)}{$1}is;
599 my ($input,$parseyaml) = @_;
600 defined $input or $input = "";
603 $input =~ s/[\x00-\x08\x0B\x0E-\x1F\x7F]+//gso;
606 if (Encode
::is_utf8
($input) || utf8
::decode
($input)) {
609 $output = $encoder->decode($input, Encode
::FB_DEFAULT
);
611 # Standardize line endings:
612 $output =~ s{\r\n}{\n}g; # DOS to Unix
613 $output =~ s{\r}{\n}g; # Mac to Unix
615 # Extract YAML front matter if requested
619 if ($output =~ /^---[ \t]*(?:\n|\z)/g) {
620 until ($output =~ /\G(?:(?:(?:---)|(?:\.\.\.))[ \t]*(?:\n|\z)|\z)/gc) {
621 next if $output =~ m
"\G[ \t]*(?:#[^\n]*)?\n"gc
; # skip comment lines
622 next if $output =~ m
"\G[ \t]*(?:#[^\n]*)\z"gc
; # skip final no EOL comment
623 last unless $output =~ /\G([^\n]+)(?:\n|\z)/gc;
625 if ($yl =~ /^([A-Za-z_][A-Za-z_0-9.-]*):[ \t]+(.*)$/os) {
626 my ($k, $v) = ($1, $2);
627 $yaml->{lc($k)} = _YAMLvalue
($2);
630 $output = substr($output, pos($output));
633 return wantarray ?
($output, $yaml) : $output;
640 if (substr($v, 0, 1) eq '"') {
641 # only $ and/or @ present issues, map them
642 $v =~ tr/\@\$/\036\037/;
643 eval '{$v='.$v."\n}1" or $v = undef;
644 $v =~ tr/\036\037/\@\$/ if defined($v);
648 $v ne "" or $v = undef;
655 my $text = _PrepareInput(shift);
657 # Any remaining arguments after the first are options; either a single
658 # hashref or a list of name, value pairs. See _SanitizeOpts comments.
661 empty_element_suffix => $g_empty_element_suffix,
664 if (ref($_[0]) eq "HASH
") {
669 while (my ($k,$v) = each %args) {
672 _SanitizeOpts(\%opt);
674 # Sanitize all '<'...'>' tags if requested
675 $text = _SanitizeTags($text, $opt{xmlcheck}, $opt{htmlauto}) if $opt{sanitize};
682 # $1: HASH ref with the following key value semantics
684 # sanitize => any-false-value (no action), any-true-value (sanitize).
685 # note that a true value of xmlcheck or a true value of
686 # stripcomments or a urlfunc value that is a CODE ref
687 # always forces sanitize to activate.
688 # tag attributes are sanitized by removing all "questionable
"
689 # attributes (such as script attributes, unknown attributes
690 # and so forth) and normalizing the remaining ones (i.e.
691 # adding missing quotes and/or values etc.).
692 # effective for both ProcessRaw and Markdown.
693 # xmlcheck => 0 (no check), any-true-value (internal check).
694 # note that the default if xmlcheck is not set/valid is 2.
695 # note that a true value is effective for both ProcessRaw
696 # and Markdown. note that a true value automatically inserts
697 # the closing tag for auto-closing tags and converts empty tags
698 # to the correct format converting empty tags that shouldn't be
699 # to an open and close pair; since xmlcheck is a function of the
700 # sanitizer, tag attributes are also always sanitized whenever
701 # xmlcheck has a true value.
702 # note that a true xmlcheck value WILL call "die" with a
703 # detailed indication of the error(s) if xml validation fails
704 # in which case any line/column numbers refer to the text that
705 # would be produced by a sanitize=>0, xmlcheck=>0 call to
706 # either ProcessRaw or Markdown, NOT the original input text.
707 # htmlauto => any-false-value (no auto close), any-true-value (auto-close)
708 # only effective for ProcessRaw; always enabled for Markdown.
709 # when xmlcheck is set to 2 provide html automatic closing tag
710 # and optional closing tag semantics where closing tags are
711 # automatically inserted when encountering an opening tag that
712 # auto closes a currently open tag and tags with an optional
713 # closing tag that's missing have that inserted as appropriate.
714 # a true value may result in some texts being rejected that
715 # would be otherwise be accepted (e.g. "<p
><pre
></pre></p>"
716 # which gets turned into "<p
></p><pre></pre
></p
>" which then
717 # no longer validates).
718 # stripcomments => any-false-value (no action), any-true-value (strip).
719 # since the strip comments mechanism is a function of the
720 # sanitizer, if stripcomments is set to any-true-value then
721 # tag attributes will also always be sanitized.
722 # effective for both ProcessRaw and Markdown.
723 # empty_element_suffix => " />" or ">"
724 # will be forced to " />" if not valid or defined.
725 # effective for both ProcessRaw and Markdown.
726 # urlfunc => if set to a CODE ref, the function will be called with
727 # seven arguments like so:
728 # $result = &$urlfunc($iresult, \%opts, $tag, $uhost, $uabs, $q, $f)
729 # where on input $iresult is the result that would be produced
730 # if no urlfunc was provided and on output $result will be
731 # used as url value. $tag is either "img
" or "a
" to indicate the
732 # source of the url. $uhost.$uabs is the result of stripping off
733 # any query string and/or fragment from $iresult. $q contains either
734 # an empty string or the stripped off query string and $f contains
735 # an empty string or the stripped off fragment if they were originally
736 # present where a non-empty $q always starts with '?' and a non-empty
737 # $f always starts with '#'. $uhost contains the scheme and
738 # host+port if present (it may be the empty string). $uabs contains
739 # the path portion which may or may not start with a "/" depending on
740 # whether or not it's a relative path. The $iresult value is related
741 # to the other arguments like so:
742 # $iresult = $uhost . $uabs . $q . $f;
743 # All values passed to the urlfunc function have already been HTML
744 # unescaped and the returned value will be automatically HTML escaped.
745 # Any provided urlfunc should treat the %opts HASH as
746 # read-only. Modifying the %opts HASH in urlfunc will
747 # likely result in unpredictable behavior! Don't do it!
748 # If urlfunc is set to a CODE ref then tags will also always be
751 # The remaining key value pairs are ignored by ProcessRaw and are only
752 # effective when using Markdown or _main
754 # tab_width => 1..32 which is how many spaces tabs are expanded to.
755 # will be forced to 8 if not in range.
756 # indent_width => 1..32 how many spaces make a new "indent" level.
757 # will be forced to 4 if not in range.
758 # style_prefix => prefix to prepend to all CSS style names in the
759 # fancy CSS style sheet.
760 # defaults to $g_style_prefix if not defined.
761 # note that _main actually adds the style sheet (when
762 # requested); use GenerateStyleSheet to retrieve the
763 # fancy style sheet when calling Markdown directly.
764 # auto_number => <= 0 (default) no numbering, 1 number h1s,
765 # 2 number h1s, h2s, 3 number h1-h3s, ... >= 6 number h1-h6s
766 # anchors => existence of this key triggers return of anchors HASH
767 # yamlmode => 0 (no YAML processing), > 0 (YAML on), < 0 (YAML ignore)
768 # if 0, the YAML front matter processor is completely
769 # disabled and any YAML front matter that might be present
770 # will be treated as markup. if > 0 any YAML front matter
771 # will be processed and any recognized options applied.
772 # if < 0 any YAML front matter will be parsed but no
773 # options will be applied at all. When != 0 the parsed
774 # YAML front matter can be retrieved via the 'yaml' key.
775 # defaults to 1 if not defined or not a number.
776 # yamlvis => 0 (invisible), > 0 (visible), < 0 (vis if unknown)
777 # if yamlmode == 0 then yamlvis has no effect. if > 0
778 # then any parsed YAML front matter options will be shown
779 # in the formatted output. if 0 then NO YAML front
780 # matter options will be shown in the formatted output.
781 # if < 0 then YAML front matter options will be shown in
782 # the formatted output only if there are any unrecognized
784 # defaults to -1 if not defined or not a number.
785 # keepabs => any-false-value (no action), any-true-value (keep)
786 # if true, any absolute path URLs remaining after applying
787 # any abs_prefix value will be kept and not be subject
788 # to modification by any url_prefix or img_prefix value.
789 # abs_prefix => value to prefix to absolute path URLs (i.e. start with /).
790 # note that this does NOT get prepended to //host/path URLs.
791 # url_prefix => value to prefix to non-absolute URLs.
792 # note that this does NOT get prepended to //host/path URLs.
793 # img_prefix => value to prefix to non-absolute image URLs.
794 # note that this does NOT get prepended to //host/path URLs.
795 # note that if img_prefix is undef or empty ("") then
796 # url_prefix will be prepended to image URLs.
797 # base_prefix => value to prefix to fragment-only URLs (i.e. start with #).
798 # note that fragment-only URLs are always left undisturbed
799 # if this is not set. Fragment-only URLs are NOT affected by
800 # any of abs_prefix, url_prefix or img_prefix.
801 # wikipat => non-empty pattern string to enable wiki links.
802 # best set with SetWikiOpts (see SetWikiOpts comments).
803 # wikiopt => HASH ref of options affecting wiki links processing.
804 # best set with SetWikiOpts (see SetWikiOpts comments).
805 # wikifunc => if set to a CODE ref, the function will be called with
806 # six arguments like so:
807 # $result = &$wikifunc($iresult, \%opts, $link, $wbase, $qf, $io)
808 # where on input $iresult is the result that would be produced
809 # if no wikifunc was provided and on output $result will be
810 # used as the wiki expansion. $link is the original wiki
811 # destination as specified in the source, $wbase is the result
812 # of stripping off any query string and/or fragment from $link
813 # and then transforming that according to the wikiopt HASH ref.
814 # $qf contains either an empty string or the stripped off
815 # query string and/or fragment if one was originally present.
816 # If $io is a HASH ref (otherwise it will be undef), then it's
817 # a wiki image link and %$io contains the options (if any) where
818 # the keys that might be present include "width", "height", "align"
819 # and "alt". If present, width and height are guaranteed to be
820 # positive integers and align is guaranteed to be "left", "right"
821 # or "center". The value for "alt" may be the empty string.
822 # The "imgflag" key has a SCALAR ref value and if the value it
823 # refers to is changed to any false value the result will become
824 # an A tag rather than an IMG tag.
825 # The $iresult value is related to the other arguments like so:
826 # $iresult = $opts->{wikipat};
827 # $iresult =~ s/%\{\}.+$/%{}/ if ref($io) eq "HASH";
828 # $iresult = s/%\{\}/$wbase/;
830 # Any provided wikifunc should treat the %opts HASH as
831 # read-only. Modifying the %opts HASH in wikifunc will
832 # likely result in unpredictable behavior! Don't do it!
834 # Special handling for abs_prefix, url_prefix, img_prefix and/or base_prefix
835 # may be activated by setting any subset (or all) of the values for these
836 # keys to a CODE ref. The single argument is the URL and the result must
837 # be the adjusted URL. For example, the equivalent CODE ref to setting
838 # url_prefix to $string is simply sub { $string.$_[0] }. By using a
839 # CODE ref, behavior other than simply performing a prepend operation
840 # can be realized when necessary for unusual situations.
842 # The following are OUTPUT values that can only be retrieved when
843 # Markdown is called with a HASH ref as the second argument
845 # anchors => if the 'anchors' key exists in the input HASH ref
846 # will be set to a HASH ref containing lookup keys
847 # for valid fragment ids in the document (only those
848 # created from Markdown markup) with the value the
849 # actual fragment link to use. Do not use this directly
850 # but pass it as the first argument to the ResolveFragment
851 # function to resolve a "fuzzy" fragment name to its
852 # actual fragment name in the generated output.
853 # NOTE: to activate return of anchors the 'anchors' key
854 # simply must exist in the input HASH ref passed to the
855 # Markdown function, its value will be replaced on output.
857 # h1 => will be set to the tag-stripped value of the first
858 # non-empty H1 generated by Markdown-style markup.
859 # note that literal <h1>...</h1> values are NOT picked up.
860 # will be left unchanged if no Markdown-style H1 detected.
861 # note that the value is NOT xml escaped but should be
862 # before embedding in an XHTML document. If yamlmode > 0
863 # and a 'title' value has been encountered, then this
864 # will be set to that 'title' value instead (and the
865 # 'title' key and value will still be present in %$yaml).
867 # yaml => if yamlmode is != 0 then this will be set to a HASH
868 # ref containing any parsed YAML front matter or left
869 # unchanged if no YAML front matter was found. If the
870 # parsed YAML front matter contains only whitespace and/or
871 # comments then this will be set to a HASH ref that has
875 my $o = shift; # hashref
876 ref($o) eq "HASH" or return;
878 $o->{xmlcheck
} = looks_like_number
($o->{xmlcheck
}) && $o->{xmlcheck
} == 0 ?
0 : 2;
879 $o->{sanitize
} = 1 if $o->{stripcomments
} && !$o->{sanitize
};
880 $o->{sanitize
} = 1 if $o->{xmlcheck
} && !$o->{sanitize
};
881 $o->{sanitize
} = 1 if ref($o->{urlfunc
}) eq 'CODE' && !$o->{sanitize
};
883 # this is gross, but having the globals avoids unnecessary slowdown
884 if ($o->{sanitize
} && $o->{xmlcheck
}) {
885 $g_start_p = "<\20>";
886 $g_close_p = "</\20>";
892 defined($o->{empty_element_suffix
}) &&
893 ($o->{empty_element_suffix
} eq " />" || $o->{empty_element_suffix
} eq ">")
894 or $o->{empty_element_suffix
} = " />";
896 $o->{tab_width
} = 8 unless looks_like_number
($o->{tab_width
}) &&
897 1 <= $o->{tab_width
} && $o->{tab_width
} <= 32;
898 $o->{tab_width
} = int($o->{tab_width
});
900 $o->{indent_width
} = 4 unless looks_like_number
($o->{indent_width
}) &&
901 1 <= $o->{indent_width
} && $o->{indent_width
} <= 32;
902 $o->{indent_width
} = int($o->{indent_width
});
904 defined($o->{auto_number
}) or $o->{auto_number
} = '';
905 $o->{auto_number
} eq '' || looks_like_number
($o->{auto_number
})
906 or $o->{auto_number
} = 6;
907 if ($o->{auto_number
} ne '') {
908 $o->{auto_number
} = int(0+$o->{auto_number
});
909 $o->{auto_number
} >= 0 or $o->{auto_number
} = 0;
910 $o->{auto_number
} <= 6 or $o->{auto_number
} = 6;
913 defined($o->{style_prefix
}) or $o->{style_prefix
} = $g_style_prefix;
915 $o->{abs_prefix
} = _MakePrefixCODERef
($o->{abs_prefix
}, 1)
916 unless ref($o->{abs_prefix
}) eq 'CODE';
917 $o->{url_prefix
} = _MakePrefixCODERef
($o->{url_prefix
}, 0)
918 unless ref($o->{url_prefix
}) eq 'CODE';
919 $o->{img_prefix
} = _MakePrefixCODERef
($o->{img_prefix
}, 0)
920 unless ref($o->{img_prefix
}) eq 'CODE';
921 $o->{base_prefix
} = _MakePrefixCODERef
($o->{base_prefix
}, -1)
922 unless ref($o->{base_prefix
}) eq 'CODE';
924 ref($o->{wikiopt
}) eq "HASH" or $o->{wikiopt
} = {};
926 # Note that because Markdown makes a copy of the options
927 # before calling this function, this does not actually remove
928 # any "h1" key that might have been set by the caller of
929 # the Markdown function. However, by deleting it here,
930 # this guarantees that any found value will actually be
931 # picked up and stored (which will not happen if the key
935 # Default is to silently strip any known YAML front matter
936 # Same comment about "yaml" key as above for "h1" key
937 $o->{yamlmode
} = 1 unless looks_like_number
($o->{yamlmode
});
938 $o->{yamlvis
} = -1 unless looks_like_number
($o->{yamlvis
});
941 # The anchors hash will only be returned if the key exists
942 # (the key's value doesn't matter), set the value to an empty
943 # HASH ref just in case to make sure it's always a HASH ref.
944 $o->{anchors
} = {} if exists($o->{anchors
});
948 BEGIN {%_yamlopts = map({$_ => 1} qw(
955 sub _HasUnknownYAMLOptions
{
956 do { return 1 unless exists($_yamlopts{$_}) } foreach keys(%{$_[0]});
962 my ($yaml, $opt) = @_;
963 if (defined($yaml->{display_metadata
}) && $opt->{yamlvis
} < 0) {
964 # ignore display_metadata except in --yaml=enable mode
965 $opt->{yamlvis
} = _YAMLTrueValue
($yaml->{display_metadata
}) ?
1 : 0;
967 $opt->{h1
} = $yaml->{title
} if defined($yaml->{title
});
968 if (defined($yaml->{header_enum
}) && $opt->{auto_number
} eq '') {
969 $opt->{auto_number
} = _YAMLTrueValue
($yaml->{header_enum
}) ?
6 : 0;
976 defined($v) or $v = "";
978 return !($v eq "" || $v eq "0" || $v eq "false" || $v eq "disable" || $v eq "off" || $v eq "no");
982 # Actually returns an empty string rather than a CODE ref
983 # if an empty prefix is passed in. Trailing "/"s are trimmed
984 # off if the second argument is positive or the string does NOT
985 # consist of only "/"s. A trailing "/" is added unless the
986 # trimmed prefix already has one or the second argument is true.
987 # If the second argument is negative, the prefix is used as-is.
988 sub _MakePrefixCODERef
{
989 my ($prefix, $mtok) = @_;
990 defined($prefix) or $prefix = "";
991 looks_like_number
($mtok) or $mtok = $mtok ?
1 : 0;
995 $prefix =~ s
,//+$,/,;
997 $prefix ne "" or return "";
998 $prefix .= '/' if !$mtok && substr($prefix, -1, 1) ne '/';
999 return sub { $prefix . $_[0] };
1005 # Primary function. The order in which other subs are called here is
1006 # essential. Link and image substitutions need to happen before
1007 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
1008 # and <img> tags get encoded.
1012 # Any remaining arguments after the first are options; either a single
1013 # hashref or a list of name, value pairs. See _SanitizeOpts comments.
1015 # set initial defaults
1016 style_prefix
=> $g_style_prefix,
1017 empty_element_suffix
=> $g_empty_element_suffix,
1018 tab_width
=> _tabDefault
,
1019 indent_width
=> $g_indent_width,
1020 abs_prefix
=> "", # Prefixed to absolute path URLs
1021 url_prefix
=> "", # Prefixed to non-absolute URLs
1022 img_prefix
=> "", # Prefixed to non-absolute image URLs
1023 base_prefix
=> "", # Prefixed to fragment-only URLs
1027 if (ref($_[0]) eq "HASH") {
1032 while (my ($k,$v) = each %args) {
1035 _SanitizeOpts
(\
%opt);
1038 ($text, $yaml) = _PrepareInput
($text, $opt{yamlmode
});
1039 _ApplyYAMLOpts
($yaml, \
%opt) if ref($yaml) eq "HASH" && $opt{yamlmode
} > 0;
1041 # Clear the globals. If we don't clear these, you get conflicts
1042 # from other articles when generating a page which contains more than
1043 # one article (e.g. an index page that shows the N most recent
1050 %g_code_block_ids = ();
1051 %g_html_blocks = ();
1052 %g_code_blocks = ();
1053 @g_xml_comments = ();
1056 # Make sure $text ends with a couple of newlines:
1059 # Handle backticks-delimited code blocks
1060 $text = _HashBTCodeBlocks
($text);
1062 # Convert all tabs to spaces.
1063 $text = _DeTab
($text);
1065 # Strip any lines consisting only of spaces.
1066 # This makes subsequent regexen easier to write, because we can
1067 # match consecutive blank lines with /\n+/ instead of something
1068 # contorted like / *\n+/ .
1069 $text =~ s/^ +$//mg;
1071 # Turn block-level HTML blocks into hash entries
1072 $text = _HashHTMLBlocks
($text, 1);
1074 # Strip link definitions, store in hashes.
1075 $text = _StripLinkDefinitions
($text);
1077 $text = _RunBlockGamut
($text, 1);
1079 # Remove indentation markers
1080 $text =~ s/\027+//gs;
1082 # Expand auto number flags
1083 $text =~ s/\034([1-6])/_AutoHeaderNum(ord($1)&0x7)/gse
1084 if $opt{auto_number
} ne '' && $opt{auto_number
} > 0;
1086 # Unhashify code blocks
1087 $text =~ s/(\025\d+\026)/$g_code_blocks{$1}/g;
1089 $text = _UnescapeSpecialChars
($text);
1091 $text .= "\n" unless $text eq "";
1093 # Sanitize all '<'...'>' tags if requested
1094 $text = _SanitizeTags
($text, $opt{xmlcheck
}, 1) if $opt{sanitize
};
1096 utf8
::encode
($text);
1097 if (ref($_[0]) eq "HASH") {
1098 ${$_[0]}{anchors
} = {%g_anchors_id} if exists(${$_[0]}{anchors
});
1099 if (defined($opt{h1
}) && $opt{h1
}) {
1100 utf8
::encode
($opt{h1
});
1101 ${$_[0]}{h1
} = $opt{h1
};
1103 ${$_[0]}{yaml
} = $yaml if ref($yaml) eq "HASH";
1106 if (ref($yaml) eq "HASH" && %$yaml && $opt{yamlmode
} && $opt{yamlvis
}) {
1107 if ($opt{yamlvis
} > 0 || _HasUnknownYAMLOptions
($yaml)) {
1108 my ($hrows, $drows) = ("", "");
1109 foreach (sort(keys(%$yaml))) {
1110 my $v = $yaml->{$_};
1116 $drows .= "<td>" . $v . "</td>\n";
1118 $rspn = " class=\"$opt{style_prefix}yaml-undef-value\" rowspan=\"2\" valign=\"top\"";
1120 $hrows .= "<th$rspn>" . $_ . "</th>\n";
1122 $yamltable = "<table class=\"$opt{style_prefix}yaml-table\" border=\"1\">\n" .
1123 "<tr>\n$hrows</tr>\n<tr>\n$drows</tr>\n</table>\n";
1126 return $yamltable.$text;
1130 sub _HashBTCodeBlocks
{
1132 # Process Markdown backticks (```) delimited code blocks
1135 my $less_than_indent = $opt{indent_width
} - 1;
1139 ([ ]{0,$less_than_indent})``(`+)[ \t]*(?:([\w.+-]+[#]?)(?:[ \t][ \t\w.+-]*)?)?\n
1140 ( # $4 = the code block -- one or more lines, starting with ```
1145 # and ending with ``` or end of document
1146 (?
:(?
:[ ]{0,$less_than_indent}``\
2[ \t]*(?
:\n|\Z
))|\Z
)
1148 # $2 contains syntax highlighting to use if defined
1149 my $leadsp = length($1);
1151 $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
1152 $codeblock = _DeTab
($codeblock, 8, $leadsp); # physical tab stops are always 8
1153 $codeblock =~ s/\A\n+//; # trim leading newlines
1154 $codeblock =~ s/\s+\z//; # trim trailing whitespace
1155 $codeblock = _EncodeCode
($codeblock); # or run highlighter here
1156 $codeblock = "<div class=\"$opt{style_prefix}code-bt\"><pre style=\"display:none\"></pre><pre><code>"
1157 . $codeblock . "\n</code></pre></div>";
1159 my $key = block_id
($codeblock);
1160 $g_html_blocks{$key} = $codeblock;
1161 "\n\n" . $key . "\n\n";
1168 sub _StripLinkDefinitions
{
1170 # Strips link definitions from text, stores the URLs and titles in
1174 my $less_than_indent = $opt{indent_width
} - 1;
1176 # Link defs are in the form: ^[id]: url "optional title"
1178 ^[ ]{0,$less_than_indent}\
[(.+)\
]: # id = $1
1180 \n?
# maybe *one* newline
1182 <?
((?
:\S
(?
:\\\n\s
*[^\s
"(])?)+?)>? # url = $2
1184 \n? # maybe one newline
1187 (?<=\s) # lookbehind for whitespace
1188 (?:(['"])|(\
()) # title quote char
1190 (?
(4)\
)|\
3) # match same quote
1192 )?
# title is optional
1196 my $id = _strip
(lc $1); # Link IDs are case-insensitive
1198 my $title = _strip
($5);
1199 $url =~ s/\\\n\s*//gs;
1201 # These values always get passed through _MakeATag or _MakeIMGTag later
1202 $g_urls{$id} = $url;
1203 if (defined($title) && $title ne "") {
1204 $g_titles{$id} = $title;
1212 my %ok_tag_name; # initialized later
1213 my ($block_tags_a, $block_tags_b, $block_tags_c);
1215 $block_tags_a = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/io;
1216 $block_tags_b = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1217 $block_tags_c = qr/div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1220 sub _HashHTMLBlocks
{
1221 my ($text, $toplevel) = @_;
1222 my $less_than_indent = $opt{indent_width
} - 1;
1223 my $idt = "\027" x
$g_list_level;
1224 my $blkprc = $toplevel ?
1225 sub { return $ok_tag_name{$_[1]} ? _EncodeAmpsAndAngles
($_[0]) : $_[0] } :
1226 sub { return $_[0] };
1228 # Hashify HTML blocks:
1229 # We only want to do this for block-level HTML tags, such as headers,
1230 # lists, and tables. That's because we still want to wrap <p>s around
1231 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
1232 # phrase emphasis, and spans. The list of tags we're looking for is
1235 # First, look for nested blocks, e.g.:
1238 # tags for inner block must be indented.
1242 # The outermost tags must start at the left margin for this to match, and
1243 # the inner nested divs must be indented.
1244 # We need to do this before the next, more liberal match, because the next
1245 # match will start at the first `<div>` and stop at the first `</div>`.
1248 ^ # start of line (with /m)
1249 ((?
:\Q
$idt\E
)?
) # optional lead in = $2
1250 <($block_tags_a) # start tag = $3
1252 (?
:.*\n)*?
# any number of lines, minimally matching
1253 \
2</\
3\s
*> # the matching end tag
1254 [ ]* # trailing spaces
1255 (?
=\n+|\Z
) # followed by a newline or end of document
1258 my $blk = &$blkprc($1, $3);
1259 my $key = block_id
($blk);
1260 $g_html_blocks{$key} = $blk;
1261 "\n\n" . $key . "\n\n";
1266 # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
1270 ^ # start of line (with /m)
1271 (?
:\Q
$idt\E
)?
# optional lead in
1272 <($block_tags_b) # start tag = $2
1274 (?
:.*\n)*?
# any number of lines, minimally matching
1275 .*</\
2\s
*> # the matching end tag
1276 [ ]* # trailing spaces
1277 (?
=\n+|\Z
) # followed by a newline or end of document
1280 my $blk = &$blkprc($1, $2);
1281 my $key = block_id
($blk);
1282 $g_html_blocks{$key} = $blk;
1283 "\n\n" . $key . "\n\n";
1287 # Now match any empty block tags that should have been paired
1291 ^ # start of line (with /m)
1292 (?
:\Q
$idt\E
)?
# optional lead in
1293 <($block_tags_c) # start tag = $2
1296 /?
> # the matching end tag
1297 [ ]* # trailing spaces
1298 (?
=\n+|\Z
) # followed by a newline or end of document
1301 my $key = block_id
($1);
1302 $g_html_blocks{$key} = $1;
1303 "\n\n" . $key . "\n\n";
1306 # Special case just for <hr />. It was easier to make a special case than
1307 # to make the other regex more complicated.
1310 (?
<=\n) # Starting after end of line
1312 \A
# the beginning of the doc
1315 [ ]{0,$less_than_indent}
1319 /?
> # the matching end tag
1321 (?
=\n{1,}|\Z
) # followed by end of line or end of document
1324 my $key = block_id
($1);
1325 $g_html_blocks{$key} = $1;
1326 "\n\n" . $key . "\n\n";
1329 # Special case for standalone XML comments:
1332 (?
<=\n\n) # Starting after a blank line
1334 \A
\n?
# the beginning of the doc
1337 [ ]{0,$less_than_indent}
1340 (?
:[^-]|(?
:-(?
!-)))*
1345 (?
:[^-]|(?
:-(?
!-)))*
1350 (?
=\n{1,}|\Z
) # followed by end of line or end of document
1353 my $key = block_id
($1);
1354 push(@g_xml_comments, $key)
1355 if $opt{stripcomments
} && !exists($g_html_blocks{$key});
1356 $g_html_blocks{$key} = $1;
1357 "\n\n" . $key . "\n\n";
1365 sub _RunBlockGamut
{
1367 # These are all the transformations that form block-level
1368 # tags like paragraphs, headers, and list items.
1370 my ($text, $anchors) = @_;
1372 $text = _DoHeaders
($text, $anchors);
1374 # Do Horizontal Rules:
1375 $text =~ s{^ {0,3}\*(?: {0,2}\*){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm
;
1376 $text =~ s{^ {0,3}\_(?: {0,2}\_){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm
;
1377 $text =~ s{^ {0,3}\-(?: {0,2}\-){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm
;
1379 $text = _DoListsAndBlocks
($text);
1381 $text = _DoTables
($text);
1383 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
1384 # was to escape raw HTML in the original Markdown source. This time,
1385 # we're escaping the markup we've just created, so that we don't wrap
1386 # <p> tags around block-level tags.
1387 $text = _HashHTMLBlocks
($text);
1389 $text = _FormParagraphs
($text, $anchors);
1396 return _DoBlockQuotes
(_DoCodeBlocks
($_[0])) if $_[0] ne "";
1402 # These are all the transformations that occur *within* block-level
1403 # tags like paragraphs, headers, and list items.
1407 $text = _DoCodeSpans
($text);
1409 $text = _EscapeSpecialChars
($text);
1411 # Process anchor and image tags. Images must come first,
1412 # because ![foo][f] looks like an anchor.
1413 $text = _DoImages
($text);
1414 $text = _DoAnchors
($text);
1416 # Make links out of things like `<http://example.com/>`
1417 # Must come after _DoAnchors(), because you can use < and >
1418 # delimiters in inline links like [this](<url>).
1419 $text = _DoAutoLinks
($text);
1421 $text = _EncodeAmpsAndAngles
($text);
1423 $text = _DoItalicsAndBoldAndStrike
($text);
1426 $text =~ s/ {3,}(\n|\z)/<br clear=\"all\"$opt{empty_element_suffix}$1/g;
1427 $text =~ s/ {2,}\n/<br$opt{empty_element_suffix}\n/g;
1428 $text =~ s/ {2,}\z//g;
1434 sub _EscapeSpecialChars
{
1436 my $tokens ||= _TokenizeHTML
($text);
1438 $text = ''; # rebuild $text from the tokens
1439 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
1440 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
1442 foreach my $cur_token (@
$tokens) {
1443 if ($cur_token->[0] eq "tag") {
1444 # Within tags, encode *, _ and ~ so they don't conflict
1445 # with their use in Markdown for italics and strong.
1446 # We're replacing each such character with its
1447 # corresponding block id value; this is likely
1448 # overkill, but it should prevent us from colliding
1449 # with the escape values by accident.
1450 $cur_token->[1] =~ s!([*_~])!$g_escape_table{$1}!g;
1451 $text .= $cur_token->[1];
1453 my $t = $cur_token->[1];
1454 $t = _EncodeBackslashEscapes
($t);
1462 sub _ProcessWikiLink
{
1463 my ($link_text, $link_loc) = @_;
1464 if (defined($link_loc) &&
1465 ($link_loc =~ m{^#\S*$} || $link_loc =~ m{^(?:http|ftp)s?://\S+$}i)) {
1466 # Return the new link
1467 return _MakeATag
(_FindFragmentMatch
($link_loc), $link_text);
1469 if (!defined($link_loc)) {
1470 $link_loc = _RunSpanGamut
($link_text);
1471 $link_loc = _strip
(unescapeXML
(_StripTags
(_UnescapeSpecialChars
($link_loc))));
1472 $link_loc =~ m{^(?:http|ftp)s?://\S+$}i and
1473 # Return the new link
1474 return _MakeATag
($link_loc, $link_text);
1476 return undef if $link_loc eq "" || $link_text eq "";
1477 if ($link_loc =~ /^[A-Za-z][A-Za-z0-9+.-]*:/os) {
1478 # Unrecognized scheme
1481 if ($opt{wikipat
}) {
1482 my $o = $opt{wikiopt
};
1483 my $img_link = _strip
($link_text);
1487 my $imgopts = undef;
1488 if ($img_link =~ /^[^#?\s]+\.(?:png|gif|jpe?g|svgz?)$/i) {
1489 $base = _wxform
($img_link, 1);
1491 $imgopts = _ParseWikiImgOpts
($link_loc);
1492 $imgopts->{imgflag
} = \
$img;
1495 if ($link_loc =~ /^(.*?)([?#].*)$/os) {
1496 ($base, $qsfrag) = ($1, $2);
1498 $base = _wxform
($base);
1500 my $result = $opt{wikipat
};
1501 $result =~ s/%\{\}.+$/%{}/os if $img;
1502 $result =~ s/%\{\}/$base/;
1503 if ($qsfrag =~ /^([^#]*)(#.+)$/os) {
1504 my ($q,$f) = ($1,$2);
1505 #$f = _wxform($f) if $f =~ / /;
1509 $result = &{$opt{wikifunc
}}($result, \
%opt, ($img?
$img_link:$link_loc), $base, $qsfrag, $imgopts)
1510 if ref($opt{wikifunc
}) eq 'CODE';
1513 $result =~ s/%(?![0-9A-Fa-f]{2})/%25/sog;
1516 s/([\x00-\x1F <>"{}|\\^`x7F])/sprintf("%%%02X",ord($1))/soge;
1519 s/([\x00-\x1F <>"{}|\\^`\x7F-\xFF])/sprintf("%%%02X",ord($1))/soge;
1521 $result =~ s/(%(?![0-9A-F]{2})[0-9A-Fa-f]{2})/uc($1)/soge;
1523 # Return the new link
1524 return $img ? _MakeIMGTag
($result, undef, undef, $imgopts) : _MakeATag
($result, $link_text);
1531 sub _ParseWikiImgOpts
{
1534 # alt= consumes the rest of the line, do it first
1535 if ($alts =~ /(?:^|,)\s*alt\s*=\s*(.*)$/ios) {
1537 $alts = substr($alts, 0, $-[0]);
1538 $o{alt
} = _strip
($atext);
1540 foreach my $kv (split(/\s*,\s*/, lc($alts))) {
1541 if ($kv =~ /^\s*([^\s]+)\s*=\s*([^\s]+)\s*$/os) {
1542 my ($k, $v) = ($1, $2);
1543 if (($k eq "width" || $k eq "height") && $v =~ /^\d+$/) {
1544 $o{$k} = 0+$v if $v > 0;
1547 if ($k eq "align" && ($v eq "left" || $v eq "right" || $v eq "center")) {
1559 my $o = $opt{wikiopt
};
1560 my $opt_s = $o->{s
};
1561 if (!$img && $opt_s) {
1563 if ($w =~ m{^(.*)[.]([^./]*)$}) {
1564 my ($base, $ext) = ($1, $2);
1565 $w = $base if $opt_s->{lc($ext)};
1568 $w =~ s{[.][^./]*$}{};
1571 $w = uc($w) if $o->{u
};
1572 $w = lc($w) if $o->{l
};
1573 $w =~ s{/+}{%252F}gos if $o->{"%"};
1574 $w =~ s/ +/%20/gos if $o->{b
};
1575 $w =~ tr{/}{ } if $o->{f
};
1576 $w =~ s{/+}{/}gos if !$o->{f
} && !$o->{v
};
1579 $w =~ s/-+/-/gos unless $o->{v
};
1582 $w =~ s/_+/_/gos unless $o->{v
};
1588 # Return a suitably encoded <a...> tag string
1589 # On input NONE of $url, $text or $title should be xmlencoded
1590 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1592 my ($url, $text, $title) = @_;
1593 defined($url) or $url="";
1594 defined($text) or $text="";
1595 defined($title) or $title="";
1597 $url =~ m
"^#" && ref($opt{base_prefix
}) eq 'CODE' and $url = &{$opt{base_prefix
}}($url);
1598 my $result = $g_escape_table{'<'}."a href=\"" . _EncodeAttText
($url) . "\"";
1599 $title = _strip
($title);
1600 $text =~ s{<(/?a)}{<$1}sogi;
1601 $text = _DoItalicsAndBoldAndStrike
($text);
1602 # We've got to encode any of these remaining to avoid
1603 # conflicting with other italics, bold and strike through and links.
1604 $text =~ s!([]*_~[])!$g_escape_table{$1}!g;
1605 $result .= " title=\"" . _EncodeAttText
($title) . "\"" if $title ne "";
1606 return $result . $g_escape_table{'>'} .
1607 $text . $g_escape_table{'<'}."/a".$g_escape_table{'>'};
1613 # Turn Markdown link shortcuts into XHTML <a> tags.
1618 # First, handle wiki-style links: [[wiki style link]]
1621 ( # wrap whole match in $1
1623 ($g_nested_brackets) # link text and id = $2
1628 my $whole_match = $1;
1630 my $link_loc = undef;
1632 if ($link_text =~ /^(.*)\|(.*)$/s) {
1634 $link_loc = _strip
($2);
1637 $result = _ProcessWikiLink
($link_text, $link_loc);
1638 defined($result) or $result = $whole_match;
1643 # Next, handle reference-style links: [link text] [id]
1646 ( # wrap whole match in $1
1648 ($g_nested_brackets) # link text = $2
1651 [ ]?
# one optional space
1652 (?
:\n[ ]*)?
# one optional newline followed by spaces
1655 ($g_nested_brackets) # id = $3
1660 my $whole_match = $1;
1664 if ($link_id eq "") {
1665 # for shortcut links like [this][].
1666 $link_id = _RunSpanGamut
($link_text);
1667 $link_id = unescapeXML
(_StripTags
(_UnescapeSpecialChars
($link_id)));
1669 $link_id = _strip
(lc $link_id);
1671 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
1672 my $url = $g_urls{$link_id};
1673 defined($url) or $url = $g_anchors{$link_id};
1674 $url = _FindFragmentMatch
($url);
1675 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1676 $result = _MakeATag
(_PrefixURL
($url), $link_text, $g_titles{$link_id});
1679 $result = $whole_match;
1685 # Subsequently, inline-style links: [link text](url "optional title")
1688 ( # wrap whole match in $1
1690 ($g_nested_brackets) # link text = $2
1693 ($g_nested_parens) # href and optional title = $3
1698 my $whole_match = $1;
1700 my ($url, $title) = _SplitUrlTitlePart
($3);
1702 if (defined($url)) {
1703 $url = _FindFragmentMatch
($url);
1704 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1705 _MakeATag
(_PrefixURL
($url), $link_text, $title);
1707 # The href/title part didn't match the pattern
1713 # Finally, handle reference-style implicit shortcut links: [link text]
1716 ( # wrap whole match in $1
1718 ($g_nested_brackets) # link text = $2
1723 my $whole_match = $1;
1725 my $link_id = _RunSpanGamut
($2);
1726 $link_id = _strip
(lc(unescapeXML
(_StripTags
(_UnescapeSpecialChars
($link_id)))));
1728 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
1729 my $url = $g_urls{$link_id};
1730 defined($url) or $url = $g_anchors{$link_id};
1731 $url = _FindFragmentMatch
($url);
1732 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1733 $result = _MakeATag
(_PrefixURL
($url), $link_text, $g_titles{$link_id});
1736 $result = $whole_match;
1746 defined($_[0]) or return undef;
1747 if (substr($_[0],0,1) eq "(") {
1748 return substr($_[0], 1, length($_[0]) - (substr($_[0], -1, 1) eq ")" ?
2 : 1));
1754 sub _SplitUrlTitlePart
{
1755 return ("", undef) if $_[0] =~ m{^\s*$}; # explicitly allowed
1757 $u =~ s/^\s*(['\042])/# $1/;
1761 <?
([^\s
'\042]\S*?)>? # URL = $1
1762 (?: # optional grouping
1763 \s+ # must be distinct from URL
1764 (['\042]?
) # quote char = $2
1766 \
2?
# matching quote
1767 )?
# title is optional
1771 return (undef, undef) if $_[1] && ($1 eq "" || $1 eq "#");
1772 return (_PeelWrapped
($1), $2 ?
$3 : _PeelWrapped
($3));
1774 return (undef, undef);
1779 sub _FindFragmentMatchInternal
{
1780 my ($anchors_id, $url, $undefifnomatch) = @_;
1781 if (defined($url) && $url =~ /^#\S/) {
1782 # try very hard to find a match
1783 my $idbase = _strip
(lc(substr($url, 1)));
1784 my $idbase0 = $idbase;
1785 my $id = _MakeAnchorId
($idbase);
1786 $undefifnomatch and $url = undef;
1787 if (defined($$anchors_id{$id})) {
1788 $url = $$anchors_id{$id};
1790 $idbase =~ s/-/_/gs;
1791 $id = _MakeAnchorId
($idbase);
1792 if (defined($$anchors_id{$id})) {
1793 $url = $$anchors_id{$id};
1795 $id = _MakeAnchorId
($idbase0, 1);
1796 if (defined($$anchors_id{$id})) {
1797 $url = $$anchors_id{$id};
1799 $id = _MakeAnchorId
($idbase, 1);
1800 if (defined($$anchors_id{$id})) {
1801 $url = $$anchors_id{$id};
1811 sub _FindFragmentMatch
{
1812 return _FindFragmentMatchInternal
(\
%g_anchors_id, @_);
1819 if (Encode
::is_utf8
($input) || utf8
::decode
($input)) {
1822 $output = $encoder->decode($input, Encode
::FB_DEFAULT
);
1828 # $_[0] -> HASH ref of anchors (e.g. the "anchors" OUTPUT from Markdown)
1829 # $_[1] -> fragment to resolve, may optionally start with '#'
1830 # An empty string ("") or hash ("#") is returned as-is.
1831 # returns undef if no match otherwise resolved fragment name
1832 # which will start with a '#' if $_[1] started with '#' otherwise will not.
1833 # This function can be used to connect up links to "implicit" anchors.
1834 # All Markdown-format H1-H6 headers have an implicit anchor added
1835 # based on the header item text. Passing that text to this function
1836 # will cough up the matching implicit anchor if there is one.
1839 my ($anchors, $frag) = @_;
1840 defined($frag) or return undef;
1841 $frag eq "" || $frag eq "#" and return $frag;
1842 my $hadhash = ($frag =~ s/^#//);
1843 $frag =~ /^\S/ or return undef;
1844 ref($anchors) eq 'HASH' or return undef;
1845 my $ans = _FindFragmentMatchInternal
($anchors, '#'._ToUTF8
($frag), 1);
1846 $hadhash || !defined($ans) or $ans =~ s/^#//;
1847 defined($ans) and utf8
::encode
($ans);
1852 # Return a suitably encoded <img...> tag string
1853 # On input NONE of $url, $alt or $title should be xmlencoded
1854 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1856 my ($url, $alt, $title, $iopts) = @_;
1857 defined($url) or $url="";
1858 defined($alt) or $alt="";
1859 defined($title) or $title="";
1860 ref($iopts) eq "HASH" or $iopts = {};
1861 return "" unless $url ne "";
1863 my ($w, $h, $lf, $rt) = (0, 0, '', '');
1864 ($alt, $title) = (_strip
($alt), _strip
($title));
1865 if ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
1866 ($title, $w, $h, $lf, $rt) = (_strip
($1), $3, $4, $2, $5);
1867 } elsif ($title =~ /^(.*)\((<?)\?[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
1868 ($title, $h, $lf, $rt) = (_strip
($1), $3, $2, $4);
1869 } elsif ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]\?(>?)\)$/os) {
1870 ($title, $w, $lf, $rt) = (_strip
($1), $3, $2, $4);
1871 } elsif ($title =~ /^(.*)\((?!\))(<?)(>?)\)$/os) {
1872 ($title, $lf, $rt) = (_strip
($1), $2, $3);
1874 $iopts->{align
} = "center" if $lf && $rt;
1875 $iopts->{align
} = "left" if $lf && !$rt;
1876 $iopts->{align
} = "right" if !$lf && $rt;
1877 $iopts->{width
} = $w if $w != 0;
1878 $iopts->{height
} = $h if $h != 0;
1879 $iopts->{alt
} = $alt if $alt ne "";
1880 $iopts->{title
} = $title if $title ne "";
1881 my $iopt = sub { defined($iopts->{$_[0]}) ?
$iopts->{$_[0]} : (@_ > 1 ?
$_[1] : "") };
1883 $result .= $g_escape_table{'<'}."center".$g_escape_table{'>'}
1884 if &$iopt("align") eq "center";
1885 $result .= $g_escape_table{'<'}."img src=\"" . _EncodeAttText
($url) . "\"";
1886 $result .= " align=\"left\"" if &$iopt("align") eq "left";
1887 $result .= " align=\"right\"" if &$iopt("align") eq "right";
1888 $result .= " alt=\"" . _EncodeAttText
($iopts->{alt
}) . "\"" if &$iopt("alt") ne "";
1889 $result .= " width=\"" . $iopts->{width
} . "\"" if &$iopt("width",0) != 0;
1890 $result .= " height=\"" . $iopts->{height
} . "\"" if &$iopt("height",0) != 0;
1891 $result .= " title=\"" . _EncodeAttText
($iopts->{title
}) . "\"" if &$iopt("title") ne "";
1892 $result .= " /" unless $opt{empty_element_suffix
} eq ">";
1893 $result .= $g_escape_table{'>'};
1894 $result .= $g_escape_table{'<'}."/center".$g_escape_table{'>'}
1895 if &$iopt("align") eq "center";
1902 # Turn Markdown image shortcuts into <img> tags.
1907 # First, handle reference-style labeled images: ![alt text][id]
1910 ( # wrap whole match in $1
1912 ($g_nested_brackets) # alt text = $2
1915 [ ]?
# one optional space
1916 (?
:\n[ ]*)?
# one optional newline followed by spaces
1919 ($g_nested_brackets) # id = $3
1925 my $whole_match = $1;
1929 $link_id ne "" or $link_id = $alt_text; # for shortcut links like ![this][].
1930 $link_id = _strip
(lc $link_id);
1932 if (defined $g_urls{$link_id}) {
1933 $result = _MakeIMGTag
(
1934 _PrefixURL
($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1937 # If there's no such link ID, leave intact:
1938 $result = $whole_match;
1945 # Next, handle inline images: ![alt text](url "optional title")
1946 # Don't forget: encode * and _
1949 ( # wrap whole match in $1
1951 ($g_nested_brackets) # alt text = $2
1954 ($g_nested_parens) # src and optional title = $3
1958 my $whole_match = $1;
1960 my ($url, $title) = _SplitUrlTitlePart
($3, 1);
1961 defined($url) ? _MakeIMGTag
(_PrefixURL
($url), $alt_text, $title) : $whole_match;
1965 # Finally, handle reference-style implicitly labeled links: ![alt text]
1968 ( # wrap whole match in $1
1970 ($g_nested_brackets) # alt text = $2
1975 my $whole_match = $1;
1977 my $link_id = lc(_strip
($alt_text));
1979 if (defined $g_urls{$link_id}) {
1980 $result = _MakeIMGTag
(
1981 _PrefixURL
($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1984 # If there's no such link ID, leave intact:
1985 $result = $whole_match;
1994 sub _EncodeAttText
{
1996 defined($text) or return undef;
1997 $text = escapeXML
(_strip
($text));
1998 # We've got to encode these to avoid conflicting
1999 # with italics, bold and strike through.
2000 $text =~ s!([*_~:])!$g_escape_table{$1}!g;
2007 my ($link, $strip) = @_;
2010 $link =~ s/\s+/_/gs;
2011 $link =~ tr/-a-z0-9_//cd;
2013 $link =~ tr/-a-z0-9_/_/cs;
2015 return '' unless $link ne '';
2016 $link = "_".$link."_";
2017 $link =~ s/__+/_/gs;
2018 $link = "_".md5_hex
($link)."_" if length($link) > 66;
2023 sub _GetNewAnchorId
{
2024 my $link = _strip
(lc(shift));
2025 return '' if $link eq "" || defined($g_anchors{$link});
2026 my $id = _MakeAnchorId
($link);
2027 return '' unless $id;
2028 $g_anchors{$link} = '#'.$id;
2029 $g_anchors_id{$id} = $g_anchors{$link};
2034 defined($g_anchors_id{$id2}) or $g_anchors_id{$id2} = $g_anchors{$link};
2036 my $idd = _MakeAnchorId
($link, 1);
2038 defined($g_anchors_id{$idd}) or $g_anchors_id{$idd} = $g_anchors{$link};
2042 $idd2 =~ s/__+/_/gs;
2043 defined($g_anchors_id{$idd2}) or $g_anchors_id{$idd2} = $g_anchors{$link};
2051 my ($text, $anchors) = @_;
2053 my $geth1 = $anchors && !defined($opt{h1
}) ?
sub {
2054 return unless !defined($h1);
2056 $h1 = $h if $h ne "";
2059 # atx-style headers:
2062 # ## Header 2 with closing hashes ##
2067 ^(\#
{1,6}) # $1 = string of #'s
2069 ((?
:(?
:(?
<![#])[^\s]|[^#\s]).*?)?) # $2 = Header text
2073 my $h_level = length($1);
2077 my $rsg = _RunSpanGamut
($h);
2078 $h = _strip
(unescapeXML
(_StripTags
(_UnescapeSpecialChars
($rsg))));
2079 my $id = $h eq "" ?
"" : _GetNewAnchorId
($h);
2080 $id = " id=\"$id\"" if $id ne "";
2081 &$geth1($h) if $h_level == 1;
2082 "<h$h_level$id>" . _AutoHeaderFlag
($h_level) . $rsg . "</h$h_level>\n\n";
2085 # Setext-style headers:
2095 $text =~ s
{ ^(?
:=+[ ]*\n)?
[ ]*(.+?
)[ ]*\n=+[ ]*\n+ }{
2097 my $rsg = _RunSpanGamut
($h);
2098 $h = _strip
(unescapeXML
(_StripTags
(_UnescapeSpecialChars
($rsg))));
2099 my $id = $h eq "" ?
"" : _GetNewAnchorId
($h);
2100 $id = " id=\"$id\"" if $id ne "";
2102 "<h1$id>" . _AutoHeaderFlag
(1) . $rsg . "</h1>\n\n";
2105 $text =~ s
{ ^(?
:-+[ ]*\n)?
[ ]*(.+?
)[ ]*\n-+[ ]*\n+ }{
2107 my $rsg = _RunSpanGamut
($h);
2108 $h = _strip
(unescapeXML
(_StripTags
(_UnescapeSpecialChars
($rsg))));
2109 my $id = $h eq "" ?
"" : _GetNewAnchorId
($h);
2110 $id = " id=\"$id\"" if $id ne "";
2111 "<h2$id>" . _AutoHeaderFlag
(2) . $rsg . "</h2>\n\n";
2114 $text =~ s
{ ^(?
:~+[ ]*\n)?
[ ]*(.+?
)[ ]*\n~+[ ]*\n+ }{
2116 my $rsg = _RunSpanGamut
($h);
2117 $h = _strip
(unescapeXML
(_StripTags
(_UnescapeSpecialChars
($rsg))));
2118 my $id = $h eq "" ?
"" : _GetNewAnchorId
($h);
2119 $id = " id=\"$id\"" if $id ne "";
2120 "<h3$id>" . _AutoHeaderFlag
(3) . $rsg . "</h3>\n\n";
2123 $opt{h1
} = $h1 if defined($h1) && $h1 ne "";
2128 sub _AutoHeaderFlag
{
2130 my $auto = $opt{auto_number
} || 0;
2131 return '' unless 1 <= $level && $level <= $auto;
2132 return "\34".chr(0x30+$level);
2136 sub _AutoHeaderNum
{
2138 my $auto = $opt{auto_number
} || 0;
2139 return '' unless 1 <= $level && $level <= $auto;
2140 pop(@autonum) while @autonum > $level;
2141 push(@autonum, 1) while @autonum < $level - 1;
2142 $autonum[$level - 1] += 1;
2143 return join('.', @autonum).' ';
2147 my ($marker_ul, $marker_ol, $marker_any, $roman_numeral, $greek_lower);
2149 # Re-usable patterns to match list item bullets and number markers:
2150 $roman_numeral = qr
/(?
:
2151 [IiVvXx
]|[Ii
]{2,3}|[Ii
][VvXx
]|[VvXx
][Ii
]{1,3}|[Xx
][Vv
][Ii
]{0,3}|
2152 [Xx
][Ii
][VvXx
]|[Xx
]{2}[Ii
]{0,3}|[Xx
]{2}[Ii
]?
[Vv
]|[Xx
]{2}[Vv
][Ii
]{1,2})/ox
;
2153 $greek_lower = qr/(?:[\x{03b1}-\x{03c9}])/o;
2154 $marker_ul = qr/[*+-]/o;
2155 $marker_ol = qr/(?:\d+|[A-Za-z]|$roman_numeral|$greek_lower)[.\)]/o;
2156 $marker_any = qr/(?:$marker_ul|$marker_ol)/o;
2160 sub _GetListMarkerType
{
2161 my ($list_type, $list_marker, $last_marker) = @_;
2162 return "" unless $list_type && $list_marker && lc($list_type) eq "ol";
2163 my $last_marker_type = '';
2164 $last_marker_type = _GetListMarkerType
($list_type, $last_marker)
2165 if defined($last_marker) &&
2166 # these are roman unless $last_marker type case matches and is 'a' or 'A'
2167 $list_marker =~ /^[IiVvXx][.\)]?$/;
2168 return "I" if $list_marker =~ /^[IVX]/ && $last_marker_type ne 'A';
2169 return "i" if $list_marker =~ /^[ivx]/ && $last_marker_type ne 'a';
2170 return "A" if $list_marker =~ /^[A-Z]/;
2171 return "a" if $list_marker =~ /^[a-z]/ || $list_marker =~ /^$greek_lower/o;
2176 sub _GetListItemTypeClass
{
2177 my ($list_type, $list_marker, $last_marker) = @_;
2178 my $list_marker_type = _GetListMarkerType
($list_type, $list_marker, $last_marker);
2180 return "" unless length($list_marker) >= 2 && $list_marker_type =~ /^[IiAa1]$/;
2181 return "lower-greek" if $list_marker_type eq "a" && $list_marker =~ /^$greek_lower/o;
2182 return "" unless $list_marker =~ /\)$/;
2183 return "upper-roman" if $list_marker_type eq "I";
2184 return "lower-roman" if $list_marker_type eq "i";
2185 return "upper-alpha" if $list_marker_type eq "A";
2186 return "lower-alpha" if $list_marker_type eq "a";
2189 return ($list_marker_type, $ans);
2193 my %_roman_number_table;
2195 %_roman_number_table = (
2227 # Necessary because ς and σ are the same value grrr
2228 my %_greek_number_table;
2230 %_greek_number_table = (
2231 "\x{03b1}" => 1, # α
2232 "\x{03b2}" => 2, # β
2233 "\x{03b3}" => 3, # γ
2234 "\x{03b4}" => 4, # δ
2235 "\x{03b5}" => 5, # ε
2236 "\x{03b6}" => 6, # ζ
2237 "\x{03b7}" => 7, # η
2238 "\x{03b8}" => 8, # θ
2239 "\x{03b9}" => 9, # ι
2240 "\x{03ba}" => 10, # κ
2241 "\x{03bb}" => 11, # λ
2242 #"\x{00b5}"=> 12, # µ is "micro" not "mu"
2243 "\x{03bc}" => 12, # μ
2244 "\x{03bd}" => 13, # ν
2245 "\x{03be}" => 14, # ξ
2246 "\x{03bf}" => 15, # ο
2247 "\x{03c0}" => 16, # π
2248 "\x{03c1}" => 17, # ρ
2249 "\x{03c2}" => 18, # ς
2250 "\x{03c3}" => 18, # σ
2251 "\x{03c4}" => 19, # τ
2252 "\x{03c5}" => 20, # υ
2253 "\x{03c6}" => 21, # φ
2254 "\x{03c7}" => 22, # χ
2255 "\x{03c8}" => 23, # ψ
2256 "\x{03c9}" => 24 # ω
2261 sub _GetMarkerIntegerNum
{
2262 my ($list_marker_type, $marker_val) = @_;
2264 return 0 + $marker_val if $list_marker_type eq "1";
2265 $list_marker_type = lc($list_marker_type);
2266 return $_greek_number_table{$marker_val}
2267 if $list_marker_type eq "a" &&
2268 defined($_greek_number_table{$marker_val});
2269 $marker_val = lc($marker_val);
2270 return ord($marker_val) - ord("a") + 1 if $list_marker_type eq "a";
2271 return 1 unless $list_marker_type eq "i";
2272 defined($_roman_number_table{$marker_val}) and
2273 return $_roman_number_table{$marker_val};
2276 return $ans if $ans == 0 && $list_marker_type eq "1";
2277 return $ans >= 1 ?
$ans : 1;
2282 my ($from, $to, $extra) = @_;
2283 $extra = defined($extra) ?
" $extra" : "";
2285 while ($from + 10 <= $to) {
2286 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-10\"></span>\n";
2289 while ($from + 5 <= $to) {
2290 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-5\"></span>\n";
2293 while ($from + 2 <= $to) {
2294 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-2\"></span>\n";
2297 while ($from < $to) {
2298 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr\"></span>\n";
2305 sub _DoListsAndBlocks
{
2307 # Form HTML ordered (numbered) and unordered (bulleted) lists.
2310 my $indent = $opt{indent_width
};
2311 my $less_than_indent = $indent - 1;
2312 my $less_than_double_indent = 2 * $indent - 1;
2314 # Re-usable pattern to match any entire ul or ol list:
2315 my $whole_list = qr{
2316 ( # $1 (or $_[0]) = whole list
2319 [ ]{0,$less_than_indent}
2320 (${marker_any
}) # $3 (or $_[2]) = first list item marker
2329 (?
! # Negative lookahead for another list item marker
2336 my $list_item_sub = sub {
2338 my $list_type = ($_[2] =~ m/$marker_ul/) ?
"ul" : "ol";
2340 my $list_class = "";
2342 # Turn double returns into triple returns, so that we can make a
2343 # paragraph for the last item in a list, if necessary:
2344 $list =~ s/\n\n/\n\n\n/g;
2345 my ($result, $first_marker, $fancy) = _ProcessListItems
($list_type, $list);
2346 defined($first_marker) or return $list;
2347 my $list_marker_type = _GetListMarkerType
($list_type, $first_marker);
2348 if ($list_marker_type) {
2349 $first_marker =~ s/[.\)]$//;
2350 my $first_marker_num = _GetMarkerIntegerNum
($list_marker_type, $first_marker);
2351 $list_att = $list_marker_type eq "1" ?
"" : " type=\"$list_marker_type\"";
2353 $list_class = " class=\"$opt{style_prefix}ol\"";
2354 my $start = $first_marker_num;
2355 $start = 10 if $start > 10;
2356 $start = 5 if $start > 5 && $start < 10;
2357 $start = 1 if $start > 1 && $start < 5;
2358 $list_att .= " start=\"$start\"" unless $start == 1;
2359 $list_incr = _IncrList
($start, $first_marker_num);
2361 $list_class = " class=\"$opt{style_prefix}lc-greek\""
2362 if $list_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
2363 $list_att .= " start=\"$first_marker_num\"" unless $first_marker_num == 1;
2366 my $idt = "\027" x
$g_list_level;
2367 $result = "$idt<$list_type$list_att$list_class>\n$list_incr" . $result . "$idt</$list_type>\n\n";
2371 # We use a different prefix before nested lists than top-level lists.
2372 # See extended comment in _ProcessListItems().
2374 # Note: (jg) There's a bit of duplication here. My original implementation
2375 # created a scalar regex pattern as the conditional result of the test on
2376 # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
2377 # substitution once, using the scalar as the pattern. This worked,
2378 # everywhere except when running under MT on my hosting account at Pair
2379 # Networks. There, this caused all rebuilds to be killed by the reaper (or
2380 # perhaps they crashed, but that seems incredibly unlikely given that the
2381 # same script on the same server ran fine *except* under MT. I've spent
2382 # more time trying to figure out why this is happening than I'd like to
2383 # admit. My only guess, backed up by the fact that this workaround works,
2384 # is that Perl optimizes the substition when it can figure out that the
2385 # pattern will never change, and when this optimization isn't on, we run
2386 # afoul of the reaper. Thus, the slightly redundant code to that uses two
2387 # static s/// patterns rather than one conditional pattern.
2389 # Note: (kjm) With the addition of the two-of-the-same-kind-in-a-row-
2390 # starts-a-list-at-the-top-level rule the two patterns really are somewhat
2391 # different now, but the duplication has pretty much been eliminated via
2392 # use of a separate sub which has the side-effect of making the below
2393 # two cases much easier to grok all at once.
2395 if ($g_list_level) {
2399 while ($parse =~ /\G(?s:.)*?^$whole_list/gmc) {
2400 my @captures = ($1, $2, $3, $4);
2401 if ($-[1] > $-[0]) {
2402 $text .= _DoListBlocks
(substr($parse, $-[0], $-[1] - $-[0]));
2404 $text .= &$list_item_sub(@captures);
2406 $text .= _DoListBlocks
(substr($parse, pos($parse))) if pos($parse) < length($parse);
2412 while ($parse =~ m
{\G
(?s
:.)*?
2416 (?
:(?
<=\n) # a list starts with one unordered marker line
2417 (?
=[ ]{0,$less_than_indent}$marker_ul[ ])) |
2418 (?
:(?
<=\n) # or two ordered marker lines in a row
2419 (?
=[ ]{0,$less_than_indent}$marker_ol[ ].*\n\n?
2420 [ ]{0,$less_than_indent}$marker_ol[ ])) |
2421 (?
:(?
<=\n) # or any marker and a sublist marker
2422 (?
=[ ]{0,$less_than_indent}$marker_any[ ].*\n\n?
2423 [ ]{$indent,$less_than_double_indent}$marker_any[ ]))
2427 my @captures = ($1, $2, $3, $4);
2428 if ($-[1] > $-[0]) {
2429 $text .= _DoListBlocks
(substr($parse, $-[0], $-[1] - $-[0]));
2431 $text .= &$list_item_sub(@captures);
2433 $text .= _DoListBlocks
(substr($parse, pos($parse))) if pos($parse) < length($parse);
2440 sub _ProcessListItems
{
2442 # Process the contents of a single ordered or unordered list, splitting it
2443 # into individual list items.
2446 my $list_type = shift;
2447 my $list_str = shift;
2449 # The $g_list_level global keeps track of when we're inside a list.
2450 # Each time we enter a list, we increment it; when we leave a list,
2451 # we decrement. If it's zero, we're not in a list anymore.
2453 # We do this because when we're not inside a list, we want to treat
2454 # something like this:
2456 # I recommend upgrading to version
2457 # 8. Oops, now this line is treated
2460 # As a single paragraph, despite the fact that the second line starts
2461 # with a digit-period-space sequence.
2463 # Whereas when we're inside a list (or sub-list), that line will be
2464 # treated as the start of a sub-list. What a kludge, huh? This is
2465 # an aspect of Markdown's syntax that's hard to parse perfectly
2466 # without resorting to mind-reading. Perhaps the solution is to
2467 # change the syntax rules such that sub-lists must start with a
2468 # starting cardinal number; e.g. "1." or "a.".
2471 my $idt = "\027" x
$g_list_level;
2472 my $marker_kind = $list_type eq "ul" ?
$marker_ul : $marker_ol;
2474 my $first_marker_type;
2475 my $first_marker_num;
2482 # trim trailing blank lines:
2483 $list_str =~ s/\n{2,}\z/\n/;
2488 while ($list_str =~ m
{\G
# start where we left off
2489 (\n+)?
# leading line = $1
2490 (^[ ]*) # leading whitespace = $2
2491 ($marker_any) [ ] ([ ]*) # list marker = $3 leading item space = $4
2493 my $leading_line = $1;
2494 my $leading_space = $2;
2495 my $list_marker = $3;
2496 my $list_marker_len = length($list_marker);
2497 my $leading_item_space = $4;
2498 if ($-[0] > $oldpos) {
2499 $result .= substr($list_str, $oldpos, $-[0] - $oldpos); # Sort-of $`
2500 $oldpos = $-[0]; # point at start of this entire match
2502 if (!defined($first_marker)) {
2503 $first_marker = $list_marker;
2504 $first_marker_type = _GetListMarkerType
($list_type, $first_marker);
2505 if ($first_marker_type) {
2506 (my $marker_val = $first_marker) =~ s/[.\)]$//;
2507 $first_marker_num = _GetMarkerIntegerNum
($first_marker_type, $marker_val);
2508 $next_num = $first_marker_num;
2509 $skipped = 1 if $next_num != 1;
2511 } elsif ($list_marker !~ /$marker_kind/) {
2512 # Wrong marker kind, "fix up" the marker to a correct "lazy" marker
2513 # But keep the old length in $list_marker_len
2514 $list_marker = $last_marker;
2517 # Now grab the rest of this item's data upto but excluding the next
2518 # list marker at the SAME indent level, but sublists must be INCLUDED
2521 while ($list_str =~ m
{\G
2522 ((?
:.+?
)(?
:\n{1,2})) # list item text = $1
2523 (?
= \n* (?
: \z
| # end of string OR
2524 (^[ ]*) # leading whitespace = $2
2525 ($marker_any) # next list marker = $3
2526 ([ ]+) )) # one or more spaces after marker = $4
2529 # If $3 has a left edge that is at the left edge of the previous
2530 # marker OR $3 has a right edge that is at the right edge of the
2531 # previous marker then we stop; otherwise we go on
2533 $item .= substr($list_str, $-[0], $+[0] - $-[0]); # $&
2534 last if !defined($4) || length($2) == length($leading_space) ||
2535 length($2) + length($3) == length($leading_space) + $list_marker_len;
2536 # move along, you're not the marker droid we're looking for...
2537 $item .= substr($list_str, $+[0], $+[4] - $+[0]);
2538 pos($list_str) = $+[4]; # ...move along over the marker droid
2540 # Remember where we parked
2541 $oldpos = pos($list_str);
2543 # Process the $list_marker $item
2549 if ($list_type eq "ul" && !$leading_item_space && $item =~ /^\[([ xX\xd7])\] +(.*)$/s) {
2552 my ($checkbox_class, $checkbox_val);
2553 if ($checkmark ne " ") {
2554 ($checkbox_class, $checkbox_val) = ("checkbox-on", "x");
2556 ($checkbox_class, $checkbox_val) = ("checkbox-off", " ");
2558 $liatt = " class=\"$opt{style_prefix}$checkbox_class\"";
2559 $checkbox = "<span><span></span></span><span></span><span>[<tt>$checkbox_val</tt>] </span>";
2561 my $list_marker_type;
2562 ($list_marker_type, $liatt) = _GetListItemTypeClass
($list_type, $list_marker, $last_marker);
2563 if ($list_type eq "ol" && defined($first_marker)) {
2564 my $styled = $fancy = 1 if $liatt && $list_marker =~ /\)$/;
2565 my ($sfx, $dash) = ("", "");
2566 ($sfx, $dash) = ("li", "-") if $styled;
2567 if ($liatt =~ /lower/) {
2568 $sfx .= "${dash}lc";
2569 } elsif ($liatt =~ /upper/) {
2570 $sfx .= "${dash}uc";
2572 $sfx .= "-greek" if $liatt =~ /greek/;
2573 $liatt = " class=\"$opt{style_prefix}$sfx\"" if $sfx;
2574 $typechanged = 1 if $list_marker_type ne $first_marker_type;
2575 (my $marker_val = $list_marker) =~ s/[.\)]$//;
2576 my $marker_num = _GetMarkerIntegerNum
($list_marker_type, $marker_val);
2577 $marker_num = $next_num if $marker_num < $next_num;
2578 $skipped = 1 if $next_num < $marker_num;
2579 $incr = _IncrList
($next_num, $marker_num, "incrlevel=$g_list_level");
2580 $liatt = " value=\"$marker_num\"$liatt" if $fancy || $skipped;
2581 $liatt = " type=\"$list_marker_type\"$liatt" if $styled || $typechanged;
2582 $next_num = $marker_num + 1;
2585 $last_marker = $list_marker;
2587 if ($leading_line or ($item =~ m/\n{2,}/)) {
2588 $item = _RunBlockGamut
(_Outdent
($item));
2589 $item =~ s{(</[OUou][Ll]>)\s*\z}{$1} and $item .= "\n$idt<span style=\"display:none\"> </span>";
2592 # Recursion for sub-lists:
2593 $item = _DoListsAndBlocks
(_Outdent
($item));
2595 $item = _RunSpanGamut
($item);
2599 $result .= "$incr$idt<li$liatt>" . $checkbox . $item . "$idt</li>\n";
2602 # remove "incrlevel=$g_list_level " parts
2603 $result =~ s
{<span incrlevel
=$g_list_level class="$opt{style_prefix}ol-incr((?:-\d{1,2})?)">}
2604 {$idt<span
class="$opt{style_prefix}ol-incr$1">}g
;
2606 # remove the $g_list_level incr spans entirely
2607 $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr(?:-\d{1,2})?"></span>\n}{}g;
2608 # remove the class="$opt{style_prefix}lc-greek" if first_marker is greek
2609 $result =~ s{(<li[^>]*?) class="$opt{style_prefix}lc-greek">}{$1>}g
2610 if defined($first_marker_type) && $first_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
2613 # Anything left over (similar to $') goes into result, but this should always be empty
2614 $result .= _RunBlockGamut
(substr($list_str, pos($list_str))) if pos($list_str) < length($list_str);
2618 # After all that, if we only got an ordered list with a single item
2619 # and its first marker is a four-digit number >= 1492 and <= 2999
2620 # or an UPPERCASE letter, then pretend we didn't see any list at all.
2622 if ($first_marker_type && $first_marker_num + 1 == $next_num) {
2623 if (($first_marker_type eq "1" && $first_marker_num >= 1492 && $first_marker_num <= 2999) ||
2624 ($first_marker_type eq "A" && !$fancy)) {
2625 return (undef, undef, undef);
2629 return ($result, $first_marker, $fancy);
2635 # Process Markdown `<pre><code>` blocks.
2642 ( # $1 = the code block -- one or more lines, starting with indent_width spaces
2644 (?
:[ ]{$opt{indent_width
}}) # Lines must start with indent_width of spaces
2648 ((?
=^[ ]{0,$opt{indent_width
}}\S
)|\Z
) # Lookahead for non-space at line-start, or end of doc
2652 $codeblock =~ s/\n\n\n/\n\n/g; # undo "paragraph for last list item" change
2653 $codeblock = _EncodeCode
(_Outdent
($codeblock));
2654 $codeblock =~ s/\A\n+//; # trim leading newlines
2655 $codeblock =~ s/\s+\z//; # trim trailing whitespace
2657 my $result = "<div class=\"$opt{style_prefix}code\"><pre style=\"display:none\"></pre><pre><code>"
2658 . $codeblock . "\n</code></pre></div>";
2659 my $key = block_id
($result, 2);
2660 $g_code_blocks{$key} = $result;
2661 "\n\n" . $key . "\n\n";
2670 # * Backtick quotes are used for <code></code> spans.
2672 # * You can use multiple backticks as the delimiters if you want to
2673 # include literal backticks in the code span. So, this input:
2675 # Just type ``foo `bar` baz`` at the prompt.
2677 # Will translate to:
2679 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
2681 # There's no arbitrary limit to the number of backticks you
2682 # can use as delimters. If you need three consecutive backticks
2683 # in your code, use four for delimiters, etc.
2685 # * You can use spaces to get literal backticks at the edges:
2687 # ... type `` `bar` `` ...
2691 # ... type <code>`bar`</code> ...
2697 (`+) # $1 = Opening run of `
2698 (.+?
) # $2 = The code block
2700 \1 # Matching closer
2704 $c =~ s/^[ ]+//g; # leading whitespace
2705 $c =~ s/[ ]+$//g; # trailing whitespace
2706 $c = _EncodeCode
($c);
2716 # Encode/escape certain characters inside Markdown code runs.
2717 # The point is that in code, these characters are literals,
2718 # and lose their special Markdown meanings.
2722 # Encode all ampersands; HTML entities are not
2723 # entities within a Markdown code span.
2726 # Encode $'s, but only if we're running under Blosxom.
2727 # (Blosxom interpolates Perl variables in article bodies.)
2728 s/\$/$/g if $_haveBX;
2730 # Do the angle bracket song and dance:
2734 # Now, escape characters that are magic in Markdown:
2735 s!([*_~{}\[\]\\])!$g_escape_table{$1}!g;
2741 sub _DoItalicsAndBoldAndStrike
{
2746 $text =~ s
{ \
* (?
=\S
) (.+?
) (?
<=\S
) \
* }
2748 # We've got to encode any of these remaining to
2749 # avoid conflicting with other italics and bold.
2750 $text =~ s!([*])!$g_escape_table{$1}!g;
2755 $text =~ s
{ (?
<!\w
) _
(?
=\S
) (.+?
) (?
<=\S
) _
(?
!\w
) }
2757 # We've got to encode any of these remaining to
2758 # avoid conflicting with other italics and bold.
2759 $text =~ s!([_])!$g_escape_table{$1}!g;
2763 # <strong> must go first:
2764 $text =~ s
{ \
*\
* (?
=\S
) (.+?
[*_
]*) (?
<=\S
) \
*\
* }
2765 {"<strong>".&$doital1($1)."</strong>"}gsex
;
2766 $text =~ s
{ (?
<!\w
) __
(?
=\S
) (.+?
[*_
]*) (?
<=\S
) __
(?
!\w
) }
2767 {"<strong>".&$doital2($1)."</strong>"}gsex
;
2769 $text =~ s
{ ~~ (?
=\S
) (.+?
[*_
]*) (?
<=\S
) ~~ }
2770 {<strike
>$1</strike
>}gsx
;
2772 $text =~ s
{ \
* (?
=\S
) (.+?
) (?
<=\S
) \
* }
2774 $text =~ s
{ (?
<!\w
) _
(?
=\S
) (.+?
) (?
<=\S
) _
(?
!\w
) }
2781 sub _DoBlockQuotes
{
2785 ( # Wrap whole match in $1
2787 ^[ ]*>[ ]?
# '>' at the start of a line
2788 .*\n # rest of the first line
2789 (.+\n)* # subsequent consecutive lines
2795 $bq =~ s/^[ ]*>[ ]?//gm; # trim one level of quoting
2796 $bq =~ s/^[ ]+$//mg; # trim whitespace-only lines
2797 $bq = _RunBlockGamut
($bq); # recurse
2800 "<blockquote>\n$bq\n</blockquote>\n\n";
2808 my ($LEAD, $TRAIL, $LEADBAR, $LEADSP, $COLPL, $SEP);
2810 $LEAD = qr/(?>[ ]*(?:\|[ ]*)?)/o;
2811 $TRAIL = qr/[ ]*(?<!\\)\|[ ]*/o;
2812 $LEADBAR = qr/(?>[ ]*\|[ ]*)/o;
2813 $LEADSP = qr/(?>[ ]*)/o;
2814 $COLPL = qr/(?:[^\n|\\]|\\(?:(?>[^\n])|(?=\n|$)))+/o;
2815 $SEP = qr/[ ]*:?-+:?[ ]*/o;
2822 ( # Wrap whole thing to avoid $&
2823 (?
: (?
<=\n\n) | \A
\n?
) # Preceded by blank line or beginning of string
2825 $LEADBAR \
| [^\n]* |
2826 $LEADBAR $COLPL [^\n]* |
2827 $LEADSP $COLPL \
| [^\n]*
2830 $LEADBAR $SEP (?
: \
| $SEP )* (?
: \
| [ ]*)?
|
2831 $SEP (?
: \
| $SEP )+ (?
: \
| [ ]*)?
|
2835 $LEADBAR \
| [^\n]* \n |
2836 $LEADBAR $COLPL [^\n]* \n |
2837 $LEADSP $COLPL \
| [^\n]* \n
2841 my ($w, $h, $s, $rows) = ($1, $2, $3, $4);
2842 my @heads = _SplitTableRow
($h);
2843 my @seps = _SplitTableRow
($s);
2844 if (@heads == @seps) {
2846 if (/^:-+:$/) {" align=\"center\""}
2847 elsif (/^:/) {" align=\"left\""}
2848 elsif (/:$/) {" align=\"right\""}
2852 $nohdr = " $opt{style_prefix}table-nohdr" if join("", @heads) eq "";
2853 my $tab ="\n<table border=\"1\" cellspacing=\"0\" cellpadding=\"2\" class=\"$opt{style_prefix}table$nohdr\">\n";
2855 " <tr class=\"$opt{style_prefix}row-hdr\">" . _MakeTableRow
("th", \
@align, @heads) . "</tr>\n"
2858 my @classes = ("class=\"$opt{style_prefix}row-even\"", "class=\"$opt{style_prefix}row-odd\"");
2859 $tab .= " <tr " . $classes[++$cnt % 2] . ">" . _MakeTableRow
("td", \
@align, @
$_) . "</tr>\n"
2860 foreach (_SplitMergeRows
($rows));
2861 $tab .= "</table>\n\n";
2871 sub _SplitMergeRows
{
2873 my ($mergeprev, $mergenext) = (0,0);
2874 foreach (split(/\n/, $_[0])) {
2875 $mergeprev = $mergenext;
2877 my @cols = _SplitTableRow
($_);
2878 if (_endswithbareslash
($cols[$#cols])) {
2879 my $last = $cols[$#cols];
2880 substr($last, -1, 1) = "";
2882 $cols[$#cols] = $last;
2886 for (my $i = 0; $i <= $#cols; ++$i) {
2887 my $cell = $rows[$#rows]->[$i];
2888 defined($cell) or $cell = "";
2889 $rows[$#rows]->[$i] = _MergeCells
($cell, $cols[$i]);
2892 push(@rows, [@cols]);
2899 sub _endswithbareslash
{
2900 return 0 unless substr($_[0], -1, 1) eq "\\";
2901 my @parts = split(/\\\\/, $_[0], -1);
2902 return substr($parts[$#parts], -1, 1) eq "\\";
2908 return $c1 if $c2 eq "";
2909 return $c2 if $c1 eq "";
2910 return $c1 . " " . $c2;
2914 sub _SplitTableRow
{
2917 $row =~ s/$TRAIL$//;
2918 $row =~ s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
2919 $row =~ s!\\\|!$g_escape_table{'|'}!go; # Then do \|
2921 s!$g_escape_table{'|'}!|!go;
2922 s!$g_escape_table{'\\'}!\\\\!go;
2926 } split(/[ ]*\|[ ]*/, $row, -1);
2927 @elems or push(@elems, "");
2936 for (my $i = 0; $i < @
$align; ++$i) {
2938 defined($data) or $data = "";
2939 $row .= "<" . $etype . $$align[$i] . ">" .
2940 _RunSpanGamut
($data) . "</" . $etype . ">";
2946 sub _FormParagraphs
{
2949 # $text - string to process with html <p> tags
2951 my ($text, $anchors) = @_;
2953 # Strip leading and trailing lines:
2957 my @grafs = split(/\n{2,}/, $text);
2963 unless (defined($g_html_blocks{$_}) || defined($g_code_blocks{$_})) {
2964 $_ = _RunSpanGamut
($_);
2965 s/^([ ]*)/$g_start_p/;
2971 # Strip standalone XML comments if requested
2973 if ($anchors && $opt{stripcomments
} && @g_xml_comments) {
2974 my %xml_comment = ();
2975 $xml_comment{$_} = 1 foreach @g_xml_comments;
2977 do { push(@grafs2, $_) unless $xml_comment{$_} } foreach @grafs;
2982 # Unhashify HTML blocks
2985 if (defined( $g_html_blocks{$_} )) {
2986 $_ = $g_html_blocks{$_};
2990 return join "\n\n", @grafs;
2994 # %ok_tag_name declared previously
2995 my $g_possible_tag_name;
2997 # note: length("blockquote") == 10
2998 $g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6]|\020)/o;
2999 %ok_tag_name = map({$_ => 1} "\20", qw(
3000 a abbr acronym address area
3001 b basefont bdo big blockquote br
3002 caption center cite code col colgroup
3003 dd del dfn div dl dt
3006 h1 h2 h3 h4 h5 h6 hr
3014 s samp small span strike strong sub sup
3015 table tbody td tfoot th thead tr tt
3019 $ok_tag_name{$_} = 0 foreach (qw(
3025 sub _SetAllowedTag
{
3026 my ($tag, $forbid) = @_;
3027 $ok_tag_name{$tag} = $forbid ?
0 : 1
3028 if defined($tag) && exists($ok_tag_name{$tag});
3032 # Encode leading '<' of any non-tags
3033 # However, "<?", "<!" and "<$" are passed through (legacy on that "<$" thing)
3036 return $tag if $tag =~ /^<[?\$!]/;
3037 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3038 $ok_tag_name{lc($1)}) {
3040 return _ProcessURLTag
("href", $tag, 1) if $tag =~ /^<a\s/i;
3041 return _ProcessURLTag
("src", $tag) if $tag =~ /^<img\s/i;
3048 # Strip out all tags that _DoTag would match
3051 my $_StripTag = sub {
3053 return $tag if $tag =~ /^<[?\$!]/;
3054 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3055 $ok_tag_name{lc($1)}) {
3057 return ""; # strip it out
3061 $text =~ s{(<[^>]*>)}{&$_StripTag($1)}ige;
3065 my %univatt; # universally allowed attribute names
3066 my %tagatt; # per-element allowed attribute names
3067 my %tagmt; # empty element tags
3068 my %tagocl; # non-empty elements with optional closing tag
3069 my %tagacl; # which %tagocl an opening %tagocl will close
3070 my %tagblk; # block elements
3071 my %taginl; # inline markup tags which trigger an auto <p> reopen
3072 my %taga1p; # open tags which require at least one attribute
3073 my %lcattval; # names of attribute values to lowercase
3074 my %impatt; # names of "implied" attributes
3076 %univatt = map({$_ => 1} qw(class dir id lang style title xml:lang));
3078 'a' => { map({$_ => 1} qw(href name)) },
3079 'area' => { map({$_ => 1} qw(alt coords href nohref shape)) },
3080 'basefont' => { map({$_ => 1} qw(color face size)) },
3081 'br' => { map({$_ => 1} qw(clear)) },
3082 'caption' => { map({$_ => 1} qw(align)) },
3083 'col' => { map({$_ => 1} qw(align span width valign)) },
3084 'colgroup' => { map({$_ => 1} qw(align span width valign)) },
3085 'dir' => { map({$_ => 1} qw(compact)) },
3086 'div' => { map({$_ => 1} qw(align)) },
3087 'dl' => { map({$_ => 1} qw(compact)) },
3088 'font' => { map({$_ => 1} qw(color face size)) },
3089 'h1' => { map({$_ => 1} qw(align)) },
3090 'h2' => { map({$_ => 1} qw(align)) },
3091 'h3' => { map({$_ => 1} qw(align)) },
3092 'h4' => { map({$_ => 1} qw(align)) },
3093 'h5' => { map({$_ => 1} qw(align)) },
3094 'h6' => { map({$_ => 1} qw(align)) },
3095 'hr' => { map({$_ => 1} qw(align noshade size width)) },
3096 # NO server-side image maps, therefore NOT ismap !
3097 'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) },
3098 'li' => { map({$_ => 1} qw(compact type value)) },
3099 'map' => { map({$_ => 1} qw(name)) },
3100 'menu' => { map({$_ => 1} qw(compact)) },
3101 'ol' => { map({$_ => 1} qw(compact start type)) },
3102 'p' => { map({$_ => 1} qw(align)) },
3103 'pre' => { map({$_ => 1} qw(width)) },
3104 'table' => { map({$_ => 1} qw(align border cellpadding cellspacing summary width)) },
3105 'tbody' => { map({$_ => 1} qw(align valign)) },
3106 'tfoot' => { map({$_ => 1} qw(align valign)) },
3107 'thead' => { map({$_ => 1} qw(align valign)) },
3108 'td' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) },
3109 'th' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) },
3110 'tr' => { map({$_ => 1} qw(align valign)) },
3111 'ul' => { map({$_ => 1} qw(compact type)) }
3113 %tagmt = map({$_ => 1} qw(area basefont br col hr img));
3114 %tagocl = map({$_ => 1} qw(colgroup dd dt li p tbody td tfoot th thead tr));
3116 'colgroup' => \
%tagocl,
3117 'dd' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3118 'dt' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3119 'li' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3120 'tbody' => \
%tagocl,
3121 'td' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3122 'tfoot' => \
%tagocl,
3123 'th' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3124 'thead' => \
%tagocl,
3125 'tr' => { map({$_ => 1} qw(colgroup dd dt li p td th tr)) },
3127 %tagblk = map({$_ => 1} qw(address blockquote center div dl h1 h2 h3 h4 h5 h6 hr ol p pre table ul));
3128 %taginl = map({$_ => 1} qw(a abbr acronym b basefont bdo big br cite code dfn em font i
3129 img kbd map q s samp small span strike strong sub sup tt u var));
3130 %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap));
3131 %lcattval = map({$_ => 1} qw(
3132 align border cellpadding cellspacing checked clear color colspan
3133 compact coords height hspace ismap nohref noshade nowrap rowspan size
3134 span shape valign vspace width
3136 %taga1p = map({$_ => 1} qw(a area bdo img map));
3142 # Inspect all '<'...'>' tags in the input and HTML encode those things
3143 # that cannot possibly be tags and at the same time sanitize them.
3145 # $1 => text to process
3148 my ($text, $validate, $htmlauto) = @_;
3150 $text ne "" or return "";
3153 my $end = length($text);
3155 my ($autoclose, $autoclopen);
3158 $autoclose = $htmlauto ?
sub {
3159 my $s = $_[0] || "";
3161 ($stack[$#stack]->[0] ne $s || $_[1] && !$stack[$#stack]->[2]) &&
3162 $tagocl{$stack[$#stack]->[0]}) {
3163 $ans .= "</" . $stack[$#stack]->[0] . ">";
3166 } : sub {} if $validate;
3167 $autoclopen = $htmlauto ?
sub {
3168 my $s = $_[0] || "";
3170 if ($tagblk{$s}) {$c = {p
=>1}}
3171 elsif ($tagocl{$s}) {$c = $tagacl{$s}}
3174 while (@stack && $c->{$stack[$#stack]->[0]}) {
3176 if ($stack[$#stack]->[2] &&
3177 $stack[$#stack]->[1]+3 eq $_[1]) {
3180 $ans .= "</" . $stack[$#stack]->[0] . ">";
3182 if ($stack[$#stack]->[2]) {
3183 $stack[$#stack]->[0] = "\20";
3185 $clp = $s ne "p" && $stack[$#stack]->[0] eq "p";
3190 } : sub {} if $validate;
3191 while (pos($text) < $end) {
3192 if ($text =~ /\G(\s+)/gc) {
3196 if ($text =~ /\G([^<]+)/gc) {
3197 if ($validate && @stack && $stack[$#stack]->[0] eq "\20") {
3198 push(@stack,["p",pos($text)-length($1)]);
3203 push(@stack,["p",pos($text)-length($1)]);
3207 $ans .= _EncodeAmps
($1);
3211 my $tstart = pos($text);
3212 if ($text =~ /\G(<!--(?:[^-]|(?:-(?!-)))*-->)/gc) {
3213 # pass "comments" through unless stripping them
3214 if ($opt{stripcomments
}) {
3215 # strip any trailing whitespace + \n after comment if present
3216 $text =~ /\G[ \t]*\n/gc;
3218 # pass the "comment" on through
3223 if ($text =~ /\G(<[^>]*>)/gc) {
3226 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} ||
3227 $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3228 $ok_tag_name{$tt=lc($1)})
3230 my ($stag, $styp, $autocloseflag) = _Sanitize
($tag);
3231 if ($styp == 2 && $lastmt eq $tt) {
3235 $lastmt = $styp == -3 ?
$tt : "";
3236 $tt = "p" if $autocloseflag;
3237 if ($validate && $styp) {
3238 my $clp = &$autoclopen($tt, $tstart) if $styp != 2;
3240 $reopenp && $taginl{$tt} and do {
3241 push(@stack,["p",$tstart]);
3244 push(@stack,[$tt,$tstart,$autocloseflag,$clp]);
3246 } elsif ($styp == 2) {
3247 $reopenp && ($tt eq "p" || $tt eq "\20") and do {
3251 &$autoclose($tt, $autocloseflag);
3252 my $mtstkchk = sub {
3253 !@stack and _xmlfail
("closing tag $tt without matching open at " .
3254 _linecol
($tstart, $text));
3257 if ($autocloseflag && $stack[$#stack]->[0] eq "\20") {
3260 } elsif ($stack[$#stack]->[0] eq $tt) {
3261 $stack[$#stack]->[3] and $reopenp = 1;
3264 pop(@stack) while @stack && $stack[$#stack]->[0] eq "\20";
3266 my @i = @
{$stack[$#stack]};
3267 _xmlfail
("opening tag $i[0] at " . _linecol
($i[1], $text) .
3268 " mismatch with closing tag $tt at " . _linecol
($tstart, $text));
3276 $ans .= _EncodeAmps
($tag);
3281 # can only get here if "\G" char is an unmatched "<"
3286 &$autoclose if $validate;
3287 if ($validate && @stack) {
3290 for ($j = 0; $j <= $#stack; ++$j) {
3291 my @i = @
{$stack[$j]};
3292 next if $i[0] eq "\20";
3293 unshift(@errs, "opening tag $i[0] without matching close at " .
3294 _linecol
($i[1], $text));
3296 _xmlfail
(@errs) unless !@errs;
3298 # Remove any unwanted extra leading <p></p> sections
3299 $ans =~ s{<p></\020>}{}gs if $validate;
3306 my ($pos, $txt) = @_;
3310 ++$l while ($p = pos($txt)), $txt =~ /\G[^\n]*\n/gc && pos($txt) <= $pos;
3311 return "line $l col " . (1 + ($pos - $p));
3316 die join("", map("$_\n", @_));
3323 if ($tag =~ m{^</}) {
3324 my $autocloseflag = undef;
3325 $autocloseflag = 1, $tag="</p>" if $tag eq "</\20>";
3326 return (lc($tag),2,$autocloseflag);
3328 if ($tag =~ /^<([^\s<\/>]+)\s
+/gs
) {
3330 my $autocloseflag = undef;
3331 $autocloseflag = 1, $tt="p" if $tt eq "\20";
3332 my $out = "<" . $tt . " ";
3333 my $ok = $tagatt{$tt};
3334 ref($ok) eq "HASH" or $ok = {};
3336 while ($tag =~ /\G\s*([^\s\042\047<\/>=]+)((?
>=)|\s
*)/gcs
) {
3337 my ($a,$s) = ($1, $2);
3338 if ($s eq "" && substr($tag, pos($tag), 1) =~ /^[\042\047]/) {
3339 # pretend the "=" sign wasn't overlooked
3342 if (substr($s,0,1) ne "=") {
3343 # it's one of "those" attributes (e.g. compact) or not
3344 # _SanitizeAtt will fix it up if it is
3345 $out .= _SanitizeAtt
($a, '""', $ok, $seenatt, $tt);
3349 if ($tag =~ /\G([\042\047])((?:(?!\1)(?!<).)*)\1\s*/gcs) {
3350 $out .= _SanitizeAtt
($a, $1.$2.$1, $ok, $seenatt, $tt);
3354 if ($tag =~ /\G([\042\047])((?:(?!\1)(?![<>])(?![\/][>]).)*)/gcs
) {
3355 # what to do what to do what to do
3356 # trim trailing \s+ and magically add the missing quote
3357 my ($q, $v) = ($1, $2);
3359 $out .= _SanitizeAtt
($a, $q.$v.$q, $ok, $seenatt, $tt);
3363 if ($tag =~ /\G([^\s<\/>]+)\s
*/gcs
) {
3366 $v =~ s/\042/"/go;
3367 $out .= _SanitizeAtt
($a, '"'.$v.'"', $ok, $seenatt, $tt);
3371 # give it an empty value
3372 $out .= _SanitizeAtt
($a, '""', $ok, $seenatt, $tt);
3375 my $sfx = substr($tag, pos($tag));
3379 $typ = ($tag =~ m
,/>$,) ?
3 : -3;
3380 $out .= $opt{empty_element_suffix
};
3381 return ("<" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3383 if ($tag =~ m
,/>$,) {
3384 return ("<" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3387 return ("<" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3390 $out .= "</$tt>" if $typ == 3;
3392 return ($out,$typ,$autocloseflag);
3393 } elsif ($tag =~ /^<([^\s<\/>]+)/s
) {
3395 return ("<" . substr($tag,1), 0) if $taga1p{$tt};
3397 return ("<" . $tt . $opt{empty_element_suffix
}, 3);
3398 } elsif ($tag =~ m
,/>$,) {
3399 return ("<" . $tt . "></" . $tt . ">", 3);
3401 return ("<" . $tt . ">", 1) unless $tt eq "\20";
3402 return ("<p>", 1, 1);
3405 return (lc($tag),0);
3410 my $att = lc($_[0]);
3411 return "" unless $att =~ /^[_a-z:][_a-z:0-9.-]*$/; # no weirdo char att names
3412 return "" unless $univatt{$att} || $_[2]->{$att};
3413 return "" if $_[3]->{$att}; # no repeats
3415 $impatt{$att} and return $att."=".'"'.$att.'"';
3416 if ($lcattval{$att}) {
3417 return $att."="._SanitizeAttValue
(lc($_[1]))." ";
3419 my $satt = _SanitizeAttValue
($_[1]);
3420 if (ref($opt{urlfunc
}) eq 'CODE' &&
3421 (($_[4] eq "a" && $att eq "href") ||
3422 ($_[4] eq "img" && $att eq "src")) ) {
3424 $lq = substr($satt, 0, 1);
3425 $rq = substr($satt, -1, 1);
3426 $v = unescapeXML
(substr($satt, 1, length($satt)-2));
3427 my ($uhost, $upath, $uq, $uf) = SplitURL
($v);
3428 $v = &{$opt{urlfunc
}}($v, \
%opt, $_[4], $uhost, $upath, $uq, $uf);
3429 $satt = $lq . escapeXML
($v) . $rq;
3431 return $att."=".$satt." ";
3436 sub _SanitizeAttValue
{
3438 if ($v =~ /^([\042\047])(.*?)\1$/) {
3439 return $1.escapeXML
($2).$1;
3441 return '"'.escapeXML
($v).'"';
3446 sub _ProcessURLTag
{
3447 my ($att, $tag, $dofrag) = @_;
3449 $att = lc($att) . "=";
3450 if ($tag =~ /^(<[^\s>]+\s+)/g) {
3452 while ($tag =~ /\G([^\s\042\047<\/>=]+=)([\042\047])((?
:(?
!\
2)(?
!<).)*)(\
2\s
*)/gcs
) {
3453 my ($p, $q, $v, $s) = ($1, $2, $3, $4);
3454 if (lc($p) eq $att && $v ne "") {
3455 if ($dofrag && $v =~ m
"^#") {
3456 $v = _FindFragmentMatch
($v);
3458 if (ref($bpcr = $opt{base_prefix
}) eq 'CODE') {
3459 $v = "\2\3" . &$bpcr($v);
3462 $v = _PrefixURL
($v);
3464 $v = _EncodeAttText
($v);
3466 $out .= $p . $q . $v . $s;
3468 $out .= substr($tag, pos($tag));
3469 substr($out,0,1) = $g_escape_table{'<'};
3470 substr($out,-1,1) = $g_escape_table{'>'};
3479 BEGIN { $oops_entities = qr/(?:lt|gt|amp|quot|apos|nbsp)/io; }
3482 # $_[0] => the value to XML escape
3483 # returns the XML escaped value
3484 # Encodes the five required entites (amp,lt,gt,quot,apos)
3485 # while preserving any pre-existing entities which means that
3486 # calling this repeatedly on already-escaped text should return
3487 # it unchanged (i.e. it's idempotent).
3492 # Treat these accidents as though they had the needed ';'
3493 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3495 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3496 # http://bumppo.net/projects/amputator/
3497 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g;
3499 # Remaining entities now
3500 $text =~ s/\042/"/g;
3501 $text =~ s/\047/'/g; # Some older browsers do not grok '
3502 $text =~ s/</</g;
3503 $text =~ s/>/>/g;
3509 # $_[0] => value to be unescaped
3510 # returns unescaped value
3511 # The five required XML entities (amp,lt,gt,quot,apos) plus nbsp
3512 # are decoded as well as decimal (#d+) and hexadecimal (#xh+).
3514 # While the escapeXML function tries to be idempotent when presented
3515 # with an already-escaped string, this function is NOT necessarily
3516 # idempotent when presented with an already decoded string unless it's
3517 # been decoded to the point there are no more recognizable entities left.
3518 # In other words given a string such as:
3522 # Each call will only decode one layer of escaping and it will take four
3523 # successive calls to finally end up with just "&".
3528 # Treat these accidents as though they had the needed ';'
3529 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3531 $text =~ s/&[qQ][uU][oO][tT];/\042/gso;
3532 $text =~ s/&[aA][pP][oO][sS];/\047/gso;
3533 $text =~ s/&[gG][tT];/>/gso;
3534 $text =~ s/&[lL][tT];/</gso;
3535 $text =~ s/&[nN][bB][sS][pP];/ /gso;
3536 $text =~ s
{&([aA
][mM
][pP
]|\#\d
+|\#x
[0-9a
-fA
-F
]+);}{
3538 lc($_) eq 'amp' ?
'&' :
3539 /^#(\d+)$/ ?
chr($1) :
3540 /^#[xX](.*)$/ ?
chr(hex($1)) :
3548 # $_[0] => the input URL to split
3549 # $_[1] => if a true value, call unescapeXML before splitting
3551 # [0] => scheme, name+password, host, port ("" if not present)
3552 # [1] => path in url (starts with "/" if absolute otherwise relative)
3553 # [2] => query string ("" if not present otherwise starts with "?")
3554 # [3] => fragment ("" if not present otherwise starts with "#")
3555 # The returned value recovers the (possibly unescapeXML'd) input
3556 # string by simply concatenating the returned array elements.
3559 my ($url, $unesc) = @_;
3560 $unesc and $url = unescapeXML
($url);
3561 my ($sh, $p, $q, $f) = ("", "", "", "");
3562 if ($url =~ m{^([A-Za-z][A-Za-z0-9.+-]*:)(//.*)$}os) {
3566 if ($url =~ m{^(//[^/?#]*)((?:[/?#].*)?)$}os) {
3570 ($p, $q, $f) = $url =~ m{^([^?#]*)((?:[?][^#]*)?)((?:[#].*)?)$}os;
3571 return ($sh, $p, $q, $f);
3578 # Treat these accidents as though they had the needed ';'
3579 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3581 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3582 # http://bumppo.net/projects/amputator/
3583 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g;
3589 sub _EncodeAmpsAndAngles
{
3590 # Smart processing for ampersands and angle brackets that need to be encoded.
3594 # Treat these accidents as though they had the needed ';'
3595 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3597 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3598 # http://bumppo.net/projects/amputator/
3599 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g;
3602 $text =~ s{<(?![\020a-z/?\$!])}{<}gi;
3603 $text =~ s{<(?=[^>]*$)}{<}g;
3605 # Encode <'s that cannot possibly be a start or end tag
3606 $text =~ s{(<[^>]*>)}{_DoTag($1)}ige;
3612 sub _EncodeBackslashEscapes
{
3614 # Parameter: String.
3615 # Returns: String after processing the following backslash escape sequences.
3619 s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
3620 s{\\([`*_~{}\[\]()>#+\-.!`])}{$g_escape_table{$1}}g
;
3629 s{<((https?|ftps?):[^'\042>\s]+)>(?!\s*</a>)}{_MakeATag($1, "<".$1.">")}gise;
3631 # Email addresses: <address@domain.foo>
3638 [-a
-z0
-9]+(\
.[-a
-z0
-9]+)*\
.[a
-z
]+
3642 _EncodeEmailAddress
(_UnescapeSpecialChars
($1), "<", ">");
3645 # (kjm) I don't do "x" patterns
3646 s
{(?
:^|(?
<=\s
))((?
:https?
|ftps?
)://(?
:[-a
-zA
-Z0
-9./?\
&\
%=_
~!*;:\@
+\
$,\x23](?
:(?
<![.,:;])|(?
=[^\s
])))+)}
3647 {_MakeATag
($1, $1)}soge
;
3648 s
{(?
<![][])(?
<!\
] )\
[RFC
( ?
)([0-9]{1,5})\
](?
![][])(?
! \
[)}
3649 {"["._MakeATag
("https://tools.ietf.org/html/rfc$2", "RFC$1$2", "RFC $2")."]"}soge
;
3655 sub _EncodeEmailAddress
{
3657 # Input: an email address, e.g. "foo@example.com"
3659 # Output: the email address as a mailto link, with each character
3660 # of the address encoded as either a decimal or hex entity, in
3661 # the hopes of foiling most address harvesting spam bots. E.g.:
3663 # <a href="mailto:foo@e
3664 # xample.com">foo
3665 # @example.com</a>
3667 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
3668 # mailing list: <https://tinyurl.com/yu7ue>
3671 my ($addr, $prefix, $suffix) = @_;
3672 $prefix = "" unless defined($prefix);
3673 $suffix = "" unless defined($suffix);
3675 srand(unpack('N',md5
($addr)));
3677 sub { '&#' . ord(shift) . ';' },
3678 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
3682 $addr = "mailto:" . $addr;
3686 if ( $char eq '@' ) {
3687 # this *must* be encoded. I insist.
3688 $char = $encode[int rand 1]->($char);
3689 } elsif ( $char ne ':' ) {
3690 # leave ':' alone (to spot mailto: later)
3692 # roughly 10% raw, 45% hex, 45% dec
3694 $r > .9 ?
$encode[2]->($char) :
3695 $r < .45 ?
$encode[1]->($char) :
3702 # strip the mailto: from the visible part
3703 (my $bareaddr = $addr) =~ s/^.+?://;
3704 $addr = _MakeATag
("$addr", $prefix.$bareaddr.$suffix);
3710 sub _UnescapeSpecialChars
{
3712 # Swap back in all the special characters we've hidden.
3716 while( my($char, $hash) = each(%g_escape_table) ) {
3717 $text =~ s/$hash/$char/g;
3725 # Parameter: String containing HTML markup.
3726 # Returns: Reference to an array of the tokens comprising the input
3727 # string. Each token is either a tag (possibly with nested,
3728 # tags contained therein, such as <a href="<MTFoo>">, or a
3729 # run of text between tags. Each element of the array is a
3730 # two-element array; the first is either 'tag' or 'text';
3731 # the second is the actual value.
3734 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
3735 # <https://web.archive.org/web/20041215155739/http://bradchoate.com/weblog/2002/07/27/mtregex>
3740 my $len = length $str;
3744 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x
$depth) . (')*>)' x
$depth);
3745 my $match = qr
/(?s
: <! ( -- .*?
-- \s
* )+ > ) | # comment
3746 (?s
: <\? .*?
\?> ) | # processing instruction
3747 $nested_tags/iox
; # nested tags
3749 while ($str =~ m/($match)/g) {
3751 my $sec_start = pos $str;
3752 my $tag_start = $sec_start - length $whole_tag;
3753 if ($pos < $tag_start) {
3754 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
3756 push @tokens, ['tag', $whole_tag];
3759 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
3766 # Remove one level of line-leading indent_width of spaces
3770 $text =~ s/^ {1,$opt{indent_width}}//gm;
3778 # $2 => optional tab width (default is $opt{tab_width})
3779 # $3 => leading spaces to strip off each line first (default is 0 aka none)
3780 # <= result with tabs expanded
3783 my $ts = shift || $opt{tab_width
};
3784 my $leadsp = shift || 0;
3785 my $spr = qr/^ {1,$leadsp}/ if $leadsp;
3787 my $end = length($text);
3789 while (pos($text) < $end) {
3791 if ($text =~ /\G(.*?\n)/gcs) {
3794 $line = substr($text, pos($text));
3797 $line =~ s/$spr// if $leadsp;
3798 # From the Perl camel book section "Fluent Perl" but modified a bit
3799 $line =~ s/(.*?)(\t+)/$1 . ' ' x (length($2) * $ts - length($1) % $ts)/ges;
3808 # Add URL prefix if needed
3813 $url = "#" unless $url ne "";
3816 ref($opt{abs_prefix
}) eq 'CODE' ||
3817 ref($opt{url_prefix
}) eq 'CODE' ||
3818 ref($opt{img_prefix
}) eq 'CODE' ;
3819 return $url if $url =~ m
"^\002\003" || $url =~ m
"^#" || $url =~ m
,^//,;
3820 $url = &{$opt{abs_prefix
}}($url) if $url =~ m
,^/, && ref($opt{abs_prefix
}) eq 'CODE';
3821 return $url if $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/ || $url =~ m
,^//, ||
3822 ($opt{keepabs
} && $url =~ m
,^/,);
3823 my $cr = $opt{url_prefix
};
3824 $cr = $opt{img_prefix
}
3825 if ref($opt{img_prefix
}) eq 'CODE' && $url =~ m
"^[^#?]*\.(?:png|gif|jpe?g|svgz?)(?:[#?]|$)"i
;
3826 return $url unless ref($cr) eq 'CODE';
3827 return "\2\3".&$cr(substr($url, 0, 1) eq '/' ?
substr($url, 1) : $url);
3832 $g_style_sheet = <<'STYLESHEET';
3834 <style type="text/css">
3837 /* Markdown.pl fancy style sheet
3838 ** Copyright (C) 2017,2018,2019,2020,2021 Kyle J. McKay.
3839 ** All rights reserved.
3841 ** Redistribution and use in source and binary forms, with or without
3842 ** modification, are permitted provided that the following conditions are met:
3844 ** 1. Redistributions of source code must retain the above copyright notice,
3845 ** this list of conditions and the following disclaimer.
3847 ** 2. Redistributions in binary form must reproduce the above copyright
3848 ** notice, this list of conditions and the following disclaimer in the
3849 ** documentation and/or other materials provided with the distribution.
3851 ** 3. Neither the name of the copyright holder nor the names of its
3852 ** contributors may be used to endorse or promote products derived from
3853 ** this software without specific prior written permission.
3855 ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
3856 ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3857 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
3858 ** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
3859 ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
3860 ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
3861 ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
3862 ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
3863 ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
3864 ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
3865 ** POSSIBILITY OF SUCH DAMAGE.
3868 div.%(base)code-bt > pre, div.%(base)code > pre {
3874 div.%(base)code-bt > pre > code, div.%(base)code > pre > code {
3875 display: inline-block;
3878 border-top: thin dotted;
3879 border-bottom: thin dotted;
3882 table.%(base)table {
3883 margin-bottom: 0.5em;
3885 table.%(base)table, table.%(base)table th, table.%(base)table td {
3886 border-collapse: collapse;
3890 table.%(base)yaml-table {
3891 border-collapse: collapse;
3893 table.%(base)yaml-table * {
3896 table.%(base)yaml-table th {
3899 table.%(base)yaml-table th, table.%(base)yaml-table td {
3900 padding-left: 0.5ex;
3901 padding-right: 0.5ex;
3905 counter-reset: %(base)item;
3907 ol.%(base)ol[start="0"] {
3908 counter-reset: %(base)item -1;
3910 ol.%(base)ol[start="5"] {
3911 counter-reset: %(base)item 4;
3913 ol.%(base)ol[start="10"] {
3914 counter-reset: %(base)item 9;
3916 ol.%(base)ol > span.%(base)ol-incr {
3917 counter-increment: %(base)item;
3919 ol.%(base)ol > span.%(base)ol-incr-2 {
3920 counter-increment: %(base)item 2;
3922 ol.%(base)ol > span.%(base)ol-incr-5 {
3923 counter-increment: %(base)item 5;
3925 ol.%(base)ol > span.%(base)ol-incr-10 {
3926 counter-increment: %(base)item 10;
3928 ol.%(base)lc-greek, li.%(base)lc-greek {
3929 list-style-type: lower-greek;
3932 counter-increment: %(base)item;
3934 ol.%(base)ol > li.%(base)li,
3935 ol.%(base)ol > li.%(base)li-lc,
3936 ol.%(base)ol > li.%(base)li-lc-greek,
3937 ol.%(base)ol > li.%(base)li-uc {
3938 list-style-type: none;
3941 ol.%(base)ol > li.%(base)li:before,
3942 ol.%(base)ol > li.%(base)li-lc:before,
3943 ol.%(base)ol > li.%(base)li-lc-greek:before,
3944 ol.%(base)ol > li.%(base)li-uc:before {
3947 white-space: nowrap;
3951 ol.%(base)ol > li.%(base)li[type="1"]:before {
3952 content: counter(%(base)item, decimal) ")\A0 \A0 ";
3954 ol.%(base)ol > li.%(base)li-lc[type="i"]:before,
3955 ol.%(base)ol > li.%(base)li-lc[type="I"]:before {
3956 content: counter(%(base)item, lower-roman) ")\A0 \A0 ";
3958 ol.%(base)ol > li.%(base)li-uc[type="I"]:before,
3959 ol.%(base)ol > li.%(base)li-uc[type="i"]:before {
3960 content: counter(%(base)item, upper-roman) ")\A0 \A0 ";
3962 ol.%(base)ol > li.%(base)li-lc[type="a"]:before,
3963 ol.%(base)ol > li.%(base)li-lc[type="A"]:before {
3964 content: counter(%(base)item, lower-alpha) ")\A0 \A0 ";
3966 ol.%(base)ol > li.%(base)li-lc-greek[type="a"]:before,
3967 ol.%(base)ol > li.%(base)li-lc-greek[type="A"]:before {
3968 content: counter(%(base)item, lower-greek) ")\A0 \A0 ";
3970 ol.%(base)ol > li.%(base)li-uc[type="A"]:before,
3971 ol.%(base)ol > li.%(base)li-uc[type="a"]:before {
3972 content: counter(%(base)item, upper-alpha) ")\A0 \A0 ";
3975 li.%(base)checkbox-on,
3976 li.%(base)checkbox-off {
3977 list-style-type: none;
3980 li.%(base)checkbox-on > span:first-child + span + span,
3981 li.%(base)checkbox-off > span:first-child + span + span {
3983 clip: rect(0,0,0,0);
3985 li.%(base)checkbox-on > span:first-child,
3986 li.%(base)checkbox-off > span:first-child,
3987 li.%(base)checkbox-on > span:first-child + span,
3988 li.%(base)checkbox-off > span:first-child + span {
3995 li.%(base)checkbox-on > span:first-child > span:first-child,
3996 li.%(base)checkbox-off > span:first-child > span:first-child {
3999 left: 0.75pt; top: 0.75pt; right: 0.75pt; bottom: 0.75pt;
4001 li.%(base)checkbox-on > span:first-child > span:first-child:before,
4002 li.%(base)checkbox-off > span:first-child > span:first-child:before {
4003 display: inline-block;
4011 li.%(base)checkbox-on > span:first-child + span:before {
4017 vertical-align: text-top;
4025 $g_style_sheet =~ s/^\s+//g;
4026 $g_style_sheet =~ s/\s+$//g;
4027 $g_style_sheet .= "\n";
4036 Markdown.pl - convert Markdown format text files to HTML
4040 B<Markdown.pl> [B<--help>] [B<--html4tags>] [B<--htmlroot>=I<prefix>]
4041 [B<--imageroot>=I<prefix>] [B<--version>] [B<--shortversion>]
4042 [B<--tabwidth>=I<num>] [B<--stylesheet>] [B<--stub>] [--]
4046 -h show short usage help
4047 --help show long detailed help
4048 --html4tags use <br> instead of <br />
4049 --deprecated allow <dir> and <menu> tags
4050 --sanitize sanitize tag attributes
4051 --no-sanitize do not sanitize tag attributes
4052 --validate-xml check if output is valid XML
4053 --validate-xml-internal fast basic check if output is valid XML
4054 --no-validate-xml do not check output for valid XML
4055 --strip-comments remove XML comments from output
4056 --no-strip-comments do not remove XML comments (default)
4057 --tabwidth=num expand tabs to num instead of 8
4058 --auto-number automatically number h1-h6 headers
4059 -k | --keep-abs keep abspath URLs despite -r/-i
4060 -a prefix | --absroot=prefix append abspath URLs to prefix
4061 -b prefix | --base=prefix prepend prefix to fragment-only URLs
4062 -r prefix | --htmlroot=prefix append relative non-img URLs to prefix
4063 -i prefix | --imageroot=prefix append relative img URLs to prefix
4064 -w [wikipat] | --wiki[=wikipat] activate wiki links using wikipat
4065 --yaml[=(enable|disable|strip|...)] select YAML front matter processing
4066 -V | --version show version, authors, license
4068 -s | --shortversion show just the version number
4069 --raw | --raw-xml input contains only raw xhtml
4070 --raw-html input contains only raw html
4071 --stylesheet output the fancy style sheet
4072 --no-stylesheet do not output fancy style sheet
4073 --stub wrap output in stub document
4074 implies --stylesheet
4075 -- end options and treat next
4080 Markdown is a text-to-HTML filter; it translates an easy-to-read /
4081 easy-to-write structured text format into HTML. Markdown's text format
4082 is most similar to that of plain text email, and supports features such
4083 as headers, *emphasis*, code blocks, blockquotes, and links.
4085 Markdown's syntax is designed not as a generic markup language, but
4086 specifically to serve as a front-end to (X)HTML. You can use span-level
4087 HTML tags anywhere in a Markdown document, and you can use block level
4088 HTML tags (like <div> and <table> as well).
4090 For more information about Markdown's syntax, see the F<basics.md>
4091 and F<syntax.md> files included with F<Markdown.pl>.
4093 Input (auto-detected) may be either ISO-8859-1 or UTF-8. Output is always
4094 converted to the UTF-8 character set.
4099 Use "--" to end switch parsing. For example, to open a file named "-z", use:
4106 =item B<--html4tags>
4108 Use HTML 4 style for empty element tags, e.g.:
4112 instead of Markdown's default XHTML style tags, e.g.:
4116 This option is I<NOT compatible> with the B<--validate-xml> option
4117 and will produce an immediate error if both are given.
4120 =item B<--deprecated>
4122 Both "<dir>" and "<menu>" are normally taken as literal text and the leading
4123 "<" will be automatically escaped.
4125 If this option is used, they are recognized as valid tags and passed through
4126 without being escaped.
4128 When dealing with program argument descriptions "<dir>" can be particularly
4129 problematic therefore use of this option is not recommended.
4131 Other deprecated tags (such as "<font>" and "<center>" for example) continue
4132 to be recognized and passed through even without using this option.
4137 Removes troublesome tag attributes from embedded tags. Only a very strictly
4138 limited set of tag attributes will be permitted, other attributes will be
4139 silently discarded. The set of allowed attributes varies by tag.
4141 Splits empty minimized elements that are not one of the HTML allowed empty
4142 elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate begin
4143 and end tags. For example, C<< <p/> >> or C<< <p /> >> will be split into
4146 Combines adjacent (whitespace separated only) opening and closing tags for
4147 the same HTML empty element into a single minimized tag. For example,
4148 C<< <br></br> >> will become C<< <br /> >>.
4150 Tags that require at least one attribute to be present to be meaningful
4151 (e.g. C<a>, C<area>, C<img>, C<map>) but have none will be treated as non-tags
4152 potentially creating unexpected errors. For example, the sequence
4153 C<< <a>text here</a> >> will be sanitized to C<< <a>text here</a> >> since
4154 an C<a> tag without any attributes is meaningless, but then the trailing
4155 close tag C<< </a> >> will become an error because it has no matching open
4158 The point of this check is not to cause undue frustration, but to allow
4159 such constructs to be used as text without the need for escaping since they
4160 are meaningless as tags. For example, C<< <a><c><e> >> works just fine
4161 as plain text and so does C<< <A><C><E> >> because the
4162 C<< <a> >>/C<< <A> >> will be treated as a non-tag automatically. In fact,
4163 they can even appear inside links too such as
4164 C<< <a href="#somewhere">Link to <a><c><e> article</a> >>.
4166 Problematic C<&> characters are fixed up such as standalone C<&>s (or those not
4167 part of a valid entity reference) are turned into C<&>. Within attribute
4168 values, single and double quotes are turned into C<&> entity refs.
4170 This is enabled by default.
4173 =item B<--no-sanitize>
4175 Do not sanitize tag attributes. This option does not allow any tags that would
4176 not be allowed without this option, but it does completely suppress the
4177 attribute sanitation process. If this option is specified, no attributes will
4178 be removed from any tag (although C<img> and C<a> tags will still be affected
4179 by B<--imageroot>, B<--htmlroot>, B<--absroot> and/or B<--base> options).
4180 Use of this option is I<NOT RECOMMENDED>.
4183 =item B<--validate-xml>
4185 Perform XML validation on the output before it's output and die if
4186 it fails validation. This requires the C<XML::Simple> or C<XML::Parser>
4187 module be present (one is only required if this option is given).
4189 Any errors are reported to STDERR and the exit status will be
4190 non-zero on XML validation failure. Note that all line and column
4191 numbers in the error output refer to the entire output that would
4192 have been produced. Re-run with B<--no-validate-xml> to see what's
4193 actually present at those line and column positions.
4195 If the B<--stub> option has also been given, then the entire output is
4196 validated as-is. Without the B<--stub> option, the output will be wrapped
4197 in C<< <div>...</div> >> for validation purposes but that extra "div" added
4198 for validation will not be added to the final output.
4200 This option is I<NOT enabled by default>.
4202 This option is I<NOT compatible> with the B<--html4tags> option and will
4203 produce an immediate error if both are given.
4206 =item B<--validate-xml-internal>
4208 Perform XML validation on the output before it's output and die if
4209 it fails validation. This uses a simple internal consistency checker
4210 that finds unmatched and mismatched open/close tags.
4212 Non-empty elements that in HTML have optional closing tags (C<colgroup>
4213 C<dd> C<dt> C<li> C<p> C<tbody> C<td> C<tfoot> C<th> C<thead> C<tr>)
4214 will automatically have any omitted end tags inserted during the
4215 `--validate-xml-internal` process.
4217 Any errors are reported to STDERR and the exit status will be
4218 non-zero on XML validation failure. Note that all line and column
4219 numbers in the error output refer to the entire output that would
4220 have been produced before sanitization without any B<--stub> or
4221 B<--stylesheet> options. Re-run with B<--no-sanitize> and
4222 B<--no-validate-xml> and I<without> any B<--stub> or B<--stylesheet>
4223 options to see what's actually present at those line and column
4226 This option validates the output I<prior to> adding any requested
4227 B<--stub> or B<--stylesheet>. As the built-in stub and stylesheet
4228 have already been validated that speeds things up. The output is
4229 I<NOT> wrapped (in a C<< <div>...</div> >>) for validation as that's
4230 not required for the internal checker.
4232 This option is I<IS enabled by default> unless B<--no-sanitize> is
4235 This option is I<IS compatible> with the B<--html4tags> option.
4237 This option requires the B<--sanitize> option and will produce an
4238 immediate error if both B<--no-sanitize> and B<--validate-xml-internal>
4241 Note that B<--validate-xml-internal> is I<MUCH faster> than
4242 B<--validate-xml> and I<does NOT> require any extra XML modules to
4246 =item B<--no-validate-xml>
4248 Do not perform XML validation on the output. Markdown.pl itself will
4249 normally generate valid XML sequences (unless B<--html4tags> has been
4250 used). However, any raw tags in the input (that are on the "approved"
4251 list), could potentially result in invalid XML output (i.e. mismatched
4252 start and end tags, missing start or end tag etc.).
4254 Markdown.pl will I<NOT check> for these issues itself. But with
4255 the B<--validate-xml> option will use C<XML::Simple> or C<XML::Parser>
4258 Note that B<--validate-xml-internal> is the default option unless
4259 B<--no-sanitize> is used in which case B<--no-validate-xml> is the
4263 =item B<--strip-comments>
4265 Strip XML comments from the output. Any XML comments encountered will
4266 be omitted from the output if this option is given.
4268 This option requires the B<--sanitize> option to be used (which is
4271 However, note that the XML standard section 2.5 specifically prohibits
4272 a C<--> sequence within an XML comment (i.e. C<--> cannot occur after
4273 the comment start tag C<< <!-- >> unless it is immediately followed
4274 by C<< > >> which makes it the comment end tag C<< --> >>).
4276 In other words, S<C<< <!-- --> >>>, S<C<< <!-- - --> >>>, S<C<< <!----> >>>,
4277 and S<C<< <!--- --> >>> are all valid XML comments, but S<C<< <!-----> >>>
4278 and S<C<< <!-- ---> >>> are not!
4280 As part of the "sanitation" process (triggered by the B<--sanitize>
4281 option), any invalid tags have their leading C<< < >> escaped (to
4282 C<< &#lt; >>) thus making them ordinary text and this I<includes>
4283 invalid XML comments.
4285 What this means is that the B<--strip-comments> option I<will not> remove
4286 invalid XML comments (such as S<C<< <!-----> >>>)!
4289 =item B<--no-strip-comments>
4291 Do not strip XML comments from the output. This is the default.
4294 =item B<--tabwidth>=I<num>
4296 Expand tabs to I<num> character wide tab stop positions instead of the default
4297 8. Don't use this; physical tabs should always be expanded to 8-character
4298 positions. This option does I<not> affect the number of spaces needed to
4299 start a new "indent level". That will always be 4 no matter what value is
4300 used (or implied by default) with this option. Also note that tabs inside
4301 backticks-delimited code blocks will always be expanded to 8-character tab
4302 stop positions no matter what value is used for this option.
4304 The value must be S<2 <= I<num> <= 32>.
4307 =item B<--auto-number>
4309 Automatically number all h1-h6 headings generated from Markdown markup.
4310 Explicit C<< <h1> >> ... C<< <h6> >> tag content remains unmolested.
4312 If this option is given, any YAML C<header_enum> setting will be ignored.
4315 =item B<-k>, B<--keep-abs>
4317 Normally any absolute path URLs (i.e. URLs without a scheme starting
4318 with "/" but not "//") are subject to modification by any
4319 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> option.
4321 If the B<-a>/B<--absroot> option is used and it transforms these
4322 absolute path URLs into a full absolute URL (i.e. starts with a
4323 scheme or "//") then any subsequent B<-r>/B<--htmlroot> or
4324 B<-i>/B<--imageroot> processing will be skipped because the URL is
4327 If the B<--keep-abs> option is given, then (after applying any
4328 B<-a>/B<--absroot> option if present) absolute path URLs will be
4329 kept as-is and will not be processed further by any B<-r>/B<--htmlroot>
4330 or B<-i>/B<--imageroot> option.
4332 Note that if the B<-a>/B<--absroot> option transforms an absolute
4333 path URL into a relative PATH URL it I<will> be subject to subsequent
4334 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> processing regardless
4335 of the B<-k>/B<--keep-abs> option.
4338 =item B<-a> I<prefix>, B<--absroot>=I<prefix>
4340 Any absolute path URLs (i.e. URLs without a scheme starting with "/" but not
4341 "//") have I<prefix> prepended which prevents them from being acted upon by the
4342 B<--htmlroot> and/or B<--imageroot> options provided the result is a full
4343 absolute URL. The default is to prepend nothing and leave them as absolute
4344 path URLs which will allow them to be processed by any B<--htmlroot> and/or
4345 B<--imageroot> options.
4347 This option can be helpful when documents are being formatted for display on a
4348 different system and the absolute path URLs need to be "fixed up".
4351 =item B<-b> I<prefix>, B<--base>=I<prefix>
4353 Any fragment-only URLs have I<prefix> prepended. The default is to prepend
4354 nothing and leave them as bare fragment URLs. Use of this option may be
4355 necessary when embedding the output of Markdown.pl into a document that makes
4356 use of the C<< <base> >> tag in order for intra-document fragment URL links to
4357 work properly in such a document.
4360 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
4362 Any non-absolute URLs have I<prefix> prepended.
4365 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
4367 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
4368 if any) but only if they end in an image suffix.
4371 =item B<-w> [I<wikipat>], B<--wiki>[=I<wikipat>]
4373 Activate wiki links. Any link enclosed in double brackets (e.g. "[[link]]") is
4374 considered a wiki link. By default only absolute URL and fragment links are
4375 allowed in the "wiki link style" format. Any other double-bracketed strings
4376 are left unmolested.
4378 If this option is given, all other wiki links are enabled as well. Any
4379 non-absolute URL or fragment links will be transformed into a link using
4380 I<wikipat> where the default I<wikipat> if none is given is C<%{s(:md)}.html>.
4382 If the given I<wikipat> does not contain a C<%{...}> placeholder sequence
4383 then it will automatically have C<%{s(:md)}.html> suffixed to it.
4385 The C<...> part of the C<%{...}> sequence specifies zero or more
4386 case-insensitive single-letter options with the following effects:
4392 Retain blanks (aka spaces) in the output. They will become C<%20>
4393 in the final URL. Because spaces are always trimmed before processing
4394 wiki links, runs of multiple spaces will be collapsed into a single
4395 space and any leading or trailing spaces will be removed.
4399 Convert spaces to dashes (ASCII 0x2D) instead of underscore (ASCII
4400 0x5F). Note that if this option is given then runs of multiple
4401 dashes will be converted to a single dash I<instead> but runs of
4402 multiple underscores will be left untouched.
4406 Flatten the resulting name by replacing forward slashes (ASCII 0x2F)
4407 as well. They will be converted to underscores unless the C<d>
4408 option is given (in which case they will be converted to dashes).
4409 This conversion takes place before applying the runs-of-multiple
4410 reduction. This option is incompatible with the B<%> option.
4414 Flatten the resulting name by replacing runs of one or more forward
4415 slashes (ASCII 0x2F) with C<%2F>. Note that when encoded into a
4416 URL the C<%2F> actually becomes C<%252F>. This option is incompatible
4417 with the B<f> option.
4421 Convert link target (excluding any query string and/or fragment) to lowercase.
4422 Takes precedence over any C<u> option, but specifically excludes C<%>-escapes
4423 which are always UPPERCASE hexadecimal.
4427 Leave raw UTF-8 characters in the result. Normally anything not allowed
4428 directly in a URL ends up URL-encoded. With this option, raw valid UTF-8
4429 sequences will be left untouched. Use with care.
4431 =item B<s> or B<s(>I<< <ext> >>[B<,>I<< <ext> >>]...B<)>
4433 After (temporarily) removing any query string and/or fragment, strip any final
4434 "dot" suffix so long as it occurs after the last slash (if any slash was
4435 present before applying the C<f> option). The "dot" (ASCII 0x2E) and all
4436 following characters (if any) are removed. If the optional C<< (<ext>,...) >>
4437 part is present then only strip the extension if it consists of a "dot"
4438 followed by one of the case-insensitive I<< <ext> >> values. As a special
4439 case, using the value C<:md> for one of the I<< <ext> >> values causes that
4440 value to be expanded to all known markdown extensions.
4442 When processing wiki image links, this option is ignored.
4446 Convert link target (excluding any query string and/or fragment) to UPPERCASE.
4450 Leave runs-of-multiple characters alone (aka "verbatim"). Does not affect
4451 any of the other options except by eliminating the runs-of-multple reduction
4452 step. Also does I<not> inhibit the initial whitespace trimming.
4454 Does not affect the runs-of-multiple "/" replacement performed by the B<%>
4459 The URL target of the wiki link is created by first trimming whitespace
4460 (starting and ending whitespace is removed and all other runs of consecutive
4461 whitespace are replaced with a single space) from the wiki link target,
4462 removing (temporarily) any query string and/or fragment, if no options are
4463 present, spaces are converted to underscores (C<_>) and runs of multiple
4464 consecutive underscores are replaced with a single underscore (ASCII 0x5F).
4465 Finally, the I<wikipat> string gets its first placeholder (the C<%{...}>
4466 sequence) replaced with this computed value and the original query string
4467 and/or fragment is re-appended (if any were originally present) and
4468 URL-encoding is applied as needed to produce the actual final target URL.
4470 Note that when processing wiki image links, no extension stripping ever takes
4471 place (i.e. the "s" option is ignored) and anything after the placeholder (the
4472 C<%{...}> sequence) in the pattern is omitted from the result.
4474 See above option descriptions for possible available modifications.
4476 One of the commonly used hosting platforms does something substantially similar
4477 to using C<%{dfv}> as the placeholder.
4479 One of the commonly used wiki platforms does something similar to using C<%{%}>
4483 =item B<--yaml>[=I<yamlmode>]
4485 Select YAML front matter processing. The optional I<yamlmode> value
4486 must be one of the following:
4492 Recognize any YAML front matter and apply any options specified
4493 therein. If any unrecognized options are present, the options will
4494 also be shown in the formatted output.
4496 This is the default I<yamlmode> if omitted.
4500 No YAML front matter processing at all takes place. If YAML front
4501 matter is present, it will be treated as regular non-YAML markup
4502 text to be processed.
4506 If YAML front matter is present, it will be stripped and completely
4507 ignored before beginning to process the rest of the input.
4509 In this mode, any options in the YAML front matter that would have
4510 otherwise been recognized will I<not have any effect!>
4514 If YAML front matter is present and contains anything other than
4515 comments, the non-comments parts will be shown in the formatted
4518 In this mode, any options in the YAML front matter that would have
4519 otherwise been recognized will I<not have any effect!>
4521 This is a show-only mode.
4525 This mode works just like the B<enable> mode except that if the
4526 YAML front matter contains anything other than comments, then
4527 I<all> of the non-comments parts will be shown in the formatted
4530 In this mode, any recognized options in the YAML front matter I<are>
4531 processed the same way they would be in the B<enable> mode except
4532 that any option to suppress the B<reveal> mode is ignored.
4536 This mode works just like the B<enable> mode except that no options
4537 are ever shown in the formatted output regardless of whether or not
4538 there are any unrecognized options present.
4540 In this mode, any recognized options in the YAML front matter I<are>
4541 processed the same way they would be in the B<enable> mode except
4542 that any option to suppress the B<conceal> mode is ignored.
4546 This mode works just like the B<show> mode if any unrecognized
4547 YAML front matter options are present. Otherwise it works like
4550 In this mode, any options in the YAML front matter that would have
4551 otherwise been recognized will I<not have any effect!>
4555 If B<--raw>, B<--raw-xml> or B<--raw-html> has been specified then
4556 the default if no B<--yaml> option has been given is B<--yaml=disable>.
4558 Otherwise the default if no B<--yaml> option has been given is
4561 Note that only a limited subset of YAML is recognized. Specifically
4562 only comments, and top-level single-line S<C<key: value>> items
4563 where key must be plain (i.e. non-quoted), start with a letter or
4564 underscore and contain only letters, underscores, hyphens (C<->),
4565 periods (C<.>) and digits. Keys are case-insensitive (i.e. converted
4566 to lowercase). As with YAML, at least one whitespace is required
4567 between the ":" and the value (unless it's the empty value).
4569 Values may be either plain or double-quoted (single-quoted is not
4570 recognized). The double-quoted style may use C-style character
4571 escape codes but may not extend past the end of the line.
4573 For YAML front matter to be recognized, the very first line of the
4574 document must be exactly three hyphens (C<--->). The YAML terminates
4575 when a line of three hyphens (C<--->) or a line of three periods
4576 (C<...>) or the end of the file is encountered. Of course the YAML
4577 mode must also be something I<other> than B<--yaml=disable>.
4580 =item B<-V>, B<--version>
4582 Display Markdown's version number and copyright information.
4585 =item B<-s>, B<--shortversion>
4587 Display the short-form version number.
4590 =item B<--raw>, B<--raw-xml>
4592 Input contains only raw XHTML. All options other than
4593 B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
4594 B<--strip-comments>, B<--validate-xml> and B<--validate-xml-internal>
4595 (and their B<--no-...> variants) are ignored.
4597 With this option, arbitrary XHTML input can be passed through
4598 the sanitizer and/or validator. If sanitation is requested (the
4599 default), input must only contain the contents of the "<body>"
4600 section (i.e. no "<head>" or "<html>"). Output I<will> be converted
4601 to UTF-8 regardless of the input encoding. All line endings will
4602 be normalized to C<\n> and input encodings other than UTF-8 or
4603 ISO-8859-1 or US-ASCII will end up mangled.
4605 Remember that any B<--stub> and/or B<--stylesheet> options are
4606 I<completely ignored> when B<--raw> is given.
4611 Input contains only raw HTML. All options other than
4612 B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
4613 B<--strip-comments>, and B<--validate-xml-internal>
4614 (and their B<--no-...> variants) are ignored.
4616 Requires the (possibly implicit) B<--validate-xml-internal> option.
4618 Works just like B<--raw-xml> except that HTML auto closing and
4619 optional closing tag semantics are activated during the validation
4620 causing missing closing tags to be inserted where required by the
4621 standard. Non-raw mode always enables these semantics.
4623 This will transform HTML into valid XHTML or fail with an error message.
4625 Unfortunately, it will also fail to accept some documents that
4626 the plain B<--raw-xml> option will.
4628 For example, this document:
4632 Will be rejected because upon encountering the C<< <li> >> open
4633 tag a closing C<< </dt> >> will automatically be inserted resulting
4636 <dt></dt><li>a</li></dt>
4638 Which, of course, no longer validates. Since C<li> blocks cannot
4639 directly be nested within C<dt> blocks (according to the standard),
4640 the input document is not strictly correct.
4642 Remember that any B<--stub> and/or B<--stylesheet> options are
4643 I<completely ignored> when B<--raw-html> is given.
4646 =item B<--stylesheet>
4648 Include the fancy style sheet at the beginning of the output (or in the
4649 C<head> section with B<--stub>). This style sheet makes fancy checkboxes
4650 and makes a right parenthesis C<)> show instead of a C<.> for ordered lists
4651 that use them. Without it things will still look fine except that the
4652 fancy stuff won't be there.
4654 Use this option with no other arguments and redirect standard input to
4655 /dev/null to get just the style sheet and nothing else.
4658 =item B<--no-stylesheet>
4660 Overrides a previous B<--stylesheet> and disables implicit inclusion
4661 of the style sheet by the B<--stub> option.
4666 Wrap the output in a full document stub (i.e. has C<html>, C<head> and C<body>
4667 tags). The style sheet I<will> be included in the C<head> section unless the
4668 B<--no-stylesheet> option is also used.
4670 The C<< <title> >> value for a document produced with the B<--stub> option
4671 comes from the first markdown markup C<h1> that's generated unless YAML
4672 processing has been enabled (the default) and a C<title> YAML value has
4673 been set in which case that always takes precedence.
4676 =item B<-h>, B<--help>
4678 Display Markdown's help. With B<--help> full help is shown, with B<-h> only
4679 the usage and options are shown.
4687 Markdown can be used as a Perl module and can be "use"d like so:
4689 use Markdown qw(...);
4693 BEGIN {require "Markdown.pl" && Markdown->import(qw(...))}
4695 where the C<...> part is the list of desired imports.
4697 The Markdown module does not export any functions by default.
4699 The C<Markdown.pm> file is a symbolic link to C<Markdown.pl>.
4701 =head2 Markdown module functions
4703 Any of these functions may be imported, but none of them
4704 are imported by default.
4709 =item * $result = Markdown::Markdown($string[, options...])
4711 Converts Markdown-format C<$string> to UTF-8 encoded XHTML and
4714 The C<options...> may be either a single HASH ref or one or more
4715 pairs of C<< key => value >>.
4717 See the comments for the C<_SanitizeOpts> function for a list of
4718 possible option keys.
4721 =item * $result = Markdown::ProcessRaw($string[, options...])
4723 Converts raw XHTML in C<$string> to XHTML and returns it.
4725 The C<options...> may be either a single HASH ref or one or more
4726 pairs of C<< key => value >>.
4728 See the comments for the C<_SanitizeOpts> function for a list of
4729 possible option keys.
4731 This function provides the ability to apply the internal XML
4732 validation and sanitation functionality to arbitrary XHTML without
4733 performing any of the Markdown format interpretation.
4736 =item * $stylesheet = Markdown::GenerateStyleSheet([$prefix])
4738 Returns an XHTML style sheet that supports the fancy Markdown styles
4739 such as checkboxes and right parenthesis lists.
4741 All of the style class names have C<$prefix> prepended.
4743 If C<$prefix> is omitted or C<undef> then the default S<"_markdown-">
4744 prefix will be used which is the same default prefix that the
4745 C<Markdown> function uses.
4747 The returned string value consists of a C<< <style type="text/css"> >>
4748 tag, the contents of the style sheet and ends with a C<< </style> >> tag.
4751 =item * Markdown::SetWikiOpts($hashref, $wikioption)
4753 The value of C<$wikioption> should be the value of the C<wikipat> value
4754 from the B<--wiki> option. Use the empty string S<""> to enable wiki
4755 links using the defaults and use C<undef> to disable wiki links.
4757 The C<wikipat> and C<wikiopt> keys in C<$hashref> will both be
4758 affected by this call and they should be passed in to the Markdown
4759 function as options to enable processing of wiki links.
4761 The simplest way to do this is simply to pass a HASH ref as the
4762 second argument to the Markdown function after having used this
4763 function on it to properly set the C<wikipat> and C<wikiopt>
4772 This rudimentary example approximates running
4773 S<C<Markdown.pl --stub --wiki>>
4774 on the input (files if given, standard input if not).
4776 use Markdown qw(Markdown SetWikiOpts GenerateStyleSheet escapeXML);
4779 {local $/; $string = <>;}
4780 my %opts = ( h1 => "default title" );
4781 SetWikiOpts(\%opts, ""); # enable default --wiki processing
4782 my $xhtml = Markdown($string, \%opts);
4783 print "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n",
4784 "<head>\n<title>".escapeXML($opts{h1})."</title>\n",
4785 GenerateStyleSheet(),"</head>\n",
4786 "<body style=\"text-align:center\">\n",
4788 "display:inline-block;text-align:left;max-width:42pc\">\n",
4789 $xhtml, "</div></body></html>\n";
4792 =head1 VERSION HISTORY
4794 Z<> See the F<README> file for detailed release notes for this version.
4798 =item Z<> 1.1.10 - 08 Jul 2020
4800 =item Z<> 1.1.9 - 15 Dec 2019
4802 =item Z<> 1.1.8 - 22 Nov 2019
4804 =item Z<> 1.1.7 - 14 Feb 2018
4806 =item Z<> 1.1.6 - 03 Jan 2018
4808 =item Z<> 1.1.5 - 07 Dec 2017
4810 =item Z<> 1.1.4 - 24 Jun 2017
4812 =item Z<> 1.1.3 - 13 Feb 2017
4814 =item Z<> 1.1.2 - 19 Jan 2017
4816 =item Z<> 1.1.1 - 12 Jan 2017
4818 =item Z<> 1.1.0 - 11 Jan 2017
4820 =item Z<> 1.0.4 - 05 Jun 2016
4822 =item Z<> 1.0.3 - 06 Sep 2015
4824 =item Z<> 1.0.2 - 03 Sep 2015
4826 =item Z<> 1.0.1 - 14 Dec 2004
4828 =item Z<> 1.0.0 - 28 Aug 2004
4838 =item L<https://daringfireball.net>
4840 =item L<https://daringfireball.net/projects/markdown/>
4848 =item PHP port and other contributions by Michel Fortin
4850 =item L<https://michelf.ca>
4858 =item Additional enhancements and tweaks by Kyle J. McKay
4860 =item mackyle<at>gmail.com
4864 =head1 COPYRIGHT AND LICENSE
4868 =item Copyright (C) 2003-2004 John Gruber
4870 =item Copyright (C) 2015-2021 Kyle J. McKay
4872 =item All rights reserved.
4876 Redistribution and use in source and binary forms, with or without
4877 modification, are permitted provided that the following conditions are
4884 Redistributions of source code must retain the above copyright
4885 notice, this list of conditions and the following disclaimer.
4889 Redistributions in binary form must reproduce the above copyright
4890 notice, this list of conditions and the following disclaimer in the
4891 documentation and/or other materials provided with the distribution.
4895 Neither the name "Markdown" nor the names of its contributors may
4896 be used to endorse or promote products derived from this software
4897 without specific prior written permission.
4901 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
4902 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
4903 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
4904 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
4905 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
4906 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
4907 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
4908 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
4909 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
4910 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4911 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.