Markdown.pl

   1 #!/usr/bin/env perl
   2
   3 #
   4 # Markdown -- A text-to-HTML conversion tool for web writers
   5 #
   6 # Copyright (C) 2004 John Gruber
   7 # Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
   8 # All rights reserved.
   9 # License is Modified BSD (aka 3-clause BSD) License\n";
  10 # See LICENSE file (or <https://opensource.org/licenses/BSD-3-Clause>)
  11 #
  12
  13 package Markdown;
  14
  15 use 5.008;
  16 use strict;
  17 use warnings;
  18
  19 use Encode;
  20
  21 use vars qw($COPYRIGHT $VERSION @ISA @EXPORT_OK);
  22
  23 BEGIN {*COPYRIGHT =
  24 \"Copyright (C) 2004 John Gruber
  25 Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
  26 All rights reserved.
  27 ";
  28 *VERSION = \"1.1.11-PRE"
  29 }
  30
  31 use Exporter ();
  32 use Digest::MD5 qw(md5 md5_hex);
  33 use File::Basename qw(basename);
  34 use Scalar::Util qw(refaddr looks_like_number);
  35 my ($hasxml, $hasxml_err); BEGIN { ($hasxml, $hasxml_err) = (0, "") }
  36 my ($hasxmlp, $hasxmlp_err); BEGIN { ($hasxmlp, $hasxmlp_err) = (0, "") }
  37 BEGIN {
  38 @ISA = qw(Exporter);
  39 @EXPORT_OK = qw(Markdown ProcessRaw GenerateStyleSheet SetWikiOpts SplitURL
  40                 escapeXML unescapeXML ResolveFragment);
  41 $INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'};
  42 }
  43
  44 close(DATA) if fileno(DATA);
  45 exit(&_main(@ARGV)||0) unless caller;
  46
  47 sub fauxdie($) {
  48     my $msg = join(" ", @_);
  49     $msg =~ s/\s+$//os;
  50     printf STDERR "%s: fatal: %s\n", basename($0), $msg;
  51     exit 1;
  52 }
  53
  54 my $encoder;
  55 BEGIN {
  56         $encoder = Encode::find_encoding('Windows-1252') ||
  57                    Encode::find_encoding('ISO-8859-1') or
  58                    die "failed to load ISO-8859-1 encoder\n";
  59 }
  60
  61 #
  62 # Global default settings:
  63 #
  64 my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width,
  65     $g_start_p, $g_close_p);
  66 BEGIN {
  67     $g_style_prefix = "_markdown-";     # Prefix for markdown css class styles
  68     $g_empty_element_suffix = " />";    # Change to ">" for HTML output
  69     $g_indent_width = 4;                # Number of spaces considered new level
  70     $g_start_p = "<p>";                 # _FormParagraphs open paragraph tag
  71     $g_close_p = "</p>";                # _FormParagraphs close paragraph tag
  72 }
  73
  74
  75 #
  76 # Globals:
  77 #
  78
  79 # Style sheet template
  80 my $g_style_sheet;
  81
  82 # Permanent block id table
  83 my %g_perm_block_ids;
  84
  85 # Global hashes, used by various utility routines
  86 my %g_urls;
  87 my %g_titles;
  88 my %g_anchors;
  89 my %g_anchors_id;
  90 my %g_block_ids;
  91 my %g_code_block_ids;
  92 my %g_html_blocks;
  93 my %g_code_blocks;
  94 my @g_xml_comments;
  95 my %opt;
  96 my @autonum;
  97
  98 # Return a "block id" to use to identify the block that does not contain
  99 # any characters that could be misinterpreted by the rest of the code
 100 # Originally this used md5_hex but that's unnecessarily slow
 101 # Instead just use the refaddr of the scalar ref of the entry for that
 102 # key in either the global or, if the optional second argument is true,
 103 # permanent table.  To avoid the result being confused with anything
 104 # else, it's prefixed with a control character and suffixed with another
 105 # both of which are not allowed by the XML standard or Unicode.
 106 sub block_id {
 107     $_[1] or return "\5".refaddr(\$g_block_ids{$_[0]})."\6";
 108     $_[1] == 1 and return "\2".refaddr(\$g_perm_block_ids{$_[0]})."\3";
 109     $_[1] == 2 and return "\25".refaddr(\$g_code_block_ids{$_[0]})."\26";
 110     die "programmer error: bad block_id type $_[1]";
 111 }
 112
 113 # Regex to match balanced [brackets]. See Friedl's
 114 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
 115 my $g_nested_brackets;
 116 BEGIN {
 117     $g_nested_brackets = qr{
 118     (?>                                 # Atomic matching
 119         [^\[\]]+                        # Anything other than brackets
 120      |
 121         \[
 122             (??{ $g_nested_brackets })  # Recursive set of nested brackets
 123         \]
 124     )*
 125     }ox
 126 }
 127
 128 # Regex to match balanced (parentheses)
 129 my $g_nested_parens;
 130 BEGIN {
 131     $g_nested_parens = qr{
 132     (?>                                 # Atomic matching
 133         [^\(\)]+                        # Anything other than parentheses
 134      |
 135         \(
 136             (??{ $g_nested_parens })    # Recursive set of nested parentheses
 137         \)
 138     )*
 139     }ox
 140 }
 141
 142 # Table of hash values for escaped characters:
 143 my %g_escape_table;
 144 BEGIN {
 145     $g_escape_table{""} = "\2\3";
 146     foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:<") {
 147         $g_escape_table{$char} = block_id($char,1);
 148     }
 149 }
 150
 151 # Used to track when we're inside an ordered or unordered list
 152 # (see _ProcessListItems() for details):
 153 my $g_list_level;
 154 BEGIN {
 155     $g_list_level = 0;
 156 }
 157
 158
 159 #### Blosxom plug-in interface ##########################################
 160 my $_haveBX;
 161 BEGIN {
 162     no warnings 'once';
 163     $_haveBX = defined($blosxom::version);
 164 }
 165
 166 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
 167 # which posts Markdown should process, using a "meta-markup: markdown"
 168 # header. If it's set to 0 (the default), Markdown will process all
 169 # entries.
 170 my $g_blosxom_use_meta;
 171 BEGIN {
 172     $g_blosxom_use_meta = 0;
 173 }
 174
 175 sub start { 1; }
 176 sub story {
 177     my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
 178
 179     if ((! $g_blosxom_use_meta) or
 180         (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
 181          ) {
 182             $$body_ref = Markdown($$body_ref);
 183     }
 184     1;
 185 }
 186
 187
 188 #### Movable Type plug-in interface #####################################
 189 my $_haveMT = eval {require MT; 1;}; # Test to see if we're running in MT
 190 my $_haveMT3 = $_haveMT && eval {require MT::Plugin; 1;}; # and MT >= MT 3.0.
 191
 192 if ($_haveMT) {
 193     require MT;
 194     import  MT;
 195     require MT::Template::Context;
 196     import  MT::Template::Context;
 197
 198     if ($_haveMT3) {
 199         require MT::Plugin;
 200         import  MT::Plugin;
 201         my $plugin = new MT::Plugin({
 202             name => "Markdown",
 203             description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
 204             doc_link => 'https://daringfireball.net/projects/markdown/'
 205         });
 206         MT->add_plugin( $plugin );
 207     }
 208
 209     MT::Template::Context->add_container_tag(MarkdownOptions => sub {
 210         my $ctx  = shift;
 211         my $args = shift;
 212         my $builder = $ctx->stash('builder');
 213         my $tokens = $ctx->stash('tokens');
 214
 215         if (defined ($args->{'output'}) ) {
 216             $ctx->stash('markdown_output', lc $args->{'output'});
 217         }
 218
 219         defined (my $str = $builder->build($ctx, $tokens) )
 220             or return $ctx->error($builder->errstr);
 221         $str; # return value
 222     });
 223
 224     MT->add_text_filter('markdown' => {
 225         label     => 'Markdown',
 226         docs      => 'https://daringfireball.net/projects/markdown/',
 227         on_format => sub {
 228             my $text = shift;
 229             my $ctx  = shift;
 230             my $raw  = 0;
 231             if (defined $ctx) {
 232             my $output = $ctx->stash('markdown_output');
 233                 if (defined $output && $output =~ m/^html/i) {
 234                     $g_empty_element_suffix = ">";
 235                     $ctx->stash('markdown_output', '');
 236                 }
 237                 elsif (defined $output && $output eq 'raw') {
 238                     $raw = 1;
 239                     $ctx->stash('markdown_output', '');
 240                 }
 241                 else {
 242                     $raw = 0;
 243                     $g_empty_element_suffix = " />";
 244                 }
 245             }
 246             $text = $raw ? $text : Markdown($text);
 247             $text;
 248         },
 249     });
 250
 251     # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
 252     my $smartypants;
 253
 254     {
 255         no warnings "once";
 256         $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
 257     }
 258
 259     if ($smartypants) {
 260         MT->add_text_filter('markdown_with_smartypants' => {
 261             label     => 'Markdown With SmartyPants',
 262             docs      => 'https://daringfireball.net/projects/markdown/',
 263             on_format => sub {
 264                 my $text = shift;
 265                 my $ctx  = shift;
 266                 if (defined $ctx) {
 267                     my $output = $ctx->stash('markdown_output');
 268                     if (defined $output && $output eq 'html') {
 269                         $g_empty_element_suffix = ">";
 270                     }
 271                     else {
 272                         $g_empty_element_suffix = " />";
 273                     }
 274                 }
 275                 $text = Markdown($text);
 276                 $text = $smartypants->($text, '1');
 277             },
 278         });
 279     }
 280 }
 281
 282 sub _tabDefault {
 283         return $_haveBX || $_haveMT ? 4 : 8;
 284 }
 285
 286 sub _strip {
 287         my $str = shift;
 288         defined($str) or return undef;
 289         $str =~ s/^\s+//;
 290         $str =~ s/\s+$//;
 291         $str =~ s/\s+/ /g;
 292         $str;
 293 }
 294
 295 my %_yamlmode;
 296 BEGIN {%_yamlmode = (
 297     disable => 0,
 298     reveal => 1,
 299     enable => 1,
 300     conceal => 1,
 301     show => -1,
 302     unknown => -1,
 303     strip => -1
 304 )}
 305 my %_yamlvis;
 306 BEGIN {%_yamlvis = (
 307     disable => 0,
 308     reveal => 1,
 309     enable => -1,
 310     conceal => 0,
 311     show => 1,
 312     unknown => -1,
 313     strip => 0
 314 )}
 315
 316 sub _require_pod_usage() {
 317         require Pod::Usage;
 318         eval 'require Pod::Text::Termcap; 1;' and
 319                 @Pod::Usage::ISA = (qw( Pod::Text::Termcap ));
 320 }
 321
 322 #### BBEdit/command-line text filter interface ##########################
 323 sub _main {
 324     local *ARGV = \@_;
 325
 326
 327     #### Check for command-line switches: #################
 328     my %options = ();
 329     my %cli_opts;
 330     my $raw = 0;
 331     use Getopt::Long;
 332     Getopt::Long::Configure(qw(bundling require_order pass_through));
 333     GetOptions(
 334         'help' => sub {
 335                         _require_pod_usage;
 336                         Pod::Usage::pod2usage(-verbose => 2, -exitval => 0)},
 337         'h' => sub {
 338                         _require_pod_usage;
 339                         Pod::Usage::pod2usage(-verbose => 0, -exitval => 0)},
 340         'version|V' => sub { # Version info
 341                         print "\nThis is Markdown, version $VERSION.\n", $COPYRIGHT;
 342                         print "License is Modified BSD (aka 3-clause BSD) License\n";
 343                         print "<https://opensource.org/licenses/BSD-3-Clause>\n";
 344                         exit 0},
 345         'shortversion|short-version|s' => sub { # Just the version number string
 346                         print $VERSION;
 347                         exit 0},
 348         'html4tags' => \$cli_opts{'html4tags'},
 349         'deprecated' => \$cli_opts{'deprecated'},
 350         'sanitize' => \$cli_opts{'sanitize'},
 351         'no-sanitize' => sub {$cli_opts{'sanitize'} = 0},
 352         'validate-xml' => sub {$cli_opts{'validate-xml'} = 1},
 353         'validate-xml-internal' => sub {$cli_opts{'validate-xml'} = 2},
 354         'no-validate-xml' => sub {$cli_opts{'validate-xml'} = 0},
 355         'stripcomments|strip-comments' => \$cli_opts{'stripcomments'},
 356         'no-stripcomments|no-strip-comments' => sub {$cli_opts{'stripcomments'} = 0},
 357         'keepabs|keep-abs|k' => \$cli_opts{'keepabs'},
 358         'absroot|a=s' => \$cli_opts{'absroot'},
 359         'base|b=s' => \$cli_opts{'base'},
 360         'htmlroot|r=s' => \$cli_opts{'htmlroot'},
 361         'imageroot|i=s' => \$cli_opts{'imageroot'},
 362         'wiki|w:s' => \$cli_opts{'wiki'},
 363         'tabwidth|tab-width=s' => \$cli_opts{'tabwidth'},
 364         'autonumber|auto-number' => \$cli_opts{'autonumber'},
 365         'raw' => sub { $cli_opts{'raw'} = 1 },
 366         'raw-xml' => sub { $cli_opts{'raw'} = 1 },
 367         'raw-html' => sub { $cli_opts{'raw'} = 2 },
 368         'stylesheet|style-sheet' => \$cli_opts{'stylesheet'},
 369         'no-stylesheet|no-style-sheet' => sub {$cli_opts{'stylesheet'} = 0},
 370         'stub' => \$cli_opts{'stub'},
 371         'yaml:s' => \$cli_opts{'yaml'},
 372     );
 373     defined($cli_opts{'raw'}) or $cli_opts{'raw'} = 0;
 374     my $stub = 0;
 375     if ($cli_opts{'stub'}) {
 376         $stub = 1;
 377     }
 378     if ($cli_opts{'html4tags'}) {        # Use HTML tag style instead of XHTML
 379         $options{empty_element_suffix} = ">";
 380         $stub = -$stub;
 381     }
 382     if ($cli_opts{'deprecated'}) {       # Allow <dir> and <menu> tags to pass through
 383         _SetAllowedTag("dir");
 384         _SetAllowedTag("menu");
 385     }
 386     my $xmlcheck;
 387     $options{sanitize} = 1; # sanitize by default
 388     $options{sanitize} = $cli_opts{'sanitize'} if defined($cli_opts{'sanitize'});
 389     $xmlcheck = $options{sanitize} ? 2 : 0;
 390     $xmlcheck = $cli_opts{'validate-xml'} if defined($cli_opts{'validate-xml'});
 391     $options{stripcomments} = $cli_opts{'stripcomments'} if defined($cli_opts{'stripcomments'});
 392     die "--html4tags and --validate-xml are incompatible\n"
 393         if $cli_opts{'html4tags'} && $xmlcheck == 1;
 394     die "--no-sanitize and --validate-xml-internal are incompatible\n"
 395         if !$options{'sanitize'} && $xmlcheck == 2;
 396     die "--no-sanitize and --strip-comments are incompatible\n"
 397         if !$options{'sanitize'} && $options{stripcomments};
 398     die "--raw-html requires --validate-xml-internal\n"
 399         if $cli_opts{'raw'} == 2 && $xmlcheck != 2;
 400     if ($xmlcheck == 1) {
 401         eval { require XML::Simple; 1 } and $hasxml = 1 or $hasxml_err = $@;
 402         eval { require XML::Parser; 1 } and $hasxmlp = 1 or $hasxmlp_err = $@ unless $hasxml;
 403         die "$hasxml_err$hasxmlp_err" unless $hasxml || $hasxmlp;
 404     }
 405     if ($cli_opts{'tabwidth'}) {
 406         my $tw = $cli_opts{'tabwidth'};
 407         die "invalid tab width (must be integer)\n" unless looks_like_number $tw;
 408         die "invalid tab width (must be >= 2 and <= 32)\n" unless $tw >= 2 && $tw <= 32;
 409         $options{tab_width} = int(0+$tw);
 410     }
 411     $options{auto_number} = 6 if $cli_opts{'autonumber'};
 412     $options{keepabs} = $cli_opts{'keepabs'};
 413     $options{abs_prefix} = "";          # no abs prefix by default
 414     if ($cli_opts{'absroot'}) {         # Use abs prefix for absolute path URLs
 415         my $abs = $cli_opts{'absroot'};
 416         $abs =~ s{/+$}{};
 417         $options{abs_prefix} = $abs;
 418     }
 419     $options{base_prefix} = "";         # no base prefix by default
 420     if ($cli_opts{'base'}) {            # Use base prefix for fragment URLs
 421         $options{base_prefix} = $cli_opts{'base'};
 422     }
 423     if ($cli_opts{'htmlroot'}) {         # Use URL prefix
 424         $options{url_prefix} = $cli_opts{'htmlroot'};
 425     }
 426     if ($cli_opts{'imageroot'}) {        # Use image URL prefix
 427         $options{img_prefix} = $cli_opts{'imageroot'};
 428     }
 429     SetWikiOpts(\%options, $cli_opts{'wiki'}); # Set wiki links options
 430     if (ref($options{wikiopt}) eq 'HASH') {
 431         my $o = $options{wikiopt};
 432         $o->{"f"} && $o->{"%"} and
 433             die "--wiki sub-options 'f' and '%' are mutually exclusive\n"
 434     }
 435     if ($cli_opts{'raw'}) {
 436         $raw = 1;
 437         $options{htmlauto} = 1 if $cli_opts{'raw'} == 2;
 438     }
 439     $options{show_styles} = $cli_opts{'stylesheet'} if defined($cli_opts{'stylesheet'});
 440     $options{show_styles} = 1 if $stub && !defined($options{show_styles});
 441     $options{tab_width} = 8 unless defined($options{tab_width});
 442     my $ym = $cli_opts{'yaml'};
 443     defined($ym) && $ym ne "" or $ym = "enable";
 444     my $lcym = lc($ym);
 445     exists($_yamlmode{$lcym}) or die "invalid --yaml= value '$ym'\n";
 446     $options{yamlmode} = $_yamlmode{$lcym};
 447     $options{yamlvis} = $_yamlvis{$lcym};
 448
 449     my $hdrf = sub {
 450         my $out = "";
 451         if ($stub > 0) {
 452             $out .= <<'HTML5';
 453 <!DOCTYPE html>
 454 <html xmlns="http://www.w3.org/1999/xhtml">
 455 <head>
 456 <meta charset="utf-8" />
 457 <meta http-equiv="content-type" content="text/html; charset=utf-8" />
 458 HTML5
 459         } elsif ($stub < 0) {
 460             $out .= <<'HTML4';
 461 <html>
 462 <head>
 463 <meta charset="utf-8">
 464 <meta http-equiv="content-type" content="text/html; charset=utf-8">
 465 HTML4
 466         }
 467         if ($stub && ($options{title} || $options{h1})) {
 468             my $title = $options{title};
 469             defined($title) && $title ne "" or $title = $options{h1};
 470             if (defined($title) && $title ne "") {
 471                 $title =~ s/&/&amp;/g;
 472                 $title =~ s/</&lt;/g;
 473                 $out .= "<title>$title</title>\n";
 474             }
 475         }
 476         $out .= GenerateStyleSheet($g_style_prefix) if $options{show_styles};
 477         if ($stub) {
 478             $out .= "</head>\n<body style=\"text-align:center\">\n" .
 479                 "<div style=\"display:inline-block;text-align:left;max-width:42pc\">\n";
 480         }
 481         $out;
 482     };
 483
 484     #### Process incoming text: ###########################
 485     my ($didhdr, $hdr, $result, $ftr) = (0, "", "", "");
 486     @ARGV or push(@ARGV, "-");
 487     foreach (@ARGV) {
 488         my ($fh, $contents, $oneresult);
 489         $_ eq "-" or open $fh, '<', $_ or fauxdie "could not open \"$_\": $!\n";
 490         {
 491             local $/; # Slurp the whole file
 492             $_ eq "-" and $contents = <STDIN>;
 493             $_ ne "-" and $contents = <$fh>;
 494         }
 495         defined($contents) or fauxdie "could not read \"$_\": $!\n";
 496         $_ eq "-" or close($fh);
 497         $options{xmlcheck} = ($xmlcheck == 2) ? 2 : 0;
 498         $oneresult = $raw ? ProcessRaw($contents, \%options) : Markdown($contents, \%options);
 499         $oneresult =~ s/\s+$//os;
 500         if ($oneresult ne "") {
 501             if (!$didhdr && !$raw) {
 502                 $hdr = &$hdrf();
 503                 $didhdr = 1;
 504             }
 505             $result .= $oneresult . "\n";
 506         }
 507     }
 508     $hdr = &$hdrf() unless $didhdr || $raw;
 509     $ftr = "</div>\n</body>\n</html>\n" if $stub && !$raw;
 510     if ($xmlcheck == 1) {
 511         my ($good, $errs);
 512         if ($stub && !$raw) {
 513             ($good, $errs) = _xmlcheck($hdr.$result.$ftr);
 514         } else {
 515             ($good, $errs) = _xmlcheck("<div>".$result."</div>");
 516         }
 517         $good or die $errs;
 518     }
 519     print $hdr, $result, $ftr;
 520
 521     exit 0;
 522 }
 523
 524
 525 # INPUT
 526 #   $1: HASH ref
 527 #   $2: value of --wiki= option (see docs) except
 528 #       that a value of undef turns off wiki links
 529 # OUTPUT
 530 #   $1->{wikipat}
 531 #   $1->{wikiopt}
 532 #
 533 sub SetWikiOpts {
 534     my ($o, $wpat) = @_;
 535     ref($o) eq "HASH" or die "internal error: first arg to SetWikiOpts must be HASH ref";
 536     delete $o->{wikipat};
 537     delete $o->{wikiopt};
 538     defined($wpat) or return;
 539     # Parse wiki links option setting
 540     my $wopt = "s(:md)";
 541     if ($wpat =~ /^(.*?)%\{((?:[%0-9A-Za-z]|[Ss]\([^)]*\))*)\}(.*)$/) {
 542         $o->{wikipat} = $1 . "%{}" . $3;
 543         $wopt = $2;
 544     } else {
 545         $o->{wikipat} = $wpat . "%{}.html";
 546     }
 547     my $sval = 1;
 548     while ($wopt =~ /^(.*?)s\(([^)]*)\)(.*)$/i) {
 549         my $sarg = $2;
 550         $wopt = $1 . "s" . $3;
 551         $sarg =~ s/^\s+//; $sarg =~ s/\s+$//;
 552         $sval = {} unless ref($sval) eq "HASH";
 553         s/^\.//, $sval->{lc($_)}=1 foreach split(/(?:\s*,\s*)|(?:(?<!,)\s+(?!,))/, $sarg);
 554         $sval = 1 unless scalar(keys(%$sval));
 555     }
 556     $o->{wikiopt} = { map({$_ => 1} split(//,lc($wopt))) };
 557     if (ref($sval) eq "HASH" && $sval->{':md'}) {
 558         delete $sval->{':md'};
 559         $sval->{$_} = 1 foreach qw(md rmd mkd mkdn mdwn mdown markdown litcoffee);
 560     }
 561     $o->{wikiopt}->{'s'} = $sval if $o->{wikiopt}->{'s'};
 562 }
 563
 564
 565 # Return a copy of the fancy CSS style sheet that uses the
 566 # passed in prefix as a prefix for the CSS style names.
 567 # If no argument is passed in, use $g_style_prefix
 568 # as the CSS style name prefix.
 569 sub GenerateStyleSheet {
 570     my $prefix = shift;
 571     defined($prefix) or $prefix = $g_style_prefix;
 572     my $stylesheet = $g_style_sheet;
 573     $stylesheet =~ s/%\(base\)/$prefix/g;
 574     return $stylesheet;
 575 }
 576
 577
 578 sub _xmlcheck {
 579         my $text = shift;
 580         my ($good, $errs);
 581         ($hasxml ? eval { XML::Simple::XMLin($text, KeepRoot => 1) && 1 } :
 582          eval {
 583                 my $p = XML::Parser->new(Style => 'Tree', ErrorContext => 1);
 584                 $p->parse($text) && 1;
 585         }) and $good = 1 or $errs = _trimerr($@);
 586         ($good, $errs);
 587 }
 588
 589
 590 sub _trimerr {
 591         my $err = shift;
 592         1 while $err =~ s{\s+at\s+\.?/[^,\s\n]+\sline\s+[0-9]+\.?(\n|$)}{$1}is;
 593         $err =~ s/\s+$//os;
 594         $err . "\n";
 595 }
 596
 597
 598 sub _PrepareInput {
 599     my ($input,$parseyaml) = @_;
 600     defined $input or $input = "";
 601     {
 602         use bytes;
 603         $input =~ s/[\x00-\x08\x0B\x0E-\x1F\x7F]+//gso;
 604     }
 605     my $output;
 606     if (Encode::is_utf8($input) || utf8::decode($input)) {
 607         $output = $input;
 608     } else {
 609         $output = $encoder->decode($input, Encode::FB_DEFAULT);
 610     }
 611     # Standardize line endings:
 612     $output =~ s{\r\n}{\n}g;  # DOS to Unix
 613     $output =~ s{\r}{\n}g;    # Mac to Unix
 614
 615     # Extract YAML front matter if requested
 616     my $yaml = undef;
 617     if ($parseyaml) {
 618         $yaml = {};
 619         if ($output =~ /^---[ \t]*(?:\n|\z)/g) {
 620             until ($output =~ /\G(?:(?:(?:---)|(?:\.\.\.))[ \t]*(?:\n|\z)|\z)/gc) {
 621                 next if $output =~ m"\G[ \t]*(?:#[^\n]*)?\n"gc; # skip comment lines
 622                 next if $output =~ m"\G[ \t]*(?:#[^\n]*)\z"gc; # skip final no EOL comment
 623                 last unless $output =~ /\G([^\n]+)(?:\n|\z)/gc;
 624                 my $yl = $1;
 625                 if ($yl =~ /^([A-Za-z_][A-Za-z_0-9.-]*):[ \t]+(.*)$/os) {
 626                         my ($k, $v) = ($1, $2);
 627                         $yaml->{lc($k)} = _YAMLvalue($2);
 628                 }
 629             }
 630             $output = substr($output, pos($output));
 631         }
 632     }
 633     return wantarray ? ($output, $yaml) : $output;
 634 }
 635
 636
 637 sub _YAMLvalue {
 638         my $v = shift;
 639         $v =~ s/^\s+//;
 640         if (substr($v, 0, 1) eq '"') {
 641                 # only $ and/or @ present issues, map them
 642                 $v =~ tr/\@\$/\036\037/;
 643                 eval '{$v='.$v."\n}1" or $v = undef;
 644                 $v =~ tr/\036\037/\@\$/ if defined($v);
 645         } else {
 646                 $v =~ s"#.*$""os;
 647                 $v =~ s/\s+$//os;
 648                 $v ne "" or $v = undef;
 649         }
 650         return $v;
 651 }
 652
 653
 654 sub ProcessRaw {
 655     my $text = _PrepareInput(shift);
 656
 657     # Any remaining arguments after the first are options; either a single
 658     # hashref or a list of name, value pairs.  See _SanitizeOpts comments.
 659
 660     %opt = (
 661         empty_element_suffix    => $g_empty_element_suffix,
 662     );
 663     my %args = ();
 664     if (ref($_[0]) eq "HASH") {
 665         %args = %{$_[0]};
 666     } else {
 667         %args = @_;
 668     }
 669     while (my ($k,$v) = each %args) {
 670         $opt{$k} = $v;
 671     }
 672     _SanitizeOpts(\%opt);
 673
 674     # Sanitize all '<'...'>' tags if requested
 675     $text = _SanitizeTags($text, $opt{xmlcheck}, $opt{htmlauto}) if $opt{sanitize};
 676
 677     utf8::encode($text);
 678     return $text;
 679 }
 680
 681
 682 # $1: HASH ref with the following key value semantics
 683 #
 684 #   sanitize => any-false-value (no action), any-true-value (sanitize).
 685 #               note that a true value of xmlcheck or a true value of
 686 #               stripcomments or a urlfunc value that is a CODE ref
 687 #               always forces sanitize to activate.
 688 #               tag attributes are sanitized by removing all "questionable"
 689 #               attributes (such as script attributes, unknown attributes
 690 #               and so forth) and normalizing the remaining ones (i.e.
 691 #               adding missing quotes and/or values etc.).
 692 #               effective for both ProcessRaw and Markdown.
 693 #   xmlcheck => 0 (no check), any-true-value (internal check).
 694 #               note that the default if xmlcheck is not set/valid is 2.
 695 #               note that a true value is effective for both ProcessRaw
 696 #               and Markdown.  note that a true value automatically inserts
 697 #               the closing tag for auto-closing tags and converts empty tags
 698 #               to the correct format converting empty tags that shouldn't be
 699 #               to an open and close pair; since xmlcheck is a function of the
 700 #               sanitizer, tag attributes are also always sanitized whenever
 701 #               xmlcheck has a true value.
 702 #               note that a true xmlcheck value WILL call "die" with a
 703 #               detailed indication of the error(s) if xml validation fails
 704 #               in which case any line/column numbers refer to the text that
 705 #               would be produced by a sanitize=>0, xmlcheck=>0 call to
 706 #               either ProcessRaw or Markdown, NOT the original input text.
 707 #   htmlauto => any-false-value (no auto close), any-true-value (auto-close)
 708 #               only effective for ProcessRaw; always enabled for Markdown.
 709 #               when xmlcheck is set to 2 provide html automatic closing tag
 710 #               and optional closing tag semantics where closing tags are
 711 #               automatically inserted when encountering an opening tag that
 712 #               auto closes a currently open tag and tags with an optional
 713 #               closing tag that's missing have that inserted as appropriate.
 714 #               a true value may result in some texts being rejected that
 715 #               would be otherwise be accepted (e.g. "<p><pre></pre></p>"
 716 #               which gets turned into "<p></p><pre></pre></p>" which then
 717 #               no longer validates).
 718 #   stripcomments => any-false-value (no action), any-true-value (strip).
 719 #               since the strip comments mechanism is a function of the
 720 #               sanitizer, if stripcomments is set to any-true-value then
 721 #               tag attributes will also always be sanitized.
 722 #               effective for both ProcessRaw and Markdown.
 723 #   empty_element_suffix => " />" or ">"
 724 #               will be forced to " />" if not valid or defined.
 725 #               effective for both ProcessRaw and Markdown.
 726 #   urlfunc     => if set to a CODE ref, the function will be called with
 727 #               seven arguments like so:
 728 #                 $result = &$urlfunc($iresult, \%opts, $tag, $uhost, $uabs, $q, $f)
 729 #               where on input $iresult is the result that would be produced
 730 #               if no urlfunc was provided and on output $result will be
 731 #               used as url value.  $tag is either "img" or "a" to indicate the
 732 #               source of the url.  $uhost.$uabs is the result of stripping off
 733 #               any query string and/or fragment from $iresult.  $q contains either
 734 #               an empty string or the stripped off query string and $f contains
 735 #               an empty string or the stripped off fragment if they were originally
 736 #               present where a non-empty $q always starts with '?' and a non-empty
 737 #               $f always starts with '#'.  $uhost contains the scheme and
 738 #               host+port if present (it may be the empty string).  $uabs contains
 739 #               the path portion which may or may not start with a "/" depending on
 740 #               whether or not it's a relative path.  The $iresult value is related
 741 #               to the other arguments like so:
 742 #                 $iresult = $uhost . $uabs . $q . $f;
 743 #               All values passed to the urlfunc function have already been HTML
 744 #               unescaped and the returned value will be automatically HTML escaped.
 745 #               Any provided urlfunc should treat the %opts HASH as
 746 #               read-only.  Modifying the %opts HASH in urlfunc will
 747 #               likely result in unpredictable behavior!  Don't do it!
 748 #               If urlfunc is set to a CODE ref then tags will also always be
 749 #               sanitized.
 750 #
 751 #  The remaining key value pairs are ignored by ProcessRaw and are only
 752 #  effective when using Markdown or _main
 753 #
 754 #   tab_width => 1..32 which is how many spaces tabs are expanded to.
 755 #               will be forced to 8 if not in range.
 756 #   indent_width => 1..32 how many spaces make a new "indent" level.
 757 #               will be forced to 4 if not in range.
 758 #   style_prefix => prefix to prepend to all CSS style names in the
 759 #               fancy CSS style sheet.
 760 #               defaults to $g_style_prefix if not defined.
 761 #               note that _main actually adds the style sheet (when
 762 #               requested); use GenerateStyleSheet to retrieve the
 763 #               fancy style sheet when calling Markdown directly.
 764 #   auto_number => <= 0 (default) no numbering, 1 number h1s,
 765 #               2 number h1s, h2s, 3 number h1-h3s, ... >= 6 number h1-h6s
 766 #   anchors    => existence of this key triggers return of anchors HASH
 767 #   yamlmode   => 0 (no YAML processing), > 0 (YAML on), < 0 (YAML ignore)
 768 #               if 0, the YAML front matter processor is completely
 769 #               disabled and any YAML front matter that might be present
 770 #               will be treated as markup.  if > 0 any YAML front matter
 771 #               will be processed and any recognized options applied.
 772 #               if < 0 any YAML front matter will be parsed but no
 773 #               options will be applied at all.  When != 0 the parsed
 774 #               YAML front matter can be retrieved via the 'yaml' key.
 775 #               defaults to 1 if not defined or not a number.
 776 #   yamlvis    => 0 (invisible), > 0 (visible), < 0 (vis if unknown)
 777 #               if yamlmode == 0 then yamlvis has no effect.  if > 0
 778 #               then any parsed YAML front matter options will be shown
 779 #               in the formatted output.  if 0 then NO YAML front
 780 #               matter options will be shown in the formatted output.
 781 #               if < 0 then YAML front matter options will be shown in
 782 #               the formatted output only if there are any unrecognized
 783 #               options present.
 784 #               defaults to -1 if not defined or not a number.
 785 #   keepabs    => any-false-value (no action), any-true-value (keep)
 786 #               if true, any absolute path URLs remaining after applying
 787 #               any abs_prefix value will be kept and not be subject
 788 #               to modification by any url_prefix or img_prefix value.
 789 #   abs_prefix => value to prefix to absolute path URLs (i.e. start with /).
 790 #               note that this does NOT get prepended to //host/path URLs.
 791 #   url_prefix => value to prefix to non-absolute URLs.
 792 #               note that this does NOT get prepended to //host/path URLs.
 793 #   img_prefix => value to prefix to non-absolute image URLs.
 794 #               note that this does NOT get prepended to //host/path URLs.
 795 #               note that if img_prefix is undef or empty ("") then
 796 #               url_prefix will be prepended to image URLs.
 797 #   base_prefix => value to prefix to fragment-only URLs (i.e. start with #).
 798 #               note that fragment-only URLs are always left undisturbed
 799 #               if this is not set.  Fragment-only URLs are NOT affected by
 800 #               any of abs_prefix, url_prefix or img_prefix.
 801 #   wikipat     => non-empty pattern string to enable wiki links.
 802 #               best set with SetWikiOpts (see SetWikiOpts comments).
 803 #   wikiopt     => HASH ref of options affecting wiki links processing.
 804 #               best set with SetWikiOpts (see SetWikiOpts comments).
 805 #   wikifunc    => if set to a CODE ref, the function will be called with
 806 #               six arguments like so:
 807 #                 $result = &$wikifunc($iresult, \%opts, $link, $wbase, $qf, $io)
 808 #               where on input $iresult is the result that would be produced
 809 #               if no wikifunc was provided and on output $result will be
 810 #               used as the wiki expansion.  $link is the original wiki
 811 #               destination as specified in the source, $wbase is the result
 812 #               of stripping off any query string and/or fragment from $link
 813 #               and then transforming that according to the wikiopt HASH ref.
 814 #               $qf contains either an empty string or the stripped off
 815 #               query string and/or fragment if one was originally present.
 816 #               If $io is a HASH ref (otherwise it will be undef), then it's
 817 #               a wiki image link and %$io contains the options (if any) where
 818 #               the keys that might be present include "width", "height", "align"
 819 #               and "alt".  If present, width and height are guaranteed to be
 820 #               positive integers and align is guaranteed to be "left", "right"
 821 #               or "center".  The value for "alt" may be the empty string.
 822 #               The "imgflag" key has a SCALAR ref value and if the value it
 823 #               refers to is changed to any false value the result will become
 824 #               an A tag rather than an IMG tag.
 825 #               The $iresult value is related to the other arguments like so:
 826 #                 $iresult = $opts->{wikipat};
 827 #                 $iresult =~ s/%\{\}.+$/%{}/ if ref($io) eq "HASH";
 828 #                 $iresult = s/%\{\}/$wbase/;
 829 #                 $iresult .= $qf;
 830 #               Any provided wikifunc should treat the %opts HASH as
 831 #               read-only.  Modifying the %opts HASH in wikifunc will
 832 #               likely result in unpredictable behavior!  Don't do it!
 833 #
 834 #  Special handling for abs_prefix, url_prefix, img_prefix and/or base_prefix
 835 #  may be activated by setting any subset (or all) of the values for these
 836 #  keys to a CODE ref.  The single argument is the URL and the result must
 837 #  be the adjusted URL.  For example, the equivalent CODE ref to setting
 838 #  url_prefix to $string is simply sub { $string.$_[0] }.  By using a
 839 #  CODE ref, behavior other than simply performing a prepend operation
 840 #  can be realized when necessary for unusual situations.
 841 #
 842 #  The following are OUTPUT values that can only be retrieved when
 843 #  Markdown is called with a HASH ref as the second argument
 844 #
 845 #   anchors    => if the 'anchors' key exists in the input HASH ref
 846 #               will be set to a HASH ref containing lookup keys
 847 #               for valid fragment ids in the document (only those
 848 #               created from Markdown markup) with the value the
 849 #               actual fragment link to use.  Do not use this directly
 850 #               but pass it as the first argument to the ResolveFragment
 851 #               function to resolve a "fuzzy" fragment name to its
 852 #               actual fragment name in the generated output.
 853 #               NOTE: to activate return of anchors the 'anchors' key
 854 #               simply must exist in the input HASH ref passed to the
 855 #               Markdown function, its value will be replaced on output.
 856 #
 857 #   h1         => will be set to the tag-stripped value of the first
 858 #               non-empty H1 generated by Markdown-style markup.
 859 #               note that literal <h1>...</h1> values are NOT picked up.
 860 #               will be left unchanged if no Markdown-style H1 detected.
 861 #               note that the value is NOT xml escaped but should be
 862 #               before embedding in an XHTML document.  If yamlmode > 0
 863 #               and a 'title' value has been encountered, then this
 864 #               will be set to that 'title' value instead (and the
 865 #               'title' key and value will still be present in %$yaml).
 866 #
 867 #   yaml       => if yamlmode is != 0 then this will be set to a HASH
 868 #               ref containing any parsed YAML front matter or left
 869 #               unchanged if no YAML front matter was found.  If the
 870 #               parsed YAML front matter contains only whitespace and/or
 871 #               comments then this will be set to a HASH ref that has
 872 #               no keys or values.
 873 #
 874 sub _SanitizeOpts {
 875     my $o = shift; # hashref
 876     ref($o) eq "HASH" or return;
 877
 878     $o->{xmlcheck} = looks_like_number($o->{xmlcheck}) && $o->{xmlcheck} == 0 ? 0 : 2;
 879     $o->{sanitize} = 1 if $o->{stripcomments} && !$o->{sanitize};
 880     $o->{sanitize} = 1 if $o->{xmlcheck} && !$o->{sanitize};
 881     $o->{sanitize} = 1 if ref($o->{urlfunc}) eq 'CODE' && !$o->{sanitize};
 882
 883     # this is gross, but having the globals avoids unnecessary slowdown
 884     if ($o->{sanitize} && $o->{xmlcheck}) {
 885         $g_start_p = "<\20>";
 886         $g_close_p = "</\20>";
 887     } else {
 888         $g_start_p = "<p>";
 889         $g_close_p = "</p>";
 890     }
 891
 892     defined($o->{empty_element_suffix}) &&
 893     ($o->{empty_element_suffix} eq " />" || $o->{empty_element_suffix} eq ">")
 894         or $o->{empty_element_suffix} = " />";
 895
 896     $o->{tab_width} = 8 unless looks_like_number($o->{tab_width}) &&
 897         1 <= $o->{tab_width} && $o->{tab_width} <= 32;
 898     $o->{tab_width} = int($o->{tab_width});
 899
 900     $o->{indent_width} = 4 unless looks_like_number($o->{indent_width}) &&
 901         1 <= $o->{indent_width} && $o->{indent_width} <= 32;
 902     $o->{indent_width} = int($o->{indent_width});
 903
 904     defined($o->{auto_number}) or $o->{auto_number} = '';
 905     $o->{auto_number} eq '' || looks_like_number($o->{auto_number})
 906         or $o->{auto_number} = 6;
 907     if ($o->{auto_number} ne '') {
 908         $o->{auto_number} = int(0+$o->{auto_number});
 909         $o->{auto_number} >= 0 or $o->{auto_number} = 0;
 910         $o->{auto_number} <= 6 or $o->{auto_number} = 6;
 911     }
 912
 913     defined($o->{style_prefix}) or $o->{style_prefix} = $g_style_prefix;
 914
 915     $o->{abs_prefix} = _MakePrefixCODERef($o->{abs_prefix}, 1)
 916         unless ref($o->{abs_prefix}) eq 'CODE';
 917     $o->{url_prefix} = _MakePrefixCODERef($o->{url_prefix}, 0)
 918         unless ref($o->{url_prefix}) eq 'CODE';
 919     $o->{img_prefix} = _MakePrefixCODERef($o->{img_prefix}, 0)
 920         unless ref($o->{img_prefix}) eq 'CODE';
 921     $o->{base_prefix} = _MakePrefixCODERef($o->{base_prefix}, -1)
 922         unless ref($o->{base_prefix}) eq 'CODE';
 923
 924     ref($o->{wikiopt}) eq "HASH" or $o->{wikiopt} = {};
 925
 926     # Note that because Markdown makes a copy of the options
 927     # before calling this function, this does not actually remove
 928     # any "h1" key that might have been set by the caller of
 929     # the Markdown function.  However, by deleting it here,
 930     # this guarantees that any found value will actually be
 931     # picked up and stored (which will not happen if the key
 932     # already exists).
 933     delete $o->{h1};
 934
 935     # Default is to silently strip any known YAML front matter
 936     # Same comment about "yaml" key as above for "h1" key
 937     $o->{yamlmode} = 1 unless looks_like_number($o->{yamlmode});
 938     $o->{yamlvis} = -1 unless looks_like_number($o->{yamlvis});
 939     delete $o->{yaml};
 940
 941     # The anchors hash will only be returned if the key exists
 942     # (the key's value doesn't matter), set the value to an empty
 943     # HASH ref just in case to make sure it's always a HASH ref.
 944     $o->{anchors} = {} if exists($o->{anchors});
 945 }
 946
 947 my %_yamlopts;
 948 BEGIN {%_yamlopts = map({$_ => 1} qw(
 949         display_metadata
 950         header_enum
 951         title
 952 ))}
 953
 954
 955 sub _HasUnknownYAMLOptions {
 956     do { return 1 unless exists($_yamlopts{$_}) } foreach keys(%{$_[0]});
 957     return 0;
 958 }
 959
 960
 961 sub _ApplyYAMLOpts {
 962     my ($yaml, $opt) = @_;
 963     if (defined($yaml->{display_metadata}) && $opt->{yamlvis} < 0) {
 964         # ignore display_metadata except in --yaml=enable mode
 965         $opt->{yamlvis} = _YAMLTrueValue($yaml->{display_metadata}) ? 1 : 0;
 966     }
 967     $opt->{h1} = $yaml->{title} if defined($yaml->{title});
 968     if (defined($yaml->{header_enum}) && $opt->{auto_number} eq '') {
 969         $opt->{auto_number} = _YAMLTrueValue($yaml->{header_enum}) ? 6 : 0;
 970     }
 971 }
 972
 973
 974 sub _YAMLTrueValue {
 975     my $v = shift;
 976     defined($v) or $v = "";
 977     $v = lc($v);
 978     return !($v eq "" || $v eq "0" || $v eq "false" || $v eq "disable" || $v eq "off" || $v eq "no");
 979 }
 980
 981
 982 # Actually returns an empty string rather than a CODE ref
 983 # if an empty prefix is passed in.  Trailing "/"s are trimmed
 984 # off if the second argument is positive or the string does NOT
 985 # consist of only "/"s.  A trailing "/" is added unless the
 986 # trimmed prefix already has one or the second argument is true.
 987 # If the second argument is negative, the prefix is used as-is.
 988 sub _MakePrefixCODERef {
 989     my ($prefix, $mtok) = @_;
 990     defined($prefix) or $prefix = "";
 991     looks_like_number($mtok) or $mtok = $mtok ? 1 : 0;
 992     if ($mtok > 0) {
 993         $prefix =~ s,/+$,,;
 994     } elsif (!$mtok) {
 995         $prefix =~ s,//+$,/,;
 996     }
 997     $prefix ne "" or return "";
 998     $prefix .= '/' if !$mtok && substr($prefix, -1, 1) ne '/';
 999     return sub { $prefix . $_[0] };
1000 }
1001
1002
1003 sub Markdown {
1004 #
1005 # Primary function. The order in which other subs are called here is
1006 # essential. Link and image substitutions need to happen before
1007 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
1008 # and <img> tags get encoded.
1009 #
1010     my $text = shift;
1011
1012     # Any remaining arguments after the first are options; either a single
1013     # hashref or a list of name, value pairs.  See _SanitizeOpts comments.
1014     %opt = (
1015         # set initial defaults
1016         style_prefix            => $g_style_prefix,
1017         empty_element_suffix    => $g_empty_element_suffix,
1018         tab_width               => _tabDefault,
1019         indent_width            => $g_indent_width,
1020         abs_prefix              => "", # Prefixed to absolute path URLs
1021         url_prefix              => "", # Prefixed to non-absolute URLs
1022         img_prefix              => "", # Prefixed to non-absolute image URLs
1023         base_prefix             => "", # Prefixed to fragment-only URLs
1024     );
1025     @autonum = ();
1026     my %args = ();
1027     if (ref($_[0]) eq "HASH") {
1028         %args = %{$_[0]};
1029     } else {
1030         %args = @_;
1031     }
1032     while (my ($k,$v) = each %args) {
1033         $opt{$k} = $v;
1034     }
1035     _SanitizeOpts(\%opt);
1036
1037     my $yaml;
1038     ($text, $yaml) = _PrepareInput($text, $opt{yamlmode});
1039     _ApplyYAMLOpts($yaml, \%opt) if ref($yaml) eq "HASH" && $opt{yamlmode} > 0;
1040
1041     # Clear the globals. If we don't clear these, you get conflicts
1042     # from other articles when generating a page which contains more than
1043     # one article (e.g. an index page that shows the N most recent
1044     # articles):
1045     %g_urls = ();
1046     %g_titles = ();
1047     %g_anchors = ();
1048     %g_anchors_id = ();
1049     %g_block_ids = ();
1050     %g_code_block_ids = ();
1051     %g_html_blocks = ();
1052     %g_code_blocks = ();
1053     @g_xml_comments = ();
1054     $g_list_level = 0;
1055
1056     # Make sure $text ends with a couple of newlines:
1057     $text .= "\n\n";
1058
1059     # Handle backticks-delimited code blocks
1060     $text = _HashBTCodeBlocks($text);
1061
1062     # Convert all tabs to spaces.
1063     $text = _DeTab($text);
1064
1065     # Strip any lines consisting only of spaces.
1066     # This makes subsequent regexen easier to write, because we can
1067     # match consecutive blank lines with /\n+/ instead of something
1068     # contorted like / *\n+/ .
1069     $text =~ s/^ +$//mg;
1070
1071     # Turn block-level HTML blocks into hash entries
1072     $text = _HashHTMLBlocks($text, 1);
1073
1074     # Strip link definitions, store in hashes.
1075     $text = _StripLinkDefinitions($text);
1076
1077     $text = _RunBlockGamut($text, 1);
1078
1079     # Remove indentation markers
1080     $text =~ s/\027+//gs;
1081
1082     # Expand auto number flags
1083     $text =~ s/\034([1-6])/_AutoHeaderNum(ord($1)&0x7)/gse
1084         if $opt{auto_number} ne '' && $opt{auto_number} > 0;
1085
1086     # Unhashify code blocks
1087     $text =~ s/(\025\d+\026)/$g_code_blocks{$1}/g;
1088
1089     $text = _UnescapeSpecialChars($text);
1090
1091     $text .= "\n" unless $text eq "";
1092
1093     # Sanitize all '<'...'>' tags if requested
1094     $text = _SanitizeTags($text, $opt{xmlcheck}, 1) if $opt{sanitize};
1095
1096     utf8::encode($text);
1097     if (ref($_[0]) eq "HASH") {
1098         ${$_[0]}{anchors} = {%g_anchors_id} if exists(${$_[0]}{anchors});
1099         if (defined($opt{h1}) && $opt{h1}) {
1100             utf8::encode($opt{h1});
1101             ${$_[0]}{h1} = $opt{h1};
1102         }
1103         ${$_[0]}{yaml} = $yaml if ref($yaml) eq "HASH";
1104     }
1105     my $yamltable = "";
1106     if (ref($yaml) eq "HASH" && %$yaml && $opt{yamlmode} && $opt{yamlvis}) {
1107         if ($opt{yamlvis} > 0 || _HasUnknownYAMLOptions($yaml)) {
1108             my ($hrows, $drows) = ("", "");
1109             foreach (sort(keys(%$yaml))) {
1110                 my $v = $yaml->{$_};
1111                 my $rspn = '';
1112                 if (defined($v)) {
1113                     $v =~ s/&/&amp;/g;
1114                     $v =~ s/</&lt;/g;
1115                     utf8::encode($v);
1116                     $drows .= "<td>" . $v . "</td>\n";
1117                 } else {
1118                     $rspn = " class=\"$opt{style_prefix}yaml-undef-value\" rowspan=\"2\" valign=\"top\"";
1119                 }
1120                 $hrows .= "<th$rspn>" . $_ . "</th>\n";
1121             }
1122             $yamltable = "<table class=\"$opt{style_prefix}yaml-table\" border=\"1\">\n" .
1123                 "<tr>\n$hrows</tr>\n<tr>\n$drows</tr>\n</table>\n";
1124         }
1125     }
1126     return $yamltable.$text;
1127 }
1128
1129
1130 sub _HashBTCodeBlocks {
1131 #
1132 #   Process Markdown backticks (```) delimited code blocks
1133 #
1134     my $text = shift;
1135     my $less_than_indent = $opt{indent_width} - 1;
1136
1137     $text =~ s{
1138             (?:(?<=\n)|\A)
1139                 ([ ]{0,$less_than_indent})``(`+)[ \t]*(?:([\w.+-]+[#]?)(?:[ \t][ \t\w.+-]*)?)?\n
1140              ( # $4 = the code block -- one or more lines, starting with ```
1141               (?:
1142                 .*\n+
1143               )+?
1144              )
1145             # and ending with ``` or end of document
1146             (?:(?:[ ]{0,$less_than_indent}``\2[ \t]*(?:\n|\Z))|\Z)
1147         }{
1148             # $2 contains syntax highlighting to use if defined
1149             my $leadsp = length($1);
1150             my $codeblock = $4;
1151             $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
1152             $codeblock = _DeTab($codeblock, 8, $leadsp); # physical tab stops are always 8
1153             $codeblock =~ s/\A\n+//; # trim leading newlines
1154             $codeblock =~ s/\s+\z//; # trim trailing whitespace
1155             $codeblock = _EncodeCode($codeblock); # or run highlighter here
1156             $codeblock = "<div class=\"$opt{style_prefix}code-bt\"><pre style=\"display:none\"></pre><pre><code>"
1157                 . $codeblock . "\n</code></pre></div>";
1158
1159             my $key = block_id($codeblock);
1160             $g_html_blocks{$key} = $codeblock;
1161             "\n\n" . $key . "\n\n";
1162         }egmx;
1163
1164     return $text;
1165 }
1166
1167
1168 sub _StripLinkDefinitions {
1169 #
1170 # Strips link definitions from text, stores the URLs and titles in
1171 # hash references.
1172 #
1173     my $text = shift;
1174     my $less_than_indent = $opt{indent_width} - 1;
1175
1176     # Link defs are in the form: ^[id]: url "optional title"
1177     while ($text =~ s{
1178                         ^[ ]{0,$less_than_indent}\[(.+)\]: # id = $1
1179                           [ ]*
1180                           \n?               # maybe *one* newline
1181                           [ ]*
1182                         <?((?:\S(?:\\\n\s*[^\s"(])?)+?)>? # url = $2
1183                           [ ]*
1184                           \n?               # maybe one newline
1185                           [ ]*
1186                         (?:
1187                             (?<=\s)         # lookbehind for whitespace
1188                             (?:(['"])|(\()) # title quote char
1189                             (.+?)           # title = $5
1190                             (?(4)\)|\3)     # match same quote
1191                             [ ]*
1192                         )?  # title is optional
1193                         (?:\n+|\Z)
1194                     }
1195                     {}mx) {
1196         my $id = _strip(lc $1); # Link IDs are case-insensitive
1197         my $url = $2;
1198         my $title = _strip($5);
1199         $url =~ s/\\\n\s*//gs;
1200         if ($id ne "") {
1201                 # These values always get passed through _MakeATag or _MakeIMGTag later
1202                 $g_urls{$id} = $url;
1203                 if (defined($title) && $title ne "") {
1204                     $g_titles{$id} = $title;
1205                 }
1206         }
1207     }
1208
1209     return $text;
1210 }
1211
1212 my %ok_tag_name; # initialized later
1213 my ($block_tags_a, $block_tags_b, $block_tags_c);
1214 BEGIN {
1215     $block_tags_a = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/io;
1216     $block_tags_b = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1217     $block_tags_c = qr/div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1218 }
1219
1220 sub _HashHTMLBlocks {
1221     my ($text, $toplevel) = @_;
1222     my $less_than_indent = $opt{indent_width} - 1;
1223     my $idt = "\027" x $g_list_level;
1224     my $blkprc = $toplevel ?
1225         sub { return $ok_tag_name{$_[1]} ? _EncodeAmpsAndAngles($_[0]) : $_[0] } :
1226         sub { return $_[0] };
1227
1228     # Hashify HTML blocks:
1229     # We only want to do this for block-level HTML tags, such as headers,
1230     # lists, and tables. That's because we still want to wrap <p>s around
1231     # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
1232     # phrase emphasis, and spans. The list of tags we're looking for is
1233     # hard-coded:
1234
1235     # First, look for nested blocks, e.g.:
1236     #   <div>
1237     #       <div>
1238     #       tags for inner block must be indented.
1239     #       </div>
1240     #   </div>
1241     #
1242     # The outermost tags must start at the left margin for this to match, and
1243     # the inner nested divs must be indented.
1244     # We need to do this before the next, more liberal match, because the next
1245     # match will start at the first `<div>` and stop at the first `</div>`.
1246     $text =~ s{
1247                 (                       # save in $1
1248                     ^                   # start of line (with /m)
1249                     ((?:\Q$idt\E)?)     # optional lead in = $2
1250                     <($block_tags_a)    # start tag = $3
1251                     \b                  # word break
1252                     (?:.*\n)*?          # any number of lines, minimally matching
1253                     \2</\3\s*>          # the matching end tag
1254                     [ ]*                # trailing spaces
1255                     (?=\n+|\Z) # followed by a newline or end of document
1256                 )
1257             }{
1258                 my $blk = &$blkprc($1, $3);
1259                 my $key = block_id($blk);
1260                 $g_html_blocks{$key} = $blk;
1261                 "\n\n" . $key . "\n\n";
1262             }eigmx;
1263
1264
1265     #
1266     # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
1267     #
1268     $text =~ s{
1269                 (                       # save in $1
1270                     ^                   # start of line (with /m)
1271                     (?:\Q$idt\E)?       # optional lead in
1272                     <($block_tags_b)    # start tag = $2
1273                     \b                  # word break
1274                     (?:.*\n)*?          # any number of lines, minimally matching
1275                     .*</\2\s*>          # the matching end tag
1276                     [ ]*                # trailing spaces
1277                     (?=\n+|\Z) # followed by a newline or end of document
1278                 )
1279             }{
1280                 my $blk = &$blkprc($1, $2);
1281                 my $key = block_id($blk);
1282                 $g_html_blocks{$key} = $blk;
1283                 "\n\n" . $key . "\n\n";
1284             }eigmx;
1285
1286     #
1287     # Now match any empty block tags that should have been paired
1288     #
1289     $text =~ s{
1290                 (                       # save in $1
1291                     ^                   # start of line (with /m)
1292                     (?:\Q$idt\E)?       # optional lead in
1293                     <($block_tags_c)    # start tag = $2
1294                     \b                  # word break
1295                     (?:[^<>])*?         #
1296                     /?>                 # the matching end tag
1297                     [ ]*                # trailing spaces
1298                     (?=\n+|\Z) # followed by a newline or end of document
1299                 )
1300             }{
1301                 my $key = block_id($1);
1302                 $g_html_blocks{$key} = $1;
1303                 "\n\n" . $key . "\n\n";
1304             }eigmx;
1305
1306     # Special case just for <hr />. It was easier to make a special case than
1307     # to make the other regex more complicated.
1308     $text =~ s{
1309                 (?:
1310                     (?<=\n)         # Starting after end of line
1311                     |               # or
1312                     \A              # the beginning of the doc
1313                 )
1314                 (                       # save in $1
1315                     [ ]{0,$less_than_indent}
1316                     <(?:hr)             # start tag
1317                     \b                  # word break
1318                     (?:[^<>])*?         #
1319                     /?>                 # the matching end tag
1320                     [ ]*
1321                     (?=\n{1,}|\Z)       # followed by end of line or end of document
1322                 )
1323             }{
1324                 my $key = block_id($1);
1325                 $g_html_blocks{$key} = $1;
1326                 "\n\n" . $key . "\n\n";
1327             }eigx;
1328
1329     # Special case for standalone XML comments:
1330     $text =~ s{
1331                 (?:
1332                     (?<=\n\n)       # Starting after a blank line
1333                     |               # or
1334                     \A\n?           # the beginning of the doc
1335                 )
1336                 (                   # save in $1
1337                     [ ]{0,$less_than_indent}
1338                     (?s:
1339                         <!--
1340                         (?:[^-]|(?:-(?!-)))*
1341                         -->
1342                         (?:
1343                             (?:[ \t]*\n[ \t]*)?
1344                             <!--
1345                             (?:[^-]|(?:-(?!-)))*
1346                             -->
1347                         )*
1348                     )
1349                     [ ]*
1350                     (?=\n{1,}|\Z)   # followed by end of line or end of document
1351                 )
1352             }{
1353                 my $key = block_id($1);
1354                 push(@g_xml_comments, $key)
1355                         if $opt{stripcomments} && !exists($g_html_blocks{$key});
1356                 $g_html_blocks{$key} = $1;
1357                 "\n\n" . $key . "\n\n";
1358             }egx;
1359
1360
1361     return $text;
1362 }
1363
1364
1365 sub _RunBlockGamut {
1366 #
1367 # These are all the transformations that form block-level
1368 # tags like paragraphs, headers, and list items.
1369 #
1370     my ($text, $anchors) = @_;
1371
1372     $text = _DoHeaders($text, $anchors);
1373
1374     # Do Horizontal Rules:
1375     $text =~ s{^ {0,3}\*(?: {0,2}\*){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1376     $text =~ s{^ {0,3}\_(?: {0,2}\_){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1377     $text =~ s{^ {0,3}\-(?: {0,2}\-){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1378
1379     $text = _DoListsAndBlocks($text);
1380
1381     $text = _DoTables($text);
1382
1383     # We already ran _HashHTMLBlocks() before, in Markdown(), but that
1384     # was to escape raw HTML in the original Markdown source. This time,
1385     # we're escaping the markup we've just created, so that we don't wrap
1386     # <p> tags around block-level tags.
1387     $text = _HashHTMLBlocks($text);
1388
1389     $text = _FormParagraphs($text, $anchors);
1390
1391     return $text;
1392 }
1393
1394
1395 sub _DoListBlocks {
1396     return _DoBlockQuotes(_DoCodeBlocks($_[0])) if $_[0] ne "";
1397 }
1398
1399
1400 sub _RunSpanGamut {
1401 #
1402 # These are all the transformations that occur *within* block-level
1403 # tags like paragraphs, headers, and list items.
1404 #
1405     my $text = shift;
1406
1407     $text = _DoCodeSpans($text);
1408
1409     $text = _EscapeSpecialChars($text);
1410
1411     # Process anchor and image tags. Images must come first,
1412     # because ![foo][f] looks like an anchor.
1413     $text = _DoImages($text);
1414     $text = _DoAnchors($text);
1415
1416     # Make links out of things like `<http://example.com/>`
1417     # Must come after _DoAnchors(), because you can use < and >
1418     # delimiters in inline links like [this](<url>).
1419     $text = _DoAutoLinks($text);
1420
1421     $text = _EncodeAmpsAndAngles($text);
1422
1423     $text = _DoItalicsAndBoldAndStrike($text);
1424
1425     # Do hard breaks:
1426     $text =~ s/ {3,}(\n|\z)/<br clear=\"all\"$opt{empty_element_suffix}$1/g;
1427     $text =~ s/ {2,}\n/<br$opt{empty_element_suffix}\n/g;
1428     $text =~ s/ {2,}\z//g;
1429
1430     return $text;
1431 }
1432
1433
1434 sub _EscapeSpecialChars {
1435     my $text = shift;
1436     my $tokens ||= _TokenizeHTML($text);
1437
1438     $text = ''; # rebuild $text from the tokens
1439 #   my $in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.
1440 #   my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
1441
1442     foreach my $cur_token (@$tokens) {
1443         if ($cur_token->[0] eq "tag") {
1444             # Within tags, encode *, _ and ~ so they don't conflict
1445             # with their use in Markdown for italics and strong.
1446             # We're replacing each such character with its
1447             # corresponding block id value; this is likely
1448             # overkill, but it should prevent us from colliding
1449             # with the escape values by accident.
1450             $cur_token->[1] =~ s!([*_~])!$g_escape_table{$1}!g;
1451             $text .= $cur_token->[1];
1452         } else {
1453             my $t = $cur_token->[1];
1454             $t = _EncodeBackslashEscapes($t);
1455             $text .= $t;
1456         }
1457     }
1458     return $text;
1459 }
1460
1461
1462 sub _ProcessWikiLink {
1463     my ($link_text, $link_loc) = @_;
1464     if (defined($link_loc) &&
1465         ($link_loc =~ m{^#\S*$} || $link_loc =~ m{^(?:http|ftp)s?://\S+$}i)) {
1466         # Return the new link
1467         return _MakeATag(_FindFragmentMatch($link_loc), $link_text);
1468     }
1469     if (!defined($link_loc)) {
1470         $link_loc = _RunSpanGamut($link_text);
1471         $link_loc = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($link_loc))));
1472         $link_loc =~ m{^(?:http|ftp)s?://\S+$}i and
1473             # Return the new link
1474             return _MakeATag($link_loc, $link_text);
1475     }
1476     return undef if $link_loc eq "" || $link_text eq "";
1477     if ($link_loc =~ /^[A-Za-z][A-Za-z0-9+.-]*:/os) {
1478         # Unrecognized scheme
1479         return undef;
1480     }
1481     if ($opt{wikipat}) {
1482         my $o = $opt{wikiopt};
1483         my $img_link = _strip($link_text);
1484         my $img = 0;
1485         my $qsfrag = "";
1486         my $base;
1487         my $imgopts = undef;
1488         if ($img_link =~ /^[^#?\s]+\.(?:png|gif|jpe?g|svgz?)$/i) {
1489             $base = _wxform($img_link, 1);
1490             $img = 1;
1491             $imgopts = _ParseWikiImgOpts($link_loc);
1492             $imgopts->{imgflag} = \$img;
1493         } else {
1494             $base = $link_loc;
1495             if ($link_loc =~ /^(.*?)([?#].*)$/os) {
1496                 ($base, $qsfrag) = ($1, $2);
1497             }
1498             $base = _wxform($base);
1499         }
1500         my $result = $opt{wikipat};
1501         $result =~ s/%\{\}.+$/%{}/os if $img;
1502         $result =~ s/%\{\}/$base/;
1503         if ($qsfrag =~ /^([^#]*)(#.+)$/os) {
1504             my ($q,$f) = ($1,$2);
1505             #$f = _wxform($f) if $f =~ / /;
1506             $qsfrag = $q . $f;
1507         }
1508         $result .= $qsfrag;
1509         $result = &{$opt{wikifunc}}($result, \%opt, ($img?$img_link:$link_loc), $base, $qsfrag, $imgopts)
1510                 if ref($opt{wikifunc}) eq 'CODE';
1511         {
1512             use bytes;
1513             $result =~ s/%(?![0-9A-Fa-f]{2})/%25/sog;
1514             if ($o->{r}) {
1515                 $result =~
1516                 s/([\x00-\x1F <>"{}|\\^`x7F])/sprintf("%%%02X",ord($1))/soge;
1517             } else {
1518                 $result =~
1519                 s/([\x00-\x1F <>"{}|\\^`\x7F-\xFF])/sprintf("%%%02X",ord($1))/soge;
1520             }
1521             $result =~ s/(%(?![0-9A-F]{2})[0-9A-Fa-f]{2})/uc($1)/soge;
1522         }
1523         # Return the new link
1524         return $img ? _MakeIMGTag($result, undef, undef, $imgopts) : _MakeATag($result, $link_text);
1525     }
1526     # leave it alone
1527     return undef;
1528 }
1529
1530
1531 sub _ParseWikiImgOpts {
1532     my $alts = shift;
1533     my %o = ();
1534     # alt= consumes the rest of the line, do it first
1535     if ($alts =~ /(?:^|,)\s*alt\s*=\s*(.*)$/ios) {
1536         my $atext = $1;
1537         $alts = substr($alts, 0, $-[0]);
1538         $o{alt} = _strip($atext);
1539     }
1540     foreach my $kv (split(/\s*,\s*/, lc($alts))) {
1541         if ($kv =~ /^\s*([^\s]+)\s*=\s*([^\s]+)\s*$/os) {
1542             my ($k, $v) = ($1, $2);
1543             if (($k eq "width" || $k eq "height") && $v =~ /^\d+$/) {
1544                 $o{$k} = 0+$v if $v > 0;
1545                 next;
1546             }
1547             if ($k eq "align" && ($v eq "left" || $v eq "right" || $v eq "center")) {
1548                 $o{$k} = $v;
1549                 next;
1550             }
1551         }
1552     }
1553     return \%o;
1554 }
1555
1556
1557 sub _wxform {
1558     my ($w, $img) = @_;
1559     my $o = $opt{wikiopt};
1560     my $opt_s = $o->{s};
1561     if (!$img && $opt_s) {
1562         if (ref($opt_s)) {
1563             if ($w =~ m{^(.*)[.]([^./]*)$}) {
1564                 my ($base, $ext) = ($1, $2);
1565                 $w = $base if $opt_s->{lc($ext)};
1566             }
1567         } else {
1568             $w =~ s{[.][^./]*$}{};
1569         }
1570     }
1571     $w = uc($w) if $o->{u};
1572     $w = lc($w) if $o->{l};
1573     $w =~ s{/+}{%252F}gos if $o->{"%"};
1574     $w =~ s/ +/%20/gos if $o->{b};
1575     $w =~ tr{/}{ } if $o->{f};
1576     $w =~ s{/+}{/}gos if !$o->{f} && !$o->{v};
1577     if ($o->{d}) {
1578         $w =~ tr{ }{-};
1579         $w =~ s/-+/-/gos unless $o->{v};
1580     } else {
1581         $w =~ tr{ }{_};
1582         $w =~ s/_+/_/gos unless $o->{v};
1583     }
1584     return $w;
1585 }
1586
1587
1588 # Return a suitably encoded <a...> tag string
1589 # On input NONE of $url, $text or $title should be xmlencoded
1590 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1591 sub _MakeATag {
1592     my ($url, $text, $title) = @_;
1593     defined($url) or $url="";
1594     defined($text) or $text="";
1595     defined($title) or $title="";
1596
1597     $url =~ m"^#" && ref($opt{base_prefix}) eq 'CODE' and $url = &{$opt{base_prefix}}($url);
1598     my $result = $g_escape_table{'<'}."a href=\"" . _EncodeAttText($url) . "\"";
1599     $title = _strip($title);
1600     $text =~ s{<(/?a)}{&lt;$1}sogi;
1601     $text = _DoItalicsAndBoldAndStrike($text);
1602     # We've got to encode any of these remaining to avoid
1603     # conflicting with other italics, bold and strike through and links.
1604     $text =~ s!([]*_~[])!$g_escape_table{$1}!g;
1605     $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne "";
1606     return $result . $g_escape_table{'>'} .
1607         $text . $g_escape_table{'<'}."/a".$g_escape_table{'>'};
1608 }
1609
1610
1611 sub _DoAnchors {
1612 #
1613 # Turn Markdown link shortcuts into XHTML <a> tags.
1614 #
1615     my $text = shift;
1616
1617     #
1618     # First, handle wiki-style links: [[wiki style link]]
1619     #
1620     $text =~ s{
1621         (                   # wrap whole match in $1
1622           \[\[
1623             ($g_nested_brackets) # link text and id = $2
1624           \]\]
1625         )
1626     }{
1627         my $result;
1628         my $whole_match = $1;
1629         my $link_text   = $2;
1630         my $link_loc    = undef;
1631
1632         if ($link_text =~ /^(.*)\|(.*)$/s) {
1633             $link_text = $1;
1634             $link_loc = _strip($2);
1635         }
1636
1637         $result = _ProcessWikiLink($link_text, $link_loc);
1638         defined($result) or $result = $whole_match;
1639         $result;
1640     }xsge;
1641
1642     #
1643     # Next, handle reference-style links: [link text] [id]
1644     #
1645     $text =~ s{
1646         (                   # wrap whole match in $1
1647           \[
1648             ($g_nested_brackets) # link text = $2
1649           \]
1650
1651           [ ]?              # one optional space
1652           (?:\n[ ]*)?       # one optional newline followed by spaces
1653
1654           \[
1655             ($g_nested_brackets) # id = $3
1656           \]
1657         )
1658     }{
1659         my $result;
1660         my $whole_match = $1;
1661         my $link_text   = $2;
1662         my $link_id     = $3;
1663
1664         if ($link_id eq "") {
1665             # for shortcut links like [this][].
1666             $link_id = _RunSpanGamut($link_text);
1667             $link_id = unescapeXML(_StripTags(_UnescapeSpecialChars($link_id)));
1668         }
1669         $link_id = _strip(lc $link_id);
1670
1671         if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
1672             my $url = $g_urls{$link_id};
1673             defined($url) or $url = $g_anchors{$link_id};
1674             $url = _FindFragmentMatch($url);
1675             $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1676             $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
1677         }
1678         else {
1679             $result = $whole_match;
1680         }
1681         $result;
1682     }xsge;
1683
1684     #
1685     # Subsequently, inline-style links: [link text](url "optional title")
1686     #
1687     $text =~ s{
1688         (               # wrap whole match in $1
1689           \[
1690             ($g_nested_brackets) # link text = $2
1691           \]
1692           \(            # literal paren
1693             ($g_nested_parens) # href and optional title = $3
1694           \)
1695         )
1696     }{
1697         #my $result;
1698         my $whole_match = $1;
1699         my $link_text   = $2;
1700         my ($url, $title) = _SplitUrlTitlePart($3);
1701
1702         if (defined($url)) {
1703             $url = _FindFragmentMatch($url);
1704             $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1705             _MakeATag(_PrefixURL($url), $link_text, $title);
1706         } else {
1707             # The href/title part didn't match the pattern
1708             $whole_match;
1709         }
1710     }xsge;
1711
1712     #
1713     # Finally, handle reference-style implicit shortcut links: [link text]
1714     #
1715     $text =~ s{
1716         (                   # wrap whole match in $1
1717           \[
1718             ($g_nested_brackets) # link text = $2
1719           \]
1720         )
1721     }{
1722         my $result;
1723         my $whole_match = $1;
1724         my $link_text   = $2;
1725         my $link_id     = _RunSpanGamut($2);
1726         $link_id        = _strip(lc(unescapeXML(_StripTags(_UnescapeSpecialChars($link_id)))));
1727
1728         if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
1729             my $url = $g_urls{$link_id};
1730             defined($url) or $url = $g_anchors{$link_id};
1731             $url = _FindFragmentMatch($url);
1732             $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1733             $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
1734         }
1735         else {
1736             $result = $whole_match;
1737         }
1738         $result;
1739     }xsge;
1740
1741     return $text;
1742 }
1743
1744
1745 sub _PeelWrapped {
1746     defined($_[0]) or return undef;
1747     if (substr($_[0],0,1) eq "(") {
1748         return substr($_[0], 1, length($_[0]) - (substr($_[0], -1, 1) eq ")" ? 2 : 1));
1749     }
1750     return $_[0];
1751 }
1752
1753
1754 sub _SplitUrlTitlePart {
1755     return ("", undef) if $_[0] =~ m{^\s*$}; # explicitly allowed
1756     my $u = $_[0];
1757     $u =~ s/^\s*(['\042])/# $1/;
1758     if ($u =~ m{
1759         ^               # match beginning
1760         \s*?
1761         <?([^\s'\042]\S*?)>? # URL = $1
1762         (?:             # optional grouping
1763           \s+           # must be distinct from URL
1764           (['\042]?)    # quote char = $2
1765           (.*?)         # Title = $3
1766           \2?           # matching quote
1767         )?              # title is optional
1768         \s*
1769         \z              # match end
1770     }osx) {
1771         return (undef, undef) if $_[1] && ($1 eq "" || $1 eq "#");
1772         return (_PeelWrapped($1), $2 ? $3 : _PeelWrapped($3));
1773     } else {
1774         return (undef, undef);
1775     }
1776 }
1777
1778
1779 sub _FindFragmentMatchInternal {
1780     my ($anchors_id, $url, $undefifnomatch) = @_;
1781     if (defined($url) && $url =~ /^#\S/) {
1782         # try very hard to find a match
1783         my $idbase = _strip(lc(substr($url, 1)));
1784         my $idbase0 = $idbase;
1785         my $id = _MakeAnchorId($idbase);
1786         $undefifnomatch and $url = undef;
1787         if (defined($$anchors_id{$id})) {
1788             $url = $$anchors_id{$id};
1789         } else {
1790             $idbase =~ s/-/_/gs;
1791             $id = _MakeAnchorId($idbase);
1792             if (defined($$anchors_id{$id})) {
1793                 $url = $$anchors_id{$id};
1794             } else {
1795                 $id = _MakeAnchorId($idbase0, 1);
1796                 if (defined($$anchors_id{$id})) {
1797                     $url = $$anchors_id{$id};
1798                 } else {
1799                     $id = _MakeAnchorId($idbase, 1);
1800                     if (defined($$anchors_id{$id})) {
1801                         $url = $$anchors_id{$id};
1802                     }
1803                 }
1804             }
1805         }
1806     }
1807     return $url;
1808 }
1809
1810
1811 sub _FindFragmentMatch {
1812     return _FindFragmentMatchInternal(\%g_anchors_id, @_);
1813 }
1814
1815
1816 sub _ToUTF8 {
1817     my $input = shift;
1818     my $output;
1819     if (Encode::is_utf8($input) || utf8::decode($input)) {
1820         $output = $input;
1821     } else {
1822         $output = $encoder->decode($input, Encode::FB_DEFAULT);
1823     }
1824     return $output;
1825 }
1826
1827
1828 # $_[0] -> HASH ref of anchors (e.g. the "anchors" OUTPUT from Markdown)
1829 # $_[1] -> fragment to resolve, may optionally start with '#'
1830 # An empty string ("") or hash ("#") is returned as-is.
1831 # returns undef if no match otherwise resolved fragment name
1832 # which will start with a '#' if $_[1] started with '#' otherwise will not.
1833 # This function can be used to connect up links to "implicit" anchors.
1834 # All Markdown-format H1-H6 headers have an implicit anchor added
1835 # based on the header item text.  Passing that text to this function
1836 # will cough up the matching implicit anchor if there is one.
1837 sub ResolveFragment
1838 {
1839     my ($anchors, $frag) = @_;
1840     defined($frag) or return undef;
1841     $frag eq "" || $frag eq "#" and return $frag;
1842     my $hadhash = ($frag =~ s/^#//);
1843     $frag =~ /^\S/ or return undef;
1844     ref($anchors) eq 'HASH' or return undef;
1845     my $ans = _FindFragmentMatchInternal($anchors, '#'._ToUTF8($frag), 1);
1846     $hadhash || !defined($ans) or $ans =~ s/^#//;
1847     defined($ans) and utf8::encode($ans);
1848     return $ans;
1849 }
1850
1851
1852 # Return a suitably encoded <img...> tag string
1853 # On input NONE of $url, $alt or $title should be xmlencoded
1854 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1855 sub _MakeIMGTag {
1856     my ($url, $alt, $title, $iopts) = @_;
1857     defined($url) or $url="";
1858     defined($alt) or $alt="";
1859     defined($title) or $title="";
1860     ref($iopts) eq "HASH" or $iopts = {};
1861     return "" unless $url ne "";
1862
1863     my ($w, $h, $lf, $rt) = (0, 0, '', '');
1864     ($alt, $title) = (_strip($alt), _strip($title));
1865     if ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
1866         ($title, $w, $h, $lf, $rt) = (_strip($1), $3, $4, $2, $5);
1867     } elsif ($title =~ /^(.*)\((<?)\?[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
1868         ($title, $h, $lf, $rt) = (_strip($1), $3, $2, $4);
1869     } elsif ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]\?(>?)\)$/os) {
1870         ($title, $w, $lf, $rt) = (_strip($1), $3, $2, $4);
1871     } elsif ($title =~ /^(.*)\((?!\))(<?)(>?)\)$/os) {
1872         ($title, $lf, $rt) = (_strip($1), $2, $3);
1873     }
1874     $iopts->{align} = "center" if $lf && $rt;
1875     $iopts->{align} = "left"   if $lf && !$rt;
1876     $iopts->{align} = "right"  if !$lf && $rt;
1877     $iopts->{width}  = $w if $w != 0;
1878     $iopts->{height} = $h if $h != 0;
1879     $iopts->{alt} = $alt if $alt ne "";
1880     $iopts->{title} = $title if $title ne "";
1881     my $iopt = sub { defined($iopts->{$_[0]}) ? $iopts->{$_[0]} : (@_ > 1 ? $_[1] : "") };
1882     my $result = '';
1883     $result .= $g_escape_table{'<'}."center".$g_escape_table{'>'}
1884         if &$iopt("align") eq "center";
1885     $result .= $g_escape_table{'<'}."img src=\"" . _EncodeAttText($url) . "\"";
1886     $result .= " align=\"left\"" if &$iopt("align") eq "left";
1887     $result .= " align=\"right\"" if &$iopt("align") eq "right";
1888     $result .= " alt=\"" . _EncodeAttText($iopts->{alt}) . "\"" if &$iopt("alt") ne "";
1889     $result .= " width=\"" . $iopts->{width} . "\"" if &$iopt("width",0) != 0;
1890     $result .= " height=\"" . $iopts->{height} . "\"" if &$iopt("height",0) != 0;
1891     $result .= " title=\"" . _EncodeAttText($iopts->{title}) . "\"" if &$iopt("title") ne "";
1892     $result .= " /" unless $opt{empty_element_suffix} eq ">";
1893     $result .= $g_escape_table{'>'};
1894     $result .= $g_escape_table{'<'}."/center".$g_escape_table{'>'}
1895         if &$iopt("align") eq "center";
1896     return $result;
1897 }
1898
1899
1900 sub _DoImages {
1901 #
1902 # Turn Markdown image shortcuts into <img> tags.
1903 #
1904     my $text = shift;
1905
1906     #
1907     # First, handle reference-style labeled images: ![alt text][id]
1908     #
1909     $text =~ s{
1910         (               # wrap whole match in $1
1911           !\[
1912             ($g_nested_brackets) # alt text = $2
1913           \]
1914
1915           [ ]?          # one optional space
1916           (?:\n[ ]*)?   # one optional newline followed by spaces
1917
1918           \[
1919             ($g_nested_brackets) # id = $3
1920           \]
1921
1922         )
1923     }{
1924         my $result;
1925         my $whole_match = $1;
1926         my $alt_text    = $2;
1927         my $link_id     = $3;
1928
1929         $link_id ne "" or $link_id = $alt_text; # for shortcut links like ![this][].
1930         $link_id = _strip(lc $link_id);
1931
1932         if (defined $g_urls{$link_id}) {
1933             $result = _MakeIMGTag(
1934                 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1935         }
1936         else {
1937             # If there's no such link ID, leave intact:
1938             $result = $whole_match;
1939         }
1940
1941         $result;
1942     }xsge;
1943
1944     #
1945     # Next, handle inline images:  ![alt text](url "optional title")
1946     # Don't forget: encode * and _
1947
1948     $text =~ s{
1949         (               # wrap whole match in $1
1950           !\[
1951             ($g_nested_brackets) # alt text = $2
1952           \]
1953           \(            # literal paren
1954             ($g_nested_parens) # src and optional title = $3
1955           \)
1956         )
1957     }{
1958         my $whole_match = $1;
1959         my $alt_text    = $2;
1960         my ($url, $title) = _SplitUrlTitlePart($3, 1);
1961         defined($url) ?  _MakeIMGTag(_PrefixURL($url), $alt_text, $title) : $whole_match;
1962     }xsge;
1963
1964     #
1965     # Finally, handle reference-style implicitly labeled links: ![alt text]
1966     #
1967     $text =~ s{
1968         (               # wrap whole match in $1
1969           !\[
1970             ($g_nested_brackets) # alt text = $2
1971           \]
1972         )
1973     }{
1974         my $result;
1975         my $whole_match = $1;
1976         my $alt_text    = $2;
1977         my $link_id     = lc(_strip($alt_text));
1978
1979         if (defined $g_urls{$link_id}) {
1980             $result = _MakeIMGTag(
1981                 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1982         }
1983         else {
1984             # If there's no such link ID, leave intact:
1985             $result = $whole_match;
1986         }
1987
1988         $result;
1989     }xsge;
1990
1991     return $text;
1992 }
1993
1994 sub _EncodeAttText {
1995     my $text = shift;
1996     defined($text) or return undef;
1997     $text = escapeXML(_strip($text));
1998     # We've got to encode these to avoid conflicting
1999     # with italics, bold and strike through.
2000     $text =~ s!([*_~:])!$g_escape_table{$1}!g;
2001     return $text;
2002 }
2003
2004
2005 sub _MakeAnchorId {
2006     use bytes;
2007     my ($link, $strip) = @_;
2008     $link = lc($link);
2009     if ($strip) {
2010         $link =~ s/\s+/_/gs;
2011         $link =~ tr/-a-z0-9_//cd;
2012     } else {
2013         $link =~ tr/-a-z0-9_/_/cs;
2014     }
2015     return '' unless $link ne '';
2016     $link = "_".$link."_";
2017     $link =~ s/__+/_/gs;
2018     $link = "_".md5_hex($link)."_" if length($link) > 66;
2019     return $link;
2020 }
2021
2022
2023 sub _GetNewAnchorId {
2024     my $link = _strip(lc(shift));
2025     return '' if $link eq "" || defined($g_anchors{$link});
2026     my $id = _MakeAnchorId($link);
2027     return '' unless $id;
2028     $g_anchors{$link} = '#'.$id;
2029     $g_anchors_id{$id} = $g_anchors{$link};
2030     if ($id =~ /-/) {
2031         my $id2 = $id;
2032         $id2 =~ s/-/_/gs;
2033         $id2 =~ s/__+/_/gs;
2034         defined($g_anchors_id{$id2}) or $g_anchors_id{$id2} = $g_anchors{$link};
2035     }
2036     my $idd = _MakeAnchorId($link, 1);
2037     if ($idd) {
2038         defined($g_anchors_id{$idd}) or $g_anchors_id{$idd} = $g_anchors{$link};
2039         if ($idd =~ /-/) {
2040             my $idd2 = $idd;
2041             $idd2 =~ s/-/_/gs;
2042             $idd2 =~ s/__+/_/gs;
2043             defined($g_anchors_id{$idd2}) or $g_anchors_id{$idd2} = $g_anchors{$link};
2044         }
2045     }
2046     $id;
2047 }
2048
2049
2050 sub _DoHeaders {
2051     my ($text, $anchors) = @_;
2052     my $h1;
2053     my $geth1 = $anchors && !defined($opt{h1}) ? sub {
2054         return unless !defined($h1);
2055         my $h = shift;
2056         $h1 = $h if $h ne "";
2057     } : sub {};
2058
2059     # atx-style headers:
2060     #   # Header 1
2061     #   ## Header 2
2062     #   ## Header 2 with closing hashes ##
2063     #   ...
2064     #   ###### Header 6
2065     #
2066     $text =~ s{
2067             ^(\#{1,6})  # $1 = string of #'s
2068             [ ]*
2069             ((?:(?:(?<![#])[^\s]|[^#\s]).*?)?) # $2 = Header text
2070             [ ]*
2071             \n+
2072         }{
2073             my $h_level = length($1);
2074             my $h = $2;
2075             $h =~ s/#+$//;
2076             $h =~ s/\s+$//;
2077             my $rsg = _RunSpanGamut($h);
2078             $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2079             my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2080             $id = " id=\"$id\"" if $id ne "";
2081             &$geth1($h) if $h_level == 1;
2082             "<h$h_level$id>" . _AutoHeaderFlag($h_level) . $rsg . "</h$h_level>\n\n";
2083         }egmx;
2084
2085     # Setext-style headers:
2086     #     Header 1
2087     #     ========
2088     #
2089     #     Header 2
2090     #     --------
2091     #
2092     #     Header 3
2093     #     ~~~~~~~~
2094     #
2095     $text =~ s{ ^(?:=+[ ]*\n)?[ ]*(.+?)[ ]*\n=+[ ]*\n+ }{
2096         my $h = $1;
2097         my $rsg = _RunSpanGamut($h);
2098         $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2099         my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2100         $id = " id=\"$id\"" if $id ne "";
2101         &$geth1($h);
2102         "<h1$id>" . _AutoHeaderFlag(1) . $rsg . "</h1>\n\n";
2103     }egmx;
2104
2105     $text =~ s{ ^(?:-+[ ]*\n)?[ ]*(.+?)[ ]*\n-+[ ]*\n+ }{
2106         my $h = $1;
2107         my $rsg = _RunSpanGamut($h);
2108         $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2109         my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2110         $id = " id=\"$id\"" if $id ne "";
2111         "<h2$id>" . _AutoHeaderFlag(2) . $rsg . "</h2>\n\n";
2112     }egmx;
2113
2114     $text =~ s{ ^(?:~+[ ]*\n)?[ ]*(.+?)[ ]*\n~+[ ]*\n+ }{
2115         my $h = $1;
2116         my $rsg = _RunSpanGamut($h);
2117         $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2118         my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2119         $id = " id=\"$id\"" if $id ne "";
2120         "<h3$id>" . _AutoHeaderFlag(3) . $rsg . "</h3>\n\n";
2121     }egmx;
2122
2123     $opt{h1} = $h1 if defined($h1) && $h1 ne "";
2124     return $text;
2125 }
2126
2127
2128 sub _AutoHeaderFlag {
2129     my $level = shift;
2130     my $auto = $opt{auto_number} || 0;
2131     return '' unless 1 <= $level && $level <= $auto;
2132     return "\34".chr(0x30+$level);
2133 }
2134
2135
2136 sub _AutoHeaderNum {
2137     my $level = shift;
2138     my $auto = $opt{auto_number} || 0;
2139     return '' unless 1 <= $level && $level <= $auto;
2140     pop(@autonum) while @autonum > $level;
2141     push(@autonum, 1) while @autonum < $level - 1;
2142     $autonum[$level - 1] += 1;
2143     return join('.', @autonum).' ';
2144 }
2145
2146
2147 my ($marker_ul, $marker_ol, $marker_any, $roman_numeral, $greek_lower);
2148 BEGIN {
2149     # Re-usable patterns to match list item bullets and number markers:
2150     $roman_numeral = qr/(?:
2151         [IiVvXx]|[Ii]{2,3}|[Ii][VvXx]|[VvXx][Ii]{1,3}|[Xx][Vv][Ii]{0,3}|
2152         [Xx][Ii][VvXx]|[Xx]{2}[Ii]{0,3}|[Xx]{2}[Ii]?[Vv]|[Xx]{2}[Vv][Ii]{1,2})/ox;
2153     $greek_lower = qr/(?:[\x{03b1}-\x{03c9}])/o;
2154     $marker_ul  = qr/[*+-]/o;
2155     $marker_ol  = qr/(?:\d+|[A-Za-z]|$roman_numeral|$greek_lower)[.\)]/o;
2156     $marker_any = qr/(?:$marker_ul|$marker_ol)/o;
2157 }
2158
2159
2160 sub _GetListMarkerType {
2161     my ($list_type, $list_marker, $last_marker) = @_;
2162     return "" unless $list_type && $list_marker && lc($list_type) eq "ol";
2163     my $last_marker_type = '';
2164     $last_marker_type = _GetListMarkerType($list_type, $last_marker)
2165         if defined($last_marker) &&
2166             # these are roman unless $last_marker type case matches and is 'a' or 'A'
2167             $list_marker =~ /^[IiVvXx][.\)]?$/;
2168     return "I" if $list_marker =~ /^[IVX]/ && $last_marker_type ne 'A';
2169     return "i" if $list_marker =~ /^[ivx]/ && $last_marker_type ne 'a';
2170     return "A" if $list_marker =~ /^[A-Z]/;
2171     return "a" if $list_marker =~ /^[a-z]/ || $list_marker =~ /^$greek_lower/o;
2172     return "1";
2173 }
2174
2175
2176 sub _GetListItemTypeClass {
2177     my ($list_type, $list_marker, $last_marker) = @_;
2178     my $list_marker_type = _GetListMarkerType($list_type, $list_marker, $last_marker);
2179     my $ans = &{sub{
2180         return "" unless length($list_marker) >= 2 && $list_marker_type =~ /^[IiAa1]$/;
2181         return "lower-greek" if $list_marker_type eq "a" && $list_marker =~ /^$greek_lower/o;
2182         return "" unless $list_marker =~ /\)$/;
2183         return "upper-roman" if $list_marker_type eq "I";
2184         return "lower-roman" if $list_marker_type eq "i";
2185         return "upper-alpha" if $list_marker_type eq "A";
2186         return "lower-alpha" if $list_marker_type eq "a";
2187         return "decimal";
2188     }};
2189     return ($list_marker_type, $ans);
2190 }
2191
2192
2193 my %_roman_number_table;
2194 BEGIN {
2195     %_roman_number_table = (
2196         i       =>  1,
2197         ii      =>  2,
2198         iii     =>  3,
2199         iv      =>  4,
2200         v       =>  5,
2201         vi      =>  6,
2202         vii     =>  7,
2203         viii    =>  8,
2204         ix      =>  9,
2205         x       => 10,
2206         xi      => 11,
2207         xii     => 12,
2208         xiii    => 13,
2209         xiv     => 14,
2210         xv      => 15,
2211         xvi     => 16,
2212         xvii    => 17,
2213         xviii   => 18,
2214         xix     => 19,
2215         xx      => 20,
2216         xxi     => 21,
2217         xxii    => 22,
2218         xxiii   => 23,
2219         xxiv    => 24,
2220         xxv     => 25,
2221         xxvi    => 26,
2222         xxvii   => 27
2223     );
2224 }
2225
2226
2227 # Necessary because ς and σ are the same value grrr
2228 my %_greek_number_table;
2229 BEGIN {
2230     %_greek_number_table = (
2231         "\x{03b1}" =>  1, # α
2232         "\x{03b2}" =>  2, # β
2233         "\x{03b3}" =>  3, # γ
2234         "\x{03b4}" =>  4, # δ
2235         "\x{03b5}" =>  5, # ε
2236         "\x{03b6}" =>  6, # ζ
2237         "\x{03b7}" =>  7, # η
2238         "\x{03b8}" =>  8, # θ
2239         "\x{03b9}" =>  9, # ι
2240         "\x{03ba}" => 10, # κ
2241         "\x{03bb}" => 11, # λ
2242         #"\x{00b5}"=> 12, # µ is "micro" not "mu"
2243         "\x{03bc}" => 12, # μ
2244         "\x{03bd}" => 13, # ν
2245         "\x{03be}" => 14, # ξ
2246         "\x{03bf}" => 15, # ο
2247         "\x{03c0}" => 16, # π
2248         "\x{03c1}" => 17, # ρ
2249         "\x{03c2}" => 18, # ς
2250         "\x{03c3}" => 18, # σ
2251         "\x{03c4}" => 19, # τ
2252         "\x{03c5}" => 20, # υ
2253         "\x{03c6}" => 21, # φ
2254         "\x{03c7}" => 22, # χ
2255         "\x{03c8}" => 23, # ψ
2256         "\x{03c9}" => 24  # ω
2257     );
2258 }
2259
2260
2261 sub _GetMarkerIntegerNum {
2262     my ($list_marker_type, $marker_val) = @_;
2263     my $ans = &{sub{
2264         return 0 + $marker_val if $list_marker_type eq "1";
2265         $list_marker_type = lc($list_marker_type);
2266         return $_greek_number_table{$marker_val}
2267             if $list_marker_type eq "a" &&
2268             defined($_greek_number_table{$marker_val});
2269         $marker_val = lc($marker_val);
2270         return ord($marker_val) - ord("a") + 1 if $list_marker_type eq "a";
2271         return 1 unless $list_marker_type eq "i";
2272         defined($_roman_number_table{$marker_val}) and
2273             return $_roman_number_table{$marker_val};
2274         return 1;
2275     }};
2276     return $ans if $ans == 0 && $list_marker_type eq "1";
2277     return $ans >= 1 ? $ans : 1;
2278 }
2279
2280
2281 sub _IncrList {
2282     my ($from, $to, $extra) = @_;
2283     $extra = defined($extra) ? " $extra" : "";
2284     my $result = "";
2285     while ($from + 10 <= $to) {
2286         $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-10\"></span>\n";
2287         $from += 10;
2288     }
2289     while ($from + 5 <= $to) {
2290         $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-5\"></span>\n";
2291         $from += 5;
2292     }
2293     while ($from + 2 <= $to) {
2294         $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-2\"></span>\n";
2295         $from += 2;
2296     }
2297     while ($from < $to) {
2298         $result .= "<span$extra class=\"$opt{style_prefix}ol-incr\"></span>\n";
2299         ++$from;
2300     }
2301     return $result;
2302 }
2303
2304
2305 sub _DoListsAndBlocks {
2306 #
2307 # Form HTML ordered (numbered) and unordered (bulleted) lists.
2308 #
2309     my $text = shift;
2310     my $indent = $opt{indent_width};
2311     my $less_than_indent = $indent - 1;
2312     my $less_than_double_indent = 2 * $indent - 1;
2313
2314     # Re-usable pattern to match any entire ul or ol list:
2315     my $whole_list = qr{
2316         (                           # $1 (or $_[0]) = whole list
2317           (                         # $2 (or $_[1])
2318             (?:(?<=\n)|\A)
2319             [ ]{0,$less_than_indent}
2320             (${marker_any})         # $3 (or $_[2]) = first list item marker
2321             [ ]+
2322           )
2323           (?s:.+?)
2324           (                         # $4 (or $_[3])
2325               \z
2326             |
2327               \n{2,}
2328               (?=\S)
2329               (?!                   # Negative lookahead for another list item marker
2330                 ${marker_any}[ ]
2331               )
2332           )
2333         )
2334     }mx;
2335
2336     my $list_item_sub = sub {
2337         my $list = $_[0];
2338         my $list_type = ($_[2] =~ m/$marker_ul/) ? "ul" : "ol";
2339         my $list_att = "";
2340         my $list_class = "";
2341         my $list_incr = "";
2342         # Turn double returns into triple returns, so that we can make a
2343         # paragraph for the last item in a list, if necessary:
2344         $list =~ s/\n\n/\n\n\n/g;
2345         my ($result, $first_marker, $fancy) = _ProcessListItems($list_type, $list);
2346         defined($first_marker) or return $list;
2347         my $list_marker_type = _GetListMarkerType($list_type, $first_marker);
2348         if ($list_marker_type) {
2349                 $first_marker =~ s/[.\)]$//;
2350                 my $first_marker_num = _GetMarkerIntegerNum($list_marker_type, $first_marker);
2351                 $list_att = $list_marker_type eq "1" ? "" : " type=\"$list_marker_type\"";
2352                 if ($fancy) {
2353                     $list_class = " class=\"$opt{style_prefix}ol\"";
2354                     my $start = $first_marker_num;
2355                     $start = 10 if $start > 10;
2356                     $start = 5 if $start > 5 && $start < 10;
2357                     $start = 1 if $start > 1 && $start < 5;
2358                     $list_att .= " start=\"$start\"" unless $start == 1;
2359                     $list_incr = _IncrList($start, $first_marker_num);
2360                 } else {
2361                     $list_class = " class=\"$opt{style_prefix}lc-greek\""
2362                         if $list_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
2363                     $list_att .= " start=\"$first_marker_num\"" unless $first_marker_num == 1;
2364                 }
2365         }
2366         my $idt = "\027" x $g_list_level;
2367         $result = "$idt<$list_type$list_att$list_class>\n$list_incr" . $result . "$idt</$list_type>\n\n";
2368         $result;
2369     };
2370
2371     # We use a different prefix before nested lists than top-level lists.
2372     # See extended comment in _ProcessListItems().
2373     #
2374     # Note: (jg) There's a bit of duplication here. My original implementation
2375     # created a scalar regex pattern as the conditional result of the test on
2376     # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
2377     # substitution once, using the scalar as the pattern. This worked,
2378     # everywhere except when running under MT on my hosting account at Pair
2379     # Networks. There, this caused all rebuilds to be killed by the reaper (or
2380     # perhaps they crashed, but that seems incredibly unlikely given that the
2381     # same script on the same server ran fine *except* under MT. I've spent
2382     # more time trying to figure out why this is happening than I'd like to
2383     # admit. My only guess, backed up by the fact that this workaround works,
2384     # is that Perl optimizes the substition when it can figure out that the
2385     # pattern will never change, and when this optimization isn't on, we run
2386     # afoul of the reaper. Thus, the slightly redundant code to that uses two
2387     # static s/// patterns rather than one conditional pattern.
2388     #
2389     # Note: (kjm) With the addition of the two-of-the-same-kind-in-a-row-
2390     # starts-a-list-at-the-top-level rule the two patterns really are somewhat
2391     # different now, but the duplication has pretty much been eliminated via
2392     # use of a separate sub which has the side-effect of making the below
2393     # two cases much easier to grok all at once.
2394
2395     if ($g_list_level) {
2396         my $parse = $text;
2397         $text = "";
2398         pos($parse) = 0;
2399         while ($parse =~ /\G(?s:.)*?^$whole_list/gmc) {
2400             my @captures = ($1, $2, $3, $4);
2401             if ($-[1] > $-[0]) {
2402                 $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
2403             }
2404             $text .= &$list_item_sub(@captures);
2405         }
2406         $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
2407     }
2408     else {
2409         my $parse = $text;
2410         $text = "";
2411         pos($parse) = 0;
2412         while ($parse =~ m{\G(?s:.)*?
2413                 (?: (?<=\n\n) |
2414                     \A\n? |
2415                     (?<=:\n) |
2416                     (?:(?<=\n) # a list starts with one unordered marker line
2417                        (?=[ ]{0,$less_than_indent}$marker_ul[ ])) |
2418                     (?:(?<=\n) # or two ordered marker lines in a row
2419                        (?=[ ]{0,$less_than_indent}$marker_ol[ ].*\n\n?
2420                           [ ]{0,$less_than_indent}$marker_ol[ ])) |
2421                     (?:(?<=\n) # or any marker and a sublist marker
2422                        (?=[ ]{0,$less_than_indent}$marker_any[ ].*\n\n?
2423                           [ ]{$indent,$less_than_double_indent}$marker_any[ ]))
2424                 )
2425                 $whole_list
2426             }gmcx) {
2427             my @captures = ($1, $2, $3, $4);
2428             if ($-[1] > $-[0]) {
2429                 $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
2430             }
2431             $text .= &$list_item_sub(@captures);
2432         }
2433         $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
2434     }
2435
2436     return $text;
2437 }
2438
2439
2440 sub _ProcessListItems {
2441 #
2442 #   Process the contents of a single ordered or unordered list, splitting it
2443 #   into individual list items.
2444 #
2445
2446     my $list_type = shift;
2447     my $list_str = shift;
2448
2449     # The $g_list_level global keeps track of when we're inside a list.
2450     # Each time we enter a list, we increment it; when we leave a list,
2451     # we decrement. If it's zero, we're not in a list anymore.
2452     #
2453     # We do this because when we're not inside a list, we want to treat
2454     # something like this:
2455     #
2456     #   I recommend upgrading to version
2457     #   8. Oops, now this line is treated
2458     #   as a sub-list.
2459     #
2460     # As a single paragraph, despite the fact that the second line starts
2461     # with a digit-period-space sequence.
2462     #
2463     # Whereas when we're inside a list (or sub-list), that line will be
2464     # treated as the start of a sub-list. What a kludge, huh? This is
2465     # an aspect of Markdown's syntax that's hard to parse perfectly
2466     # without resorting to mind-reading. Perhaps the solution is to
2467     # change the syntax rules such that sub-lists must start with a
2468     # starting cardinal number; e.g. "1." or "a.".
2469
2470     $g_list_level++;
2471     my $idt = "\027" x $g_list_level;
2472     my $marker_kind = $list_type eq "ul" ? $marker_ul : $marker_ol;
2473     my $first_marker;
2474     my $first_marker_type;
2475     my $first_marker_num;
2476     my $last_marker;
2477     my $fancy;
2478     my $skipped;
2479     my $typechanged;
2480     my $next_num = 1;
2481
2482     # trim trailing blank lines:
2483     $list_str =~ s/\n{2,}\z/\n/;
2484
2485     my $result = "";
2486     my $oldpos = 0;
2487     pos($list_str) = 0;
2488     while ($list_str =~ m{\G            # start where we left off
2489         (\n+)?                          # leading line = $1
2490         (^[ ]*)                         # leading whitespace = $2
2491         ($marker_any) [ ] ([ ]*)        # list marker = $3 leading item space = $4
2492     }cgmx) {
2493         my $leading_line = $1;
2494         my $leading_space = $2;
2495         my $list_marker = $3;
2496         my $list_marker_len = length($list_marker);
2497         my $leading_item_space = $4;
2498         if ($-[0] > $oldpos) {
2499             $result .= substr($list_str, $oldpos, $-[0] - $oldpos); # Sort-of $`
2500             $oldpos = $-[0]; # point at start of this entire match
2501         }
2502         if (!defined($first_marker)) {
2503             $first_marker = $list_marker;
2504             $first_marker_type = _GetListMarkerType($list_type, $first_marker);
2505             if ($first_marker_type) {
2506                 (my $marker_val = $first_marker) =~ s/[.\)]$//;
2507                 $first_marker_num = _GetMarkerIntegerNum($first_marker_type, $marker_val);
2508                 $next_num = $first_marker_num;
2509                 $skipped = 1 if $next_num != 1;
2510             }
2511         } elsif ($list_marker !~ /$marker_kind/) {
2512             # Wrong marker kind, "fix up" the marker to a correct "lazy" marker
2513             # But keep the old length in $list_marker_len
2514             $list_marker = $last_marker;
2515         }
2516
2517         # Now grab the rest of this item's data upto but excluding the next
2518         # list marker at the SAME indent level, but sublists must be INCLUDED
2519
2520         my $item = "";
2521         while ($list_str =~ m{\G
2522             ((?:.+?)(?:\n{1,2}))        # list item text = $1
2523             (?= \n* (?: \z |            # end of string OR
2524                     (^[ ]*)             # leading whitespace = $2
2525                     ($marker_any)       # next list marker = $3
2526                     ([ ]+) ))           # one or more spaces after marker = $4
2527         }cgmxs) {
2528
2529             # If $3 has a left edge that is at the left edge of the previous
2530             # marker OR $3 has a right edge that is at the right edge of the
2531             # previous marker then we stop; otherwise we go on
2532
2533             $item .= substr($list_str, $-[0], $+[0] - $-[0]); # $&
2534             last if !defined($4) || length($2) == length($leading_space) ||
2535                 length($2) + length($3) == length($leading_space) + $list_marker_len;
2536             # move along, you're not the marker droid we're looking for...
2537             $item .= substr($list_str, $+[0], $+[4] - $+[0]);
2538             pos($list_str) = $+[4]; # ...move along over the marker droid
2539         }
2540         # Remember where we parked
2541         $oldpos = pos($list_str);
2542
2543         # Process the $list_marker $item
2544
2545         my $liatt = '';
2546         my $checkbox = '';
2547         my $incr = '';
2548
2549         if ($list_type eq "ul" && !$leading_item_space && $item =~ /^\[([ xX\xd7])\] +(.*)$/s) {
2550             my $checkmark = $1;
2551             $item = $2;
2552             my ($checkbox_class, $checkbox_val);
2553             if ($checkmark ne " ") {
2554                 ($checkbox_class, $checkbox_val) = ("checkbox-on", "x");
2555             } else {
2556                 ($checkbox_class, $checkbox_val) = ("checkbox-off", "&#160;");
2557             }
2558             $liatt = " class=\"$opt{style_prefix}$checkbox_class\"";
2559             $checkbox = "<span><span></span></span><span></span><span>[<tt>$checkbox_val</tt>]&#160;</span>";
2560         } else {
2561             my $list_marker_type;
2562             ($list_marker_type, $liatt) = _GetListItemTypeClass($list_type, $list_marker, $last_marker);
2563             if ($list_type eq "ol" && defined($first_marker)) {
2564                 my $styled = $fancy = 1 if $liatt && $list_marker =~ /\)$/;
2565                 my ($sfx, $dash) = ("", "");
2566                 ($sfx, $dash) = ("li", "-") if $styled;
2567                 if ($liatt =~ /lower/) {
2568                     $sfx .= "${dash}lc";
2569                 } elsif ($liatt =~ /upper/) {
2570                     $sfx .= "${dash}uc";
2571                 }
2572                 $sfx .= "-greek" if $liatt =~ /greek/;
2573                 $liatt = " class=\"$opt{style_prefix}$sfx\"" if $sfx;
2574                 $typechanged = 1 if $list_marker_type ne $first_marker_type;
2575                 (my $marker_val = $list_marker) =~ s/[.\)]$//;
2576                 my $marker_num = _GetMarkerIntegerNum($list_marker_type, $marker_val);
2577                 $marker_num = $next_num if $marker_num < $next_num;
2578                 $skipped = 1 if $next_num < $marker_num;
2579                 $incr = _IncrList($next_num, $marker_num, "incrlevel=$g_list_level");
2580                 $liatt = " value=\"$marker_num\"$liatt" if $fancy || $skipped;
2581                 $liatt = " type=\"$list_marker_type\"$liatt" if $styled || $typechanged;
2582                 $next_num = $marker_num + 1;
2583             }
2584         }
2585         $last_marker = $list_marker;
2586
2587         if ($leading_line or ($item =~ m/\n{2,}/)) {
2588             $item = _RunBlockGamut(_Outdent($item));
2589             $item =~ s{(</[OUou][Ll]>)\s*\z}{$1} and $item .= "\n$idt<span style=\"display:none\">&#160;</span>";
2590         }
2591         else {
2592             # Recursion for sub-lists:
2593             $item = _DoListsAndBlocks(_Outdent($item));
2594             chomp $item;
2595             $item = _RunSpanGamut($item);
2596         }
2597
2598         # Append to $result
2599         $result .= "$incr$idt<li$liatt>" . $checkbox . $item . "$idt</li>\n";
2600     }
2601     if ($fancy) {
2602         # remove "incrlevel=$g_list_level " parts
2603         $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr((?:-\d{1,2})?)">}
2604             {$idt<span class="$opt{style_prefix}ol-incr$1">}g;
2605     } else {
2606         # remove the $g_list_level incr spans entirely
2607         $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr(?:-\d{1,2})?"></span>\n}{}g;
2608         # remove the class="$opt{style_prefix}lc-greek" if first_marker is greek
2609         $result =~ s{(<li[^>]*?) class="$opt{style_prefix}lc-greek">}{$1>}g
2610             if defined($first_marker_type) && $first_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
2611     }
2612
2613     # Anything left over (similar to $') goes into result, but this should always be empty
2614     $result .= _RunBlockGamut(substr($list_str, pos($list_str))) if pos($list_str) < length($list_str);
2615
2616     $g_list_level--;
2617
2618     # After all that, if we only got an ordered list with a single item
2619     # and its first marker is a four-digit number >= 1492 and <= 2999
2620     # or an UPPERCASE letter, then pretend we didn't see any list at all.
2621
2622     if ($first_marker_type && $first_marker_num + 1 == $next_num) {
2623         if (($first_marker_type eq "1" && $first_marker_num >= 1492 && $first_marker_num <= 2999) ||
2624             ($first_marker_type eq "A" && !$fancy)) {
2625             return (undef, undef, undef);
2626         }
2627     }
2628
2629     return ($result, $first_marker, $fancy);
2630 }
2631
2632
2633 sub _DoCodeBlocks {
2634 #
2635 #   Process Markdown `<pre><code>` blocks.
2636 #
2637
2638     my $text = shift;
2639
2640     $text =~ s{
2641             (?:\n\n|\A\n?)
2642             (           # $1 = the code block -- one or more lines, starting with indent_width spaces
2643               (?:
2644                 (?:[ ]{$opt{indent_width}})  # Lines must start with indent_width of spaces
2645                 .*\n+
2646               )+
2647             )
2648             ((?=^[ ]{0,$opt{indent_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
2649         }{
2650             my $codeblock = $1;
2651
2652             $codeblock =~ s/\n\n\n/\n\n/g; # undo "paragraph for last list item" change
2653             $codeblock = _EncodeCode(_Outdent($codeblock));
2654             $codeblock =~ s/\A\n+//; # trim leading newlines
2655             $codeblock =~ s/\s+\z//; # trim trailing whitespace
2656
2657             my $result = "<div class=\"$opt{style_prefix}code\"><pre style=\"display:none\"></pre><pre><code>"
2658                 . $codeblock . "\n</code></pre></div>";
2659             my $key = block_id($result, 2);
2660             $g_code_blocks{$key} = $result;
2661             "\n\n" . $key . "\n\n";
2662         }egmx;
2663
2664     return $text;
2665 }
2666
2667
2668 sub _DoCodeSpans {
2669 #
2670 # * Backtick quotes are used for <code></code> spans.
2671 #
2672 # * You can use multiple backticks as the delimiters if you want to
2673 #   include literal backticks in the code span. So, this input:
2674 #
2675 #     Just type ``foo `bar` baz`` at the prompt.
2676 #
2677 #   Will translate to:
2678 #
2679 #     <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
2680 #
2681 #   There's no arbitrary limit to the number of backticks you
2682 #   can use as delimters. If you need three consecutive backticks
2683 #   in your code, use four for delimiters, etc.
2684 #
2685 # * You can use spaces to get literal backticks at the edges:
2686 #
2687 #     ... type `` `bar` `` ...
2688 #
2689 #   Turns to:
2690 #
2691 #     ... type <code>`bar`</code> ...
2692 #
2693
2694     my $text = shift;
2695
2696     $text =~ s@
2697             (`+)        # $1 = Opening run of `
2698             (.+?)       # $2 = The code block
2699             (?<!`)
2700             \1          # Matching closer
2701             (?!`)
2702         @
2703             my $c = "$2";
2704             $c =~ s/^[ ]+//g; # leading whitespace
2705             $c =~ s/[ ]+$//g; # trailing whitespace
2706             $c = _EncodeCode($c);
2707             "<code>$c</code>";
2708         @egsx;
2709
2710     return $text;
2711 }
2712
2713
2714 sub _EncodeCode {
2715 #
2716 # Encode/escape certain characters inside Markdown code runs.
2717 # The point is that in code, these characters are literals,
2718 # and lose their special Markdown meanings.
2719 #
2720     local $_ = shift;
2721
2722     # Encode all ampersands; HTML entities are not
2723     # entities within a Markdown code span.
2724     s/&/&amp;/g;
2725
2726     # Encode $'s, but only if we're running under Blosxom.
2727     # (Blosxom interpolates Perl variables in article bodies.)
2728     s/\$/&#036;/g if $_haveBX;
2729
2730     # Do the angle bracket song and dance:
2731     s! <  !&lt;!gx;
2732     s! >  !&gt;!gx;
2733
2734     # Now, escape characters that are magic in Markdown:
2735     s!([*_~{}\[\]\\])!$g_escape_table{$1}!g;
2736
2737     return $_;
2738 }
2739
2740
2741 sub _DoItalicsAndBoldAndStrike {
2742     my $text = shift;
2743
2744     my $doital1 = sub {
2745         my $text = shift;
2746         $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
2747             {<em>$1</em>}gsx;
2748         # We've got to encode any of these remaining to
2749         # avoid conflicting with other italics and bold.
2750         $text =~ s!([*])!$g_escape_table{$1}!g;
2751         $text;
2752     };
2753     my $doital2 = sub {
2754         my $text = shift;
2755         $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
2756             {<em>$1</em>}gsx;
2757         # We've got to encode any of these remaining to
2758         # avoid conflicting with other italics and bold.
2759         $text =~ s!([_])!$g_escape_table{$1}!g;
2760         $text;
2761     };
2762
2763     # <strong> must go first:
2764     $text =~ s{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }
2765         {"<strong>".&$doital1($1)."</strong>"}gsex;
2766     $text =~ s{ (?<!\w) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\w) }
2767         {"<strong>".&$doital2($1)."</strong>"}gsex;
2768
2769     $text =~ s{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ }
2770         {<strike>$1</strike>}gsx;
2771
2772     $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
2773         {<em>$1</em>}gsx;
2774     $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
2775         {<em>$1</em>}gsx;
2776
2777     return $text;
2778 }
2779
2780
2781 sub _DoBlockQuotes {
2782     my $text = shift;
2783
2784     $text =~ s{
2785           (                     # Wrap whole match in $1
2786             (
2787               ^[ ]*>[ ]?        # '>' at the start of a line
2788                 .*\n            # rest of the first line
2789               (.+\n)*           # subsequent consecutive lines
2790               \n*               # blanks
2791             )+
2792           )
2793         }{
2794             my $bq = $1;
2795             $bq =~ s/^[ ]*>[ ]?//gm; # trim one level of quoting
2796             $bq =~ s/^[ ]+$//mg;         # trim whitespace-only lines
2797             $bq = _RunBlockGamut($bq);   # recurse
2798
2799             $bq =~ s/^/\027/mg;
2800             "<blockquote>\n$bq\n</blockquote>\n\n";
2801         }egmx;
2802
2803
2804     return $text;
2805 }
2806
2807
2808 my ($LEAD, $TRAIL, $LEADBAR, $LEADSP, $COLPL, $SEP);
2809 BEGIN {
2810     $LEAD = qr/(?>[ ]*(?:\|[ ]*)?)/o;
2811     $TRAIL = qr/[ ]*(?<!\\)\|[ ]*/o;
2812     $LEADBAR = qr/(?>[ ]*\|[ ]*)/o;
2813     $LEADSP = qr/(?>[ ]*)/o;
2814     $COLPL = qr/(?:[^\n|\\]|\\(?:(?>[^\n])|(?=\n|$)))+/o;
2815     $SEP = qr/[ ]*:?-+:?[ ]*/o;
2816 }
2817
2818 sub _DoTables {
2819     my $text = shift;
2820
2821     $text =~ s{
2822         (                               # Wrap whole thing to avoid $&
2823          (?: (?<=\n\n) | \A\n? )        # Preceded by blank line or beginning of string
2824          ^(                             # Header line
2825             $LEADBAR \| [^\n]* |
2826             $LEADBAR $COLPL [^\n]* |
2827             $LEADSP $COLPL \| [^\n]*
2828           )\n
2829           (                             # Separator line
2830             $LEADBAR $SEP (?: \| $SEP )* (?: \| [ ]*)? |
2831             $SEP (?: \| $SEP )+ (?: \| [ ]*)? |
2832             $SEP \| [ ]*
2833           )\n
2834           ((?:                          # Rows (0+)
2835             $LEADBAR \| [^\n]* \n |
2836             $LEADBAR $COLPL [^\n]* \n |
2837             $LEADSP $COLPL \| [^\n]* \n
2838           )*)
2839         )
2840     } {
2841         my ($w, $h, $s, $rows) = ($1, $2, $3, $4);
2842         my @heads = _SplitTableRow($h);
2843         my @seps = _SplitTableRow($s);
2844         if (@heads == @seps) {
2845             my @align = map {
2846                 if (/^:-+:$/) {" align=\"center\""}
2847                 elsif (/^:/) {" align=\"left\""}
2848                 elsif (/:$/) {" align=\"right\""}
2849                 else {""}
2850             } @seps;
2851             my $nohdr = "";
2852             $nohdr = " $opt{style_prefix}table-nohdr" if join("", @heads) eq "";
2853             my $tab ="\n<table border=\"1\" cellspacing=\"0\" cellpadding=\"2\" class=\"$opt{style_prefix}table$nohdr\">\n";
2854             $tab .=
2855                 "  <tr class=\"$opt{style_prefix}row-hdr\">" . _MakeTableRow("th", \@align, @heads) . "</tr>\n"
2856                 unless $nohdr;
2857             my $cnt = 0;
2858             my @classes = ("class=\"$opt{style_prefix}row-even\"", "class=\"$opt{style_prefix}row-odd\"");
2859             $tab .= "  <tr " . $classes[++$cnt % 2] . ">" . _MakeTableRow("td", \@align, @$_) . "</tr>\n"
2860                     foreach (_SplitMergeRows($rows));
2861             $tab .= "</table>\n\n";
2862         } else {
2863             $w;
2864         }
2865     }egmx;
2866
2867     return $text;
2868 }
2869
2870
2871 sub _SplitMergeRows {
2872     my @rows = ();
2873     my ($mergeprev, $mergenext) = (0,0);
2874     foreach (split(/\n/, $_[0])) {
2875         $mergeprev = $mergenext;
2876         $mergenext = 0;
2877         my @cols = _SplitTableRow($_);
2878         if (_endswithbareslash($cols[$#cols])) {
2879             my $last = $cols[$#cols];
2880             substr($last, -1, 1) = "";
2881             $last =~ s/[ ]+$//;
2882             $cols[$#cols] = $last;
2883             $mergenext = 1;
2884         }
2885         if ($mergeprev) {
2886             for (my $i = 0; $i <= $#cols; ++$i) {
2887                 my $cell = $rows[$#rows]->[$i];
2888                 defined($cell) or $cell = "";
2889                 $rows[$#rows]->[$i] = _MergeCells($cell, $cols[$i]);
2890             }
2891         } else {
2892             push(@rows, [@cols]);
2893         }
2894     }
2895     return @rows;
2896 }
2897
2898
2899 sub _endswithbareslash {
2900     return 0 unless substr($_[0], -1, 1) eq "\\";
2901     my @parts = split(/\\\\/, $_[0], -1);
2902     return substr($parts[$#parts], -1, 1) eq "\\";
2903 }
2904
2905
2906 sub _MergeCells {
2907     my ($c1, $c2) = @_;
2908     return $c1 if $c2 eq "";
2909     return $c2 if $c1 eq "";
2910     return $c1 . " " . $c2;
2911 }
2912
2913
2914 sub _SplitTableRow {
2915     my $row = shift;
2916     $row =~ s/^$LEAD//;
2917     $row =~ s/$TRAIL$//;
2918     $row =~ s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
2919     $row =~ s!\\\|!$g_escape_table{'|'}!go; # Then do \|
2920     my @elems = map {
2921       s!$g_escape_table{'|'}!|!go;
2922       s!$g_escape_table{'\\'}!\\\\!go;
2923       s/^[ ]+//;
2924       s/[ ]+$//;
2925       $_;
2926     } split(/[ ]*\|[ ]*/, $row, -1);
2927     @elems or push(@elems, "");
2928     return @elems;
2929 }
2930
2931
2932 sub _MakeTableRow {
2933     my $etype = shift;
2934     my $align = shift;
2935     my $row = "";
2936     for (my $i = 0; $i < @$align; ++$i) {
2937         my $data = $_[$i];
2938         defined($data) or $data = "";
2939         $row .= "<" . $etype . $$align[$i] . ">" .
2940             _RunSpanGamut($data) . "</" . $etype . ">";
2941     }
2942     return $row;
2943 }
2944
2945
2946 sub _FormParagraphs {
2947 #
2948 # Params:
2949 #   $text - string to process with html <p> tags
2950 #
2951     my ($text, $anchors) = @_;
2952
2953     # Strip leading and trailing lines:
2954     $text =~ s/\A\n+//;
2955     $text =~ s/\n+\z//;
2956
2957     my @grafs = split(/\n{2,}/, $text);
2958
2959     #
2960     # Wrap <p> tags.
2961     #
2962     foreach (@grafs) {
2963         unless (defined($g_html_blocks{$_}) || defined($g_code_blocks{$_})) {
2964             $_ = _RunSpanGamut($_);
2965             s/^([ ]*)/$g_start_p/;
2966             $_ .= $g_close_p;
2967         }
2968     }
2969
2970     #
2971     # Strip standalone XML comments if requested
2972     #
2973     if ($anchors && $opt{stripcomments} && @g_xml_comments) {
2974         my %xml_comment = ();
2975         $xml_comment{$_} = 1 foreach @g_xml_comments;
2976         my @grafs2 = ();
2977         do { push(@grafs2, $_) unless $xml_comment{$_} } foreach @grafs;
2978         @grafs = @grafs2;
2979     }
2980
2981     #
2982     # Unhashify HTML blocks
2983     #
2984     foreach (@grafs) {
2985         if (defined( $g_html_blocks{$_} )) {
2986             $_ = $g_html_blocks{$_};
2987         }
2988     }
2989
2990     return join "\n\n", @grafs;
2991 }
2992
2993
2994 # %ok_tag_name declared previously
2995 my $g_possible_tag_name;
2996 BEGIN {
2997     # note: length("blockquote") == 10
2998     $g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6]|\020)/o;
2999     %ok_tag_name = map({$_ => 1} "\20", qw(
3000         a abbr acronym address area
3001         b basefont bdo big blockquote br
3002         caption center cite code col colgroup
3003         dd del dfn div dl dt
3004         em
3005         font
3006         h1 h2 h3 h4 h5 h6 hr
3007         i img ins
3008         kbd
3009         li
3010         map
3011         ol
3012         p pre
3013         q
3014         s samp small span strike strong sub sup
3015         table tbody td tfoot th thead tr tt
3016         u ul
3017         var
3018     ));
3019     $ok_tag_name{$_} = 0 foreach (qw(
3020         dir menu
3021     ));
3022 }
3023
3024
3025 sub _SetAllowedTag {
3026     my ($tag, $forbid) = @_;
3027     $ok_tag_name{$tag} = $forbid ? 0 : 1
3028         if defined($tag) && exists($ok_tag_name{$tag});
3029 }
3030
3031
3032 # Encode leading '<' of any non-tags
3033 # However, "<?", "<!" and "<$" are passed through (legacy on that "<$" thing)
3034 sub _DoTag {
3035     my $tag = shift;
3036     return $tag if $tag =~ /^<[?\$!]/;
3037     if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3038         $ok_tag_name{lc($1)}) {
3039
3040         return _ProcessURLTag("href", $tag, 1) if $tag =~ /^<a\s/i;
3041         return _ProcessURLTag("src", $tag) if $tag =~ /^<img\s/i;
3042         return $tag;
3043     }
3044     $tag =~ s/^</&lt;/;
3045     return $tag;
3046 }
3047
3048 # Strip out all tags that _DoTag would match
3049 sub _StripTags {
3050     my $text = shift;
3051     my $_StripTag = sub {
3052         my $tag = shift;
3053         return $tag if $tag =~ /^<[?\$!]/;
3054         if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3055             $ok_tag_name{lc($1)}) {
3056
3057             return ""; # strip it out
3058         }
3059         return $tag;
3060     };
3061     $text =~ s{(<[^>]*>)}{&$_StripTag($1)}ige;
3062     return $text;
3063 }
3064
3065 my %univatt;    # universally allowed attribute names
3066 my %tagatt;     # per-element allowed attribute names
3067 my %tagmt;      # empty element tags
3068 my %tagocl;     # non-empty elements with optional closing tag
3069 my %tagacl;     # which %tagocl an opening %tagocl will close
3070 my %tagblk;     # block elements
3071 my %taginl;     # inline markup tags which trigger an auto <p> reopen
3072 my %taga1p;     # open tags which require at least one attribute
3073 my %lcattval;   # names of attribute values to lowercase
3074 my %impatt;     # names of "implied" attributes
3075 BEGIN {
3076     %univatt = map({$_ => 1} qw(class dir id lang style title xml:lang));
3077     %tagatt = (
3078         'a' => { map({$_ => 1} qw(href name)) },
3079         'area' => { map({$_ => 1} qw(alt coords href nohref shape)) },
3080         'basefont' => { map({$_ => 1} qw(color face size)) },
3081         'br' => { map({$_ => 1} qw(clear)) },
3082         'caption' => { map({$_ => 1} qw(align)) },
3083         'col' => { map({$_ => 1} qw(align span width valign)) },
3084         'colgroup' => { map({$_ => 1} qw(align span width valign)) },
3085         'dir' => { map({$_ => 1} qw(compact)) },
3086         'div' => { map({$_ => 1} qw(align)) },
3087         'dl' => { map({$_ => 1} qw(compact)) },
3088         'font' => { map({$_ => 1} qw(color face size)) },
3089         'h1' => { map({$_ => 1} qw(align)) },
3090         'h2' => { map({$_ => 1} qw(align)) },
3091         'h3' => { map({$_ => 1} qw(align)) },
3092         'h4' => { map({$_ => 1} qw(align)) },
3093         'h5' => { map({$_ => 1} qw(align)) },
3094         'h6' => { map({$_ => 1} qw(align)) },
3095         'hr' => { map({$_ => 1} qw(align noshade size width)) },
3096         # NO server-side image maps, therefore NOT ismap !
3097         'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) },
3098         'li' => { map({$_ => 1} qw(compact type value)) },
3099         'map' => { map({$_ => 1} qw(name)) },
3100         'menu' => { map({$_ => 1} qw(compact)) },
3101         'ol' => { map({$_ => 1} qw(compact start type)) },
3102         'p' => { map({$_ => 1} qw(align)) },
3103         'pre' => { map({$_ => 1} qw(width)) },
3104         'table' => { map({$_ => 1} qw(align border cellpadding cellspacing summary width)) },
3105         'tbody' => { map({$_ => 1} qw(align valign)) },
3106         'tfoot' => { map({$_ => 1} qw(align valign)) },
3107         'thead' => { map({$_ => 1} qw(align valign)) },
3108         'td' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) },
3109         'th' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) },
3110         'tr' => { map({$_ => 1} qw(align valign)) },
3111         'ul' => { map({$_ => 1} qw(compact type)) }
3112     );
3113     %tagmt = map({$_ => 1} qw(area basefont br col hr img));
3114     %tagocl = map({$_ => 1} qw(colgroup dd dt li p tbody td tfoot th thead tr));
3115     %tagacl = (
3116         'colgroup' => \%tagocl,
3117         'dd' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3118         'dt' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3119         'li' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3120         'tbody' => \%tagocl,
3121         'td' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3122         'tfoot' => \%tagocl,
3123         'th' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3124         'thead' => \%tagocl,
3125         'tr' => { map({$_ => 1} qw(colgroup dd dt li p td th tr)) },
3126     );
3127     %tagblk = map({$_ => 1} qw(address blockquote center div dl h1 h2 h3 h4 h5 h6 hr ol p pre table ul));
3128     %taginl = map({$_ => 1} qw(a abbr acronym b basefont bdo big br cite code dfn em font i
3129                                img kbd map q s samp small span strike strong sub sup tt u var));
3130     %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap));
3131     %lcattval = map({$_ => 1} qw(
3132         align border cellpadding cellspacing checked clear color colspan
3133         compact coords height hspace ismap nohref noshade nowrap rowspan size
3134         span shape valign vspace width
3135     ));
3136     %taga1p = map({$_ => 1} qw(a area bdo img map));
3137 }
3138
3139
3140 # _SanitizeTags
3141 #
3142 # Inspect all '<'...'>' tags in the input and HTML encode those things
3143 # that cannot possibly be tags and at the same time sanitize them.
3144 #
3145 # $1 => text to process
3146 # <= sanitized text
3147 sub _SanitizeTags {
3148     my ($text, $validate, $htmlauto) = @_;
3149     $text =~ s/\s+$//;
3150     $text ne "" or return "";
3151     my @stack = ();
3152     my $ans = "";
3153     my $end = length($text);
3154     pos($text) = 0;
3155     my ($autoclose, $autoclopen);
3156     my $lastmt = "";
3157     my $reopenp = 0;
3158     $autoclose = $htmlauto ? sub {
3159         my $s = $_[0] || "";
3160         while (@stack &&
3161                ($stack[$#stack]->[0] ne $s || $_[1] && !$stack[$#stack]->[2]) &&
3162                $tagocl{$stack[$#stack]->[0]}) {
3163             $ans .= "</" . $stack[$#stack]->[0] . ">";
3164             pop(@stack);
3165         }
3166     } : sub {} if $validate;
3167     $autoclopen = $htmlauto ? sub {
3168         my $s = $_[0] || "";
3169         my $c;
3170         if ($tagblk{$s}) {$c = {p=>1}}
3171         elsif ($tagocl{$s}) {$c = $tagacl{$s}}
3172         else {return}
3173         my $clp = 0;
3174         while (@stack && $c->{$stack[$#stack]->[0]}) {
3175             $clp = 0;
3176             if ($stack[$#stack]->[2] &&
3177                 $stack[$#stack]->[1]+3 eq $_[1]) {
3178                 $ans .= "</\20>";
3179             } else {
3180                 $ans .= "</" . $stack[$#stack]->[0] . ">";
3181             }
3182             if ($stack[$#stack]->[2]) {
3183                 $stack[$#stack]->[0] = "\20";
3184             } else {
3185                 $clp = $s ne "p" && $stack[$#stack]->[0] eq "p";
3186                 pop(@stack);
3187             }
3188         }
3189         $clp;
3190     } : sub {} if $validate;
3191     while (pos($text) < $end) {
3192         if ($text =~ /\G(\s+)/gc) {
3193             $ans .= $1;
3194             next;
3195         }
3196         if ($text =~ /\G([^<]+)/gc) {
3197             if ($validate && @stack && $stack[$#stack]->[0] eq "\20") {
3198                 push(@stack,["p",pos($text)-length($1)]);
3199                 $reopenp = 0;
3200                 $ans .= "<p>";
3201             }
3202             $reopenp && do {
3203                 push(@stack,["p",pos($text)-length($1)]);
3204                 $reopenp = 0;
3205                 $ans .= "<p>";
3206             };
3207             $ans .= _EncodeAmps($1);
3208             $lastmt = "";
3209             next;
3210         }
3211         my $tstart = pos($text);
3212         if ($text =~ /\G(<!--(?:[^-]|(?:-(?!-)))*-->)/gc) {
3213             # pass "comments" through unless stripping them
3214             if ($opt{stripcomments}) {
3215                 # strip any trailing whitespace + \n after comment if present
3216                 $text =~ /\G[ \t]*\n/gc;
3217             } else {
3218                 # pass the "comment" on through
3219                 $ans .= $1;
3220             }
3221             next;
3222         }
3223         if ($text =~ /\G(<[^>]*>)/gc) {
3224             my $tag = $1;
3225             my $tt;
3226             if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} ||
3227                  $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3228                 $ok_tag_name{$tt=lc($1)})
3229             {
3230                 my ($stag, $styp, $autocloseflag) = _Sanitize($tag);
3231                 if ($styp == 2 && $lastmt eq $tt) {
3232                     $lastmt = "";
3233                     next;
3234                 }
3235                 $lastmt = $styp == -3 ? $tt : "";
3236                 $tt = "p" if $autocloseflag;
3237                 if ($validate && $styp) {
3238                     my $clp = &$autoclopen($tt, $tstart) if $styp != 2;
3239                     if ($styp == 1) {
3240                         $reopenp && $taginl{$tt} and do {
3241                             push(@stack,["p",$tstart]);
3242                             $ans .= "<p>";
3243                         };
3244                         push(@stack,[$tt,$tstart,$autocloseflag,$clp]);
3245                         $reopenp = 0;
3246                     } elsif ($styp == 2) {
3247                         $reopenp && ($tt eq "p" || $tt eq "\20") and do {
3248                             $reopenp = 0;
3249                             next;
3250                         };
3251                         &$autoclose($tt, $autocloseflag);
3252                         my $mtstkchk = sub {
3253                             !@stack and _xmlfail("closing tag $tt without matching open at " .
3254                                 _linecol($tstart, $text));
3255                         };
3256                         &$mtstkchk;
3257                         if ($autocloseflag && $stack[$#stack]->[0] eq "\20") {
3258                             pop(@stack);
3259                             $stag = "";
3260                         } elsif ($stack[$#stack]->[0] eq $tt) {
3261                             $stack[$#stack]->[3] and $reopenp = 1;
3262                             pop(@stack);
3263                         } else {
3264                             pop(@stack) while @stack && $stack[$#stack]->[0] eq "\20";
3265                             &$mtstkchk;
3266                             my @i = @{$stack[$#stack]};
3267                             _xmlfail("opening tag $i[0] at " . _linecol($i[1], $text) .
3268                                 " mismatch with closing tag $tt at " . _linecol($tstart, $text));
3269                         }
3270                     }
3271                 }
3272                 $ans .= $stag;
3273                 next;
3274             } else {
3275                 $tag =~ s/^</&lt;/;
3276                 $ans .= _EncodeAmps($tag);
3277                 $lastmt = "";
3278                 next;
3279             }
3280         }
3281         # can only get here if "\G" char is an unmatched "<"
3282         pos($text) += 1;
3283         $ans .= "&lt;";
3284         $lastmt = "";
3285     }
3286     &$autoclose if $validate;
3287     if ($validate && @stack) {
3288         my @errs;
3289         my $j;
3290         for ($j = 0; $j <= $#stack; ++$j) {
3291                 my @i = @{$stack[$j]};
3292                 next if $i[0] eq "\20";
3293                 unshift(@errs, "opening tag $i[0] without matching close at " .
3294                             _linecol($i[1], $text));
3295         }
3296         _xmlfail(@errs) unless !@errs;
3297     }
3298     # Remove any unwanted extra leading <p></p> sections
3299     $ans =~ s{<p></\020>}{}gs if $validate;
3300
3301     return $ans."\n";
3302 }
3303
3304
3305 sub _linecol {
3306         my ($pos, $txt) = @_;
3307         pos($txt) = 0;
3308         my ($l, $p);
3309         $l = 1;
3310         ++$l while ($p = pos($txt)), $txt =~ /\G[^\n]*\n/gc && pos($txt) <= $pos;
3311         return "line $l col " . (1 + ($pos - $p));
3312 }
3313
3314
3315 sub _xmlfail {
3316         die join("", map("$_\n", @_));
3317 }
3318
3319
3320 sub _Sanitize {
3321     my $tag = shift;
3322     my $seenatt = {};
3323     if ($tag =~ m{^</}) {
3324         my $autocloseflag = undef;
3325         $autocloseflag = 1, $tag="</p>" if $tag eq "</\20>";
3326         return (lc($tag),2,$autocloseflag);
3327     }
3328     if ($tag =~ /^<([^\s<\/>]+)\s+/gs) {
3329         my $tt = lc($1);
3330         my $autocloseflag = undef;
3331         $autocloseflag = 1, $tt="p" if $tt eq "\20";
3332         my $out = "<" . $tt . " ";
3333         my $ok = $tagatt{$tt};
3334         ref($ok) eq "HASH" or $ok = {};
3335         my $atc = 0;
3336         while ($tag =~ /\G\s*([^\s\042\047<\/>=]+)((?>=)|\s*)/gcs) {
3337             my ($a,$s) = ($1, $2);
3338             if ($s eq "" && substr($tag, pos($tag), 1) =~ /^[\042\047]/) {
3339                 # pretend the "=" sign wasn't overlooked
3340                 $s = "=";
3341             }
3342             if (substr($s,0,1) ne "=") {
3343                 # it's one of "those" attributes (e.g. compact) or not
3344                 # _SanitizeAtt will fix it up if it is
3345                 $out .= _SanitizeAtt($a, '""', $ok, $seenatt, $tt);
3346                 ++$atc;
3347                 next;
3348             }
3349             if ($tag =~ /\G([\042\047])((?:(?!\1)(?!<).)*)\1\s*/gcs) {
3350                 $out .= _SanitizeAtt($a, $1.$2.$1, $ok, $seenatt, $tt);
3351                 ++$atc;
3352                 next;
3353             }
3354             if ($tag =~ /\G([\042\047])((?:(?!\1)(?![<>])(?![\/][>]).)*)/gcs) {
3355                 # what to do what to do what to do
3356                 # trim trailing \s+ and magically add the missing quote
3357                 my ($q, $v) = ($1, $2);
3358                 $v =~ s/\s+$//;
3359                 $out .= _SanitizeAtt($a, $q.$v.$q, $ok, $seenatt, $tt);
3360                 ++$atc;
3361                 next;
3362             }
3363             if ($tag =~ /\G([^\s<\/>]+)\s*/gcs) {
3364                 # auto quote it
3365                 my $v = $1;
3366                 $v =~ s/\042/&quot;/go;
3367                 $out .= _SanitizeAtt($a, '"'.$v.'"', $ok, $seenatt, $tt);
3368                 ++$atc;
3369                 next;
3370             }
3371             # give it an empty value
3372             $out .= _SanitizeAtt($a, '""', $ok, $seenatt, $tt);
3373             ++$atc;
3374         }
3375         my $sfx = substr($tag, pos($tag));
3376         $out =~ s/\s+$//;
3377         my $typ = 1;
3378         if ($tagmt{$tt}) {
3379             $typ = ($tag =~ m,/>$,) ? 3 : -3;
3380             $out .= $opt{empty_element_suffix};
3381             return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3382         } else {
3383             if ($tag =~ m,/>$,) {
3384                 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3385                 $typ = 3;
3386             } else {
3387                 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3388             }
3389             $out .= ">";
3390             $out .= "</$tt>" if $typ == 3;
3391         }
3392         return ($out,$typ,$autocloseflag);
3393     } elsif ($tag =~ /^<([^\s<\/>]+)/s) {
3394         my $tt = lc($1);
3395         return ("&lt;" . substr($tag,1), 0) if $taga1p{$tt};
3396         if ($tagmt{$tt}) {
3397             return ("<" . $tt . $opt{empty_element_suffix}, 3);
3398         } elsif ($tag =~ m,/>$,) {
3399             return ("<" . $tt . "></" . $tt . ">", 3);
3400         } else {
3401             return ("<" . $tt . ">", 1) unless $tt eq "\20";
3402             return ("<p>", 1, 1);
3403         }
3404     }
3405     return (lc($tag),0);
3406 }
3407
3408
3409 sub _SanitizeAtt {
3410     my $att = lc($_[0]);
3411     return "" unless $att =~ /^[_a-z:][_a-z:0-9.-]*$/; # no weirdo char att names
3412     return "" unless $univatt{$att} || $_[2]->{$att};
3413     return "" if $_[3]->{$att}; # no repeats
3414     $_[3]->{$att} = 1;
3415     $impatt{$att} and return $att."=".'"'.$att.'"';
3416     if ($lcattval{$att}) {
3417         return $att."="._SanitizeAttValue(lc($_[1]))." ";
3418     } else {
3419         my $satt = _SanitizeAttValue($_[1]);
3420         if (ref($opt{urlfunc}) eq 'CODE' &&
3421             (($_[4] eq "a" && $att eq "href") ||
3422              ($_[4] eq "img" && $att eq "src")) ) {
3423                 my ($lq,$v,$rq);
3424                 $lq = substr($satt, 0, 1);
3425                 $rq = substr($satt, -1, 1);
3426                 $v = unescapeXML(substr($satt, 1, length($satt)-2));
3427                 my ($uhost, $upath, $uq, $uf) = SplitURL($v);
3428                 $v = &{$opt{urlfunc}}($v, \%opt, $_[4], $uhost, $upath, $uq, $uf);
3429                 $satt = $lq . escapeXML($v) . $rq;
3430         }
3431         return $att."=".$satt." ";
3432     }
3433 }
3434
3435
3436 sub _SanitizeAttValue {
3437     my $v = shift;
3438     if ($v =~ /^([\042\047])(.*?)\1$/) {
3439         return $1.escapeXML($2).$1;
3440     } else {
3441         return '"'.escapeXML($v).'"';
3442     }
3443 }
3444
3445
3446 sub _ProcessURLTag {
3447     my ($att, $tag, $dofrag) = @_;
3448
3449     $att = lc($att) . "=";
3450     if ($tag =~ /^(<[^\s>]+\s+)/g) {
3451         my $out = $1;
3452         while ($tag =~ /\G([^\s\042\047<\/>=]+=)([\042\047])((?:(?!\2)(?!<).)*)(\2\s*)/gcs) {
3453             my ($p, $q, $v, $s) = ($1, $2, $3, $4);
3454             if (lc($p) eq $att && $v ne "") {
3455                 if ($dofrag && $v =~ m"^#") {
3456                     $v = _FindFragmentMatch($v);
3457                     my $bpcr;
3458                     if (ref($bpcr = $opt{base_prefix}) eq 'CODE') {
3459                         $v = "\2\3" . &$bpcr($v);
3460                     }
3461                 } else {
3462                     $v = _PrefixURL($v);
3463                 }
3464                 $v = _EncodeAttText($v);
3465             }
3466             $out .= $p . $q . $v . $s;
3467         }
3468         $out .= substr($tag, pos($tag));
3469         substr($out,0,1) = $g_escape_table{'<'};
3470         substr($out,-1,1) = $g_escape_table{'>'};
3471         return $out;
3472     }
3473
3474     return $tag;
3475 }
3476
3477
3478 my $oops_entities;
3479 BEGIN { $oops_entities = qr/(?:lt|gt|amp|quot|apos|nbsp)/io; }
3480
3481
3482 # $_[0] => the value to XML escape
3483 # returns the XML escaped value
3484 # Encodes the five required entites (amp,lt,gt,quot,apos)
3485 # while preserving any pre-existing entities which means that
3486 # calling this repeatedly on already-escaped text should return
3487 # it unchanged (i.e. it's idempotent).
3488 #
3489 sub escapeXML {
3490     my $text = shift;
3491
3492     # Treat these accidents as though they had the needed ';'
3493     $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3494
3495     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3496     #   http://bumppo.net/projects/amputator/
3497     $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
3498
3499     # Remaining entities now
3500     $text =~ s/\042/&quot;/g;
3501     $text =~ s/\047/&#39;/g; # Some older browsers do not grok &apos;
3502     $text =~ s/</&lt;/g;
3503     $text =~ s/>/&gt;/g;
3504
3505     return $text;
3506 }
3507
3508
3509 # $_[0] => value to be unescaped
3510 # returns unescaped value
3511 # The five required XML entities (amp,lt,gt,quot,apos) plus nbsp
3512 # are decoded as well as decimal (#d+) and hexadecimal (#xh+).
3513 #
3514 # While the escapeXML function tries to be idempotent when presented
3515 # with an already-escaped string, this function is NOT necessarily
3516 # idempotent when presented with an already decoded string unless it's
3517 # been decoded to the point there are no more recognizable entities left.
3518 # In other words given a string such as:
3519 #
3520 #   &amp;amp;amp;amp;
3521 #
3522 # Each call will only decode one layer of escaping and it will take four
3523 # successive calls to finally end up with just "&".
3524 #
3525 sub unescapeXML {
3526     my $text = shift;
3527
3528     # Treat these accidents as though they had the needed ';'
3529     $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3530
3531     $text =~ s/&[qQ][uU][oO][tT];/\042/gso;
3532     $text =~ s/&[aA][pP][oO][sS];/\047/gso;
3533     $text =~ s/&[gG][tT];/>/gso;
3534     $text =~ s/&[lL][tT];/</gso;
3535     $text =~ s/&[nN][bB][sS][pP];/&#160;/gso;
3536     $text =~ s{&([aA][mM][pP]|\#\d+|\#x[0-9a-fA-F]+);}{
3537         local $_=$1;
3538         lc($_) eq 'amp' ? '&' :
3539         /^#(\d+)$/ ? chr($1) :
3540         /^#[xX](.*)$/ ? chr(hex($1)) :
3541         $_
3542     }gsex;
3543
3544     return $text;
3545 }
3546
3547
3548 # $_[0] => the input URL to split
3549 # $_[1] => if a true value, call unescapeXML before splitting
3550 # returns array:
3551 #  [0] => scheme, name+password, host, port ("" if not present)
3552 #  [1] => path in url (starts with "/" if absolute otherwise relative)
3553 #  [2] => query string ("" if not present otherwise starts with "?")
3554 #  [3] => fragment ("" if not present otherwise starts with "#")
3555 # The returned value recovers the (possibly unescapeXML'd) input
3556 # string by simply concatenating the returned array elements.
3557 #
3558 sub SplitURL {
3559         my ($url, $unesc) = @_;
3560         $unesc and $url = unescapeXML($url);
3561         my ($sh, $p, $q, $f) = ("", "", "", "");
3562         if ($url =~ m{^([A-Za-z][A-Za-z0-9.+-]*:)(//.*)$}os) {
3563                 $sh = $1;
3564                 $url = $2;
3565         }
3566         if ($url =~ m{^(//[^/?#]*)((?:[/?#].*)?)$}os) {
3567                 $sh .= $1;
3568                 $url = $2;
3569         }
3570         ($p, $q, $f) = $url =~ m{^([^?#]*)((?:[?][^#]*)?)((?:[#].*)?)$}os;
3571         return ($sh, $p, $q, $f);
3572 }
3573
3574
3575 sub _EncodeAmps {
3576     my $text = shift;
3577
3578     # Treat these accidents as though they had the needed ';'
3579     $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3580
3581     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3582     #   http://bumppo.net/projects/amputator/
3583     $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
3584
3585     return $text;
3586 }
3587
3588
3589 sub _EncodeAmpsAndAngles {
3590 # Smart processing for ampersands and angle brackets that need to be encoded.
3591
3592     my $text = shift;
3593
3594     # Treat these accidents as though they had the needed ';'
3595     $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3596
3597     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3598     #   http://bumppo.net/projects/amputator/
3599     $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
3600
3601     # Encode naked <'s
3602     $text =~ s{<(?![\020a-z/?\$!])}{&lt;}gi;
3603     $text =~ s{<(?=[^>]*$)}{&lt;}g;
3604
3605     # Encode <'s that cannot possibly be a start or end tag
3606     $text =~ s{(<[^>]*>)}{_DoTag($1)}ige;
3607
3608     return $text;
3609 }
3610
3611
3612 sub _EncodeBackslashEscapes {
3613 #
3614 # Parameter: String.
3615 # Returns:   String after processing the following backslash escape sequences.
3616 #
3617     local $_ = shift;
3618
3619     s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
3620     s{\\([`*_~{}\[\]()>#+\-.!`])}{$g_escape_table{$1}}g;
3621
3622     return $_;
3623 }
3624
3625
3626 sub _DoAutoLinks {
3627     local $_ = shift;
3628
3629     s{<((https?|ftps?):[^'\042>\s]+)>(?!\s*</a>)}{_MakeATag($1, "&lt;".$1."&gt;")}gise;
3630
3631     # Email addresses: <address@domain.foo>
3632     s{
3633         <
3634         (?:mailto:)?
3635         (
3636             [-.\w]+
3637             \@
3638             [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
3639         )
3640         >
3641     }{
3642         _EncodeEmailAddress(_UnescapeSpecialChars($1), "&#x3c;", "&#62;");
3643     }egix;
3644
3645     # (kjm) I don't do "x" patterns
3646     s{(?:^|(?<=\s))((?:https?|ftps?)://(?:[-a-zA-Z0-9./?\&\%=_~!*;:\@+\$,\x23](?:(?<![.,:;])|(?=[^\s])))+)}
3647      {_MakeATag($1, $1)}soge;
3648     s{(?<![][])(?<!\] )\[RFC( ?)([0-9]{1,5})\](?![][])(?! \[)}
3649      {"["._MakeATag("https://tools.ietf.org/html/rfc$2", "RFC$1$2", "RFC $2")."]"}soge;
3650
3651     return $_;
3652 }
3653
3654
3655 sub _EncodeEmailAddress {
3656 #
3657 # Input: an email address, e.g. "foo@example.com"
3658 #
3659 # Output: the email address as a mailto link, with each character
3660 #         of the address encoded as either a decimal or hex entity, in
3661 #         the hopes of foiling most address harvesting spam bots. E.g.:
3662 #
3663 #   <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
3664 #   x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
3665 #   &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
3666 #
3667 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
3668 # mailing list: <https://tinyurl.com/yu7ue>
3669 #
3670
3671     my ($addr, $prefix, $suffix) = @_;
3672     $prefix = "" unless defined($prefix);
3673     $suffix = "" unless defined($suffix);
3674
3675     srand(unpack('N',md5($addr)));
3676     my @encode = (
3677         sub { '&#' .                 ord(shift)   . ';' },
3678         sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
3679         sub {                            shift          },
3680     );
3681
3682     $addr = "mailto:" . $addr;
3683
3684     $addr =~ s{(.)}{
3685         my $char = $1;
3686         if ( $char eq '@' ) {
3687             # this *must* be encoded. I insist.
3688             $char = $encode[int rand 1]->($char);
3689         } elsif ( $char ne ':' ) {
3690             # leave ':' alone (to spot mailto: later)
3691             my $r = rand;
3692             # roughly 10% raw, 45% hex, 45% dec
3693             $char = (
3694                 $r > .9   ?  $encode[2]->($char)  :
3695                 $r < .45  ?  $encode[1]->($char)  :
3696                              $encode[0]->($char)
3697             );
3698         }
3699         $char;
3700     }gex;
3701
3702     # strip the mailto: from the visible part
3703     (my $bareaddr = $addr) =~ s/^.+?://;
3704     $addr = _MakeATag("$addr", $prefix.$bareaddr.$suffix);
3705
3706     return $addr;
3707 }
3708
3709
3710 sub _UnescapeSpecialChars {
3711 #
3712 # Swap back in all the special characters we've hidden.
3713 #
3714     my $text = shift;
3715
3716     while( my($char, $hash) = each(%g_escape_table) ) {
3717         $text =~ s/$hash/$char/g;
3718     }
3719     return $text;
3720 }
3721
3722
3723 sub _TokenizeHTML {
3724 #
3725 # Parameter: String containing HTML markup.
3726 # Returns:   Reference to an array of the tokens comprising the input
3727 #            string. Each token is either a tag (possibly with nested,
3728 #            tags contained therein, such as <a href="<MTFoo>">, or a
3729 #            run of text between tags. Each element of the array is a
3730 #            two-element array; the first is either 'tag' or 'text';
3731 #            the second is the actual value.
3732 #
3733 #
3734 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
3735 #   <https://web.archive.org/web/20041215155739/http://bradchoate.com/weblog/2002/07/27/mtregex>
3736 #
3737
3738     my $str = shift;
3739     my $pos = 0;
3740     my $len = length $str;
3741     my @tokens;
3742
3743     my $depth = 6;
3744     my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
3745     my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
3746                    (?s: <\? .*? \?> ) |             # processing instruction
3747                    $nested_tags/iox;                # nested tags
3748
3749     while ($str =~ m/($match)/g) {
3750         my $whole_tag = $1;
3751         my $sec_start = pos $str;
3752         my $tag_start = $sec_start - length $whole_tag;
3753         if ($pos < $tag_start) {
3754             push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
3755         }
3756         push @tokens, ['tag', $whole_tag];
3757         $pos = pos $str;
3758     }
3759     push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
3760     \@tokens;
3761 }
3762
3763
3764 sub _Outdent {
3765 #
3766 # Remove one level of line-leading indent_width of spaces
3767 #
3768     my $text = shift;
3769
3770     $text =~ s/^ {1,$opt{indent_width}}//gm;
3771     return $text;
3772 }
3773
3774
3775 # _DeTab
3776 #
3777 # $1 => input text
3778 # $2 => optional tab width (default is $opt{tab_width})
3779 # $3 => leading spaces to strip off each line first (default is 0 aka none)
3780 # <= result with tabs expanded
3781 sub _DeTab {
3782     my $text = shift;
3783     my $ts = shift || $opt{tab_width};
3784     my $leadsp = shift || 0;
3785     my $spr = qr/^ {1,$leadsp}/ if $leadsp;
3786     pos($text) = 0;
3787     my $end = length($text);
3788     my $ans = "";
3789     while (pos($text) < $end) {
3790         my $line;
3791         if ($text =~ /\G(.*?\n)/gcs) {
3792             $line = $1;
3793         } else {
3794             $line = substr($text, pos($text));
3795             pos($text) = $end;
3796         }
3797         $line =~ s/$spr// if $leadsp;
3798         # From the Perl camel book section "Fluent Perl" but modified a bit
3799         $line =~ s/(.*?)(\t+)/$1 . ' ' x (length($2) * $ts - length($1) % $ts)/ges;
3800         $ans .= $line;
3801     }
3802     return $ans;
3803 }
3804
3805
3806 sub _PrefixURL {
3807 #
3808 # Add URL prefix if needed
3809 #
3810     my $url = shift;
3811     $url =~ s/^\s+//;
3812     $url =~ s/\s+$//;
3813     $url = "#" unless $url ne "";
3814
3815     return $url unless
3816         ref($opt{abs_prefix}) eq 'CODE' ||
3817         ref($opt{url_prefix}) eq 'CODE' ||
3818         ref($opt{img_prefix}) eq 'CODE' ;
3819     return $url if $url =~ m"^\002\003" || $url =~ m"^#" || $url =~ m,^//,;
3820     $url = &{$opt{abs_prefix}}($url) if $url =~ m,^/, && ref($opt{abs_prefix}) eq 'CODE';
3821     return $url if $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/ || $url =~ m,^//, ||
3822         ($opt{keepabs} && $url =~ m,^/,);
3823     my $cr = $opt{url_prefix};
3824     $cr = $opt{img_prefix}
3825         if ref($opt{img_prefix}) eq 'CODE' && $url =~ m"^[^#?]*\.(?:png|gif|jpe?g|svgz?)(?:[#?]|$)"i;
3826     return $url unless ref($cr) eq 'CODE';
3827     return "\2\3".&$cr(substr($url, 0, 1) eq '/' ? substr($url, 1) : $url);
3828 }
3829
3830
3831 BEGIN {
3832     $g_style_sheet = <<'STYLESHEET';
3833
3834 <style type="text/css">
3835 /* <![CDATA[ */
3836
3837 /* Markdown.pl fancy style sheet
3838 ** Copyright (C) 2017,2018,2019,2020,2021 Kyle J. McKay.
3839 ** All rights reserved.
3840 **
3841 ** Redistribution and use in source and binary forms, with or without
3842 ** modification, are permitted provided that the following conditions are met:
3843 **
3844 **   1. Redistributions of source code must retain the above copyright notice,
3845 **      this list of conditions and the following disclaimer.
3846 **
3847 **   2. Redistributions in binary form must reproduce the above copyright
3848 **      notice, this list of conditions and the following disclaimer in the
3849 **      documentation and/or other materials provided with the distribution.
3850 **
3851 **   3. Neither the name of the copyright holder nor the names of its
3852 **      contributors may be used to endorse or promote products derived from
3853 **      this software without specific prior written permission.
3854 **
3855 ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
3856 ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3857 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
3858 ** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
3859 ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
3860 ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
3861 ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
3862 ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
3863 ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
3864 ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
3865 ** POSSIBILITY OF SUCH DAMAGE.
3866 */
3867
3868 div.%(base)code-bt > pre, div.%(base)code > pre {
3869         margin: 0;
3870         padding: 0;
3871         overflow: auto;
3872 }
3873
3874 div.%(base)code-bt > pre > code, div.%(base)code > pre > code {
3875         display: inline-block;
3876         margin: 0;
3877         padding: 0.5em 0;
3878         border-top: thin dotted;
3879         border-bottom: thin dotted;
3880 }
3881
3882 table.%(base)table {
3883         margin-bottom: 0.5em;
3884 }
3885 table.%(base)table, table.%(base)table th, table.%(base)table td {
3886         border-collapse: collapse;
3887         border-spacing: 0;
3888         border: thin solid;
3889 }
3890 table.%(base)yaml-table {
3891         border-collapse: collapse;
3892 }
3893 table.%(base)yaml-table * {
3894         border: thin solid;
3895 }
3896 table.%(base)yaml-table th {
3897         text-align: center;
3898 }
3899 table.%(base)yaml-table th, table.%(base)yaml-table td {
3900         padding-left: 0.5ex;
3901         padding-right: 0.5ex;
3902 }
3903
3904 ol.%(base)ol {
3905         counter-reset: %(base)item;
3906 }
3907 ol.%(base)ol[start="0"] {
3908         counter-reset: %(base)item -1;
3909 }
3910 ol.%(base)ol[start="5"] {
3911         counter-reset: %(base)item 4;
3912 }
3913 ol.%(base)ol[start="10"] {
3914         counter-reset: %(base)item 9;
3915 }
3916 ol.%(base)ol > span.%(base)ol-incr {
3917         counter-increment: %(base)item;
3918 }
3919 ol.%(base)ol > span.%(base)ol-incr-2 {
3920         counter-increment: %(base)item 2;
3921 }
3922 ol.%(base)ol > span.%(base)ol-incr-5 {
3923         counter-increment: %(base)item 5;
3924 }
3925 ol.%(base)ol > span.%(base)ol-incr-10 {
3926         counter-increment: %(base)item 10;
3927 }
3928 ol.%(base)lc-greek, li.%(base)lc-greek {
3929         list-style-type: lower-greek;
3930 }
3931 ol.%(base)ol > li {
3932         counter-increment: %(base)item;
3933 }
3934 ol.%(base)ol > li.%(base)li,
3935 ol.%(base)ol > li.%(base)li-lc,
3936 ol.%(base)ol > li.%(base)li-lc-greek,
3937 ol.%(base)ol > li.%(base)li-uc {
3938         list-style-type: none;
3939         display: block;
3940 }
3941 ol.%(base)ol > li.%(base)li:before,
3942 ol.%(base)ol > li.%(base)li-lc:before,
3943 ol.%(base)ol > li.%(base)li-lc-greek:before,
3944 ol.%(base)ol > li.%(base)li-uc:before {
3945         position: absolute;
3946         text-align: right;
3947         white-space: nowrap;
3948         margin-left: -9ex;
3949         width: 9ex;
3950 }
3951 ol.%(base)ol > li.%(base)li[type="1"]:before {
3952         content: counter(%(base)item, decimal) ")\A0 \A0 ";
3953 }
3954 ol.%(base)ol > li.%(base)li-lc[type="i"]:before,
3955 ol.%(base)ol > li.%(base)li-lc[type="I"]:before {
3956         content: counter(%(base)item, lower-roman) ")\A0 \A0 ";
3957 }
3958 ol.%(base)ol > li.%(base)li-uc[type="I"]:before,
3959 ol.%(base)ol > li.%(base)li-uc[type="i"]:before {
3960         content: counter(%(base)item, upper-roman) ")\A0 \A0 ";
3961 }
3962 ol.%(base)ol > li.%(base)li-lc[type="a"]:before,
3963 ol.%(base)ol > li.%(base)li-lc[type="A"]:before {
3964         content: counter(%(base)item, lower-alpha) ")\A0 \A0 ";
3965 }
3966 ol.%(base)ol > li.%(base)li-lc-greek[type="a"]:before,
3967 ol.%(base)ol > li.%(base)li-lc-greek[type="A"]:before {
3968         content: counter(%(base)item, lower-greek) ")\A0 \A0 ";
3969 }
3970 ol.%(base)ol > li.%(base)li-uc[type="A"]:before,
3971 ol.%(base)ol > li.%(base)li-uc[type="a"]:before {
3972         content: counter(%(base)item, upper-alpha) ")\A0 \A0 ";
3973 }
3974
3975 li.%(base)checkbox-on,
3976 li.%(base)checkbox-off {
3977         list-style-type: none;
3978         display: block;
3979 }
3980 li.%(base)checkbox-on > span:first-child + span + span,
3981 li.%(base)checkbox-off > span:first-child + span + span {
3982         position: absolute;
3983         clip: rect(0,0,0,0);
3984 }
3985 li.%(base)checkbox-on > span:first-child,
3986 li.%(base)checkbox-off > span:first-child,
3987 li.%(base)checkbox-on > span:first-child + span,
3988 li.%(base)checkbox-off > span:first-child + span {
3989         display: block;
3990         position: absolute;
3991         margin-left: -3ex;
3992         width: 1em;
3993         height: 1em;
3994 }
3995 li.%(base)checkbox-on > span:first-child > span:first-child,
3996 li.%(base)checkbox-off > span:first-child > span:first-child {
3997         display: block;
3998         position: absolute;
3999         left: 0.75pt; top: 0.75pt; right: 0.75pt; bottom: 0.75pt;
4000 }
4001 li.%(base)checkbox-on > span:first-child > span:first-child:before,
4002 li.%(base)checkbox-off > span:first-child > span:first-child:before {
4003         display: inline-block;
4004         position: relative;
4005         right: 1pt;
4006         width: 100%;
4007         height: 100%;
4008         border: 1pt solid;
4009         content: "";
4010 }
4011 li.%(base)checkbox-on > span:first-child + span:before {
4012         position: relative;
4013         left: 2pt;
4014         bottom: 1pt;
4015         font-size: 125%;
4016         line-height: 80%;
4017         vertical-align: text-top;
4018         content: "\2713";
4019 }
4020
4021 /* ]]> */
4022 </style>
4023
4024 STYLESHEET
4025     $g_style_sheet =~ s/^\s+//g;
4026     $g_style_sheet =~ s/\s+$//g;
4027     $g_style_sheet .= "\n";
4028 }
4029
4030 1;
4031
4032 __DATA__
4033
4034 =head1 NAME
4035
4036 Markdown.pl - convert Markdown format text files to HTML
4037
4038 =head1 SYNOPSIS
4039
4040 B<Markdown.pl> [B<--help>] [B<--html4tags>] [B<--htmlroot>=I<prefix>]
4041     [B<--imageroot>=I<prefix>] [B<--version>] [B<--shortversion>]
4042     [B<--tabwidth>=I<num>] [B<--stylesheet>] [B<--stub>] [--]
4043     [I<file>...]
4044
4045  Options:
4046    -h                                   show short usage help
4047    --help                               show long detailed help
4048    --html4tags                          use <br> instead of <br />
4049    --deprecated                         allow <dir> and <menu> tags
4050    --sanitize                           sanitize tag attributes
4051    --no-sanitize                        do not sanitize tag attributes
4052    --validate-xml                       check if output is valid XML
4053    --validate-xml-internal              fast basic check if output is valid XML
4054    --no-validate-xml                    do not check output for valid XML
4055    --strip-comments                     remove XML comments from output
4056    --no-strip-comments                  do not remove XML comments (default)
4057    --tabwidth=num                       expand tabs to num instead of 8
4058    --auto-number                        automatically number h1-h6 headers
4059    -k | --keep-abs                      keep abspath URLs despite -r/-i
4060    -a prefix | --absroot=prefix         append abspath URLs to prefix
4061    -b prefix | --base=prefix            prepend prefix to fragment-only URLs
4062    -r prefix | --htmlroot=prefix        append relative non-img URLs to prefix
4063    -i prefix | --imageroot=prefix       append relative img URLs to prefix
4064    -w [wikipat] | --wiki[=wikipat]      activate wiki links using wikipat
4065    --yaml[=(enable|disable|strip|...)]  select YAML front matter processing
4066    -V | --version                       show version, authors, license
4067                                         and copyright
4068    -s | --shortversion                  show just the version number
4069    --raw | --raw-xml                    input contains only raw xhtml
4070    --raw-html                           input contains only raw html
4071    --stylesheet                         output the fancy style sheet
4072    --no-stylesheet                      do not output fancy style sheet
4073    --stub                               wrap output in stub document
4074                                         implies --stylesheet
4075    --                                   end options and treat next
4076                                         argument as file
4077
4078 =head1 DESCRIPTION
4079
4080 Markdown is a text-to-HTML filter; it translates an easy-to-read /
4081 easy-to-write structured text format into HTML. Markdown's text format
4082 is most similar to that of plain text email, and supports features such
4083 as headers, *emphasis*, code blocks, blockquotes, and links.
4084
4085 Markdown's syntax is designed not as a generic markup language, but
4086 specifically to serve as a front-end to (X)HTML. You can  use span-level
4087 HTML tags anywhere in a Markdown document, and you can use block level
4088 HTML tags (like <div> and <table> as well).
4089
4090 For more information about Markdown's syntax, see the F<basics.md>
4091 and F<syntax.md> files included with F<Markdown.pl>.
4092
4093 Input (auto-detected) may be either ISO-8859-1 or UTF-8.  Output is always
4094 converted to the UTF-8 character set.
4095
4096
4097 =head1 OPTIONS
4098
4099 Use "--" to end switch parsing. For example, to open a file named "-z", use:
4100
4101     Markdown.pl -- -z
4102
4103 =over
4104
4105
4106 =item B<--html4tags>
4107
4108 Use HTML 4 style for empty element tags, e.g.:
4109
4110     <br>
4111
4112 instead of Markdown's default XHTML style tags, e.g.:
4113
4114     <br />
4115
4116 This option is I<NOT compatible> with the B<--validate-xml> option
4117 and will produce an immediate error if both are given.
4118
4119
4120 =item B<--deprecated>
4121
4122 Both "<dir>" and "<menu>" are normally taken as literal text and the leading
4123 "<" will be automatically escaped.
4124
4125 If this option is used, they are recognized as valid tags and passed through
4126 without being escaped.
4127
4128 When dealing with program argument descriptions "<dir>" can be particularly
4129 problematic therefore use of this option is not recommended.
4130
4131 Other deprecated tags (such as "<font>" and "<center>" for example) continue
4132 to be recognized and passed through even without using this option.
4133
4134
4135 =item B<--sanitize>
4136
4137 Removes troublesome tag attributes from embedded tags.  Only a very strictly
4138 limited set of tag attributes will be permitted, other attributes will be
4139 silently discarded.  The set of allowed attributes varies by tag.
4140
4141 Splits empty minimized elements that are not one of the HTML allowed empty
4142 elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate begin
4143 and end tags.  For example, C<< <p/> >> or C<< <p /> >> will be split into
4144 C<< <p></p> >>.
4145
4146 Combines adjacent (whitespace separated only) opening and closing tags for
4147 the same HTML empty element into a single minimized tag.  For example,
4148 C<< <br></br> >> will become C<< <br /> >>.
4149
4150 Tags that require at least one attribute to be present to be meaningful
4151 (e.g. C<a>, C<area>, C<img>, C<map>) but have none will be treated as non-tags
4152 potentially creating unexpected errors.  For example, the sequence
4153 C<< <a>text here</a> >> will be sanitized to C<< &lt;a>text here</a> >> since
4154 an C<a> tag without any attributes is meaningless, but then the trailing
4155 close tag C<< </a> >> will become an error because it has no matching open
4156 C<< <a ...> >> tag.
4157
4158 The point of this check is not to cause undue frustration, but to allow
4159 such constructs to be used as text without the need for escaping since they
4160 are meaningless as tags.  For example, C<< <a><c><e> >> works just fine
4161 as plain text and so does C<< <A><C><E> >> because the
4162 C<< <a> >>/C<< <A> >> will be treated as a non-tag automatically.  In fact,
4163 they can even appear inside links too such as
4164 C<< <a href="#somewhere">Link to <a><c><e> article</a> >>.
4165
4166 Problematic C<&> characters are fixed up such as standalone C<&>s (or those not
4167 part of a valid entity reference) are turned into C<&amp;>.  Within attribute
4168 values, single and double quotes are turned into C<&> entity refs.
4169
4170 This is enabled by default.
4171
4172
4173 =item B<--no-sanitize>
4174
4175 Do not sanitize tag attributes.  This option does not allow any tags that would
4176 not be allowed without this option, but it does completely suppress the
4177 attribute sanitation process.   If this option is specified, no attributes will
4178 be removed from any tag (although C<img> and C<a> tags will still be affected
4179 by B<--imageroot>, B<--htmlroot>, B<--absroot> and/or B<--base> options).
4180 Use of this option is I<NOT RECOMMENDED>.
4181
4182
4183 =item B<--validate-xml>
4184
4185 Perform XML validation on the output before it's output and die if
4186 it fails validation.  This requires the C<XML::Simple> or C<XML::Parser>
4187 module be present (one is only required if this option is given).
4188
4189 Any errors are reported to STDERR and the exit status will be
4190 non-zero on XML validation failure.  Note that all line and column
4191 numbers in the error output refer to the entire output that would
4192 have been produced.  Re-run with B<--no-validate-xml> to see what's
4193 actually present at those line and column positions.
4194
4195 If the B<--stub> option has also been given, then the entire output is
4196 validated as-is.  Without the B<--stub> option, the output will be wrapped
4197 in C<< <div>...</div> >> for validation purposes but that extra "div" added
4198 for validation will not be added to the final output.
4199
4200 This option is I<NOT enabled by default>.
4201
4202 This option is I<NOT compatible> with the B<--html4tags> option and will
4203 produce an immediate error if both are given.
4204
4205
4206 =item B<--validate-xml-internal>
4207
4208 Perform XML validation on the output before it's output and die if
4209 it fails validation.  This uses a simple internal consistency checker
4210 that finds unmatched and mismatched open/close tags.
4211
4212 Non-empty elements that in HTML have optional closing tags (C<colgroup>
4213 C<dd> C<dt> C<li> C<p> C<tbody> C<td> C<tfoot> C<th> C<thead> C<tr>)
4214 will automatically have any omitted end tags inserted during the
4215 `--validate-xml-internal` process.
4216
4217 Any errors are reported to STDERR and the exit status will be
4218 non-zero on XML validation failure.  Note that all line and column
4219 numbers in the error output refer to the entire output that would
4220 have been produced before sanitization without any B<--stub> or
4221 B<--stylesheet> options.  Re-run with B<--no-sanitize> and
4222 B<--no-validate-xml> and I<without> any B<--stub> or B<--stylesheet>
4223 options to see what's actually present at those line and column
4224 positions.
4225
4226 This option validates the output I<prior to> adding any requested
4227 B<--stub> or B<--stylesheet>.  As the built-in stub and stylesheet
4228 have already been validated that speeds things up.  The output is
4229 I<NOT> wrapped (in a C<< <div>...</div> >>) for validation as that's
4230 not required for the internal checker.
4231
4232 This option is I<IS enabled by default> unless B<--no-sanitize> is
4233 active.
4234
4235 This option is I<IS compatible> with the B<--html4tags> option.
4236
4237 This option requires the B<--sanitize> option and will produce an
4238 immediate error if both B<--no-sanitize> and B<--validate-xml-internal>
4239 are given.
4240
4241 Note that B<--validate-xml-internal> is I<MUCH faster> than
4242 B<--validate-xml> and I<does NOT> require any extra XML modules to
4243 be present.
4244
4245
4246 =item B<--no-validate-xml>
4247
4248 Do not perform XML validation on the output.  Markdown.pl itself will
4249 normally generate valid XML sequences (unless B<--html4tags> has been
4250 used).  However, any raw tags in the input (that are on the "approved"
4251 list), could potentially result in invalid XML output (i.e. mismatched
4252 start and end tags, missing start or end tag etc.).
4253
4254 Markdown.pl will I<NOT check> for these issues itself.  But with
4255 the B<--validate-xml> option will use C<XML::Simple> or C<XML::Parser>
4256 to do so.
4257
4258 Note that B<--validate-xml-internal> is the default option unless
4259 B<--no-sanitize> is used in which case B<--no-validate-xml> is the
4260 default option.
4261
4262
4263 =item B<--strip-comments>
4264
4265 Strip XML comments from the output.  Any XML comments encountered will
4266 be omitted from the output if this option is given.
4267
4268 This option requires the B<--sanitize> option to be used (which is
4269 the default).
4270
4271 However, note that the XML standard section 2.5 specifically prohibits
4272 a C<--> sequence within an XML comment (i.e. C<--> cannot occur after
4273 the comment start tag C<< <!-- >> unless it is immediately followed
4274 by C<< > >> which makes it the comment end tag C<< --> >>).
4275
4276 In other words, S<C<< <!-- --> >>>, S<C<< <!-- - --> >>>, S<C<< <!----> >>>,
4277 and S<C<< <!--- --> >>> are all valid XML comments, but S<C<< <!-----> >>>
4278 and S<C<< <!-- ---> >>> are not!
4279
4280 As part of the "sanitation" process (triggered by the B<--sanitize>
4281 option), any invalid tags have their leading C<< < >> escaped (to
4282 C<< &#lt; >>) thus making them ordinary text and this I<includes>
4283 invalid XML comments.
4284
4285 What this means is that the B<--strip-comments> option I<will not> remove
4286 invalid XML comments (such as S<C<< <!-----> >>>)!
4287
4288
4289 =item B<--no-strip-comments>
4290
4291 Do not strip XML comments from the output.  This is the default.
4292
4293
4294 =item B<--tabwidth>=I<num>
4295
4296 Expand tabs to I<num> character wide tab stop positions instead of the default
4297 8.  Don't use this; physical tabs should always be expanded to 8-character
4298 positions.  This option does I<not> affect the number of spaces needed to
4299 start a new "indent level".  That will always be 4 no matter what value is
4300 used (or implied by default) with this option.  Also note that tabs inside
4301 backticks-delimited code blocks will always be expanded to 8-character tab
4302 stop positions no matter what value is used for this option.
4303
4304 The value must be S<2 <= I<num> <= 32>.
4305
4306
4307 =item B<--auto-number>
4308
4309 Automatically number all h1-h6 headings generated from Markdown markup.
4310 Explicit C<< <h1> >> ... C<< <h6> >> tag content remains unmolested.
4311
4312 If this option is given, any YAML C<header_enum> setting will be ignored.
4313
4314
4315 =item B<-k>, B<--keep-abs>
4316
4317 Normally any absolute path URLs (i.e. URLs without a scheme starting
4318 with "/" but not "//") are subject to modification by any
4319 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> option.
4320
4321 If the B<-a>/B<--absroot> option is used and it transforms these
4322 absolute path URLs into a full absolute URL (i.e. starts with a
4323 scheme or "//") then any subsequent B<-r>/B<--htmlroot> or
4324 B<-i>/B<--imageroot> processing will be skipped because the URL is
4325 no longer relative.
4326
4327 If the B<--keep-abs> option is given, then (after applying any
4328 B<-a>/B<--absroot> option if present) absolute path URLs will be
4329 kept as-is and will not be processed further by any B<-r>/B<--htmlroot>
4330 or B<-i>/B<--imageroot> option.
4331
4332 Note that if the B<-a>/B<--absroot> option transforms an absolute
4333 path URL into a relative PATH URL it I<will> be subject to subsequent
4334 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> processing regardless
4335 of the B<-k>/B<--keep-abs> option.
4336
4337
4338 =item B<-a> I<prefix>, B<--absroot>=I<prefix>
4339
4340 Any absolute path URLs (i.e. URLs without a scheme starting with "/" but not
4341 "//") have I<prefix> prepended which prevents them from being acted upon by the
4342 B<--htmlroot> and/or B<--imageroot> options provided the result is a full
4343 absolute URL.  The default is to prepend nothing and leave them as absolute
4344 path URLs which will allow them to be processed by any B<--htmlroot> and/or
4345 B<--imageroot> options.
4346
4347 This option can be helpful when documents are being formatted for display on a
4348 different system and the absolute path URLs need to be "fixed up".
4349
4350
4351 =item B<-b> I<prefix>, B<--base>=I<prefix>
4352
4353 Any fragment-only URLs have I<prefix> prepended.  The default is to prepend
4354 nothing and leave them as bare fragment URLs.  Use of this option may be
4355 necessary when embedding the output of Markdown.pl into a document that makes
4356 use of the C<< <base> >> tag in order for intra-document fragment URL links to
4357 work properly in such a document.
4358
4359
4360 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
4361
4362 Any non-absolute URLs have I<prefix> prepended.
4363
4364
4365 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
4366
4367 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
4368 if any) but only if they end in an image suffix.
4369
4370
4371 =item B<-w> [I<wikipat>], B<--wiki>[=I<wikipat>]
4372
4373 Activate wiki links.  Any link enclosed in double brackets (e.g. "[[link]]") is
4374 considered a wiki link.  By default only absolute URL and fragment links are
4375 allowed in the "wiki link style" format.  Any other double-bracketed strings
4376 are left unmolested.
4377
4378 If this option is given, all other wiki links are enabled as well.  Any
4379 non-absolute URL or fragment links will be transformed into a link using
4380 I<wikipat> where the default I<wikipat> if none is given is C<%{s(:md)}.html>.
4381
4382 If the given I<wikipat> does not contain a C<%{...}> placeholder sequence
4383 then it will automatically have C<%{s(:md)}.html> suffixed to it.
4384
4385 The C<...> part of the C<%{...}> sequence specifies zero or more
4386 case-insensitive single-letter options with the following effects:
4387
4388 =over
4389
4390 =item B<b>
4391
4392 Retain blanks (aka spaces) in the output.  They will become C<%20>
4393 in the final URL.  Because spaces are always trimmed before processing
4394 wiki links, runs of multiple spaces will be collapsed into a single
4395 space and any leading or trailing spaces will be removed.
4396
4397 =item B<d>
4398
4399 Convert spaces to dashes (ASCII 0x2D) instead of underscore (ASCII
4400 0x5F).  Note that if this option is given then runs of multiple
4401 dashes will be converted to a single dash I<instead> but runs of
4402 multiple underscores will be left untouched.
4403
4404 =item B<f>
4405
4406 Flatten the resulting name by replacing forward slashes (ASCII 0x2F)
4407 as well.  They will be converted to underscores unless the C<d>
4408 option is given (in which case they will be converted to dashes).
4409 This conversion takes place before applying the runs-of-multiple
4410 reduction.  This option is incompatible with the B<%> option.
4411
4412 =item B<%>
4413
4414 Flatten the resulting name by replacing runs of one or more forward
4415 slashes (ASCII 0x2F) with C<%2F>.  Note that when encoded into a
4416 URL the C<%2F> actually becomes C<%252F>.  This option is incompatible
4417 with the B<f> option.
4418
4419 =item B<l>
4420
4421 Convert link target (excluding any query string and/or fragment) to lowercase.
4422 Takes precedence over any C<u> option, but specifically excludes C<%>-escapes
4423 which are always UPPERCASE hexadecimal.
4424
4425 =item B<r>
4426
4427 Leave raw UTF-8 characters in the result.  Normally anything not allowed
4428 directly in a URL ends up URL-encoded.  With this option, raw valid UTF-8
4429 sequences will be left untouched.  Use with care.
4430
4431 =item B<s> or B<s(>I<< <ext> >>[B<,>I<< <ext> >>]...B<)>
4432
4433 After (temporarily) removing any query string and/or fragment, strip any final
4434 "dot" suffix so long as it occurs after the last slash (if any slash was
4435 present before applying the C<f> option).  The "dot" (ASCII 0x2E) and all
4436 following characters (if any) are removed.  If the optional C<< (<ext>,...) >>
4437 part is present then only strip the extension if it consists of a "dot"
4438 followed by one of the case-insensitive I<< <ext> >> values.  As a special
4439 case, using the value C<:md> for one of the I<< <ext> >> values causes that
4440 value to be expanded to all known markdown extensions.
4441
4442 When processing wiki image links, this option is ignored.
4443
4444 =item B<u>
4445
4446 Convert link target (excluding any query string and/or fragment) to UPPERCASE.
4447
4448 =item B<v>
4449
4450 Leave runs-of-multiple characters alone (aka "verbatim").  Does not affect
4451 any of the other options except by eliminating the runs-of-multple reduction
4452 step.  Also does I<not> inhibit the initial whitespace trimming.
4453
4454 Does not affect the runs-of-multiple "/" replacement performed by the B<%>
4455 option.
4456
4457 =back
4458
4459 The URL target of the wiki link is created by first trimming whitespace
4460 (starting and ending whitespace is removed and all other runs of consecutive
4461 whitespace are replaced with a single space) from the wiki link target,
4462 removing (temporarily) any query string and/or fragment, if no options are
4463 present, spaces are converted to underscores (C<_>) and runs of multiple
4464 consecutive underscores are replaced with a single underscore (ASCII 0x5F).
4465 Finally, the I<wikipat> string gets its first placeholder (the C<%{...}>
4466 sequence) replaced with this computed value and the original query string
4467 and/or fragment is re-appended (if any were originally present) and
4468 URL-encoding is applied as needed to produce the actual final target URL.
4469
4470 Note that when processing wiki image links, no extension stripping ever takes
4471 place (i.e. the "s" option is ignored) and anything after the placeholder (the
4472 C<%{...}> sequence) in the pattern is omitted from the result.
4473
4474 See above option descriptions for possible available modifications.
4475
4476 One of the commonly used hosting platforms does something substantially similar
4477 to using C<%{dfv}> as the placeholder.
4478
4479 One of the commonly used wiki platforms does something similar to using C<%{%}>
4480 as the placeholder.
4481
4482
4483 =item B<--yaml>[=I<yamlmode>]
4484
4485 Select YAML front matter processing.  The optional I<yamlmode> value
4486 must be one of the following:
4487
4488 =over
4489
4490 =item B<enable>
4491
4492 Recognize any YAML front matter and apply any options specified
4493 therein.  If any unrecognized options are present, the options will
4494 also be shown in the formatted output.
4495
4496 This is the default I<yamlmode> if omitted.
4497
4498 =item B<disable>
4499
4500 No YAML front matter processing at all takes place.  If YAML front
4501 matter is present, it will be treated as regular non-YAML markup
4502 text to be processed.
4503
4504 =item B<strip>
4505
4506 If YAML front matter is present, it will be stripped and completely
4507 ignored before beginning to process the rest of the input.
4508
4509 In this mode, any options in the YAML front matter that would have
4510 otherwise been recognized will I<not have any effect!>
4511
4512 =item B<show>
4513
4514 If YAML front matter is present and contains anything other than
4515 comments, the non-comments parts will be shown in the formatted
4516 output.
4517
4518 In this mode, any options in the YAML front matter that would have
4519 otherwise been recognized will I<not have any effect!>
4520
4521 This is a show-only mode.
4522
4523 =item B<reveal>
4524
4525 This mode works just like the B<enable> mode except that if the
4526 YAML front matter contains anything other than comments, then
4527 I<all> of the non-comments parts will be shown in the formatted
4528 output.
4529
4530 In this mode, any recognized options in the YAML front matter I<are>
4531 processed the same way they would be in the B<enable> mode except
4532 that any option to suppress the B<reveal> mode is ignored.
4533
4534 =item B<conceal>
4535
4536 This mode works just like the B<enable> mode except that no options
4537 are ever shown in the formatted output regardless of whether or not
4538 there are any unrecognized options present.
4539
4540 In this mode, any recognized options in the YAML front matter I<are>
4541 processed the same way they would be in the B<enable> mode except
4542 that any option to suppress the B<conceal> mode is ignored.
4543
4544 =item B<unknown>
4545
4546 This mode works just like the B<show> mode if any unrecognized
4547 YAML front matter options are present.  Otherwise it works like
4548 the B<strip> mode.
4549
4550 In this mode, any options in the YAML front matter that would have
4551 otherwise been recognized will I<not have any effect!>
4552
4553 =back
4554
4555 If B<--raw>, B<--raw-xml> or B<--raw-html> has been specified then
4556 the default if no B<--yaml> option has been given is B<--yaml=disable>.
4557
4558 Otherwise the default if no B<--yaml> option has been given is
4559 B<--yaml=enable>.
4560
4561 Note that only a limited subset of YAML is recognized.  Specifically
4562 only comments, and top-level single-line S<C<key: value>> items
4563 where key must be plain (i.e. non-quoted), start with a letter or
4564 underscore and contain only letters, underscores, hyphens (C<->),
4565 periods (C<.>) and digits.  Keys are case-insensitive (i.e. converted
4566 to lowercase).  As with YAML, at least one whitespace is required
4567 between the ":" and the value (unless it's the empty value).
4568
4569 Values may be either plain or double-quoted (single-quoted is not
4570 recognized).  The double-quoted style may use C-style character
4571 escape codes but may not extend past the end of the line.
4572
4573 For YAML front matter to be recognized, the very first line of the
4574 document must be exactly three hyphens (C<--->).  The YAML terminates
4575 when a line of three hyphens (C<--->) or a line of three periods
4576 (C<...>) or the end of the file is encountered.  Of course the YAML
4577 mode must also be something I<other> than B<--yaml=disable>.
4578
4579
4580 =item B<-V>, B<--version>
4581
4582 Display Markdown's version number and copyright information.
4583
4584
4585 =item B<-s>, B<--shortversion>
4586
4587 Display the short-form version number.
4588
4589
4590 =item B<--raw>, B<--raw-xml>
4591
4592 Input contains only raw XHTML.  All options other than
4593 B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
4594 B<--strip-comments>, B<--validate-xml> and B<--validate-xml-internal>
4595 (and their B<--no-...> variants) are ignored.
4596
4597 With this option, arbitrary XHTML input can be passed through
4598 the sanitizer and/or validator.  If sanitation is requested (the
4599 default), input must only contain the contents of the "<body>"
4600 section (i.e. no "<head>" or "<html>").  Output I<will> be converted
4601 to UTF-8 regardless of the input encoding.  All line endings will
4602 be normalized to C<\n> and input encodings other than UTF-8 or
4603 ISO-8859-1 or US-ASCII will end up mangled.
4604
4605 Remember that any B<--stub> and/or B<--stylesheet> options are
4606 I<completely ignored> when B<--raw> is given.
4607
4608
4609 =item B<--raw-html>
4610
4611 Input contains only raw HTML.  All options other than
4612 B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
4613 B<--strip-comments>, and B<--validate-xml-internal>
4614 (and their B<--no-...> variants) are ignored.
4615
4616 Requires the (possibly implicit) B<--validate-xml-internal> option.
4617
4618 Works just like B<--raw-xml> except that HTML auto closing and
4619 optional closing tag semantics are activated during the validation
4620 causing missing closing tags to be inserted where required by the
4621 standard.  Non-raw mode always enables these semantics.
4622
4623 This will transform HTML into valid XHTML or fail with an error message.
4624
4625 Unfortunately, it will also fail to accept some documents that
4626 the plain B<--raw-xml> option will.
4627
4628 For example, this document:
4629
4630  <dt><li>a</li></dt>
4631
4632 Will be rejected because upon encountering the C<< <li> >> open
4633 tag a closing C<< </dt> >> will automatically be inserted resulting
4634 in this document:
4635
4636  <dt></dt><li>a</li></dt>
4637
4638 Which, of course, no longer validates.  Since C<li> blocks cannot
4639 directly be nested within C<dt> blocks (according to the standard),
4640 the input document is not strictly correct.
4641
4642 Remember that any B<--stub> and/or B<--stylesheet> options are
4643 I<completely ignored> when B<--raw-html> is given.
4644
4645
4646 =item B<--stylesheet>
4647
4648 Include the fancy style sheet at the beginning of the output (or in the
4649 C<head> section with B<--stub>).  This style sheet makes fancy checkboxes
4650 and makes a right parenthesis C<)> show instead of a C<.> for ordered lists
4651 that use them.  Without it things will still look fine except that the
4652 fancy stuff won't be there.
4653
4654 Use this option with no other arguments and redirect standard input to
4655 /dev/null to get just the style sheet and nothing else.
4656
4657
4658 =item B<--no-stylesheet>
4659
4660 Overrides a previous B<--stylesheet> and disables implicit inclusion
4661 of the style sheet by the B<--stub> option.
4662
4663
4664 =item B<--stub>
4665
4666 Wrap the output in a full document stub (i.e. has C<html>, C<head> and C<body>
4667 tags).  The style sheet I<will> be included in the C<head> section unless the
4668 B<--no-stylesheet> option is also used.
4669
4670 The C<< <title> >> value for a document produced with the B<--stub> option
4671 comes from the first markdown markup C<h1> that's generated unless YAML
4672 processing has been enabled (the default) and a C<title> YAML value has
4673 been set in which case that always takes precedence.
4674
4675
4676 =item B<-h>, B<--help>
4677
4678 Display Markdown's help.  With B<--help> full help is shown, with B<-h> only
4679 the usage and options are shown.
4680
4681
4682 =back
4683
4684
4685 =head1 PERL MODULE
4686
4687 Markdown can be used as a Perl module and can be "use"d like so:
4688
4689  use Markdown qw(...);
4690
4691 Or like so:
4692
4693  BEGIN {require "Markdown.pl" && Markdown->import(qw(...))}
4694
4695 where the C<...> part is the list of desired imports.
4696
4697 The Markdown module does not export any functions by default.
4698
4699 The C<Markdown.pm> file is a symbolic link to C<Markdown.pl>.
4700
4701 =head2 Markdown module functions
4702
4703 Any of these functions may be imported, but none of them
4704 are imported by default.
4705
4706 =over
4707
4708
4709 =item * $result = Markdown::Markdown($string[, options...])
4710
4711 Converts Markdown-format C<$string> to UTF-8 encoded XHTML and
4712 returns it.
4713
4714 The C<options...> may be either a single HASH ref or one or more
4715 pairs of C<< key => value >>.
4716
4717 See the comments for the C<_SanitizeOpts> function for a list of
4718 possible option keys.
4719
4720
4721 =item * $result = Markdown::ProcessRaw($string[, options...])
4722
4723 Converts raw XHTML in C<$string> to XHTML and returns it.
4724
4725 The C<options...> may be either a single HASH ref or one or more
4726 pairs of C<< key => value >>.
4727
4728 See the comments for the C<_SanitizeOpts> function for a list of
4729 possible option keys.
4730
4731 This function provides the ability to apply the internal XML
4732 validation and sanitation functionality to arbitrary XHTML without
4733 performing any of the Markdown format interpretation.
4734
4735
4736 =item * $stylesheet = Markdown::GenerateStyleSheet([$prefix])
4737
4738 Returns an XHTML style sheet that supports the fancy Markdown styles
4739 such as checkboxes and right parenthesis lists.
4740
4741 All of the style class names have C<$prefix> prepended.
4742
4743 If C<$prefix> is omitted or C<undef> then the default S<"_markdown-">
4744 prefix will be used which is the same default prefix that the
4745 C<Markdown> function uses.
4746
4747 The returned string value consists of a C<< <style type="text/css"> >>
4748 tag, the contents of the style sheet and ends with a C<< </style> >> tag.
4749
4750
4751 =item * Markdown::SetWikiOpts($hashref, $wikioption)
4752
4753 The value of C<$wikioption> should be the value of the C<wikipat> value
4754 from the B<--wiki> option.  Use the empty string S<""> to enable wiki
4755 links using the defaults and use C<undef> to disable wiki links.
4756
4757 The C<wikipat> and C<wikiopt> keys in C<$hashref> will both be
4758 affected by this call and they should be passed in to the Markdown
4759 function as options to enable processing of wiki links.
4760
4761 The simplest way to do this is simply to pass a HASH ref as the
4762 second argument to the Markdown function after having used this
4763 function on it to properly set the C<wikipat> and C<wikiopt>
4764 keys and values.
4765
4766
4767 =back
4768
4769
4770 =head2 Example
4771
4772 This rudimentary example approximates running
4773 S<C<Markdown.pl --stub --wiki>>
4774 on the input (files if given, standard input if not).
4775
4776  use Markdown qw(Markdown SetWikiOpts GenerateStyleSheet escapeXML);
4777
4778  my $string;
4779  {local $/; $string = <>;}
4780  my %opts = ( h1 => "default title" );
4781  SetWikiOpts(\%opts, ""); # enable default --wiki processing
4782  my $xhtml = Markdown($string, \%opts);
4783  print "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n",
4784    "<head>\n<title>".escapeXML($opts{h1})."</title>\n",
4785    GenerateStyleSheet(),"</head>\n",
4786    "<body style=\"text-align:center\">\n",
4787    "<div style=\"".
4788    "display:inline-block;text-align:left;max-width:42pc\">\n",
4789    $xhtml, "</div></body></html>\n";
4790
4791
4792 =head1 VERSION HISTORY
4793
4794 Z<> See the F<README> file for detailed release notes for this version.
4795
4796 =over
4797
4798 =item Z<> 1.1.10 - 08 Jul 2020
4799
4800 =item Z<> 1.1.9 - 15 Dec 2019
4801
4802 =item Z<> 1.1.8 - 22 Nov 2019
4803
4804 =item Z<> 1.1.7 - 14 Feb 2018
4805
4806 =item Z<> 1.1.6 - 03 Jan 2018
4807
4808 =item Z<> 1.1.5 - 07 Dec 2017
4809
4810 =item Z<> 1.1.4 - 24 Jun 2017
4811
4812 =item Z<> 1.1.3 - 13 Feb 2017
4813
4814 =item Z<> 1.1.2 - 19 Jan 2017
4815
4816 =item Z<> 1.1.1 - 12 Jan 2017
4817
4818 =item Z<> 1.1.0 - 11 Jan 2017
4819
4820 =item Z<> 1.0.4 - 05 Jun 2016
4821
4822 =item Z<> 1.0.3 - 06 Sep 2015
4823
4824 =item Z<> 1.0.2 - 03 Sep 2015
4825
4826 =item Z<> 1.0.1 - 14 Dec 2004
4827
4828 =item Z<> 1.0.0 - 28 Aug 2004
4829
4830 =back
4831
4832 =head1 AUTHORS
4833
4834 =over
4835
4836 =item John Gruber
4837
4838 =item L<https://daringfireball.net>
4839
4840 =item L<https://daringfireball.net/projects/markdown/>
4841
4842 =item E<160>
4843
4844 =back
4845
4846 =over
4847
4848 =item PHP port and other contributions by Michel Fortin
4849
4850 =item L<https://michelf.ca>
4851
4852 =item E<160>
4853
4854 =back
4855
4856 =over
4857
4858 =item Additional enhancements and tweaks by Kyle J. McKay
4859
4860 =item mackyle<at>gmail.com
4861
4862 =back
4863
4864 =head1 COPYRIGHT AND LICENSE
4865
4866 =over
4867
4868 =item Copyright (C) 2003-2004 John Gruber
4869
4870 =item Copyright (C) 2015-2021 Kyle J. McKay
4871
4872 =item All rights reserved.
4873
4874 =back
4875
4876 Redistribution and use in source and binary forms, with or without
4877 modification, are permitted provided that the following conditions are
4878 met:
4879
4880 =over
4881
4882 =item *
4883
4884 Redistributions of source code must retain the above copyright
4885 notice, this list of conditions and the following disclaimer.
4886
4887 =item *
4888
4889 Redistributions in binary form must reproduce the above copyright
4890 notice, this list of conditions and the following disclaimer in the
4891 documentation and/or other materials provided with the distribution.
4892
4893 =item *
4894
4895 Neither the name "Markdown" nor the names of its contributors may
4896 be used to endorse or promote products derived from this software
4897 without specific prior written permission.
4898
4899 =back
4900
4901 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
4902 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
4903 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
4904 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
4905 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
4906 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
4907 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
4908 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
4909 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
4910 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
4911 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4912
4913 =cut