Markdown.pl

   1 #!/usr/bin/env perl
   2
   3 #
   4 # Markdown -- A text-to-HTML conversion tool for web writers
   5 #
   6 # Copyright (C) 2004 John Gruber
   7 # Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
   8 # All rights reserved.
   9 # License is Modified BSD (aka 3-clause BSD) License\n";
  10 # See LICENSE file (or <https://opensource.org/licenses/BSD-3-Clause>)
  11 #
  12
  13 package Markdown;
  14
  15 use 5.008;
  16 use strict;
  17 use warnings;
  18
  19 use Encode;
  20
  21 use vars qw($COPYRIGHT $DATE $VERSION @ISA @EXPORT_OK);
  22
  23 BEGIN {*COPYRIGHT =
  24 \"Copyright (C) 2004 John Gruber
  25 Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
  26 All rights reserved.
  27 ";
  28 *VERSION = \"1.1.15";
  29 *DATE = \"2021-08-15";
  30 }
  31
  32 use Exporter ();
  33 use Digest::MD5 qw(md5 md5_hex);
  34 use File::Basename qw(basename);
  35 use Scalar::Util qw(refaddr looks_like_number);
  36 my ($hasxml, $hasxml_err); BEGIN { ($hasxml, $hasxml_err) = (0, "") }
  37 my ($hasxmlp, $hasxmlp_err); BEGIN { ($hasxmlp, $hasxmlp_err) = (0, "") }
  38 BEGIN {
  39 @ISA = qw(Exporter);
  40 @EXPORT_OK = qw(Markdown ProcessRaw GenerateStyleSheet SetWikiOpts SplitURL
  41                 escapeXML unescapeXML ResolveFragment ConvertNamedCharacterEntities);
  42 $INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'};
  43 }
  44
  45 close(DATA) if fileno(DATA);
  46 exit(&_main(@ARGV)||0) unless caller;
  47
  48 sub fauxdie($) {
  49     my $msg = join(" ", @_);
  50     $msg =~ s/\s+$//os;
  51     printf STDERR "%s: fatal: %s\n", basename($0), $msg;
  52     exit 1;
  53 }
  54
  55 my $encoder;
  56 BEGIN {
  57         $encoder = Encode::find_encoding('Windows-1252') ||
  58                    Encode::find_encoding('ISO-8859-1') or
  59                    die "failed to load ISO-8859-1 encoder\n";
  60 }
  61
  62 #
  63 # Global default settings:
  64 #
  65 my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width,
  66     $g_start_p, $g_close_p);
  67 BEGIN {
  68     $g_style_prefix = "_markdown-";     # Prefix for markdown css class styles
  69     $g_empty_element_suffix = " />";    # Change to ">" for HTML output
  70     $g_indent_width = 4;                # Number of spaces considered new level
  71     $g_start_p = "<p>";                 # _FormParagraphs open paragraph tag
  72     $g_close_p = "</p>";                # _FormParagraphs close paragraph tag
  73 }
  74
  75
  76 #
  77 # Globals:
  78 #
  79
  80 # Style sheet template
  81 my $g_style_sheet;
  82
  83 # Permanent block id table
  84 my %g_perm_block_ids;
  85
  86 # Global hashes, used by various utility routines
  87 my %g_urls;
  88 my %g_titles;
  89 my %g_anchors;
  90 my %g_anchors_id;
  91 my %g_block_ids;
  92 my %g_code_block_ids;
  93 my %g_html_blocks;
  94 my %g_code_blocks;
  95 my @g_xml_comments;
  96 my %opt;
  97 my @autonum;
  98
  99 # Return a "block id" to use to identify the block that does not contain
 100 # any characters that could be misinterpreted by the rest of the code
 101 # Originally this used md5_hex but that's unnecessarily slow
 102 # Instead just use the refaddr of the scalar ref of the entry for that
 103 # key in either the global or, if the optional second argument is true,
 104 # permanent table.  To avoid the result being confused with anything
 105 # else, it's prefixed with a control character and suffixed with another
 106 # both of which are not allowed by the XML standard or Unicode.
 107 sub block_id {
 108     $_[1] or return "\5".refaddr(\$g_block_ids{$_[0]})."\6";
 109     $_[1] == 1 and return "\2".refaddr(\$g_perm_block_ids{$_[0]})."\3";
 110     $_[1] == 2 and return "\25".refaddr(\$g_code_block_ids{$_[0]})."\26";
 111     die "programmer error: bad block_id type $_[1]";
 112 }
 113
 114 # Regex to match balanced [brackets]. See Friedl's
 115 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
 116 my $g_nested_brackets;
 117 BEGIN {
 118     $g_nested_brackets = qr{
 119     (?>                                 # Atomic matching
 120         [^\[\]]+                        # Anything other than brackets
 121      |
 122         \[
 123             (??{ $g_nested_brackets })  # Recursive set of nested brackets
 124         \]
 125     )*
 126     }ox
 127 }
 128
 129 # Regex to match balanced (parentheses)
 130 my $g_nested_parens;
 131 BEGIN {
 132     $g_nested_parens = qr{
 133     (?>                                 # Atomic matching
 134         [^\(\)]+                        # Anything other than parentheses
 135      |
 136         \(
 137             (??{ $g_nested_parens })    # Recursive set of nested parentheses
 138         \)
 139     )*
 140     }ox
 141 }
 142
 143 # Table of hash values for escaped characters:
 144 my %g_escape_table;
 145 BEGIN {
 146     $g_escape_table{""} = "\2\3";
 147     foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:<") {
 148         $g_escape_table{$char} = block_id($char,1);
 149     }
 150 }
 151
 152 # Used to track when we're inside an ordered or unordered list
 153 # (see _ProcessListItems() for details):
 154 my $g_list_level;
 155 BEGIN {
 156     $g_list_level = 0;
 157 }
 158
 159 # Entity conversion table
 160 my %named_character_entity;
 161 BEGIN { %named_character_entity = (
 162     'Aacute'   =>   '193',
 163     'aacute'   =>   '225',
 164     'Acirc'    =>   '194',
 165     'acirc'    =>   '226',
 166     'acute'    =>   '180',
 167     'AElig'    =>   '198',
 168     'aelig'    =>   '230',
 169     'Agrave'   =>   '192',
 170     'agrave'   =>   '224',
 171     'alefsym'  => 'x2135',
 172     'Alpha'    =>   '913',
 173     'alpha'    =>   '945',
 174     'and'      => 'x2227',
 175     'ang'      => 'x2220',
 176     'apos'     =>    '39',
 177     'Aring'    =>   '197',
 178     'aring'    =>   '229',
 179     'asymp'    => 'x2248',
 180     'Atilde'   =>   '195',
 181     'atilde'   =>   '227',
 182     'Auml'     =>   '196',
 183     'auml'     =>   '228',
 184     'bdquo'    => 'x201e',
 185     'Beta'     =>   '914',
 186     'beta'     =>   '946',
 187     'brvbar'   =>   '166',
 188     'bull'     => 'x2022',
 189     'cap'      => 'x2229',
 190     'Ccedil'   =>   '199',
 191     'ccedil'   =>   '231',
 192     'cedil'    =>   '184',
 193     'cent'     =>   '162',
 194     'Chi'      =>   '935',
 195     'chi'      =>   '967',
 196     'circ'     =>   '710',
 197     'clubs'    => 'x2663',
 198     'cong'     => 'x2245',
 199     'copy'     =>   '169',
 200     'crarr'    => 'x21b5',
 201     'cup'      => 'x222a',
 202     'curren'   =>   '164',
 203     'Dagger'   => 'x2021',
 204     'dagger'   => 'x2020',
 205     'dArr'     => 'x21d3',
 206     'darr'     => 'x2193',
 207     'deg'      =>   '176',
 208     'Delta'    =>   '916',
 209     'delta'    =>   '948',
 210     'diams'    => 'x2666',
 211     'divide'   =>   '247',
 212     'Eacute'   =>   '201',
 213     'eacute'   =>   '233',
 214     'Ecirc'    =>   '202',
 215     'ecirc'    =>   '234',
 216     'Egrave'   =>   '200',
 217     'egrave'   =>   '232',
 218     'empty'    => 'x2205',
 219     'emsp'     => 'x2003',
 220     'ensp'     => 'x2002',
 221     'Epsilon'  =>   '917',
 222     'epsilon'  =>   '949',
 223     'equiv'    => 'x2261',
 224     'Eta'      =>   '919',
 225     'eta'      =>   '951',
 226     'ETH'      =>   '208',
 227     'eth'      =>   '240',
 228     'Euml'     =>   '203',
 229     'euml'     =>   '235',
 230     'euro'     => 'x20ac',
 231     'exist'    => 'x2203',
 232     'fnof'     =>   '402',
 233     'forall'   => 'x2200',
 234     'frac12'   =>   '189',
 235     'frac14'   =>   '188',
 236     'frac34'   =>   '190',
 237     'frasl'    => 'x2044',
 238     'Gamma'    =>   '915',
 239     'gamma'    =>   '947',
 240     'ge'       => 'x2265',
 241     'hArr'     => 'x21d4',
 242     'harr'     => 'x2194',
 243     'hearts'   => 'x2665',
 244     'hellip'   => 'x2026',
 245     'Iacute'   =>   '205',
 246     'iacute'   =>   '237',
 247     'Icirc'    =>   '206',
 248     'icirc'    =>   '238',
 249     'iexcl'    =>   '161',
 250     'Igrave'   =>   '204',
 251     'igrave'   =>   '236',
 252     'image'    => 'x2111',
 253     'infin'    => 'x221e',
 254     'int'      => 'x222b',
 255     'Iota'     =>   '921',
 256     'iota'     =>   '953',
 257     'iquest'   =>   '191',
 258     'isin'     => 'x2208',
 259     'Iuml'     =>   '207',
 260     'iuml'     =>   '239',
 261     'Kappa'    =>   '922',
 262     'kappa'    =>   '954',
 263     'Lambda'   =>   '923',
 264     'lambda'   =>   '955',
 265     'lang'     => 'x2329',
 266     'laquo'    =>   '171',
 267     'lArr'     => 'x21d0',
 268     'larr'     => 'x2190',
 269     'lceil'    => 'x2308',
 270     'ldquo'    => 'x201c',
 271     'le'       => 'x2264',
 272     'lfloor'   => 'x230a',
 273     'lowast'   => 'x2217',
 274     'loz'      => 'x25ca',
 275     'lrm'      => 'x200e',
 276     'lsaquo'   => 'x2039',
 277     'lsquo'    => 'x2018',
 278     'macr'     =>   '175',
 279     'mdash'    => 'x2014',
 280     'micro'    =>   '181',
 281     'middot'   =>   '183',
 282     'minus'    => 'x2212',
 283     'Mu'       =>   '924',
 284     'mu'       =>   '956',
 285     'nabla'    => 'x2207',
 286     'nbsp'     =>   '160',
 287     'ndash'    => 'x2013',
 288     'ne'       => 'x2260',
 289     'ni'       => 'x220b',
 290     'not'      =>   '172',
 291     'notin'    => 'x2209',
 292     'nsub'     => 'x2284',
 293     'Ntilde'   =>   '209',
 294     'ntilde'   =>   '241',
 295     'Nu'       =>   '925',
 296     'nu'       =>   '957',
 297     'Oacute'   =>   '211',
 298     'oacute'   =>   '243',
 299     'Ocirc'    =>   '212',
 300     'ocirc'    =>   '244',
 301     'OElig'    =>   '338',
 302     'oelig'    =>   '339',
 303     'Ograve'   =>   '210',
 304     'ograve'   =>   '242',
 305     'oline'    => 'x203e',
 306     'Omega'    =>   '937',
 307     'omega'    =>   '969',
 308     'Omicron'  =>   '927',
 309     'omicron'  =>   '959',
 310     'oplus'    => 'x2295',
 311     'or'       => 'x2228',
 312     'ordf'     =>   '170',
 313     'ordm'     =>   '186',
 314     'Oslash'   =>   '216',
 315     'oslash'   =>   '248',
 316     'Otilde'   =>   '213',
 317     'otilde'   =>   '245',
 318     'otimes'   => 'x2297',
 319     'Ouml'     =>   '214',
 320     'ouml'     =>   '246',
 321     'para'     =>   '182',
 322     'part'     => 'x2202',
 323     'permil'   => 'x2030',
 324     'perp'     => 'x22a5',
 325     'Phi'      =>   '934',
 326     'phi'      =>   '966',
 327     'Pi'       =>   '928',
 328     'pi'       =>   '960',
 329     'piv'      =>   '982',
 330     'plusmn'   =>   '177',
 331     'pound'    =>   '163',
 332     'Prime'    => 'x2033',
 333     'prime'    => 'x2032',
 334     'prod'     => 'x220f',
 335     'prop'     => 'x221d',
 336     'Psi'      =>   '936',
 337     'psi'      =>   '968',
 338     'radic'    => 'x221a',
 339     'rang'     => 'x232a',
 340     'raquo'    =>   '187',
 341     'rArr'     => 'x21d2',
 342     'rarr'     => 'x2192',
 343     'rceil'    => 'x2309',
 344     'rdquo'    => 'x201d',
 345     'real'     => 'x211c',
 346     'reg'      =>   '174',
 347     'rfloor'   => 'x230b',
 348     'Rho'      =>   '929',
 349     'rho'      =>   '961',
 350     'rlm'      => 'x200f',
 351     'rsaquo'   => 'x203a',
 352     'rsquo'    => 'x2019',
 353     'sbquo'    => 'x201a',
 354     'Scaron'   =>   '352',
 355     'scaron'   =>   '353',
 356     'sdot'     => 'x22c5',
 357     'sect'     =>   '167',
 358     'shy'      =>   '173',
 359     'Sigma'    =>   '931',
 360     'sigma'    =>   '963',
 361     'sigmaf'   =>   '962',
 362     'sim'      => 'x223c',
 363     'spades'   => 'x2660',
 364     'sub'      => 'x2282',
 365     'sube'     => 'x2286',
 366     'sum'      => 'x2211',
 367     'sup'      => 'x2283',
 368     'sup1'     =>   '185',
 369     'sup2'     =>   '178',
 370     'sup3'     =>   '179',
 371     'supe'     => 'x2287',
 372     'szlig'    =>   '223',
 373     'Tau'      =>   '932',
 374     'tau'      =>   '964',
 375     'there4'   => 'x2234',
 376     'Theta'    =>   '920',
 377     'theta'    =>   '952',
 378     'thetasym' =>   '977',
 379     'thinsp'   => 'x2009',
 380     'THORN'    =>   '222',
 381     'thorn'    =>   '254',
 382     'tilde'    =>   '732',
 383     'times'    =>   '215',
 384     'trade'    => 'x2122',
 385     'Uacute'   =>   '218',
 386     'uacute'   =>   '250',
 387     'uArr'     => 'x21d1',
 388     'uarr'     => 'x2191',
 389     'Ucirc'    =>   '219',
 390     'ucirc'    =>   '251',
 391     'Ugrave'   =>   '217',
 392     'ugrave'   =>   '249',
 393     'uml'      =>   '168',
 394     'upsih'    =>   '978',
 395     'Upsilon'  =>   '933',
 396     'upsilon'  =>   '965',
 397     'Uuml'     =>   '220',
 398     'uuml'     =>   '252',
 399     'weierp'   => 'x2118',
 400     'Xi'       =>   '926',
 401     'xi'       =>   '958',
 402     'Yacute'   =>   '221',
 403     'yacute'   =>   '253',
 404     'yen'      =>   '165',
 405     'Yuml'     =>   '376',
 406     'yuml'     =>   '255',
 407     'Zeta'     =>   '918',
 408     'zeta'     =>   '950',
 409     'zwj'      => 'x200d',
 410     'zwnj'     => 'x200c'
 411 ) }
 412
 413
 414 #### Blosxom plug-in interface ##########################################
 415 my $_haveBX;
 416 BEGIN {
 417     no warnings 'once';
 418     $_haveBX = defined($blosxom::version);
 419 }
 420
 421 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
 422 # which posts Markdown should process, using a "meta-markup: markdown"
 423 # header. If it's set to 0 (the default), Markdown will process all
 424 # entries.
 425 my $g_blosxom_use_meta;
 426 BEGIN {
 427     $g_blosxom_use_meta = 0;
 428 }
 429
 430 sub start { 1; }
 431 sub story {
 432     my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
 433
 434     if ((! $g_blosxom_use_meta) or
 435         (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
 436          ) {
 437             $$body_ref = Markdown($$body_ref);
 438     }
 439     1;
 440 }
 441
 442
 443 #### Movable Type plug-in interface #####################################
 444 my $_haveMT = eval {require MT; 1;}; # Test to see if we're running in MT
 445 my $_haveMT3 = $_haveMT && eval {require MT::Plugin; 1;}; # and MT >= MT 3.0.
 446
 447 if ($_haveMT) {
 448     require MT;
 449     import  MT;
 450     require MT::Template::Context;
 451     import  MT::Template::Context;
 452
 453     if ($_haveMT3) {
 454         require MT::Plugin;
 455         import  MT::Plugin;
 456         my $plugin = new MT::Plugin({
 457             name => "Markdown",
 458             description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
 459             doc_link => 'https://daringfireball.net/projects/markdown/'
 460         });
 461         MT->add_plugin( $plugin );
 462     }
 463
 464     MT::Template::Context->add_container_tag(MarkdownOptions => sub {
 465         my $ctx  = shift;
 466         my $args = shift;
 467         my $builder = $ctx->stash('builder');
 468         my $tokens = $ctx->stash('tokens');
 469
 470         if (defined ($args->{'output'}) ) {
 471             $ctx->stash('markdown_output', lc $args->{'output'});
 472         }
 473
 474         defined (my $str = $builder->build($ctx, $tokens) )
 475             or return $ctx->error($builder->errstr);
 476         $str; # return value
 477     });
 478
 479     MT->add_text_filter('markdown' => {
 480         label     => 'Markdown',
 481         docs      => 'https://daringfireball.net/projects/markdown/',
 482         on_format => sub {
 483             my $text = shift;
 484             my $ctx  = shift;
 485             my $raw  = 0;
 486             if (defined $ctx) {
 487             my $output = $ctx->stash('markdown_output');
 488                 if (defined $output && $output =~ m/^html/i) {
 489                     $g_empty_element_suffix = ">";
 490                     $ctx->stash('markdown_output', '');
 491                 }
 492                 elsif (defined $output && $output eq 'raw') {
 493                     $raw = 1;
 494                     $ctx->stash('markdown_output', '');
 495                 }
 496                 else {
 497                     $raw = 0;
 498                     $g_empty_element_suffix = " />";
 499                 }
 500             }
 501             $text = $raw ? $text : Markdown($text);
 502             $text;
 503         },
 504     });
 505
 506     # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
 507     my $smartypants;
 508
 509     {
 510         no warnings "once";
 511         $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
 512     }
 513
 514     if ($smartypants) {
 515         MT->add_text_filter('markdown_with_smartypants' => {
 516             label     => 'Markdown With SmartyPants',
 517             docs      => 'https://daringfireball.net/projects/markdown/',
 518             on_format => sub {
 519                 my $text = shift;
 520                 my $ctx  = shift;
 521                 if (defined $ctx) {
 522                     my $output = $ctx->stash('markdown_output');
 523                     if (defined $output && $output eq 'html') {
 524                         $g_empty_element_suffix = ">";
 525                     }
 526                     else {
 527                         $g_empty_element_suffix = " />";
 528                     }
 529                 }
 530                 $text = Markdown($text);
 531                 $text = $smartypants->($text, '1');
 532             },
 533         });
 534     }
 535 }
 536
 537 sub _tabDefault {
 538         return $_haveBX || $_haveMT ? 4 : 8;
 539 }
 540
 541 sub _strip {
 542         my $str = shift;
 543         defined($str) or return undef;
 544         $str =~ s/^\s+//;
 545         $str =~ s/\s+$//;
 546         $str =~ s/\s+/ /g;
 547         $str;
 548 }
 549
 550 my %_yamlmode;
 551 BEGIN {%_yamlmode = (
 552     disable => 0,
 553     reveal => 1,
 554     enable => 1,
 555     conceal => 1,
 556     show => -1,
 557     unknown => -1,
 558     strip => -1
 559 )}
 560 my %_yamlvis;
 561 BEGIN {%_yamlvis = (
 562     disable => 0,
 563     reveal => 1,
 564     enable => -1,
 565     conceal => 0,
 566     show => 1,
 567     unknown => -1,
 568     strip => 0
 569 )}
 570
 571 sub _require_pod_usage() {
 572         require Pod::Usage;
 573         eval 'require Pod::Text::Termcap; 1;' and
 574                 @Pod::Usage::ISA = (qw( Pod::Text::Termcap ));
 575         defined($ENV{PERLDOC}) && $ENV{PERLDOC} ne "" or
 576                 $ENV{PERLDOC} = "-oterm -oman";
 577 }
 578
 579 #### BBEdit/command-line text filter interface ##########################
 580 sub _main {
 581     local *ARGV = \@_;
 582
 583
 584     #### Check for command-line switches: #################
 585     my %options = ();
 586     my %cli_opts;
 587     my $raw = 0;
 588     use Getopt::Long;
 589     Getopt::Long::Configure(qw(bundling require_order pass_through));
 590     GetOptions(
 591         'help' => sub {
 592                         _require_pod_usage;
 593                         Pod::Usage::pod2usage(-verbose => 2, -exitval => 0)},
 594         'h' => sub {
 595                         _require_pod_usage;
 596                         Pod::Usage::pod2usage(-verbose => 0, -exitval => 0)},
 597         'version|V' => sub { # Version info
 598                         print "\nThis is Markdown, version $VERSION $DATE.\n", $COPYRIGHT;
 599                         print "License is Modified BSD (aka 3-clause BSD) License\n";
 600                         print "<https://opensource.org/licenses/BSD-3-Clause>\n";
 601                         exit 0},
 602         'shortversion|short-version|s' => sub { # Just the version number string
 603                         print $VERSION;
 604                         exit 0},
 605         'html4tags' => \$cli_opts{'html4tags'},
 606         'deprecated' => \$cli_opts{'deprecated'},
 607         'sanitize' => \$cli_opts{'sanitize'},
 608         'no-sanitize' => sub {$cli_opts{'sanitize'} = 0},
 609         'validate-xml' => sub {$cli_opts{'validate-xml'} = 1},
 610         'validate-xml-internal' => sub {$cli_opts{'validate-xml'} = 2},
 611         'no-validate-xml' => sub {$cli_opts{'validate-xml'} = 0},
 612         'stripcommentsstrict|stripcomments-strict|strip-comments-strict' =>
 613                 sub {$cli_opts{'stripcomments'} = 1},
 614         'stripcomments|stripcommentslax|stripcomments-lax|strip-comments|strip-comments-lax' =>
 615                 sub {$cli_opts{'stripcomments'} = 2},
 616         'stripcommentslaxonly|stripcomments-laxonly|stripcomments-lax-only|strip-comments-lax-only' =>
 617                 sub {$cli_opts{'stripcomments'} = 3},
 618         'nostripcomments|no-stripcomments|no-strip-comments' => sub {$cli_opts{'stripcomments'} = 0},
 619         'keepabs|keep-abs|k' => \$cli_opts{'keepabs'},
 620         'absroot|a=s' => \$cli_opts{'absroot'},
 621         'base|b=s' => \$cli_opts{'base'},
 622         'htmlroot|r=s' => \$cli_opts{'htmlroot'},
 623         'imageroot|i=s' => \$cli_opts{'imageroot'},
 624         'div:s' => \$cli_opts{'divname'},
 625         'wiki|w:s' => \$cli_opts{'wiki'},
 626         'tabwidth|tab-width=s' => \$cli_opts{'tabwidth'},
 627         'autonumber|auto-number' => \$cli_opts{'autonumber'},
 628         'raw' => sub { $cli_opts{'raw'} = 1 },
 629         'raw-xml' => sub { $cli_opts{'raw'} = 1 },
 630         'raw-html' => sub { $cli_opts{'raw'} = 2 },
 631         'stylesheet|style-sheet' => \$cli_opts{'stylesheet'},
 632         'no-stylesheet|no-style-sheet' => sub {$cli_opts{'stylesheet'} = 0},
 633         'keep-named-character-entities' => \$cli_opts{'keepcharents'},
 634         'no-keep-named-character-entities' => sub {$cli_opts{'keepcharents'} = 0},
 635         'us-ascii|ascii' => \$cli_opts{'us_ascii'},
 636         'no-us-ascii|no-ascii' => sub {$cli_opts{'us_ascii'} = 0},
 637         'stub' => \$cli_opts{'stub'},
 638         'yaml:s' => \$cli_opts{'yaml'},
 639     );
 640     defined($cli_opts{'raw'}) or $cli_opts{'raw'} = 0;
 641     my $stub = 0;
 642     if ($cli_opts{'stub'}) {
 643         $stub = 1;
 644     }
 645     if ($cli_opts{'html4tags'}) {        # Use HTML tag style instead of XHTML
 646         $options{empty_element_suffix} = ">";
 647         $stub = -$stub;
 648     }
 649     if ($cli_opts{'deprecated'}) {       # Allow <dir> and <menu> tags to pass through
 650         _SetAllowedTag("dir");
 651         _SetAllowedTag("menu");
 652     }
 653     my $xmlcheck;
 654     $options{'keep_named_character_entities'} = $cli_opts{'keepcharents'} ? "1" : 0;
 655     $options{'us_ascii'} = $cli_opts{'us_ascii'} ? "1" : 0;
 656     $options{divwrap} = defined($cli_opts{'divname'});
 657     $options{divname} = defined($cli_opts{'divname'}) ? $cli_opts{'divname'} : "";
 658     $options{sanitize} = 1; # sanitize by default
 659     $options{sanitize} = $cli_opts{'sanitize'} if defined($cli_opts{'sanitize'});
 660     $xmlcheck = $options{sanitize} ? 2 : 0;
 661     $xmlcheck = $cli_opts{'validate-xml'} if defined($cli_opts{'validate-xml'});
 662     $options{stripcomments} = $cli_opts{'stripcomments'} if defined($cli_opts{'stripcomments'});
 663     die "--html4tags and --validate-xml are incompatible\n"
 664         if $cli_opts{'html4tags'} && $xmlcheck == 1;
 665     die "--no-sanitize and --validate-xml-internal are incompatible\n"
 666         if !$options{'sanitize'} && $xmlcheck == 2;
 667     die "--no-sanitize and --strip-comments are incompatible\n"
 668         if !$options{'sanitize'} && $options{stripcomments};
 669     die "--raw-html requires --validate-xml-internal\n"
 670         if $cli_opts{'raw'} == 2 && $xmlcheck != 2;
 671     if ($xmlcheck == 1) {
 672         eval { require XML::Simple; 1 } and $hasxml = 1 or $hasxml_err = $@;
 673         eval { require XML::Parser; 1 } and $hasxmlp = 1 or $hasxmlp_err = $@ unless $hasxml;
 674         die "$hasxml_err$hasxmlp_err" unless $hasxml || $hasxmlp;
 675     }
 676     if ($cli_opts{'tabwidth'}) {
 677         my $tw = $cli_opts{'tabwidth'};
 678         die "invalid tab width (must be integer)\n" unless looks_like_number $tw;
 679         die "invalid tab width (must be >= 2 and <= 32)\n" unless $tw >= 2 && $tw <= 32;
 680         $options{tab_width} = int(0+$tw);
 681     }
 682     $options{auto_number} = 6 if $cli_opts{'autonumber'};
 683     $options{keepabs} = $cli_opts{'keepabs'};
 684     $options{abs_prefix} = "";          # no abs prefix by default
 685     if ($cli_opts{'absroot'}) {         # Use abs prefix for absolute path URLs
 686         my $abs = $cli_opts{'absroot'};
 687         $abs =~ s{/+$}{};
 688         $options{abs_prefix} = $abs;
 689     }
 690     $options{base_prefix} = "";         # no base prefix by default
 691     if ($cli_opts{'base'}) {            # Use base prefix for fragment URLs
 692         $options{base_prefix} = $cli_opts{'base'};
 693     }
 694     if ($cli_opts{'htmlroot'}) {         # Use URL prefix
 695         $options{url_prefix} = $cli_opts{'htmlroot'};
 696     }
 697     if ($cli_opts{'imageroot'}) {        # Use image URL prefix
 698         $options{img_prefix} = $cli_opts{'imageroot'};
 699     }
 700     SetWikiOpts(\%options, $cli_opts{'wiki'}); # Set wiki links options
 701     if (ref($options{wikiopt}) eq 'HASH') {
 702         my $o = $options{wikiopt};
 703         $o->{"f"} && $o->{"%"} and
 704             die "--wiki sub-options 'f' and '%' are mutually exclusive\n"
 705     }
 706     if ($cli_opts{'raw'}) {
 707         $raw = 1;
 708         $options{htmlauto} = 1 if $cli_opts{'raw'} == 2;
 709     }
 710     $options{show_styles} = $cli_opts{'stylesheet'} if defined($cli_opts{'stylesheet'});
 711     $options{show_styles} = 1 if $stub && !defined($options{show_styles});
 712     $options{tab_width} = 8 unless defined($options{tab_width});
 713     my $ym = $cli_opts{'yaml'};
 714     defined($ym) && $ym ne "" or $ym = "enable";
 715     my $lcym = lc($ym);
 716     exists($_yamlmode{$lcym}) or die "invalid --yaml= value '$ym'\n";
 717     $options{yamlmode} = $_yamlmode{$lcym};
 718     $options{yamlvis} = $_yamlvis{$lcym};
 719
 720     my $hdrf = sub {
 721         my $out = "";
 722         if ($stub > 0) {
 723             $out .= <<'HTML5';
 724 <!DOCTYPE html>
 725 <html xmlns="http://www.w3.org/1999/xhtml">
 726 <head>
 727 <meta charset="utf-8" />
 728 <meta http-equiv="content-type" content="text/html; charset=utf-8" />
 729 HTML5
 730         } elsif ($stub < 0) {
 731             $out .= <<'HTML4';
 732 <html>
 733 <head>
 734 <meta charset="utf-8">
 735 <meta http-equiv="content-type" content="text/html; charset=utf-8">
 736 HTML4
 737         }
 738         if ($stub && ($options{title} || $options{h1})) {
 739             my $title = $options{title};
 740             defined($title) && $title ne "" or $title = $options{h1};
 741             if (defined($title) && $title ne "") {
 742                 $title =~ s/&/&amp;/g;
 743                 $title =~ s/</&lt;/g;
 744                 $out .= "<title>$title</title>\n";
 745             }
 746         }
 747         $out .= GenerateStyleSheet($g_style_prefix) if $options{show_styles};
 748         if ($stub) {
 749             $out .= "</head>\n<body style=\"text-align:center\">\n" .
 750                 "<div style=\"display:inline-block;text-align:left;max-width:42pc\">\n";
 751         }
 752         $out;
 753     };
 754
 755     #### Process incoming text: ###########################
 756     my ($didhdr, $hdr, $result, $ftr) = (0, "", "", "");
 757     @ARGV or push(@ARGV, "-");
 758     foreach (@ARGV) {
 759         my ($fh, $contents, $oneresult);
 760         $_ eq "-" or open $fh, '<', $_ or fauxdie "could not open \"$_\": $!\n";
 761         {
 762             local $/; # Slurp the whole file
 763             $_ eq "-" and $contents = <STDIN>;
 764             $_ ne "-" and $contents = <$fh>;
 765         }
 766         defined($contents) or fauxdie "could not read \"$_\": $!\n";
 767         $_ eq "-" or close($fh);
 768         $options{xmlcheck} = ($xmlcheck == 2) ? 2 : 0;
 769         $oneresult = $raw ? ProcessRaw($contents, \%options) : Markdown($contents, \%options);
 770         $oneresult =~ s/\s+$//os;
 771         if ($oneresult ne "") {
 772             if (!$didhdr && !$raw) {
 773                 $hdr = &$hdrf();
 774                 $didhdr = 1;
 775             }
 776             $result .= $oneresult . "\n";
 777         }
 778     }
 779     $hdr = &$hdrf() unless $didhdr || $raw;
 780     $ftr = "</div>\n</body>\n</html>\n" if $stub && !$raw;
 781     if ($xmlcheck == 1) {
 782         my ($good, $errs);
 783         if ($stub && !$raw) {
 784             ($good, $errs) = _xmlcheck($hdr.$result.$ftr);
 785         } else {
 786             ($good, $errs) = _xmlcheck("<div>".$result."</div>");
 787         }
 788         $good or die $errs;
 789     }
 790     print $hdr, $result, $ftr;
 791
 792     exit 0;
 793 }
 794
 795
 796 # INPUT
 797 #   $1: HASH ref
 798 #   $2: value of --wiki= option (see docs) except
 799 #       that a value of undef turns off wiki links
 800 # OUTPUT
 801 #   $1->{wikipat}
 802 #   $1->{wikiopt}
 803 #
 804 sub SetWikiOpts {
 805     my ($o, $wpat) = @_;
 806     ref($o) eq "HASH" or die "internal error: first arg to SetWikiOpts must be HASH ref";
 807     delete $o->{wikipat};
 808     delete $o->{wikiopt};
 809     defined($wpat) or return;
 810     # Parse wiki links option setting
 811     my $wopt = "s(:md)";
 812     if ($wpat =~ /^(.*?)%\{((?:[%0-9A-Za-z]|[Ss]\([^)]*\))*)\}(.*)$/) {
 813         $o->{wikipat} = $1 . "%{}" . $3;
 814         $wopt = $2;
 815     } else {
 816         $o->{wikipat} = $wpat . "%{}.html";
 817     }
 818     my $sval = 1;
 819     while ($wopt =~ /^(.*?)s\(([^)]*)\)(.*)$/i) {
 820         my $sarg = $2;
 821         $wopt = $1 . "s" . $3;
 822         $sarg =~ s/^\s+//; $sarg =~ s/\s+$//;
 823         $sval = {} unless ref($sval) eq "HASH";
 824         s/^\.//, $sval->{lc($_)}=1 foreach split(/(?:\s*,\s*)|(?:(?<!,)\s+(?!,))/, $sarg);
 825         $sval = 1 unless scalar(keys(%$sval));
 826     }
 827     $o->{wikiopt} = { map({$_ => 1} split(//,lc($wopt))) };
 828     if (ref($sval) eq "HASH" && $sval->{':md'}) {
 829         delete $sval->{':md'};
 830         $sval->{$_} = 1 foreach qw(md rmd mkd mkdn mdwn mdown markdown litcoffee);
 831     }
 832     $o->{wikiopt}->{'s'} = $sval if $o->{wikiopt}->{'s'};
 833 }
 834
 835
 836 # Return a copy of the fancy CSS style sheet that uses the
 837 # passed in prefix as a prefix for the CSS style names.
 838 # If no argument is passed in, use $g_style_prefix
 839 # as the CSS style name prefix.
 840 sub GenerateStyleSheet {
 841     my $prefix = shift;
 842     defined($prefix) or $prefix = $g_style_prefix;
 843     my $stylesheet = $g_style_sheet;
 844     $stylesheet =~ s/%\(base\)/$prefix/g;
 845     return $stylesheet;
 846 }
 847
 848
 849 sub _xmlcheck {
 850         my $text = shift;
 851         my ($good, $errs);
 852         ($hasxml ? eval { XML::Simple::XMLin($text, KeepRoot => 1) && 1 } :
 853          eval {
 854                 my $p = XML::Parser->new(Style => 'Tree', ErrorContext => 1);
 855                 $p->parse($text) && 1;
 856         }) and $good = 1 or $errs = _trimerr($@);
 857         ($good, $errs);
 858 }
 859
 860
 861 sub _trimerr {
 862         my $err = shift;
 863         1 while $err =~ s{\s+at\s+\.?/[^,\s\n]+\sline\s+[0-9]+\.?(\n|$)}{$1}is;
 864         $err =~ s/\s+$//os;
 865         $err . "\n";
 866 }
 867
 868
 869 sub _PrepareInput {
 870     my ($input,$parseyaml) = @_;
 871     defined $input or $input = "";
 872     {
 873         use bytes;
 874         $input =~ s/[\x00-\x08\x0B\x0E-\x1F\x7F]+//gso;
 875     }
 876     my $output;
 877     if (Encode::is_utf8($input) || utf8::decode($input)) {
 878         $output = $input;
 879     } else {
 880         $output = $encoder->decode($input, Encode::FB_DEFAULT);
 881     }
 882     # Standardize line endings:
 883     $output =~ s{\r\n}{\n}g;  # DOS to Unix
 884     $output =~ s{\r}{\n}g;    # Mac to Unix
 885
 886     # Extract YAML front matter if requested
 887     my $yaml = undef;
 888     if ($parseyaml) {
 889         $yaml = {};
 890         if ($output =~ /^---[ \t]*(?:\n|\z)/g) {
 891             until ($output =~ /\G(?:(?:(?:---)|(?:\.\.\.))[ \t]*(?:\n|\z)|\z)/gc) {
 892                 next if $output =~ m"\G[ \t]*(?:#[^\n]*)?\n"gc; # skip comment lines
 893                 next if $output =~ m"\G[ \t]*(?:#[^\n]*)\z"gc; # skip final no EOL comment
 894                 last unless $output =~ /\G([^\n]+)(?:\n|\z)/gc;
 895                 my $yl = $1;
 896                 if ($yl =~ /^([A-Za-z_][A-Za-z_0-9.-]*):[ \t]+(.*)$/os) {
 897                         my ($k, $v) = ($1, $2);
 898                         $yaml->{lc($k)} = _YAMLvalue($2);
 899                 }
 900             }
 901             $output = substr($output, pos($output));
 902         }
 903     }
 904     return wantarray ? ($output, $yaml) : $output;
 905 }
 906
 907
 908 sub _YAMLvalue {
 909         my $v = shift;
 910         $v =~ s/^\s+//;
 911         if (substr($v, 0, 1) eq '"') {
 912                 # only $ and/or @ present issues, map them
 913                 $v =~ tr/\@\$/\036\037/;
 914                 eval '{$v='.$v."\n}1" or $v = undef;
 915                 $v =~ tr/\036\037/\@\$/ if defined($v);
 916         } else {
 917                 $v =~ s"#.*$""os;
 918                 $v =~ s/\s+$//os;
 919                 $v ne "" or $v = undef;
 920         }
 921         return $v;
 922 }
 923
 924
 925 sub ProcessRaw {
 926     my $text = _PrepareInput(shift);
 927
 928     # Any remaining arguments after the first are options; either a single
 929     # hashref or a list of name, value pairs.  See _SanitizeOpts comments.
 930
 931     %opt = (
 932         empty_element_suffix    => $g_empty_element_suffix,
 933     );
 934     my %args = ();
 935     if (ref($_[0]) eq "HASH") {
 936         %args = %{$_[0]};
 937     } else {
 938         %args = @_;
 939     }
 940     while (my ($k,$v) = each %args) {
 941         $opt{$k} = $v;
 942     }
 943     _SanitizeOpts(\%opt);
 944
 945     # Sanitize all '<'...'>' tags if requested
 946     $text = _SanitizeTags($text, $opt{xmlcheck}, $opt{htmlauto}) if $opt{sanitize};
 947
 948     # Eliminate known named character entities
 949     $opt{keep_named_character_entities} or
 950         $text = ConvertNamedCharacterEntities($text);
 951
 952     # Convert to US-ASCII only if requested
 953     $opt{us_ascii} and
 954         $text = ConvertToASCII($text);
 955
 956     utf8::encode($text);
 957     if ($opt{divwrap}) {
 958         my $id = $opt{divname};
 959         defined($id) or $id = "";
 960         $id eq "" or $id = ' id="'.escapeXML($id).'"';
 961         chomp($text);
 962         return "<div$id>\n".$text."\n</div>\n";
 963     }
 964     return $text;
 965 }
 966
 967
 968 # $1: HASH ref with the following key value semantics
 969 #
 970 #   sanitize => any-false-value (no action), any-true-value (sanitize).
 971 #               note that a true value of xmlcheck or a true value of
 972 #               stripcomments or a urlfunc value that is a CODE ref
 973 #               always forces sanitize to activate.
 974 #               tag attributes are sanitized by removing all "questionable"
 975 #               attributes (such as script attributes, unknown attributes
 976 #               and so forth) and normalizing the remaining ones (i.e.
 977 #               adding missing quotes and/or values etc.).
 978 #               effective for both ProcessRaw and Markdown.
 979 #   xmlcheck => 0 (no check), any-true-value (internal check).
 980 #               note that the default if xmlcheck is not set/valid is 2.
 981 #               note that a true value is effective for both ProcessRaw
 982 #               and Markdown.  note that a true value automatically inserts
 983 #               the closing tag for auto-closing tags and converts empty tags
 984 #               to the correct format converting empty tags that shouldn't be
 985 #               to an open and close pair; since xmlcheck is a function of the
 986 #               sanitizer, tag attributes are also always sanitized whenever
 987 #               xmlcheck has a true value.
 988 #               note that a true xmlcheck value WILL call "die" with a
 989 #               detailed indication of the error(s) if xml validation fails
 990 #               in which case any line/column numbers refer to the text that
 991 #               would be produced by a sanitize=>0, xmlcheck=>0 call to
 992 #               either ProcessRaw or Markdown, NOT the original input text.
 993 #   htmlauto => any-false-value (no auto close), any-true-value (auto-close)
 994 #               only effective for ProcessRaw; always enabled for Markdown.
 995 #               when xmlcheck is set to 2 provide html automatic closing tag
 996 #               and optional closing tag semantics where closing tags are
 997 #               automatically inserted when encountering an opening tag that
 998 #               auto closes a currently open tag and tags with an optional
 999 #               closing tag that's missing have that inserted as appropriate.
1000 #               a true value may result in some texts being rejected that
1001 #               would be otherwise be accepted (e.g. "<p><pre></pre></p>"
1002 #               which gets turned into "<p></p><pre></pre></p>" which then
1003 #               no longer validates).
1004 #   stripcomments => any-false-value (no action), any-true-value (strip).
1005 #                 => 1 (strip-strict), 2 (strip-lax), 3 (strip-lax-only)
1006 #               a non-numeric true value will be forced to 2.
1007 #               a numeric value < 0 will be forced to 2.
1008 #               a numeric value > 0 and < 1 will be forced to 2.
1009 #               a numeric value > 3 will be forced to 3.
1010 #               a non-integer value will forced to an integral value.
1011 #               1, 2, and 3 correspond to the command line options
1012 #               --strip-comments-strict, --strip-comments-lax and
1013 #               --strip-comments-lax-only respectively.
1014 #               since the strip comments mechanism is a function of the
1015 #               sanitizer, if stripcomments is set to any-true-value then
1016 #               tag attributes will also always be sanitized.
1017 #               if stripcomments is not set or is set to the empty string,
1018 #               then it will be set to 3 if sanitize is true and 0 otherwise.
1019 #               effective for both ProcessRaw and Markdown.
1020 #   empty_element_suffix => " />" or ">"
1021 #               will be forced to " />" if not valid or defined.
1022 #               effective for both ProcessRaw and Markdown.
1023 #   keep_named_character_entities => "1" (keep them), any-other-value (convert).
1024 #               unless this option is present and has exactly the value "1"
1025 #               then known named character entities will be converted to
1026 #               their equivalent numerical entity.  Use of this option is
1027 #               strongly discouraged to avoid strict XML validation failures.
1028 #   us_ascii    => if true, non-US-ASCII characters will be converted to
1029 #               numerical character entities making the output US-ASCII only.
1030 #   divwrap     => if true, wrap output contents in <div>...</div>
1031 #   divname     => if defined and non-empty will be id of divwrap div tag
1032 #   urlfunc     => if set to a CODE ref, the function will be called with
1033 #               seven arguments like so:
1034 #                 $result = &$urlfunc($iresult, \%opts, $tag, $uhost, $uabs, $q, $f)
1035 #               where on input $iresult is the result that would be produced
1036 #               if no urlfunc was provided and on output $result will be
1037 #               used as url value.  $tag is either "img" or "a" to indicate the
1038 #               source of the url.  $uhost.$uabs is the result of stripping off
1039 #               any query string and/or fragment from $iresult.  $q contains either
1040 #               an empty string or the stripped off query string and $f contains
1041 #               an empty string or the stripped off fragment if they were originally
1042 #               present where a non-empty $q always starts with '?' and a non-empty
1043 #               $f always starts with '#'.  $uhost contains the scheme and
1044 #               host+port if present (it may be the empty string).  $uabs contains
1045 #               the path portion which may or may not start with a "/" depending on
1046 #               whether or not it's a relative path.  The $iresult value is related
1047 #               to the other arguments like so:
1048 #                 $iresult = $uhost . $uabs . $q . $f;
1049 #               All values passed to the urlfunc function have already been HTML
1050 #               unescaped and the returned value will be automatically HTML escaped.
1051 #               Any provided urlfunc should treat the %opts HASH as
1052 #               read-only.  Modifying the %opts HASH in urlfunc will
1053 #               likely result in unpredictable behavior!  Don't do it!
1054 #               If urlfunc is set to a CODE ref then tags will also always be
1055 #               sanitized.
1056 #
1057 #  The remaining key value pairs are ignored by ProcessRaw and are only
1058 #  effective when using Markdown or _main
1059 #
1060 #   tab_width => 1..32 which is how many spaces tabs are expanded to.
1061 #               will be forced to 8 if not in range.
1062 #   indent_width => 1..32 how many spaces make a new "indent" level.
1063 #               will be forced to 4 if not in range.
1064 #   style_prefix => prefix to prepend to all CSS style names in the
1065 #               fancy CSS style sheet.
1066 #               defaults to $g_style_prefix if not defined.
1067 #               note that _main actually adds the style sheet (when
1068 #               requested); use GenerateStyleSheet to retrieve the
1069 #               fancy style sheet when calling Markdown directly.
1070 #   auto_number => <= 0 (default) no numbering, 1 number h1s,
1071 #               2 number h1s, h2s, 3 number h1-h3s, ... >= 6 number h1-h6s
1072 #   anchors    => existence of this key triggers return of anchors HASH
1073 #   yamlmode   => 0 (no YAML processing), > 0 (YAML on), < 0 (YAML ignore)
1074 #               if 0, the YAML front matter processor is completely
1075 #               disabled and any YAML front matter that might be present
1076 #               will be treated as markup.  if > 0 any YAML front matter
1077 #               will be processed and any recognized options applied.
1078 #               if < 0 any YAML front matter will be parsed but no
1079 #               options will be applied at all.  When != 0 the parsed
1080 #               YAML front matter can be retrieved via the 'yaml' key.
1081 #               defaults to 1 if not defined or not a number.
1082 #   yamlvis    => 0 (invisible), > 0 (visible), < 0 (vis if unknown)
1083 #               if yamlmode == 0 then yamlvis has no effect.  if > 0
1084 #               then any parsed YAML front matter options will be shown
1085 #               in the formatted output.  if 0 then NO YAML front
1086 #               matter options will be shown in the formatted output.
1087 #               if < 0 then YAML front matter options will be shown in
1088 #               the formatted output only if there are any unrecognized
1089 #               options present.
1090 #               defaults to -1 if not defined or not a number.
1091 #   keepabs    => any-false-value (no action), any-true-value (keep)
1092 #               if true, any absolute path URLs remaining after applying
1093 #               any abs_prefix value will be kept and not be subject
1094 #               to modification by any url_prefix or img_prefix value.
1095 #   abs_prefix => value to prefix to absolute path URLs (i.e. start with /).
1096 #               note that this does NOT get prepended to //host/path URLs.
1097 #   url_prefix => value to prefix to non-absolute URLs.
1098 #               note that this does NOT get prepended to //host/path URLs.
1099 #   img_prefix => value to prefix to non-absolute image URLs.
1100 #               note that this does NOT get prepended to //host/path URLs.
1101 #               note that if img_prefix is undef or empty ("") then
1102 #               url_prefix will be prepended to image URLs.
1103 #   base_prefix => value to prefix to fragment-only URLs (i.e. start with #).
1104 #               note that fragment-only URLs are always left undisturbed
1105 #               if this is not set.  Fragment-only URLs are NOT affected by
1106 #               any of abs_prefix, url_prefix or img_prefix.
1107 #   wikipat     => non-empty pattern string to enable wiki links.
1108 #               best set with SetWikiOpts (see SetWikiOpts comments).
1109 #   wikiopt     => HASH ref of options affecting wiki links processing.
1110 #               best set with SetWikiOpts (see SetWikiOpts comments).
1111 #   wikifunc    => if set to a CODE ref, the function will be called with
1112 #               six arguments like so:
1113 #                 $result = &$wikifunc($iresult, \%opts, $link, $wbase, $qf, $io)
1114 #               where on input $iresult is the result that would be produced
1115 #               if no wikifunc was provided and on output $result will be
1116 #               used as the wiki expansion.  $link is the original wiki
1117 #               destination as specified in the source, $wbase is the result
1118 #               of stripping off any query string and/or fragment from $link
1119 #               and then transforming that according to the wikiopt HASH ref.
1120 #               $qf contains either an empty string or the stripped off
1121 #               query string and/or fragment if one was originally present.
1122 #               If $io is a HASH ref (otherwise it will be undef), then it's
1123 #               a wiki image link and %$io contains the options (if any) where
1124 #               the keys that might be present include "width", "height", "align"
1125 #               and "alt".  If present, width and height are guaranteed to be
1126 #               positive integers and align is guaranteed to be "left", "right"
1127 #               or "center".  The value for "alt" may be the empty string.
1128 #               The "imgflag" key has a SCALAR ref value and if the value it
1129 #               refers to is changed to any false value the result will become
1130 #               an A tag rather than an IMG tag.
1131 #               The $iresult value is related to the other arguments like so:
1132 #                 $iresult = $opts->{wikipat};
1133 #                 $iresult =~ s/%\{\}.+$/%{}/ if ref($io) eq "HASH";
1134 #                 $iresult = s/%\{\}/$wbase/;
1135 #                 $iresult .= $qf;
1136 #               Any provided wikifunc should treat the %opts HASH as
1137 #               read-only.  Modifying the %opts HASH in wikifunc will
1138 #               likely result in unpredictable behavior!  Don't do it!
1139 #
1140 #  Special handling for abs_prefix, url_prefix, img_prefix and/or base_prefix
1141 #  may be activated by setting any subset (or all) of the values for these
1142 #  keys to a CODE ref.  The single argument is the URL and the result must
1143 #  be the adjusted URL.  For example, the equivalent CODE ref to setting
1144 #  url_prefix to $string is simply sub { $string.$_[0] }.  By using a
1145 #  CODE ref, behavior other than simply performing a prepend operation
1146 #  can be realized when necessary for unusual situations.
1147 #
1148 #  The following are OUTPUT values that can only be retrieved when
1149 #  Markdown is called with a HASH ref as the second argument
1150 #
1151 #   anchors    => if the 'anchors' key exists in the input HASH ref
1152 #               will be set to a HASH ref containing lookup keys
1153 #               for valid fragment ids in the document (only those
1154 #               created from Markdown markup) with the value the
1155 #               actual fragment link to use.  Do not use this directly
1156 #               but pass it as the first argument to the ResolveFragment
1157 #               function to resolve a "fuzzy" fragment name to its
1158 #               actual fragment name in the generated output.
1159 #               NOTE: to activate return of anchors the 'anchors' key
1160 #               simply must exist in the input HASH ref passed to the
1161 #               Markdown function, its value will be replaced on output.
1162 #
1163 #   h1         => will be set to the tag-stripped value of the first
1164 #               non-empty H1 generated by Markdown-style markup.
1165 #               note that literal <h1>...</h1> values are NOT picked up.
1166 #               will be left unchanged if no Markdown-style H1 detected.
1167 #               note that the value is NOT xml escaped but should be
1168 #               before embedding in an XHTML document.  If yamlmode > 0
1169 #               and a 'title' value has been encountered, then this
1170 #               will be set to that 'title' value instead (and the
1171 #               'title' key and value will still be present in %$yaml).
1172 #
1173 #   yaml       => if yamlmode is != 0 then this will be set to a HASH
1174 #               ref containing any parsed YAML front matter or left
1175 #               unchanged if no YAML front matter was found.  If the
1176 #               parsed YAML front matter contains only whitespace and/or
1177 #               comments then this will be set to a HASH ref that has
1178 #               no keys or values.
1179 #
1180 sub _SanitizeOpts {
1181     my $o = shift; # hashref
1182     ref($o) eq "HASH" or return;
1183
1184     $o->{firstline} = 0;
1185     $o->{keep_named_character_entities} = 0 unless
1186         defined($o->{keep_named_character_entities}) && $o->{keep_named_character_entities} eq "1";
1187     $o->{xmlcheck} = looks_like_number($o->{xmlcheck}) && $o->{xmlcheck} == 0 ? 0 : 2;
1188     $o->{sanitize} = 1 if $o->{xmlcheck} && !$o->{sanitize};
1189     $o->{sanitize} = 1 if ref($o->{urlfunc}) eq 'CODE' && !$o->{sanitize};
1190     !looks_like_number($o->{stripcomments}) and
1191         $o->{stripcomments} = $o->{stripcomments} ? 2 :
1192             ($o->{sanitize} && (!defined($o->{stripcomments}) || $o->{stripcomments} eq "") ? 3 : 0);
1193     $o->{stripcomments} && $o->{stripcomments} < 1 and $o->{stripcomments} = 2;
1194     $o->{stripcomments} = int($o->{stripcomments});
1195     $o->{stripcomments} > 3 and $o->{stripcomments} = 3;
1196     $o->{stripcomments} && !$o->{sanitize} and $o->{sanitize} = 1;
1197
1198     # this is gross, but having the globals avoids unnecessary slowdown
1199     if ($o->{sanitize} && $o->{xmlcheck}) {
1200         $g_start_p = "<\20>";
1201         $g_close_p = "</\20>";
1202     } else {
1203         $g_start_p = "<p>";
1204         $g_close_p = "</p>";
1205     }
1206
1207     defined($o->{empty_element_suffix}) &&
1208     ($o->{empty_element_suffix} eq " />" || $o->{empty_element_suffix} eq ">")
1209         or $o->{empty_element_suffix} = " />";
1210
1211     $o->{tab_width} = 8 unless looks_like_number($o->{tab_width}) &&
1212         1 <= $o->{tab_width} && $o->{tab_width} <= 32;
1213     $o->{tab_width} = int($o->{tab_width});
1214
1215     $o->{indent_width} = 4 unless looks_like_number($o->{indent_width}) &&
1216         1 <= $o->{indent_width} && $o->{indent_width} <= 32;
1217     $o->{indent_width} = int($o->{indent_width});
1218
1219     defined($o->{auto_number}) or $o->{auto_number} = '';
1220     $o->{auto_number} eq '' || looks_like_number($o->{auto_number})
1221         or $o->{auto_number} = 6;
1222     if ($o->{auto_number} ne '') {
1223         $o->{auto_number} = int(0+$o->{auto_number});
1224         $o->{auto_number} >= 0 or $o->{auto_number} = 0;
1225         $o->{auto_number} <= 6 or $o->{auto_number} = 6;
1226     }
1227
1228     defined($o->{style_prefix}) or $o->{style_prefix} = $g_style_prefix;
1229
1230     $o->{abs_prefix} = _MakePrefixCODERef($o->{abs_prefix}, 1)
1231         unless ref($o->{abs_prefix}) eq 'CODE';
1232     $o->{url_prefix} = _MakePrefixCODERef($o->{url_prefix}, 0)
1233         unless ref($o->{url_prefix}) eq 'CODE';
1234     $o->{img_prefix} = _MakePrefixCODERef($o->{img_prefix}, 0)
1235         unless ref($o->{img_prefix}) eq 'CODE';
1236     $o->{base_prefix} = _MakePrefixCODERef($o->{base_prefix}, -1)
1237         unless ref($o->{base_prefix}) eq 'CODE';
1238
1239     ref($o->{wikiopt}) eq "HASH" or $o->{wikiopt} = {};
1240
1241     # Note that because Markdown makes a copy of the options
1242     # before calling this function, this does not actually remove
1243     # any "h1" key that might have been set by the caller of
1244     # the Markdown function.  However, by deleting it here,
1245     # this guarantees that any found value will actually be
1246     # picked up and stored (which will not happen if the key
1247     # already exists).
1248     delete $o->{h1};
1249
1250     # Default is to silently strip any known YAML front matter
1251     # Same comment about "yaml" key as above for "h1" key
1252     $o->{yamlmode} = 1 unless looks_like_number($o->{yamlmode});
1253     $o->{yamlvis} = -1 unless looks_like_number($o->{yamlvis});
1254     delete $o->{yaml};
1255
1256     # The anchors hash will only be returned if the key exists
1257     # (the key's value doesn't matter), set the value to an empty
1258     # HASH ref just in case to make sure it's always a HASH ref.
1259     $o->{anchors} = {} if exists($o->{anchors});
1260 }
1261
1262 my %_yamlopts;
1263 BEGIN {%_yamlopts = map({$_ => 1} qw(
1264         display_metadata
1265         header_enum
1266         title
1267 ))}
1268
1269
1270 sub _HasUnknownYAMLOptions {
1271     do { return 1 unless exists($_yamlopts{$_}) } foreach keys(%{$_[0]});
1272     return 0;
1273 }
1274
1275
1276 sub _ApplyYAMLOpts {
1277     my ($yaml, $opt) = @_;
1278     if (defined($yaml->{display_metadata}) && $opt->{yamlvis} < 0) {
1279         # ignore display_metadata except in --yaml=enable mode
1280         $opt->{yamlvis} = _YAMLTrueValue($yaml->{display_metadata}) ? 1 : 0;
1281     }
1282     $opt->{h1} = $yaml->{title} if defined($yaml->{title});
1283     if (defined($yaml->{header_enum}) && $opt->{auto_number} eq '') {
1284         $opt->{auto_number} = _YAMLTrueValue($yaml->{header_enum}) ? 6 : 0;
1285     }
1286 }
1287
1288
1289 sub _YAMLTrueValue {
1290     my $v = shift;
1291     defined($v) or $v = "";
1292     $v = lc($v);
1293     return !($v eq "" || $v eq "0" || $v eq "false" || $v eq "disable" || $v eq "off" || $v eq "no");
1294 }
1295
1296
1297 # Actually returns an empty string rather than a CODE ref
1298 # if an empty prefix is passed in.  Trailing "/"s are trimmed
1299 # off if the second argument is positive or the string does NOT
1300 # consist of only "/"s.  A trailing "/" is added unless the
1301 # trimmed prefix already has one or the second argument is true.
1302 # If the second argument is negative, the prefix is used as-is.
1303 sub _MakePrefixCODERef {
1304     my ($prefix, $mtok) = @_;
1305     defined($prefix) or $prefix = "";
1306     looks_like_number($mtok) or $mtok = $mtok ? 1 : 0;
1307     if ($mtok > 0) {
1308         $prefix =~ s,/+$,,;
1309     } elsif (!$mtok) {
1310         $prefix =~ s,//+$,/,;
1311     }
1312     $prefix ne "" or return "";
1313     $prefix .= '/' if !$mtok && substr($prefix, -1, 1) ne '/';
1314     return sub { $prefix . $_[0] };
1315 }
1316
1317
1318 sub Markdown {
1319 #
1320 # Primary function. The order in which other subs are called here is
1321 # essential. Link and image substitutions need to happen before
1322 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
1323 # and <img> tags get encoded.
1324 #
1325     my $text = shift;
1326
1327     # Any remaining arguments after the first are options; either a single
1328     # hashref or a list of name, value pairs.  See _SanitizeOpts comments.
1329     %opt = (
1330         # set initial defaults
1331         style_prefix            => $g_style_prefix,
1332         empty_element_suffix    => $g_empty_element_suffix,
1333         tab_width               => _tabDefault,
1334         indent_width            => $g_indent_width,
1335         abs_prefix              => "", # Prefixed to absolute path URLs
1336         url_prefix              => "", # Prefixed to non-absolute URLs
1337         img_prefix              => "", # Prefixed to non-absolute image URLs
1338         base_prefix             => "", # Prefixed to fragment-only URLs
1339     );
1340     @autonum = ();
1341     my %args = ();
1342     if (ref($_[0]) eq "HASH") {
1343         %args = %{$_[0]};
1344     } else {
1345         %args = @_;
1346     }
1347     while (my ($k,$v) = each %args) {
1348         $opt{$k} = $v;
1349     }
1350     _SanitizeOpts(\%opt);
1351
1352     my $yaml;
1353     ($text, $yaml) = _PrepareInput($text, $opt{yamlmode});
1354     _ApplyYAMLOpts($yaml, \%opt) if ref($yaml) eq "HASH" && $opt{yamlmode} > 0;
1355     my $yamltable = "";
1356     if (ref($yaml) eq "HASH" && %$yaml && $opt{yamlmode} && $opt{yamlvis}) {
1357         if ($opt{yamlvis} > 0 || _HasUnknownYAMLOptions($yaml)) {
1358             my ($hrows, $drows) = ("", "");
1359             foreach (sort(keys(%$yaml))) {
1360                 my $v = $yaml->{$_};
1361                 my $rspn = '';
1362                 if (defined($v)) {
1363                     $v =~ s/&/&amp;/g;
1364                     $v =~ s/</&lt;/g;
1365                     utf8::encode($v);
1366                     $drows .= "<td>" . $v . "</td>\n";
1367                 } else {
1368                     $rspn = " class=\"$opt{style_prefix}yaml-undef-value\" rowspan=\"2\" valign=\"top\"";
1369                 }
1370                 $hrows .= "<th$rspn>" . $_ . "</th>\n";
1371             }
1372             $yamltable = "<table class=\"$opt{style_prefix}yaml-table\" border=\"1\">\n" .
1373                 "<tr>\n$hrows</tr>\n<tr>\n$drows</tr>\n</table>\n";
1374             $opt{firstline} = scalar(@{[$yamltable =~ /\n/g]});
1375         }
1376     }
1377
1378     # Clear the globals. If we don't clear these, you get conflicts
1379     # from other articles when generating a page which contains more than
1380     # one article (e.g. an index page that shows the N most recent
1381     # articles):
1382     %g_urls = ();
1383     %g_titles = ();
1384     %g_anchors = ();
1385     %g_anchors_id = ();
1386     %g_block_ids = ();
1387     %g_code_block_ids = ();
1388     %g_html_blocks = ();
1389     %g_code_blocks = ();
1390     @g_xml_comments = ();
1391     $g_list_level = 0;
1392
1393     # Make sure $text ends with a couple of newlines:
1394     $text .= "\n\n";
1395
1396     # Handle backticks-delimited code blocks
1397     $text = _HashBTCodeBlocks($text);
1398
1399     # Convert all tabs to spaces.
1400     $text = _DeTab($text);
1401
1402     # Strip any lines consisting only of spaces.
1403     # This makes subsequent regexen easier to write, because we can
1404     # match consecutive blank lines with /\n+/ instead of something
1405     # contorted like / *\n+/ .
1406     $text =~ s/^ +$//mg;
1407
1408     # Turn block-level HTML blocks into hash entries
1409     $text = _HashHTMLBlocks($text, 1);
1410
1411     # Strip link definitions, store in hashes.
1412     $text = _StripLinkDefinitions($text);
1413
1414     $text = _RunBlockGamut($text, 1);
1415
1416     # Remove indentation markers
1417     $text =~ s/\027+//gs;
1418
1419     # Expand auto number flags
1420     $text =~ s/\034([1-6])/_AutoHeaderNum(ord($1)&0x7)/gse
1421         if $opt{auto_number} ne '' && $opt{auto_number} > 0;
1422
1423     # Unhashify code blocks
1424     $text =~ s/(\025\d+\026)/$g_code_blocks{$1}/g;
1425
1426     $text = _UnescapeSpecialChars($text);
1427
1428     $text .= "\n" unless $text eq "";
1429
1430     # Sanitize all '<'...'>' tags if requested
1431     $text = _SanitizeTags($text, $opt{xmlcheck}, 1) if $opt{sanitize};
1432
1433     # Eliminate known named character entities
1434     $opt{keep_named_character_entities} or do {
1435         $yamltable = ConvertNamedCharacterEntities($yamltable);
1436         $text = ConvertNamedCharacterEntities($text);
1437     };
1438
1439     # Convert to US-ASCII only if requested
1440     $opt{us_ascii} and do {
1441         utf8::decode($yamltable);
1442         $yamltable = ConvertToASCII($yamltable);
1443         utf8::encode($yamltable);
1444         $text = ConvertToASCII($text);
1445     };
1446
1447     utf8::encode($text);
1448     if (ref($_[0]) eq "HASH") {
1449         ${$_[0]}{anchors} = {%g_anchors_id} if exists(${$_[0]}{anchors});
1450         if (defined($opt{h1}) && $opt{h1}) {
1451             utf8::encode($opt{h1});
1452             ${$_[0]}{h1} = $opt{h1};
1453         }
1454         ${$_[0]}{yaml} = $yaml if ref($yaml) eq "HASH";
1455     }
1456
1457     if ($opt{divwrap}) {
1458         my $id = $opt{divname};
1459         defined($id) or $id = "";
1460         $id eq "" or $id = ' id="'.escapeXML($id).'"';
1461         chomp($text);
1462         return "<div$id>\n".$yamltable.$text."\n</div>\n";
1463     }
1464     return $yamltable.$text;
1465 }
1466
1467
1468 sub _HashBTCodeBlocks {
1469 #
1470 #   Process Markdown backticks (```) delimited code blocks
1471 #   Process some (limited recognition) tilde (~~~) delimited code blocks
1472 #
1473     my $text = shift;
1474     my $less_than_indent = $opt{indent_width} - 1;
1475
1476     $text =~ s{
1477             (?:(?<=\n)|\A)
1478                 ([ ]{0,$less_than_indent})``(`+)[ \t]*(?:([\w.+-]+[#]?)(?:[ \t][ \t\w.+-]*)?)?\n
1479              ( # $4 = the code block -- one or more lines, starting with ```
1480               (?:
1481                 .*\n
1482               )+?
1483              )
1484             # and ending with ``` or end of document
1485             (?:(?:[ ]{0,$less_than_indent}``\2`*[ \t]*(?:\n|\Z))|\Z)
1486         }{
1487             # $2 contains syntax highlighting to use if defined
1488             my $leadsp = length($1);
1489             my $codeblock = $4;
1490             $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
1491             $codeblock = _DeTab($codeblock, 8, $leadsp); # physical tab stops are always 8
1492             $codeblock =~ s/\A\n+//; # trim leading newlines
1493             $codeblock =~ s/\s+\z//; # trim trailing whitespace
1494             $codeblock = _EncodeCode($codeblock); # or run highlighter here
1495             $codeblock = "<div class=\"$opt{style_prefix}code-bt\"><pre style=\"display:none\"></pre><pre><code>"
1496                 . $codeblock . "\n</code></pre></div>";
1497
1498             my $key = block_id($codeblock, 2);
1499             $g_code_blocks{$key} = $codeblock;
1500             "\n\n" . $key . "\n\n";
1501         }egmx;
1502
1503     $text =~ s{
1504             (?:(?<=\n)|\A)
1505                 ([ ]{0,$less_than_indent})~~(~)[ \t]*(?:([\w.+-]+[#]?)(?:[ \t][ \t\w.+-]*)?)?\n
1506              ( # $4 = the code block -- one or more lines, starting with ~~~
1507               (?:
1508                 .*\n
1509               )+?
1510              )
1511             # and ending with ~~~ or end of document
1512             (?:(?:[ ]{0,$less_than_indent}~~\2~*[ \t]*(?:\n|\Z))|\Z)
1513         }{
1514             # $2 contains syntax highlighting to use if defined
1515             my $leadsp = length($1);
1516             my $codeblock = $4;
1517             $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
1518             $codeblock = _DeTab($codeblock, 8, $leadsp); # physical tab stops are always 8
1519             $codeblock =~ s/\A\n+//; # trim leading newlines
1520             $codeblock =~ s/\s+\z//; # trim trailing whitespace
1521             $codeblock = _EncodeCode($codeblock); # or run highlighter here
1522             $codeblock = "<div class=\"$opt{style_prefix}code-bt\"><pre style=\"display:none\"></pre><pre><code>"
1523                 . $codeblock . "\n</code></pre></div>";
1524
1525             my $key = block_id($codeblock);
1526             $g_html_blocks{$key} = $codeblock;
1527             "\n\n" . $key . "\n\n";
1528         }egmx;
1529
1530     return $text;
1531 }
1532
1533
1534 sub _StripLinkDefinitions {
1535 #
1536 # Strips link definitions from text, stores the URLs and titles in
1537 # hash references.
1538 #
1539     my $text = shift;
1540     my $less_than_indent = $opt{indent_width} - 1;
1541
1542     # Link defs are in the form: ^[id]: url "optional title"
1543     while ($text =~ s{
1544                         ^[ ]{0,$less_than_indent}\[(.+)\]: # id = $1
1545                           [ ]*
1546                           \n?               # maybe *one* newline
1547                           [ ]*
1548                         <?((?:\S(?:\\\n\s*[^\s"(])?)+?)>? # url = $2
1549                           [ ]*
1550                           \n?               # maybe one newline
1551                           [ ]*
1552                         (?:
1553                             (?<=\s)         # lookbehind for whitespace
1554                             (?:(['"])|(\()) # title quote char
1555                             (.+?)           # title = $5
1556                             (?(4)\)|\3)     # match same quote
1557                             [ ]*
1558                         )?  # title is optional
1559                         (?:\n+|\Z)
1560                     }
1561                     {}mx) {
1562         my $id = _strip(lc $1); # Link IDs are case-insensitive
1563         my $url = $2;
1564         my $title = _strip($5);
1565         $url =~ s/\\\n\s*//gs;
1566         if ($id ne "") {
1567                 # These values always get passed through _MakeATag or _MakeIMGTag later
1568                 $g_urls{$id} = $url;
1569                 if (defined($title) && $title ne "") {
1570                     $g_titles{$id} = $title;
1571                 }
1572         }
1573     }
1574
1575     return $text;
1576 }
1577
1578 my %ok_tag_name; # initialized later
1579 my ($block_tags_a, $block_tags_b, $block_tags_c);
1580 BEGIN {
1581     $block_tags_a = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/io;
1582     $block_tags_b = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1583     $block_tags_c = qr/div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1584 }
1585
1586 sub _HashHTMLBlocks {
1587     my ($text, $toplevel) = @_;
1588     my $less_than_indent = $opt{indent_width} - 1;
1589     my $idt = "\027" x $g_list_level;
1590     my $blkprc = $toplevel ?
1591         sub { return $ok_tag_name{$_[1]} ? _EncodeAmpsAndAngles($_[0]) : $_[0] } :
1592         sub { return $_[0] };
1593
1594     # Hashify HTML blocks:
1595     # We only want to do this for block-level HTML tags, such as headers,
1596     # lists, and tables. That's because we still want to wrap <p>s around
1597     # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
1598     # phrase emphasis, and spans. The list of tags we're looking for is
1599     # hard-coded:
1600
1601     # First, look for nested blocks, e.g.:
1602     #   <div>
1603     #       <div>
1604     #       tags for inner block must be indented.
1605     #       </div>
1606     #   </div>
1607     #
1608     # The outermost tags must start at the left margin for this to match, and
1609     # the inner nested divs must be indented.
1610     # We need to do this before the next, more liberal match, because the next
1611     # match will start at the first `<div>` and stop at the first `</div>`.
1612     $text =~ s{
1613                 (                       # save in $1
1614                     ^                   # start of line (with /m)
1615                     ((?:\Q$idt\E)?)     # optional lead in = $2
1616                     <($block_tags_a)    # start tag = $3
1617                     \b                  # word break
1618                     (?:.*\n)*?          # any number of lines, minimally matching
1619                     \2</\3\s*>          # the matching end tag
1620                     [ ]*                # trailing spaces
1621                     (?=\n+|\Z) # followed by a newline or end of document
1622                 )
1623             }{
1624                 my $blk = &$blkprc($1, $3);
1625                 my $key = block_id($blk);
1626                 $g_html_blocks{$key} = $blk;
1627                 "\n\n" . $key . "\n\n";
1628             }eigmx;
1629
1630
1631     #
1632     # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
1633     #
1634     $text =~ s{
1635                 (                       # save in $1
1636                     ^                   # start of line (with /m)
1637                     (?:\Q$idt\E)?       # optional lead in
1638                     <($block_tags_b)    # start tag = $2
1639                     \b                  # word break
1640                     (?:.*\n)*?          # any number of lines, minimally matching
1641                     .*</\2\s*>          # the matching end tag
1642                     [ ]*                # trailing spaces
1643                     (?=\n+|\Z) # followed by a newline or end of document
1644                 )
1645             }{
1646                 my $blk = &$blkprc($1, $2);
1647                 my $key = block_id($blk);
1648                 $g_html_blocks{$key} = $blk;
1649                 "\n\n" . $key . "\n\n";
1650             }eigmx;
1651
1652     #
1653     # Now match any empty block tags that should have been paired
1654     #
1655     $text =~ s{
1656                 (                       # save in $1
1657                     ^                   # start of line (with /m)
1658                     (?:\Q$idt\E)?       # optional lead in
1659                     <($block_tags_c)    # start tag = $2
1660                     \b                  # word break
1661                     (?:[^<>])*?         #
1662                     /?>                 # the matching end tag
1663                     [ ]*                # trailing spaces
1664                     (?=\n+|\Z) # followed by a newline or end of document
1665                 )
1666             }{
1667                 my $key = block_id($1);
1668                 $g_html_blocks{$key} = $1;
1669                 "\n\n" . $key . "\n\n";
1670             }eigmx;
1671
1672     # Special case just for <hr />. It was easier to make a special case than
1673     # to make the other regex more complicated.
1674     $text =~ s{
1675                 (?:
1676                     (?<=\n)         # Starting after end of line
1677                     |               # or
1678                     \A              # the beginning of the doc
1679                 )
1680                 (                       # save in $1
1681                     [ ]{0,$less_than_indent}
1682                     <(?:hr)             # start tag
1683                     \b                  # word break
1684                     (?:[^<>])*?         #
1685                     /?>                 # the matching end tag
1686                     [ ]*
1687                     (?=\n{1,}|\Z)       # followed by end of line or end of document
1688                 )
1689             }{
1690                 my $key = block_id($1);
1691                 $g_html_blocks{$key} = $1;
1692                 "\n\n" . $key . "\n\n";
1693             }eigx;
1694
1695     # Special case for standalone XML comments:
1696     $opt{stripcomments} != 2 &&
1697     $text =~ s{
1698                 (?:
1699                     (?<=\n\n)       # Starting after a blank line
1700                     |               # or
1701                     \A\n?           # the beginning of the doc
1702                 )
1703                 (                   # save in $1
1704                     [ ]{0,$less_than_indent}
1705                     (?s:
1706                         <!--
1707                         (?:[^-]|(?:-(?!-)))*
1708                         -->
1709                         (?:
1710                             (?:[ \t]*\n[ \t]*)?
1711                             <!--
1712                             (?:[^-]|(?:-(?!-)))*
1713                             -->
1714                         )*
1715                     )
1716                     [ ]*
1717                     (?=\n{1,}|\Z)   # followed by end of line or end of document
1718                 )
1719             }{
1720                 my $key = block_id($1);
1721                 push(@g_xml_comments, $key)
1722                         if $opt{stripcomments} && $opt{stripcomments} < 3 &&
1723                            !exists($g_html_blocks{$key});
1724                 $g_html_blocks{$key} = $1;
1725                 "\n\n" . $key . "\n\n";
1726             }egx;
1727
1728     # Special case for standalone XML-like comments:
1729     $opt{stripcomments} >= 2 &&
1730     $text =~ s{
1731                 (?:
1732                     (?<=\n\n)       # Starting after a blank line
1733                     |               # or
1734                     \A\n?           # the beginning of the doc
1735                 )
1736                 (                   # save in $1
1737                     [ ]{0,$less_than_indent}
1738                     (?s:
1739                         <!--
1740                         (?:[^-]|(?:-(?!->)))*
1741                         -->
1742                         (?:
1743                             (?:[ \t]*\n[ \t]*)?
1744                             <!--
1745                             (?:[^-]|(?:-(?!->)))*
1746                             -->
1747                         )*
1748                     )
1749                     [ ]*
1750                     (?=\n{1,}|\Z)   # followed by end of line or end of document
1751                 )
1752             }{
1753                 my $key = block_id($1);
1754                 push(@g_xml_comments, $key) unless exists($g_html_blocks{$key});
1755                 $g_html_blocks{$key} = $1;
1756                 "\n\n" . $key . "\n\n";
1757             }egx;
1758
1759
1760     return $text;
1761 }
1762
1763
1764 sub _RunBlockGamut {
1765 #
1766 # These are all the transformations that form block-level
1767 # tags like paragraphs, headers, and list items.
1768 #
1769     my ($text, $anchors) = @_;
1770
1771     $text = _DoHeaders($text, $anchors);
1772
1773     # Do Horizontal Rules:
1774     $text =~ s{^ {0,3}\*(?: {0,2}\*){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1775     $text =~ s{^ {0,3}\_(?: {0,2}\_){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1776     $text =~ s{^ {0,3}\-(?: {0,2}\-){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1777
1778     $text = _DoListsAndBlocks($text);
1779
1780     $text = _DoTables($text);
1781
1782     # We already ran _HashHTMLBlocks() before, in Markdown(), but that
1783     # was to escape raw HTML in the original Markdown source. This time,
1784     # we're escaping the markup we've just created, so that we don't wrap
1785     # <p> tags around block-level tags.
1786     $text = _HashHTMLBlocks($text);
1787
1788     $text = _FormParagraphs($text, $anchors);
1789
1790     return $text;
1791 }
1792
1793
1794 sub _DoBTListBlocks {
1795     return _DoBlockQuotes(_DoCodeBlocks(_HashBTCodeBlocks($_[0]))) if $_[0] ne "";
1796 }
1797
1798
1799 sub _DoListBlocks {
1800     return _DoBlockQuotes(_DoCodeBlocks($_[0])) if $_[0] ne "";
1801 }
1802
1803
1804 sub _RunSpanGamut {
1805 #
1806 # These are all the transformations that occur *within* block-level
1807 # tags like paragraphs, headers, and list items.
1808 #
1809     my $text = shift;
1810
1811     $text = _DoCodeSpans($text);
1812
1813     $text = _EscapeSpecialChars($text);
1814
1815     # Process anchor and image tags. Images must come first,
1816     # because ![foo][f] looks like an anchor.
1817     $text = _DoImages($text);
1818     $text = _DoAnchors($text);
1819
1820     # Make links out of things like `<http://example.com/>`
1821     # Must come after _DoAnchors(), because you can use < and >
1822     # delimiters in inline links like [this](<url>).
1823     $text = _DoAutoLinks($text);
1824
1825     $text = _EncodeAmpsAndAngles($text);
1826
1827     $text = _DoItalicsAndBoldAndStrike($text);
1828
1829     # Do hard breaks:
1830     $text =~ s/ {3,}(\n|\z)/<br clear=\"all\"$opt{empty_element_suffix}$1/g;
1831     $text =~ s/ {2,}\n/<br$opt{empty_element_suffix}\n/g;
1832     $text =~ s/ ?\\\n/<br$opt{empty_element_suffix}\n/g;
1833     $text =~ s/ {2,}\z//g;
1834
1835     return $text;
1836 }
1837
1838
1839 sub _EscapeSpecialChars {
1840     my $text = shift;
1841     my $tokens ||= _TokenizeHTML($text);
1842
1843     $text = ''; # rebuild $text from the tokens
1844 #   my $in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.
1845 #   my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
1846
1847     foreach my $cur_token (@$tokens) {
1848         if ($cur_token->[0] eq "tag") {
1849             # Within tags, encode *, _ and ~ so they don't conflict
1850             # with their use in Markdown for italics and strong.
1851             # We're replacing each such character with its
1852             # corresponding block id value; this is likely
1853             # overkill, but it should prevent us from colliding
1854             # with the escape values by accident.
1855             $cur_token->[1] =~ s!([*_~])!$g_escape_table{$1}!go;
1856             $text .= $cur_token->[1];
1857         } else {
1858             my $t = $cur_token->[1];
1859             $t = _EncodeBackslashEscapes($t);
1860             $text .= $t;
1861         }
1862     }
1863     return $text;
1864 }
1865
1866
1867 sub _ProcessWikiLink {
1868     my ($link_text, $link_loc) = @_;
1869     if (defined($link_loc) &&
1870         ($link_loc =~ m{^#\S*$} || $link_loc =~ m{^(?:http|ftp)s?://\S+$}i)) {
1871         # Return the new link
1872         return _MakeATag(_FindFragmentMatch($link_loc), $link_text);
1873     }
1874     if (!defined($link_loc)) {
1875         $link_loc = _RunSpanGamut($link_text);
1876         $link_loc = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($link_loc))));
1877         $link_loc =~ m{^(?:http|ftp)s?://\S+$}i and
1878             # Return the new link
1879             return _MakeATag($link_loc, $link_text);
1880     }
1881     return undef if $link_loc eq "" || $link_text eq "";
1882     if ($link_loc =~ /^[A-Za-z][A-Za-z0-9+.-]*:/os) {
1883         # Unrecognized scheme
1884         return undef;
1885     }
1886     if ($opt{wikipat}) {
1887         my $o = $opt{wikiopt};
1888         my $img_link = _strip($link_text);
1889         my $img = 0;
1890         my $qsfrag = "";
1891         my $base;
1892         my $imgopts = undef;
1893         if ($img_link =~ /^[^#?\s]+\.(?:png|gif|jpe?g|svgz?)$/i) {
1894             $base = _wxform($img_link, 1);
1895             $img = 1;
1896             $imgopts = _ParseWikiImgOpts($link_loc);
1897             $imgopts->{imgflag} = \$img;
1898         } else {
1899             $base = $link_loc;
1900             if ($link_loc =~ /^(.*?)([?#].*)$/os) {
1901                 ($base, $qsfrag) = ($1, $2);
1902             }
1903             $base = _wxform($base);
1904         }
1905         my $result = $opt{wikipat};
1906         $result =~ s/%\{\}.+$/%{}/os if $img;
1907         $result =~ s/%\{\}/$base/;
1908         if ($qsfrag =~ /^([^#]*)(#.+)$/os) {
1909             my ($q,$f) = ($1,$2);
1910             #$f = _wxform($f) if $f =~ / /;
1911             $qsfrag = $q . $f;
1912         }
1913         $result .= $qsfrag;
1914         $result = &{$opt{wikifunc}}($result, \%opt, ($img?$img_link:$link_loc), $base, $qsfrag, $imgopts)
1915                 if ref($opt{wikifunc}) eq 'CODE';
1916         {
1917             use bytes;
1918             $result =~ s/%(?![0-9A-Fa-f]{2})/%25/sog;
1919             if ($o->{r}) {
1920                 $result =~
1921                 s/([\x00-\x1F <>"{}|\\^`x7F])/sprintf("%%%02X",ord($1))/soge;
1922             } else {
1923                 $result =~
1924                 s/([\x00-\x1F <>"{}|\\^`\x7F-\xFF])/sprintf("%%%02X",ord($1))/soge;
1925             }
1926             $result =~ s/(%(?![0-9A-F]{2})[0-9A-Fa-f]{2})/uc($1)/soge;
1927         }
1928         # Return the new link
1929         return $img ? _MakeIMGTag($result, undef, undef, $imgopts) : _MakeATag($result, $link_text);
1930     }
1931     # leave it alone
1932     return undef;
1933 }
1934
1935
1936 sub _ParseWikiImgOpts {
1937     my $alts = shift;
1938     my %o = ();
1939     # alt= consumes the rest of the line, do it first
1940     if ($alts =~ /(?:^|,)\s*alt\s*=\s*(.*)$/ios) {
1941         my $atext = $1;
1942         $alts = substr($alts, 0, $-[0]);
1943         $o{alt} = _strip($atext);
1944     }
1945     foreach my $kv (split(/\s*,\s*/, lc($alts))) {
1946         if ($kv =~ /^\s*([^\s]+)\s*=\s*([^\s]+)\s*$/os) {
1947             my ($k, $v) = ($1, $2);
1948             if (($k eq "width" || $k eq "height") && $v =~ /^\d+$/) {
1949                 $o{$k} = 0+$v if $v > 0;
1950                 next;
1951             }
1952             if ($k eq "align" && ($v eq "left" || $v eq "right" || $v eq "center")) {
1953                 $o{$k} = $v;
1954                 next;
1955             }
1956         }
1957     }
1958     return \%o;
1959 }
1960
1961
1962 sub _wxform {
1963     my ($w, $img) = @_;
1964     my $o = $opt{wikiopt};
1965     my $opt_s = $o->{s};
1966     if (!$img && $opt_s) {
1967         if (ref($opt_s)) {
1968             if ($w =~ m{^(.*)[.]([^./]*)$}) {
1969                 my ($base, $ext) = ($1, $2);
1970                 $w = $base if $opt_s->{lc($ext)};
1971             }
1972         } else {
1973             $w =~ s{[.][^./]*$}{};
1974         }
1975     }
1976     $w = uc($w) if $o->{u};
1977     $w = lc($w) if $o->{l};
1978     $w =~ s{/+}{%252F}gos if $o->{"%"};
1979     $w =~ s/ +/%20/gos if $o->{b};
1980     $w =~ tr{/}{ } if $o->{f};
1981     $w =~ s{/+}{/}gos if !$o->{f} && !$o->{v};
1982     if ($o->{d}) {
1983         $w =~ tr{ }{-};
1984         $w =~ s/-+/-/gos unless $o->{v};
1985     } else {
1986         $w =~ tr{ }{_};
1987         $w =~ s/_+/_/gos unless $o->{v};
1988     }
1989     return $w;
1990 }
1991
1992
1993 # Return a suitably encoded <a...> tag string
1994 # On input NONE of $url, $text or $title should be xmlencoded
1995 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1996 sub _MakeATag {
1997     my ($url, $text, $title) = @_;
1998     defined($url) or $url="";
1999     defined($text) or $text="";
2000     defined($title) or $title="";
2001
2002     $url =~ m"^#" && ref($opt{base_prefix}) eq 'CODE' and $url = &{$opt{base_prefix}}($url);
2003     my $result = $g_escape_table{'<'}."a href=\"" . _EncodeAttText($url) . "\"";
2004     $title = _strip($title);
2005     $text =~ s{<(/?a)}{&lt;$1}sogi;
2006     $text = _DoItalicsAndBoldAndStrike($text);
2007     # We've got to encode any of these remaining to avoid
2008     # conflicting with other italics, bold and strike through and links.
2009     $text =~ s!([]*_~[])!$g_escape_table{$1}!go;
2010     $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne "";
2011     return $result . $g_escape_table{'>'} .
2012         $text . $g_escape_table{'<'}."/a".$g_escape_table{'>'};
2013 }
2014
2015
2016 sub _DoAnchors {
2017 #
2018 # Turn Markdown link shortcuts into XHTML <a> tags.
2019 #
2020     my $text = shift;
2021
2022     #
2023     # First, handle wiki-style links: [[wiki style link]]
2024     #
2025     $text =~ s{
2026         (                   # wrap whole match in $1
2027           \[\[
2028             ($g_nested_brackets) # link text and id = $2
2029           \]\]
2030         )
2031     }{
2032         my $result;
2033         my $whole_match = $1;
2034         my $link_text   = $2;
2035         my $link_loc    = undef;
2036
2037         if ($link_text =~ /^(.*)\|(.*)$/s) {
2038             $link_text = $1;
2039             $link_loc = _strip($2);
2040         }
2041
2042         $result = _ProcessWikiLink($link_text, $link_loc);
2043         defined($result) or $result = $whole_match;
2044         $result;
2045     }xsge;
2046
2047     #
2048     # Next, handle reference-style links: [link text] [id]
2049     #
2050     $text =~ s{
2051         (                   # wrap whole match in $1
2052           \[
2053             ($g_nested_brackets) # link text = $2
2054           \]
2055
2056           [ ]?              # one optional space
2057           (?:\n[ ]*)?       # one optional newline followed by spaces
2058
2059           \[
2060             ($g_nested_brackets) # id = $3
2061           \]
2062         )
2063     }{
2064         my $result;
2065         my $whole_match = $1;
2066         my $link_text   = $2;
2067         my $link_id     = $3;
2068
2069         if ($link_id eq "") {
2070             # for shortcut links like [this][].
2071             $link_id = _RunSpanGamut($link_text);
2072             $link_id = unescapeXML(_StripTags(_UnescapeSpecialChars($link_id)));
2073         }
2074         $link_id = _strip(lc $link_id);
2075
2076         if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
2077             my $url = $g_urls{$link_id};
2078             defined($url) or $url = $g_anchors{$link_id};
2079             $url = _FindFragmentMatch($url);
2080             $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
2081             $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
2082         }
2083         else {
2084             $result = $whole_match;
2085         }
2086         $result;
2087     }xsge;
2088
2089     #
2090     # Subsequently, inline-style links: [link text](url "optional title")
2091     #
2092     $text =~ s{
2093         (               # wrap whole match in $1
2094           \[
2095             ($g_nested_brackets) # link text = $2
2096           \]
2097           \(            # literal paren
2098             ($g_nested_parens) # href and optional title = $3
2099           \)
2100         )
2101     }{
2102         #my $result;
2103         my $whole_match = $1;
2104         my $link_text   = $2;
2105         my ($url, $title) = _SplitUrlTitlePart($3);
2106
2107         if (defined($url)) {
2108             $url = _FindFragmentMatch($url);
2109             $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
2110             _MakeATag(_PrefixURL($url), $link_text, $title);
2111         } else {
2112             # The href/title part didn't match the pattern
2113             $whole_match;
2114         }
2115     }xsge;
2116
2117     #
2118     # Finally, handle reference-style implicit shortcut links: [link text]
2119     #
2120     $text =~ s{
2121         (                   # wrap whole match in $1
2122           \[
2123             ($g_nested_brackets) # link text = $2
2124           \]
2125         )
2126     }{
2127         my $result;
2128         my $whole_match = $1;
2129         my $link_text   = $2;
2130         my $link_id     = _RunSpanGamut($2);
2131         $link_id        = _strip(lc(unescapeXML(_StripTags(_UnescapeSpecialChars($link_id)))));
2132
2133         if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
2134             my $url = $g_urls{$link_id};
2135             defined($url) or $url = $g_anchors{$link_id};
2136             $url = _FindFragmentMatch($url);
2137             $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
2138             $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
2139         }
2140         else {
2141             $result = $whole_match;
2142         }
2143         $result;
2144     }xsge;
2145
2146     return $text;
2147 }
2148
2149
2150 sub _PeelWrapped {
2151     defined($_[0]) or return undef;
2152     if (substr($_[0],0,1) eq "(") {
2153         return substr($_[0], 1, length($_[0]) - (substr($_[0], -1, 1) eq ")" ? 2 : 1));
2154     }
2155     return $_[0];
2156 }
2157
2158
2159 sub _SplitUrlTitlePart {
2160     return ("", undef) if $_[0] =~ m{^\s*$}; # explicitly allowed
2161     my $u = $_[0];
2162     $u =~ s/^\s*(['\042])/# $1/;
2163     if ($u =~ m{
2164         ^               # match beginning
2165         \s*?
2166         <?([^\s'\042]\S*?)>? # URL = $1
2167         (?:             # optional grouping
2168           \s+           # must be distinct from URL
2169           (['\042]?)    # quote char = $2
2170           (.*?)         # Title = $3
2171           \2?           # matching quote
2172         )?              # title is optional
2173         \s*
2174         \z              # match end
2175     }osx) {
2176         return (undef, undef) if $_[1] && ($1 eq "" || $1 eq "#");
2177         return (_PeelWrapped($1), $2 ? $3 : _PeelWrapped($3));
2178     } else {
2179         return (undef, undef);
2180     }
2181 }
2182
2183
2184 sub _FindFragmentMatchInternal {
2185     my ($anchors_id, $url, $undefifnomatch) = @_;
2186     if (defined($url) && $url =~ /^#\S/) {
2187         # try very hard to find a match
2188         my $idbase = _strip(lc(substr($url, 1)));
2189         my $idbase0 = $idbase;
2190         my $id = _MakeAnchorId($idbase);
2191         $undefifnomatch and $url = undef;
2192         if (defined($$anchors_id{$id})) {
2193             $url = $$anchors_id{$id};
2194         } else {
2195             $idbase =~ s/-/_/gs;
2196             $id = _MakeAnchorId($idbase);
2197             if (defined($$anchors_id{$id})) {
2198                 $url = $$anchors_id{$id};
2199             } else {
2200                 $id = _MakeAnchorId($idbase0, 1);
2201                 if (defined($$anchors_id{$id})) {
2202                     $url = $$anchors_id{$id};
2203                 } else {
2204                     $id = _MakeAnchorId($idbase, 1);
2205                     if (defined($$anchors_id{$id})) {
2206                         $url = $$anchors_id{$id};
2207                     }
2208                 }
2209             }
2210         }
2211     }
2212     return $url;
2213 }
2214
2215
2216 sub _FindFragmentMatch {
2217     return _FindFragmentMatchInternal(\%g_anchors_id, @_);
2218 }
2219
2220
2221 sub _ToUTF8 {
2222     my $input = shift;
2223     my $output;
2224     if (Encode::is_utf8($input) || utf8::decode($input)) {
2225         $output = $input;
2226     } else {
2227         $output = $encoder->decode($input, Encode::FB_DEFAULT);
2228     }
2229     return $output;
2230 }
2231
2232
2233 # $_[0] -> HASH ref of anchors (e.g. the "anchors" OUTPUT from Markdown)
2234 # $_[1] -> fragment to resolve, may optionally start with '#'
2235 # An empty string ("") or hash ("#") is returned as-is.
2236 # returns undef if no match otherwise resolved fragment name
2237 # which will start with a '#' if $_[1] started with '#' otherwise will not.
2238 # This function can be used to connect up links to "implicit" anchors.
2239 # All Markdown-format H1-H6 headers have an implicit anchor added
2240 # based on the header item text.  Passing that text to this function
2241 # will cough up the matching implicit anchor if there is one.
2242 sub ResolveFragment
2243 {
2244     my ($anchors, $frag) = @_;
2245     defined($frag) or return undef;
2246     $frag eq "" || $frag eq "#" and return $frag;
2247     my $hadhash = ($frag =~ s/^#//);
2248     $frag =~ /^\S/ or return undef;
2249     ref($anchors) eq 'HASH' or return undef;
2250     my $ans = _FindFragmentMatchInternal($anchors, '#'._ToUTF8($frag), 1);
2251     $hadhash || !defined($ans) or $ans =~ s/^#//;
2252     defined($ans) and utf8::encode($ans);
2253     return $ans;
2254 }
2255
2256
2257 # Return a suitably encoded <img...> tag string
2258 # On input NONE of $url, $alt or $title should be xmlencoded
2259 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
2260 sub _MakeIMGTag {
2261     my ($url, $alt, $title, $iopts) = @_;
2262     defined($url) or $url="";
2263     defined($alt) or $alt="";
2264     defined($title) or $title="";
2265     ref($iopts) eq "HASH" or $iopts = {};
2266     return "" unless $url ne "";
2267
2268     my ($w, $h, $lf, $rt) = (0, 0, '', '');
2269     ($alt, $title) = (_strip($alt), _strip($title));
2270     if ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
2271         ($title, $w, $h, $lf, $rt) = (_strip($1), $3, $4, $2, $5);
2272     } elsif ($title =~ /^(.*)\((<?)\?[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
2273         ($title, $h, $lf, $rt) = (_strip($1), $3, $2, $4);
2274     } elsif ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]\?(>?)\)$/os) {
2275         ($title, $w, $lf, $rt) = (_strip($1), $3, $2, $4);
2276     } elsif ($title =~ /^(.*)\((?!\))(<?)(>?)\)$/os) {
2277         ($title, $lf, $rt) = (_strip($1), $2, $3);
2278     }
2279     $iopts->{align} = "center" if $lf && $rt;
2280     $iopts->{align} = "left"   if $lf && !$rt;
2281     $iopts->{align} = "right"  if !$lf && $rt;
2282     $iopts->{width}  = $w if $w != 0;
2283     $iopts->{height} = $h if $h != 0;
2284     $iopts->{alt} = $alt if $alt ne "";
2285     $iopts->{title} = $title if $title ne "";
2286     my $iopt = sub { defined($iopts->{$_[0]}) ? $iopts->{$_[0]} : (@_ > 1 ? $_[1] : "") };
2287     my $result = '';
2288     $result .= $g_escape_table{'<'}."center".$g_escape_table{'>'}
2289         if &$iopt("align") eq "center";
2290     $result .= $g_escape_table{'<'}."img src=\"" . _EncodeAttText($url) . "\"";
2291     $result .= " align=\"left\"" if &$iopt("align") eq "left";
2292     $result .= " align=\"right\"" if &$iopt("align") eq "right";
2293     $result .= " alt=\"" . _EncodeAttText($iopts->{alt}) . "\"" if &$iopt("alt") ne "";
2294     $result .= " width=\"" . $iopts->{width} . "\"" if &$iopt("width",0) != 0;
2295     $result .= " height=\"" . $iopts->{height} . "\"" if &$iopt("height",0) != 0;
2296     $result .= " title=\"" . _EncodeAttText($iopts->{title}) . "\"" if &$iopt("title") ne "";
2297     $result .= " /" unless $opt{empty_element_suffix} eq ">";
2298     $result .= $g_escape_table{'>'};
2299     $result .= $g_escape_table{'<'}."/center".$g_escape_table{'>'}
2300         if &$iopt("align") eq "center";
2301     return $result;
2302 }
2303
2304
2305 sub _DoImages {
2306 #
2307 # Turn Markdown image shortcuts into <img> tags.
2308 #
2309     my $text = shift;
2310
2311     #
2312     # First, handle reference-style labeled images: ![alt text][id]
2313     #
2314     $text =~ s{
2315         (               # wrap whole match in $1
2316           !\[
2317             ($g_nested_brackets) # alt text = $2
2318           \]
2319
2320           [ ]?          # one optional space
2321           (?:\n[ ]*)?   # one optional newline followed by spaces
2322
2323           \[
2324             ($g_nested_brackets) # id = $3
2325           \]
2326
2327         )
2328     }{
2329         my $result;
2330         my $whole_match = $1;
2331         my $alt_text    = $2;
2332         my $link_id     = $3;
2333
2334         $link_id ne "" or $link_id = $alt_text; # for shortcut links like ![this][].
2335         $link_id = _strip(lc $link_id);
2336
2337         if (defined $g_urls{$link_id}) {
2338             $result = _MakeIMGTag(
2339                 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
2340         }
2341         else {
2342             # If there's no such link ID, leave intact:
2343             $result = $whole_match;
2344         }
2345
2346         $result;
2347     }xsge;
2348
2349     #
2350     # Next, handle inline images:  ![alt text](url "optional title")
2351     # Don't forget: encode * and _
2352
2353     $text =~ s{
2354         (               # wrap whole match in $1
2355           !\[
2356             ($g_nested_brackets) # alt text = $2
2357           \]
2358           \(            # literal paren
2359             ($g_nested_parens) # src and optional title = $3
2360           \)
2361         )
2362     }{
2363         my $whole_match = $1;
2364         my $alt_text    = $2;
2365         my ($url, $title) = _SplitUrlTitlePart($3, 1);
2366         defined($url) ?  _MakeIMGTag(_PrefixURL($url), $alt_text, $title) : $whole_match;
2367     }xsge;
2368
2369     #
2370     # Finally, handle reference-style implicitly labeled links: ![alt text]
2371     #
2372     $text =~ s{
2373         (               # wrap whole match in $1
2374           !\[
2375             ($g_nested_brackets) # alt text = $2
2376           \]
2377         )
2378     }{
2379         my $result;
2380         my $whole_match = $1;
2381         my $alt_text    = $2;
2382         my $link_id     = lc(_strip($alt_text));
2383
2384         if (defined $g_urls{$link_id}) {
2385             $result = _MakeIMGTag(
2386                 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
2387         }
2388         else {
2389             # If there's no such link ID, leave intact:
2390             $result = $whole_match;
2391         }
2392
2393         $result;
2394     }xsge;
2395
2396     return $text;
2397 }
2398
2399 sub _EncodeAttText {
2400     my $text = shift;
2401     defined($text) or return undef;
2402     $text = escapeXML(_strip($text));
2403     # We've got to encode these to avoid conflicting
2404     # with italics, bold and strike through.
2405     $text =~ s!([*_~:])!$g_escape_table{$1}!go;
2406     return $text;
2407 }
2408
2409
2410 sub _MakeAnchorId {
2411     use bytes;
2412     my ($link, $strip) = @_;
2413     $link = lc($link);
2414     if ($strip) {
2415         $link =~ s/\s+/_/gs;
2416         $link =~ tr/-a-z0-9_//cd;
2417     } else {
2418         $link =~ tr/-a-z0-9_/_/cs;
2419     }
2420     return '' unless $link ne '';
2421     $link = "_".$link."_";
2422     $link =~ s/__+/_/gs;
2423     $link = "_".md5_hex($link)."_" if length($link) > 66;
2424     return $link;
2425 }
2426
2427
2428 sub _GetNewAnchorId {
2429     my $link = _strip(lc(shift));
2430     return '' if $link eq "" || defined($g_anchors{$link});
2431     my $id = _MakeAnchorId($link);
2432     return '' unless $id;
2433     $g_anchors{$link} = '#'.$id;
2434     $g_anchors_id{$id} = $g_anchors{$link};
2435     if ($id =~ /-/) {
2436         my $id2 = $id;
2437         $id2 =~ s/-/_/gs;
2438         $id2 =~ s/__+/_/gs;
2439         defined($g_anchors_id{$id2}) or $g_anchors_id{$id2} = $g_anchors{$link};
2440     }
2441     my $idd = _MakeAnchorId($link, 1);
2442     if ($idd) {
2443         defined($g_anchors_id{$idd}) or $g_anchors_id{$idd} = $g_anchors{$link};
2444         if ($idd =~ /-/) {
2445             my $idd2 = $idd;
2446             $idd2 =~ s/-/_/gs;
2447             $idd2 =~ s/__+/_/gs;
2448             defined($g_anchors_id{$idd2}) or $g_anchors_id{$idd2} = $g_anchors{$link};
2449         }
2450     }
2451     $id;
2452 }
2453
2454
2455 sub _DoHeaders {
2456     my ($text, $anchors) = @_;
2457     my $h1;
2458     my $geth1 = $anchors && !defined($opt{h1}) ? sub {
2459         return unless !defined($h1);
2460         my $h = shift;
2461         $h1 = $h if $h ne "";
2462     } : sub {};
2463
2464     # atx-style headers:
2465     #   # Header 1
2466     #   ## Header 2
2467     #   ## Header 2 with closing hashes ##
2468     #   ...
2469     #   ###### Header 6
2470     #
2471     $text =~ s{
2472             ^(\#{1,6})  # $1 = string of #'s
2473             [ ]*
2474             ((?:(?:(?<![#])[^\s]|[^#\s]).*?)?) # $2 = Header text
2475             [ ]*
2476             \n+
2477         }{
2478             my $h_level = length($1);
2479             my $h = $2;
2480             $h =~ s/#+$//;
2481             $h =~ s/\s+$//;
2482             my $rsg = _RunSpanGamut($h);
2483             $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2484             my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2485             $id = " id=\"$id\"" if $id ne "";
2486             &$geth1($h) if $h_level == 1;
2487             "<h$h_level$id>" . _AutoHeaderFlag($h_level) . $rsg . "</h$h_level>\n\n";
2488         }egmx;
2489
2490     # Setext-style headers:
2491     #     Header 1
2492     #     ========
2493     #
2494     #     Header 2
2495     #     --------
2496     #
2497     #     Header 3
2498     #     ~~~~~~~~
2499     #
2500     $text =~ s{ ^(?:=+[ ]*\n)?[ ]*(.+?)[ ]*\n=+[ ]*\n+ }{
2501         my $h = $1;
2502         my $rsg = _RunSpanGamut($h);
2503         $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2504         my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2505         $id = " id=\"$id\"" if $id ne "";
2506         &$geth1($h);
2507         "<h1$id>" . _AutoHeaderFlag(1) . $rsg . "</h1>\n\n";
2508     }egmx;
2509
2510     $text =~ s{ ^(?:-+[ ]*\n)?[ ]*(.+?)[ ]*\n-+[ ]*\n+ }{
2511         my $h = $1;
2512         my $rsg = _RunSpanGamut($h);
2513         $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2514         my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2515         $id = " id=\"$id\"" if $id ne "";
2516         "<h2$id>" . _AutoHeaderFlag(2) . $rsg . "</h2>\n\n";
2517     }egmx;
2518
2519     $text =~ s{ ^(?:~+[ ]*\n)?[ ]*(.+?)[ ]*\n~+[ ]*\n+ }{
2520         my $h = $1;
2521         my $rsg = _RunSpanGamut($h);
2522         $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2523         my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2524         $id = " id=\"$id\"" if $id ne "";
2525         "<h3$id>" . _AutoHeaderFlag(3) . $rsg . "</h3>\n\n";
2526     }egmx;
2527
2528     $opt{h1} = $h1 if defined($h1) && $h1 ne "";
2529     return $text;
2530 }
2531
2532
2533 sub _AutoHeaderFlag {
2534     my $level = shift;
2535     my $auto = $opt{auto_number} || 0;
2536     return '' unless 1 <= $level && $level <= $auto;
2537     return "\34".chr(0x30+$level);
2538 }
2539
2540
2541 sub _AutoHeaderNum {
2542     my $level = shift;
2543     my $auto = $opt{auto_number} || 0;
2544     return '' unless 1 <= $level && $level <= $auto;
2545     pop(@autonum) while @autonum > $level;
2546     push(@autonum, 1) while @autonum < $level - 1;
2547     $autonum[$level - 1] += 1;
2548     return join('.', @autonum).' ';
2549 }
2550
2551
2552 my ($marker_ul, $marker_ol, $marker_any, $roman_numeral, $greek_lower);
2553 BEGIN {
2554     # Re-usable patterns to match list item bullets and number markers:
2555     $roman_numeral = qr/(?:
2556         [IiVvXx]|[Ii]{2,3}|[Ii][VvXx]|[VvXx][Ii]{1,3}|[Xx][Vv][Ii]{0,3}|
2557         [Xx][Ii][VvXx]|[Xx]{2}[Ii]{0,3}|[Xx]{2}[Ii]?[Vv]|[Xx]{2}[Vv][Ii]{1,2})/ox;
2558     $greek_lower = qr/(?:[\x{03b1}-\x{03c9}])/o;
2559     $marker_ul  = qr/[*+-]/o;
2560     $marker_ol  = qr/(?:\d+|[A-Za-z]|$roman_numeral|$greek_lower)[.\)]/o;
2561     $marker_any = qr/(?:$marker_ul|$marker_ol)/o;
2562 }
2563
2564
2565 sub _GetListMarkerType {
2566     my ($list_type, $list_marker, $last_marker) = @_;
2567     return "" unless $list_type && $list_marker && lc($list_type) eq "ol";
2568     my $last_marker_type = '';
2569     $last_marker_type = _GetListMarkerType($list_type, $last_marker)
2570         if defined($last_marker) &&
2571             # these are roman unless $last_marker type case matches and is 'a' or 'A'
2572             $list_marker =~ /^[IiVvXx][.\)]?$/;
2573     return "I" if $list_marker =~ /^[IVX]/ && $last_marker_type ne 'A';
2574     return "i" if $list_marker =~ /^[ivx]/ && $last_marker_type ne 'a';
2575     return "A" if $list_marker =~ /^[A-Z]/;
2576     return "a" if $list_marker =~ /^[a-z]/ || $list_marker =~ /^$greek_lower/o;
2577     return "1";
2578 }
2579
2580
2581 sub _GetListItemTypeClass {
2582     my ($list_type, $list_marker, $last_marker) = @_;
2583     my $list_marker_type = _GetListMarkerType($list_type, $list_marker, $last_marker);
2584     my $ans = &{sub{
2585         return "" unless length($list_marker) >= 2 && $list_marker_type =~ /^[IiAa1]$/;
2586         return "lower-greek" if $list_marker_type eq "a" && $list_marker =~ /^$greek_lower/o;
2587         return "" unless $list_marker =~ /\)$/;
2588         return "upper-roman" if $list_marker_type eq "I";
2589         return "lower-roman" if $list_marker_type eq "i";
2590         return "upper-alpha" if $list_marker_type eq "A";
2591         return "lower-alpha" if $list_marker_type eq "a";
2592         return "decimal";
2593     }};
2594     return ($list_marker_type, $ans);
2595 }
2596
2597
2598 my %_roman_number_table;
2599 BEGIN {
2600     %_roman_number_table = (
2601         i       =>  1,
2602         ii      =>  2,
2603         iii     =>  3,
2604         iv      =>  4,
2605         v       =>  5,
2606         vi      =>  6,
2607         vii     =>  7,
2608         viii    =>  8,
2609         ix      =>  9,
2610         x       => 10,
2611         xi      => 11,
2612         xii     => 12,
2613         xiii    => 13,
2614         xiv     => 14,
2615         xv      => 15,
2616         xvi     => 16,
2617         xvii    => 17,
2618         xviii   => 18,
2619         xix     => 19,
2620         xx      => 20,
2621         xxi     => 21,
2622         xxii    => 22,
2623         xxiii   => 23,
2624         xxiv    => 24,
2625         xxv     => 25,
2626         xxvi    => 26,
2627         xxvii   => 27
2628     );
2629 }
2630
2631
2632 # Necessary because ς and σ are the same value grrr
2633 my %_greek_number_table;
2634 BEGIN {
2635     %_greek_number_table = (
2636         "\x{03b1}" =>  1, # α
2637         "\x{03b2}" =>  2, # β
2638         "\x{03b3}" =>  3, # γ
2639         "\x{03b4}" =>  4, # δ
2640         "\x{03b5}" =>  5, # ε
2641         "\x{03b6}" =>  6, # ζ
2642         "\x{03b7}" =>  7, # η
2643         "\x{03b8}" =>  8, # θ
2644         "\x{03b9}" =>  9, # ι
2645         "\x{03ba}" => 10, # κ
2646         "\x{03bb}" => 11, # λ
2647         #"\x{00b5}"=> 12, # µ is "micro" not "mu"
2648         "\x{03bc}" => 12, # μ
2649         "\x{03bd}" => 13, # ν
2650         "\x{03be}" => 14, # ξ
2651         "\x{03bf}" => 15, # ο
2652         "\x{03c0}" => 16, # π
2653         "\x{03c1}" => 17, # ρ
2654         "\x{03c2}" => 18, # ς
2655         "\x{03c3}" => 18, # σ
2656         "\x{03c4}" => 19, # τ
2657         "\x{03c5}" => 20, # υ
2658         "\x{03c6}" => 21, # φ
2659         "\x{03c7}" => 22, # χ
2660         "\x{03c8}" => 23, # ψ
2661         "\x{03c9}" => 24  # ω
2662     );
2663 }
2664
2665
2666 sub _GetMarkerIntegerNum {
2667     my ($list_marker_type, $marker_val) = @_;
2668     my $ans = &{sub{
2669         return 0 + $marker_val if $list_marker_type eq "1";
2670         $list_marker_type = lc($list_marker_type);
2671         return $_greek_number_table{$marker_val}
2672             if $list_marker_type eq "a" &&
2673             defined($_greek_number_table{$marker_val});
2674         $marker_val = lc($marker_val);
2675         return ord($marker_val) - ord("a") + 1 if $list_marker_type eq "a";
2676         return 1 unless $list_marker_type eq "i";
2677         defined($_roman_number_table{$marker_val}) and
2678             return $_roman_number_table{$marker_val};
2679         return 1;
2680     }};
2681     return $ans if $ans == 0 && $list_marker_type eq "1";
2682     return $ans >= 1 ? $ans : 1;
2683 }
2684
2685
2686 sub _IncrList {
2687     my ($from, $to, $extra) = @_;
2688     $extra = defined($extra) ? " $extra" : "";
2689     my $result = "";
2690     while ($from + 10 <= $to) {
2691         $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-10\"></span>\n";
2692         $from += 10;
2693     }
2694     while ($from + 5 <= $to) {
2695         $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-5\"></span>\n";
2696         $from += 5;
2697     }
2698     while ($from + 2 <= $to) {
2699         $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-2\"></span>\n";
2700         $from += 2;
2701     }
2702     while ($from < $to) {
2703         $result .= "<span$extra class=\"$opt{style_prefix}ol-incr\"></span>\n";
2704         ++$from;
2705     }
2706     return $result;
2707 }
2708
2709
2710 sub _DoListsAndBlocks {
2711 #
2712 # Form HTML ordered (numbered) and unordered (bulleted) lists.
2713 #
2714     my $text = shift;
2715     my $indent = $opt{indent_width};
2716     my $less_than_indent = $indent - 1;
2717     my $less_than_double_indent = 2 * $indent - 1;
2718
2719     # Re-usable pattern to match any entire ul or ol list:
2720     my $whole_list = qr{
2721         (                           # $1 (or $_[0]) = whole list
2722           (                         # $2 (or $_[1])
2723             (?:(?<=\n)|\A)
2724             [ ]{0,$less_than_indent}
2725             (${marker_any})         # $3 (or $_[2]) = first list item marker
2726             [ ]+
2727           )
2728           (?s:.+?)
2729           (                         # $4 (or $_[3])
2730               \z
2731             |
2732               \n{2,}
2733               (?=\S)
2734               (?!                   # Negative lookahead for another list item marker
2735                 ${marker_any}[ ]
2736               )
2737           )
2738         )
2739     }mx;
2740
2741     my $list_item_sub = sub {
2742         my $list = $_[0];
2743         my $list_type = ($_[2] =~ m/$marker_ul/) ? "ul" : "ol";
2744         my $list_att = "";
2745         my $list_class = "";
2746         my $list_incr = "";
2747         # Turn double returns into triple returns, so that we can make a
2748         # paragraph for the last item in a list, if necessary:
2749         $list =~ s/\n\n/\n\n\n/g;
2750         my ($result, $first_marker, $fancy) = _ProcessListItems($list_type, $list);
2751         defined($first_marker) or return $list;
2752         my $list_marker_type = _GetListMarkerType($list_type, $first_marker);
2753         if ($list_marker_type) {
2754                 $first_marker =~ s/[.\)]$//;
2755                 my $first_marker_num = _GetMarkerIntegerNum($list_marker_type, $first_marker);
2756                 $list_att = $list_marker_type eq "1" ? "" : " type=\"$list_marker_type\"";
2757                 if ($fancy) {
2758                     $list_class = " class=\"$opt{style_prefix}ol\"";
2759                     my $start = $first_marker_num;
2760                     $start = 10 if $start > 10;
2761                     $start = 5 if $start > 5 && $start < 10;
2762                     $start = 1 if $start > 1 && $start < 5;
2763                     $list_att .= " start=\"$start\"" unless $start == 1;
2764                     $list_incr = _IncrList($start, $first_marker_num);
2765                 } else {
2766                     $list_class = " class=\"$opt{style_prefix}lc-greek\""
2767                         if $list_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
2768                     $list_att .= " start=\"$first_marker_num\"" unless $first_marker_num == 1;
2769                 }
2770         }
2771         my $idt = "\027" x $g_list_level;
2772         $result = "$idt<$list_type$list_att$list_class>\n$list_incr" . $result . "$idt</$list_type>\n\n";
2773         $result;
2774     };
2775
2776     # We use a different prefix before nested lists than top-level lists.
2777     # See extended comment in _ProcessListItems().
2778     #
2779     # Note: (jg) There's a bit of duplication here. My original implementation
2780     # created a scalar regex pattern as the conditional result of the test on
2781     # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
2782     # substitution once, using the scalar as the pattern. This worked,
2783     # everywhere except when running under MT on my hosting account at Pair
2784     # Networks. There, this caused all rebuilds to be killed by the reaper (or
2785     # perhaps they crashed, but that seems incredibly unlikely given that the
2786     # same script on the same server ran fine *except* under MT. I've spent
2787     # more time trying to figure out why this is happening than I'd like to
2788     # admit. My only guess, backed up by the fact that this workaround works,
2789     # is that Perl optimizes the substition when it can figure out that the
2790     # pattern will never change, and when this optimization isn't on, we run
2791     # afoul of the reaper. Thus, the slightly redundant code to that uses two
2792     # static s/// patterns rather than one conditional pattern.
2793     #
2794     # Note: (kjm) With the addition of the two-of-the-same-kind-in-a-row-
2795     # starts-a-list-at-the-top-level rule the two patterns really are somewhat
2796     # different now, but the duplication has pretty much been eliminated via
2797     # use of a separate sub which has the side-effect of making the below
2798     # two cases much easier to grok all at once.
2799
2800     if ($g_list_level) {
2801         my $parse = $text;
2802         $text = "";
2803         pos($parse) = 0;
2804         while ($parse =~ /\G(?s:.)*?^$whole_list/gmc) {
2805             my @captures = ($1, $2, $3, $4);
2806             if ($-[1] > $-[0]) {
2807                 $text .= _DoBTListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
2808             }
2809             $text .= &$list_item_sub(@captures);
2810         }
2811         $text .= _DoBTListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
2812     }
2813     else {
2814         my $parse = $text;
2815         $text = "";
2816         pos($parse) = 0;
2817         while ($parse =~ m{\G(?s:.)*?
2818                 (?: (?<=\n\n) |
2819                     \A\n? |
2820                     (?<=:\n) |
2821                     (?:(?<=\n) # a list starts with one unordered marker line
2822                        (?=[ ]{0,$less_than_indent}$marker_ul[ ])) |
2823                     (?:(?<=\n) # or two ordered marker lines in a row
2824                        (?=[ ]{0,$less_than_indent}$marker_ol[ ].*\n\n?
2825                           [ ]{0,$less_than_indent}$marker_ol[ ])) |
2826                     (?:(?<=\n) # or any marker and a sublist marker
2827                        (?=[ ]{0,$less_than_indent}$marker_any[ ].*\n\n?
2828                           [ ]{$indent,$less_than_double_indent}$marker_any[ ]))
2829                 )
2830                 $whole_list
2831             }gmcx) {
2832             my @captures = ($1, $2, $3, $4);
2833             if ($-[1] > $-[0]) {
2834                 $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
2835             }
2836             $text .= &$list_item_sub(@captures);
2837         }
2838         $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
2839     }
2840
2841     return $text;
2842 }
2843
2844
2845 sub _ProcessListItems {
2846 #
2847 #   Process the contents of a single ordered or unordered list, splitting it
2848 #   into individual list items.
2849 #
2850
2851     my $list_type = shift;
2852     my $list_str = shift;
2853
2854     # The $g_list_level global keeps track of when we're inside a list.
2855     # Each time we enter a list, we increment it; when we leave a list,
2856     # we decrement. If it's zero, we're not in a list anymore.
2857     #
2858     # We do this because when we're not inside a list, we want to treat
2859     # something like this:
2860     #
2861     #   I recommend upgrading to version
2862     #   8. Oops, now this line is treated
2863     #   as a sub-list.
2864     #
2865     # As a single paragraph, despite the fact that the second line starts
2866     # with a digit-period-space sequence.
2867     #
2868     # Whereas when we're inside a list (or sub-list), that line will be
2869     # treated as the start of a sub-list. What a kludge, huh? This is
2870     # an aspect of Markdown's syntax that's hard to parse perfectly
2871     # without resorting to mind-reading. Perhaps the solution is to
2872     # change the syntax rules such that sub-lists must start with a
2873     # starting cardinal number; e.g. "1." or "a.".
2874
2875     $g_list_level++;
2876     my $idt = "\027" x $g_list_level;
2877     my $marker_kind = $list_type eq "ul" ? $marker_ul : $marker_ol;
2878     my $first_marker;
2879     my $first_marker_type;
2880     my $first_marker_num;
2881     my $last_marker;
2882     my $fancy;
2883     my $skipped;
2884     my $typechanged;
2885     my $next_num = 1;
2886
2887     # trim trailing blank lines:
2888     $list_str =~ s/\n{2,}\z/\n/;
2889
2890     my $result = "";
2891     my $oldpos = 0;
2892     pos($list_str) = 0;
2893     while ($list_str =~ m{\G            # start where we left off
2894         (\n+)?                          # leading line = $1
2895         (^[ ]*)                         # leading whitespace = $2
2896         ($marker_any) [ ] ([ ]*)        # list marker = $3 leading item space = $4
2897     }cgmx) {
2898         my $leading_line = $1;
2899         my $leading_space = $2;
2900         my $list_marker = $3;
2901         my $list_marker_len = length($list_marker);
2902         my $leading_item_space = $4;
2903         if ($-[0] > $oldpos) {
2904             $result .= substr($list_str, $oldpos, $-[0] - $oldpos); # Sort-of $`
2905             $oldpos = $-[0]; # point at start of this entire match
2906         }
2907         if (!defined($first_marker)) {
2908             $first_marker = $list_marker;
2909             $first_marker_type = _GetListMarkerType($list_type, $first_marker);
2910             if ($first_marker_type) {
2911                 (my $marker_val = $first_marker) =~ s/[.\)]$//;
2912                 $first_marker_num = _GetMarkerIntegerNum($first_marker_type, $marker_val);
2913                 $next_num = $first_marker_num;
2914                 $skipped = 1 if $next_num != 1;
2915             }
2916         } elsif ($list_marker !~ /$marker_kind/) {
2917             # Wrong marker kind, "fix up" the marker to a correct "lazy" marker
2918             # But keep the old length in $list_marker_len
2919             $list_marker = $last_marker;
2920         }
2921
2922         # Now grab the rest of this item's data upto but excluding the next
2923         # list marker at the SAME indent level, but sublists must be INCLUDED
2924
2925         my $item = "";
2926         while ($list_str =~ m{\G
2927             ((?:.+?)(?:\n{1,2}))        # list item text = $1
2928             (?= \n* (?: \z |            # end of string OR
2929                     (^[ ]*)             # leading whitespace = $2
2930                     ($marker_any)       # next list marker = $3
2931                     ([ ]+) ))           # one or more spaces after marker = $4
2932         }cgmxs) {
2933
2934             # If $3 has a left edge that is at the left edge of the previous
2935             # marker OR $3 has a right edge that is at the right edge of the
2936             # previous marker then we stop; otherwise we go on
2937
2938             $item .= substr($list_str, $-[0], $+[0] - $-[0]); # $&
2939             last if !defined($4) || length($2) == length($leading_space) ||
2940                 length($2) + length($3) == length($leading_space) + $list_marker_len;
2941             # move along, you're not the marker droid we're looking for...
2942             $item .= substr($list_str, $+[0], $+[4] - $+[0]);
2943             pos($list_str) = $+[4]; # ...move along over the marker droid
2944         }
2945         # Remember where we parked
2946         $oldpos = pos($list_str);
2947
2948         # Process the $list_marker $item
2949
2950         my $liatt = '';
2951         my $checkbox = '';
2952         my $incr = '';
2953
2954         if ($list_type eq "ul" && !$leading_item_space && $item =~ /^\[([ xX\xd7])\] +(.*)$/s) {
2955             my $checkmark = $1;
2956             $item = $2;
2957             my ($checkbox_class, $checkbox_val);
2958             if ($checkmark ne " ") {
2959                 ($checkbox_class, $checkbox_val) = ("checkbox-on", "x");
2960             } else {
2961                 ($checkbox_class, $checkbox_val) = ("checkbox-off", "&#160;");
2962             }
2963             $liatt = " class=\"$opt{style_prefix}$checkbox_class\"";
2964             $checkbox = "<span><span></span></span><span></span><span>[<tt>$checkbox_val</tt>]&#160;</span>";
2965         } else {
2966             my $list_marker_type;
2967             ($list_marker_type, $liatt) = _GetListItemTypeClass($list_type, $list_marker, $last_marker);
2968             if ($list_type eq "ol" && defined($first_marker)) {
2969                 my $styled = $fancy = 1 if $liatt && $list_marker =~ /\)$/;
2970                 my ($sfx, $dash) = ("", "");
2971                 ($sfx, $dash) = ("li", "-") if $styled;
2972                 if ($liatt =~ /lower/) {
2973                     $sfx .= "${dash}lc";
2974                 } elsif ($liatt =~ /upper/) {
2975                     $sfx .= "${dash}uc";
2976                 }
2977                 $sfx .= "-greek" if $liatt =~ /greek/;
2978                 $liatt = " class=\"$opt{style_prefix}$sfx\"" if $sfx;
2979                 $typechanged = 1 if $list_marker_type ne $first_marker_type;
2980                 (my $marker_val = $list_marker) =~ s/[.\)]$//;
2981                 my $marker_num = _GetMarkerIntegerNum($list_marker_type, $marker_val);
2982                 $marker_num = $next_num if $marker_num < $next_num;
2983                 $skipped = 1 if $next_num < $marker_num;
2984                 $incr = _IncrList($next_num, $marker_num, "incrlevel=$g_list_level");
2985                 $liatt = " value=\"$marker_num\"$liatt" if $fancy || $skipped;
2986                 $liatt = " type=\"$list_marker_type\"$liatt" if $styled || $typechanged;
2987                 $next_num = $marker_num + 1;
2988             }
2989         }
2990         $last_marker = $list_marker;
2991
2992         if ($item =~ /^(.+)/) {
2993             my $ml_text = $1;
2994             my $ml_len = length($1);
2995             my $ml_sub = sub {my $ml_mk = shift; $ml_mk =~ s!([-+*.\)])!$g_escape_table{$1}!go; $ml_mk};
2996             $ml_text =~ s/(?:(?<= )|\A)(${marker_any})(?= )/&$ml_sub($1)/ge;
2997             $item = $ml_text . substr($item, $ml_len);
2998         }
2999         if ($leading_line or ($item =~ m/\n{2,}/)) {
3000             $item = _RunBlockGamut(_Outdent($item));
3001             $item =~ s{(</[OUou][Ll]>)\s*\z}{$1} and $item .= "\n$idt<span style=\"display:none\">&#160;</span>";
3002         }
3003         else {
3004             # Recursion for sub-lists:
3005             $item = _DoListsAndBlocks(_Outdent($item));
3006             chomp $item;
3007             $item = _RunSpanGamut($item);
3008         }
3009
3010         # Append to $result
3011         $result .= "$incr$idt<li$liatt>" . $checkbox . $item . "$idt</li>\n";
3012     }
3013     if ($fancy) {
3014         # remove "incrlevel=$g_list_level " parts
3015         $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr((?:-\d{1,2})?)">}
3016             {$idt<span class="$opt{style_prefix}ol-incr$1">}g;
3017     } else {
3018         # remove the $g_list_level incr spans entirely
3019         $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr(?:-\d{1,2})?"></span>\n}{}g;
3020         # remove the class="$opt{style_prefix}lc-greek" if first_marker is greek
3021         $result =~ s{(<li[^>]*?) class="$opt{style_prefix}lc-greek">}{$1>}g
3022             if defined($first_marker_type) && $first_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
3023     }
3024
3025     # Anything left over (similar to $') goes into result, but this should always be empty
3026     $result .= _RunBlockGamut(substr($list_str, pos($list_str))) if pos($list_str) < length($list_str);
3027
3028     $g_list_level--;
3029
3030     # After all that, if we only got an ordered list with a single item
3031     # and its first marker is a four-digit number >= 1492 and <= 2999
3032     # or an UPPERCASE letter, then pretend we didn't see any list at all.
3033
3034     if ($first_marker_type && $first_marker_num + 1 == $next_num) {
3035         if (($first_marker_type eq "1" && $first_marker_num >= 1492 && $first_marker_num <= 2999) ||
3036             ($first_marker_type eq "A" && !$fancy)) {
3037             return (undef, undef, undef);
3038         }
3039     }
3040
3041     return ($result, $first_marker, $fancy);
3042 }
3043
3044
3045 sub _DoCodeBlocks {
3046 #
3047 #   Process Markdown `<pre><code>` blocks.
3048 #
3049
3050     my $text = shift;
3051     my $less_than_indent = $opt{indent_width} - 1;
3052
3053     $text =~ s{
3054             (\n\n|\A\n?)
3055             (           # $2 = the code block -- one or more lines, starting with indent_width spaces
3056               (?:
3057                 (?:[ ]{$opt{indent_width}})  # Lines must start with indent_width of spaces
3058                 .*\n+
3059               )+
3060             )
3061             (?:(?=(^[ ]{0,$less_than_indent}\S.*))|\Z) # Lookahead for non-space at line-start, or end of doc
3062         }{&{sub{
3063             my ($prefix, $codeblock, $n) = ($1, $2, $3);
3064
3065             if (defined($n) && length($n) && (()=($codeblock =~ /\n/g)) == 1 && _IsTableStart($codeblock.$n."\n")) {
3066                 return $prefix.$codeblock;
3067             }
3068
3069             $codeblock =~ s/\n\n\n/\n\n/g; # undo "paragraph for last list item" change
3070             $codeblock = _EncodeCode(_Outdent($codeblock));
3071             $codeblock =~ s/\A\n+//; # trim leading newlines
3072             $codeblock =~ s/\s+\z//; # trim trailing whitespace
3073
3074             my $result = "<div class=\"$opt{style_prefix}code\"><pre style=\"display:none\"></pre><pre><code>"
3075                 . $codeblock . "\n</code></pre></div>";
3076             my $key = block_id($result, 2);
3077             $g_code_blocks{$key} = $result;
3078             "\n\n" . $key . "\n\n";
3079         }}}egmx;
3080
3081     return $text;
3082 }
3083
3084
3085 sub _DoCodeSpans {
3086 #
3087 # * Backtick quotes are used for <code></code> spans.
3088 #
3089 # * You can use multiple backticks as the delimiters if you want to
3090 #   include literal backticks in the code span. So, this input:
3091 #
3092 #     Just type ``foo `bar` baz`` at the prompt.
3093 #
3094 #   Will translate to:
3095 #
3096 #     <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
3097 #
3098 #   There's no arbitrary limit to the number of backticks you
3099 #   can use as delimters. If you need three consecutive backticks
3100 #   in your code, use four for delimiters, etc.
3101 #
3102 # * You can use spaces to get literal backticks at the edges:
3103 #
3104 #     ... type `` `bar` `` ...
3105 #
3106 #   Turns to:
3107 #
3108 #     ... type <code>`bar`</code> ...
3109 #
3110
3111     my $text = shift;
3112
3113     $text =~ s@
3114             (`+)        # $1 = Opening run of `
3115             (.+?)       # $2 = The code block
3116             (?<!`)
3117             \1          # Matching closer
3118             (?!`)
3119         @
3120             my $c = "$2";
3121             $c =~ s/^[ ]+//g; # leading whitespace
3122             $c =~ s/[ ]+$//g; # trailing whitespace
3123             $c = _EncodeCode($c);
3124             "<code>$c</code>";
3125         @egsx;
3126
3127     return $text;
3128 }
3129
3130
3131 sub _EncodeCode {
3132 #
3133 # Encode/escape certain characters inside Markdown code runs.
3134 # The point is that in code, these characters are literals,
3135 # and lose their special Markdown meanings.
3136 #
3137     local $_ = shift;
3138
3139     # Encode all ampersands; HTML entities are not
3140     # entities within a Markdown code span.
3141     s/&/&amp;/g;
3142
3143     # Encode $'s, but only if we're running under Blosxom.
3144     # (Blosxom interpolates Perl variables in article bodies.)
3145     s/\$/&#036;/g if $_haveBX;
3146
3147     # Do the angle bracket song and dance:
3148     s! <  !&lt;!gx;
3149     s! >  !&gt;!gx;
3150
3151     # Now, escape characters that are magic in Markdown:
3152     s!([*_~{}\[\]\\])!$g_escape_table{$1}!go;
3153
3154     return $_;
3155 }
3156
3157
3158 sub _DoItalicsAndBoldAndStrike {
3159     my $text = shift;
3160
3161     my $doital1 = sub {
3162         my $text = shift;
3163         $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
3164             {<em>$1</em>}gsx;
3165         # We've got to encode any of these remaining to
3166         # avoid conflicting with other italics and bold.
3167         $text =~ s!([*])!$g_escape_table{$1}!go;
3168         $text;
3169     };
3170     my $doital2 = sub {
3171         my $text = shift;
3172         $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
3173             {<em>$1</em>}gsx;
3174         # We've got to encode any of these remaining to
3175         # avoid conflicting with other italics and bold.
3176         $text =~ s!([_])!$g_escape_table{$1}!go;
3177         $text;
3178     };
3179
3180     # <strong> must go first:
3181     $text =~ s{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }
3182         {"<strong>".&$doital1($1)."</strong>"}gsex;
3183     $text =~ s{ (?<!\w) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\w) }
3184         {"<strong>".&$doital2($1)."</strong>"}gsex;
3185
3186     $text =~ s{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ }
3187         {<strike>$1</strike>}gsx;
3188
3189     $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
3190         {<em>$1</em>}gsx;
3191     $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
3192         {<em>$1</em>}gsx;
3193
3194     return $text;
3195 }
3196
3197
3198 sub _DoBlockQuotes {
3199     my $text = shift;
3200
3201     $text =~ s{
3202           (                     # Wrap whole match in $1
3203             (
3204               ^[ ]*>[ ]?        # '>' at the start of a line
3205                 .*\n            # rest of the first line
3206               (.+\n)*           # subsequent consecutive lines
3207               \n*               # blanks
3208             )+
3209           )
3210         }{
3211             my $bq = $1;
3212             $bq =~ s/^[ ]*>[ ]?//gm; # trim one level of quoting
3213             $bq =~ s/^[ ]+$//mg;         # trim whitespace-only lines
3214             $bq = _RunBlockGamut($bq);   # recurse
3215
3216             $bq =~ s/^/\027/mg;
3217             "<blockquote>\n$bq\n</blockquote>\n\n";
3218         }egmx;
3219
3220
3221     return $text;
3222 }
3223
3224
3225 my ($LEAD, $TRAIL, $LEADBAR, $LEADSP, $COLPL, $SEP);
3226 BEGIN {
3227     $LEAD = qr/(?>[ ]*(?:\|[ ]*)?)/o;
3228     $TRAIL = qr/[ ]*(?<!\\)\|[ ]*/o;
3229     $LEADBAR = qr/(?>[ ]*\|[ ]*)/o;
3230     $LEADSP = qr/(?>[ ]*)/o;
3231     $COLPL = qr/(?:[^\n|\\]|\\(?:(?>[^\n])|(?=\n|$)))+/o;
3232     $SEP = qr/[ ]*:?-+:?[ ]*/o;
3233 }
3234
3235 sub _IsTableStart {
3236     my $text = shift;
3237     my $ans = 0;
3238
3239     if ($text =~ m{
3240          ^(                             # Header line
3241             $LEADBAR \| [^\n]* |
3242             $LEADBAR $COLPL [^\n]* |
3243             $LEADSP $COLPL \| [^\n]*
3244           )\n
3245           (                             # Separator line
3246             $LEADBAR $SEP (?: \| $SEP )* (?: \| [ ]*)? |
3247             $SEP (?: \| $SEP )+ (?: \| [ ]*)? |
3248             $SEP \| [ ]*
3249           )\n
3250                         }mx) {
3251         my ($h, $s) = ($1, $2);
3252         _SplitTableRow($h) == _SplitTableRow($s) and $ans = 1;
3253     }
3254
3255     return $ans;
3256 }
3257
3258 sub _DoTables {
3259     my $text = shift;
3260
3261     $text =~ s{
3262         (                               # Wrap whole thing to avoid $&
3263          (?: (?<=\n\n) | \A\n? )        # Preceded by blank line or beginning of string
3264          ^(                             # Header line
3265             $LEADBAR \| [^\n]* |
3266             $LEADBAR $COLPL [^\n]* |
3267             $LEADSP $COLPL \| [^\n]*
3268           )\n
3269           (                             # Separator line
3270             $LEADBAR $SEP (?: \| $SEP )* (?: \| [ ]*)? |
3271             $SEP (?: \| $SEP )+ (?: \| [ ]*)? |
3272             $SEP \| [ ]*
3273           )\n
3274           ((?:                          # Rows (0+)
3275             $LEADBAR \| [^\n]* \n |
3276             $LEADBAR $COLPL [^\n]* \n |
3277             $LEADSP $COLPL \| [^\n]* \n
3278           )*)
3279         )
3280     } {
3281         my ($w, $h, $s, $rows) = ($1, $2, $3, $4);
3282         my @heads = _SplitTableRow($h);
3283         my @seps = _SplitTableRow($s);
3284         if (@heads == @seps) {
3285             my @align = map {
3286                 if (/^:-+:$/) {" align=\"center\""}
3287                 elsif (/^:/) {" align=\"left\""}
3288                 elsif (/:$/) {" align=\"right\""}
3289                 else {""}
3290             } @seps;
3291             my $nohdr = "";
3292             $nohdr = " $opt{style_prefix}table-nohdr" if join("", @heads) eq "";
3293             my $tab ="\n<table border=\"1\" cellspacing=\"0\" cellpadding=\"2\" class=\"$opt{style_prefix}table$nohdr\">\n";
3294             $tab .=
3295                 "  <tr class=\"$opt{style_prefix}row-hdr\">" . _MakeTableRow("th", \@align, @heads) . "</tr>\n"
3296                 unless $nohdr;
3297             my $cnt = 0;
3298             my @classes = ("class=\"$opt{style_prefix}row-even\"", "class=\"$opt{style_prefix}row-odd\"");
3299             $tab .= "  <tr " . $classes[++$cnt % 2] . ">" . _MakeTableRow("td", \@align, @$_) . "</tr>\n"
3300                     foreach (_SplitMergeRows($rows));
3301             $tab .= "</table>\n\n";
3302         } else {
3303             $w;
3304         }
3305     }egmx;
3306
3307     return $text;
3308 }
3309
3310
3311 sub _SplitMergeRows {
3312     my @rows = ();
3313     my ($mergeprev, $mergenext) = (0,0);
3314     foreach (split(/\n/, $_[0])) {
3315         $mergeprev = $mergenext;
3316         $mergenext = 0;
3317         my @cols = _SplitTableRow($_);
3318         if (_endswithbareslash($cols[$#cols])) {
3319             my $last = $cols[$#cols];
3320             substr($last, -1, 1) = "";
3321             $last =~ s/[ ]+$//;
3322             $cols[$#cols] = $last;
3323             $mergenext = 1;
3324         }
3325         if ($mergeprev) {
3326             for (my $i = 0; $i <= $#cols; ++$i) {
3327                 my $cell = $rows[$#rows]->[$i];
3328                 defined($cell) or $cell = "";
3329                 $rows[$#rows]->[$i] = _MergeCells($cell, $cols[$i]);
3330             }
3331         } else {
3332             push(@rows, [@cols]);
3333         }
3334     }
3335     return @rows;
3336 }
3337
3338
3339 sub _endswithbareslash {
3340     return 0 unless substr($_[0], -1, 1) eq "\\";
3341     my @parts = split(/\\\\/, $_[0], -1);
3342     return substr($parts[$#parts], -1, 1) eq "\\";
3343 }
3344
3345
3346 sub _MergeCells {
3347     my ($c1, $c2) = @_;
3348     return $c1 if $c2 eq "";
3349     return $c2 if $c1 eq "";
3350     return $c1 . " " . $c2;
3351 }
3352
3353
3354 sub _SplitTableRow {
3355     my $row = shift;
3356     $row =~ s/^$LEAD//;
3357     $row =~ s/$TRAIL$//;
3358     $row =~ s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
3359     $row =~ s!\\\|!$g_escape_table{'|'}!go; # Then do \|
3360     my @elems = map {
3361       s!$g_escape_table{'|'}!|!go;
3362       s!$g_escape_table{'\\'}!\\\\!go;
3363       s/^[ ]+//;
3364       s/[ ]+$//;
3365       $_;
3366     } split(/[ ]*\|[ ]*/, $row, -1);
3367     @elems or push(@elems, "");
3368     return @elems;
3369 }
3370
3371
3372 sub _MakeTableRow {
3373     my $etype = shift;
3374     my $align = shift;
3375     my $row = "";
3376     for (my $i = 0; $i < @$align; ++$i) {
3377         my $data = $_[$i];
3378         defined($data) or $data = "";
3379         $row .= "<" . $etype . $$align[$i] . ">" .
3380             _RunSpanGamut($data) . "</" . $etype . ">";
3381     }
3382     return $row;
3383 }
3384
3385
3386 sub _FormParagraphs {
3387 #
3388 # Params:
3389 #   $text - string to process with html <p> tags
3390 #
3391     my ($text, $anchors) = @_;
3392
3393     # Strip leading and trailing lines:
3394     $text =~ s/\A\n+//;
3395     $text =~ s/\n+\z//;
3396
3397     my @grafs = split(/\n{2,}/, $text);
3398
3399     #
3400     # Wrap <p> tags.
3401     #
3402     foreach (@grafs) {
3403         unless (defined($g_html_blocks{$_}) || defined($g_code_blocks{$_})) {
3404             $_ = _RunSpanGamut($_);
3405             s/^([ ]*)/$g_start_p/;
3406             $_ .= $g_close_p;
3407         }
3408     }
3409
3410     #
3411     # Strip standalone XML comments if requested
3412     #
3413     if ($anchors && $opt{stripcomments} && @g_xml_comments) {
3414         my %xml_comment = ();
3415         $xml_comment{$_} = 1 foreach @g_xml_comments;
3416         my @grafs2 = ();
3417         do { push(@grafs2, $_) unless $xml_comment{$_} } foreach @grafs;
3418         @grafs = @grafs2;
3419     }
3420
3421     #
3422     # Unhashify HTML blocks
3423     #
3424     foreach (@grafs) {
3425         if (defined( $g_html_blocks{$_} )) {
3426             $_ = $g_html_blocks{$_};
3427         }
3428     }
3429
3430     return join "\n\n", @grafs;
3431 }
3432
3433
3434 # %ok_tag_name declared previously
3435 my $g_possible_tag_name;
3436 BEGIN {
3437     # note: length("blockquote") == 10
3438     $g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6]|\020)/o;
3439     %ok_tag_name = map({$_ => 1} "\20", qw(
3440         a abbr acronym address area
3441         b basefont bdo big blockquote br
3442         caption center cite code col colgroup
3443         dd del dfn div dl dt
3444         em
3445         font
3446         h1 h2 h3 h4 h5 h6 hr
3447         i img ins
3448         kbd
3449         li
3450         map
3451         ol
3452         p pre
3453         q
3454         s samp small span strike strong sub sup
3455         table tbody td tfoot th thead tr tt
3456         u ul
3457         var
3458     ));
3459     $ok_tag_name{$_} = 0 foreach (qw(
3460         dir menu
3461     ));
3462 }
3463
3464
3465 sub _SetAllowedTag {
3466     my ($tag, $forbid) = @_;
3467     $ok_tag_name{$tag} = $forbid ? 0 : 1
3468         if defined($tag) && exists($ok_tag_name{$tag});
3469 }
3470
3471
3472 # Encode leading '<' of any non-tags
3473 # However, "<?", "<!" and "<$" are passed through (legacy on that "<$" thing)
3474 sub _DoTag {
3475     my $tag = shift;
3476     return $tag if $tag =~ /^<[?\$!]/;
3477     if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3478         $ok_tag_name{lc($1)}) {
3479
3480         return _ProcessURLTag("href", $tag, 1) if $tag =~ /^<a\s/i;
3481         return _ProcessURLTag("src", $tag) if $tag =~ /^<img\s/i;
3482         return $tag;
3483     }
3484     $tag =~ s/^</&lt;/;
3485     return $tag;
3486 }
3487
3488 # Strip out all tags that _DoTag would match
3489 sub _StripTags {
3490     my $text = shift;
3491     my $_StripTag = sub {
3492         my $tag = shift;
3493         return $tag if $tag =~ /^<[?\$!]/;
3494         if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3495             $ok_tag_name{lc($1)}) {
3496
3497             return ""; # strip it out
3498         }
3499         return $tag;
3500     };
3501     $text =~ s{(<[^>]*>)}{&$_StripTag($1)}ige;
3502     return $text;
3503 }
3504
3505 my %univatt;    # universally allowed attribute names
3506 my %tagatt;     # per-element allowed attribute names
3507 my %tagmt;      # empty element tags
3508 my %tagocl;     # non-empty elements with optional closing tag
3509 my %tagacl;     # which %tagocl an opening %tagocl will close
3510 my %tagblk;     # block elements
3511 my %taginl;     # inline markup tags which trigger an auto <p> reopen
3512 my %taga1p;     # open tags which require at least one attribute
3513 my %lcattval;   # names of attribute values to lowercase
3514 my %impatt;     # names of "implied" attributes
3515 BEGIN {
3516     %univatt = map({$_ => 1} qw(class dir id lang style title xml:lang));
3517     %tagatt = (
3518         'a' => { map({$_ => 1} qw(href name rel target)) },
3519         'area' => { map({$_ => 1} qw(alt coords href nohref shape)) },
3520         'basefont' => { map({$_ => 1} qw(color face size)) },
3521         'br' => { map({$_ => 1} qw(clear)) },
3522         'caption' => { map({$_ => 1} qw(align)) },
3523         'col' => { map({$_ => 1} qw(align char charoff span width valign)) },
3524         'colgroup' => { map({$_ => 1} qw(align char charoff span width valign)) },
3525         'dir' => { map({$_ => 1} qw(compact)) },
3526         'div' => { map({$_ => 1} qw(align)) },
3527         'dl' => { map({$_ => 1} qw(compact)) },
3528         'font' => { map({$_ => 1} qw(color face size)) },
3529         'h1' => { map({$_ => 1} qw(align)) },
3530         'h2' => { map({$_ => 1} qw(align)) },
3531         'h3' => { map({$_ => 1} qw(align)) },
3532         'h4' => { map({$_ => 1} qw(align)) },
3533         'h5' => { map({$_ => 1} qw(align)) },
3534         'h6' => { map({$_ => 1} qw(align)) },
3535         'hr' => { map({$_ => 1} qw(align noshade size width)) },
3536         # NO server-side image maps, therefore NOT ismap !
3537         'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) },
3538         'li' => { map({$_ => 1} qw(compact type value)) },
3539         'map' => { map({$_ => 1} qw(name)) },
3540         'menu' => { map({$_ => 1} qw(compact)) },
3541         'ol' => { map({$_ => 1} qw(compact start type)) },
3542         'p' => { map({$_ => 1} qw(align)) },
3543         'pre' => { map({$_ => 1} qw(width)) },
3544         'table' => { map({$_ => 1} qw(align bgcolor border cellpadding cellspacing frame rules summary width)) },
3545         'tbody' => { map({$_ => 1} qw(align char charoff valign)) },
3546         'tfoot' => { map({$_ => 1} qw(align char charoff valign)) },
3547         'thead' => { map({$_ => 1} qw(align char charoff valign)) },
3548         'td' => { map({$_ => 1} qw(align bgcolor char charoff colspan height nowrap rowspan valign width)) },
3549         'th' => { map({$_ => 1} qw(align bgcolor char charoff colspan height nowrap rowspan valign width)) },
3550         'tr' => { map({$_ => 1} qw(align bgcolor char charoff valign)) },
3551         'ul' => { map({$_ => 1} qw(compact type)) }
3552     );
3553     %tagmt = map({$_ => 1} qw(area basefont br col hr img));
3554     %tagocl = map({$_ => 1} qw(colgroup dd dt li p tbody td tfoot th thead tr));
3555     %tagacl = (
3556         'colgroup' => \%tagocl,
3557         'dd' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3558         'dt' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3559         'li' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3560         'tbody' => \%tagocl,
3561         'td' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3562         'tfoot' => \%tagocl,
3563         'th' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3564         'thead' => \%tagocl,
3565         'tr' => { map({$_ => 1} qw(colgroup dd dt li p td th tr)) },
3566     );
3567     %tagblk = map({$_ => 1} qw(address blockquote center div dl h1 h2 h3 h4 h5 h6 hr ol p pre table ul));
3568     %taginl = map({$_ => 1} qw(a abbr acronym b basefont bdo big br cite code dfn em font i
3569                                img kbd map q s samp small span strike strong sub sup tt u var));
3570     %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap));
3571     %lcattval = map({$_ => 1} qw(
3572         align border cellpadding cellspacing checked clear color colspan
3573         compact coords height hspace ismap nohref noshade nowrap rowspan size
3574         span shape valign vspace width
3575     ));
3576     %taga1p = map({$_ => 1} qw(a area bdo img map));
3577 }
3578
3579
3580 # _SanitizeTags
3581 #
3582 # Inspect all '<'...'>' tags in the input and HTML encode those things
3583 # that cannot possibly be tags and at the same time sanitize them.
3584 #
3585 # $1 => text to process
3586 # <= sanitized text
3587 sub _SanitizeTags {
3588     my ($text, $validate, $htmlauto) = @_;
3589     $text =~ s/\s+$//;
3590     $text ne "" or return "";
3591     my @stack = ();
3592     my $ans = "";
3593     my $end = length($text);
3594     pos($text) = 0;
3595     my ($autoclose, $autoclopen);
3596     my $lastmt = "";
3597     my $reopenp = 0;
3598     $autoclose = $htmlauto ? sub {
3599         my $s = $_[0] || "";
3600         while (@stack &&
3601                ($stack[$#stack]->[0] ne $s || $_[1] && !$stack[$#stack]->[2]) &&
3602                $tagocl{$stack[$#stack]->[0]}) {
3603             $ans .= "</" . $stack[$#stack]->[0] . ">";
3604             pop(@stack);
3605         }
3606     } : sub {} if $validate;
3607     $autoclopen = $htmlauto ? sub {
3608         my $s = $_[0] || "";
3609         my $c;
3610         if ($tagblk{$s}) {$c = {p=>1}}
3611         elsif ($tagocl{$s}) {$c = $tagacl{$s}}
3612         else {return}
3613         my $clp = 0;
3614         while (@stack && $c->{$stack[$#stack]->[0]}) {
3615             $clp = 0;
3616             if ($stack[$#stack]->[2] &&
3617                 $stack[$#stack]->[1]+3 eq $_[1]) {
3618                 $ans .= "</\20>";
3619             } else {
3620                 $ans .= "</" . $stack[$#stack]->[0] . ">";
3621             }
3622             if ($stack[$#stack]->[2]) {
3623                 $stack[$#stack]->[0] = "\20";
3624             } else {
3625                 $clp = $s ne "p" && $stack[$#stack]->[0] eq "p";
3626                 pop(@stack);
3627             }
3628         }
3629         $clp;
3630     } : sub {} if $validate;
3631     while (pos($text) < $end) {
3632         if ($text =~ /\G(\s+)/gc) {
3633             $ans .= $1;
3634             next;
3635         }
3636         if ($text =~ /\G([^<]+)/gc) {
3637             if ($validate && @stack && $stack[$#stack]->[0] eq "\20") {
3638                 push(@stack,["p",pos($text)-length($1)]);
3639                 $reopenp = 0;
3640                 $ans .= "<p>";
3641             }
3642             $reopenp && do {
3643                 push(@stack,["p",pos($text)-length($1)]);
3644                 $reopenp = 0;
3645                 $ans .= "<p>";
3646             };
3647             $ans .= _EncodeAmps($1);
3648             $lastmt = "";
3649             next;
3650         }
3651         my $tstart = pos($text);
3652         if ($opt{stripcomments} != 2 &&
3653             $text =~ /\G(<!--(?:[^-]|(?:-(?!-)))*-->)/gc) {
3654             # pass "comments" through unless stripping them
3655             if ($opt{stripcomments} && $opt{stripcomments} < 3) {
3656                 # strip any trailing whitespace + \n after comment if present
3657                 $text =~ /\G[ \t]*\n/gc;
3658             } else {
3659                 # pass the "comment" on through
3660                 $ans .= $1;
3661             }
3662             next;
3663         }
3664         if ($opt{stripcomments} >= 2 &&
3665             $text =~ /\G(<!--(?:[^-]|(?:-(?!->)))*-->)/gc) {
3666             # strip any trailing whitespace + \n after lax comment if present
3667             $text =~ /\G[ \t]*\n/gc;
3668             next;
3669         }
3670         if ($text =~ /\G(<[^>]*>)/gc) {
3671             my $tag = $1;
3672             my $tt;
3673             if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} ||
3674                  $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3675                 $ok_tag_name{$tt=lc($1)})
3676             {
3677                 my ($stag, $styp, $autocloseflag) = _Sanitize($tag);
3678                 if ($styp == 2 && $lastmt eq $tt) {
3679                     $lastmt = "";
3680                     next;
3681                 }
3682                 $lastmt = $styp == -3 ? $tt : "";
3683                 $tt = "p" if $autocloseflag;
3684                 if ($validate && $styp) {
3685                     my $clp = &$autoclopen($tt, $tstart) if $styp != 2;
3686                     if ($styp == 1) {
3687                         $reopenp && $taginl{$tt} and do {
3688                             push(@stack,["p",$tstart]);
3689                             $ans .= "<p>";
3690                         };
3691                         push(@stack,[$tt,$tstart,$autocloseflag,$clp]);
3692                         $reopenp = 0;
3693                     } elsif ($styp == 2) {
3694                         $reopenp && ($tt eq "p" || $tt eq "\20") and do {
3695                             $reopenp = 0;
3696                             next;
3697                         };
3698                         &$autoclose($tt, $autocloseflag);
3699                         my $mtstkchk = sub {
3700                             !@stack and _xmlfail("closing tag $tt without matching open at " .
3701                                 _linecol($tstart, $text));
3702                         };
3703                         &$mtstkchk;
3704                         if ($autocloseflag && $stack[$#stack]->[0] eq "\20") {
3705                             pop(@stack);
3706                             $stag = "";
3707                         } elsif ($stack[$#stack]->[0] eq $tt) {
3708                             $stack[$#stack]->[3] and $reopenp = 1;
3709                             pop(@stack);
3710                         } else {
3711                             pop(@stack) while @stack && $stack[$#stack]->[0] eq "\20";
3712                             &$mtstkchk;
3713                             my @i = @{$stack[$#stack]};
3714                             _xmlfail("opening tag $i[0] at " . _linecol($i[1], $text) .
3715                                 " mismatch with closing tag $tt at " . _linecol($tstart, $text));
3716                         }
3717                     }
3718                 }
3719                 $ans .= $stag;
3720                 next;
3721             } else {
3722                 $tag =~ s/^</&lt;/;
3723                 $ans .= _EncodeAmps($tag);
3724                 $lastmt = "";
3725                 next;
3726             }
3727         }
3728         # can only get here if "\G" char is an unmatched "<"
3729         pos($text) += 1;
3730         $ans .= "&lt;";
3731         $lastmt = "";
3732     }
3733     &$autoclose if $validate;
3734     if ($validate && @stack) {
3735         my @errs;
3736         my $j;
3737         for ($j = 0; $j <= $#stack; ++$j) {
3738                 my @i = @{$stack[$j]};
3739                 next if $i[0] eq "\20";
3740                 unshift(@errs, "opening tag $i[0] without matching close at " .
3741                             _linecol($i[1], $text));
3742         }
3743         _xmlfail(@errs) unless !@errs;
3744     }
3745     # Remove any unwanted extra leading <p></p> sections
3746     $ans =~ s{<p></\020>}{}gs if $validate;
3747
3748     return $ans."\n";
3749 }
3750
3751
3752 sub _linecol {
3753         my ($pos, $txt) = @_;
3754         pos($txt) = 0;
3755         my ($l, $p);
3756         $l = 1 + $opt{firstline};
3757         ++$l while ($p = pos($txt)), $txt =~ /\G[^\n]*\n/gc && pos($txt) <= $pos;
3758         return "line $l col " . (1 + ($pos - $p));
3759 }
3760
3761
3762 sub _xmlfail {
3763         die join("", map("$_\n", @_));
3764 }
3765
3766
3767 sub _Sanitize {
3768     my $tag = shift;
3769     my $seenatt = {};
3770     if ($tag =~ m{^</}) {
3771         my $autocloseflag = undef;
3772         $autocloseflag = 1, $tag="</p>" if $tag eq "</\20>";
3773         $tag =~ tr/\t\n\f\r //d; # remove whitespace
3774         return (lc($tag),2,$autocloseflag);
3775     }
3776     if ($tag =~ m{^<([^\s</>]+)\s+}gs) {
3777         my $tt = lc($1);
3778         my $autocloseflag = undef;
3779         $autocloseflag = 1, $tt="p" if $tt eq "\20";
3780         my $out = "<" . $tt . " ";
3781         my $ok = $tagatt{$tt};
3782         ref($ok) eq "HASH" or $ok = {};
3783         my $atc = 0;
3784         while ($tag =~ m{\G\s*([^\s\042\047</>=]+)((?>=)|\s*)}gcs) {
3785             my ($a,$s) = ($1, $2);
3786             if ($s eq "" && substr($tag, pos($tag), 1) =~ /^[\042\047]/) {
3787                 # pretend the "=" sign wasn't overlooked
3788                 $s = "=";
3789             }
3790             if (substr($s,0,1) ne "=") {
3791                 # it's one of "those" attributes (e.g. compact) or not
3792                 # _SanitizeAtt will fix it up if it is
3793                 $out .= _SanitizeAtt($a, '""', $ok, $seenatt, $tt);
3794                 ++$atc;
3795                 next;
3796             }
3797             if ($tag =~ /\G([\042\047])((?:(?!\1)(?!<).)*)\1\s*/gcs) {
3798                 $out .= _SanitizeAtt($a, $1.$2.$1, $ok, $seenatt, $tt);
3799                 ++$atc;
3800                 next;
3801             }
3802             if ($tag =~ m{\G([\042\047])((?:(?!\1)(?![<>])(?!/>).)*)}gcs) {
3803                 # what to do what to do what to do
3804                 # trim trailing \s+ and magically add the missing quote
3805                 my ($q, $v) = ($1, $2);
3806                 $v =~ s/\s+$//;
3807                 $out .= _SanitizeAtt($a, $q.$v.$q, $ok, $seenatt, $tt);
3808                 ++$atc;
3809                 next;
3810             }
3811             if ($tag =~ m{\G([^\s<>]+)\s*}gcs) {
3812                 # auto quote it
3813                 my $v = $1;
3814                 $v =~ s/\042/&quot;/go;
3815                 $out .= _SanitizeAtt($a, '"'.$v.'"', $ok, $seenatt, $tt);
3816                 ++$atc;
3817                 next;
3818             }
3819             # give it an empty value
3820             $out .= _SanitizeAtt($a, '""', $ok, $seenatt, $tt);
3821             ++$atc;
3822         }
3823         my $sfx = substr($tag, pos($tag));
3824         $out =~ s/\s+$//;
3825         my $typ = 1;
3826         if ($tagmt{$tt}) {
3827             $typ = ($sfx =~ m,/>$,) ? 3 : -3;
3828             $out .= $opt{empty_element_suffix};
3829             return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3830         } else {
3831             if ($sfx =~ m,/>$,) {
3832                 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3833                 $typ = 3;
3834             } else {
3835                 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3836             }
3837             $out .= ">";
3838             $out .= "</$tt>" if $typ == 3;
3839         }
3840         return ($out,$typ,$autocloseflag);
3841     } elsif ($tag =~ /^<([^\s<\/>]+)/s) {
3842         my $tt = lc($1);
3843         return ("&lt;" . substr($tag,1), 0) if $taga1p{$tt};
3844         if ($tagmt{$tt}) {
3845             my $typ = ($tag =~ m,/>$,) ? 3 : -3;
3846             return ("<" . $tt . $opt{empty_element_suffix}, $typ);
3847         } elsif ($tag =~ m,/>$,) {
3848             return ("<" . $tt . "></" . $tt . ">", 3);
3849         } else {
3850             return ("<" . $tt . ">", 1) unless $tt eq "\20";
3851             return ("<p>", 1, 1);
3852         }
3853     }
3854     return (lc($tag),0);
3855 }
3856
3857
3858 sub _SanitizeAtt {
3859     my $att = lc($_[0]);
3860     return "" unless $att =~ /^[_a-z:][_a-z:0-9.-]*$/; # no weirdo char att names
3861     return "" unless $univatt{$att} || $_[2]->{$att};
3862     return "" if $_[3]->{$att}; # no repeats
3863     $_[3]->{$att} = 1;
3864     $impatt{$att} and return $att."=".'"'.$att.'" ';
3865     (($_[4] eq "a" && $att eq "href") ||
3866      ($_[4] eq "img" && $att eq "src")) &&
3867     $_[1] =~ /^\s*[\047\042]\s*javascript:/io and do {
3868         $_[1] = '"#"';
3869         ref($opt{base_prefix}) eq 'CODE' and
3870             $_[1] = '"' . escapeXML(&{$opt{base_prefix}}("#")) . '"';
3871     };
3872     if ($_[4] eq "a") {
3873         $att eq "target" and
3874             return $_[1] =~ /^([\042\047])\s*_blank\s*\1$/io ? 'target="_blank" ' : "";
3875         $att eq "rel" and
3876             return $_[1] =~ /^([\042\047])\s*nofollow\s*\1$/io ? 'rel="nofollow" ' : "";
3877     }
3878     if ($lcattval{$att}) {
3879         return $att."="._SanitizeAttValue(lc($_[1]))." ";
3880     } else {
3881         my $satt = _SanitizeAttValue($_[1]);
3882         if (ref($opt{urlfunc}) eq 'CODE' &&
3883             (($_[4] eq "a" && $att eq "href") ||
3884              ($_[4] eq "img" && $att eq "src")) ) {
3885                 my ($lq,$v,$rq);
3886                 $lq = substr($satt, 0, 1);
3887                 $rq = substr($satt, -1, 1);
3888                 $v = unescapeXML(substr($satt, 1, length($satt)-2));
3889                 my ($uhost, $upath, $uq, $uf) = SplitURL($v);
3890                 $v = &{$opt{urlfunc}}($v, \%opt, $_[4], $uhost, $upath, $uq, $uf);
3891                 $satt = $lq . escapeXML($v) . $rq;
3892         }
3893         return $att."=".$satt." ";
3894     }
3895 }
3896
3897
3898 sub _SanitizeAttValue {
3899     my $v = shift;
3900     if ($v =~ /^([\042\047])(.*?)\1$/s) {
3901         return $1.escapeXML($2).$1;
3902     } else {
3903         return '"'.escapeXML($v).'"';
3904     }
3905 }
3906
3907
3908 sub _ProcessURLTag {
3909     my ($att, $tag, $dofrag) = @_;
3910
3911     $att = lc($att) . "=";
3912     if ($tag =~ /^(<[^\s>]+\s+)/g) {
3913         my $out = $1;
3914         while ($tag =~ /\G([^\s\042\047<\/>=]+=)([\042\047])((?:(?!\2)(?!<).)*)(\2\s*)/gcs) {
3915             my ($p, $q, $v, $s) = ($1, $2, $3, $4);
3916             if (lc($p) eq $att && $v ne "") {
3917                 if ($dofrag && $v =~ m"^#") {
3918                     $v = _FindFragmentMatch($v);
3919                     my $bpcr;
3920                     if (ref($bpcr = $opt{base_prefix}) eq 'CODE') {
3921                         $v = "\2\3" . &$bpcr($v);
3922                     }
3923                 } else {
3924                     $v = _PrefixURL($v);
3925                 }
3926                 $v = _EncodeAttText($v);
3927             }
3928             $out .= $p . $q . $v . $s;
3929         }
3930         $out .= substr($tag, pos($tag));
3931         substr($out,0,1) = $g_escape_table{'<'};
3932         substr($out,-1,1) = $g_escape_table{'>'};
3933         return $out;
3934     }
3935
3936     return $tag;
3937 }
3938
3939
3940 my $oops_entities;
3941 BEGIN { $oops_entities = qr/(?:lt|gt|amp|quot|apos|nbsp)/io; }
3942
3943
3944 # $_[0] => the value to XML escape
3945 # returns the XML escaped value
3946 # Encodes the five required entites (amp,lt,gt,quot,apos)
3947 # while preserving any pre-existing entities which means that
3948 # calling this repeatedly on already-escaped text should return
3949 # it unchanged (i.e. it's idempotent).
3950 #
3951 sub escapeXML {
3952     my $text = shift;
3953
3954     # Treat these accidents as though they had the needed ';'
3955     $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3956
3957     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3958     #   http://bumppo.net/projects/amputator/
3959     $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
3960
3961     # Remaining entities now
3962     $text =~ s/\042/&quot;/g;
3963     $text =~ s/\047/&#39;/g; # Some older browsers do not grok &apos;
3964     $text =~ s/</&lt;/g;
3965     $text =~ s/>/&gt;/g;
3966
3967     return $text;
3968 }
3969
3970
3971 # $_[0] => value to be unescaped
3972 # returns unescaped value
3973 # The five required XML entities (amp,lt,gt,quot,apos) plus nbsp
3974 # are decoded as well as decimal (#d+) and hexadecimal (#xh+).
3975 #
3976 # While the escapeXML function tries to be idempotent when presented
3977 # with an already-escaped string, this function is NOT necessarily
3978 # idempotent when presented with an already decoded string unless it's
3979 # been decoded to the point there are no more recognizable entities left.
3980 # In other words given a string such as:
3981 #
3982 #   &amp;amp;amp;amp;
3983 #
3984 # Each call will only decode one layer of escaping and it will take four
3985 # successive calls to finally end up with just "&".
3986 #
3987 sub unescapeXML {
3988     my $text = shift;
3989
3990     # Treat these accidents as though they had the needed ';'
3991     $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3992
3993     $text =~ s/&[qQ][uU][oO][tT];/\042/gso;
3994     $text =~ s/&[aA][pP][oO][sS];/\047/gso;
3995     $text =~ s/&[gG][tT];/>/gso;
3996     $text =~ s/&[lL][tT];/</gso;
3997     $text =~ s/&[nN][bB][sS][pP];/&#160;/gso;
3998     $text =~ s{&([aA][mM][pP]|\#\d+|\#x[0-9a-fA-F]+);}{
3999         local $_=$1;
4000         lc($_) eq 'amp' ? '&' :
4001         /^#(\d+)$/ ? chr($1) :
4002         /^#[xX](.*)$/ ? chr(hex($1)) :
4003         $_
4004     }gsex;
4005
4006     return $text;
4007 }
4008
4009
4010 # $_[0] => the input URL to split
4011 # $_[1] => if a true value, call unescapeXML before splitting
4012 # returns array:
4013 #  [0] => scheme, name+password, host, port ("" if not present)
4014 #  [1] => path in url (starts with "/" if absolute otherwise relative)
4015 #  [2] => query string ("" if not present otherwise starts with "?")
4016 #  [3] => fragment ("" if not present otherwise starts with "#")
4017 # The returned value recovers the (possibly unescapeXML'd) input
4018 # string by simply concatenating the returned array elements.
4019 #
4020 sub SplitURL {
4021         my ($url, $unesc) = @_;
4022         $unesc and $url = unescapeXML($url);
4023         my ($sh, $p, $q, $f) = ("", "", "", "");
4024         if ($url =~ m{^([A-Za-z][A-Za-z0-9.+-]*:)(//.*)$}os) {
4025                 $sh = $1;
4026                 $url = $2;
4027         }
4028         if ($url =~ m{^(//[^/?#]*)((?:[/?#].*)?)$}os) {
4029                 $sh .= $1;
4030                 $url = $2;
4031         }
4032         ($p, $q, $f) = $url =~ m{^([^?#]*)((?:[?][^#]*)?)((?:[#].*)?)$}os;
4033         return ($sh, $p, $q, $f);
4034 }
4035
4036
4037 my $_replacesub;
4038 BEGIN { $_replacesub = sub {
4039     my $x = $named_character_entity{$_[1]};
4040     $x ? '&#'.$x.';' : $_[0];
4041 } }
4042
4043
4044 # $_[0] => the input text to process
4045 # returns text with all known named character entities replaced
4046 # with their equivalent numerical entity
4047 sub ConvertNamedCharacterEntities {
4048     use bytes;
4049     my $text = shift;
4050     defined($text) or return undef;
4051     $text =~ s/(&([A-Za-z]{3,8}[1-4]{0,2});)/&$_replacesub($1,$2)/goes;
4052     return $text;
4053 }
4054
4055
4056 my $_usasciisub;
4057 BEGIN { $_usasciisub = sub {
4058     my $c = $_[0];
4059     my $o = ord($c);
4060     return ($o <= 999) ? (($o < 128) ? $c : "&#$o;") : sprintf("&#x%x;", $o);
4061 } }
4062
4063
4064 # $_[0] => the input text to process
4065 # returns text with non-US-ASCII characters replaced
4066 # with their equivalent numerical character entities,
4067 # but only if the input text has already been utf8::decode'd
4068 sub ConvertToASCII {
4069     my $text = shift;
4070     defined($text) or return undef;
4071     $text =~ s/([^\x00-\x7F])/&$_usasciisub($1)/goes;
4072     return $text;
4073 }
4074
4075
4076 sub _EncodeAmps {
4077     my $text = shift;
4078
4079     # Treat these accidents as though they had the needed ';'
4080     $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
4081
4082     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
4083     #   http://bumppo.net/projects/amputator/
4084     $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
4085
4086     return $text;
4087 }
4088
4089
4090 sub _EncodeAmpsAndAngles {
4091 # Smart processing for ampersands and angle brackets that need to be encoded.
4092
4093     my $text = shift;
4094
4095     # Treat these accidents as though they had the needed ';'
4096     $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
4097
4098     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
4099     #   http://bumppo.net/projects/amputator/
4100     $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
4101
4102     # Encode naked <'s
4103     $text =~ s{<(?![\020a-z/?\$!])}{&lt;}gi;
4104     $text =~ s{<(?=[^>]*$)}{&lt;}g;
4105
4106     # Encode <'s that cannot possibly be a start or end tag
4107     $text =~ s{(<[^>]*>)}{_DoTag($1)}ige;
4108
4109     return $text;
4110 }
4111
4112
4113 sub _EncodeBackslashEscapes {
4114 #
4115 # Parameter: String.
4116 # Returns:   String after processing the following backslash escape sequences.
4117 #
4118     local $_ = shift;
4119
4120     s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
4121     s{\\([`*_~{}\[\]()>#+\-.!`])}{$g_escape_table{$1}}go;
4122
4123     return $_;
4124 }
4125
4126
4127 sub _DoAutoLinks {
4128     local $_ = shift;
4129
4130     s{<((https?|ftps?):[^'\042>\s]+)>(?!\s*</a>)}{_MakeATag($1, "&lt;".$1."&gt;")}gise;
4131
4132     # Email addresses: <address@domain.foo>
4133     s{
4134         <
4135         (?:mailto:)?
4136         (
4137             [-.\w]+
4138             \@
4139             [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
4140         )
4141         >
4142     }{
4143         _EncodeEmailAddress(_UnescapeSpecialChars($1), "&#x3c;", "&#62;");
4144     }egix;
4145
4146     # (kjm) I don't do "x" patterns
4147     s{(?:^|(?<=\s))((?:https?|ftps?)://(?:[-a-zA-Z0-9./?\&\%=_~!*;:\@+\$,\x23](?:(?<![.,:;])|(?=[^\s])))+)}
4148      {_MakeATag($1, $1)}soge;
4149     s{(?<![][])(?<!\] )\[RFC( ?)([0-9]{1,5})\](?![][])(?! \[)}
4150      {"["._MakeATag("https://tools.ietf.org/html/rfc$2", "RFC$1$2", "RFC $2")."]"}soge;
4151
4152     return $_;
4153 }
4154
4155
4156 sub _EncodeEmailAddress {
4157 #
4158 # Input: an email address, e.g. "foo@example.com"
4159 #
4160 # Output: the email address as a mailto link, with each character
4161 #         of the address encoded as either a decimal or hex entity, in
4162 #         the hopes of foiling most address harvesting spam bots. E.g.:
4163 #
4164 #   <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
4165 #   x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
4166 #   &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
4167 #
4168 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
4169 # mailing list: <https://tinyurl.com/yu7ue>
4170 #
4171
4172     my ($addr, $prefix, $suffix) = @_;
4173     $prefix = "" unless defined($prefix);
4174     $suffix = "" unless defined($suffix);
4175
4176     srand(unpack('N',md5($addr)));
4177     my @encode = (
4178         sub { '&#' .                 ord(shift)   . ';' },
4179         sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
4180         sub {                            shift          },
4181     );
4182
4183     $addr = "mailto:" . $addr;
4184
4185     $addr =~ s{(.)}{
4186         my $char = $1;
4187         if ( $char eq '@' ) {
4188             # this *must* be encoded. I insist.
4189             $char = $encode[int rand 1]->($char);
4190         } elsif ( $char ne ':' ) {
4191             # leave ':' alone (to spot mailto: later)
4192             my $r = rand;
4193             # roughly 10% raw, 45% hex, 45% dec
4194             $char = (
4195                 $r > .9   ?  $encode[2]->($char)  :
4196                 $r < .45  ?  $encode[1]->($char)  :
4197                              $encode[0]->($char)
4198             );
4199         }
4200         $char;
4201     }gex;
4202
4203     # strip the mailto: from the visible part
4204     (my $bareaddr = $addr) =~ s/^.+?://;
4205     $addr = _MakeATag("$addr", $prefix.$bareaddr.$suffix);
4206
4207     return $addr;
4208 }
4209
4210
4211 sub _UnescapeSpecialChars {
4212 #
4213 # Swap back in all the special characters we've hidden.
4214 #
4215     my $text = shift;
4216
4217     while( my($char, $hash) = each(%g_escape_table) ) {
4218         $text =~ s/$hash/$char/g;
4219     }
4220     return $text;
4221 }
4222
4223
4224 sub _TokenizeHTML {
4225 #
4226 # Parameter: String containing HTML markup.
4227 # Returns:   Reference to an array of the tokens comprising the input
4228 #            string. Each token is either a tag (possibly with nested,
4229 #            tags contained therein, such as <a href="<MTFoo>">, or a
4230 #            run of text between tags. Each element of the array is a
4231 #            two-element array; the first is either 'tag' or 'text';
4232 #            the second is the actual value.
4233 #
4234 #
4235 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
4236 #   <https://web.archive.org/web/20041215155739/http://bradchoate.com/weblog/2002/07/27/mtregex>
4237 #
4238
4239     my $str = shift;
4240     my $pos = 0;
4241     my $len = length $str;
4242     my @tokens;
4243
4244     my $depth = 6;
4245     my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
4246     my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
4247                    (?s: <\? .*? \?> ) |             # processing instruction
4248                    $nested_tags/iox;                # nested tags
4249
4250     while ($str =~ m/($match)/g) {
4251         my $whole_tag = $1;
4252         my $sec_start = pos $str;
4253         my $tag_start = $sec_start - length $whole_tag;
4254         if ($pos < $tag_start) {
4255             push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
4256         }
4257         push @tokens, ['tag', $whole_tag];
4258         $pos = pos $str;
4259     }
4260     push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
4261     \@tokens;
4262 }
4263
4264
4265 sub _Outdent {
4266 #
4267 # Remove one level of line-leading indent_width of spaces
4268 #
4269     my $text = shift;
4270
4271     $text =~ s/^ {1,$opt{indent_width}}//gm;
4272     return $text;
4273 }
4274
4275
4276 # _DeTab
4277 #
4278 # $1 => input text
4279 # $2 => optional tab width (default is $opt{tab_width})
4280 # $3 => leading spaces to strip off each line first (default is 0 aka none)
4281 # <= result with tabs expanded
4282 sub _DeTab {
4283     my $text = shift;
4284     my $ts = shift || $opt{tab_width};
4285     my $leadsp = shift || 0;
4286     my $spr = qr/^ {1,$leadsp}/ if $leadsp;
4287     pos($text) = 0;
4288     my $end = length($text);
4289     my $ans = "";
4290     while (pos($text) < $end) {
4291         my $line;
4292         if ($text =~ /\G(.*?\n)/gcs) {
4293             $line = $1;
4294         } else {
4295             $line = substr($text, pos($text));
4296             pos($text) = $end;
4297         }
4298         $line =~ s/$spr// if $leadsp;
4299         # From the Perl camel book section "Fluent Perl" but modified a bit
4300         $line =~ s/(.*?)(\t+)/$1 . ' ' x (length($2) * $ts - length($1) % $ts)/ges;
4301         $ans .= $line;
4302     }
4303     return $ans;
4304 }
4305
4306
4307 sub _PrefixURL {
4308 #
4309 # Add URL prefix if needed
4310 #
4311     my $url = shift;
4312     $url =~ s/^\s+//;
4313     $url =~ s/\s+$//;
4314     $url = "#" unless $url ne "";
4315
4316     return $url unless
4317         ref($opt{abs_prefix}) eq 'CODE' ||
4318         ref($opt{url_prefix}) eq 'CODE' ||
4319         ref($opt{img_prefix}) eq 'CODE' ;
4320     return $url if $url =~ m"^\002\003" || $url =~ m"^#" || $url =~ m,^//,;
4321     $url = &{$opt{abs_prefix}}($url) if $url =~ m,^/, && ref($opt{abs_prefix}) eq 'CODE';
4322     return $url if $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/ || $url =~ m,^//, ||
4323         ($opt{keepabs} && $url =~ m,^/,);
4324     my $cr = $opt{url_prefix};
4325     $cr = $opt{img_prefix}
4326         if ref($opt{img_prefix}) eq 'CODE' && $url =~ m"^[^#?]*\.(?:png|gif|jpe?g|svgz?)(?:[#?]|$)"i;
4327     return $url unless ref($cr) eq 'CODE';
4328     return "\2\3".&$cr(substr($url, 0, 1) eq '/' ? substr($url, 1) : $url);
4329 }
4330
4331
4332 BEGIN {
4333     $g_style_sheet = <<'STYLESHEET';
4334
4335 <style type="text/css">
4336 /* <![CDATA[ */
4337
4338 /* Markdown.pl fancy style sheet
4339 ** Copyright (C) 2017,2018,2019,2020,2021 Kyle J. McKay.
4340 ** All rights reserved.
4341 **
4342 ** Redistribution and use in source and binary forms, with or without
4343 ** modification, are permitted provided that the following conditions are met:
4344 **
4345 **   1. Redistributions of source code must retain the above copyright notice,
4346 **      this list of conditions and the following disclaimer.
4347 **
4348 **   2. Redistributions in binary form must reproduce the above copyright
4349 **      notice, this list of conditions and the following disclaimer in the
4350 **      documentation and/or other materials provided with the distribution.
4351 **
4352 **   3. Neither the name of the copyright holder nor the names of its
4353 **      contributors may be used to endorse or promote products derived from
4354 **      this software without specific prior written permission.
4355 **
4356 ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
4357 ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4358 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4359 ** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
4360 ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
4361 ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
4362 ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
4363 ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
4364 ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
4365 ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
4366 ** POSSIBILITY OF SUCH DAMAGE.
4367 */
4368
4369 div.%(base)code-bt > pre, div.%(base)code > pre {
4370         margin: 0;
4371         padding: 0;
4372         overflow: auto;
4373 }
4374
4375 div.%(base)code-bt > pre > code, div.%(base)code > pre > code {
4376         display: inline-block;
4377         margin: 0;
4378         padding: 0.5em 0;
4379         border-top: thin dotted;
4380         border-bottom: thin dotted;
4381 }
4382
4383 table.%(base)table {
4384         margin-bottom: 0.5em;
4385 }
4386 table.%(base)table, table.%(base)table th, table.%(base)table td {
4387         border-collapse: collapse;
4388         border-spacing: 0;
4389         border: thin solid;
4390 }
4391 table.%(base)yaml-table {
4392         border-collapse: collapse;
4393 }
4394 table.%(base)yaml-table * {
4395         border: thin solid;
4396 }
4397 table.%(base)yaml-table th {
4398         text-align: center;
4399 }
4400 table.%(base)yaml-table th, table.%(base)yaml-table td {
4401         padding-left: 0.5ex;
4402         padding-right: 0.5ex;
4403 }
4404
4405 ol.%(base)ol {
4406         counter-reset: %(base)item;
4407 }
4408 ol.%(base)ol[start="0"] {
4409         counter-reset: %(base)item -1;
4410 }
4411 ol.%(base)ol[start="5"] {
4412         counter-reset: %(base)item 4;
4413 }
4414 ol.%(base)ol[start="10"] {
4415         counter-reset: %(base)item 9;
4416 }
4417 ol.%(base)ol > span.%(base)ol-incr {
4418         counter-increment: %(base)item;
4419 }
4420 ol.%(base)ol > span.%(base)ol-incr-2 {
4421         counter-increment: %(base)item 2;
4422 }
4423 ol.%(base)ol > span.%(base)ol-incr-5 {
4424         counter-increment: %(base)item 5;
4425 }
4426 ol.%(base)ol > span.%(base)ol-incr-10 {
4427         counter-increment: %(base)item 10;
4428 }
4429 ol.%(base)lc-greek, li.%(base)lc-greek {
4430         list-style-type: lower-greek;
4431 }
4432 ol.%(base)ol > li {
4433         counter-increment: %(base)item;
4434 }
4435 ol.%(base)ol > li.%(base)li,
4436 ol.%(base)ol > li.%(base)li-lc,
4437 ol.%(base)ol > li.%(base)li-lc-greek,
4438 ol.%(base)ol > li.%(base)li-uc {
4439         list-style-type: none;
4440         display: block;
4441 }
4442 ol.%(base)ol > li.%(base)li:before,
4443 ol.%(base)ol > li.%(base)li-lc:before,
4444 ol.%(base)ol > li.%(base)li-lc-greek:before,
4445 ol.%(base)ol > li.%(base)li-uc:before {
4446         position: absolute;
4447         text-align: right;
4448         white-space: nowrap;
4449         margin-left: -9ex;
4450         width: 9ex;
4451 }
4452 ol.%(base)ol > li.%(base)li[type="1"]:before {
4453         content: counter(%(base)item, decimal) ")\A0 \A0 ";
4454 }
4455 ol.%(base)ol > li.%(base)li-lc[type="i"]:before,
4456 ol.%(base)ol > li.%(base)li-lc[type="I"]:before {
4457         content: counter(%(base)item, lower-roman) ")\A0 \A0 ";
4458 }
4459 ol.%(base)ol > li.%(base)li-uc[type="I"]:before,
4460 ol.%(base)ol > li.%(base)li-uc[type="i"]:before {
4461         content: counter(%(base)item, upper-roman) ")\A0 \A0 ";
4462 }
4463 ol.%(base)ol > li.%(base)li-lc[type="a"]:before,
4464 ol.%(base)ol > li.%(base)li-lc[type="A"]:before {
4465         content: counter(%(base)item, lower-alpha) ")\A0 \A0 ";
4466 }
4467 ol.%(base)ol > li.%(base)li-lc-greek[type="a"]:before,
4468 ol.%(base)ol > li.%(base)li-lc-greek[type="A"]:before {
4469         content: counter(%(base)item, lower-greek) ")\A0 \A0 ";
4470 }
4471 ol.%(base)ol > li.%(base)li-uc[type="A"]:before,
4472 ol.%(base)ol > li.%(base)li-uc[type="a"]:before {
4473         content: counter(%(base)item, upper-alpha) ")\A0 \A0 ";
4474 }
4475
4476 li.%(base)checkbox-on,
4477 li.%(base)checkbox-off {
4478         list-style-type: none;
4479         display: block;
4480 }
4481 li.%(base)checkbox-on > span:first-child + span + span,
4482 li.%(base)checkbox-off > span:first-child + span + span {
4483         position: absolute;
4484         clip: rect(0,0,0,0);
4485 }
4486 li.%(base)checkbox-on > span:first-child,
4487 li.%(base)checkbox-off > span:first-child,
4488 li.%(base)checkbox-on > span:first-child + span,
4489 li.%(base)checkbox-off > span:first-child + span {
4490         display: block;
4491         position: absolute;
4492         margin-left: -3ex;
4493         width: 1em;
4494         height: 1em;
4495 }
4496 li.%(base)checkbox-on > span:first-child > span:first-child,
4497 li.%(base)checkbox-off > span:first-child > span:first-child {
4498         display: block;
4499         position: absolute;
4500         left: 0.75pt; top: 0.75pt; right: 0.75pt; bottom: 0.75pt;
4501 }
4502 li.%(base)checkbox-on > span:first-child > span:first-child:before,
4503 li.%(base)checkbox-off > span:first-child > span:first-child:before {
4504         display: inline-block;
4505         position: relative;
4506         right: 1pt;
4507         width: 100%;
4508         height: 100%;
4509         border: 1pt solid;
4510         content: "";
4511 }
4512 li.%(base)checkbox-on > span:first-child + span:before {
4513         position: relative;
4514         left: 2pt;
4515         bottom: 1pt;
4516         font-size: 125%;
4517         line-height: 80%;
4518         vertical-align: text-top;
4519         content: "\2713";
4520 }
4521
4522 /* ]]> */
4523 </style>
4524
4525 STYLESHEET
4526     $g_style_sheet =~ s/^\s+//g;
4527     $g_style_sheet =~ s/\s+$//g;
4528     $g_style_sheet .= "\n";
4529 }
4530
4531 1;
4532
4533 __DATA__
4534
4535 =head1 NAME
4536
4537 Markdown.pl - convert Markdown format text files to HTML
4538
4539 =head1 SYNOPSIS
4540
4541 B<Markdown.pl> [B<--help>] [B<--html4tags>] [B<--htmlroot>=I<prefix>]
4542     [B<--imageroot>=I<prefix>] [B<--version>] [B<--shortversion>]
4543     [B<--tabwidth>=I<num>] [B<--stylesheet>] [B<--stub>] [--]
4544     [I<file>...]
4545
4546  Options:
4547    -h                                   show short usage help
4548    --help                               show long detailed help
4549    --html4tags                          use <br> instead of <br />
4550    --deprecated                         allow <dir> and <menu> tags
4551    --sanitize                           sanitize tag attributes
4552    --no-sanitize                        do not sanitize tag attributes
4553    --validate-xml                       check if output is valid XML
4554    --validate-xml-internal              fast basic check if output is valid XML
4555    --no-validate-xml                    do not check output for valid XML
4556    --strip-comments                     remove XML-like comments from output
4557    --strip-comments-lax                 remove XML-like comments from output
4558    --strip-comments-strict              remove only strictly valid XML comments
4559    --strip-comments-lax-only            remove only invalid XML-like comments
4560    --no-strip-comments                  do not remove any XML/XML-like comments
4561    --tabwidth=num                       expand tabs to num instead of 8
4562    --auto-number                        automatically number h1-h6 headers
4563    -k | --keep-abs                      keep abspath URLs despite -r/-i
4564    -a prefix | --absroot=prefix         append abspath URLs to prefix
4565    -b prefix | --base=prefix            prepend prefix to fragment-only URLs
4566    -r prefix | --htmlroot=prefix        append relative non-img URLs to prefix
4567    -i prefix | --imageroot=prefix       append relative img URLs to prefix
4568    -w [wikipat] | --wiki[=wikipat]      activate wiki links using wikipat
4569    --yaml[=(enable|disable|strip|...)]  select YAML front matter processing
4570    -V | --version                       show version, authors, license
4571                                         and copyright
4572    -s | --shortversion                  show just the version number
4573    --raw | --raw-xml                    input contains only raw xhtml
4574    --raw-html                           input contains only raw html
4575    --div[=id]                           wrap body in div with given id
4576    --stylesheet                         output the fancy style sheet
4577    --no-stylesheet                      do not output fancy style sheet
4578    --keep-named-character-entities      do not convert named character entities
4579    --us-ascii                           convert non-ASCII to character entities
4580    --stub                               wrap output in stub document
4581                                         implies --stylesheet
4582    --                                   end options and treat next
4583                                         argument as file
4584
4585 =head1 DESCRIPTION
4586
4587 Markdown is a text-to-HTML filter; it translates an easy-to-read /
4588 easy-to-write structured text format into HTML. Markdown's text format
4589 is most similar to that of plain text email, and supports features such
4590 as headers, *emphasis*, code blocks, blockquotes, and links.
4591
4592 Markdown's syntax is designed not as a generic markup language, but
4593 specifically to serve as a front-end to (X)HTML. You can  use span-level
4594 HTML tags anywhere in a Markdown document, and you can use block level
4595 HTML tags (like <div> and <table> as well).
4596
4597 For more information about Markdown's syntax, see the F<basics.md>
4598 and F<syntax.md> files included with F<Markdown.pl>.
4599
4600 Input (auto-detected) may be either ISO-8859-1 or UTF-8.  Output is always
4601 converted to the UTF-8 character set.
4602
4603
4604 =head1 OPTIONS
4605
4606 Use "--" to end switch parsing. For example, to open a file named "-z", use:
4607
4608     Markdown.pl -- -z
4609
4610 =over
4611
4612
4613 =item B<--html4tags>
4614
4615 Use HTML 4 style for empty element tags, e.g.:
4616
4617     <br>
4618
4619 instead of Markdown's default XHTML style tags, e.g.:
4620
4621     <br />
4622
4623 This option is I<NOT compatible> with the B<--validate-xml> option
4624 and will produce an immediate error if both are given.
4625
4626
4627 =item B<--deprecated>
4628
4629 Both "<dir>" and "<menu>" are normally taken as literal text and the leading
4630 "<" will be automatically escaped.
4631
4632 If this option is used, they are recognized as valid tags and passed through
4633 without being escaped.
4634
4635 When dealing with program argument descriptions "<dir>" can be particularly
4636 problematic therefore use of this option is not recommended.
4637
4638 Other deprecated tags (such as "<font>" and "<center>" for example) continue
4639 to be recognized and passed through even without using this option.
4640
4641
4642 =item B<--sanitize>
4643
4644 Removes troublesome tag attributes from embedded tags.  Only a very strictly
4645 limited set of tag attributes will be permitted, other attributes will be
4646 silently discarded.  The set of allowed attributes varies by tag.
4647
4648 Splits empty minimized elements that are not one of the HTML allowed empty
4649 elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate begin
4650 and end tags.  For example, C<< <p/> >> or C<< <p /> >> will be split into
4651 C<< <p></p> >>.
4652
4653 Combines adjacent (whitespace separated only) opening and closing tags for
4654 the same HTML empty element into a single minimized tag.  For example,
4655 C<< <br></br> >> will become C<< <br /> >>.
4656
4657 Tags that require at least one attribute to be present to be meaningful
4658 (e.g. C<a>, C<area>, C<img>, C<map>) but have none will be treated as non-tags
4659 potentially creating unexpected errors.  For example, the sequence
4660 C<< <a>text here</a> >> will be sanitized to C<< &lt;a>text here</a> >> since
4661 an C<a> tag without any attributes is meaningless, but then the trailing
4662 close tag C<< </a> >> will become an error because it has no matching open
4663 C<< <a ...> >> tag.
4664
4665 The point of this check is not to cause undue frustration, but to allow
4666 such constructs to be used as text without the need for escaping since they
4667 are meaningless as tags.  For example, C<< <a><c><e> >> works just fine
4668 as plain text and so does C<< <A><C><E> >> because the
4669 C<< <a> >>/C<< <A> >> will be treated as a non-tag automatically.  In fact,
4670 they can even appear inside links too such as
4671 C<< <a href="#somewhere">Link to <a><c><e> article</a> >>.
4672
4673 Problematic C<&> characters are fixed up such as standalone C<&>s (or those not
4674 part of a valid entity reference) are turned into C<&amp;>.  Within attribute
4675 values, single and double quotes are turned into C<&> entity refs.
4676
4677 This is enabled by default.
4678
4679
4680 =item B<--no-sanitize>
4681
4682 Do not sanitize tag attributes.  This option does not allow any tags that would
4683 not be allowed without this option, but it does completely suppress the
4684 attribute sanitation process.   If this option is specified, no attributes will
4685 be removed from any tag (although C<img> and C<a> tags will still be affected
4686 by B<--imageroot>, B<--htmlroot>, B<--absroot> and/or B<--base> options).
4687 Use of this option is I<NOT RECOMMENDED>.
4688
4689
4690 =item B<--validate-xml>
4691
4692 Perform XML validation on the output before it's output and die if
4693 it fails validation.  This requires the C<XML::Simple> or C<XML::Parser>
4694 module be present (one is only required if this option is given).
4695
4696 Any errors are reported to STDERR and the exit status will be
4697 non-zero on XML validation failure.  Note that all line and column
4698 numbers in the error output refer to the entire output that would
4699 have been produced.  Re-run with B<--no-validate-xml> to see what's
4700 actually present at those line and column positions.
4701
4702 If the B<--stub> option has also been given, then the entire output is
4703 validated as-is.  Without the B<--stub> option, the output will be wrapped
4704 in C<< <div>...</div> >> for validation purposes but that extra "div" added
4705 for validation will not be added to the final output.
4706
4707 This option is I<NOT enabled by default>.
4708
4709 This option is I<NOT compatible> with the B<--html4tags> option and will
4710 produce an immediate error if both are given.
4711
4712
4713 =item B<--validate-xml-internal>
4714
4715 Perform XML validation on the output before it's output and die if
4716 it fails validation.  This uses a simple internal consistency checker
4717 that finds unmatched and mismatched open/close tags.
4718
4719 Non-empty elements that in HTML have optional closing tags (C<colgroup>
4720 C<dd> C<dt> C<li> C<p> C<tbody> C<td> C<tfoot> C<th> C<thead> C<tr>)
4721 will automatically have any omitted end tags inserted during the
4722 `--validate-xml-internal` process.
4723
4724 Any errors are reported to STDERR and the exit status will be
4725 non-zero on XML validation failure.  Note that all line and column
4726 numbers in the error output refer to the entire output that would
4727 have been produced before sanitization without any B<--stub> or
4728 B<--stylesheet> options.  Re-run with B<--no-sanitize> and
4729 B<--no-validate-xml> and I<without> any B<--stub> or B<--stylesheet>
4730 options to see what's actually present at those line and column
4731 positions.
4732
4733 This option validates the output I<prior to> adding any requested
4734 B<--stub> or B<--stylesheet>.  As the built-in stub and stylesheet
4735 have already been validated that speeds things up.  The output is
4736 I<NOT> wrapped (in a C<< <div>...</div> >>) for validation as that's
4737 not required for the internal checker.
4738
4739 This option is I<IS enabled by default> unless B<--no-sanitize> is
4740 active.
4741
4742 This option is I<IS compatible> with the B<--html4tags> option.
4743
4744 This option requires the B<--sanitize> option and will produce an
4745 immediate error if both B<--no-sanitize> and B<--validate-xml-internal>
4746 are given.
4747
4748 Note that B<--validate-xml-internal> is I<MUCH faster> than
4749 B<--validate-xml> and I<does NOT> require any extra XML modules to
4750 be present.
4751
4752
4753 =item B<--no-validate-xml>
4754
4755 Do not perform XML validation on the output.  Markdown.pl itself will
4756 normally generate valid XML sequences (unless B<--html4tags> has been
4757 used).  However, any raw tags in the input (that are on the "approved"
4758 list), could potentially result in invalid XML output (i.e. mismatched
4759 start and end tags, missing start or end tag etc.).
4760
4761 Markdown.pl can check for these issues itself using its own internal
4762 B<--validate-xml-internal> check or, with the B<--validate-xml>
4763 option, it can use C<XML::Simple> or C<XML::Parser> to do so.
4764
4765 Note that B<--validate-xml-internal> is the default option unless
4766 B<--no-sanitize> is used in which case B<--no-validate-xml> is the
4767 default option.
4768
4769
4770 =item B<--strip-comments>/B<--strip-comments-lax>
4771
4772 (N.B. B<--strip-comments> is just a short form of B<--strip-comments-lax>)
4773
4774 Strip XML and XML-like comments from the output.  Any XML or XML-like
4775 comments encountered will be omitted from the output if either of these
4776 options is given.
4777
4778 Unlike the B<--strip-comments-strict> option, these options I<will>
4779 strip any XML-like comments that contain internal double hyphen
4780 (i.e. C<-->) sequences.
4781
4782 This option requires the B<--sanitize> option to be used (which is
4783 the default).
4784
4785 If either of these options is given, it will supersede any previous
4786 B<--strip-comments-strict>, B<--strip-comments-lax-only> or
4787 B<--no-strip-comments> options.
4788
4789
4790 =item B<--strip-comments-strict>
4791
4792 Strip only strictly XML standard compliant comments from the output.
4793
4794 Note that the XML standard section 2.5 specifically prohibits
4795 a C<--> sequence within an XML comment (i.e. C<--> cannot occur after
4796 the comment start tag C<< <!-- >> unless it is immediately followed
4797 by C<< > >> which makes it the comment end tag C<< --> >>).
4798
4799 In other words, S<C<< <!-- --> >>>, S<C<< <!-- - --> >>>, S<C<< <!----> >>>,
4800 and S<C<< <!--- --> >>> are all valid XML comments, but S<C<< <!-----> >>>
4801 and S<C<< <!-- ---> >>> are not!
4802
4803 As part of the "sanitation" process (triggered by the B<--sanitize>
4804 option), any invalid tags have their leading C<< < >> escaped (to
4805 C<< &#lt; >>) thus making them ordinary text and this I<includes>
4806 invalid XML comments.
4807
4808 What this means is that the B<--strip-comments-strict> option I<will not>
4809 remove invalid XML comments (such as S<C<< <!-----> >>>)!
4810
4811 But see the B<--strip-comments-lax> option for a solution.
4812
4813 If this option is given, it will supersede any previous
4814 B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-lax-only>
4815 or B<--no-strip-comments> options.
4816
4817
4818 =item B<--strip-comments-lax-only>
4819
4820 This is the default option if no other strip comments options are given
4821 AND the B<--sanitize> option is active (the default).
4822
4823 This is a compromise option.  It works just like the B<--strip-comments-lax>
4824 option, but I<ONLY> on strictly invalid XML-like comments.
4825
4826 In other words, if a strictly valid XML comment is present, it will be retained
4827 in the output.  If a strictly invalid XML comment is present which would have
4828 been stripped by B<--strip-comments-lax> but would have had its leading C<< < >>
4829 escaped automatically by the B<--no-strip-comments> or B<--strip-comments-strict>
4830 modes (because it's not a strictly valid XML comment), then it I<will> be stripped
4831 by this mode.
4832
4833 This option prevents ugly invalid XML comments from slipping through into the
4834 output as escaped plain text while still passing through valid XML comments
4835 without stripping them.
4836
4837 If this option is given, it will supersede any previous
4838 B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-lax-only>
4839 or B<--no-strip-comments> options.
4840
4841
4842 =item B<--no-strip-comments>
4843
4844 Do not strip XML or XML-like comments from the output.
4845
4846 This is the default option I<ONLY> when no other strip comments options have
4847 been give I<and> the B<--no-sanitize> option is in effect (which is I<not> the
4848 default).
4849
4850 When B<--no-strip-comments> is active, strictly invalid XML comments such
4851 as those that contain an internal double hyphen (C<-->) sequence will end
4852 up having their leading C<< < >> escaped automatically and end up as plain
4853 text in the output!
4854
4855 If this option is given, it will supersede any previous
4856 B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-lax-only>
4857 or B<--no-strip-comments> options.
4858
4859
4860 =item B<--tabwidth>=I<num>
4861
4862 Expand tabs to I<num> character wide tab stop positions instead of the default
4863 8.  Don't use this; physical tabs should always be expanded to 8-character
4864 positions.  This option does I<not> affect the number of spaces needed to
4865 start a new "indent level".  That will always be 4 no matter what value is
4866 used (or implied by default) with this option.  Also note that tabs inside
4867 backticks-delimited code blocks will always be expanded to 8-character tab
4868 stop positions no matter what value is used for this option.
4869
4870 The value must be S<2 <= I<num> <= 32>.
4871
4872
4873 =item B<--auto-number>
4874
4875 Automatically number all h1-h6 headings generated from Markdown markup.
4876 Explicit C<< <h1> >> ... C<< <h6> >> tag content remains unmolested.
4877
4878 If this option is given, any YAML C<header_enum> setting will be ignored.
4879
4880
4881 =item B<-k>, B<--keep-abs>
4882
4883 Normally any absolute path URLs (i.e. URLs without a scheme starting
4884 with "/" but not "//") are subject to modification by any
4885 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> option.
4886
4887 If the B<-a>/B<--absroot> option is used and it transforms these
4888 absolute path URLs into a full absolute URL (i.e. starts with a
4889 scheme or "//") then any subsequent B<-r>/B<--htmlroot> or
4890 B<-i>/B<--imageroot> processing will be skipped because the URL is
4891 no longer relative.
4892
4893 If the B<--keep-abs> option is given, then (after applying any
4894 B<-a>/B<--absroot> option if present) absolute path URLs will be
4895 kept as-is and will not be processed further by any B<-r>/B<--htmlroot>
4896 or B<-i>/B<--imageroot> option.
4897
4898 Note that if the B<-a>/B<--absroot> option transforms an absolute
4899 path URL into a relative PATH URL it I<will> be subject to subsequent
4900 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> processing regardless
4901 of the B<-k>/B<--keep-abs> option.
4902
4903
4904 =item B<-a> I<prefix>, B<--absroot>=I<prefix>
4905
4906 Any absolute path URLs (i.e. URLs without a scheme starting with "/" but not
4907 "//") have I<prefix> prepended which prevents them from being acted upon by the
4908 B<--htmlroot> and/or B<--imageroot> options provided the result is a full
4909 absolute URL.  The default is to prepend nothing and leave them as absolute
4910 path URLs which will allow them to be processed by any B<--htmlroot> and/or
4911 B<--imageroot> options.
4912
4913 This option can be helpful when documents are being formatted for display on a
4914 different system and the absolute path URLs need to be "fixed up".
4915
4916
4917 =item B<-b> I<prefix>, B<--base>=I<prefix>
4918
4919 Any fragment-only URLs have I<prefix> prepended.  The default is to prepend
4920 nothing and leave them as bare fragment URLs.  Use of this option may be
4921 necessary when embedding the output of Markdown.pl into a document that makes
4922 use of the C<< <base> >> tag in order for intra-document fragment URL links to
4923 work properly in such a document.
4924
4925
4926 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
4927
4928 Any non-absolute URLs have I<prefix> prepended.
4929
4930
4931 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
4932
4933 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
4934 if any) but only if they end in an image suffix.
4935
4936
4937 =item B<-w> [I<wikipat>], B<--wiki>[=I<wikipat>]
4938
4939 Activate wiki links.  Any link enclosed in double brackets (e.g. "[[link]]") is
4940 considered a wiki link.  By default only absolute URL and fragment links are
4941 allowed in the "wiki link style" format.  Any other double-bracketed strings
4942 are left unmolested.
4943
4944 If this option is given, all other wiki links are enabled as well.  Any
4945 non-absolute URL or fragment links will be transformed into a link using
4946 I<wikipat> where the default I<wikipat> if none is given is C<%{s(:md)}.html>.
4947
4948 If the given I<wikipat> does not contain a C<%{...}> placeholder sequence
4949 then it will automatically have C<%{s(:md)}.html> suffixed to it.
4950
4951 The C<...> part of the C<%{...}> sequence specifies zero or more
4952 case-insensitive single-letter options with the following effects:
4953
4954 =over
4955
4956 =item B<b>
4957
4958 Retain blanks (aka spaces) in the output.  They will become C<%20>
4959 in the final URL.  Because spaces are always trimmed before processing
4960 wiki links, runs of multiple spaces will be collapsed into a single
4961 space and any leading or trailing spaces will be removed.
4962
4963 =item B<d>
4964
4965 Convert spaces to dashes (ASCII 0x2D) instead of underscore (ASCII
4966 0x5F).  Note that if this option is given then runs of multiple
4967 dashes will be converted to a single dash I<instead> but runs of
4968 multiple underscores will be left untouched.
4969
4970 =item B<f>
4971
4972 Flatten the resulting name by replacing forward slashes (ASCII 0x2F)
4973 as well.  They will be converted to underscores unless the C<d>
4974 option is given (in which case they will be converted to dashes).
4975 This conversion takes place before applying the runs-of-multiple
4976 reduction.  This option is incompatible with the B<%> option.
4977
4978 =item B<%>
4979
4980 Flatten the resulting name by replacing runs of one or more forward
4981 slashes (ASCII 0x2F) with C<%2F>.  Note that when encoded into a
4982 URL the C<%2F> actually becomes C<%252F>.  This option is incompatible
4983 with the B<f> option.
4984
4985 =item B<l>
4986
4987 Convert link target (excluding any query string and/or fragment) to lowercase.
4988 Takes precedence over any C<u> option, but specifically excludes C<%>-escapes
4989 which are always UPPERCASE hexadecimal.
4990
4991 =item B<r>
4992
4993 Leave raw UTF-8 characters in the result.  Normally anything not allowed
4994 directly in a URL ends up URL-encoded.  With this option, raw valid UTF-8
4995 sequences will be left untouched.  Use with care.
4996
4997 =item B<s> or B<s(>I<< <ext> >>[B<,>I<< <ext> >>]...B<)>
4998
4999 After (temporarily) removing any query string and/or fragment, strip any final
5000 "dot" suffix so long as it occurs after the last slash (if any slash was
5001 present before applying the C<f> option).  The "dot" (ASCII 0x2E) and all
5002 following characters (if any) are removed.  If the optional C<< (<ext>,...) >>
5003 part is present then only strip the extension if it consists of a "dot"
5004 followed by one of the case-insensitive I<< <ext> >> values.  As a special
5005 case, using the value C<:md> for one of the I<< <ext> >> values causes that
5006 value to be expanded to all known markdown extensions.
5007
5008 When processing wiki image links, this option is ignored.
5009
5010 =item B<u>
5011
5012 Convert link target (excluding any query string and/or fragment) to UPPERCASE.
5013
5014 =item B<v>
5015
5016 Leave runs-of-multiple characters alone (aka "verbatim").  Does not affect
5017 any of the other options except by eliminating the runs-of-multple reduction
5018 step.  Also does I<not> inhibit the initial whitespace trimming.
5019
5020 Does not affect the runs-of-multiple "/" replacement performed by the B<%>
5021 option.
5022
5023 =back
5024
5025 The URL target of the wiki link is created by first trimming whitespace
5026 (starting and ending whitespace is removed and all other runs of consecutive
5027 whitespace are replaced with a single space) from the wiki link target,
5028 removing (temporarily) any query string and/or fragment, if no options are
5029 present, spaces are converted to underscores (C<_>) and runs of multiple
5030 consecutive underscores are replaced with a single underscore (ASCII 0x5F).
5031 Finally, the I<wikipat> string gets its first placeholder (the C<%{...}>
5032 sequence) replaced with this computed value and the original query string
5033 and/or fragment is re-appended (if any were originally present) and
5034 URL-encoding is applied as needed to produce the actual final target URL.
5035
5036 Note that when processing wiki image links, no extension stripping ever takes
5037 place (i.e. the "s" option is ignored) and anything after the placeholder (the
5038 C<%{...}> sequence) in the pattern is omitted from the result.
5039
5040 See above option descriptions for possible available modifications.
5041
5042 One of the commonly used hosting platforms does something substantially similar
5043 to using C<%{dfv}> as the placeholder.
5044
5045 One of the commonly used wiki platforms does something similar to using C<%{%}>
5046 as the placeholder.
5047
5048
5049 =item B<--yaml>[=I<yamlmode>]
5050
5051 Select YAML front matter processing.  The optional I<yamlmode> value
5052 must be one of the following:
5053
5054 =over
5055
5056 =item B<enable>
5057
5058 Recognize any YAML front matter and apply any options specified
5059 therein.  If any unrecognized options are present, the options will
5060 also be shown in the formatted output.
5061
5062 This is the default I<yamlmode> if omitted.
5063
5064 =item B<disable>
5065
5066 No YAML front matter processing at all takes place.  If YAML front
5067 matter is present, it will be treated as regular non-YAML markup
5068 text to be processed.
5069
5070 =item B<strip>
5071
5072 If YAML front matter is present, it will be stripped and completely
5073 ignored before beginning to process the rest of the input.
5074
5075 In this mode, any options in the YAML front matter that would have
5076 otherwise been recognized will I<not have any effect!>
5077
5078 =item B<show>
5079
5080 If YAML front matter is present and contains anything other than
5081 comments, the non-comments parts will be shown in the formatted
5082 output.
5083
5084 In this mode, any options in the YAML front matter that would have
5085 otherwise been recognized will I<not have any effect!>
5086
5087 This is a show-only mode.
5088
5089 =item B<reveal>
5090
5091 This mode works just like the B<enable> mode except that if the
5092 YAML front matter contains anything other than comments, then
5093 I<all> of the non-comments parts will be shown in the formatted
5094 output.
5095
5096 In this mode, any recognized options in the YAML front matter I<are>
5097 processed the same way they would be in the B<enable> mode except
5098 that any option to suppress the B<reveal> mode is ignored.
5099
5100 =item B<conceal>
5101
5102 This mode works just like the B<enable> mode except that no options
5103 are ever shown in the formatted output regardless of whether or not
5104 there are any unrecognized options present.
5105
5106 In this mode, any recognized options in the YAML front matter I<are>
5107 processed the same way they would be in the B<enable> mode except
5108 that any option to suppress the B<conceal> mode is ignored.
5109
5110 =item B<unknown>
5111
5112 This mode works just like the B<show> mode if any unrecognized
5113 YAML front matter options are present.  Otherwise it works like
5114 the B<strip> mode.
5115
5116 In this mode, any options in the YAML front matter that would have
5117 otherwise been recognized will I<not have any effect!>
5118
5119 =back
5120
5121 If B<--raw>, B<--raw-xml> or B<--raw-html> has been specified then
5122 the default if no B<--yaml> option has been given is B<--yaml=disable>.
5123
5124 Otherwise the default if no B<--yaml> option has been given is
5125 B<--yaml=enable>.
5126
5127 Note that only a limited subset of YAML is recognized.  Specifically
5128 only comments, and top-level single-line S<C<key: value>> items
5129 where key must be plain (i.e. non-quoted), start with a letter or
5130 underscore and contain only letters, underscores, hyphens (C<->),
5131 periods (C<.>) and digits.  Keys are case-insensitive (i.e. converted
5132 to lowercase).  As with YAML, at least one whitespace is required
5133 between the ":" and the value (unless it's the empty value).
5134
5135 Values may be either plain or double-quoted (single-quoted is not
5136 recognized).  The double-quoted style may use C-style character
5137 escape codes but may not extend past the end of the line.
5138
5139 For YAML front matter to be recognized, the very first line of the
5140 document must be exactly three hyphens (C<--->).  The YAML terminates
5141 when a line of three hyphens (C<--->) or a line of three periods
5142 (C<...>) or the end of the file is encountered.  Of course the YAML
5143 mode must also be something I<other> than B<--yaml=disable>.
5144
5145
5146 =item B<-V>, B<--version>
5147
5148 Display Markdown's version number and copyright information.
5149
5150
5151 =item B<-s>, B<--shortversion>
5152
5153 Display the short-form version number.
5154
5155
5156 =item B<--raw>, B<--raw-xml>
5157
5158 Input contains only raw XHTML.  All options other than B<--html4tags>,
5159 B<--deprecated>, B<--sanitize> (on by default), B<--strip-comments>,
5160 B<--div>, B<--keep-named-character-entities>, B<--validate-xml> and
5161 B<--validate-xml-internal> (and their B<--no-...> variants) are
5162 ignored.
5163
5164 With this option, arbitrary XHTML input can be passed through
5165 the sanitizer and/or validator.  If sanitation is requested (the
5166 default), input must only contain the contents of the "<body>"
5167 section (i.e. no "<head>" or "<html>").  Output I<will> be converted
5168 to UTF-8 regardless of the input encoding.  All line endings will
5169 be normalized to C<\n> and input encodings other than UTF-8 or
5170 ISO-8859-1 or US-ASCII will end up mangled.
5171
5172 Remember that any B<--stub> and/or B<--stylesheet> options are
5173 I<completely ignored> when B<--raw> is given.
5174
5175
5176 =item B<--raw-html>
5177
5178 Input contains only raw HTML.  All options other than
5179 B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
5180 B<--strip-comments>, and B<--validate-xml-internal>
5181 (and their B<--no-...> variants) are ignored.
5182
5183 Requires the (possibly implicit) B<--validate-xml-internal> option.
5184
5185 Works just like B<--raw-xml> except that HTML auto closing and
5186 optional closing tag semantics are activated during the validation
5187 causing missing closing tags to be inserted where required by the
5188 standard.  Non-raw mode always enables these semantics.
5189
5190 This will transform HTML into valid XHTML or fail with an error message.
5191
5192 Unfortunately, it will also fail to accept some documents that
5193 the plain B<--raw-xml> option will.
5194
5195 For example, this document:
5196
5197  <dt><li>a</li></dt>
5198
5199 Will be rejected because upon encountering the C<< <li> >> open
5200 tag a closing C<< </dt> >> will automatically be inserted resulting
5201 in this document:
5202
5203  <dt></dt><li>a</li></dt>
5204
5205 Which, of course, no longer validates.  Since C<li> blocks cannot
5206 directly be nested within C<dt> blocks (according to the standard),
5207 the input document is not strictly correct.
5208
5209 Remember that any B<--stub> and/or B<--stylesheet> options are
5210 I<completely ignored> when B<--raw-html> is given.
5211
5212
5213 =item B<--div>[=I<divname>]
5214
5215 Wrap the output contents in a C<div> tag.  If I<divname> is given the
5216 tag will have that C<id> attribute value.  If the B<--stub> option and/or
5217 the B<--stylesheet> option are active, they are applied I<after> wrapping
5218 the output contents in the C<div>.  Note that if a YAML table ends up
5219 being generated, it I<will> be included I<inside> the C<div> wrapper.
5220
5221 In contrast to the B<--stylesheet> and B<--stub> options, this option
5222 I<is> allowed with the B<--raw-xml> and B<--raw-html> options.
5223
5224
5225 =item B<--stylesheet>
5226
5227 Include the fancy style sheet at the beginning of the output (or in the
5228 C<head> section with B<--stub>).  This style sheet makes fancy checkboxes
5229 and makes a right parenthesis C<)> show instead of a C<.> for ordered lists
5230 that use them.  Without it things will still look fine except that the
5231 fancy stuff won't be there.
5232
5233 Use this option with no other arguments and redirect standard input to
5234 /dev/null to get just the style sheet and nothing else.
5235
5236
5237 =item B<--no-stylesheet>
5238
5239 Overrides a previous B<--stylesheet> and disables implicit inclusion
5240 of the style sheet by the B<--stub> option.
5241
5242
5243 =item B<--keep-named-character-entities>
5244
5245 Do not convert named character entities to their equivalent numerical character
5246 entity.  Normally any occurrence of a named character entity such as
5247 C<&hellip;> would be converted to its equivalent character entity such as
5248 C<&#x2026;>.  If this option is given, that conversion is suppressed.
5249
5250 The only always-valid named entities as far as XML is concerned are the five
5251 entities C<&amp;>, C<&lt;>, C<&gt;>, C<&quot;> and C<&apos;>.  Even that last
5252 one (C<&apos;>) may not be universally supported in XHTML user agents (and it
5253 is converted to C<&#39;> for that reason unless this option is given).
5254
5255 Regardless of this option, C<&amp;>, C<&lt;>, C<&gt;> and C<&quot;> are always
5256 left alone since they are universally supported.
5257
5258 Use of this option is I<NOT RECOMMENDED>.
5259
5260
5261 =item B<--us-ascii>/B<--ascii>
5262
5263 (N.B. B<--ascii> is just a short form of B<--us-ascii>)
5264
5265 Convert any non-US-ASCII characters to their equivalent numerical character
5266 entity.  Any characters with a code point value greater than or equal to
5267 128 will be converted.  Note that the output is still technically UTF-8 since
5268 the US-ASCII code points coincide with the same code points of UTF-8.
5269
5270 Using this option will make the output strictly 7-bit and therefore it should
5271 survive just about any transport mechanism at the expense of an increase in
5272 size that depends on how many non-US-ASCII characters are present.
5273
5274
5275 =item B<--stub>
5276
5277 Wrap the output in a full document stub (i.e. has C<html>, C<head> and C<body>
5278 tags).  The style sheet I<will> be included in the C<head> section unless the
5279 B<--no-stylesheet> option is also used.
5280
5281 The C<< <title> >> value for a document produced with the B<--stub> option
5282 comes from the first markdown markup C<h1> that's generated unless YAML
5283 processing has been enabled (the default) and a C<title> YAML value has
5284 been set in which case that always takes precedence.
5285
5286
5287 =item B<-h>, B<--help>
5288
5289 Display Markdown's help.  With B<--help> full help is shown, with B<-h> only
5290 the usage and options are shown.
5291
5292
5293 =back
5294
5295
5296 =head1 HTML CONTENT
5297
5298 Markdown format documents are intended to be human readable without the use
5299 of XML-like markup.
5300
5301 Nevertheless, html content can be included verbatim provided that the tags
5302 used are limited to those of the HTML 4 specification and only those tags
5303 that represent body content -- scripting tags and attributes are not allowed.
5304
5305 The final version of the HTML specification (including a DTD) can be found
5306 here:
5307
5308 =over
5309
5310 =over
5311
5312 =item L<https://www.w3.org/TR/1999/REC-html401-19991224/>
5313
5314 =back
5315
5316 =back
5317
5318 Note that attempts to use any of the new tags from the "HTML Living Standard"
5319 will simply result in them being escaped into literal text.
5320
5321 Stick to markdown-format text or HTML 4 tags to avoid unexpected output.
5322
5323
5324 =head1 PERL MODULE
5325
5326 Markdown can be used as a Perl module and can be "use"d like so:
5327
5328  use Markdown qw(...);
5329
5330 Or like so:
5331
5332  BEGIN {require "Markdown.pl" && Markdown->import(qw(...))}
5333
5334 where the C<...> part is the list of desired imports.
5335
5336 The Markdown module does not export any functions by default.
5337
5338 The C<Markdown.pm> file is a symbolic link to C<Markdown.pl>.
5339
5340 =head2 Markdown module functions
5341
5342 Any of these functions may be imported, but none of them
5343 are imported by default.
5344
5345 =over
5346
5347
5348 =item * $result = Markdown::Markdown($string[, options...])
5349
5350 Converts Markdown-format C<$string> to UTF-8 encoded XHTML and
5351 returns it.
5352
5353 The C<options...> may be either a single HASH ref or one or more
5354 pairs of C<< key => value >>.
5355
5356 See the comments for the C<_SanitizeOpts> function for a list of
5357 possible option keys.
5358
5359
5360 =item * $result = Markdown::ProcessRaw($string[, options...])
5361
5362 Converts raw XHTML in C<$string> to XHTML and returns it.
5363
5364 The C<options...> may be either a single HASH ref or one or more
5365 pairs of C<< key => value >>.
5366
5367 See the comments for the C<_SanitizeOpts> function for a list of
5368 possible option keys.
5369
5370 This function provides the ability to apply the internal XML
5371 validation and sanitation functionality to arbitrary XHTML without
5372 performing any of the Markdown format interpretation.
5373
5374
5375 =item * $stylesheet = Markdown::GenerateStyleSheet([$prefix])
5376
5377 Returns an XHTML style sheet that supports the fancy Markdown styles
5378 such as checkboxes and right parenthesis lists.
5379
5380 All of the style class names have C<$prefix> prepended.
5381
5382 If C<$prefix> is omitted or C<undef> then the default S<"_markdown-">
5383 prefix will be used which is the same default prefix that the
5384 C<Markdown> function uses.
5385
5386 The returned string value consists of a C<< <style type="text/css"> >>
5387 tag, the contents of the style sheet and ends with a C<< </style> >> tag.
5388
5389
5390 =item * Markdown::SetWikiOpts($hashref, $wikioption)
5391
5392 The value of C<$wikioption> should be the value of the C<wikipat> value
5393 from the B<--wiki> option.  Use the empty string S<""> to enable wiki
5394 links using the defaults and use C<undef> to disable wiki links.
5395
5396 The C<wikipat> and C<wikiopt> keys in C<$hashref> will both be
5397 affected by this call and they should be passed in to the Markdown
5398 function as options to enable processing of wiki links.
5399
5400 The simplest way to do this is simply to pass a HASH ref as the
5401 second argument to the Markdown function after having used this
5402 function on it to properly set the C<wikipat> and C<wikiopt>
5403 keys and values.
5404
5405
5406 =back
5407
5408
5409 =head2 Example
5410
5411 This rudimentary example approximates running
5412 S<C<Markdown.pl --stub --wiki>>
5413 on the input (files if given, standard input if not).
5414
5415  use Markdown qw(Markdown SetWikiOpts GenerateStyleSheet escapeXML);
5416
5417  my $string;
5418  {local $/; $string = <>;}
5419  my %opts = ( h1 => "default title" );
5420  SetWikiOpts(\%opts, ""); # enable default --wiki processing
5421  my $xhtml = Markdown($string, \%opts);
5422  print "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n",
5423    "<head>\n<title>".escapeXML($opts{h1})."</title>\n",
5424    GenerateStyleSheet(),"</head>\n",
5425    "<body style=\"text-align:center\">\n",
5426    "<div style=\"".
5427    "display:inline-block;text-align:left;max-width:42pc\">\n",
5428    $xhtml, "</div></body></html>\n";
5429
5430
5431 =head1 VERSION HISTORY
5432
5433 Z<> See the F<README> file for detailed release notes for this version.
5434
5435 =over
5436
5437 =item Z<> 1.1.15 - 15 Aug 2021
5438
5439 =item Z<> 1.1.14 - 14 Jun 2021
5440
5441 =item Z<> 1.1.13 - 13 Jun 2021
5442
5443 =item Z<> 1.1.12 - 17 Mar 2021
5444
5445 =item Z<> 1.1.11 - 05 Mar 2021
5446
5447 =item Z<> 1.1.10 - 08 Jul 2020
5448
5449 =item Z<> 1.1.9 - 15 Dec 2019
5450
5451 =item Z<> 1.1.8 - 22 Nov 2019
5452
5453 =item Z<> 1.1.7 - 14 Feb 2018
5454
5455 =item Z<> 1.1.6 - 03 Jan 2018
5456
5457 =item Z<> 1.1.5 - 07 Dec 2017
5458
5459 =item Z<> 1.1.4 - 24 Jun 2017
5460
5461 =item Z<> 1.1.3 - 13 Feb 2017
5462
5463 =item Z<> 1.1.2 - 19 Jan 2017
5464
5465 =item Z<> 1.1.1 - 12 Jan 2017
5466
5467 =item Z<> 1.1.0 - 11 Jan 2017
5468
5469 =item Z<> 1.0.4 - 05 Jun 2016
5470
5471 =item Z<> 1.0.3 - 06 Sep 2015
5472
5473 =item Z<> 1.0.2 - 03 Sep 2015
5474
5475 =item Z<> 1.0.1 - 14 Dec 2004
5476
5477 =item Z<> 1.0.0 - 28 Aug 2004
5478
5479 =back
5480
5481 =head1 AUTHORS
5482
5483 =over
5484
5485 =item John Gruber
5486
5487 =item L<https://daringfireball.net>
5488
5489 =item L<https://daringfireball.net/projects/markdown/>
5490
5491 =item E<160>
5492
5493 =back
5494
5495 =over
5496
5497 =item PHP port and other contributions by Michel Fortin
5498
5499 =item L<https://michelf.ca>
5500
5501 =item E<160>
5502
5503 =back
5504
5505 =over
5506
5507 =item Additional enhancements and tweaks by Kyle J. McKay
5508
5509 =item mackyle<at>gmail.com
5510
5511 =item L<https://repo.or.cz/markdown.git>
5512
5513 =back
5514
5515 =head1 COPYRIGHT AND LICENSE
5516
5517 =over
5518
5519 =item Copyright (C) 2003-2004 John Gruber
5520
5521 =item Copyright (C) 2015-2021 Kyle J. McKay
5522
5523 =item All rights reserved.
5524
5525 =back
5526
5527 Redistribution and use in source and binary forms, with or without
5528 modification, are permitted provided that the following conditions are
5529 met:
5530
5531 =over
5532
5533 =item *
5534
5535 Redistributions of source code must retain the above copyright
5536 notice, this list of conditions and the following disclaimer.
5537
5538 =item *
5539
5540 Redistributions in binary form must reproduce the above copyright
5541 notice, this list of conditions and the following disclaimer in the
5542 documentation and/or other materials provided with the distribution.
5543
5544 =item *
5545
5546 Neither the name "Markdown" nor the names of its contributors may
5547 be used to endorse or promote products derived from this software
5548 without specific prior written permission.
5549
5550 =back
5551
5552 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
5553 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
5554 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
5555 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
5556 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
5557 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
5558 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
5559 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
5560 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
5561 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
5562 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5563
5564 =cut