Markdown.pl

   1 #!/usr/bin/env perl
   2
   3 #
   4 # Markdown -- A text-to-HTML conversion tool for web writers
   5 #
   6 # Copyright (C) 2004 John Gruber
   7 # Copyright (C) 2015,2016 Kyle J. McKay
   8 #
   9
  10
  11 package Markdown;
  12 require 5.006_000;
  13 use strict;
  14 use warnings;
  15 close(DATA) if fileno(DATA);
  16
  17 require Exporter;
  18 use Digest::MD5 qw(md5_hex);
  19 use File::Basename qw(basename);
  20 use vars qw($VERSION @ISA @EXPORT_OK);
  21 @ISA = qw(Exporter);
  22 @EXPORT_OK = qw(Markdown);
  23 $INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'};
  24
  25 $VERSION = '1.0.4';
  26 # Sun 05 Jun 2016
  27
  28
  29 ## Disabled; causes problems under Perl 5.6.1:
  30 # use utf8;
  31 # binmode( STDOUT, ":utf8" );  # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
  32
  33
  34 #
  35 # Global default settings:
  36 #
  37 my $g_empty_element_suffix = " />";     # Change to ">" for HTML output
  38 my $g_tab_width = 4;                    # Legacy even though it's wrong
  39
  40
  41 #
  42 # Globals:
  43 #
  44
  45 # Regex to match balanced [brackets]. See Friedl's
  46 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
  47 my $g_nested_brackets;
  48 $g_nested_brackets = qr{
  49     (?>                             # Atomic matching
  50        [^\[\]]+                         # Anything other than brackets
  51      |
  52        \[
  53          (??{ $g_nested_brackets })     # Recursive set of nested brackets
  54        \]
  55     )*
  56 }x;
  57
  58
  59 # Table of hash values for escaped characters:
  60 my %g_escape_table;
  61 foreach my $char (split //, "\\\`*_{}[]()>#+-.!~") {
  62     $g_escape_table{$char} = md5_hex($char);
  63 }
  64
  65
  66 # Global hashes, used by various utility routines
  67 my %g_urls;
  68 my %g_titles;
  69 my %g_html_blocks;
  70 my %opt;
  71
  72 # Used to track when we're inside an ordered or unordered list
  73 # (see _ProcessListItems() for details):
  74 my $g_list_level = 0;
  75
  76
  77 #### Blosxom plug-in interface ##########################################
  78
  79 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
  80 # which posts Markdown should process, using a "meta-markup: markdown"
  81 # header. If it's set to 0 (the default), Markdown will process all
  82 # entries.
  83 my $g_blosxom_use_meta = 0;
  84
  85 sub start { 1; }
  86 sub story {
  87     my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
  88
  89     if ( (! $g_blosxom_use_meta) or
  90          (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
  91          ){
  92             $$body_ref  = Markdown($$body_ref);
  93      }
  94      1;
  95 }
  96
  97
  98 #### Movable Type plug-in interface #####################################
  99 eval {require MT};  # Test to see if we're running in MT.
 100 unless ($@) {
 101     require MT;
 102     import  MT;
 103     require MT::Template::Context;
 104     import  MT::Template::Context;
 105
 106     eval {require MT::Plugin};  # Test to see if we're running >= MT 3.0.
 107     unless ($@) {
 108         require MT::Plugin;
 109         import  MT::Plugin;
 110         my $plugin = new MT::Plugin({
 111             name => "Markdown",
 112             description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
 113             doc_link => 'http://daringfireball.net/projects/markdown/'
 114         });
 115         MT->add_plugin( $plugin );
 116     }
 117
 118     MT::Template::Context->add_container_tag(MarkdownOptions => sub {
 119         my $ctx  = shift;
 120         my $args = shift;
 121         my $builder = $ctx->stash('builder');
 122         my $tokens = $ctx->stash('tokens');
 123
 124         if (defined ($args->{'output'}) ) {
 125             $ctx->stash('markdown_output', lc $args->{'output'});
 126         }
 127
 128         defined (my $str = $builder->build($ctx, $tokens) )
 129             or return $ctx->error($builder->errstr);
 130         $str;       # return value
 131     });
 132
 133     MT->add_text_filter('markdown' => {
 134         label     => 'Markdown',
 135         docs      => 'http://daringfireball.net/projects/markdown/',
 136         on_format => sub {
 137             my $text = shift;
 138             my $ctx  = shift;
 139             my $raw  = 0;
 140             if (defined $ctx) {
 141             my $output = $ctx->stash('markdown_output');
 142                 if (defined $output  &&  $output =~ m/^html/i) {
 143                     $g_empty_element_suffix = ">";
 144                     $ctx->stash('markdown_output', '');
 145                 }
 146                 elsif (defined $output  &&  $output eq 'raw') {
 147                     $raw = 1;
 148                     $ctx->stash('markdown_output', '');
 149                 }
 150                 else {
 151                     $raw = 0;
 152                     $g_empty_element_suffix = " />";
 153                 }
 154             }
 155             $text = $raw ? $text : Markdown($text);
 156             $text;
 157         },
 158     });
 159
 160     # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
 161     my $smartypants;
 162
 163     {
 164         no warnings "once";
 165         $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
 166     }
 167
 168     if ($smartypants) {
 169         MT->add_text_filter('markdown_with_smartypants' => {
 170             label     => 'Markdown With SmartyPants',
 171             docs      => 'http://daringfireball.net/projects/markdown/',
 172             on_format => sub {
 173                 my $text = shift;
 174                 my $ctx  = shift;
 175                 if (defined $ctx) {
 176                     my $output = $ctx->stash('markdown_output');
 177                     if (defined $output  &&  $output eq 'html') {
 178                         $g_empty_element_suffix = ">";
 179                     }
 180                     else {
 181                         $g_empty_element_suffix = " />";
 182                     }
 183                 }
 184                 $text = Markdown($text);
 185                 $text = $smartypants->($text, '1');
 186             },
 187         });
 188     }
 189 }
 190 elsif (!caller) {
 191 #### BBEdit/command-line text filter interface ##########################
 192 # Needs to be hidden from MT (and Blosxom when running in static mode).
 193
 194     # We're only using $blosxom::version once; tell Perl not to warn us:
 195     no warnings 'once';
 196     unless ( defined($blosxom::version) ) {
 197         use warnings;
 198
 199         #### Check for command-line switches: #################
 200         my %options = ();
 201         my %cli_opts;
 202         use Getopt::Long;
 203         Getopt::Long::Configure('pass_through');
 204         GetOptions(\%cli_opts,
 205             'help|h',
 206             'version|V',
 207             'shortversion|short-version|s',
 208             'html4tags',
 209             'htmlroot|r=s',
 210             'imageroot|i=s',
 211         );
 212         if ($cli_opts{'help'}) {
 213             exec 'perldoc', $0;
 214         }
 215         if ($cli_opts{'version'}) {     # Version info
 216             print "\nThis is Markdown, version $VERSION.\n";
 217             print "Copyright (C) 2004 John Gruber\n";
 218             print "Copyright (C) 2015 Kyle J. McKay\n";
 219             exit 0;
 220         }
 221         if ($cli_opts{'shortversion'}) {        # Just the version number string.
 222             print $VERSION;
 223             exit 0;
 224         }
 225         if ($cli_opts{'html4tags'}) {           # Use HTML tag style instead of XHTML
 226             $options{empty_element_suffix} = ">";
 227         }
 228         if ($cli_opts{'htmlroot'}) {            # Use URL prefix
 229             $options{url_prefix} = $cli_opts{'htmlroot'};
 230         }
 231         if ($cli_opts{'imageroot'}) {           # Use image URL prefix
 232             $options{img_prefix} = $cli_opts{'imageroot'};
 233         }
 234
 235
 236         #### Process incoming text: ###########################
 237         my $text;
 238         {
 239             local $/;               # Slurp the whole file
 240             $text = <>;
 241         }
 242         print Markdown($text, \%options);
 243     }
 244 }
 245
 246
 247
 248 sub Markdown {
 249 #
 250 # Main function. The order in which other subs are called here is
 251 # essential. Link and image substitutions need to happen before
 252 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
 253 # and <img> tags get encoded.
 254 #
 255     my $text = shift;
 256     defined $text or $text='';
 257
 258     # Any remaining arguments after the first are options; either a single
 259     # hashref or a list of name, value paurs.
 260     %opt = (
 261         # set initial defaults
 262         empty_element_suffix    => $g_empty_element_suffix,
 263         tab_width               => $g_tab_width,
 264         url_prefix              => "",  # Prefixed to non-absolute URLs
 265         img_prefix              => "",  # Prefixed to non-absolute image URLs
 266     );
 267     my %args = ();
 268     if (ref($_[0]) eq "HASH") {
 269         %args = %{$_[0]};
 270     } else {
 271         %args = @_;
 272     }
 273     while (my ($k,$v) = each %args) {
 274         $opt{$k} = $v;
 275     }
 276
 277     # Clear the globals. If we don't clear these, you get conflicts
 278     # from other articles when generating a page which contains more than
 279     # one article (e.g. an index page that shows the N most recent
 280     # articles):
 281     %g_urls = ();
 282     %g_titles = ();
 283     %g_html_blocks = ();
 284     $g_list_level = 0;
 285
 286     # Standardize line endings:
 287     $text =~ s{\r\n}{\n}g;  # DOS to Unix
 288     $text =~ s{\r}{\n}g;    # Mac to Unix
 289
 290     # Make sure $text ends with a couple of newlines:
 291     $text .= "\n\n";
 292
 293     # Handle backticks-delimited code blocks
 294     $text = _HashBTCodeBlocks($text);
 295
 296     # Convert all tabs to spaces.
 297     $text = _Detab($text);
 298
 299     # Strip any lines consisting only of spaces and tabs.
 300     # This makes subsequent regexen easier to write, because we can
 301     # match consecutive blank lines with /\n+/ instead of something
 302     # contorted like /[ \t]*\n+/ .
 303     $text =~ s/^[ \t]+$//mg;
 304
 305     # Turn block-level HTML blocks into hash entries
 306     $text = _HashHTMLBlocks($text);
 307
 308     # Strip link definitions, store in hashes.
 309     $text = _StripLinkDefinitions($text);
 310
 311     $text = _RunBlockGamut($text);
 312
 313     $text = _UnescapeSpecialChars($text);
 314
 315     return $text . "\n";
 316 }
 317
 318
 319 sub _HashBTCodeBlocks {
 320 #
 321 #   Process Markdown backticks (```) delimited code blocks
 322 #
 323     my $text = shift;
 324
 325     $text =~ s{
 326             (?:\n|\A)
 327                 ``(`+)[ \t]*(?:([\w.+-]+)[ \t]*)?\n
 328             (               # $3 = the code block -- one or more lines, starting with ```
 329               (?:
 330                 .*\n+
 331               )+?
 332             )
 333             (?:(?:``\1[ \t]*(?:\n|\Z))|\Z) # and ending with ``` or end of document
 334         }{
 335             # $2 contains syntax highlighting to use if defined
 336             my $codeblock = $3;
 337             $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
 338             $codeblock = _Detab($codeblock, 8); # physical tab stops are always 8
 339             $codeblock =~ s/\A\n+//; # trim leading newlines
 340             $codeblock =~ s/\s+\z//; # trim trailing whitespace
 341             $codeblock = _EncodeCode($codeblock); # or run highlighter here
 342             $codeblock = "<pre><code>" . $codeblock . "\n</code></pre>";
 343
 344             my $key = md5_hex($codeblock);
 345             $g_html_blocks{$key} = $codeblock;
 346             "\n\n" . $key . "\n\n";
 347         }egmx;
 348
 349     return $text;
 350 }
 351
 352
 353 sub _StripLinkDefinitions {
 354 #
 355 # Strips link definitions from text, stores the URLs and titles in
 356 # hash references.
 357 #
 358     my $text = shift;
 359     my $less_than_tab = $opt{tab_width} - 1;
 360
 361     # Link defs are in the form: ^[id]: url "optional title"
 362     while ($text =~ s{
 363                         ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
 364                           [ \t]*
 365                           \n?               # maybe *one* newline
 366                           [ \t]*
 367                         <?(\S+?)>?          # url = $2
 368                           [ \t]*
 369                           \n?               # maybe one newline
 370                           [ \t]*
 371                         (?:
 372                             (?<=\s)         # lookbehind for whitespace
 373                             ["(]
 374                             (.+?)           # title = $3
 375                             [")]
 376                             [ \t]*
 377                         )?  # title is optional
 378                         (?:\n+|\Z)
 379                     }
 380                     {}mx) {
 381         $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 );    # Link IDs are case-insensitive
 382         if ($3) {
 383             $g_titles{lc $1} = $3;
 384             $g_titles{lc $1} =~ s/\042/&quot;/g;
 385         }
 386     }
 387
 388     return $text;
 389 }
 390
 391
 392 sub _HashHTMLBlocks {
 393     my $text = shift;
 394     my $less_than_tab = $opt{tab_width} - 1;
 395
 396     # Hashify HTML blocks:
 397     # We only want to do this for block-level HTML tags, such as headers,
 398     # lists, and tables. That's because we still want to wrap <p>s around
 399     # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
 400     # phrase emphasis, and spans. The list of tags we're looking for is
 401     # hard-coded:
 402     my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
 403     my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
 404
 405     # First, look for nested blocks, e.g.:
 406     #   <div>
 407     #       <div>
 408     #       tags for inner block must be indented.
 409     #       </div>
 410     #   </div>
 411     #
 412     # The outermost tags must start at the left margin for this to match, and
 413     # the inner nested divs must be indented.
 414     # We need to do this before the next, more liberal match, because the next
 415     # match will start at the first `<div>` and stop at the first `</div>`.
 416     $text =~ s{
 417                 (                       # save in $1
 418                     ^                   # start of line  (with /m)
 419                     <($block_tags_a)    # start tag = $2
 420                     \b                  # word break
 421                     (.*\n)*?            # any number of lines, minimally matching
 422                     </\2>               # the matching end tag
 423                     [ \t]*              # trailing spaces/tabs
 424                     (?=\n+|\Z)  # followed by a newline or end of document
 425                 )
 426             }{
 427                 my $key = md5_hex($1);
 428                 $g_html_blocks{$key} = $1;
 429                 "\n\n" . $key . "\n\n";
 430             }egmx;
 431
 432
 433     #
 434     # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
 435     #
 436     $text =~ s{
 437                 (                       # save in $1
 438                     ^                   # start of line  (with /m)
 439                     <($block_tags_b)    # start tag = $2
 440                     \b                  # word break
 441                     (.*\n)*?            # any number of lines, minimally matching
 442                     .*</\2>             # the matching end tag
 443                     [ \t]*              # trailing spaces/tabs
 444                     (?=\n+|\Z)  # followed by a newline or end of document
 445                 )
 446             }{
 447                 my $key = md5_hex($1);
 448                 $g_html_blocks{$key} = $1;
 449                 "\n\n" . $key . "\n\n";
 450             }egmx;
 451     # Special case just for <hr />. It was easier to make a special case than
 452     # to make the other regex more complicated.
 453     $text =~ s{
 454                 (?:
 455                     (?<=\n\n)       # Starting after a blank line
 456                     |               # or
 457                     \A\n?           # the beginning of the doc
 458                 )
 459                 (                       # save in $1
 460                     [ ]{0,$less_than_tab}
 461                     <(hr)               # start tag = $2
 462                     \b                  # word break
 463                     ([^<>])*?           #
 464                     /?>                 # the matching end tag
 465                     [ \t]*
 466                     (?=\n{2,}|\Z)       # followed by a blank line or end of document
 467                 )
 468             }{
 469                 my $key = md5_hex($1);
 470                 $g_html_blocks{$key} = $1;
 471                 "\n\n" . $key . "\n\n";
 472             }egx;
 473
 474     # Special case for standalone HTML comments:
 475     $text =~ s{
 476                 (?:
 477                     (?<=\n\n)       # Starting after a blank line
 478                     |               # or
 479                     \A\n?           # the beginning of the doc
 480                 )
 481                 (                       # save in $1
 482                     [ ]{0,$less_than_tab}
 483                     (?s:
 484                         <!
 485                         (--.*?--\s*)+
 486                         >
 487                     )
 488                     [ \t]*
 489                     (?=\n{2,}|\Z)       # followed by a blank line or end of document
 490                 )
 491             }{
 492                 my $key = md5_hex($1);
 493                 $g_html_blocks{$key} = $1;
 494                 "\n\n" . $key . "\n\n";
 495             }egx;
 496
 497
 498     return $text;
 499 }
 500
 501
 502 sub _RunBlockGamut {
 503 #
 504 # These are all the transformations that form block-level
 505 # tags like paragraphs, headers, and list items.
 506 #
 507     my $text = shift;
 508
 509     $text = _DoHeaders($text);
 510
 511     # Do Horizontal Rules:
 512     $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$opt{empty_element_suffix}\n}gmx;
 513     $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$opt{empty_element_suffix}\n}gmx;
 514     $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$opt{empty_element_suffix}\n}gmx;
 515
 516     $text = _DoLists($text);
 517
 518     $text = _DoCodeBlocks($text);
 519
 520     $text = _DoBlockQuotes($text);
 521
 522     # We already ran _HashHTMLBlocks() before, in Markdown(), but that
 523     # was to escape raw HTML in the original Markdown source. This time,
 524     # we're escaping the markup we've just created, so that we don't wrap
 525     # <p> tags around block-level tags.
 526     $text = _HashHTMLBlocks($text);
 527
 528     $text = _FormParagraphs($text);
 529
 530     return $text;
 531 }
 532
 533
 534 sub _RunSpanGamut {
 535 #
 536 # These are all the transformations that occur *within* block-level
 537 # tags like paragraphs, headers, and list items.
 538 #
 539     my $text = shift;
 540
 541     $text = _DoCodeSpans($text);
 542
 543     $text = _EscapeSpecialChars($text);
 544
 545     # Process anchor and image tags. Images must come first,
 546     # because ![foo][f] looks like an anchor.
 547     $text = _DoImages($text);
 548     $text = _DoAnchors($text);
 549
 550     # Make links out of things like `<http://example.com/>`
 551     # Must come after _DoAnchors(), because you can use < and >
 552     # delimiters in inline links like [this](<url>).
 553     $text = _DoAutoLinks($text);
 554
 555     $text = _EncodeAmpsAndAngles($text);
 556
 557     $text = _DoItalicsAndBoldAndStrike($text);
 558
 559     # Do hard breaks:
 560     $text =~ s/ {2,}\n/ <br$opt{empty_element_suffix}\n/g;
 561
 562     return $text;
 563 }
 564
 565
 566 sub _EscapeSpecialChars {
 567     my $text = shift;
 568     my $tokens ||= _TokenizeHTML($text);
 569
 570     $text = '';   # rebuild $text from the tokens
 571 #   my $in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.
 572 #   my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
 573
 574     foreach my $cur_token (@$tokens) {
 575         if ($cur_token->[0] eq "tag") {
 576             # Within tags, encode *, _ and ~ so they don't conflict
 577             # with their use in Markdown for italics and strong.
 578             # We're replacing each such character with its
 579             # corresponding MD5 checksum value; this is likely
 580             # overkill, but it should prevent us from colliding
 581             # with the escape values by accident.
 582             $cur_token->[1] =~  s! \* !$g_escape_table{'*'}!gx;
 583             $cur_token->[1] =~  s! _  !$g_escape_table{'_'}!gx;
 584             $cur_token->[1] =~  s! ~  !$g_escape_table{'~'}!gx;
 585             $text .= $cur_token->[1];
 586         } else {
 587             my $t = $cur_token->[1];
 588             $t = _EncodeBackslashEscapes($t);
 589             $text .= $t;
 590         }
 591     }
 592     return $text;
 593 }
 594
 595
 596 sub _DoAnchors {
 597 #
 598 # Turn Markdown link shortcuts into XHTML <a> tags.
 599 #
 600     my $text = shift;
 601
 602     #
 603     # First, handle reference-style links: [link text] [id]
 604     #
 605     $text =~ s{
 606         (                   # wrap whole match in $1
 607           \[
 608             ($g_nested_brackets)    # link text = $2
 609           \]
 610
 611           [ ]?              # one optional space
 612           (?:\n[ ]*)?       # one optional newline followed by spaces
 613
 614           \[
 615             (.*?)       # id = $3
 616           \]
 617         )
 618     }{
 619         my $result;
 620         my $whole_match = $1;
 621         my $link_text   = $2;
 622         my $link_id     = lc $3;
 623
 624         if ($link_id eq "") {
 625             $link_id = lc $link_text;     # for shortcut links like [this][].
 626         }
 627
 628         if (defined $g_urls{$link_id}) {
 629             my $url = _PrefixURL($g_urls{$link_id});
 630             $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
 631             $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics, bold
 632             $url =~ s!  ~ !$g_escape_table{'~'}!gx;     # and strike through.
 633             $result = "<a href=\"$url\"";
 634             if ( defined $g_titles{$link_id} ) {
 635                 my $title = $g_titles{$link_id};
 636                 $title =~ s! \* !$g_escape_table{'*'}!gx;
 637                 $title =~ s!  _ !$g_escape_table{'_'}!gx;
 638                 $title =~ s!  ~ !$g_escape_table{'~'}!gx;
 639                 $result .=  " title=\"$title\"";
 640             }
 641             $result .= ">$link_text</a>";
 642         }
 643         else {
 644             $result = $whole_match;
 645         }
 646         $result;
 647     }xsge;
 648
 649     #
 650     # Next, inline-style links: [link text](url "optional title")
 651     #
 652     $text =~ s{
 653         (               # wrap whole match in $1
 654           \[
 655             ($g_nested_brackets)    # link text = $2
 656           \]
 657           \(            # literal paren
 658             [ \t]*
 659             <?(.*?)>?   # href = $3
 660             [ \t]*
 661             (           # $4
 662               (['\042]) # quote char = $5
 663               (.*?)     # Title = $6
 664               \5        # matching quote
 665             )?          # title is optional
 666           \)
 667         )
 668     }{
 669         my $result;
 670         my $whole_match = $1;
 671         my $link_text   = $2;
 672         my $url     = $3;
 673         my $title   = $6;
 674
 675         $url = _PrefixURL($url);
 676         $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
 677         $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics, bold
 678         $url =~ s!  ~ !$g_escape_table{'~'}!gx;     # and strike through.
 679         $result = "<a href=\"$url\"";
 680
 681         if (defined $title) {
 682             $title =~ s/\042/&quot;/g;
 683             $title =~ s! \* !$g_escape_table{'*'}!gx;
 684             $title =~ s!  _ !$g_escape_table{'_'}!gx;
 685             $title =~ s!  ~ !$g_escape_table{'~'}!gx;
 686             $result .=  " title=\"$title\"";
 687         }
 688
 689         $result .= ">$link_text</a>";
 690
 691         $result;
 692     }xsge;
 693
 694     return $text;
 695 }
 696
 697
 698 sub _DoImages {
 699 #
 700 # Turn Markdown image shortcuts into <img> tags.
 701 #
 702     my $text = shift;
 703
 704     #
 705     # First, handle reference-style labeled images: ![alt text][id]
 706     #
 707     $text =~ s{
 708         (               # wrap whole match in $1
 709           !\[
 710             (.*?)       # alt text = $2
 711           \]
 712
 713           [ ]?              # one optional space
 714           (?:\n[ ]*)?       # one optional newline followed by spaces
 715
 716           \[
 717             (.*?)       # id = $3
 718           \]
 719
 720         )
 721     }{
 722         my $result;
 723         my $whole_match = $1;
 724         my $alt_text    = $2;
 725         my $link_id     = lc $3;
 726
 727         if ($link_id eq "") {
 728             $link_id = lc $alt_text;     # for shortcut links like ![this][].
 729         }
 730
 731         $alt_text =~ s/"/&quot;/g;
 732         if (defined $g_urls{$link_id}) {
 733             my $url = _PrefixURL($g_urls{$link_id});
 734             $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
 735             $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics, bold
 736             $url =~ s!  ~ !$g_escape_table{'~'}!gx;     # and strike through.
 737             $result = "<img src=\"$url\" alt=\"$alt_text\"";
 738             if (defined $g_titles{$link_id}) {
 739                 my $title = $g_titles{$link_id};
 740                 $title =~ s! \* !$g_escape_table{'*'}!gx;
 741                 $title =~ s!  _ !$g_escape_table{'_'}!gx;
 742                 $title =~ s!  ~ !$g_escape_table{'~'}!gx;
 743                 $result .=  " title=\"$title\"";
 744             }
 745             $result .= $opt{empty_element_suffix};
 746         }
 747         else {
 748             # If there's no such link ID, leave intact:
 749             $result = $whole_match;
 750         }
 751
 752         $result;
 753     }xsge;
 754
 755     #
 756     # Next, handle inline images:  ![alt text](url "optional title")
 757     # Don't forget: encode * and _
 758
 759     $text =~ s{
 760         (               # wrap whole match in $1
 761           !\[
 762             (.*?)       # alt text = $2
 763           \]
 764           \(            # literal paren
 765             [ \t]*
 766             <?(\S+?)>?  # src url = $3
 767             [ \t]*
 768             (           # $4
 769               (['\042]) # quote char = $5
 770               (.*?)     # title = $6
 771               \5        # matching quote
 772               [ \t]*
 773             )?          # title is optional
 774           \)
 775         )
 776     }{
 777         my $result;
 778         my $whole_match = $1;
 779         my $alt_text    = $2;
 780         my $url     = $3;
 781         my $title   = '';
 782         if (defined($6)) {
 783             $title  = $6;
 784         }
 785
 786         $url = _PrefixURL($url);
 787         $alt_text =~ s/"/&quot;/g;
 788         $title    =~ s/"/&quot;/g;
 789         $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
 790         $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics, bold
 791         $url =~ s!  ~ !$g_escape_table{'~'}!gx;     # and strike through.
 792         $result = "<img src=\"$url\" alt=\"$alt_text\"";
 793         if (defined $title) {
 794             $title =~ s! \* !$g_escape_table{'*'}!gx;
 795             $title =~ s!  _ !$g_escape_table{'_'}!gx;
 796             $title =~ s!  ~ !$g_escape_table{'~'}!gx;
 797             $result .=  " title=\"$title\"";
 798         }
 799         $result .= $opt{empty_element_suffix};
 800
 801         $result;
 802     }xsge;
 803
 804     return $text;
 805 }
 806
 807
 808 sub _DoHeaders {
 809     my $text = shift;
 810
 811     # Setext-style headers:
 812     #     Header 1
 813     #     ========
 814     #
 815     #     Header 2
 816     #     --------
 817     #
 818     #     Header 3
 819     #     ~~~~~~~~
 820     #
 821     $text =~ s{ ^(?:=+[ \t]*\n)?(.+)[ \t]*\n=+[ \t]*\n+ }{
 822         "<h1>"  .  _RunSpanGamut($1)  .  "</h1>\n\n";
 823     }egmx;
 824
 825     $text =~ s{ ^(?:-+[ \t]*\n)?(.+)[ \t]*\n-+[ \t]*\n+ }{
 826         "<h2>"  .  _RunSpanGamut($1)  .  "</h2>\n\n";
 827     }egmx;
 828
 829     $text =~ s{ ^(?:~+[ \t]*\n)?(.+)[ \t]*\n~+[ \t]*\n+ }{
 830         "<h3>"  .  _RunSpanGamut($1)  .  "</h3>\n\n";
 831     }egmx;
 832
 833
 834     # atx-style headers:
 835     #   # Header 1
 836     #   ## Header 2
 837     #   ## Header 2 with closing hashes ##
 838     #   ...
 839     #   ###### Header 6
 840     #
 841     $text =~ s{
 842             ^(\#{1,6})  # $1 = string of #'s
 843             [ \t]*
 844             (.+?)       # $2 = Header text
 845             [ \t]*
 846             \#*         # optional closing #'s (not counted)
 847             \n+
 848         }{
 849             my $h_level = length($1);
 850             "<h$h_level>"  .  _RunSpanGamut($2)  .  "</h$h_level>\n\n";
 851         }egmx;
 852
 853     return $text;
 854 }
 855
 856
sub _DoLists {
#
# Form HTML ordered (numbered) and unordered (bulleted) lists.
#
#   Parameter:  Markdown text.
#   Returns:    The text with every recognized list replaced by
#               <ul>...</ul> or <ol>...</ol> markup; the items themselves
#               are rendered by _ProcessListItems().
#
#   The anchoring of the list pattern depends on $g_list_level: inside a
#   list a new list may start at the beginning of any line, otherwise it
#   must follow a blank line or open the document.
#
    my $text = shift;
    # A list marker may be indented by at most one column less than a tab.
    my $less_than_tab = $opt{tab_width} - 1;

    # Re-usable patterns to match list item bullets and number markers:
    my $marker_ul  = qr/[*+-]/;
    my $marker_ol  = qr/\d+[.]/;
    my $marker_any = qr/(?:$marker_ul|$marker_ol)/;

    # Re-usable pattern to match any entire ul or ol list:
    my $whole_list = qr{
        (                               # $1 = whole list
          (                             # $2
            [ ]{0,$less_than_tab}
            (${marker_any})             # $3 = first list item marker
            [ \t]+
          )
          (?s:.+?)
          (                             # $4
              \z
            |
              \n{2,}
              (?=\S)
              (?!                       # Negative lookahead for another list item marker
                [ \t]*
                ${marker_any}[ \t]+
              )
          )
        )
    }mx;

    # We use a different prefix before nested lists than top-level lists.
    # See extended comment in _ProcessListItems().
    #
    # Note: There's a bit of duplication here. My original implementation
    # created a scalar regex pattern as the conditional result of the test on
    # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
    # substitution once, using the scalar as the pattern. This worked
    # everywhere except when running under MT on my hosting account at Pair
    # Networks. There, this caused all rebuilds to be killed by the reaper (or
    # perhaps they crashed, but that seems incredibly unlikely given that the
    # same script on the same server ran fine *except* under MT). I've spent
    # more time trying to figure out why this is happening than I'd like to
    # admit. My only guess, backed up by the fact that this workaround works,
    # is that Perl optimizes the substitution when it can figure out that the
    # pattern will never change, and when this optimization isn't on, we run
    # afoul of the reaper. Thus, the slightly redundant code that uses two
    # static s/// patterns rather than one conditional pattern.

    if ($g_list_level) {
        # Nested: the list may begin at the start of any line.
        $text =~ s{
                ^
                $whole_list
            }{
                my $list = $1;
                my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
                # Turn double returns into triple returns, so that we can make a
                # paragraph for the last item in a list, if necessary:
                $list =~ s/\n{2,}/\n\n\n/g;
                my $result = _ProcessListItems($list, $marker_any);
                $result = "<$list_type>\n" . $result . "</$list_type>\n";
                $result;
            }egmx;
    }
    else {
        # Top level: the list must follow a blank line or open the document.
        $text =~ s{
                (?:(?<=\n\n)|\A\n?)
                $whole_list
            }{
                my $list = $1;
                my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
                # Turn double returns into triple returns, so that we can make a
                # paragraph for the last item in a list, if necessary:
                $list =~ s/\n{2,}/\n\n\n/g;
                my $result = _ProcessListItems($list, $marker_any);
                $result = "<$list_type>\n" . $result . "</$list_type>\n";
                $result;
            }egmx;
    }


    return $text;
}
 943
 944
 945 sub _ProcessListItems {
 946 #
 947 #   Process the contents of a single ordered or unordered list, splitting it
 948 #   into individual list items.
 949 #
 950
 951     my $list_str = shift;
 952     my $marker_any = shift;
 953
 954
 955     # The $g_list_level global keeps track of when we're inside a list.
 956     # Each time we enter a list, we increment it; when we leave a list,
 957     # we decrement. If it's zero, we're not in a list anymore.
 958     #
 959     # We do this because when we're not inside a list, we want to treat
 960     # something like this:
 961     #
 962     #       I recommend upgrading to version
 963     #       8. Oops, now this line is treated
 964     #       as a sub-list.
 965     #
 966     # As a single paragraph, despite the fact that the second line starts
 967     # with a digit-period-space sequence.
 968     #
 969     # Whereas when we're inside a list (or sub-list), that line will be
 970     # treated as the start of a sub-list. What a kludge, huh? This is
 971     # an aspect of Markdown's syntax that's hard to parse perfectly
 972     # without resorting to mind-reading. Perhaps the solution is to
 973     # change the syntax rules such that sub-lists must start with a
 974     # starting cardinal number; e.g. "1." or "a.".
 975
 976     $g_list_level++;
 977
 978     # trim trailing blank lines:
 979     $list_str =~ s/\n{2,}\z/\n/;
 980
 981
 982     $list_str =~ s{
 983         (\n)?                           # leading line = $1
 984         (^[ \t]*)                       # leading whitespace = $2
 985         ($marker_any) [ \t]+            # list marker = $3
 986         ((?s:.+?)                       # list item text   = $4
 987         (\n{1,2}))
 988         (?= \n* (\z | \2 ($marker_any) [ \t]+))
 989     }{
 990         my $item = $4;
 991         my $leading_line = $1;
 992         my $leading_space = $2;
 993
 994         if ($leading_line or ($item =~ m/\n{2,}/)) {
 995             $item = _RunBlockGamut(_Outdent($item));
 996         }
 997         else {
 998             # Recursion for sub-lists:
 999             $item = _DoLists(_Outdent($item));
1000             chomp $item;
1001             $item = _RunSpanGamut($item);
1002         }
1003
1004         "<li>" . $item . "</li>\n";
1005     }egmx;
1006
1007     $g_list_level--;
1008     return $list_str;
1009 }
1010
1011
1012
sub _DoCodeBlocks {
#
#   Turn indented Markdown code blocks into `<pre><code>` blocks.
#
#   Parameter:  Markdown text.
#   Returns:    The text with each run of lines indented by a tab (or by
#               $opt{tab_width} spaces) replaced by an encoded
#               <pre><code> element.
#

    my $text = shift;

    $text =~ s{
            (?:\n\n|\A)
            (               # $1 = the code block -- one or more lines, starting with a space/tab
              (?:
                (?:[ ]{$opt{tab_width}} | \t)  # Lines must start with a tab or a tab-width of spaces
                .*\n+
              )+
            )
            ((?=^[ ]{0,$opt{tab_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
        }{
            my $raw = $1;

            # Remove one indent level, neutralize markup, expand tabs:
            my $code = _Detab(_EncodeCode(_Outdent($raw)));

            $code =~ s/\A\n+//; # no blank lines at the top
            $code =~ s/\s+\z//; # no whitespace at the bottom

            "\n\n<pre><code>" . $code . "\n</code></pre>\n\n";
        }egmx;

    return $text;
}
1045
1046
sub _DoCodeSpans {
#
#   Convert backtick-quoted runs into <code></code> spans.
#
#   *   A run of several backticks may serve as the delimiter, which
#       allows literal backticks inside the span. This input:
#
#         Just type ``foo `bar` baz`` at the prompt.
#
#       becomes:
#
#         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
#
#       The delimiter may be as long as needed: to show three
#       consecutive backticks, delimit the span with four, and so on.
#
#   *   Spaces and tabs next to the delimiters are dropped, so a
#       literal backtick can sit at either edge of the span:
#
#         ... type `` `bar` `` ...
#
#       becomes:
#
#         ... type <code>`bar`</code> ...
#

    my $text = shift;

    $text =~ s{
            (`+)        # $1 = Opening run of `
            (.+?)       # $2 = The code block
            (?<!`)
            \1          # Matching closer
            (?!`)
        }{
            # Copy the capture before the nested substitutions reset it.
            my $span = $2;
            for ($span) {
                s/^[ \t]*//g; # leading whitespace
                s/[ \t]*$//g; # trailing whitespace
            }
            '<code>' . _EncodeCode($span) . '</code>';
        }egsx;

    return $text;
}
1091
1092
sub _EncodeCode {
#
# Make the content of a code span or code block literal: HTML-significant
# characters become entities and characters that are magic in Markdown are
# swapped for their placeholder from %g_escape_table.
#
#   Parameter:  The raw code text.
#   Returns:    The encoded text.
#
    my $code = shift;

    # Ampersands go first: inside code an entity is not an entity, and
    # the entities produced below must not be encoded a second time.
    $code =~ s/&/&amp;/g;

    # Under Blosxom (which interpolates Perl variables in article
    # bodies) dollar signs have to be hidden as well.
    {
        no warnings 'once';
        $code =~ s/\$/&#036;/g if defined($blosxom::version);
    }

    # Angle brackets:
    $code =~ s/</&lt;/g;
    $code =~ s/>/&gt;/g;

    # Characters with a special meaning in Markdown:
    foreach my $magic ('*', '_', '~', '{', '}', '[', ']', '\\') {
        $code =~ s/\Q$magic\E/$g_escape_table{$magic}/g;
    }

    return $code;
}
1131
1132
sub _DoItalicsAndBoldAndStrike {
#
# Convert **bold** / __bold__, ~~strike~~ and *em* / _em_ markup into
# <strong>, <strike> and <em> elements.
#
#   Parameter:  Span-level text.
#   Returns:    The text with the emphasis markup converted.
#
    my $text = shift;

    # Applied strictly in this order: the doubled delimiters have to be
    # consumed before the single ones get a chance to match.
    my @rules = (
        [ qr{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }sx,                'strong' ],
        [ qr{ (?<!\w) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\w) }sx,     'strong' ],
        [ qr{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ }sx,                    'strike' ],
        [ qr{ \* (?=\S) (.+?) (?<=\S) \* }sx,                         'em'     ],
        [ qr{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }sx,            'em'     ],
    );

    foreach my $rule (@rules) {
        my ($pattern, $tag) = @$rule;
        $text =~ s{$pattern}{<$tag>$1</$tag>}g;
    }

    return $text;
}
1152
1153
sub _DoBlockQuotes {
#
# Convert email-style quoted blocks (lines starting with '>') into
# <blockquote> elements, running the block gamut on the quoted text.
#
#   Parameter:  Markdown text.
#   Returns:    The text with every quoted block replaced by
#               "<blockquote>\n...\n</blockquote>\n\n".
#
    my $text = shift;

    $text =~ s{
          (                             # Wrap whole match in $1
            (
              ^[ \t]*>[ \t]?            # '>' at the start of a line
                .+\n                    # rest of the first line
              (.+\n)*                   # subsequent consecutive lines
              \n*                       # blanks
            )+
          )
        }{
            my $bq = $1;
            $bq =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
            $bq =~ s/^[ \t]+$//mg;          # trim whitespace-only lines
            $bq = _RunBlockGamut($bq);      # recurse

            # Indent every line of the rendered content by two spaces.
            # The /m modifier is essential: without it only the first
            # line is indented, while the <pre> fix-up below removes two
            # spaces from *every* line of a <pre> block and thereby eats
            # genuine indentation of quoted code.
            $bq =~ s/^/  /mg;
            # These leading spaces screw with <pre> content, so we need to fix that:
            $bq =~ s{
                    (\s*<pre>.+?</pre>)
                }{
                    my $pre = $1;
                    $pre =~ s/^  //mg;
                    $pre;
                }egsx;

            "<blockquote>\n$bq\n</blockquote>\n\n";
        }egmx;


    return $text;
}
1188
1189
sub _FormParagraphs {
#
#   Wrap the paragraphs of a text in <p> tags and put hashed HTML blocks
#   back in place.
#
#   Params:
#       $text - string to process with html <p> tags
#   Returns:
#       the processed chunks joined by blank lines
#
    my $text = shift;

    # Blank lines at either end would only produce empty chunks:
    for ($text) {
        s/\A\n+//;
        s/\n+\z//;
    }

    my @chunks = split(/\n{2,}/, $text);

    # First pass: every chunk that is not the key of a stored HTML block
    # is an ordinary paragraph.
    foreach my $chunk (@chunks) {
        next if defined( $g_html_blocks{$chunk} );
        $chunk = _RunSpanGamut($chunk);
        $chunk =~ s/^([ \t]*)/<p>/;
        $chunk .= "</p>";
    }

    # Second pass: swap the remaining keys for the HTML they stand for.
    foreach my $chunk (@chunks) {
        my $html = $g_html_blocks{$chunk};
        $chunk = $html if defined $html;
    }

    return join("\n\n", @chunks);
}
1225
1226
sub _EncodeAmpsAndAngles {
#
# Encode only those ampersands and left angle brackets that are not
# already part of an entity or of a tag.
#
#   Parameter:  Text that may contain HTML.
#   Returns:    The text with stray '&' and '<' turned into entities.
#
    my ($html) = @_;

    for ($html) {
        # An ampersand that does not start an entity (approach taken
        # from Nat Irons's Amputator MT plugin,
        # http://bumppo.net/projects/amputator/):
        s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;

        # A '<' that cannot be the start of a tag:
        s{<(?![a-z/?\$!])}{&lt;}gi;
    }

    return $html;
}
1241
1242
sub _EncodeBackslashEscapes {
#
#   Parameter:  String.
#   Returns:    The string with every supported backslash escape sequence
#               replaced by the placeholder of the escaped character
#               taken from %g_escape_table.
#
    my $text = shift;

    # The backslash itself has to be handled first, so that an escaped
    # backslash is never taken for the start of another escape.
    my @escapable = (
        '\\', '`', '*', '_', '~', '{', '}', '[', ']',
        '(',  ')', '>', '#', '+', '-', '.', '!',
    );

    foreach my $char (@escapable) {
        # A literal backslash followed by the (quoted) character:
        my $sequence = '\\\\' . quotemeta($char);
        $text =~ s/$sequence/$g_escape_table{$char}/g;
    }

    return $text;
}
1271
1272
sub _DoAutoLinks {
#
# Turn <http://...>, <https://...>, <ftp://...> and <address@domain.foo>
# into links.
#
#   Parameter:  Span-level text.
#   Returns:    The text with automatic links converted to <a> elements;
#               email links are produced by _EncodeEmailAddress().
#
    my $text = shift;

    # URLs are linked verbatim:
    my $url_link = qr{<((https?|ftp):[^'\042>\s]+)>}i;
    $text =~ s{$url_link}{<a href="$1">$1</a>}g;

    # Email addresses: <address@domain.foo>
    my $mail_link = qr{
        <
        (?:mailto:)?
        (
            [-.\w]+
            \@
            [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
        )
        >
    }ix;
    $text =~ s{$mail_link}{
        my $address = _UnescapeSpecialChars($1);
        _EncodeEmailAddress($address);
    }eg;

    return $text;
}
1294
1295
sub _EncodeEmailAddress {
#
#   Input: an email address, e.g. "foo@example.com"
#
#   Output: the email address as a mailto link, with each character
#       of the address encoded as either a decimal or hex entity, in
#       the hopes of foiling most address harvesting spam bots. E.g.:
#
#     <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
#       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
#       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
#
#   The encoding is chosen at random per character, so the exact output
#   differs from call to call. The random number generator is not
#   reseeded here: Perl seeds it automatically on the first rand(), and
#   a seed chosen by the caller is respected.
#
#   Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
#   mailing list: <http://tinyurl.com/yu7ue>
#

    my $addr = shift;

    my @encode = (
        sub { '&#' .                 ord(shift)   . ';' },   # decimal entity
        sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },   # hex entity
        sub {                            shift          },   # raw character
    );

    $addr = "mailto:" . $addr;

    $addr =~ s{(.)}{
        my $char = $1;
        if ( $char eq '@' ) {
            # this *must* be encoded. I insist.
            # Pick the decimal or the hex encoder with equal probability.
            $char = $encode[int rand 2]->($char);
        } elsif ( $char ne ':' ) {
            # leave ':' alone (to spot mailto: later)
            my $r = rand;
            # roughly 10% raw, 45% hex, 45% dec
            $char = (
                $r > .9   ?  $encode[2]->($char)  :
                $r < .45  ?  $encode[1]->($char)  :
                             $encode[0]->($char)
            );
        }
        $char;
    }gex;

    $addr = qq{<a href="$addr">$addr</a>};
    $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part

    return $addr;
}
1346
1347
sub _UnescapeSpecialChars {
#
# Put the special characters back in place of the placeholders stored
# in %g_escape_table.
#
#   Parameter:  Text containing placeholders.
#   Returns:    The text with every placeholder replaced by its character.
#
    my $text = shift;

    foreach my $char (keys %g_escape_table) {
        my $placeholder = $g_escape_table{$char};
        $text =~ s/$placeholder/$char/g;
    }

    return $text;
}
1359
1360
sub _TokenizeHTML {
#
#   Parameter:  String containing HTML markup.
#   Returns:    Reference to an array of tokens covering the input string
#               in order. A token is either a tag (which may contain
#               nested tags, as in <a href="<MTFoo>">), or the run of
#               text between two tags. Every token is a two-element
#               array: 'tag' or 'text', followed by the matched string.
#
#   Comments and processing instructions count as tags.
#
#   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
#       <http://www.bradchoate.com/past/mtregex.php>
#

    my $str = shift;
    my @tokens;
    my $cursor = 0;     # offset just past the last token emitted

    # A tag may contain other tags, up to $depth levels deep:
    my $depth = 6;
    my $tag_open  = '(?:<[a-z/!$](?:[^<>]';
    my $tag_close = ')*>)';
    my $nested_tags = join('|', ($tag_open) x $depth) . ($tag_close x $depth);
    my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
                   (?s: <\? .*? \?> ) |              # processing instruction
                   $nested_tags/ix;                   # nested tags

    while ($str =~ m/($match)/g) {
        my $tag = $1;
        my $tag_end = pos $str;
        my $tag_begin = $tag_end - length $tag;

        # Whatever lies between the previous tag and this one is text.
        push @tokens, ['text', substr($str, $cursor, $tag_begin - $cursor)]
            if $tag_begin > $cursor;
        push @tokens, ['tag', $tag];
        $cursor = $tag_end;
    }

    # Text trailing the last tag (or the whole string if there is no tag):
    push @tokens, ['text', substr($str, $cursor)] if $cursor < length $str;

    return \@tokens;
}
1400
1401
sub _Outdent {
#
# Strip one indentation level from the start of every line: either a
# single tab or up to $opt{tab_width} spaces.
#
    (my $outdented = shift) =~ s/^(\t|[ ]{1,$opt{tab_width}})//gm;

    return $outdented;
}
1411
1412
sub _Detab {
#
# Replace tabs by the number of spaces needed to reach the next tab stop.
# Tab stops are $opt{tab_width} columns apart unless a (true) width is
# passed as the second argument.
#
    my ($text, $width) = @_;
    $width ||= $opt{tab_width};

    # Adapted from the "Fluent Perl" section of the Perl camel book.
    $text =~ s{(.*?)(\t+)}{
        my ($before, $tabs) = ($1, $2);
        $before . ' ' x (length($tabs) * $width - length($before) % $width);
    }ge;

    return $text;
}
1423
1424
sub _PrefixURL {
#
# Add URL prefix if needed
#
#   Parameter:  A URL as found in the document.
#   Returns:    The URL unchanged when no prefix is configured, or when
#               the URL starts with "//" or with a scheme ("http:",
#               "mailto:", ...). Otherwise the URL with $opt{url_prefix}
#               prepended, or $opt{img_prefix} if that is set and the URL
#               ends in an image suffix (png, gif, jpg, jpeg, svg, svgz).
#               Exactly one "/" separates prefix and URL.
#
    my $url = shift;

    return $url unless $opt{url_prefix} ne '' || $opt{img_prefix} ne '';
    return $url if $url =~ m,^//, || $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/;
    my $ans = $opt{url_prefix};
    # "svgz?" accepts both .svg and its compressed form .svgz.
    $ans = $opt{img_prefix}
        if $opt{img_prefix} ne '' && $url =~ /\.(?:png|gif|jpe?g|svgz?)$/i;
    return $url unless $ans ne '';
    $ans .= '/' if substr($ans, -1, 1) ne '/';
    $ans .= substr($url, 0, 1) eq '/' ? substr($url, 1) : $url;
    return $ans;
}
1441
1442
1443 1;
1444
1445 __DATA__
1446
1447
1448 =pod
1449
1450 =head1 NAME
1451
1452 B<Markdown>
1453
1454
1455 =head1 SYNOPSIS
1456
1457 B<Markdown.pl> [ B<--help> ] [ B<--html4tags> ] [ B<--htmlroot>=I<prefix> ]
1458     [ B<--imageroot>=I<prefix> ] [ B<--version> ] [ B<--shortversion> ]
1459     [ I<file> ... ]
1460
1461
1462 =head1 DESCRIPTION
1463
1464 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1465 easy-to-write structured text format into HTML. Markdown's text format
1466 is most similar to that of plain text email, and supports features such
1467 as headers, *emphasis*, code blocks, blockquotes, and links.
1468
1469 Markdown's syntax is designed not as a generic markup language, but
specifically to serve as a front-end to (X)HTML. You can use span-level
HTML tags anywhere in a Markdown document, and you can use block-level
HTML tags (like <div> and <table>) as well.
1473
1474 For more information about Markdown's syntax, see the `basics.text`
1475 and `syntax.text` files included with `Markdown.pl`.
1476
1477
1478 =head1 OPTIONS
1479
1480 Use "--" to end switch parsing. For example, to open a file named "-z", use:
1481
1482     Markdown.pl -- -z
1483
1484 =over 4
1485
1486
1487 =item B<--html4tags>
1488
1489 Use HTML 4 style for empty element tags, e.g.:
1490
1491     <br>
1492
1493 instead of Markdown's default XHTML style tags, e.g.:
1494
1495     <br />
1496
1497
1498 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
1499
1500 Any non-absolute URLs have I<prefix> prepended.
1501
1502
1503 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
1504
1505 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
1506 if any) but only if they end in an image suffix.
1507
1508
1509 =item B<-V>, B<--version>
1510
1511 Display Markdown's version number and copyright information.
1512
1513
1514 =item B<-s>, B<--shortversion>
1515
1516 Display the short-form version number.
1517
1518
1519 =item B<-h>, B<--help>
1520
1521 Display Markdown's help.
1522
1523
1524 =back
1525
1526
1527 =head1 VERSION HISTORY
1528
1529 See the readme file for detailed release notes for this version.
1530
1531 1.0.4 - 05 Jun 2016
1532
1533 1.0.3 - 06 Sep 2015
1534
1535 1.0.2 - 03 Sep 2015
1536
1537 1.0.1 - 14 Dec 2004
1538
1539 1.0 - 28 Aug 2004
1540
1541
1542 =head1 AUTHORS
1543
1544     John Gruber
1545     http://daringfireball.net
1546     http://daringfireball.net/projects/markdown/
1547
1548     PHP port and other contributions by Michel Fortin
1549     http://michelf.com
1550
1551     Additional enhancements and tweaks by Kyle J. McKay
1552     mackyle<at>gmail.com
1553
1554
1555 =head1 COPYRIGHT AND LICENSE
1556
1557  Copyright (C) 2003-2004 John Gruber
1558  Copyright (C) 2015,2016 Kyle J. McKay
1559  All rights reserved.
1560
1561 Redistribution and use in source and binary forms, with or without
1562 modification, are permitted provided that the following conditions are
1563 met:
1564
1565 * Redistributions of source code must retain the above copyright
1566   notice, this list of conditions and the following disclaimer.
1567
1568 * Redistributions in binary form must reproduce the above copyright
1569   notice, this list of conditions and the following disclaimer in the
1570   documentation and/or other materials provided with the distribution.
1571
1572 * Neither the name "Markdown" nor the names of its contributors may
1573   be used to endorse or promote products derived from this software
1574   without specific prior written permission.
1575
1576 This software is provided by the copyright holders and contributors "as
1577 is" and any express or implied warranties, including, but not limited
1578 to, the implied warranties of merchantability and fitness for a
1579 particular purpose are disclaimed. In no event shall the copyright owner
1580 or contributors be liable for any direct, indirect, incidental, special,
1581 exemplary, or consequential damages (including, but not limited to,
1582 procurement of substitute goods or services; loss of use, data, or
1583 profits; or business interruption) however caused and on any theory of
1584 liability, whether in contract, strict liability, or tort (including
1585 negligence or otherwise) arising in any way out of the use of this
1586 software, even if advised of the possibility of such damage.
1587
1588 =cut