lib/Semece/Markdown.pm

   1 # my patched markdown version, it basically adds a '&nbsp;&nbsp' after each
   2 # '<p>'
   3 package Text::Markdown;
   4 require 5.008_000;
   5 use strict;
   6 use warnings;
   7 use re 'eval';
   8
   9 use Digest::MD5 qw(md5_hex);
  10 use Encode      qw();
  11 use Carp        qw(croak);
  12 use base        'Exporter';
  13
  14 our $VERSION   = '1.0.24';
  15 our @EXPORT_OK = qw(markdown);
  16
  17 =head1 NAME
  18
  19 Text::Markdown - Convert Markdown syntax to (X)HTML
  20
  21 =head1 SYNOPSIS
  22
  23     use Text::Markdown 'markdown';
  24     my $html = markdown($text);
  25
  26     use Text::Markdown 'markdown';
  27     my $html = markdown( $text, {
  28         empty_element_suffix => '>',
  29         tab_width => 2,
  30     } );
  31
  32     use Text::Markdown;
  33     my $m = Text::Markdown->new;
  34     my $html = $m->markdown($text);
  35
  36     use Text::Markdown;
  37     my $m = Text::Markdown->new(
  38         empty_element_suffix => '>',
  39         tab_width => 2,
  40     );
  41     my $html = $m->markdown( $text );
  42
  43 =head1 DESCRIPTION
  44
  45 Markdown is a text-to-HTML filter; it translates an easy-to-read /
  46 easy-to-write structured text format into HTML. Markdown's text format
  47 is most similar to that of plain text email, and supports features such
  48 as headers, *emphasis*, code blocks, blockquotes, and links.
  49
  50 Markdown's syntax is designed not as a generic markup language, but
  51 specifically to serve as a front-end to (X)HTML. You can use span-level
  52 HTML tags anywhere in a Markdown document, and you can use block level
  53 HTML tags (like <div> and <table>) as well.
  54
  55 =head1 SYNTAX
  56
  57 This module implements the 'original' Markdown syntax from:
  58
  59     http://daringfireball.net/projects/markdown/
  60
  61 =head1 OPTIONS
  62
  63 Text::Markdown supports a number of options to its processor which control the behaviour of the output document.
  64
  65 These options can be supplied to the constructor, or in a hash with the individual calls to the markdown method.
  66 See the synopsis for examples of both of the above styles.
  67
  68 The options for the processor are:
  69
  70 =over
  71
  72 =item empty_element_suffix
  73
  74 This option can be used to generate normal HTML output. By default it is ' />', which is XHTML; change it to '>' for normal HTML.
  75
  76 =item tab_width
  77
  78 Controls indent width in the generated markup, defaults to 4
  79
  80 =item markdown_in_html_blocks
  81
  82 Controls if Markdown is processed when inside HTML blocks. Defaults to 0.
  83
  84 =item trust_list_start_value
  85
  86 If true, ordered lists will use the first number as the starting point for
  87 numbering.  This will let you pick up where you left off by writing:
  88
  89   1. foo
  90   2. bar
  91
  92   some paragraph
  93
  94   3. baz
  95   6. quux
  96
  97 (Note that in the above, quux will be numbered 4.)
  98
  99 =back
 100
 101 =head1 METHODS
 102
 103 =cut
 104
 105 # Regexes to match balanced, arbitrarily nested [brackets] and (parens).
 106 # See Friedl's "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
     # Each pattern refers to itself through a postponed (??{ ... })
     # subexpression, which is why both variables are declared before either
     # of them is assigned.
 107 our ($g_nested_brackets, $g_nested_parens);
 108 $g_nested_brackets = qr{
 109     (?>                                 # Atomic matching
 110        [^\[\]]+                         # Anything other than brackets
 111      |
 112        \[
 113          (??{ $g_nested_brackets })     # Recursive set of nested brackets
 114        \]
 115     )*
 116 }x;
 117 # Same idea for (parens). Whitespace is not allowed inside, because this
     # pattern is used to match URLs:
 118 $g_nested_parens = qr{
 119         (?>                                                             # Atomic matching
 120            [^()\s]+                                                     # Anything other than parens or whitespace
 121          |
 122            \(
 123                  (??{ $g_nested_parens })               # Recursive set of nested brackets
 124            \)
 125         )*
 126 }x;
 127
 # Lookup table: each escapable Markdown metacharacter maps to the MD5 hex
 # digest that is substituted for it elsewhere in this module.
 our %g_escape_table = map { ( $_ => md5_hex($_) ) } split //, '\\`*_{}[]()>#+-.!';
 133
 134 =head1 METHODS
 135
 136 =head2 new
 137
 138 A simple constructor, see the SYNTAX and OPTIONS sections for more information.
 139
 140 =cut
 141
 sub new {
     # Constructor. Takes the invocant (class name or existing object) followed
     # by a flat list of option => value pairs, normalises the options and
     # returns a blessed hash whose {params} entry holds them.
     my ($class, %p) = @_;

     $p{base_url} ||= ''; # This is the base url to be used for WikiLinks

     # tab_width must be a positive integer. The markdown passes build
     # quantifiers from `tab_width - 1`, so 0 would produce `{0,-1}`, which
     # Perl does not treat as a quantifier. \A...\z (rather than ^...$) also
     # rejects a value with a trailing newline. Anything invalid falls back
     # to the default of 4.
     $p{tab_width} = 4
         unless defined $p{tab_width}
             && $p{tab_width} =~ m/\A\d+\z/
             && $p{tab_width} > 0;

     $p{empty_element_suffix} ||= ' />'; # Change to ">" for HTML output

     # Is markdown processed in HTML blocks? See t/15inlinehtmldonotturnoffmarkdown.t
     $p{markdown_in_html_blocks} = $p{markdown_in_html_blocks} ? 1 : 0;

     # Normalise to a strict 0/1 flag as well.
     $p{trust_list_start_value} = $p{trust_list_start_value} ? 1 : 0;

     my $self = { params => \%p };
     bless $self, ref($class) || $class;
     return $self;
 }
 160
 161 =head2 markdown
 162
 163 The main function as far as the outside world is concerned. See the SYNOPSIS
 164 for details on use.
 165
 166 =cut
 167
 sub markdown {
     # Public entry point: converts $text to HTML. Works as an object method or,
     # when the first argument is plain text rather than an object, as an
     # exported function.
     my ( $self, $text, $options ) = @_;

     if ( !ref $self ) {
         # A bare class name means someone called Text::Markdown->markdown(...).
         croak('Calling ' . $self . '->markdown (as a class method) is not supported.')
             if $self eq __PACKAGE__;

         # Functional mode: the arguments are shifted by one, so $self is the
         # text and $text is the options hash. Run on a throw-away instance.
         my $instance = __PACKAGE__->new();
         return $instance->markdown($self, $text);
     }

     $options ||= {};

     # Rebuild the object for this run: constructor defaults first, per-call
     # options layered on top, and the defaults kept under {params}.
     my $defaults = $self->{params};
     %$self = ( %$defaults, %$options, params => $defaults );

     $self->_CleanUpRunData($options);

     return $self->_Markdown($text);
 }
 191
 sub _CleanUpRunData {
     my ($self, $options) = @_;

     # Reset the per-run state. Without this, documents rendered one after
     # another with the same object (e.g. an index page showing the N most
     # recent articles) would see each other's link definitions.
     $self->{_urls} = $options->{urls} || {}; # FIXME - document passing this option (tested in 05options.t).
     @{$self}{qw(_titles _html_blocks)} = ( {}, {} );

     # Nesting depth of the list currently being processed
     # (see _ProcessListItems() for details).
     return $self->{_list_level} = 0;
 }
 206
 sub _Markdown {
 #
 # Drives the whole conversion. The order of the passes is essential: link
 # and image substitutions need to happen before _EscapeSpecialChars(), so
 # that any *'s or _'s in the <a> and <img> tags get encoded.
 #
     my ($self, $text, $options) = @_;

     $text = $self->_CleanUpDoc($text);

     # Turn block-level HTML blocks into hash entries, unless Markdown is to be
     # processed inside them.
     if ( !$self->{markdown_in_html_blocks} ) {
         $text = $self->_HashHTMLBlocks($text);
     }

     # The remaining passes each take the text and hand back the next version.
     my @passes = qw(
         _StripLinkDefinitions
         _RunBlockGamut
         _UnescapeSpecialChars
         _ConvertCopyright
     );
     for my $pass (@passes) {
         $text = $self->$pass($text);
     }

     return $text . "\n";
 }
 231
 232 =head2 urls
 233
 234 Returns a reference to a hash with the key being the markdown reference and the value being the URL.
 235
 236 Useful for building scripts which preprocess a list of links before the main content. See t/05options.t
 237 for an example of this hashref being passed back into the markdown method to create links.
 238
 239 =cut
 240
 sub urls {
     # Accessor: hands back the hashref stored under {_urls}.
     my $self = shift;
     return $self->{_urls};
 }
 246
 sub _CleanUpDoc {
     # Normalises the raw input before any Markdown processing happens.
     my ($self, $text) = @_;

     # Standardize line endings.
     for ($text) {
         s{\r\n}{\n}g;  # DOS to Unix
         s{\r}{\n}g;    # Mac to Unix
     }

     # Make sure the text ends with a couple of newlines, then convert all tabs
     # to spaces.
     my $clean = $self->_Detab($text . "\n\n");

     # Empty out lines consisting only of spaces and tabs, so that later
     # patterns can match consecutive blank lines with /\n+/ instead of
     # something contorted like /[ \t]*\n+/ .
     $clean =~ s/^[ \t]+$//mg;

     return $clean;
 }
 268
 sub _StripLinkDefinitions {
 #
 # Strips link definitions of the form
 #     [id]: url "optional title"
 # from $text. URLs are stored in $self->{_urls} and titles in
 # $self->{_titles}, both keyed by the lower-cased id. Returns the text with
 # the definitions removed.
 #
     my ($self, $text) = @_;
     my $less_than_tab = $self->{tab_width} - 1;

     # No /o here: the pattern interpolates $less_than_tab, and /o would freeze
     # the value seen on the first call for every later call, whatever that
     # instance's tab_width is.
     while ($text =~ s{
             ^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1
               [ \t]*
               \n?               # maybe *one* newline
               [ \t]*
             <?(\S+?)>?          # url = \$2
               [ \t]*
               \n?               # maybe one newline
               [ \t]*
             (?:
                 (?<=\s)         # lookbehind for whitespace
                 ["(]
                 (.+?)           # title = \$3
                 [")]
                 [ \t]*
             )?  # title is optional
             (?:\n+|\Z)
         }{}mx) {
         # Copy the captures before running any other regex.
         my ($id, $url, $title) = (lc $1, $2, $3);    # Link IDs are case-insensitive

         $self->{_urls}{$id} = $self->_EncodeAmpsAndAngles($url);

         # Test definedness, not truth, so that a title of "0" is kept.
         if (defined $title) {
             $title =~ s/"/&quot;/g;
             $self->{_titles}{$id} = $title;
         }
     }

     return $text;
 }
 306
 sub _md5_utf8 {
     # MD5-sums a chunk of input that might be Unicode in Perl's internal
     # representation. Character strings are encoded to UTF-8 octets first;
     # anything else is digested as is. Returns nothing for undef input.
     my ($input) = @_;
     return if !defined $input;

     my $octets = Encode::is_utf8($input)
         ? Encode::encode('utf8', $input)
         : $input;

     return md5_hex($octets);
 }
 318
 319 sub _HashHTMLBlocks {
         #
         # Replaces raw block-level HTML elements in $text with placeholder
         # keys (the MD5 of the element's text, padded with blank lines). The
         # original markup is stored in $self->{_html_blocks}{$key}. <hr>
         # tags, HTML comments and <? ?> / <% %> blocks are then handled by
         # _HashHR, _HashHTMLComments and _HashPHPASPBlocks. Returns the
         # rewritten text.
         #
 320     my ($self, $text) = @_;
 321     my $less_than_tab = $self->{tab_width} - 1;
 322
 323         # Hashify HTML blocks:
 324         # We only want to do this for block-level HTML tags, such as headers,
 325         # lists, and tables. That's because we still want to wrap <p>s around
 326         # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
 327         # phrase emphasis, and spans. The list of tags we're looking for is
 328         # hard-coded:
 329         my $block_tags = qr{
 330                   (?:
 331                         p         |  div     |  h[1-6]  |  blockquote  |  pre       |  table  |
 332                         dl        |  ol      |  ul      |  script      |  noscript  |  form   |
 333                         fieldset  |  iframe  |  math    |  ins         |  del
 334                   )
 335                 }x;
 336
 337         my $tag_attrs = qr{
 338                                                 (?:                             # Match one attr name/value pair
 339                                                         \s+                             # There needs to be at least some whitespace
 340                                                                                         # before each attribute name.
 341                                                         [\w.:_-]+               # Attribute name
 342                                                         \s*=\s*
 343                                                         (?:
 344                                                                 ".+?"           # "Attribute value"
 345                                                          |
 346                                                                 '.+?'           # 'Attribute value'
 347                                                         )
 348                                                 )*                              # Zero or more
 349                                         }x;
 350
 351         my $empty_tag = qr{< \w+ $tag_attrs \s* />}oxms;
 352         my $open_tag =  qr{< $block_tags $tag_attrs \s* >}oxms;
 353         my $close_tag = undef;  # let Text::Balanced handle this
 354
 355         use Text::Balanced qw(gen_extract_tagged);
 356         my $extract_block = gen_extract_tagged($open_tag, $close_tag, undef, { ignore => [$empty_tag] });
 357
         # Consume $text one line at a time. $2 is defined only when the line
         # begins, after at most $less_than_tab spaces, with '<'.
 358         my @chunks;
 359         while ($text =~ s{^(([ ]{0,$less_than_tab}<)?.*\n)}{}m) {
 360                 my $cur_line = $1;
 361                 if (defined $2) {
 362                         # current line could be the start of an HTML block
 363
                         # Try to extract one balanced element from the current
                         # line plus everything that follows it.
 364                         my ($tag, $remainder) = $extract_block->($cur_line . $text);
 365                         if ($tag) {
 366                                 my $key = _md5_utf8($tag);
 367                                 $self->{_html_blocks}{$key} = $tag;
 368                                 push @chunks, "\n\n" . $key . "\n\n";
 369                                 $text = $remainder;
 370                         }
 371                         else {
 372                                 # No tag match, so toss $cur_line into @chunks
 373                                 push @chunks, $cur_line;
 374                         }
 375                 }
 376                 else {
 377                         # current line could NOT be the start of an HTML block
 378                         push @chunks, $cur_line;
 379                 }
 380
 381         }
 382         push @chunks, $text; # Whatever is left.
 383
 384         $text = join '', @chunks;
 385
 386         # Special case just for <hr />. It was easier to make a special case than
 387         # to make the other regex more complicated.
 388         $text = $self->_HashHR($text);
 389
 390     $text = $self->_HashHTMLComments($text);
 391
 392     $text = $self->_HashPHPASPBlocks($text);
 393
 394         return $text;
 395 }
 396
 397 sub _HashHR {
         #
         # Replaces each standalone <hr> tag with a placeholder key (the MD5
         # of the matched text, padded with blank lines) and stores the
         # original in $self->{_html_blocks}{$key}. A tag is standalone when
         # it starts after a blank line or at the beginning of the document,
         # is indented by at most $less_than_tab spaces, and is followed by a
         # blank line or the end of the document. Returns the rewritten text.
         #
 398     my ($self, $text) = @_;
 399     my $less_than_tab = $self->{tab_width} - 1;
 400
 401         $text =~ s{
 402                                 (?:
 403                                         (?<=\n\n)               # Starting after a blank line
 404                                         |                               # or
 405                                         \A\n?                   # the beginning of the doc
 406                                 )
 407                                 (                                               # save in $1
 408                                         [ ]{0,$less_than_tab}
 409                                         <(hr)                           # start tag = $2
 410                                         \b                                      # word break
 411                                         ([^<>])*?                       #
 412                                         /?>                                     # the matching end tag
 413                                         [ \t]*
 414                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 415                                 )
 416         }{
 417                 my $key = _md5_utf8($1);
 418                 $self->{_html_blocks}{$key} = $1;
 419                 "\n\n" . $key . "\n\n";
 420         }egx;
 421
 422         return $text;
 423 }
 424
 425 sub _HashHTMLComments {
         #
         # Replaces each standalone HTML comment (<!-- ... -->, possibly
         # spanning several lines) with a placeholder key (the MD5 of the
         # matched text, padded with blank lines) and stores the original in
         # $self->{_html_blocks}{$key}. Returns the rewritten text.
         #
 426     my ($self, $text) = @_;
 427     my $less_than_tab = $self->{tab_width} - 1;
 428
 429     # Special case for standalone HTML comments:
 430         $text =~ s{
 431                                 (?:
 432                                         (?<=\n\n)               # Starting after a blank line
 433                                         |                               # or
 434                                         \A\n?                   # the beginning of the doc
 435                                 )
 436                                 (                                               # save in $1
 437                                         [ ]{0,$less_than_tab}
 438                                         (?s:
 439                                                 <!
 440                                                 (--.*?--\s*)+
 441                                                 >
 442                                         )
 443                                         [ \t]*
 444                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 445                                 )
 446         }{
 447                 my $key = _md5_utf8($1);
 448                 $self->{_html_blocks}{$key} = $1;
 449                 "\n\n" . $key . "\n\n";
 450         }egx;
 451
 452         return $text;
 453 }
 454
 455 sub _HashPHPASPBlocks {
         #
         # Replaces each standalone <? ... ?> or <% ... %> block (possibly
         # spanning several lines; the closing delimiter must use the same
         # character as the opening one) with a placeholder key (the MD5 of
         # the matched text, padded with blank lines) and stores the original
         # in $self->{_html_blocks}{$key}. Returns the rewritten text.
         #
 456     my ($self, $text) = @_;
 457     my $less_than_tab = $self->{tab_width} - 1;
 458
 459     # PHP and ASP-style processor instructions (<?…?> and <%…%>)
 460         $text =~ s{
 461                                 (?:
 462                                         (?<=\n\n)               # Starting after a blank line
 463                                         |                               # or
 464                                         \A\n?                   # the beginning of the doc
 465                                 )
 466                                 (                                               # save in $1
 467                                         [ ]{0,$less_than_tab}
 468                                         (?s:
 469                                                 <([?%])                 # $2
 470                                                 .*?
 471                                                 \2>
 472                                         )
 473                                         [ \t]*
 474                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 475                                 )
 476                         }{
 477                                 my $key = _md5_utf8($1);
 478                                 $self->{_html_blocks}{$key} = $1;
 479                                 "\n\n" . $key . "\n\n";
 480                         }egx;
 481         return $text;
 482 }
 483
 sub _RunBlockGamut {
 #
 # Applies, in a fixed order, every transformation that produces
 # block-level markup such as paragraphs, headers, and list items.
 #
     my ($self, $text) = @_;

     # Headers come first, as these populate cross-refs.
     $text = $self->_DoHeaders($text);

     # Protect tables, unless Markdown is to be processed inside HTML blocks.
     if ( !$self->{markdown_in_html_blocks} ) {
         $text = $self->_HashHTMLBlocks($text);
     }

     # Horizontal rules: three or more '*', '-' or '_' alone on a line.
     my $less_than_tab = $self->{tab_width} - 1;
     $text =~ s{^[ ]{0,$less_than_tab}(\*[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx;
     $text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx;
     $text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx;

     for my $pass (qw(_DoLists _DoCodeBlocks _DoBlockQuotes)) {
         $text = $self->$pass($text);
     }

     # _HashHTMLBlocks() already ran on the raw HTML of the original source.
     # This run escapes the markup generated above, so that <p> tags do not
     # get wrapped around block-level tags.
     $text = $self->_HashHTMLBlocks($text);

     $text = $self->_FormParagraphs($text);

     return $text;
 }
 519
 sub _RunSpanGamut {
 #
 # Applies, in a fixed order, every transformation that happens *within*
 # block-level tags like paragraphs, headers, and list items.
 #
     my ($self, $text) = @_;

     # Order matters:
     #  - images before anchors, because ![foo][f] looks like an anchor;
     #  - auto-links (things like `<http://example.com/>`) after anchors,
     #    because < and > can delimit the URL of an inline link such as
     #    [this](<url>).
     my @passes = qw(
         _DoCodeSpans
         _EscapeSpecialCharsWithinTagAttributes
         _EscapeSpecialChars
         _DoImages
         _DoAnchors
         _DoAutoLinks
         _EncodeAmpsAndAngles
         _DoItalicsAndBold
     );
     for my $pass (@passes) {
         $text = $self->$pass($text);
     }

     # FIXME - Is hard coding space here sane, or does this want to be related to tab width?
     # Hard breaks: two or more spaces at the end of a line.
     $text =~ s/ {2,}\n/ <br$self->{empty_element_suffix}\n/g;

     return $text;
 }
 551
 sub _EscapeSpecialChars {
     # Rebuilds $text from its HTML tokens, escaping Markdown metacharacters
     # along the way.
     my ($self, $text) = @_;
     my $tokens = $self->_TokenizeHTML($text);

     my @rebuilt;
     for my $token (@$tokens) {
         if ($token->[0] ne "tag") {
             # Text between tags: only backslash escapes are encoded.
             my $plain = $token->[1];
             push @rebuilt, scalar $self->_EncodeBackslashEscapes($plain);
             next;
         }

         # Within tags, encode * and _ so they don't conflict with their use
         # in Markdown for italics and strong. Each such character is replaced
         # with its MD5 checksum; this is likely overkill, but it should
         # prevent accidental collisions with the escape values.
         for ($token->[1]) {
             s! \* !$g_escape_table{'*'}!ogx;
             s! _  !$g_escape_table{'_'}!ogx;
         }
         push @rebuilt, $token->[1];
     }

     return join '', @rebuilt;
 }
 579
 sub _EscapeSpecialCharsWithinTagAttributes {
 #
 # Within tags -- meaning between < and > -- encodes [\ ` * _] so they do
 # not conflict with their use in Markdown for code, italics and strong.
 # Each such character is replaced with its MD5 checksum value; this is
 # likely overkill, but it should prevent accidental collisions with the
 # escape values. Text outside tags passes through untouched.
 #
     my ($self, $text) = @_;
     my $tokens = $self->_TokenizeHTML($text);

     my @rebuilt;
     for my $token (@$tokens) {
         if ($token->[0] eq "tag") {
             for ($token->[1]) {
                 s! \\ !$g_escape_table{'\\'}!gox;
                 s{ (?<=.)</?code>(?=.)  }{$g_escape_table{'`'}}gox;
                 s! \* !$g_escape_table{'*'}!gox;
                 s! _  !$g_escape_table{'_'}!gox;
             }
         }
         push @rebuilt, $token->[1];
     }

     return join '', @rebuilt;
 }
 603
 sub _DoAnchors {
 #
 # Turn Markdown link shortcuts into XHTML <a> tags.
 #
 # Three forms are handled, in this order:
 #   1. reference-style links:      [link text] [id]
 #   2. inline-style links:         [link text](url "optional title")
 #   3. reference-style shortcuts:  [link text]
 # Every match is handed to _GenerateAnchor(), whose return value replaces
 # the match.
 #
     my ($self, $text) = @_;

     #
     # First, handle reference-style links: [link text] [id]
     #
     $text =~ s{
         (                   # wrap whole match in $1
           \[
             ($g_nested_brackets)    # link text = $2
           \]

           [ ]?              # one optional space
           (?:\n[ ]*)?       # one optional newline followed by spaces

           \[
             (.*?)       # id = $3
           \]
         )
     }{
         # Copy the captures before running any further regex.
         my ($whole_match, $link_text, $link_id) = ($1, $2, lc $3);

         if ($link_id eq "") {
             $link_id = lc $link_text;   # for shortcut links like [this][].
         }

         $link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces

         $self->_GenerateAnchor($whole_match, $link_text, $link_id);
     }xsge;

     #
     # Next, inline-style links: [link text](url "optional title")
     #
     $text =~ s{
         (               # wrap whole match in $1
           \[
             ($g_nested_brackets)    # link text = $2
           \]
           \(            # literal paren
             [ \t]*
             ($g_nested_parens)   # href = $3
             [ \t]*
             (           # $4
               (['"])    # quote char = $5
               (.*?)     # Title = $6
               \5        # matching quote
               [ \t]*    # ignore any spaces/tabs between closing quote and )
             )?          # title is optional
           \)
         )
     }{
         my ($whole_match, $link_text, $url, $title) = ($1, $2, $3, $6);

         # No link id for inline links: the URL is given directly.
         $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
     }xsge;

     #
     # Last, handle reference-style shortcuts: [link text]
     # These must come last in case you've also got [link test][1]
     # or [link test](/foo)
     #
     $text =~ s{
         (                   # wrap whole match in $1
           \[
             ([^\[\]]+)      # link text = $2; can't contain '[' or ']'
           \]
         )
     }{
         my ($whole_match, $link_text) = ($1, $2);

         # The id is the link text, lower-cased, with embedded newlines
         # turned into spaces.
         (my $link_id = lc $link_text) =~ s{[ ]*\n}{ }g;

         $self->_GenerateAnchor($whole_match, $link_text, $link_id);
     }xsge;

     return $text;
 }
 692
 sub _GenerateAnchor {
     # Builds the <a> tag for one link, or returns $whole_match untouched when
     # no URL is given and none is on record for $link_id.
     # FIXME - Fugly, change to named params?
     my ($self, $whole_match, $link_text, $link_id, $url, $title, $attributes) = @_;

     $attributes = '' if !defined $attributes;

     # Reference-style link: look the URL up by id.
     $url = $self->{_urls}{$link_id}
         if !defined $url && defined $self->{_urls}{$link_id};

     return $whole_match if !defined $url;

     for ($url) {
         s! \* !$g_escape_table{'*'}!gox;     # We've got to encode these to avoid
         s!  _ !$g_escape_table{'_'}!gox;     # conflicting with italics/bold.
         s{^<(.*)>$}{$1};                     # Remove <>'s surrounding URL, if present
     }

     # Fall back to the title recorded with the link definition.
     $title = $self->{_titles}{$link_id}
         if !defined $title && defined $link_id && defined $self->{_titles}{$link_id};

     my $title_attr = '';
     if ( defined $title ) {
         for ($title) {
             s/"/&quot;/g;
             s! \* !$g_escape_table{'*'}!gox;
             s!  _ !$g_escape_table{'_'}!gox;
         }
         $title_attr = qq{ title="$title"};
     }

     return qq{<a href="$url"} . $title_attr . "$attributes>$link_text</a>";
 }
 730
 sub _DoImages {
 #
 # Turn Markdown image shortcuts into <img> tags.
 #
 # Two forms are handled, in this order:
 #   1. reference-style images:  ![alt text][id]
 #   2. inline images:           ![alt text](url "optional title")
 # Every match is handed to _GenerateImage(), whose return value replaces
 # the match.
 #
     my ($self, $text) = @_;

     #
     # First, handle reference-style labeled images: ![alt text][id]
     #
     $text =~ s{
         (               # wrap whole match in $1
           !\[
             (.*?)       # alt text = $2
           \]

           [ ]?              # one optional space
           (?:\n[ ]*)?       # one optional newline followed by spaces

           \[
             (.*?)       # id = $3
           \]

         )
     }{
         my ($whole_match, $alt_text, $link_id) = ($1, $2, lc $3);

         if ($link_id eq '') {
             $link_id = lc $alt_text;     # for shortcut links like ![this][].
         }

         $self->_GenerateImage($whole_match, $alt_text, $link_id);
     }xsge;

     #
     # Next, handle inline images:  ![alt text](url "optional title")
     # Don't forget: encode * and _
     #
     $text =~ s{
         (               # wrap whole match in $1
           !\[
             (.*?)       # alt text = $2
           \]
           \(            # literal paren
             [ \t]*
             ($g_nested_parens)  # src url - href = $3
             [ \t]*
             (           # $4
               (['"])    # quote char = $5
               (.*?)     # title = $6
               \5        # matching quote
               [ \t]*
             )?          # title is optional
           \)
         )
     }{
         my ($whole_match, $alt_text, $url) = ($1, $2, $3);

         # A missing title is passed on as the empty string, not undef.
         my $title = defined $6 ? $6 : '';

         # No link id for inline images: the URL is given directly.
         $self->_GenerateImage($whole_match, $alt_text, undef, $url, $title);
     }xsge;

     return $text;
 }
 803
 sub _GenerateImage {
     # Builds the <img> tag for one image, or returns $whole_match untouched
     # when no URL is given and none is on record for $link_id.
     #
     #   $whole_match  the matched Markdown source, returned as is on failure
     #   $alt_text     text for the alt attribute
     #   $link_id      reference id (undef for inline images)
     #   $url, $title  explicit URL/title (looked up by $link_id when undef)
     #   $attributes   extra attribute text, appended verbatim
     #
     # FIXME - Fugly, change to named params?
     my ($self, $whole_match, $alt_text, $link_id, $url, $title, $attributes) = @_;

     $attributes = '' unless defined $attributes;

     # Only a *missing* alt text defaults to ''. `||=` would also wipe out the
     # legitimate alt text "0".
     $alt_text = '' unless defined $alt_text;
     $alt_text =~ s/"/&quot;/g;
     # FIXME - how about >

     # Reference-style image: look the URL up by id. Inline images pass an
     # undefined id, so check it before using it as a hash key.
     if ( !defined $url && defined $link_id && defined $self->{_urls}{$link_id} ) {
         $url = $self->{_urls}{$link_id};
     }

     # If there's no such link ID, leave intact:
     return $whole_match unless defined $url;

     $url =~ s! \* !$g_escape_table{'*'}!ogx;     # We've got to encode these to avoid
     $url =~ s!  _ !$g_escape_table{'_'}!ogx;     # conflicting with italics/bold.
     $url =~ s{^<(.*)>$}{$1};                     # Remove <>'s surrounding URL, if present

     # Fall back to a non-empty title recorded with the link definition.
     if (   !defined $title
         && defined $link_id
         && length $link_id
         && defined $self->{_titles}{$link_id}
         && length $self->{_titles}{$link_id} ) {
         $title = $self->{_titles}{$link_id};
     }

     my $result = qq{<img src="$url" alt="$alt_text"};
     if (defined $title && length $title) {
         $title =~ s! \* !$g_escape_table{'*'}!ogx;
         $title =~ s!  _ !$g_escape_table{'_'}!ogx;
         $title =~ s/"/&quot;/g;
         $result .= qq{ title="$title"};
     }
     $result .= $attributes . $self->{empty_element_suffix};

     return $result;
 }
 842
 sub _DoHeaders {
     # Converts Setext-style and atx-style headers in $text into <h1>..<h6>
     # elements via _GenerateHeader() and returns the rewritten text.
     my ($self, $text) = @_;

     # Setext-style headers:
     #     Header 1
     #     ========
     #
     #     Header 2
     #     --------
     #
     $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
         $self->_GenerateHeader('1', $1);
     }egmx;

     $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
         $self->_GenerateHeader('2', $1);
     }egmx;


     # atx-style headers:
     #   # Header 1
     #   ## Header 2
     #   ## Header 2 with closing hashes ##
     #   ...
     #   ###### Header 6
     #
     $text =~ s{
             ^(\#{1,6})  # $1 = string of #'s
             [ \t]*
             (.+?)       # $2 = Header text
             [ \t]*
             \#*         # optional closing #'s (not counted)
             \n+
         }{
             # The header level is the number of leading #'s.
             my $h_level = length($1);
             $self->_GenerateHeader($h_level, $2);
         }egmx;

     return $text;
 }
 884
 885 sub _GenerateHeader {
 886     my ($self, $level, $id) = @_;
 887
 888     return "<h$level>"  .  $self->_RunSpanGamut($id)  .  "</h$level>\n\n";
 889 }
 890
 891 sub _DoLists {
 892 #
 893 # Form HTML ordered (numbered) and unordered (bulleted) lists.
 894 #
 895     my ($self, $text) = @_;
 896     my $less_than_tab = $self->{tab_width} - 1;
 897
 898     # Re-usable patterns to match list item bullets and number markers:
 899     my $marker_ul  = qr/[*+-]/;
 900     my $marker_ol  = qr/\d+[.]/;
 901     my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
 902
 903     # Re-usable pattern to match any entirel ul or ol list:
 904     my $whole_list = qr{
 905         (                               # $1 = whole list
 906           (                             # $2
 907             [ ]{0,$less_than_tab}
 908             (${marker_any})             # $3 = first list item marker
 909             [ \t]+
 910           )
 911           (?s:.+?)
 912           (                             # $4
 913               \z
 914             |
 915               \n{2,}
 916               (?=\S)
 917               (?!                       # Negative lookahead for another list item marker
 918                 [ \t]*
 919                 ${marker_any}[ \t]+
 920               )
 921           )
 922         )
 923     }mx;
 924
 925     # We use a different prefix before nested lists than top-level lists.
 926     # See extended comment in _ProcessListItems().
 927     #
 928     # Note: There's a bit of duplication here. My original implementation
 929     # created a scalar regex pattern as the conditional result of the test on
 930     # $self->{_list_level}, and then only ran the $text =~ s{...}{...}egmx
 931     # substitution once, using the scalar as the pattern. This worked,
 932     # everywhere except when running under MT on my hosting account at Pair
 933     # Networks. There, this caused all rebuilds to be killed by the reaper (or
 934     # perhaps they crashed, but that seems incredibly unlikely given that the
 935     # same script on the same server ran fine *except* under MT. I've spent
 936     # more time trying to figure out why this is happening than I'd like to
 937     # admit. My only guess, backed up by the fact that this workaround works,
 938     # is that Perl optimizes the substition when it can figure out that the
 939     # pattern will never change, and when this optimization isn't on, we run
 940     # afoul of the reaper. Thus, the slightly redundant code to that uses two
 941     # static s/// patterns rather than one conditional pattern.
 942
 943     if ($self->{_list_level}) {
 944         $text =~ s{
 945                 ^
 946                 $whole_list
 947             }{
 948                 my $list = $1;
 949                 my $marker = $3;
 950                 my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
 951                 # Turn double returns into triple returns, so that we can make a
 952                 # paragraph for the last item in a list, if necessary:
 953                 $list =~ s/\n{2,}/\n\n\n/g;
 954                 my $result = ( $list_type eq 'ul' ) ?
 955                     $self->_ProcessListItemsUL($list, $marker_ul)
 956                   : $self->_ProcessListItemsOL($list, $marker_ol);
 957
 958                 $result = $self->_MakeList($list_type, $result, $marker);
 959                 $result;
 960             }egmx;
 961     }
 962     else {
 963         $text =~ s{
 964                 (?:(?<=\n\n)|\A\n?)
 965                 $whole_list
 966             }{
 967                 my $list = $1;
 968                 my $marker = $3;
 969                 my $list_type = ($marker =~ m/$marker_ul/) ? "ul" : "ol";
 970                 # Turn double returns into triple returns, so that we can make a
 971                 # paragraph for the last item in a list, if necessary:
 972                 $list =~ s/\n{2,}/\n\n\n/g;
 973                 my $result = ( $list_type eq 'ul' ) ?
 974                     $self->_ProcessListItemsUL($list, $marker_ul)
 975                   : $self->_ProcessListItemsOL($list, $marker_ol);
 976                 $result = $self->_MakeList($list_type, $result, $marker);
 977                 $result;
 978             }egmx;
 979     }
 980
 981
 982     return $text;
 983 }
 984
 985 sub _MakeList {
 986   my ($self, $list_type, $content, $marker) = @_;
 987
 988   if ($list_type eq 'ol' and $self->{trust_list_start_value}) {
 989     my ($num) = $marker =~ /^(\d+)[.]/;
 990     return "<ol start='$num'>\n" . $content . "</ol>\n";
 991   }
 992
 993   return "<$list_type>\n" . $content . "</$list_type>\n";
 994 }
 995
 996 sub _ProcessListItemsOL {
 997 #
 998 #   Process the contents of a single ordered list, splitting it
 999 #   into individual list items.
1000 #
1001
1002     my ($self, $list_str, $marker_any) = @_;
1003
1004
1005     # The $self->{_list_level} global keeps track of when we're inside a list.
1006     # Each time we enter a list, we increment it; when we leave a list,
1007     # we decrement. If it's zero, we're not in a list anymore.
1008     #
1009     # We do this because when we're not inside a list, we want to treat
1010     # something like this:
1011     #
1012     #       I recommend upgrading to version
1013     #       8. Oops, now this line is treated
1014     #       as a sub-list.
1015     #
1016     # As a single paragraph, despite the fact that the second line starts
1017     # with a digit-period-space sequence.
1018     #
1019     # Whereas when we're inside a list (or sub-list), that line will be
1020     # treated as the start of a sub-list. What a kludge, huh? This is
1021     # an aspect of Markdown's syntax that's hard to parse perfectly
1022     # without resorting to mind-reading. Perhaps the solution is to
1023     # change the syntax rules such that sub-lists must start with a
1024     # starting cardinal number; e.g. "1." or "a.".
1025
1026     $self->{_list_level}++;
1027
1028     # trim trailing blank lines:
1029     $list_str =~ s/\n{2,}\z/\n/;
1030
1031
1032     $list_str =~ s{
1033         (\n)?                           # leading line = $1
1034         (^[ \t]*)                       # leading whitespace = $2
1035         ($marker_any) [ \t]+            # list marker = $3
1036         ((?s:.+?)                       # list item text   = $4
1037         (\n{1,2}))
1038         (?= \n* (\z | \2 ($marker_any) [ \t]+))
1039     }{
1040         my $item = $4;
1041         my $leading_line = $1;
1042         my $leading_space = $2;
1043
1044         if ($leading_line or ($item =~ m/\n{2,}/)) {
1045             $item = $self->_RunBlockGamut($self->_Outdent($item));
1046         }
1047         else {
1048             # Recursion for sub-lists:
1049             $item = $self->_DoLists($self->_Outdent($item));
1050             chomp $item;
1051             $item = $self->_RunSpanGamut($item);
1052         }
1053
1054         "<li>" . $item . "</li>\n";
1055     }egmxo;
1056
1057     $self->{_list_level}--;
1058     return $list_str;
1059 }
1060
sub _ProcessListItemsUL {
#
#   Split the text of one unordered list into its items and render each
#   item as an <li> element. Returns the rendered items.
#
#   $marker_any is the pattern that recognises this list's item markers.
#   The substitution below is compiled once (/o).
#

    my ($self, $list_str, $marker_any) = @_;

    # $self->{_list_level} records how deeply we are nested inside lists:
    # it is raised on entry here and lowered again on exit, so zero means
    # "not inside a list".
    #
    # The distinction matters for text such as:
    #
    #       I recommend upgrading to version
    #       8. Oops, now this line is treated
    #       as a sub-list.
    #
    # Outside of a list that has to remain one paragraph even though its
    # second line begins with digit-period-space; inside a list (or
    # sub-list) the same line opens a sub-list. It is a kludge, but this
    # part of the Markdown syntax cannot be parsed perfectly without
    # guessing the author's intent.
    $self->{_list_level}++;

    # trim trailing blank lines:
    $list_str =~ s/\n{2,}\z/\n/;

    $list_str =~ s{
        (\n)?                           # leading line = $1
        (^[ \t]*)                       # leading whitespace = $2
        ($marker_any) [ \t]+            # list marker = $3
        ((?s:.+?)                       # list item text   = $4
        (\n{1,2}))
        (?= \n* (\z | \2 ($marker_any) [ \t]+))
    }{
        # Copy the captures before anything else can run a pattern match.
        my ($blank_before, $body) = ($1, $4);
        my $html;

        if ($blank_before or ($body =~ m/\n{2,}/)) {
            # Separated by blank lines: the item holds block-level content.
            $html = $self->_RunBlockGamut($self->_Outdent($body));
        }
        else {
            # Recursion for sub-lists:
            $html = $self->_DoLists($self->_Outdent($body));
            chomp $html;
            $html = $self->_RunSpanGamut($html);
        }

        "<li>" . $html . "</li>\n";
    }egmxo;

    $self->{_list_level}--;
    return $list_str;
}
1125
sub _DoCodeBlocks {
#
#   Turn indented Markdown code blocks into <pre><code> ... </code></pre>
#   and return the converted text.
#

    my ($self, $text) = @_;

    $text =~ s{
                (?:\n\n|\A)
                (                   # $1 = the code block -- one or more lines, starting with a space/tab
                  (?:
                    (?:[ ]{$self->{tab_width}} | \t)  # Lines must start with a tab or a tab-width of spaces
                    .*\n+
                  )+
                )
                ((?=^[ ]{0,$self->{tab_width}}\S)|\Z)   # Lookahead for non-space at line-start, or end of doc
        }{
            my $raw = $1;

            # Drop one level of indentation, escape the code, then expand
            # tabs, in that order.
            my $code = $self->_Detab( $self->_EncodeCode( $self->_Outdent($raw) ) );

            $code =~ s/\A\n+//; # trim leading newlines
            $code =~ s/\n+\z//; # trim trailing newlines

            "\n\n<pre><code>" . $code . "\n</code></pre>\n\n";
        }egmx;

    return $text;
}
1158
sub _DoCodeSpans {
#
#   Convert backtick-quoted runs into <code></code> spans.
#
#   *   The delimiter is a run of one or more backticks; the span ends at
#       the next run of exactly the same length. Using a longer delimiter
#       therefore allows literal backticks inside the span:
#
#         Just type ``foo `bar` baz`` at the prompt.
#
#       becomes
#
#         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
#
#       Any number of backticks may be used as the delimiter; to include
#       three consecutive backticks in the code, delimit with four, etc.
#
#   *   Spaces and tabs at the edges of the span are dropped, which makes
#       literal backticks at the edges possible:
#
#         ... type `` `bar` `` ...
#
#       becomes
#
#         ... type <code>`bar`</code> ...
#

    my ($self, $text) = @_;

    $text =~ s@
                        (?<!\\)         # Character before opening ` can't be a backslash
                        (`+)            # $1 = Opening run of `
                        (.+?)           # $2 = The code block
                        (?<!`)
                        \1                      # Matching closer
                        (?!`)
                @
                        (my $span = "$2") =~ s/^[ \t]*//g; # leading whitespace
                        $span =~ s/[ \t]*$//g;             # trailing whitespace
                        '<code>' . $self->_EncodeCode($span) . '</code>';
                @egsx;

    return $text;
}
1204
sub _EncodeCode {
#
# Escape the characters in a piece of code that would otherwise be treated
# as HTML or as Markdown syntax. Inside code they are plain literals.
# Takes the code text and returns the escaped copy.
#
    my ($self, $code) = @_;

    # Every ampersand is encoded: HTML entities are not
    # entities within a Markdown code span.
    $code =~ s/&/&amp;/g;

    # Dollar signs are encoded only when running under Blosxom.
    # (Blosxom interpolates Perl variables in article bodies.)
    {
        no warnings 'once';
        $code =~ s/\$/&#036;/g if defined($blosxom::version);
    }

    # Angle brackets become entities:
    $code =~ s! <  !&lt;!gx;
    $code =~ s! >  !&gt;!gx;

    # Finally, swap characters that are magic in Markdown for their
    # entries in the escape table:
    $code =~ s! \* !$g_escape_table{'*'}!ogx;
    $code =~ s! _  !$g_escape_table{'_'}!ogx;
    $code =~ s! {  !$g_escape_table{'{'}!ogx;
    $code =~ s! }  !$g_escape_table{'}'}!ogx;
    $code =~ s! \[ !$g_escape_table{'['}!ogx;
    $code =~ s! \] !$g_escape_table{']'}!ogx;
    $code =~ s! \\ !$g_escape_table{'\\'}!ogx;

    return $code;
}
1243
sub _DoItalicsAndBold {
#
#   Convert **strong** / __strong__ and *emphasis* / _emphasis_ markers in
#   $text into <strong> and <em> elements and return the result.
#
    my ($self, $text) = @_;

    # An opener at the very start of the text has no preceding character
    # for the look-behind used further down, so it gets its own patterns.
    $text =~ s{ ^(\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
        {<strong>$2</strong>}gsx;

    $text =~ s{ ^(\*|_) (?=\S) (.+?) (?<=\S) \1 }
        {<em>$2</em>}gsx;

    # Everywhere else the opener must follow a non-word character.
    my $strong_span = qr{ (?<=\W) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }sx;
    my $em_span     = qr{ (?<=\W) (\*|_) (?=\S) (.+?) (?<=\S) \1 }sx;

    # Two rounds: the second one catches nested strong and emphasis special
    # cases. Within a round <strong> must go first.
    for my $round (1 .. 2) {
        $text =~ s{$strong_span}{<strong>$2</strong>}g;
        $text =~ s{$em_span}{<em>$2</em>}g;
    }

    return $text;
}
1270
sub _DoBlockQuotes {
#
#   Convert runs of '>'-prefixed lines in $text into <blockquote> elements.
#   The quoted content is itself run through the block gamut, so nested
#   quotes and other block constructs inside a quote are processed too.
#
    my ($self, $text) = @_;

    $text =~ s{
          (                             # Wrap whole match in $1
            (
              ^[ \t]*>[ \t]?            # '>' at the start of a line
                .+\n                    # rest of the first line
              (.+\n)*                   # subsequent consecutive lines
              \n*                       # blanks
            )+
          )
        }{
            my $quoted = $1;
            $quoted =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
            $quoted =~ s/^[ \t]+$//mg;          # trim whitespace-only lines
            $quoted = $self->_RunBlockGamut($quoted);      # recurse

            # Indent the quote's content by two spaces ...
            $quoted =~ s/^/  /mg;
            # ... but take that indent back out of <pre> content, where
            # leading spaces are significant:
            $quoted =~ s{
                    (\s*<pre>.+?</pre>)
                }{
                    (my $pre = $1) =~ s/^  //mg;
                    $pre;
                }egsx;

            "<blockquote>\n$quoted\n</blockquote>\n\n";
        }egmx;

    return $text;
}
1305
sub _FormParagraphs {
#
#   Params:
#       $text - string to process with html <p> tags
#
#   Splits $text into chunks at blank lines. A chunk that is a key of
#   $self->{_html_blocks} is replaced by the stored HTML block; any other
#   chunk is span-processed and wrapped in <p>&nbsp;&nbsp; ... </p>.
#   The chunks are returned joined by blank lines.
#
    my ($self, $text) = @_;

    # Strip leading and trailing lines:
    $text =~ s/\A\n+//;
    $text =~ s/\n+\z//;

    my @grafs = split(/\n{2,}/, $text);

    # First pass: wrap everything that is not a hashed HTML block in <p>
    # tags (with the two leading non-breaking spaces of this patched
    # version, which also replace any leading spaces or tabs).
    for my $graf (@grafs) {
        next if defined( $self->{_html_blocks}{$graf} );

        $graf = $self->_RunSpanGamut($graf);
        $graf =~ s/^([ \t]*)/<p>&nbsp;&nbsp;/;
        $graf .= "</p>";
    }

    # Second pass: swap the hashed HTML blocks back in.
    my @output;
    for my $graf (@grafs) {
        my $html_block = $self->{_html_blocks}{$graf};
        push @output, ( defined($html_block) ? $html_block : $graf );
    }

    return join "\n\n", @output;
}
1341
sub _EncodeAmpsAndAngles {
# Encode the ampersands and left angle brackets in $text that are not
# already part of an entity or a tag. Returns '' for undefined or empty input.

    my ($self, $text) = @_;
    return '' unless defined $text and length $text;

    # Ampersands that do not start an entity. Based entirely on Nat Irons's
    # Amputator MT plugin:
    #   http://bumppo.net/projects/amputator/
    $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;

    # Naked <'s, i.e. ones that cannot be the start of a tag
    $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;

    # >'s are left alone: an equivalent rule for them (added by Fletcher
    # Penney) was disabled because it caused problems.

    # Undo both encodings inside comments
    $text =~ s{
        (?<=<!--) # Begin comment
        (.*?)     # Anything inside
        (?=-->)   # End comments
    }{
        (my $inside = $1) =~ s/&amp;/&/g;
        $inside =~ s/&lt;/</g;
        $inside;
    }egsx;

    return $text;
}
1373
sub _EncodeBackslashEscapes {
#
#   Parameter:  String.
#   Returns:    A copy of the string in which every recognised backslash
#               escape sequence has been replaced by the escape table
#               entry of the escaped character.
#
    my ($self, $text) = @_;

    $text =~ s! \\\\  !$g_escape_table{'\\'}!ogx;     # Must process escaped backslashes first.
    $text =~ s! \\`   !$g_escape_table{'`'}!ogx;
    $text =~ s! \\\*  !$g_escape_table{'*'}!ogx;
    $text =~ s! \\_   !$g_escape_table{'_'}!ogx;
    $text =~ s! \\\{  !$g_escape_table{'{'}!ogx;
    $text =~ s! \\\}  !$g_escape_table{'}'}!ogx;
    $text =~ s! \\\[  !$g_escape_table{'['}!ogx;
    $text =~ s! \\\]  !$g_escape_table{']'}!ogx;
    $text =~ s! \\\(  !$g_escape_table{'('}!ogx;
    $text =~ s! \\\)  !$g_escape_table{')'}!ogx;
    $text =~ s! \\>   !$g_escape_table{'>'}!ogx;
    $text =~ s! \\\#  !$g_escape_table{'#'}!ogx;
    $text =~ s! \\\+  !$g_escape_table{'+'}!ogx;
    $text =~ s! \\\-  !$g_escape_table{'-'}!ogx;
    $text =~ s! \\\.  !$g_escape_table{'.'}!ogx;
    $text =~ s{ \\!  }{$g_escape_table{'!'}}ogx;      # braces, since '!' is what is being matched

    return $text;
}
1402
sub _DoAutoLinks {
#
#   Turn <scheme:...> URLs and <address@domain> e-mail addresses written
#   in angle brackets into links, and return the converted text.
#
    my ($self, $text) = @_;

    # http, https and ftp URLs link to themselves.
    $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;

    # Email addresses: <address@domain.foo>
    my $bracketed_email = qr{
        <
        (?:mailto:)?
        (
            [-.\w\+]+
            \@
            [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
        )
        >
    }ix;

    $text =~ s{$bracketed_email}{
        # Restore hidden special characters before obfuscating the address.
        my $address = $self->_UnescapeSpecialChars($1);
        $self->_EncodeEmailAddress($address);
    }eg;

    return $text;
}
1424
sub _EncodeEmailAddress {
#
#   Input: an email address, e.g. "foo@example.com"
#
#   Output: the email address as a mailto link, with each character
#       of the address encoded as either a decimal or hex entity, in
#       the hopes of foiling most address harvesting spam bots. E.g.:
#
#     <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
#       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
#       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
#
#   The encoding is chosen at random per character, so the exact output
#   differs from call to call.
#
#   Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
#   mailing list: <http://tinyurl.com/yu7ue>
#

    my ($self, $addr) = @_;

    # Index 0: decimal entity, 1: hex entity, 2: character left as it is.
    my @encode = (
        sub { '&#' .                 ord(shift)   . ';' },
        sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
        sub {                            shift          },
    );

    $addr = "mailto:" . $addr;

    $addr =~ s{(.)}{
        my $char = $1;
        if ( $char eq '@' ) {
            # this *must* be encoded. I insist.
            # Choose between the decimal and the hex form. This has to be
            # "rand 2": "int rand 1" is always 0 and would never pick hex.
            $char = $encode[int rand 2]->($char);
        }
        elsif ( $char ne ':' ) {
            # leave ':' alone (to spot mailto: later)
            my $r = rand;
            # roughly 10% raw, 45% hex, 45% dec
            $char = (
                $r > .9   ?  $encode[2]->($char)  :
                $r < .45  ?  $encode[1]->($char)  :
                             $encode[0]->($char)
            );
        }
        $char;
    }gex;

    $addr = qq{<a href="$addr">$addr</a>};
    $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part

    return $addr;
}
1475
sub _UnescapeSpecialChars {
#
# Replace every escape-table value found in $text with the special
# character it stands for, and return the restored text.
#
    my ($self, $text) = @_;

    for my $char (keys %g_escape_table) {
        my $hash = $g_escape_table{$char};
        $text =~ s/$hash/$char/g;
    }

    return $text;
}
1487
sub _TokenizeHTML {
#
#   Parameter:  String containing HTML markup.
#   Returns:    Reference to an array of tokens, in input order. A token is
#               a two-element array: first the type, 'tag' or 'text', then
#               the matching piece of the input. A 'tag' is a comment, a
#               processing instruction or a tag (which may contain nested
#               tags, as in <a href="<MTFoo>">); 'text' is whatever lies
#               between tags.
#
#   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
#       <http://www.bradchoate.com/past/mtregex.php>
#

    my ($self, $str) = @_;

    # Tags may nest up to $depth levels deep.
    my $depth = 6;
    my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x  $depth);
    my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
                   (?s: <\? .*? \?> ) |              # processing instruction
                   $nested_tags/iox;                   # nested tags

    my @tokens;
    my $cursor = 0;     # offset just past the most recent tag

    while ($str =~ m/($match)/og) {
        my $tag       = $1;
        my $tag_end   = pos $str;
        my $tag_start = $tag_end - length $tag;

        # Anything between the previous tag and this one is text.
        push @tokens, ['text', substr($str, $cursor, $tag_start - $cursor)]
            if $tag_start > $cursor;

        push @tokens, ['tag', $tag];
        $cursor = $tag_end;
    }

    # Text trailing the last tag (or the whole input if there were no tags).
    push @tokens, ['text', substr($str, $cursor)] if $cursor < length $str;

    return \@tokens;
}
1527
sub _Outdent {
#
# Strip one level of indentation from the start of every line: either a
# single tab or between one and tab_width spaces.
#
    my $self = shift;
    my ($text) = @_;

    my $width = $self->{tab_width};
    $text =~ s/^(\t|[ ]{1,$width})//gm;

    return $text;
}
1537
sub _Detab {
#
# Expand tabs into spaces, padding each one out to the next multiple of
# tab_width columns.
#
# Cribbed from a post by Bart Lateur:
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
#
    my ($self, $text) = @_;

    # FIXME - Better anchor/regex would be quicker.

    # Original:
    #$text =~ s{(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}ge;

    # Much swifter, but pretty hateful: each round expands only the first
    # tab of every line, so repeat until a round finds no tab at all.
    1 while $text =~ s{^(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}mge;

    return $text;
}
1554
sub _ConvertCopyright {
    # Replace the named entity &copy; (in any letter case) with the numeric
    # entity &#xA9;, an XML compatible form of the copyright symbol.
    my $self = shift;

    (my $converted = shift) =~ s/&copy;/&#xA9;/gi;

    return $converted;
}
1563
1564 1;
1565
1566 __END__
1567
1568 =head1 OTHER IMPLEMENTATIONS
1569
1570 Markdown has been re-implemented in a number of languages, and with a number of additions.
1571
1572 Those that I have found are listed below:
1573
1574 =over
1575
1576 =item C - <http://www.pell.portland.or.us/~orc/Code/discount>
1577
Discount - Original Markdown, but in C. Fastest implementation available, and passes MDTest.
Adds its own set of custom features.
1580
1581 =item python - <http://www.freewisdom.org/projects/python-markdown/>
1582
1583 Python Markdown which is mostly compatible with the original, with an interesting extension API.
1584
1585 =item ruby (maruku) - <http://maruku.rubyforge.org/>
1586
1587 One of the nicest implementations out there. Builds a parse tree internally so very flexible.
1588
1589 =item php - <http://michelf.com/projects/php-markdown/>
1590
1591 A direct port of Markdown.pl, also has a separately maintained 'extra' version,
1592 which adds a number of features that were borrowed by MultiMarkdown.
1593
1594 =item lua - <http://www.frykholm.se/files/markdown.lua>
1595
1596 Port to lua. Simple and lightweight (as lua is).
1597
1598 =item haskell - <http://johnmacfarlane.net/pandoc/>
1599
1600 Pandoc is a more general library, supporting Markdown, reStructuredText, LaTeX and more.
1601
1602 =item javascript - <http://www.attacklab.net/showdown-gui.html>
1603
1604 Direct(ish) port of Markdown.pl to JavaScript
1605
1606 =back
1607
1608 =head1 BUGS
1609
1610 To file bug reports or feature requests please send email to:
1611
1612     bug-Text-Markdown@rt.cpan.org
1613
1614 Please include with your report: (1) the example input; (2) the output
1615 you expected; (3) the output Markdown actually produced.
1616
1617 =head1 VERSION HISTORY
1618
1619 See the Changes file for detailed release notes for this version.
1620
1621 =head1 AUTHOR
1622
1623     John Gruber
1624     http://daringfireball.net/
1625
1626     PHP port and other contributions by Michel Fortin
1627     http://michelf.com/
1628
1629     MultiMarkdown changes by Fletcher Penney
1630     http://fletcher.freeshell.org/
1631
1632     CPAN Module Text::MultiMarkdown (based on Text::Markdown by Sebastian
1633     Riedel) originally by Darren Kulp (http://kulp.ch/)
1634
1635     This module is maintained by: Tomas Doran http://www.bobtfish.net/
1636
1637 =head1 THIS DISTRIBUTION
1638
1639 Please note that this distribution is a fork of John Gruber's original Markdown project,
1640 and it *is not* in any way blessed by him.
1641
1642 Whilst this code aims to be compatible with the original Markdown.pl (and incorporates
1643 and passes the Markdown test suite) whilst fixing a number of bugs in the original -
1644 there may be differences between the behaviour of this module and Markdown.pl. If you find
1645 any differences where you believe Text::Markdown behaves contrary to the Markdown spec,
1646 please report them as bugs.
1647
1648 Text::Markdown *does not* extend the markdown dialect in any way from that which is documented at
1649 daringfireball. If you want additional features, you should look at L<Text::MultiMarkdown>.
1650
1651 =head1 COPYRIGHT AND LICENSE
1652
1653 Original Code Copyright (c) 2003-2004 John Gruber
1654 <http://daringfireball.net/>
1655 All rights reserved.
1656
1657 MultiMarkdown changes Copyright (c) 2005-2006 Fletcher T. Penney
1658 <http://fletcher.freeshell.org/>
1659 All rights reserved.
1660
1661 Text::MultiMarkdown changes Copyright (c) 2006-2008 Darren Kulp
1662 <http://kulp.ch> and Tomas Doran <http://www.bobtfish.net>
1663
1664 Redistribution and use in source and binary forms, with or without
1665 modification, are permitted provided that the following conditions are
1666 met:
1667
1668 * Redistributions of source code must retain the above copyright notice,
1669   this list of conditions and the following disclaimer.
1670
1671 * Redistributions in binary form must reproduce the above copyright
1672   notice, this list of conditions and the following disclaimer in the
1673   documentation and/or other materials provided with the distribution.
1674
1675 * Neither the name "Markdown" nor the names of its contributors may
1676   be used to endorse or promote products derived from this software
1677   without specific prior written permission.
1678
1679 This software is provided by the copyright holders and contributors "as
1680 is" and any express or implied warranties, including, but not limited
1681 to, the implied warranties of merchantability and fitness for a
1682 particular purpose are disclaimed. In no event shall the copyright owner
1683 or contributors be liable for any direct, indirect, incidental, special,
1684 exemplary, or consequential damages (including, but not limited to,
1685 procurement of substitute goods or services; loss of use, data, or
1686 profits; or business interruption) however caused and on any theory of
1687 liability, whether in contract, strict liability, or tort (including
1688 negligence or otherwise) arising in any way out of the use of this
1689 software, even if advised of the possibility of such damage.
1690
1691 =cut