lib/markdown.php

   1 <?php
   2
   3 #
   4 # Markdown  -  A text-to-HTML conversion tool for web writers
   5 #
   6 # Copyright (c) 2004 John Gruber
   7 # <http://daringfireball.net/projects/markdown/>
   8 #
   9 # Copyright (c) 2004 Michel Fortin - Translation to PHP
  10 # <http://www.michelf.com/projects/php-markdown/>
  11 #
  12
  13
  14
# Bind the version strings, settings and lookup tables used throughout this
# file to the global scope.
# NOTE(review): at file scope this is a no-op; presumably it is here so the
# assignments below still create true globals when the file is include()d
# from inside a function -- confirm against the callers.
global  $MarkdownPHPVersion, $MarkdownSyntaxVersion,
        $md_empty_element_suffix, $md_tab_width,
        $md_nested_brackets_depth, $md_nested_brackets,
        $md_escape_table, $md_backslash_escape_table;


# Version strings. $MarkdownPHPVersion is the value reported by
# identify_modifier_markdown().
$MarkdownPHPVersion    = '1.0'; # Sat 21 Aug 2004
$MarkdownSyntaxVersion = '1.0'; # Fri 20 Aug 2004


#
# Global default settings:
#
# Text appended to close the empty elements this file generates
# (<hr>, <br>, <img>).
$md_empty_element_suffix = " />";     # Change to ">" for HTML output
# Tab width in columns; the list and code-block patterns derive their
# indentation limits from it.
$md_tab_width = 4;
  30
  31
  32 # -- WordPress Plugin Interface -----------------------------------------------
  33 /*
  34 Plugin Name: Markdown
  35 Plugin URI: http://www.michelf.com/projects/php-markdown/
  36 Description: <a href="http://daringfireball.net/projects/markdown/syntax">Markdown syntax</a> allows you to write using an easy-to-read, easy-to-write plain text format. Based on the original Perl version by <a href="http://daringfireball.net/">John Gruber</a>. <a href="http://www.michelf.com/projects/php-markdown/">More...</a>
  37 Version: 1.0
  38 Author: Michel Fortin
  39 Author URI: http://www.michelf.com/
  40 */
# Register the filters only when $wp_version is set.
# NOTE(review): presumably WordPress defines $wp_version before loading
# plugins, making this a "running inside WordPress" test -- confirm.
if (isset($wp_version)) {
    # Remove default WordPress auto-paragraph filter from the three hooks
    # that Markdown takes over below.
    remove_filter('the_content', 'wpautop');
    remove_filter('the_excerpt', 'wpautop');
    remove_filter('comment_text', 'wpautop');
    # Add Markdown filter with priority 6 (same as Textile).
    add_filter('the_content', 'Markdown', 6);
    add_filter('the_excerpt', 'Markdown', 6);
    add_filter('comment_text', 'Markdown', 6);
}
  51
  52 # -- bBlog Plugin Info --------------------------------------------------------
  53 function identify_modifier_markdown() {
  54         global $MarkdownPHPVersion;
  55         return array(
  56                 'name'                  => 'markdown',
  57                 'type'                  => 'modifier',
  58                 'nicename'              => 'Markdown',
  59                 'description'   => 'A text-to-HTML conversion tool for web writers',
  60                 'authors'               => 'Michel Fortin and John Gruber',
  61                 'licence'               => 'GPL',
  62                 'version'               => $MarkdownPHPVersion,
  63                 'help'                  => '<a href="http://daringfireball.net/projects/markdown/syntax">Markdown syntax</a> allows you to write using an easy-to-read, easy-to-write plain text format. Based on the original Perl version by <a href="http://daringfireball.net/">John Gruber</a>. <a href="http://www.michelf.com/projects/php-markdown/">More...</a>'
  64         );
  65 }
  66
  67 # -- Smarty Modifier Interface ------------------------------------------------
  68 function smarty_modifier_markdown($text) {
  69         return Markdown($text);
  70 }
  71
  72 # -- Textile Compatibility Mode -----------------------------------------------
  73 # Rename this file to "classTextile.php" and it can replace Textile anywhere.
  74 if (strcasecmp(substr(__FILE__, -16), "classTextile.php") == 0) {
  75         # Try to include PHP SmartyPants. Should be in the same directory.
  76         @include_once 'smartypants.php';
  77         # Fake Textile class. It calls Markdown instead.
  78         class Textile {
  79                 function TextileThis($text, $lite='', $encode='', $noimage='', $strict='') {
  80                         if ($lite == '' && $encode == '')   $text = Markdown($text);
  81                         if (function_exists('SmartyPants')) $text = SmartyPants($text);
  82                         return $text;
  83                 }
  84         }
  85 }
  86
  87
  88
  89 #
  90 # Globals:
  91 #
  92
  93 # Regex to match balanced [brackets].
  94 # Needed to insert a maximum bracked depth while converting to PHP.
  95 $md_nested_brackets_depth = 6;
  96 $md_nested_brackets =
  97         str_repeat('(?>[^\[\]]+|\[', $md_nested_brackets_depth).
  98         str_repeat('\])*', $md_nested_brackets_depth);
  99
 100 # Table of hash values for escaped characters:
 101 $md_escape_table = array(
 102         "\\" => md5("\\"),
 103         "`" => md5("`"),
 104         "*" => md5("*"),
 105         "_" => md5("_"),
 106         "{" => md5("{"),
 107         "}" => md5("}"),
 108         "[" => md5("["),
 109         "]" => md5("]"),
 110         "(" => md5("("),
 111         ")" => md5(")"),
 112         "#" => md5("#"),
 113         "." => md5("."),
 114         "!" => md5("!")
 115 );
 116 # Create an identical table but for escaped characters.
 117 $md_backslash_escape_table;
 118 foreach ($md_escape_table as $key => $char)
 119         $md_backslash_escape_table["\\$key"] = $char;
 120
 121
 122 function Markdown($text) {
 123 #
 124 # Main function. The order in which other subs are called here is
 125 # essential. Link and image substitutions need to happen before
 126 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
 127 # and <img> tags get encoded.
 128 #
 129         # Clear the global hashes. If we don't clear these, you get conflicts
 130         # from other articles when generating a page which contains more than
 131         # one article (e.g. an index page that shows the N most recent
 132         # articles):
 133         global $md_urls, $md_titles, $md_html_blocks;
 134         $md_urls = array();
 135         $md_titles = array();
 136         $md_html_blocks = array();
 137
 138         # Standardize line endings:
 139         #   DOS to Unix and Mac to Unix
 140         $text = str_replace(array("\r\n", "\r"), "\n", $text);
 141
 142         # Make sure $text ends with a couple of newlines:
 143         $text .= "\n\n";
 144
 145         # Convert all tabs to spaces.
 146         $text = _Detab($text);
 147
 148         # Strip any lines consisting only of spaces and tabs.
 149         # This makes subsequent regexen easier to write, because we can
 150         # match consecutive blank lines with /\n+/ instead of something
 151         # contorted like /[ \t]*\n+/ .
 152         $text = preg_replace('/^[ \t]+$/m', '', $text);
 153
 154         # Turn block-level HTML blocks into hash entries
 155         $text = _HashHTMLBlocks($text);
 156
 157         # Strip link definitions, store in hashes.
 158         $text = _StripLinkDefinitions($text);
 159
 160         # _EscapeSpecialChars() must be called very early, to get
 161         # backslash escapes processed.
 162         $text = _EscapeSpecialChars($text);
 163
 164         $text = _RunBlockGamut($text);
 165
 166         $text = _UnescapeSpecialChars($text);
 167
 168         return $text . "\n";
 169 }
 170
 171
 172 function _StripLinkDefinitions($text) {
 173 #
 174 # Strips link definitions from text, stores the URLs and titles in
 175 # hash references.
 176 #
 177         # Link defs are in the form: ^[id]: url "optional title"
 178         $text = preg_replace_callback('{
 179                                                 ^[ \t]*\[(.+)\]:        # id = $1
 180                                                   [ \t]*
 181                                                   \n?                           # maybe *one* newline
 182                                                   [ \t]*
 183                                                 <?(\S+?)>?                      # url = $2
 184                                                   [ \t]*
 185                                                   \n?                           # maybe one newline
 186                                                   [ \t]*
 187                                                 (?:
 188                                                         # Todo: Titles are delimited by "quotes" or (parens).
 189                                                         ["(]
 190                                                         (.+?)                   # title = $3
 191                                                         [")]
 192                                                         [ \t]*
 193                                                 )?      # title is optional
 194                                                 (?:\n+|\Z)
 195                 }xm',
 196                 '_StripLinkDefinitions_callback',
 197                 $text);
 198         return $text;
 199 }
 200 function _StripLinkDefinitions_callback($matches) {
 201         global $md_urls, $md_titles;
 202         $link_id = strtolower($matches[1]);
 203         $md_urls[$link_id] = _EncodeAmpsAndAngles($matches[2]);
 204         if (isset($matches[3]))
 205                 $md_titles[$link_id] = htmlentities($matches[3]);
 206         return ''; # String that will replace the block
 207 }
 208
 209
 210 function _HashHTMLBlocks($text) {
 211         # Hashify HTML blocks:
 212         # We only want to do this for block-level HTML tags, such as headers,
 213         # lists, and tables. That's because we still want to wrap <p>s around
 214         # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
 215         # phrase emphasis, and spans. The list of tags we're looking for is
 216         # hard-coded:
 217         $block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'.
 218                                         'script|noscript|form|fieldset|iframe|math|ins|del';
 219         $block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'.
 220                                         'script|noscript|form|fieldset|iframe|math';
 221
 222         # First, look for nested blocks, e.g.:
 223         #       <div>
 224         #               <div>
 225         #               tags for inner block must be indented.
 226         #               </div>
 227         #       </div>
 228         #
 229         # The outermost tags must start at the left margin for this to match, and
 230         # the inner nested divs must be indented.
 231         # We need to do this before the next, more liberal match, because the next
 232         # match will start at the first `<div>` and stop at the first `</div>`.
 233         $text = preg_replace_callback("{
 234                                 (                                               # save in $1
 235                                         ^                                       # start of line  (with /m)
 236                                         <($block_tags_a)        # start tag = $2
 237                                         \\b                                     # word break
 238                                         (.*\\n)*?                       # any number of lines, minimally matching
 239                                         </\\2>                          # the matching end tag
 240                                         [ \\t]*                         # trailing spaces/tabs
 241                                         (?=\\n+|\\Z)    # followed by a newline or end of document
 242                                 )
 243                 }xm",
 244                 '_HashHTMLBlocks_callback',
 245                 $text);
 246
 247         #
 248         # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
 249         #
 250         $text = preg_replace_callback("{
 251                                 (                                               # save in $1
 252                                         ^                                       # start of line  (with /m)
 253                                         <($block_tags_b)        # start tag = $2
 254                                         \\b                                     # word break
 255                                         (.*\\n)*?                       # any number of lines, minimally matching
 256                                         .*</\\2>                                # the matching end tag
 257                                         [ \\t]*                         # trailing spaces/tabs
 258                                         (?=\\n+|\\Z)    # followed by a newline or end of document
 259                                 )
 260                 }xm",
 261                 '_HashHTMLBlocks_callback',
 262                 $text);
 263
 264         # Special case just for <hr />. It was easier to make a special case than
 265         # to make the other regex more complicated.
 266         $text = preg_replace_callback('{
 267                                 (?:
 268                                         (?<=\n\n)               # Starting after a blank line
 269                                         |                               # or
 270                                         \A\n?                   # the beginning of the doc
 271                                 )
 272                                 (                                               # save in $1
 273                                         [ \t]*
 274                                         <(hr)                           # start tag = $2
 275                                         \b                                      # word break
 276                                         ([^<>])*?                       #
 277                                         /?>                                     # the matching end tag
 278                                         (?=\n{2,}|\Z)           # followed by a blank line or end of document
 279                                 )
 280                 }x',
 281                 '_HashHTMLBlocks_callback',
 282                 $text);
 283
 284         return $text;
 285 }
 286 function _HashHTMLBlocks_callback($matches) {
 287         global $md_html_blocks;
 288         $text = $matches[1];
 289         $key = md5($text);
 290         $md_html_blocks[$key] = $text;
 291         return "\n\n$key\n\n"; # String that will replace the block
 292 }
 293
 294
 295 function _RunBlockGamut($text) {
 296 #
 297 # These are all the transformations that form block-level
 298 # tags like paragraphs, headers, and list items.
 299 #
 300         global $md_empty_element_suffix;
 301
 302         $text = _DoHeaders($text);
 303
 304         # Do Horizontal Rules:
 305         $text = preg_replace(
 306                 array('/^( ?\* ?){3,}$/m',
 307                           '/^( ?- ?){3,}$/m',
 308                           '/^( ?_ ?){3,}$/m'),
 309                 "\n<hr$md_empty_element_suffix\n",
 310                 $text);
 311
 312         $text = _DoLists($text);
 313
 314         $text = _DoCodeBlocks($text);
 315
 316         $text = _DoBlockQuotes($text);
 317
 318         # Make links out of things like `<http://example.com/>`
 319         $text = _DoAutoLinks($text);
 320
 321         # We already ran _HashHTMLBlocks() before, in Markdown(), but that
 322         # was to escape raw HTML in the original Markdown source. This time,
 323         # we're escaping the markup we've just created, so that we don't wrap
 324         # <p> tags around block-level tags.
 325         $text = _HashHTMLBlocks($text);
 326
 327         $text = _FormParagraphs($text);
 328
 329         return $text;
 330 }
 331
 332
 333 function _RunSpanGamut($text) {
 334 #
 335 # These are all the transformations that occur *within* block-level
 336 # tags like paragraphs, headers, and list items.
 337 #
 338         global $md_empty_element_suffix;
 339         $text = _DoCodeSpans($text);
 340
 341         # Fix unencoded ampersands and <'s:
 342         $text = _EncodeAmpsAndAngles($text);
 343
 344         # Process anchor and image tags. Images must come first,
 345         # because ![foo][f] looks like an anchor.
 346         $text = _DoImages($text);
 347         $text = _DoAnchors($text);
 348
 349
 350         $text = _DoItalicsAndBold($text);
 351
 352         # Do hard breaks:
 353         $text = preg_replace('/ {2,}\n/', "<br$md_empty_element_suffix\n", $text);
 354
 355         return $text;
 356 }
 357
 358
 359 function _EscapeSpecialChars($text) {
 360         global $md_escape_table;
 361         $tokens = _TokenizeHTML($text);
 362
 363         $text = '';   # rebuild $text from the tokens
 364 #       $in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.
 365 #       $tags_to_skip = "!<(/?)(?:pre|code|kbd|script|math)[\s>]!";
 366
 367         foreach ($tokens as $cur_token) {
 368                 if ($cur_token[0] == 'tag') {
 369                         # Within tags, encode * and _ so they don't conflict
 370                         # with their use in Markdown for italics and strong.
 371                         # We're replacing each such character with its
 372                         # corresponding MD5 checksum value; this is likely
 373                         # overkill, but it should prevent us from colliding
 374                         # with the escape values by accident.
 375                         $cur_token[1] = str_replace(array('*', '_'),
 376                                 array($md_escape_table['*'], $md_escape_table['_']),
 377                                 $cur_token[1]);
 378                         $text .= $cur_token[1];
 379                 } else {
 380                         $t = $cur_token[1];
 381                         $t = _EncodeBackslashEscapes($t);
 382                         $text .= $t;
 383                 }
 384         }
 385         return $text;
 386 }
 387
 388
 389 function _DoAnchors($text) {
 390 #
 391 # Turn Markdown link shortcuts into XHTML <a> tags.
 392 #
 393         global $md_nested_brackets;
 394         #
 395         # First, handle reference-style links: [link text] [id]
 396         #
 397         $text = preg_replace_callback("{
 398                 (                                       # wrap whole match in $1
 399                   \\[
 400                         ($md_nested_brackets)   # link text = $2
 401                   \\]
 402
 403                   [ ]?                          # one optional space
 404                   (?:\\n[ ]*)?          # one optional newline followed by spaces
 405
 406                   \\[
 407                         (.*?)           # id = $3
 408                   \\]
 409                 )
 410                 }xs",
 411                 '_DoAnchors_reference_callback', $text);
 412
 413         #
 414         # Next, inline-style links: [link text](url "optional title")
 415         #
 416         $text = preg_replace_callback("{
 417                 (                               # wrap whole match in $1
 418                   \\[
 419                         ($md_nested_brackets)   # link text = $2
 420                   \\]
 421                   \\(                   # literal paren
 422                         [ \\t]*
 423                         <?(.+?)>?       # href = $3
 424                         [ \\t]*
 425                         (                       # $4
 426                           (['\"])       # quote char = $5
 427                           (.*?)         # Title = $6
 428                           \\5           # matching quote
 429                         )?                      # title is optional
 430                   \\)
 431                 )
 432                 }xs",
 433                 '_DoAnchors_inline_callback', $text);
 434
 435         return $text;
 436 }
 437 function _DoAnchors_reference_callback($matches) {
 438         global $md_urls, $md_titles, $md_escape_table;
 439         $whole_match = $matches[1];
 440         $link_text   = $matches[2];
 441         $link_id     = strtolower($matches[3]);
 442
 443         if ($link_id == "") {
 444                 $link_id = strtolower($link_text); # for shortcut links like [this][].
 445         }
 446
 447         if (isset($md_urls[$link_id])) {
 448                 $url = $md_urls[$link_id];
 449                 # We've got to encode these to avoid conflicting with italics/bold.
 450                 $url = str_replace(array('*', '_'),
 451                                                    array($md_escape_table['*'], $md_escape_table['_']),
 452                                                    $url);
 453                 $result = "<a href=\"$url\"";
 454                 if ( isset( $md_titles[$link_id] ) ) {
 455                         $title = $md_titles[$link_id];
 456                         $title = str_replace(array('*',     '_'),
 457                                                                  array($md_escape_table['*'],
 458                                                                            $md_escape_table['_']), $title);
 459                         $result .=  " title=\"$title\"";
 460                 }
 461                 $result .= ">$link_text</a>";
 462         }
 463         else {
 464                 $result = $whole_match;
 465         }
 466         return $result;
 467 }
 468 function _DoAnchors_inline_callback($matches) {
 469         global $md_escape_table;
 470         $whole_match = $matches[1];
 471         $link_text   = $matches[2];
 472         $url                    = $matches[3];
 473         $title          = $matches[6];
 474
 475         # We've got to encode these to avoid conflicting with italics/bold.
 476         $url = str_replace(array('*', '_'),
 477                                            array($md_escape_table['*'], $md_escape_table['_']),
 478                                            $url);
 479         $result = "<a href=\"$url\"";
 480         if (isset($title)) {
 481                 $title = str_replace('"', '&quot', $title);
 482                 $title = str_replace(array('*', '_'),
 483                                                          array($md_escape_table['*'], $md_escape_table['_']),
 484                                                          $title);
 485                 $result .=  " title=\"$title\"";
 486         }
 487
 488         $result .= ">$link_text</a>";
 489
 490         return $result;
 491 }
 492
 493
 494 function _DoImages($text) {
 495 #
 496 # Turn Markdown image shortcuts into <img> tags.
 497 #
 498         #
 499         # First, handle reference-style labeled images: ![alt text][id]
 500         #
 501         $text = preg_replace_callback('{
 502                 (                               # wrap whole match in $1
 503                   !\[
 504                         (.*?)           # alt text = $2
 505                   \]
 506
 507                   [ ]?                          # one optional space
 508                   (?:\n[ ]*)?           # one optional newline followed by spaces
 509
 510                   \[
 511                         (.*?)           # id = $3
 512                   \]
 513
 514                 )
 515                 }xs',
 516                 '_DoImages_reference_callback', $text);
 517
 518         #
 519         # Next, handle inline images:  ![alt text](url "optional title")
 520         # Don't forget: encode * and _
 521
 522         $text = preg_replace_callback("{
 523                 (                               # wrap whole match in $1
 524                   !\\[
 525                         (.*?)           # alt text = $2
 526                   \\]
 527                   \\(                   # literal paren
 528                         [ \\t]*
 529                         <?(\S+?)>?      # src url = $3
 530                         [ \\t]*
 531                         (                       # $4
 532                           (['\"])       # quote char = $5
 533                           (.*?)         # title = $6
 534                           \\5           # matching quote
 535                           [ \\t]*
 536                         )?                      # title is optional
 537                   \\)
 538                 )
 539                 }xs",
 540                 '_DoImages_inline_callback', $text);
 541
 542         return $text;
 543 }
 544 function _DoImages_reference_callback($matches) {
 545         global $md_urls, $md_titles, $md_empty_element_suffix, $md_escape_table;
 546         $whole_match = $matches[1];
 547         $alt_text    = $matches[2];
 548         $link_id     = strtolower($matches[3]);
 549
 550         if ($link_id == "") {
 551                 $link_id = strtolower($alt_text); # for shortcut links like ![this][].
 552         }
 553
 554         $alt_text = str_replace('"', '&quot;', $alt_text);
 555         if (isset($md_urls[$link_id])) {
 556                 $url = $md_urls[$link_id];
 557                 # We've got to encode these to avoid conflicting with italics/bold.
 558                 $url = str_replace(array('*', '_'),
 559                                                    array($md_escape_table['*'], $md_escape_table['_']),
 560                                                    $url);
 561                 $result = "<img src=\"$url\" alt=\"$alt_text\"";
 562                 if (isset($md_titles[$link_id])) {
 563                         $title = $md_titles[$link_id];
 564                         $title = str_replace(array('*', '_'),
 565                                                                  array($md_escape_table['*'],
 566                                                                            $md_escape_table['_']), $title);
 567                         $result .=  " title=\"$title\"";
 568                 }
 569                 $result .= $md_empty_element_suffix;
 570         }
 571         else {
 572                 # If there's no such link ID, leave intact:
 573                 $result = $whole_match;
 574         }
 575
 576         return $result;
 577 }
 578 function _DoImages_inline_callback($matches) {
 579         global $md_empty_element_suffix, $md_escape_table;
 580         $whole_match = $matches[1];
 581         $alt_text    = $matches[2];
 582         $url                    = $matches[3];
 583         $title          = '';
 584         if (isset($matches[6])) {
 585                 $title = $matches[6];
 586         }
 587
 588         $alt_text = str_replace('"', '&quot;', $alt_text);
 589         $title    = str_replace('"', '&quot;', $title);
 590         # We've got to encode these to avoid conflicting with italics/bold.
 591         $url = str_replace(array('*', '_'),
 592                                            array($md_escape_table['*'], $md_escape_table['_']),
 593                                            $url);
 594         $result = "<img src=\"$url\" alt=\"$alt_text\"";
 595         if (isset($title)) {
 596                 $title = str_replace(array('*', '_'),
 597                                                          array($md_escape_table['*'], $md_escape_table['_']),
 598                                                          $title);
 599                 $result .=  " title=\"$title\""; # $title already quoted
 600         }
 601         $result .= $md_empty_element_suffix;
 602
 603         return $result;
 604 }
 605
 606
 607 function _DoHeaders($text) {
 608         # Setext-style headers:
 609         #         Header 1
 610         #         ========
 611         #
 612         #         Header 2
 613         #         --------
 614         #
 615         $text = preg_replace(
 616                 array("/(.+)[ \t]*\n=+[ \t]*\n+/e",
 617                           "/(.+)[ \t]*\n-+[ \t]*\n+/e"),
 618                 array("'<h1>'._RunSpanGamut(_UnslashQuotes('\\1')).'</h1>\n\n'",
 619                           "'<h2>'._RunSpanGamut(_UnslashQuotes('\\1')).'</h2>\n\n'"),
 620                 $text);
 621
 622         # atx-style headers:
 623         #       # Header 1
 624         #       ## Header 2
 625         #       ## Header 2 with closing hashes ##
 626         #       ...
 627         #       ###### Header 6
 628         #
 629         $text = preg_replace("{
 630                         ^(\\#{1,6})     # $1 = string of #'s
 631                         [ \\t]*
 632                         (.+?)           # $2 = Header text
 633                         [ \\t]*
 634                         \\#*                    # optional closing #'s (not counted)
 635                         \\n+
 636                 }xme",
 637                 "'<h'.strlen('\\1').'>'._RunSpanGamut(_UnslashQuotes('\\2')).'</h'.strlen('\\1').'>\n\n'",
 638                 $text);
 639
 640         return $text;
 641 }
 642
 643
 644 function _DoLists($text) {
 645 #
 646 # Form HTML ordered (numbered) and unordered (bulleted) lists.
 647 #
 648         global $md_tab_width;
 649         $less_than_tab = $md_tab_width - 1;
 650
 651         # Re-usable patterns to match list item bullets and number markers:
 652         $marker_ul  = '[*+-]';
 653         $marker_ol  = '\d+[.]';
 654         $marker_any = "(?:$marker_ul|$marker_ol)";
 655
 656         $text = preg_replace_callback("{
 657                         (                                                               # $1
 658                           (                                                             # $2
 659                                 ^[ ]{0,$less_than_tab}
 660                             ($marker_any)                               # $3 - first list item marker
 661                                 [ \\t]+
 662                           )
 663                           (?s:.+?)
 664                           (                                                             # $4
 665                                   \\z
 666                                 |
 667                                   \\n{2,}
 668                                   (?=\\S)
 669                                   (?!                                           # Negative lookahead for another list item marker
 670                                         [ \\t]*
 671                                         {$marker_any}[ \\t]+
 672                                   )
 673                           )
 674                         )
 675                 }xm",
 676                 '_DoLists_callback', $text);
 677
 678         return $text;
 679 }
 680 function _DoLists_callback($matches) {
 681         # Re-usable patterns to match list item bullets and number markers:
 682         $marker_ul  = '[*+-]';
 683         $marker_ol  = '\d+[.]';
 684         $marker_any = "(?:$marker_ul|$marker_ol)";
 685
 686         $list = $matches[1];
 687         $list_type = preg_match('/[*+-]/', $matches[3]) ? "ul" : "ol";
 688         # Turn double returns into triple returns, so that we can make a
 689         # paragraph for the last item in a list, if necessary:
 690         $list = preg_replace("/\n{2,}/", "\n\n\n", $list);
 691         $result = _ProcessListItems($list, $marker_any);
 692         $result = "<$list_type>\n" . $result . "</$list_type>\n\n";
 693         return $result;
 694 }
 695
 696
 697 function _ProcessListItems($list_str, $marker_any) {
 698         # trim trailing blank lines:
 699         $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);
 700
 701         $list_str = preg_replace_callback('{
 702                 (\n)?                                                   # leading line = $1
 703                 (^[ \t]*)                                               # leading whitespace = $2
 704                 ('.$marker_any.') [ \t]+                # list marker = $3
 705                 ((?s:.+?)                                               # list item text   = $4
 706                 (\n{1,2}))
 707                 (?= \n* (\z | \2 ('.$marker_any.') [ \t]+))
 708                 }xm',
 709                 '_ProcessListItems_callback', $list_str);
 710
 711         return $list_str;
 712 }
 713 function _ProcessListItems_callback($matches) {
 714         $item = $matches[4];
 715         $leading_line = $matches[1];
 716         $leading_space = $matches[2];
 717
 718         if ($leading_line || preg_match('/\n{2,}/', $item)) {
 719                 $item = _RunBlockGamut(_Outdent($item));
 720                 #$item =~ s/\n+/\n/g;
 721         }
 722         else {
 723                 # Recursion for sub-lists:
 724                 $item = _DoLists(_Outdent($item));
 725                 $item = rtrim($item, "\n");
 726                 $item = _RunSpanGamut($item);
 727         }
 728
 729         return "<li>" . $item . "</li>\n";
 730 }
 731
 732
 733 function _DoCodeBlocks($text) {
 734 #
 735 #       Process Markdown `<pre><code>` blocks.
 736 #
 737         global $md_tab_width;
 738         $text = preg_replace_callback("{
 739                         (?:\\n\\n|\\A)
 740                         (                   # $1 = the code block -- one or more lines, starting with a space/tab
 741                           (?:
 742                                 (?:[ ]\{$md_tab_width} | \\t)  # Lines must start with a tab or a tab-width of spaces
 743                                 .*\\n+
 744                           )+
 745                         )
 746                         ((?=^[ ]{0,$md_tab_width}\\S)|\\Z)      # Lookahead for non-space at line-start, or end of doc
 747                 }xm",
 748                 '_DoCodeBlocks_callback', $text);
 749
 750         return $text;
 751 }
 752 function _DoCodeBlocks_callback($matches) {
 753         $codeblock = $matches[1];
 754
 755         $codeblock = _EncodeCode(_Outdent($codeblock));
 756         $codeblock = _Detab($codeblock);
 757         # trim leading newlines and trailing whitespace
 758         $codeblock = preg_replace(array('/\A\n+/', '/\s+\z/'), '', $codeblock);
 759
 760         $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
 761
 762         return $result;
 763 }
 764
 765
 766 function _DoCodeSpans($text) {
 767 #
 768 #       *       Backtick quotes are used for <code></code> spans.
 769 #
 770 #       *       You can use multiple backticks as the delimiters if you want to
 771 #               include literal backticks in the code span. So, this input:
 772 #
 773 #                 Just type ``foo `bar` baz`` at the prompt.
 774 #
 775 #               Will translate to:
 776 #
 777 #                 <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
 778 #
 779 #               There's no arbitrary limit to the number of backticks you
 780 #               can use as delimters. If you need three consecutive backticks
 781 #               in your code, use four for delimiters, etc.
 782 #
 783 #       *       You can use spaces to get literal backticks at the edges:
 784 #
 785 #                 ... type `` `bar` `` ...
 786 #
 787 #               Turns to:
 788 #
 789 #                 ... type <code>`bar`</code> ...
 790 #
 791         $text = preg_replace_callback("@
 792                         (`+)            # $1 = Opening run of `
 793                         (.+?)           # $2 = The code block
 794                         (?<!`)
 795                         \\1
 796                         (?!`)
 797                 @xs",
 798                 '_DoCodeSpans_callback', $text);
 799
 800         return $text;
 801 }
 802 function _DoCodeSpans_callback($matches) {
 803         $c = $matches[2];
 804         $c = preg_replace('/^[ \t]*/', '', $c); # leading whitespace
 805         $c = preg_replace('/[ \t]*$/', '', $c); # trailing whitespace
 806         $c = _EncodeCode($c);
 807         return "<code>$c</code>";
 808 }
 809
 810
 811 function _EncodeCode($_) {
 812 #
 813 # Encode/escape certain characters inside Markdown code runs.
 814 # The point is that in code, these characters are literals,
 815 # and lose their special Markdown meanings.
 816 #
 817         global $md_escape_table;
 818
 819         # Encode all ampersands; HTML entities are not
 820         # entities within a Markdown code span.
 821         $_ = str_replace('&', '&amp;', $_);
 822
 823         # Do the angle bracket song and dance:
 824         $_ = str_replace(array('<',    '>'),
 825                                          array('&lt;', '&gt;'), $_);
 826
 827         # Now, escape characters that are magic in Markdown:
 828         $_ = str_replace(array_keys($md_escape_table),
 829                                          array_values($md_escape_table), $_);
 830
 831         return $_;
 832 }
 833
 834
 835 function _DoItalicsAndBold($text) {
 836         # <strong> must go first:
 837         $text = preg_replace('{ (\*\*|__) (?=\S) (.+?) (?<=\S) \1 }sx',
 838                 '<strong>\2</strong>', $text);
 839         # Then <em>:
 840         $text = preg_replace('{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }sx',
 841                 '<em>\2</em>', $text);
 842
 843         return $text;
 844 }
 845
 846
 847 function _DoBlockQuotes($text) {
 848         $text = preg_replace_callback('/
 849                   (                                                             # Wrap whole match in $1
 850                         (
 851                           ^[ \t]*>[ \t]?                        # ">" at the start of a line
 852                                 .+\n                                    # rest of the first line
 853                           (.+\n)*                                       # subsequent consecutive lines
 854                           \n*                                           # blanks
 855                         )+
 856                   )
 857                 /xm',
 858                 '_DoBlockQuotes_callback', $text);
 859
 860         return $text;
 861 }
 862 function _DoBlockQuotes_callback($matches) {
 863         $bq = $matches[1];
 864         # trim one level of quoting - trim whitespace-only lines
 865         $bq = preg_replace(array('/^[ \t]*>[ \t]?/m', '/^[ \t]+$/m'), '', $bq);
 866         $bq = _RunBlockGamut($bq);              # recurse
 867
 868         $bq = preg_replace('/^/m', "  ", $bq);
 869         # These leading spaces screw with <pre> content, so we need to fix that:
 870         $bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx',
 871                                                                 '_DoBlockQuotes_callback2', $bq);
 872
 873         return "<blockquote>\n$bq\n</blockquote>\n\n";
 874 }
 875 function _DoBlockQuotes_callback2($matches) {
 876         $pre = $matches[1];
 877         $pre = preg_replace('/^  /m', '', $pre);
 878         return $pre;
 879 }
 880
 881
 882 function _FormParagraphs($text) {
 883 #
 884 #       Params:
 885 #               $text - string to process with html <p> tags
 886 #
 887         global $md_html_blocks;
 888
 889         # Strip leading and trailing lines:
 890         $text = preg_replace(array('/\A\n+/', '/\n+\z/'), '', $text);
 891
 892         $grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);
 893         $count = count($grafs);
 894
 895         #
 896         # Wrap <p> tags.
 897         #
 898         foreach ($grafs as $key => $value) {
 899                 if (!isset( $md_html_blocks[$value] )) {
 900                         $value = _RunSpanGamut($value);
 901                         $value = preg_replace('/^([ \t]*)/', '<p>', $value);
 902                         $value .= "</p>";
 903                         $grafs[$key] = $value;
 904                 }
 905         }
 906
 907         #
 908         # Unhashify HTML blocks
 909         #
 910         foreach ($grafs as $key => $value) {
 911                 if (isset( $md_html_blocks[$value] )) {
 912                         $grafs[$key] = $md_html_blocks[$value];
 913                 }
 914         }
 915
 916         return implode("\n\n", $grafs);
 917 }
 918
 919
 920 function _EncodeAmpsAndAngles($text) {
 921 # Smart processing for ampersands and angle brackets that need to be encoded.
 922
 923         # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
 924         #   http://bumppo.net/projects/amputator/
 925         $text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/',
 926                                                  '&amp;', $text);;
 927
 928         # Encode naked <'s
 929         $text = preg_replace('{<(?![a-z/?\$!])}i', '&lt;', $text);
 930
 931         return $text;
 932 }
 933
 934
 935 function _EncodeBackslashEscapes($text) {
 936 #
 937 #       Parameter:  String.
 938 #       Returns:    The string, with after processing the following backslash
 939 #                               escape sequences.
 940 #
 941         global $md_escape_table, $md_backslash_escape_table;
 942         # Must process escaped backslashes first.
 943         return str_replace(array_keys($md_backslash_escape_table),
 944                                            array_values($md_backslash_escape_table), $text);
 945 }
 946
 947
 948 function _DoAutoLinks($text) {
 949         $text = preg_replace("!<((https?|ftp):[^'\">\\s]+)>!",
 950                                                  '<a href="\1">\1</a>', $text);
 951
 952         # Email addresses: <address@domain.foo>
 953         $text = preg_replace('{
 954                 <
 955                 (
 956                         [-.\w]+
 957                         \@
 958                         [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
 959                 )
 960                 >
 961                 }exi',
 962                 "_EncodeEmailAddress(_UnescapeSpecialChars(_UnslashQuotes('\\1')))",
 963                 $text);
 964
 965         return $text;
 966 }
 967
 968
 969 function _EncodeEmailAddress($addr) {
 970 #
 971 #       Input: an email address, e.g. "foo@example.com"
 972 #
 973 #       Output: the email address as a mailto link, with each character
 974 #               of the address encoded as either a decimal or hex entity, in
 975 #               the hopes of foiling most address harvesting spam bots. E.g.:
 976 #
 977 #         <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
 978 #               x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
 979 #               &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
 980 #
 981 #       Based by a filter by Matthew Wickline, posted to the BBEdit-Talk
 982 #       mailing list: <http://tinyurl.com/yu7ue>
 983 #
 984         $addr = "mailto:" . $addr;
 985         $length = strlen($addr);
 986
 987         # leave ':' alone (to spot mailto: later)
 988         $addr = preg_replace_callback('/([^\:])/',
 989                                                                   '_EncodeEmailAddress_callback', $addr);
 990
 991         $addr = "<a href=\"$addr\">$addr</a>";
 992         # strip the mailto: from the visible part
 993         $addr = preg_replace('/">.+?:/', '">', $addr);
 994
 995         return $addr;
 996 }
 997 function _EncodeEmailAddress_callback($matches) {
 998         $char = $matches[1];
 999         $r = rand(0, 100);
1000         # roughly 10% raw, 45% hex, 45% dec
1001         # '@' *must* be encoded. I insist.
1002         if ($r > 90 && $char != '@') return $char;
1003         if ($r < 45) return '&#x'.dechex(ord($char)).';';
1004         return '&#'.ord($char).';';
1005 }
1006
1007
function _UnescapeSpecialChars($text) {
#
# Swap back in all the special characters we've hidden: every value of
# $md_escape_table found in the text is replaced by its key.
#
    global $md_escape_table;

    $hidden   = array_values($md_escape_table);
    $original = array_keys($md_escape_table);

    return str_replace($hidden, $original, $text);
}
1016
1017
1018 # Tokenize_HTML is shared between PHP Markdown and PHP SmartyPants.
1019 # We only define it if it is not already defined.
if (!function_exists('_TokenizeHTML')) {
    function _TokenizeHTML($str) {
    #
    #   Parameter:  String containing HTML markup.
    #   Returns:    An array of the tokens comprising the input
    #               string. Each token is either a tag (possibly with nested
    #               tags contained therein, such as <a href="<MTFoo>">), or a
    #               run of text between tags. Each element of the array is a
    #               two-element array; the first is either 'tag' or 'text';
    #               the second is the actual value. Empty runs of text
    #               produce no token.
    #
    #   Regular expression derived from the _tokenize() subroutine in
    #   Brad Choate's MTRegex plugin.
    #   <http://www.bradchoate.com/past/mtregex.php>
    #
        # Tags may nest inside each other up to this many levels:
        $depth = 6;
        $nested_tags = str_repeat('(?:<[a-z\/!$](?:[^<>]|', $depth)
                     . str_repeat(')*>)', $depth);
        $match = '(?s:<!(?:--.*?--\s*)+>)|' .   # comment
                 '(?s:<\?.*?\?>)|' .            # processing instruction
                 $nested_tags;                  # nested tags

        # Case-insensitive, so that upper-case tags such as <DIV> are
        # recognised as tags instead of being treated as text.
        $parts = preg_split("/($match)/i", $str, -1, PREG_SPLIT_DELIM_CAPTURE);

        # With PREG_SPLIT_DELIM_CAPTURE the parts alternate: even indexes
        # hold the text between tags, odd indexes hold the tags.
        $tokens = array();
        foreach ($parts as $index => $part) {
            if ($index % 2) {
                $tokens[] = array('tag', $part);
            }
            elseif ($part !== '') {
                # Empty text (two adjacent tags, or a tag at either end of
                # the string) is skipped rather than reported as an empty
                # 'tag' token.
                $tokens[] = array('text', $part);
            }
        }

        return $tokens;
    }
}
1058
1059
function _Outdent($text) {
#
# Remove one level of line-leading tabs or spaces: at the start of every
# line, either one tab or between one and $md_tab_width spaces.
#
    global $md_tab_width;

    $one_indent = '/^(\t|[ ]{1,' . $md_tab_width . '})/m';

    return preg_replace($one_indent, '', $text);
}
1067
1068
function _Detab($text) {
#
# Expand every tab to spaces, up to the next multiple of $md_tab_width
# counted from the start of the line (in bytes).
#
# Inspired from a post by Bart Lateur:
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
#
# The earlier implementation relied on the /e (eval) pattern modifier,
# which was removed in PHP 7 and which also altered quotes and
# backslashes in the text before evaluating it. This version does the
# same expansion with plain string functions.
#
    global $md_tab_width;

    $width = (int) $md_tab_width;
    if ($width < 1 || strpos($text, "\t") === false) {
        # Nothing to expand (and a non-positive width would otherwise
        # mean a division by zero below).
        return $text;
    }

    $lines = explode("\n", $text);
    foreach ($lines as $i => $line) {
        if (strpos($line, "\t") === false) {
            continue;
        }

        # Every cell except the last one was followed by a tab. Each
        # expanded cell ends on a tab stop, so the padding of a cell
        # depends only on that cell's own length.
        $cells = explode("\t", $line);
        $tail  = array_pop($cells);

        $expanded = '';
        foreach ($cells as $cell) {
            $expanded .= $cell . str_repeat(' ', $width - strlen($cell) % $width);
        }

        $lines[$i] = $expanded . $tail;
    }

    return implode("\n", $lines);
}
1081
1082
function _UnslashQuotes($text) {
#
#   This function is useful to remove automatically slashed double quotes
#   when using preg_replace and evaluating an expression.
#   Parameter:  String.
#   Returns:    The string with any slash-double-quote (\") sequence replaced
#               by a single double quote.
#
    $slashed_quote = '\"';
    $plain_quote   = '"';

    return str_replace($slashed_quote, $plain_quote, $text);
}
1093
1094
1095 /*
1096
1097 PHP Markdown
1098 ============
1099
1100 Description
1101 -----------
1102
1103 This is a PHP translation of the original Markdown formatter written in
1104 Perl by John Gruber.
1105
1106 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1107 easy-to-write structured text format into HTML. Markdown's text format
1108 is most similar to that of plain text email, and supports features such
1109 as headers, *emphasis*, code blocks, blockquotes, and links.
1110
1111 Markdown's syntax is designed not as a generic markup language, but
1112 specifically to serve as a front-end to (X)HTML. You can use span-level
1113 HTML tags anywhere in a Markdown document, and you can use block level
1114 HTML tags (like <div> and <table>) as well.
1115
1116 For more information about Markdown's syntax, see:
1117
1118 <http://daringfireball.net/projects/markdown/>
1119
1120
1121 Bugs
1122 ----
1123
1124 To file bug reports please send email to:
1125
1126 <michel.fortin@michelf.com>
1127
1128 Please include with your report: (1) the example input; (2) the output you
1129 expected; (3) the output Markdown actually produced.
1130
1131
1132 Version History
1133 ---------------
1134
1135 1.0: Sat 21 Aug 2004
1136
1137 *       Fixed a couple of bugs in _DoLists() and _ProcessListItems() that
1138         caused unordered lists starting with `+` or `-` to be turned into
1139         *ordered* lists.
1140
1141 *       Added to the list of block-level HTML tags:
1142
1143                 noscript, form, fieldset, iframe, math
1144
1145 *       Fixed an odd bug where, with input like this:
1146
1147                 > This line starts the blockquote
1148                 * This list is part of the quote.
1149                 * Second item.
1150
1151                 This paragraph is not part of the blockquote.
1152
1153         The trailing paragraph was incorrectly included in the
1154         blockquote. (The solution was to add an extra "\n" after
1155         lists.)
1156
1157 *       The contents of `<pre>` tags inside `<blockquote>` are no longer
1158         indented in the HTML output.
1159
1160 *       PHP Markdown can now be used as a modifier by the Smarty
1161         templating engine. Rename the file to "modifier.markdown.php"
1162         and put it in your smarty plugins folder.
1163
1164 *       Now works as a bBlog formatter. Rename the file to
1165         "modifier.markdown.php" and place it in the "bBlog_plugins"
1166         folder.
1167
1168
1169 1.0fc1: Wed 8 Jul 2004
1170
1171 *       Greatly simplified the rules for code blocks. No more colons
1172         necessary; if it's indented (4 spaces or 1 tab), it's a code block.
1173
1174 *       Unordered list items can now be denoted by any of the following
1175         bullet markers: [*+-]
1176
1177 *       Replacing `"` with `&quot;` to fix literal quotes within title
1178         attributes.
1179
1180
1181 1.0b9: Sun 27 Jun 2004
1182
1183 *       Replacing `"` with `&quot;` to fix literal quotes within img alt
1184         attributes.
1185
1186
1187 1.0b8: Wed 23 Jun 2004
1188
1189 *   In WordPress, solved a bug where PHP Markdown did not deactivate
1190         the paragraph filter, converting all returns to a line break.
1191         The "texturize" filter was being disabled instead.
1192
1193 *       Added 'math' tags to block-level tag patterns in `_HashHTMLBlocks()`.
1194         Please disregard all the 'math'-tag related items in 1.0b7.
1195
1196 *       Commented out some vestigial code in `_EscapeSpecialChars()`
1197
1198
1199 1.0b7: Sat 12 Jun 2004
1200
1201 *   Added 'math' to `$tags_to_skip` pattern, for MathML users.
1202
1203 *   Tweaked regex for identifying HTML entities in
1204         `_EncodeAmpsAndAngles()`, so as to allow for the very long entity
1205         names used by MathML. (Thanks to Jacques Distler for the patch.)
1206
1207 *   Changed the internals of `_TokenizeHTML` to lower the PHP version
1208         requirement to PHP 4.0.5.
1209
1210
1211 1.0b6: Sun 6 Jun 2004
1212
1213 *   Added a WordPress plugin interface. This means that you can
1214         directly put the "markdown.php" file into the "wp-content/plugins"
1215         directory and then activate it from the administrative interface.
1216
1217 *   Added a Textile compatibility interface. Rename this file to
1218         "classTextile.php" and it can replace Textile anywhere.
1219
1220 *   The title attribute of reference-style links was ignored.
1221         This is now fixed.
1222
1223 *   Changed internal variables names so that they begin with `md_`
1224         instead of `g_`. This should reduce the risk of name collision with
1225         other programs.
1226
1227
1228 1.0b5: Sun 2 May 2004
1229
1230 *       Workaround for supporting `<ins>` and `<del>` as block-level tags.
1231         This only works if the start and end tags are on lines by
1232         themselves.
1233
1234 *       Three or more underscores can now be used for horizontal rules.
1235
1236 *       Lines containing only whitespace are trimmed from blockquotes.
1237
1238 *       You can now optionally wrap URLs with angle brackets -- like so:
1239         `<http://example.com>` -- in link definitions and inline links and
1240         images.
1241
1242 *       `_` and `*` characters in links and images are no longer escaped
1243         as HTML entities. Instead, we use the ridiculous but effective MD5
1244         hashing trick that's used to hide these characters elsewhere. The
1245         end result is that the HTML output uses the literal `*` and `_`
1246         characters, rather than the ugly entities.
1247
1248 *       Passing an empty string to the Markdown function no longer creates
1249         an empty paragraph.
1250
1251 *       Added a global declaration at the beginning of the file. This
1252         means you can now `include 'markdown.php'` from inside a function.
1253
1254
1255 1.0b4.1: Sun 4 Apr 2004
1256
1257 *       Fixed a bug where image tags did not close.
1258
1259 *       Fixed a bug where brackets `[]` inside a link caused the link to be
1260         ignored. PHP Markdown supports only 6 (!) levels of brackets inside a
1261         link (while John's original version of Markdown in Perl supports many more).
1262
1263
1264 1.0b4: Sat 27 Mar 2004
1265
1266 *       First release of PHP Markdown, based on the 1.0b4 release.
1267
1268
1269 Author & Contributors
1270 ---------------------
1271
1272 Original version by John Gruber
1273 <http://daringfireball.net/>
1274
1275 PHP translation by Michel Fortin
1276 <http://www.michelf.com/>
1277
1278 First WordPress plugin interface written by Matt Mullenweg
1279 <http://photomatt.net/>
1280
1281
1282 Copyright and License
1283 ---------------------
1284
1285 Copyright (c) 2004 Michel Fortin
1286 <http://www.michelf.com/>
1287 All rights reserved.
1288
1289 Copyright (c) 2003-2004 John Gruber
1290 <http://daringfireball.net/>
1291 All rights reserved.
1292
1293 Markdown is free software; you can redistribute it and/or modify it
1294 under the terms of the GNU General Public License as published by the
1295 Free Software Foundation; either version 2 of the License, or (at your
1296 option) any later version.
1297
1298 Markdown is distributed in the hope that it will be useful, but WITHOUT
1299 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
1300 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
1301 for more details.
1302
1303 */
1304 ?>