3 * Markdown - A text-to-HTML conversion tool for web writers
5 * @package php-markdown
6 * @author Michel Fortin <michel.fortin@michelf.com>
7 * @copyright 2004-2018 Michel Fortin <https://michelf.com/projects/php-markdown/>
8 * @copyright (Original Markdown) 2004-2006 John Gruber <https://daringfireball.net/projects/markdown/>
14 * Markdown Parser Class
16 class Markdown
implements MarkdownInterface
{
18 * Define the package version
21 const MARKDOWNLIB_VERSION
= "1.8.0";
24 * Simple function interface - Initialize the parser and return the result
25 * of its transform method. This will work fine for derived classes too.
32 public static function defaultTransform($text) {
33 // Take parser class on which this function was called.
34 $parser_class = \
get_called_class();
36 // Try to take parser from the static parser list
38 $parser =& $parser_list[$parser_class];
40 // Create the parser it not already set
42 $parser = new $parser_class;
45 // Transform text using parser.
46 return $parser->transform($text);
50 * Configuration variables
54 * Change to ">" for HTML output.
57 public $empty_element_suffix = " />";
60 * The width of indentation of the output markup
63 public $tab_width = 4;
66 * Change to `true` to disallow markup or entities.
69 public $no_markup = false;
70 public $no_entities = false;
74 * Change to `true` to enable line breaks on \n without two trailing spaces
77 public $hard_wrap = false;
80 * Predefined URLs and titles for reference links and images.
83 public $predef_urls = array();
84 public $predef_titles = array();
87 * Optional filter function for URLs
90 public $url_filter_func = null;
93 * Optional header id="" generation callback function.
96 public $header_id_func = null;
99 * Optional function for converting code block content to HTML
102 public $code_block_content_func = null;
105 * Optional function for converting code span content to HTML.
108 public $code_span_content_func = null;
111 * Class attribute to toggle "enhanced ordered list" behaviour
112 * setting this to true will allow ordered lists to start from the index
113 * number that is defined first.
121 * <li>List item two</li>
122 * <li>List item three</li>
127 public $enhanced_ordered_list = false;
130 * Parser implementation
134 * Regex to match balanced [brackets].
135 * Needed to insert a maximum bracket depth while converting to PHP.
138 protected $nested_brackets_depth = 6;
139 protected $nested_brackets_re;
141 protected $nested_url_parenthesis_depth = 4;
142 protected $nested_url_parenthesis_re;
145 * Table of hash values for escaped characters:
148 protected $escape_chars = '\`*_{}[]()>#+-.!';
149 protected $escape_chars_re;
152 * Constructor function. Initialize appropriate member variables.
155 public function __construct() {
157 $this->prepareItalicsAndBold();
159 $this->nested_brackets_re
=
160 str_repeat('(?>[^\[\]]+|\[', $this->nested_brackets_depth
).
161 str_repeat('\])*', $this->nested_brackets_depth
);
163 $this->nested_url_parenthesis_re
=
164 str_repeat('(?>[^()\s]+|\(', $this->nested_url_parenthesis_depth
).
165 str_repeat('(?>\)))*', $this->nested_url_parenthesis_depth
);
167 $this->escape_chars_re
= '['.preg_quote($this->escape_chars
).']';
169 // Sort document, block, and span gamut in ascendent priority order.
170 asort($this->document_gamut
);
171 asort($this->block_gamut
);
172 asort($this->span_gamut
);
177 * Internal hashes used during transformation.
180 protected $urls = array();
181 protected $titles = array();
182 protected $html_hashes = array();
185 * Status flag to avoid invalid nesting.
188 protected $in_anchor = false;
191 * Status flag to avoid invalid nesting.
194 protected $in_emphasis_processing = false;
/**
 * Reset all per-document parser state before a transformation begins.
 *
 * The reference-link tables are seeded from the predefined URLs and titles,
 * the HTML hash table is emptied and both nesting flags are lowered.
 *
 * @return void
 */
protected function setup() {
	// Nesting guards start out lowered.
	$this->in_emphasis_processing = false;
	$this->in_anchor = false;

	// No hashed HTML fragments have been produced yet.
	$this->html_hashes = array();

	// Reference tables begin with the caller-supplied definitions.
	$this->titles = $this->predef_titles;
	$this->urls = $this->predef_urls;
}
/**
 * Empty the per-document tables once a transformation is over, so that
 * they do not keep holding memory unnecessarily.
 *
 * @return void
 */
protected function teardown() {
	$this->urls = $this->titles = $this->html_hashes = array();
}
221 * Main function. Performs some preprocessing on the input text and passes
222 * it through the document gamut.
226 * @param string $text
229 public function transform($text) {
232 # Remove UTF-8 BOM and marker character in input, if present.
233 $text = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);
235 # Standardize line endings:
236 # DOS to Unix and Mac to Unix
237 $text = preg_replace('{\r\n?}', "\n", $text);
239 # Make sure $text ends with a couple of newlines:
242 # Convert all tabs to spaces.
243 $text = $this->detab($text);
245 # Turn block-level HTML blocks into hash entries
246 $text = $this->hashHTMLBlocks($text);
248 # Strip any lines consisting only of spaces and tabs.
249 # This makes subsequent regexen easier to write, because we can
250 # match consecutive blank lines with /\n+/ instead of something
251 # contorted like /[ ]*\n+/ .
252 $text = preg_replace('/^[ ]+$/m', '', $text);
254 # Run document gamut methods.
255 foreach ($this->document_gamut
as $method => $priority) {
256 $text = $this->$method($text);
265 * Define the document gamut
268 protected $document_gamut = array(
269 // Strip link definitions, store in hashes.
270 "stripLinkDefinitions" => 20,
271 "runBasicBlockGamut" => 30,
275 * Strips link definitions from text, stores the URLs and titles in
277 * @param string $text
280 protected function stripLinkDefinitions($text) {
282 $less_than_tab = $this->tab_width
- 1;
284 // Link defs are in the form: ^[id]: url "optional title"
285 $text = preg_replace_callback('{
286 ^[ ]{0,'.$less_than_tab.'}\[(.+)\][ ]?: # id = $1
288 \n? # maybe *one* newline
296 \n? # maybe one newline
299 (?<=\s) # lookbehind for whitespace
304 )? # title is optional
307 array($this, '_stripLinkDefinitions_callback'),
/**
 * Record one matched link definition and remove it from the text.
 *
 * @param  array $matches regex captures: [1] is the link id, [2] and [3]
 *                        carry the URL ([3] is used when [2] is empty) and
 *                        [4] is the optional title
 * @return string always the empty string, which replaces the definition
 */
protected function _stripLinkDefinitions_callback($matches) {
	// Link ids are stored lower-cased.
	$id = strtolower($matches[1]);

	if ($matches[2] == '') {
		$this->urls[$id] = $matches[3];
	} else {
		$this->urls[$id] = $matches[2];
	}

	// Bound by reference: when the title capture is absent the entry is
	// created as null instead of triggering an undefined-offset notice.
	$this->titles[$id] =& $matches[4];

	return '';
}
327 * Hashify HTML blocks
328 * @param string $text
331 protected function hashHTMLBlocks($text) {
332 if ($this->no_markup
) {
336 $less_than_tab = $this->tab_width
- 1;
339 * Hashify HTML blocks:
341 * We only want to do this for block-level HTML tags, such as headers,
342 * lists, and tables. That's because we still want to wrap <p>s around
343 * "paragraphs" that are wrapped in non-block-level tags, such as
344 * anchors, phrase emphasis, and spans. The list of tags we're looking
347 * * List "a" is made of tags which can be both inline or block-level.
348 * These will be treated block-level when the start tag is alone on
349 * its line, otherwise they're not matched here and will be taken as
351 * * List "b" is made of tags which are always block-level;
353 $block_tags_a_re = 'ins|del';
354 $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
355 'script|noscript|style|form|fieldset|iframe|math|svg|'.
356 'article|section|nav|aside|hgroup|header|footer|'.
359 // Regular expression for the content of a block tag.
360 $nested_tags_level = 4;
362 (?> # optional tag attributes
363 \s # starts with whitespace
365 [^>"/]+ # text outside quotes
367 /+(?!>) # slash not followed by ">"
369 "[^"]*" # text inside double quotes (tolerate ">")
371 \'[^\']*\' # text inside single quotes (tolerate ">")
378 [^<]+ # content without tag
380 <\2 # nested opening tag
381 '.$attr.' # attributes
385 >', $nested_tags_level). // end of opening tag
386 '.*?'. // last level nested tag content
388 </\2\s*> # closing nested tag
391 <(?!/\2\s*> # other tags with a different name
395 $content2 = str_replace('\2', '\3', $content);
398 * First, look for nested blocks, e.g.:
401 * tags for inner block must be indented.
405 * The outermost tags must start at the left margin for this to match,
406 * and the inner nested divs must be indented.
407 * We need to do this before the next, more liberal match, because the
408 * next match will start at the first `<div>` and stop at the
411 $text = preg_replace_callback('{(?>
413 (?<=\n) # Starting on its own line
415 \A\n? # the at beginning of the doc
419 # Match from `\n<tag>` to `</tag>\n`, handling nested tags
422 [ ]{0,'.$less_than_tab.'}
423 <('.$block_tags_b_re.')# start tag = $2
424 '.$attr.'> # attributes followed by > and \n
425 '.$content.' # content, support nesting
426 </\2> # the matching end tag
427 [ ]* # trailing spaces/tabs
428 (?=\n+|\Z) # followed by a newline or end of document
430 | # Special version for tags of group a.
432 [ ]{0,'.$less_than_tab.'}
433 <('.$block_tags_a_re.')# start tag = $3
434 '.$attr.'>[ ]*\n # attributes followed by >
435 '.$content2.' # content, support nesting
436 </\3> # the matching end tag
437 [ ]* # trailing spaces/tabs
438 (?=\n+|\Z) # followed by a newline or end of document
440 | # Special case just for <hr />. It was easier to make a special
441 # case than to make the other regex more complicated.
443 [ ]{0,'.$less_than_tab.'}
444 <(hr) # start tag = $2
445 '.$attr.' # attributes
446 /?> # the matching end tag
448 (?=\n{2,}|\Z) # followed by a blank line or end of document
450 | # Special case for standalone HTML comments:
452 [ ]{0,'.$less_than_tab.'}
457 (?=\n{2,}|\Z) # followed by a blank line or end of document
459 | # PHP and ASP-style processor instructions (<? and <%)
461 [ ]{0,'.$less_than_tab.'}
468 (?=\n{2,}|\Z) # followed by a blank line or end of document
472 array($this, '_hashHTMLBlocks_callback'),
480 * The callback for hashing HTML blocks
481 * @param string $matches
484 protected function _hashHTMLBlocks_callback($matches) {
486 $key = $this->hashBlock($text);
487 return "\n\n$key\n\n";
491 * Called whenever a tag must be hashed when a function inserts an atomic
492 * element in the text stream. Passing $text through this function gives
493 * a unique text-token which will be reverted back when calling unhash.
495 * The $boundary argument specifies what character should be used to surround
496 * the token. By convention, "B" is used for block elements that need not
497 * be wrapped into paragraph tags at the end, ":" is used for elements
498 * that are word separators and "X" is used in the general case.
500 * @param string $text
501 * @param string $boundary
504 protected function hashPart($text, $boundary = 'X') {
505 // Swap back any tag hash found in $text so we do not have to `unhash`
506 // multiple times at the end.
507 $text = $this->unhash($text);
509 // Then hash the block.
511 $key = "$boundary\x1A" . ++
$i . $boundary;
512 $this->html_hashes
[$key] = $text;
513 return $key; // String that will replace the tag.
/**
 * Convenience wrapper around hashPart() that always uses the "B" boundary.
 *
 * @param  string $text
 * @return string whatever hashPart() returns for $text
 */
protected function hashBlock($text) {
	$block_boundary = 'B';

	return $this->hashPart($text, $block_boundary);
}
526 * Define the block gamut - these are all the transformations that form
527 * block-level tags like paragraphs, headers, and list items.
530 protected $block_gamut = array(
532 "doHorizontalRules" => 20,
534 "doCodeBlocks" => 50,
535 "doBlockQuotes" => 60,
/**
 * Run block gamut transformations.
 *
 * Raw HTML in the Markdown source has to be hashed before anything else.
 * This must happen for each block, and not only once at the beginning of
 * the transformation, since hashed blocks can be part of list items and
 * could have been indented. Indented blocks would have been seen as a code
 * block in a previous pass of hashHTMLBlocks.
 *
 * @param  string $text
 * @return string
 */
protected function runBlockGamut($text) {
	return $this->runBasicBlockGamut($this->hashHTMLBlocks($text));
}
556 * Run block gamut transformations, without hashing HTML blocks. This is
557 * useful when HTML blocks are known to be already hashed, like in the first
558 * whole-document pass.
560 * @param string $text
563 protected function runBasicBlockGamut($text) {
565 foreach ($this->block_gamut
as $method => $priority) {
566 $text = $this->$method($text);
569 // Finally form paragraph and restore hashed blocks.
570 $text = $this->formParagraphs($text);
576 * Convert horizontal rules
577 * @param string $text
580 protected function doHorizontalRules($text) {
583 ^[ ]{0,3} # Leading space
584 ([-*_]) # $1: First marker
585 (?> # Repeated marker group
586 [ ]{0,2} # Zero, one, or two spaces.
587 \1 # Marker character
588 ){2,} # Group repeated at least twice
589 [ ]* # Tailing spaces
592 "\n".$this->hashBlock("<hr$this->empty_element_suffix")."\n",
598 * These are all the transformations that occur *within* block-level
599 * tags like paragraphs, headers, and list items.
602 protected $span_gamut = array(
603 // Process character escapes, code spans, and inline HTML
606 // Process anchor and image tags. Images must come first,
607 // because ![foo][f] looks like an anchor.
610 // Make links out of things like `<https://example.com/>`
611 // Must come after doAnchors, because you can use < and >
612 // delimiters in inline links like [this](<url>).
614 "encodeAmpsAndAngles" => 40,
615 "doItalicsAndBold" => 50,
616 "doHardBreaks" => 60,
620 * Run span gamut transformations
621 * @param string $text
624 protected function runSpanGamut($text) {
625 foreach ($this->span_gamut
as $method => $priority) {
626 $text = $this->$method($text);
/**
 * Replace line breaks with hashed `<br>` tags.
 *
 * When $hard_wrap is enabled every newline, together with any spaces in
 * front of it, becomes a break; otherwise only a newline preceded by two
 * or more spaces does.
 *
 * @param  string $text
 * @return string
 */
protected function doHardBreaks($text) {
	$break_re = $this->hard_wrap ? '/ *\n/' : '/ {2,}\n/';

	return preg_replace_callback(
		$break_re,
		array($this, '_doHardBreaks_callback'),
		$text
	);
}
/**
 * Produce the hashed replacement for one hard line break.
 *
 * @param  array $matches regex captures (not used)
 * @return string hash token standing in for the `<br>` tag and its newline
 */
protected function _doHardBreaks_callback($matches) {
	$break_tag = '<br' . $this->empty_element_suffix . "\n";

	return $this->hashPart($break_tag);
}
657 * Turn Markdown link shortcuts into XHTML <a> tags.
658 * @param string $text
661 protected function doAnchors($text) {
662 if ($this->in_anchor
) {
665 $this->in_anchor
= true;
667 // First, handle reference-style links: [link text] [id]
668 $text = preg_replace_callback('{
669 ( # wrap whole match in $1
671 ('.$this->nested_brackets_re
.') # link text = $2
674 [ ]? # one optional space
675 (?:\n[ ]*)? # one optional newline followed by spaces
682 array($this, '_doAnchors_reference_callback'), $text);
684 // Next, inline-style links: [link text](url "optional title")
685 $text = preg_replace_callback('{
686 ( # wrap whole match in $1
688 ('.$this->nested_brackets_re
.') # link text = $2
695 ('.$this->nested_url_parenthesis_re
.') # href = $4
699 ([\'"]) # quote char = $6
702 [ \n]* # ignore any spaces/tabs between closing quote and )
703 )? # title is optional
707 array($this, '_doAnchors_inline_callback'), $text);
709 // Last, handle reference-style shortcuts: [link text]
710 // These must come last in case you've also got [link text][1]
711 // or [link text](/foo)
712 $text = preg_replace_callback('{
713 ( # wrap whole match in $1
715 ([^\[\]]+) # link text = $2; can\'t contain [ or ]
719 array($this, '_doAnchors_reference_callback'), $text);
721 $this->in_anchor
= false;
726 * Callback method to parse referenced anchors
727 * @param string $matches
730 protected function _doAnchors_reference_callback($matches) {
731 $whole_match = $matches[1];
732 $link_text = $matches[2];
733 $link_id =& $matches[3];
735 if ($link_id == "") {
736 // for shortcut links like [this][] or [this].
737 $link_id = $link_text;
740 // lower-case and turn embedded newlines into spaces
741 $link_id = strtolower($link_id);
742 $link_id = preg_replace('{[ ]?\n}', ' ', $link_id);
744 if (isset($this->urls
[$link_id])) {
745 $url = $this->urls
[$link_id];
746 $url = $this->encodeURLAttribute($url);
748 $result = "<a href=\"$url\"";
749 if ( isset( $this->titles
[$link_id] ) ) {
750 $title = $this->titles
[$link_id];
751 $title = $this->encodeAttribute($title);
752 $result .= " title=\"$title\"";
755 $link_text = $this->runSpanGamut($link_text);
756 $result .= ">$link_text</a>";
757 $result = $this->hashPart($result);
759 $result = $whole_match;
765 * Callback method to parse inline anchors
766 * @param string $matches
769 protected function _doAnchors_inline_callback($matches) {
770 $whole_match = $matches[1];
771 $link_text = $this->runSpanGamut($matches[2]);
772 $url = $matches[3] == '' ?
$matches[4] : $matches[3];
773 $title =& $matches[7];
775 // If the URL was of the form <s p a c e s> it got caught by the HTML
776 // tag parser and hashed. Need to reverse the process before using
778 $unhashed = $this->unhash($url);
779 if ($unhashed != $url)
780 $url = preg_replace('/^<(.*)>$/', '\1', $unhashed);
782 $url = $this->encodeURLAttribute($url);
784 $result = "<a href=\"$url\"";
786 $title = $this->encodeAttribute($title);
787 $result .= " title=\"$title\"";
790 $link_text = $this->runSpanGamut($link_text);
791 $result .= ">$link_text</a>";
793 return $this->hashPart($result);
797 * Turn Markdown image shortcuts into <img> tags.
798 * @param string $text
801 protected function doImages($text) {
802 // First, handle reference-style labeled images: ![alt text][id]
803 $text = preg_replace_callback('{
804 ( # wrap whole match in $1
806 ('.$this->nested_brackets_re
.') # alt text = $2
809 [ ]? # one optional space
810 (?:\n[ ]*)? # one optional newline followed by spaces
818 array($this, '_doImages_reference_callback'), $text);
820 // Next, handle inline images: ![alt text](url "optional title")
821 // Don't forget: encode * and _
822 $text = preg_replace_callback('{
823 ( # wrap whole match in $1
825 ('.$this->nested_brackets_re
.') # alt text = $2
827 \s? # One optional whitespace character
831 <(\S*)> # src url = $3
833 ('.$this->nested_url_parenthesis_re
.') # src url = $4
837 ([\'"]) # quote char = $6
841 )? # title is optional
845 array($this, '_doImages_inline_callback'), $text);
851 * Callback to parse references image tags
852 * @param array $matches
855 protected function _doImages_reference_callback($matches) {
856 $whole_match = $matches[1];
857 $alt_text = $matches[2];
858 $link_id = strtolower($matches[3]);
860 if ($link_id == "") {
861 $link_id = strtolower($alt_text); // for shortcut links like ![this][].
864 $alt_text = $this->encodeAttribute($alt_text);
865 if (isset($this->urls
[$link_id])) {
866 $url = $this->encodeURLAttribute($this->urls
[$link_id]);
867 $result = "<img src=\"$url\" alt=\"$alt_text\"";
868 if (isset($this->titles
[$link_id])) {
869 $title = $this->titles
[$link_id];
870 $title = $this->encodeAttribute($title);
871 $result .= " title=\"$title\"";
873 $result .= $this->empty_element_suffix
;
874 $result = $this->hashPart($result);
876 // If there's no such link ID, leave intact:
877 $result = $whole_match;
884 * Callback to parse inline image tags
885 * @param array $matches
888 protected function _doImages_inline_callback($matches) {
889 $whole_match = $matches[1];
890 $alt_text = $matches[2];
891 $url = $matches[3] == '' ?
$matches[4] : $matches[3];
892 $title =& $matches[7];
894 $alt_text = $this->encodeAttribute($alt_text);
895 $url = $this->encodeURLAttribute($url);
896 $result = "<img src=\"$url\" alt=\"$alt_text\"";
898 $title = $this->encodeAttribute($title);
899 $result .= " title=\"$title\""; // $title already quoted
901 $result .= $this->empty_element_suffix
;
903 return $this->hashPart($result);
907 * Parse Markdown heading elements to HTML
908 * @param string $text
911 protected function doHeaders($text) {
913 * Setext-style headers:
920 $text = preg_replace_callback('{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx',
921 array($this, '_doHeaders_callback_setext'), $text);
927 * ## Header 2 with closing hashes ##
931 $text = preg_replace_callback('{
932 ^(\#{1,6}) # $1 = string of #\'s
934 (.+?) # $2 = Header text
936 \#* # optional closing #\'s (not counted)
939 array($this, '_doHeaders_callback_atx'), $text);
945 * Setext header parsing callback
946 * @param array $matches
949 protected function _doHeaders_callback_setext($matches) {
950 // Terrible hack to check we haven't found an empty list item.
951 if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1])) {
955 $level = $matches[2][0] == '=' ?
1 : 2;
957 // ID attribute generation
958 $idAtt = $this->_generateIdFromHeaderValue($matches[1]);
960 $block = "<h$level$idAtt>".$this->runSpanGamut($matches[1])."</h$level>";
961 return "\n" . $this->hashBlock($block) . "\n\n";
/**
 * ATX header parsing callback.
 *
 * @param  array $matches regex captures: [1] is the run of leading `#`
 *                        characters, [2] is the header text
 * @return string the hashed heading block surrounded by newlines
 */
protected function _doHeaders_callback_atx($matches) {
	$depth = strlen($matches[1]); // number of leading # characters
	$raw_title = $matches[2];

	// The id attribute is generated first, from the unprocessed header text.
	$id_attribute = $this->_generateIdFromHeaderValue($raw_title);
	$inner_html = $this->runSpanGamut($raw_title);

	$heading = '<h' . $depth . $id_attribute . '>' . $inner_html . '</h' . $depth . '>';

	return "\n" . $this->hashBlock($heading) . "\n\n";
}
979 * If a header_id_func property is set, we can use it to automatically
980 * generate an id attribute.
982 * This method returns a string in the form id="foo", or an empty string
984 * @param string $headerValue
987 protected function _generateIdFromHeaderValue($headerValue) {
988 if (!is_callable($this->header_id_func
)) {
992 $idValue = call_user_func($this->header_id_func
, $headerValue);
997 return ' id="' . $this->encodeAttribute($idValue) . '"';
1001 * Form HTML ordered (numbered) and unordered (bulleted) lists.
1002 * @param string $text
1005 protected function doLists($text) {
1006 $less_than_tab = $this->tab_width
- 1;
1008 // Re-usable patterns to match list item bullets and number markers:
1009 $marker_ul_re = '[*+-]';
1010 $marker_ol_re = '\d+[\.]';
1012 $markers_relist = array(
1013 $marker_ul_re => $marker_ol_re,
1014 $marker_ol_re => $marker_ul_re,
1017 foreach ($markers_relist as $marker_re => $other_marker_re) {
1018 // Re-usable pattern to match any entirel ul or ol list:
1022 ([ ]{0,'.$less_than_tab.'}) # $3 = number of spaces
1023 ('.$marker_re.') # $4 = first list item marker
1032 (?! # Negative lookahead for another list item marker
1037 (?= # Lookahead for another kind of list
1039 \3 # Must have the same indentation
1040 '.$other_marker_re.'[ ]+
1046 // We use a different prefix before nested lists than top-level lists.
1047 //See extended comment in _ProcessListItems().
1049 if ($this->list_level
) {
1050 $text = preg_replace_callback('{
1054 array($this, '_doLists_callback'), $text);
1056 $text = preg_replace_callback('{
1057 (?:(?<=\n)\n|\A\n?) # Must eat the newline
1060 array($this, '_doLists_callback'), $text);
1068 * List parsing callback
1069 * @param array $matches
1072 protected function _doLists_callback($matches) {
1073 // Re-usable patterns to match list item bullets and number markers:
1074 $marker_ul_re = '[*+-]';
1075 $marker_ol_re = '\d+[\.]';
1076 $marker_any_re = "(?:$marker_ul_re|$marker_ol_re)";
1077 $marker_ol_start_re = '[0-9]+';
1079 $list = $matches[1];
1080 $list_type = preg_match("/$marker_ul_re/", $matches[4]) ?
"ul" : "ol";
1082 $marker_any_re = ( $list_type == "ul" ?
$marker_ul_re : $marker_ol_re );
1085 $result = $this->processListItems($list, $marker_any_re);
1088 if ($this->enhanced_ordered_list
) {
1089 // Get the start number for ordered list.
1090 if ($list_type == 'ol') {
1091 $ol_start_array = array();
1092 $ol_start_check = preg_match("/$marker_ol_start_re/", $matches[4], $ol_start_array);
1093 if ($ol_start_check){
1094 $ol_start = $ol_start_array[0];
1099 if ($ol_start > 1 && $list_type == 'ol'){
1100 $result = $this->hashBlock("<$list_type start=\"$ol_start\">\n" . $result . "</$list_type>");
1102 $result = $this->hashBlock("<$list_type>\n" . $result . "</$list_type>");
1104 return "\n". $result ."\n\n";
1108 * Nesting tracker for list levels
1111 protected $list_level = 0;
1114 * Process the contents of a single ordered or unordered list, splitting it
1115 * into individual list items.
1116 * @param string $list_str
1117 * @param string $marker_any_re
1120 protected function processListItems($list_str, $marker_any_re) {
1122 * The $this->list_level global keeps track of when we're inside a list.
1123 * Each time we enter a list, we increment it; when we leave a list,
1124 * we decrement. If it's zero, we're not in a list anymore.
1126 * We do this because when we're not inside a list, we want to treat
1127 * something like this:
1129 * I recommend upgrading to version
1130 * 8. Oops, now this line is treated
1133 * As a single paragraph, despite the fact that the second line starts
1134 * with a digit-period-space sequence.
1136 * Whereas when we're inside a list (or sub-list), that line will be
1137 * treated as the start of a sub-list. What a kludge, huh? This is
1138 * an aspect of Markdown's syntax that's hard to parse perfectly
1139 * without resorting to mind-reading. Perhaps the solution is to
1140 * change the syntax rules such that sub-lists must start with a
1141 * starting cardinal number; e.g. "1." or "a.".
1143 $this->list_level++
;
1145 // Trim trailing blank lines:
1146 $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);
1148 $list_str = preg_replace_callback('{
1149 (\n)? # leading line = $1
1150 (^[ ]*) # leading whitespace = $2
1151 ('.$marker_any_re.' # list marker and space = $3
1152 (?:[ ]+|(?=\n)) # space only required if item is not empty
1154 ((?s:.*?)) # list item text = $4
1155 (?:(\n+(?=\n))|\n) # tailing blank line = $5
1156 (?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n))))
1158 array($this, '_processListItems_callback'), $list_str);
1160 $this->list_level
--;
/**
 * List item parsing callback.
 *
 * @param  array $matches regex captures: [1] leading blank line, [2] leading
 *                        whitespace, [3] list marker and following space,
 *                        [4] item text, [5] trailing blank line
 * @return string the item wrapped in `<li>` ... `</li>` plus a newline
 */
protected function _processListItems_callback($matches) {
	$body = $matches[4];

	// empty() keeps absent optional captures from raising notices.
	$has_blank_line = !empty($matches[1])
		|| !empty($matches[5])
		|| preg_match('/\n{2,}/', $body);

	if (!$has_blank_line) {
		// Recursion for sub-lists; the item is then passed to
		// formParagraphs() with its second argument set to false.
		$body = $this->doLists($this->outdent($body));
		return "<li>" . $this->formParagraphs($body, false) . "</li>\n";
	}

	// Swap the marker for an equal amount of whitespace so the item keeps
	// its indentation, then run it through the full block gamut.
	$indent = isset($matches[2]) ? $matches[2] : '';
	$padding = str_repeat(' ', strlen($matches[3]));
	$body = $this->runBlockGamut($this->outdent($indent . $padding . $body) . "\n");

	return "<li>" . $body . "</li>\n";
}
1192 * Process Markdown `<pre><code>` blocks.
1193 * @param string $text
1196 protected function doCodeBlocks($text) {
1197 $text = preg_replace_callback('{
1199 ( # $1 = the code block -- one or more lines, starting with a space/tab
1201 [ ]{'.$this->tab_width
.'} # Lines must start with a tab or a tab-width of spaces
1205 ((?=^[ ]{0,'.$this->tab_width
.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1207 array($this, '_doCodeBlocks_callback'), $text);
/**
 * Code block parsing callback.
 *
 * @param  array $matches regex captures: [1] is the indented code block
 * @return string the hashed `<pre><code>` block surrounded by blank lines
 */
protected function _doCodeBlocks_callback($matches) {
	$code = $this->outdent($matches[1]);

	// A user-supplied converter takes precedence over plain escaping; it
	// receives the code and an empty string as its second argument.
	$code = $this->code_block_content_func
		? call_user_func($this->code_block_content_func, $code, "")
		: htmlspecialchars($code, ENT_NOQUOTES);

	// Drop newlines at the very start and the very end of the block.
	$code = preg_replace('/\A\n+|\n+\z/', '', $code);

	$html = '<pre><code>' . $code . "\n</code></pre>";

	return "\n\n" . $this->hashBlock($html) . "\n\n";
}
/**
 * Build the markup for a code span and hash it.
 *
 * @param  string $code raw content of the code span
 * @return string hash token standing in for the `<code>` element
 */
protected function makeCodeSpan($code) {
	if (!$this->code_span_content_func) {
		// Default conversion: trim, then escape HTML special characters.
		$code = htmlspecialchars(trim($code), ENT_NOQUOTES);
	} else {
		// A user-supplied converter receives the code untouched.
		$code = call_user_func($this->code_span_content_func, $code);
	}

	return $this->hashPart('<code>' . $code . '</code>');
}
1249 * Define the emphasis operators with their regex matches
1252 protected $em_relist = array(
1253 '' => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?![\.,:;]?\s)',
1254 '*' => '(?<![\s*])\*(?!\*)',
1255 '_' => '(?<![\s_])_(?!_)',
1259 * Define the strong operators with their regex matches
1262 protected $strong_relist = array(
1263 '' => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?![\.,:;]?\s)',
1264 '**' => '(?<![\s*])\*\*(?!\*)',
1265 '__' => '(?<![\s_])__(?!_)',
1269 * Define the emphasis + strong operators with their regex matches
1272 protected $em_strong_relist = array(
1273 '' => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?![\.,:;]?\s)',
1274 '***' => '(?<![\s*])\*\*\*(?!\*)',
1275 '___' => '(?<![\s_])___(?!_)',
1279 * Container for prepared regular expressions
1282 protected $em_strong_prepared_relist;
/**
 * Precompute one token-matching regular expression for every combination
 * of an $em_relist key and a $strong_relist key.
 *
 * Each expression is stored in $em_strong_prepared_relist under the
 * concatenation of the two keys. Its alternatives are, in order: the
 * matching $em_strong_relist entry (when one exists), the emphasis
 * expression and the strong expression.
 *
 * @return void
 */
protected function prepareItalicsAndBold() {
	foreach ($this->em_relist as $em_token => $em_pattern) {
		foreach ($this->strong_relist as $strong_token => $strong_pattern) {
			$context = "$em_token$strong_token";

			$alternatives = array($em_pattern, $strong_pattern);
			if (isset($this->em_strong_relist[$context])) {
				// The combined em+strong expression must be tried first.
				array_unshift($alternatives, $this->em_strong_relist[$context]);
			}

			$this->em_strong_prepared_relist[$context] =
				'{(' . implode('|', $alternatives) . ')}';
		}
	}
}
1308 * Convert Markdown italics (emphasis) and bold (strong) to HTML
1309 * @param string $text
1312 protected function doItalicsAndBold($text) {
1313 if ($this->in_emphasis_processing
) {
1314 return $text; // avoid reentrency
1316 $this->in_emphasis_processing
= true;
1318 $token_stack = array('');
1319 $text_stack = array('');
1322 $tree_char_em = false;
1325 // Get prepared regular expression for seraching emphasis tokens
1326 // in current context.
1327 $token_re = $this->em_strong_prepared_relist
["$em$strong"];
1329 // Each loop iteration search for the next emphasis token.
1330 // Each token is then passed to handleSpanToken.
1331 $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE
);
1332 $text_stack[0] .= $parts[0];
1333 $token =& $parts[1];
1336 if (empty($token)) {
1337 // Reached end of text span: empty stack without emitting.
1338 // any more emphasis.
1339 while ($token_stack[0]) {
1340 $text_stack[1] .= array_shift($token_stack);
1341 $text_stack[0] .= array_shift($text_stack);
1346 $token_len = strlen($token);
1347 if ($tree_char_em) {
1348 // Reached closing marker while inside a three-char emphasis.
1349 if ($token_len == 3) {
1350 // Three-char closing marker, close em and strong.
1351 array_shift($token_stack);
1352 $span = array_shift($text_stack);
1353 $span = $this->runSpanGamut($span);
1354 $span = "<strong><em>$span</em></strong>";
1355 $text_stack[0] .= $this->hashPart($span);
1359 // Other closing marker: close one em or strong and
1360 // change current token state to match the other
1361 $token_stack[0] = str_repeat($token[0], 3-$token_len);
1362 $tag = $token_len == 2 ?
"strong" : "em";
1363 $span = $text_stack[0];
1364 $span = $this->runSpanGamut($span);
1365 $span = "<$tag>$span</$tag>";
1366 $text_stack[0] = $this->hashPart($span);
1367 $
$tag = ''; // $$tag stands for $em or $strong
1369 $tree_char_em = false;
1370 } else if ($token_len == 3) {
1372 // Reached closing marker for both em and strong.
1373 // Closing strong marker:
1374 for ($i = 0; $i < 2; ++
$i) {
1375 $shifted_token = array_shift($token_stack);
1376 $tag = strlen($shifted_token) == 2 ?
"strong" : "em";
1377 $span = array_shift($text_stack);
1378 $span = $this->runSpanGamut($span);
1379 $span = "<$tag>$span</$tag>";
1380 $text_stack[0] .= $this->hashPart($span);
1381 $
$tag = ''; // $$tag stands for $em or $strong
1384 // Reached opening three-char emphasis marker. Push on token
1385 // stack; will be handled by the special condition above.
1388 array_unshift($token_stack, $token);
1389 array_unshift($text_stack, '');
1390 $tree_char_em = true;
1392 } else if ($token_len == 2) {
1394 // Unwind any dangling emphasis marker:
1395 if (strlen($token_stack[0]) == 1) {
1396 $text_stack[1] .= array_shift($token_stack);
1397 $text_stack[0] .= array_shift($text_stack);
1400 // Closing strong marker:
1401 array_shift($token_stack);
1402 $span = array_shift($text_stack);
1403 $span = $this->runSpanGamut($span);
1404 $span = "<strong>$span</strong>";
1405 $text_stack[0] .= $this->hashPart($span);
1408 array_unshift($token_stack, $token);
1409 array_unshift($text_stack, '');
1413 // Here $token_len == 1
1415 if (strlen($token_stack[0]) == 1) {
1416 // Closing emphasis marker:
1417 array_shift($token_stack);
1418 $span = array_shift($text_stack);
1419 $span = $this->runSpanGamut($span);
1420 $span = "<em>$span</em>";
1421 $text_stack[0] .= $this->hashPart($span);
1424 $text_stack[0] .= $token;
1427 array_unshift($token_stack, $token);
1428 array_unshift($text_stack, '');
1433 $this->in_emphasis_processing
= false;
1434 return $text_stack[0];
/**
 * Parse Markdown blockquotes to HTML.
 *
 * Every run of lines that begins with a ">" marker (including the lines
 * that directly follow it and any trailing blank lines) is matched as a
 * whole and handed to _doBlockQuotes_callback for conversion.
 *
 * @param  string $text
 * @return string
 */
protected function doBlockQuotes($text) {
	$text = preg_replace_callback('/
		  (								# Wrap whole match in $1
			(?>
			  ^[ ]*>[ ]?			# ">" at the start of a line
				.+\n					# rest of the first line
			  (.+\n)*					# subsequent consecutive lines
			  \n*						# blanks
			)+
		  )
		/xm',
		array($this, '_doBlockQuotes_callback'), $text);

	return $text;
}
/**
 * Blockquote parsing callback.
 *
 * Strips one level of ">" quoting, runs the block gamut on the result
 * (so nested quotes are handled by recursion) and wraps the output in a
 * hashed <blockquote> element.
 *
 * @param  array $matches Regex match data; index 1 is the quoted block.
 * @return string
 */
protected function _doBlockQuotes_callback($matches) {
	$bq = $matches[1];
	// Trim one level of quoting and empty out whitespace-only lines.
	$bq = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $bq);
	$bq = $this->runBlockGamut($bq); // recurse

	// Indent every line of the generated markup.
	$bq = preg_replace('/^/m', " ", $bq);
	// These leading spaces cause problems with <pre> content,
	// so we need to fix that:
	$bq = preg_replace_callback('{(\s*<pre>.+?</pre>)}sx',
		array($this, '_doBlockQuotes_callback2'), $bq);

	return "\n" . $this->hashBlock("<blockquote>\n$bq\n</blockquote>") . "\n\n";
}
/**
 * Blockquote parsing callback for <pre> content.
 *
 * Removes the line-leading indentation from a matched <pre>...</pre>
 * span.
 *
 * @param  array $matches Regex match data; index 1 is the <pre> span.
 * @return string
 */
protected function _doBlockQuotes_callback2($matches) {
	$pre = $matches[1];
	$pre = preg_replace('/^ /m', '', $pre);
	return $pre;
}
/**
 * Split text into paragraphs and turn each one into its final HTML.
 *
 * The text is split on runs of two or more newlines. A chunk that
 * consists solely of a block hash key (B\x1A<digits>B) is replaced by the
 * HTML stored for that key in $this->html_hashes. Any other chunk goes
 * through the span gamut, is optionally wrapped in <p> tags, and is
 * then unhashed.
 *
 * @param  string  $text      String to process in paragraphs
 * @param  boolean $wrap_in_p Whether paragraphs should be wrapped in <p> tags
 * @return string
 */
protected function formParagraphs($text, $wrap_in_p = true) {
	// Strip leading and trailing lines:
	$text = preg_replace('/\A\n+|\n+\z/', '', $text);

	$grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);

	// Wrap <p> tags and unhashify HTML blocks
	foreach ($grafs as $key => $value) {
		if (!preg_match('/^B\x1A[0-9]+B$/', $value)) {
			// Is a paragraph.
			$value = $this->runSpanGamut($value);
			if ($wrap_in_p) {
				// Leading spaces are replaced by the opening tag.
				$value = preg_replace('/^([ ]*)/', "<p>", $value);
				$value .= "</p>";
			}
			$grafs[$key] = $this->unhash($value);
		} else {
			// Is a block: swap the hash key for the stored HTML.
			$grafs[$key] = $this->html_hashes[$value];
		}
	}

	return implode("\n\n", $grafs);
}
/**
 * Encode text for a double-quoted HTML attribute. This function
 * is *not* suitable for attributes enclosed in single quotes, since
 * only the double quote character is escaped.
 *
 * @param  string $text
 * @return string
 */
protected function encodeAttribute($text) {
	$text = $this->encodeAmpsAndAngles($text);
	// A raw double quote would terminate the attribute value.
	$text = str_replace('"', '&quot;', $text);
	return $text;
}
/**
 * Encode text for a double-quoted HTML attribute containing a URL,
 * applying the URL filter if one is set. Also generates the textual
 * representation of the URL and stores it in $text: for "mailto:" and
 * "tel:" URLs the scheme prefix is left out of that text.
 * This function is *not* suitable for attributes enclosed in single quotes.
 *
 * @param  string $url
 * @param  string &$text Passed by reference; receives the display text
 * @return string Encoded URL
 */
protected function encodeURLAttribute($url, &$text = null) {
	if ($this->url_filter_func) {
		$url = call_user_func($this->url_filter_func, $url);
	}

	if (preg_match('{^mailto:}i', $url)) {
		// 7 == strlen("mailto:"): the display text skips the scheme.
		$url = $this->encodeEntityObfuscatedAttribute($url, $text, 7);
	} else if (preg_match('{^tel:}i', $url)) {
		$url = $this->encodeAttribute($url);
		// 4 == strlen("tel:")
		$text = substr($url, 4);
	} else {
		$url = $this->encodeAttribute($url);
		$text = $url;
	}

	return $url;
}
/**
 * Smart processing for ampersands and angle brackets that need to
 * be encoded. Valid character entities are left alone unless the
 * no-entities mode is set, in which case every ampersand is encoded.
 *
 * @param  string $text
 * @return string
 */
protected function encodeAmpsAndAngles($text) {
	if ($this->no_entities) {
		$text = str_replace('&', '&amp;', $text);
	} else {
		// Ampersand-encoding based entirely on Nat Irons's Amputator
		// MT plugin: <http://bumppo.net/projects/amputator/>
		// Only ampersands that do not start an entity are encoded.
		$text = preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/',
			'&amp;', $text);
	}
	// Encode remaining <'s
	$text = str_replace('<', '&lt;', $text);

	return $text;
}
/**
 * Parse Markdown automatic links to anchor HTML tags.
 *
 * Handles <scheme:...> URLs for the http, https, ftp, dict and tel
 * schemes, and <address@domain> e-mail addresses (with an optional
 * "mailto:" prefix).
 *
 * @param  string $text
 * @return string
 */
protected function doAutoLinks($text) {
	$text = preg_replace_callback('{<((https?|ftp|dict|tel):[^\'">\s]+)>}i',
		array($this, '_doAutoLinks_url_callback'), $text);

	// Email addresses: <address@domain.foo>
	$text = preg_replace_callback('{
		<
		(?:mailto:)?
		(
			(?:
				[-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+
			|
				".*?"
			)
			\@
			(?:
				[-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
			|
				\[[\d.a-fA-F:]+\]	# IPv4 & IPv6
			)
		)
		>
		}xi',
		array($this, '_doAutoLinks_email_callback'), $text);

	return $text;
}
/**
 * Parse URL callback.
 *
 * Builds a hashed anchor tag for an automatic link. The link text is
 * the value encodeURLAttribute stores in its by-reference argument.
 *
 * @param  array $matches Regex match data; index 1 is the URL.
 * @return string
 */
protected function _doAutoLinks_url_callback($matches) {
	$url = $this->encodeURLAttribute($matches[1], $text);
	$link = "<a href=\"$url\">$text</a>";
	return $this->hashPart($link);
}
/**
 * Parse email address callback.
 *
 * Builds a hashed "mailto:" anchor tag for an automatic e-mail link.
 * The link text is the value encodeURLAttribute stores in its
 * by-reference argument.
 *
 * @param  array $matches Regex match data; index 1 is the address.
 * @return string
 */
protected function _doAutoLinks_email_callback($matches) {
	$addr = $matches[1];
	$url = $this->encodeURLAttribute("mailto:$addr", $text);
	$link = "<a href=\"$url\">$text</a>";
	return $this->hashPart($link);
}
/**
 * Input: some text to obfuscate, e.g. "mailto:foo@example.com"
 *
 * Output: the same text but with most characters encoded as either a
 * decimal or hex character reference (forms such as "&#109;" and
 * "&#x61;"), in the hope of foiling most address harvesting spam bots.
 * Which form each character gets is derived from a checksum of the
 * input, so the same input always yields the same output.
 *
 * Note: the additional output $tail is assigned the same value as the
 * output, minus the number of leading characters given by $head_length.
 *
 * Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
 * With some optimizations by Milian Wolff. Forced encoding of HTML
 * attribute special characters by Allan Odgaard.
 *
 * @param  string  $text
 * @param  string  &$tail
 * @param  integer $head_length
 * @return string
 */
protected function encodeEntityObfuscatedAttribute($text, &$tail = null, $head_length = 0) {
	// Nothing to encode; this also keeps the seed division below safe.
	if ($text == "") {
		return $tail = "";
	}

	// Split between every pair of bytes.
	$chars = preg_split('/(?<!^)(?!$)/', $text);
	$seed = (int)abs(crc32($text) / strlen($text)); // Deterministic seed.

	foreach ($chars as $key => $char) {
		$ord = ord($char);
		// Ignore non-ascii chars.
		if ($ord < 128) {
			$r = ($seed * (1 + $key)) % 100; // Pseudo-random function.
			// roughly 10% raw, 45% hex, 45% dec
			// '@' *must* be encoded. I insist.
			// '"' and '>' have to be encoded inside the attribute
			if ($r > 90 && strpos('@"&>', $char) === false) {
				/* do nothing */
			} else if ($r < 45) {
				$chars[$key] = '&#x'.dechex($ord).';';
			} else {
				$chars[$key] = '&#'.$ord.';';
			}
		}
	}

	$text = implode('', $chars);
	$tail = $head_length ? implode('', array_slice($chars, $head_length)) : $text;

	return $text;
}
/**
 * Take the string $str and parse it into tokens, hashing embedded HTML,
 * escaped characters and handling code spans.
 *
 * HTML tags are only recognised as tokens when $this->no_markup is off.
 *
 * @param  string $str
 * @return string
 */
protected function parseSpan($str) {
	$output = '';

	$span_re = '{
			(
				\\\\'.$this->escape_chars_re.'
			|
				(?<![`\\\\])
				`+						# code span marker
		'.( $this->no_markup ? '' : '
			|
				<!--    .*?     -->		# comment
			|
				<\?.*?\?> | <%.*?%>		# processing instruction
			|
				<[!$]?[-a-zA-Z0-9:_]+	# regular tags
				(?>
					\s
					(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
				)?
				>
			|
				<[-a-zA-Z0-9:_]+\s*/> # xml-style empty tag
			|
				</[-a-zA-Z0-9:_]+\s*> # closing tag
		').'
			)
			}xs';

	while (1) {
		// Each loop iteration searches for either the next tag, the next
		// opening code span marker, or the next escaped character.
		// Each token is then passed to handleSpanToken.
		$parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);

		// Create token from text preceding tag.
		if ($parts[0] != "") {
			$output .= $parts[0];
		}

		// Check if we reach the end.
		if (isset($parts[1])) {
			// handleSpanToken may consume part of $parts[2] by reference.
			$output .= $this->handleSpanToken($parts[1], $parts[2]);
			$str = $parts[2];
		} else {
			break;
		}
	}

	return $output;
}
/**
 * Handle $token provided by parseSpan by determining its nature and
 * returning the corresponding value that should replace it.
 *
 * @param  string $token
 * @param  string &$str  Text following the token. When a code span is
 *                       closed, it is reduced to the text after the
 *                       closing marker.
 * @return string
 */
protected function handleSpanToken($token, &$str) {
	switch ($token[0]) {
		case "\\":
			// Escaped character: emit it as a numeric character reference.
			return $this->hashPart("&#". ord($token[1]). ";");
		case "`":
			// Search for end marker in remaining text.
			if (preg_match('/^(.*?[^`])'.preg_quote($token).'(?!`)(.*)$/sm',
				$str, $matches))
			{
				$str = $matches[2];
				$codespan = $this->makeCodeSpan($matches[1]);
				return $this->hashPart($codespan);
			}
			return $token; // Return as text since no ending marker found.
		default:
			return $this->hashPart($token);
	}
}
/**
 * Remove one level of line-leading tabs or spaces: either a single tab
 * or between one and $this->tab_width spaces at the start of each line.
 *
 * @param  string $text
 * @return string
 */
protected function outdent($text) {
	return preg_replace('/^(\t|[ ]{1,' . $this->tab_width . '})/m', '', $text);
}
/**
 * String length function for detab. `_initDetab` will create a function to
 * handle UTF-8 if the default function does not exist.
 *
 * @var string|callable
 */
protected $utf8_strlen = 'mb_strlen';
/**
 * Replace tabs with the appropriate amount of spaces.
 *
 * For each line we separate the line in blocks delimited by tab characters.
 * Then we reconstruct every line by adding the appropriate number of spaces
 * between each block.
 *
 * @param  string $text
 * @return string
 */
protected function detab($text) {
	// Only lines that actually contain a tab reach the callback.
	$text = preg_replace_callback('/^.*\t.*$/m',
		array($this, '_detab_callback'), $text);

	return $text;
}
/**
 * Replace tabs callback.
 *
 * Rebuilds one line, padding with spaces up to the next multiple of
 * $this->tab_width wherever a tab was found.
 *
 * @param  array $matches Regex match data; index 0 is the whole line.
 * @return string
 */
protected function _detab_callback($matches) {
	$line = $matches[0];
	$strlen = $this->utf8_strlen; // strlen function for UTF-8.

	// Split in blocks.
	$blocks = explode("\t", $line);
	// Add each blocks to the line.
	$line = $blocks[0];
	unset($blocks[0]); // Do not add first block twice.
	foreach ($blocks as $block) {
		// Calculate amount of space, insert spaces, insert block.
		$amount = $this->tab_width -
			$strlen($line, 'UTF-8') % $this->tab_width;
		$line .= str_repeat(" ", $amount) . $block;
	}
	return $line;
}
/**
 * Check for the availability of the function in the `utf8_strlen` property
 * (initially `mb_strlen`). If the function is not available, create a
 * function that will loosely count the number of UTF-8 characters with a
 * regular expression.
 *
 * @return void
 */
protected function _initDetab() {
	if (function_exists($this->utf8_strlen)) {
		return;
	}

	// Fallback: count each lead byte (plus its continuation bytes) once.
	$this->utf8_strlen = function($text) {
		return preg_match_all('/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/', $text, $m);
	};
}
/**
 * Swap back in all the content that was replaced by hash keys.
 *
 * A hash key is a character, the \x1A byte, a run of digits, and the
 * same character again; each one found is resolved by _unhash_callback.
 *
 * @param  string $text
 * @return string
 */
protected function unhash($text) {
	return preg_replace_callback('/(.)\x1A[0-9]+\1/',
		array($this, '_unhash_callback'), $text);
}
1903 * Unhashing callback
1904 * @param array $matches
1907 protected function _unhash_callback($matches) {
1908 return $this->html_hashes
[$matches[0]];