MDL-61928 formslib: shortforms should work with non-editable forms
[moodle.git] / lib / markdown / Markdown.php
blobf885a3d65c45ba811d978f7e0418d1069cc397d6
1 <?php
2 /**
3 * Markdown - A text-to-HTML conversion tool for web writers
5 * @package php-markdown
6 * @author Michel Fortin <michel.fortin@michelf.com>
7 * @copyright 2004-2016 Michel Fortin <https://michelf.com/projects/php-markdown/>
8 * @copyright (Original Markdown) 2004-2006 John Gruber <https://daringfireball.net/projects/markdown/>
9 */
11 namespace Michelf;
13 /**
14 * Markdown Parser Class
16 class Markdown implements MarkdownInterface {
17 /**
18 * Define the package version
19 * @var string
21 const MARKDOWNLIB_VERSION = "1.7.0";
/**
 * Convenience entry point: run $text through the transform() method of a
 * parser whose class is the one this static method was called on, so it
 * works unchanged for derived classes.
 *
 * The parser is created on first use and kept in a static list keyed by
 * class name, so later calls reuse it instead of constructing a new one.
 *
 * @api
 *
 * @param string $text
 * @return string
 */
public static function defaultTransform($text) {
    // Parsers created so far, indexed by class name; persists between calls.
    static $parser_list;

    // Late static binding: the class the call was made on.
    $called_class = \get_called_class();

    // Bind by reference so that filling the slot below also fills the list.
    $instance =& $parser_list[$called_class];
    if (!$instance) {
        $instance = new $called_class;
    }

    return $instance->transform($text);
}
49 /**
50 * Configuration variables
53 /**
54 * Change to ">" for HTML output.
55 * @var string
57 public $empty_element_suffix = " />";
59 /**
60 * The width of indentation of the output markup
61 * @var int
63 public $tab_width = 4;
65 /**
66 * Change to `true` to disallow markup or entities.
67 * @var boolean
69 public $no_markup = false;
70 public $no_entities = false;
73 /**
74 * Change to `true` to enable line breaks on \n without two trailing spaces
75 * @var boolean
77 public $hard_wrap = false;
79 /**
80 * Predefined URLs and titles for reference links and images.
81 * @var array
83 public $predef_urls = array();
84 public $predef_titles = array();
86 /**
87 * Optional filter function for URLs
88 * @var callable
90 public $url_filter_func = null;
92 /**
93 * Optional header id="" generation callback function.
94 * @var callable
96 public $header_id_func = null;
98 /**
99 * Optional function for converting code block content to HTML
100 * @var callable
102 public $code_block_content_func = null;
105 * Optional function for converting code span content to HTML.
106 * @var callable
108 public $code_span_content_func = null;
111 * Class attribute to toggle "enhanced ordered list" behaviour
112 * setting this to true will allow ordered lists to start from the index
113 * number that is defined first.
115 * For example:
116 * 2. List item two
117 * 3. List item three
119 * Becomes:
120 * <ol start="2">
121 * <li>List item two</li>
122 * <li>List item three</li>
123 * </ol>
125 * @var bool
127 public $enhanced_ordered_list = false;
130 * Parser implementation
134 * Regex to match balanced [brackets].
135 * Needed to insert a maximum bracket depth while converting to PHP.
136 * @var int
138 protected $nested_brackets_depth = 6;
139 protected $nested_brackets_re;
141 protected $nested_url_parenthesis_depth = 4;
142 protected $nested_url_parenthesis_re;
145 * Table of hash values for escaped characters:
146 * @var string
148 protected $escape_chars = '\`*_{}[]()>#+-.!';
149 protected $escape_chars_re;
/**
 * Constructor. Builds the derived regular-expression fragments from the
 * configured depths and escape characters, then sorts the gamut tables.
 *
 * @return void
 */
public function __construct() {
    $this->_initDetab();
    $this->prepareItalicsAndBold();

    // Balanced [brackets]: the opening half of the pattern is repeated once
    // per allowed nesting level, followed by the same number of closers.
    $bracket_depth = $this->nested_brackets_depth;
    $bracket_open  = str_repeat('(?>[^\[\]]+|\[', $bracket_depth);
    $bracket_close = str_repeat('\])*', $bracket_depth);
    $this->nested_brackets_re = $bracket_open . $bracket_close;

    // Same construction for balanced (parentheses) inside URLs.
    $paren_depth = $this->nested_url_parenthesis_depth;
    $paren_open  = str_repeat('(?>[^()\s]+|\(', $paren_depth);
    $paren_close = str_repeat('(?>\)))*', $paren_depth);
    $this->nested_url_parenthesis_re = $paren_open . $paren_close;

    // Character class matching any single escapable character.
    $this->escape_chars_re = '[' . preg_quote($this->escape_chars) . ']';

    // Order each gamut by ascending priority value; asort() keeps the
    // method-name keys attached to their priorities.
    asort($this->document_gamut);
    asort($this->block_gamut);
    asort($this->span_gamut);
}
177 * Internal hashes used during transformation.
178 * @var array
180 protected $urls = array();
181 protected $titles = array();
182 protected $html_hashes = array();
185 * Status flag to avoid invalid nesting.
186 * @var boolean
188 protected $in_anchor = false;
/**
 * Reset per-document parser state before a transformation starts.
 *
 * The URL and title tables start from the predefined references; the HTML
 * hash table is emptied and the anchor-nesting flag is cleared.
 *
 * @return void
 */
protected function setup() {
    $this->in_anchor   = false;
    $this->html_hashes = array();

    // Seed reference tables with the user-supplied predefined entries.
    $this->titles = $this->predef_titles;
    $this->urls   = $this->predef_urls;
}
/**
 * Release per-document tables once a transformation is finished so they do
 * not keep memory alive unnecessarily.
 *
 * @return void
 */
protected function teardown() {
    $this->html_hashes = array();
    $this->titles      = array();
    $this->urls        = array();
}
/**
 * Main entry point: normalise the input text, then pass it through every
 * method listed in the document gamut, in gamut order.
 *
 * @api
 *
 * @param string $text
 * @return string
 */
public function transform($text) {
    $this->setup();

    // Strip a leading UTF-8 BOM and any \x1A byte; \x1A is the marker
    // character used inside hash tokens, so it must not occur in the input.
    $doc = preg_replace('{^\xEF\xBB\xBF|\x1A}', '', $text);

    // Normalise DOS (\r\n) and old Mac (\r) line endings to \n.
    $doc = preg_replace('{\r\n?}', "\n", $doc);

    // Guarantee the document ends with a couple of newlines.
    $doc .= "\n\n";

    // Tabs become spaces before any pattern matching happens.
    $doc = $this->detab($doc);

    // Block-level HTML is swapped for hash tokens.
    $doc = $this->hashHTMLBlocks($doc);

    // Empty out lines made only of spaces, so later patterns can detect
    // consecutive blank lines with a plain /\n+/.
    $doc = preg_replace('/^[ ]+$/m', '', $doc);

    // Run each document-gamut method; only the keys (method names) matter.
    foreach (array_keys($this->document_gamut) as $step) {
        $doc = $this->$step($doc);
    }

    $this->teardown();

    return $doc . "\n";
}
258 * Define the document gamut
259 * @var array
261 protected $document_gamut = array(
262 // Strip link definitions, store in hashes.
263 "stripLinkDefinitions" => 20,
264 "runBasicBlockGamut" => 30,
268 * Strips link definitions from text, stores the URLs and titles in
269 * hash references
270 * @param string $text
271 * @return string
273 protected function stripLinkDefinitions($text) {
275 $less_than_tab = $this->tab_width - 1;
277 // Link defs are in the form: ^[id]: url "optional title"
278 $text = preg_replace_callback('{
279 ^[ ]{0,'.$less_than_tab.'}\[(.+)\][ ]?: # id = $1
280 [ ]*
281 \n? # maybe *one* newline
282 [ ]*
284 <(.+?)> # url = $2
286 (\S+?) # url = $3
288 [ ]*
289 \n? # maybe one newline
290 [ ]*
292 (?<=\s) # lookbehind for whitespace
293 ["(]
294 (.*?) # title = $4
295 [")]
296 [ ]*
297 )? # title is optional
298 (?:\n+|\Z)
299 }xm',
300 array($this, '_stripLinkDefinitions_callback'),
301 $text
303 return $text;
/**
 * Callback for stripLinkDefinitions(): record one link definition in the
 * URL and title tables and remove it from the text.
 *
 * @param array $matches $1 = id, $2 = URL written as <url>, $3 = bare URL,
 *                       $4 = optional title
 * @return string Always the empty string, which replaces the definition.
 */
protected function _stripLinkDefinitions_callback($matches) {
    // Reference ids are stored lower-cased.
    $id = strtolower($matches[1]);

    // Prefer the <bracketed> URL; fall back to the bare one when it is empty.
    if ($matches[2] == '') {
        $this->urls[$id] = $matches[3];
    } else {
        $this->urls[$id] = $matches[2];
    }

    // Bound by reference: when the optional title group did not take part
    // in the match the entry simply becomes null, without raising a notice.
    $this->titles[$id] =& $matches[4];

    return '';
}
320 * Hashify HTML blocks
321 * @param string $text
322 * @return string
324 protected function hashHTMLBlocks($text) {
325 if ($this->no_markup) {
326 return $text;
329 $less_than_tab = $this->tab_width - 1;
332 * Hashify HTML blocks:
334 * We only want to do this for block-level HTML tags, such as headers,
335 * lists, and tables. That's because we still want to wrap <p>s around
336 * "paragraphs" that are wrapped in non-block-level tags, such as
337 * anchors, phrase emphasis, and spans. The list of tags we're looking
338 * for is hard-coded:
340 * * List "a" is made of tags which can be both inline or block-level.
341 * These will be treated block-level when the start tag is alone on
342 * its line, otherwise they're not matched here and will be taken as
343 * inline later.
344 * * List "b" is made of tags which are always block-level;
346 $block_tags_a_re = 'ins|del';
347 $block_tags_b_re = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
348 'script|noscript|style|form|fieldset|iframe|math|svg|'.
349 'article|section|nav|aside|hgroup|header|footer|'.
350 'figure';
352 // Regular expression for the content of a block tag.
353 $nested_tags_level = 4;
354 $attr = '
355 (?> # optional tag attributes
356 \s # starts with whitespace
358 [^>"/]+ # text outside quotes
360 /+(?!>) # slash not followed by ">"
362 "[^"]*" # text inside double quotes (tolerate ">")
364 \'[^\']*\' # text inside single quotes (tolerate ">")
368 $content =
369 str_repeat('
371 [^<]+ # content without tag
373 <\2 # nested opening tag
374 '.$attr.' # attributes
378 >', $nested_tags_level). // end of opening tag
379 '.*?'. // last level nested tag content
380 str_repeat('
381 </\2\s*> # closing nested tag
384 <(?!/\2\s*> # other tags with a different name
386 )*',
387 $nested_tags_level);
388 $content2 = str_replace('\2', '\3', $content);
391 * First, look for nested blocks, e.g.:
392 * <div>
393 * <div>
394 * tags for inner block must be indented.
395 * </div>
396 * </div>
398 * The outermost tags must start at the left margin for this to match,
399 * and the inner nested divs must be indented.
400 * We need to do this before the next, more liberal match, because the
401 * next match will start at the first `<div>` and stop at the
402 * first `</div>`.
404 $text = preg_replace_callback('{(?>
406 (?<=\n) # Starting on its own line
407 | # or
408 \A\n? # the at beginning of the doc
410 ( # save in $1
412 # Match from `\n<tag>` to `</tag>\n`, handling nested tags
413 # in between.
415 [ ]{0,'.$less_than_tab.'}
416 <('.$block_tags_b_re.')# start tag = $2
417 '.$attr.'> # attributes followed by > and \n
418 '.$content.' # content, support nesting
419 </\2> # the matching end tag
420 [ ]* # trailing spaces/tabs
421 (?=\n+|\Z) # followed by a newline or end of document
423 | # Special version for tags of group a.
425 [ ]{0,'.$less_than_tab.'}
426 <('.$block_tags_a_re.')# start tag = $3
427 '.$attr.'>[ ]*\n # attributes followed by >
428 '.$content2.' # content, support nesting
429 </\3> # the matching end tag
430 [ ]* # trailing spaces/tabs
431 (?=\n+|\Z) # followed by a newline or end of document
433 | # Special case just for <hr />. It was easier to make a special
434 # case than to make the other regex more complicated.
436 [ ]{0,'.$less_than_tab.'}
437 <(hr) # start tag = $2
438 '.$attr.' # attributes
439 /?> # the matching end tag
440 [ ]*
441 (?=\n{2,}|\Z) # followed by a blank line or end of document
443 | # Special case for standalone HTML comments:
445 [ ]{0,'.$less_than_tab.'}
446 (?s:
447 <!-- .*? -->
449 [ ]*
450 (?=\n{2,}|\Z) # followed by a blank line or end of document
452 | # PHP and ASP-style processor instructions (<? and <%)
454 [ ]{0,'.$less_than_tab.'}
455 (?s:
456 <([?%]) # $2
460 [ ]*
461 (?=\n{2,}|\Z) # followed by a blank line or end of document
464 )}Sxmi',
465 array($this, '_hashHTMLBlocks_callback'),
466 $text
469 return $text;
/**
 * Callback for hashHTMLBlocks(): replace a matched HTML block with its
 * block hash token, padded with blank lines on both sides.
 *
 * @param array $matches $1 = the HTML block
 * @return string
 */
protected function _hashHTMLBlocks_callback($matches) {
    return "\n\n" . $this->hashBlock($matches[1]) . "\n\n";
}
/**
 * Store $text in the hash table and return the unique token that stands in
 * for it in the text stream until unhash() swaps it back.
 *
 * $boundary is the character placed on both ends of the token. By
 * convention "B" marks block elements that must not be wrapped in
 * paragraph tags at the end, ":" marks elements acting as word
 * separators, and "X" is used in the general case.
 *
 * @param string $text
 * @param string $boundary
 * @return string The token that replaces $text.
 */
protected function hashPart($text, $boundary = 'X') {
    // Counter that makes every token unique; it persists between calls.
    static $i = 0;

    // Expand tokens already present in $text now, so unhashing does not
    // have to be repeated several times at the end.
    $expanded = $this->unhash($text);

    $i++;
    $token = $boundary . "\x1A" . $i . $boundary;
    $this->html_hashes[$token] = $expanded;

    return $token;
}
/**
 * Hash $text as a block-level element, i.e. hashPart() with the "B"
 * boundary character.
 *
 * @param string $text
 * @return string
 */
protected function hashBlock($text) {
    $block_boundary = 'B';
    return $this->hashPart($text, $block_boundary);
}
519 * Define the block gamut - these are all the transformations that form
520 * block-level tags like paragraphs, headers, and list items.
521 * @var array
523 protected $block_gamut = array(
524 "doHeaders" => 10,
525 "doHorizontalRules" => 20,
526 "doLists" => 40,
527 "doCodeBlocks" => 50,
528 "doBlockQuotes" => 60,
/**
 * Run the block gamut transformations on a fragment of text.
 *
 * Raw HTML has to be hashed again for each fragment, not only once at the
 * beginning of the document: hashed blocks can be part of list items and
 * may have been indented, in which case an earlier hashHTMLBlocks() pass
 * would have seen them as code blocks.
 *
 * @param string $text
 * @return string
 */
protected function runBlockGamut($text) {
    $hashed = $this->hashHTMLBlocks($text);

    return $this->runBasicBlockGamut($hashed);
}
/**
 * Run the block gamut transformations without hashing HTML blocks first.
 * Useful when HTML blocks are known to be hashed already, as in the first
 * whole-document pass.
 *
 * @param string $text
 * @return string
 */
protected function runBasicBlockGamut($text) {
    // Only the keys (method names) of the gamut table are needed here.
    foreach (array_keys($this->block_gamut) as $step) {
        $text = $this->$step($text);
    }

    // Last step: form paragraphs and restore hashed blocks.
    return $this->formParagraphs($text);
}
/**
 * Convert horizontal rules: a line made of three or more identical
 * markers (-, * or _), optionally separated by up to two spaces, becomes a
 * hashed <hr> element.
 *
 * @param string $text
 * @return string
 */
protected function doHorizontalRules($text) {
    $rule_re = '{
            ^[ ]{0,3}   # Leading space
            ([-*_])     # $1: First marker
            (?>         # Repeated marker group
                [ ]{0,2}    # Zero, one, or two spaces.
                \1          # Marker character
            ){2,}       # Group repeated at least twice
            [ ]*        # Tailing spaces
            $           # End of line.
        }mx';

    // A single hash token is created up front and reused for every rule
    // found in $text.
    $hr_token = $this->hashBlock("<hr$this->empty_element_suffix");

    return preg_replace($rule_re, "\n" . $hr_token . "\n", $text);
}
591 * These are all the transformations that occur *within* block-level
592 * tags like paragraphs, headers, and list items.
593 * @var array
595 protected $span_gamut = array(
596 // Process character escapes, code spans, and inline HTML
597 // in one shot.
598 "parseSpan" => -30,
599 // Process anchor and image tags. Images must come first,
600 // because ![foo][f] looks like an anchor.
601 "doImages" => 10,
602 "doAnchors" => 20,
603 // Make links out of things like `<https://example.com/>`
604 // Must come after doAnchors, because you can use < and >
605 // delimiters in inline links like [this](<url>).
606 "doAutoLinks" => 30,
607 "encodeAmpsAndAngles" => 40,
608 "doItalicsAndBold" => 50,
609 "doHardBreaks" => 60,
/**
 * Run every span gamut transformation on $text, in gamut order.
 *
 * @param string $text
 * @return string
 */
protected function runSpanGamut($text) {
    // Only the keys (method names) of the gamut table are needed here.
    foreach (array_keys($this->span_gamut) as $step) {
        $text = $this->$step($text);
    }

    return $text;
}
/**
 * Turn line endings into hard line breaks.
 *
 * With hard_wrap enabled every newline (plus any spaces before it) breaks
 * the line; otherwise only a newline preceded by two or more spaces does.
 *
 * @param string $text
 * @return string
 */
protected function doHardBreaks($text) {
    $break_re = $this->hard_wrap ? '/ *\n/' : '/ {2,}\n/';

    return preg_replace_callback(
        $break_re,
        array($this, '_doHardBreaks_callback'),
        $text
    );
}
/**
 * Callback for doHardBreaks(): produce the hashed <br> element, followed
 * by a newline, that replaces the matched line ending.
 *
 * @param array $matches Unused.
 * @return string
 */
protected function _doHardBreaks_callback($matches) {
    $br_tag = '<br' . $this->empty_element_suffix . "\n";

    return $this->hashPart($br_tag);
}
650 * Turn Markdown link shortcuts into XHTML <a> tags.
651 * @param string $text
652 * @return string
654 protected function doAnchors($text) {
655 if ($this->in_anchor) {
656 return $text;
658 $this->in_anchor = true;
660 // First, handle reference-style links: [link text] [id]
661 $text = preg_replace_callback('{
662 ( # wrap whole match in $1
664 ('.$this->nested_brackets_re.') # link text = $2
667 [ ]? # one optional space
668 (?:\n[ ]*)? # one optional newline followed by spaces
671 (.*?) # id = $3
674 }xs',
675 array($this, '_doAnchors_reference_callback'), $text);
677 // Next, inline-style links: [link text](url "optional title")
678 $text = preg_replace_callback('{
679 ( # wrap whole match in $1
681 ('.$this->nested_brackets_re.') # link text = $2
683 \( # literal paren
684 [ \n]*
686 <(.+?)> # href = $3
688 ('.$this->nested_url_parenthesis_re.') # href = $4
690 [ \n]*
691 ( # $5
692 ([\'"]) # quote char = $6
693 (.*?) # Title = $7
694 \6 # matching quote
695 [ \n]* # ignore any spaces/tabs between closing quote and )
696 )? # title is optional
699 }xs',
700 array($this, '_doAnchors_inline_callback'), $text);
702 // Last, handle reference-style shortcuts: [link text]
703 // These must come last in case you've also got [link text][1]
704 // or [link text](/foo)
705 $text = preg_replace_callback('{
706 ( # wrap whole match in $1
708 ([^\[\]]+) # link text = $2; can\'t contain [ or ]
711 }xs',
712 array($this, '_doAnchors_reference_callback'), $text);
714 $this->in_anchor = false;
715 return $text;
/**
 * Callback for reference-style links ([text][id], [text][] and [text]).
 *
 * @param array $matches $1 = whole match, $2 = link text,
 *                       $3 = reference id (absent or empty for shortcuts)
 * @return string The hashed <a> element, or the untouched match when the
 *                reference id has no stored URL.
 */
protected function _doAnchors_reference_callback($matches) {
    $label = $matches[2];

    // Shortcut forms like [this][] or [this] use the link text as the id.
    $id = isset($matches[3]) ? $matches[3] : '';
    if ($id == '') {
        $id = $label;
    }

    // Ids are looked up lower-cased, with embedded newlines turned into spaces.
    $id = preg_replace('{[ ]?\n}', ' ', strtolower($id));

    // Unknown reference: leave the source text exactly as it was.
    if (!isset($this->urls[$id])) {
        return $matches[1];
    }

    $href = $this->encodeURLAttribute($this->urls[$id]);
    $tag  = "<a href=\"$href\"";

    if (isset($this->titles[$id])) {
        $title_attr = $this->encodeAttribute($this->titles[$id]);
        $tag .= " title=\"$title_attr\"";
    }

    $label = $this->runSpanGamut($label);
    $tag  .= ">$label</a>";

    return $this->hashPart($tag);
}
/**
 * Callback for inline links: [text](url "optional title").
 *
 * @param array $matches $2 = link text, $3 = URL written as <url>,
 *                       $4 = bare URL, $7 = optional title
 * @return string The hashed <a> element.
 */
protected function _doAnchors_inline_callback($matches) {
    $label = $this->runSpanGamut($matches[2]);

    // Prefer the <bracketed> URL; fall back to the bare one when it is empty.
    $href = ($matches[3] == '') ? $matches[4] : $matches[3];

    // A URL of the form <s p a c e s> was caught by the HTML tag parser
    // and hashed; undo that and drop the angle brackets before using it.
    $restored = $this->unhash($href);
    if ($restored != $href) {
        $href = preg_replace('/^<(.*)>$/', '\1', $restored);
    }

    $href = $this->encodeURLAttribute($href);
    $tag  = "<a href=\"$href\"";

    // The title group is optional and may be missing from $matches.
    if (isset($matches[7])) {
        $title_attr = $this->encodeAttribute($matches[7]);
        $tag .= " title=\"$title_attr\"";
    }

    // NOTE(review): the link text goes through the span gamut a second
    // time here; kept as-is to preserve behaviour — confirm whether the
    // repeated pass is intentional.
    $label = $this->runSpanGamut($label);
    $tag  .= ">$label</a>";

    return $this->hashPart($tag);
}
790 * Turn Markdown image shortcuts into <img> tags.
791 * @param string $text
792 * @return string
794 protected function doImages($text) {
795 // First, handle reference-style labeled images: ![alt text][id]
796 $text = preg_replace_callback('{
797 ( # wrap whole match in $1
799 ('.$this->nested_brackets_re.') # alt text = $2
802 [ ]? # one optional space
803 (?:\n[ ]*)? # one optional newline followed by spaces
806 (.*?) # id = $3
810 }xs',
811 array($this, '_doImages_reference_callback'), $text);
813 // Next, handle inline images: ![alt text](url "optional title")
814 // Don't forget: encode * and _
815 $text = preg_replace_callback('{
816 ( # wrap whole match in $1
818 ('.$this->nested_brackets_re.') # alt text = $2
820 \s? # One optional whitespace character
821 \( # literal paren
822 [ \n]*
824 <(\S*)> # src url = $3
826 ('.$this->nested_url_parenthesis_re.') # src url = $4
828 [ \n]*
829 ( # $5
830 ([\'"]) # quote char = $6
831 (.*?) # title = $7
832 \6 # matching quote
833 [ \n]*
834 )? # title is optional
837 }xs',
838 array($this, '_doImages_inline_callback'), $text);
840 return $text;
/**
 * Callback for reference-style images: ![alt text][id] and ![alt text][].
 *
 * @param array $matches $1 = whole match, $2 = alt text, $3 = reference id
 * @return string The hashed <img> element, or the untouched match when the
 *                reference id has no stored URL.
 */
protected function _doImages_reference_callback($matches) {
    $alt = $matches[2];

    // Shortcut form ![this][] uses the alt text as the id.
    $id = strtolower($matches[3]);
    if ($id == '') {
        $id = strtolower($alt);
    }

    $alt = $this->encodeAttribute($alt);

    // No such link id: leave the source text intact.
    if (!isset($this->urls[$id])) {
        return $matches[1];
    }

    $src = $this->encodeURLAttribute($this->urls[$id]);
    $tag = "<img src=\"$src\" alt=\"$alt\"";

    if (isset($this->titles[$id])) {
        $title_attr = $this->encodeAttribute($this->titles[$id]);
        $tag .= " title=\"$title_attr\"";
    }

    $tag .= $this->empty_element_suffix;

    return $this->hashPart($tag);
}
/**
 * Callback for inline images: ![alt text](url "optional title").
 *
 * @param array $matches $2 = alt text, $3 = URL written as <url>,
 *                       $4 = bare URL, $7 = optional title
 * @return string The hashed <img> element.
 */
protected function _doImages_inline_callback($matches) {
    // Prefer the <bracketed> URL; fall back to the bare one when it is empty.
    $src = ($matches[3] == '') ? $matches[4] : $matches[3];

    $alt = $this->encodeAttribute($matches[2]);
    $src = $this->encodeURLAttribute($src);
    $tag = "<img src=\"$src\" alt=\"$alt\"";

    // The title group is optional and may be missing from $matches.
    if (isset($matches[7])) {
        $title_attr = $this->encodeAttribute($matches[7]);
        $tag .= " title=\"$title_attr\"";
    }

    $tag .= $this->empty_element_suffix;

    return $this->hashPart($tag);
}
900 * Parse Markdown heading elements to HTML
901 * @param string $text
902 * @return string
904 protected function doHeaders($text) {
906 * Setext-style headers:
907 * Header 1
908 * ========
910 * Header 2
911 * --------
913 $text = preg_replace_callback('{ ^(.+?)[ ]*\n(=+|-+)[ ]*\n+ }mx',
914 array($this, '_doHeaders_callback_setext'), $text);
917 * atx-style headers:
918 * # Header 1
919 * ## Header 2
920 * ## Header 2 with closing hashes ##
921 * ...
922 * ###### Header 6
924 $text = preg_replace_callback('{
925 ^(\#{1,6}) # $1 = string of #\'s
926 [ ]*
927 (.+?) # $2 = Header text
928 [ ]*
929 \#* # optional closing #\'s (not counted)
931 }xm',
932 array($this, '_doHeaders_callback_atx'), $text);
934 return $text;
/**
 * Callback for Setext-style headers (text underlined with = or -).
 *
 * @param array $matches $0 = whole match, $1 = header text,
 *                       $2 = underline (a run of "=" or of "-")
 * @return string The hashed <h1>/<h2> block surrounded by newlines, or the
 *                untouched match when it is really an empty list item.
 */
protected function _doHeaders_callback_setext($matches) {
    // Terrible hack to check we haven't found an empty list item: a
    // single "-" underline below a line that itself starts with "- "
    // (or is just "-") is left alone.
    if ($matches[2] == '-' && preg_match('{^-(?: |$)}', $matches[1])) {
        return $matches[0];
    }

    // "=" underline gives level 1, "-" gives level 2. The first character
    // is read with [] because the {} offset syntax is deprecated since
    // PHP 7.4 and is a parse error in PHP 8.
    $level = $matches[2][0] == '=' ? 1 : 2;

    // ID attribute generation
    $idAtt = $this->_generateIdFromHeaderValue($matches[1]);

    $block = "<h$level$idAtt>" . $this->runSpanGamut($matches[1]) . "</h$level>";
    return "\n" . $this->hashBlock($block) . "\n\n";
}
/**
 * Callback for ATX-style headers (# Header ... ###### Header).
 *
 * @param array $matches $1 = the run of leading #'s, $2 = header text
 * @return string The hashed heading block surrounded by newlines.
 */
protected function _doHeaders_callback_atx($matches) {
    $heading_text = $matches[2];

    // Optional id="" attribute (empty string when none is generated).
    $id_attr = $this->_generateIdFromHeaderValue($heading_text);

    // The heading level is the number of leading # characters.
    $depth = strlen($matches[1]);

    $inner = $this->runSpanGamut($heading_text);
    $token = $this->hashBlock("<h$depth$id_attr>" . $inner . "</h$depth>");

    return "\n" . $token . "\n\n";
}
/**
 * Build the id attribute for a header using the header_id_func callback.
 *
 * @param string $headerValue
 * @return string A string of the form ` id="foo"` (with a leading space),
 *                or an empty string when no callable is configured or the
 *                callable returns a falsy value.
 */
protected function _generateIdFromHeaderValue($headerValue) {
    if (is_callable($this->header_id_func)) {
        $generated = call_user_func($this->header_id_func, $headerValue);
        if ($generated) {
            return ' id="' . $this->encodeAttribute($generated) . '"';
        }
    }

    return "";
}
994 * Form HTML ordered (numbered) and unordered (bulleted) lists.
995 * @param string $text
996 * @return string
998 protected function doLists($text) {
999 $less_than_tab = $this->tab_width - 1;
1001 // Re-usable patterns to match list item bullets and number markers:
1002 $marker_ul_re = '[*+-]';
1003 $marker_ol_re = '\d+[\.]';
1005 $markers_relist = array(
1006 $marker_ul_re => $marker_ol_re,
1007 $marker_ol_re => $marker_ul_re,
1010 foreach ($markers_relist as $marker_re => $other_marker_re) {
1011 // Re-usable pattern to match any entirel ul or ol list:
1012 $whole_list_re = '
1013 ( # $1 = whole list
1014 ( # $2
1015 ([ ]{0,'.$less_than_tab.'}) # $3 = number of spaces
1016 ('.$marker_re.') # $4 = first list item marker
1017 [ ]+
1019 (?s:.+?)
1020 ( # $5
1023 \n{2,}
1024 (?=\S)
1025 (?! # Negative lookahead for another list item marker
1026 [ ]*
1027 '.$marker_re.'[ ]+
1030 (?= # Lookahead for another kind of list
1032 \3 # Must have the same indentation
1033 '.$other_marker_re.'[ ]+
1037 '; // mx
1039 // We use a different prefix before nested lists than top-level lists.
1040 //See extended comment in _ProcessListItems().
1042 if ($this->list_level) {
1043 $text = preg_replace_callback('{
1045 '.$whole_list_re.'
1046 }mx',
1047 array($this, '_doLists_callback'), $text);
1048 } else {
1049 $text = preg_replace_callback('{
1050 (?:(?<=\n)\n|\A\n?) # Must eat the newline
1051 '.$whole_list_re.'
1052 }mx',
1053 array($this, '_doLists_callback'), $text);
1057 return $text;
/**
 * Callback for doLists(): turn one whole matched list into a hashed
 * <ul> or <ol> block.
 *
 * @param array $matches $1 = whole list, $4 = first list item marker
 * @return string The hashed list block surrounded by newlines.
 */
protected function _doLists_callback($matches) {
    $first_marker = $matches[4];

    // A bullet character in the first marker means an unordered list.
    $is_bulleted = (bool) preg_match('/[*+-]/', $first_marker);
    $tag         = $is_bulleted ? 'ul' : 'ol';
    $marker_re   = $is_bulleted ? '[*+-]' : '\d+[\.]';

    $items = $this->processListItems($matches[1] . "\n", $marker_re);

    // With enhanced ordered lists enabled, an ordered list starts at the
    // number written in its first marker instead of always starting at 1.
    $start = 1;
    if ($this->enhanced_ordered_list && !$is_bulleted) {
        $digits = array();
        if (preg_match('/[0-9]+/', $first_marker, $digits)) {
            $start = $digits[0];
        }
    }

    // $start can only differ from 1 for ordered lists, so the start
    // attribute is never emitted on a <ul>.
    $opening = ($start > 1) ? "<$tag start=\"$start\">\n" : "<$tag>\n";
    $token   = $this->hashBlock($opening . $items . "</$tag>");

    return "\n" . $token . "\n\n";
}
1101 * Nesting tracker for list levels
1102 * @var integer
1104 protected $list_level = 0;
1107 * Process the contents of a single ordered or unordered list, splitting it
1108 * into individual list items.
1109 * @param string $list_str
1110 * @param string $marker_any_re
1111 * @return string
1113 protected function processListItems($list_str, $marker_any_re) {
1115 * The $this->list_level global keeps track of when we're inside a list.
1116 * Each time we enter a list, we increment it; when we leave a list,
1117 * we decrement. If it's zero, we're not in a list anymore.
1119 * We do this because when we're not inside a list, we want to treat
1120 * something like this:
1122 * I recommend upgrading to version
1123 * 8. Oops, now this line is treated
1124 * as a sub-list.
1126 * As a single paragraph, despite the fact that the second line starts
1127 * with a digit-period-space sequence.
1129 * Whereas when we're inside a list (or sub-list), that line will be
1130 * treated as the start of a sub-list. What a kludge, huh? This is
1131 * an aspect of Markdown's syntax that's hard to parse perfectly
1132 * without resorting to mind-reading. Perhaps the solution is to
1133 * change the syntax rules such that sub-lists must start with a
1134 * starting cardinal number; e.g. "1." or "a.".
1136 $this->list_level++;
1138 // Trim trailing blank lines:
1139 $list_str = preg_replace("/\n{2,}\\z/", "\n", $list_str);
1141 $list_str = preg_replace_callback('{
1142 (\n)? # leading line = $1
1143 (^[ ]*) # leading whitespace = $2
1144 ('.$marker_any_re.' # list marker and space = $3
1145 (?:[ ]+|(?=\n)) # space only required if item is not empty
1147 ((?s:.*?)) # list item text = $4
1148 (?:(\n+(?=\n))|\n) # tailing blank line = $5
1149 (?= \n* (\z | \2 ('.$marker_any_re.') (?:[ ]+|(?=\n))))
1150 }xm',
1151 array($this, '_processListItems_callback'), $list_str);
1153 $this->list_level--;
1154 return $list_str;
/**
 * Callback for processListItems(): convert one list item into <li>.
 *
 * @param array $matches $1 = leading blank line, $2 = leading whitespace,
 *                       $3 = list marker and space, $4 = item text,
 *                       $5 = trailing blank line
 * @return string
 */
protected function _processListItems_callback($matches) {
    $body = $matches[4];

    // The item gets full block processing when it is set off by a blank
    // line before or after it, or contains a blank line itself. Groups 1
    // and 5 are optional, hence the empty() checks.
    $needs_blocks = !empty($matches[1])
        || !empty($matches[5])
        || preg_match('/\n{2,}/', $body);

    if (!$needs_blocks) {
        // Recursion for sub-lists:
        $inner = $this->doLists($this->outdent($body));
        return "<li>" . $this->formParagraphs($inner, false) . "</li>\n";
    }

    // Replace the marker with the appropriate whitespace indentation so
    // that every line of the item outdents uniformly.
    $padding = str_repeat(' ', strlen($matches[3]));
    $padded  = $matches[2] . $padding . $body;
    $inner   = $this->runBlockGamut($this->outdent($padded) . "\n");

    return "<li>" . $inner . "</li>\n";
}
1185 * Process Markdown `<pre><code>` blocks.
1186 * @param string $text
1187 * @return string
1189 protected function doCodeBlocks($text) {
1190 $text = preg_replace_callback('{
1191 (?:\n\n|\A\n?)
1192 ( # $1 = the code block -- one or more lines, starting with a space/tab
1194 [ ]{'.$this->tab_width.'} # Lines must start with a tab or a tab-width of spaces
1195 .*\n+
1198 ((?=^[ ]{0,'.$this->tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1199 }xm',
1200 array($this, '_doCodeBlocks_callback'), $text);
1202 return $text;
/**
 * Callback for doCodeBlocks(): convert one indented code block into a
 * hashed <pre><code> element.
 *
 * @param array $matches $1 = the code block
 * @return string The hashed block surrounded by blank lines.
 */
protected function _doCodeBlocks_callback($matches) {
    $code = $this->outdent($matches[1]);

    // A user-supplied converter takes precedence over plain HTML escaping.
    // It receives an empty string as its second argument.
    $code = $this->code_block_content_func
        ? call_user_func($this->code_block_content_func, $code, "")
        : htmlspecialchars($code, ENT_NOQUOTES);

    // Trim leading and trailing newlines.
    $code = preg_replace('/\A\n+|\n+\z/', '', $code);

    $token = $this->hashBlock("<pre><code>$code\n</code></pre>");

    return "\n\n" . $token . "\n\n";
}
/**
 * Create the hashed <code> markup for an inline code span.
 *
 * When code_span_content_func is set it converts $code as given;
 * otherwise $code is trimmed and HTML-escaped.
 *
 * @param string $code
 * @return string
 */
protected function makeCodeSpan($code) {
    $content = $this->code_span_content_func
        ? call_user_func($this->code_span_content_func, $code)
        : htmlspecialchars(trim($code), ENT_NOQUOTES);

    return $this->hashPart("<code>$content</code>");
}
1242 * Define the emphasis operators with their regex matches
1243 * @var array
1245 protected $em_relist = array(
1246 '' => '(?:(?<!\*)\*(?!\*)|(?<!_)_(?!_))(?![\.,:;]?\s)',
1247 '*' => '(?<![\s*])\*(?!\*)',
1248 '_' => '(?<![\s_])_(?!_)',
/**
 * Define the strong operators with their regex matches.
 *
 * Keys are the strong marker currently open ('' when none is open);
 * values are the expression matching the next strong token in that state.
 *
 * @var array
 */
protected $strong_relist = array(
    ''   => '(?:(?<!\*)\*\*(?!\*)|(?<!_)__(?!_))(?![\.,:;]?\s)',
    '**' => '(?<![\s*])\*\*(?!\*)',
    '__' => '(?<![\s_])__(?!_)',
);
/**
 * Define the emphasis + strong operators with their regex matches.
 *
 * Keys are the concatenation of the open emphasis and strong markers
 * ('' when neither is open); values match the three-character token.
 *
 * @var array
 */
protected $em_strong_relist = array(
    ''    => '(?:(?<!\*)\*\*\*(?!\*)|(?<!_)___(?!_))(?![\.,:;]?\s)',
    '***' => '(?<![\s*])\*\*\*(?!\*)',
    '___' => '(?<![\s_])___(?!_)',
);
/**
 * Container for prepared regular expressions.
 *
 * Filled by prepareItalicsAndBold(); keyed by the concatenated open
 * emphasis and strong markers.
 *
 * @var array
 */
protected $em_strong_prepared_relist;
/**
 * Prepare regular expressions for searching emphasis tokens in any
 * context.
 *
 * For every combination of open emphasis marker and open strong marker,
 * builds one alternation and stores it in `$em_strong_prepared_relist`.
 *
 * @return void
 */
protected function prepareItalicsAndBold() {
    foreach ($this->em_relist as $em => $em_re) {
        foreach ($this->strong_relist as $strong => $strong_re) {
            $state = "$em$strong";

            // Collect the token expressions allowed in this state; the
            // combined em+strong expression goes first when one exists.
            $alternatives = array($em_re, $strong_re);
            if (isset($this->em_strong_relist[$state])) {
                array_unshift($alternatives, $this->em_strong_relist[$state]);
            }

            // Build the master expression for this state.
            $this->em_strong_prepared_relist[$state] =
                '{(' . implode('|', $alternatives) . ')}';
        }
    }
}
/**
 * Convert Markdown italics (emphasis) and bold (strong) to HTML.
 *
 * Scans $text for emphasis tokens using the expression prepared for the
 * current state (which markers are open), keeping a stack of open tokens
 * and a parallel stack of the text collected inside each of them. Tokens
 * left unclosed at the end of the text are put back as literal text.
 *
 * @param string $text
 * @return string
 */
protected function doItalicsAndBold($text) {
    $token_stack = array('');
    $text_stack = array('');
    $em = '';
    $strong = '';
    $tree_char_em = false;

    while (1) {
        // Get prepared regular expression for searching emphasis tokens
        // in current context.
        $token_re = $this->em_strong_prepared_relist["$em$strong"];

        // Each loop iteration searches for the next emphasis token.
        $parts = preg_split($token_re, $text, 2, PREG_SPLIT_DELIM_CAPTURE);
        $text_stack[0] .= $parts[0];
        $token =& $parts[1];
        $text =& $parts[2];

        if (empty($token)) {
            // Reached end of text span: empty stack without emitting
            // any more emphasis.
            while ($token_stack[0]) {
                $text_stack[1] .= array_shift($token_stack);
                $text_stack[0] .= array_shift($text_stack);
            }
            break;
        }

        $token_len = strlen($token);
        if ($tree_char_em) {
            // Reached closing marker while inside a three-char emphasis.
            if ($token_len == 3) {
                // Three-char closing marker, close em and strong.
                array_shift($token_stack);
                $span = array_shift($text_stack);
                $span = $this->runSpanGamut($span);
                $span = "<strong><em>$span</em></strong>";
                $text_stack[0] .= $this->hashPart($span);
                $em = '';
                $strong = '';
            } else {
                // Other closing marker: close one em or strong and
                // change current token state to match the other.
                // Square-bracket offset: the `{0}` form is removed in PHP 8.
                $token_stack[0] = str_repeat($token[0], 3 - $token_len);
                $tag = $token_len == 2 ? "strong" : "em";
                $span = $text_stack[0];
                $span = $this->runSpanGamut($span);
                $span = "<$tag>$span</$tag>";
                $text_stack[0] = $this->hashPart($span);
                $$tag = ''; // $$tag stands for $em or $strong
            }
            $tree_char_em = false;
        } else if ($token_len == 3) {
            if ($em) {
                // Reached closing marker for both em and strong.
                // Close the two innermost spans, whichever order they are in:
                for ($i = 0; $i < 2; ++$i) {
                    $shifted_token = array_shift($token_stack);
                    $tag = strlen($shifted_token) == 2 ? "strong" : "em";
                    $span = array_shift($text_stack);
                    $span = $this->runSpanGamut($span);
                    $span = "<$tag>$span</$tag>";
                    $text_stack[0] .= $this->hashPart($span);
                    $$tag = ''; // $$tag stands for $em or $strong
                }
            } else {
                // Reached opening three-char emphasis marker. Push on token
                // stack; will be handled by the special condition above.
                $em = $token[0];
                $strong = "$em$em";
                array_unshift($token_stack, $token);
                array_unshift($text_stack, '');
                $tree_char_em = true;
            }
        } else if ($token_len == 2) {
            if ($strong) {
                // Unwind any dangling emphasis marker:
                if (strlen($token_stack[0]) == 1) {
                    $text_stack[1] .= array_shift($token_stack);
                    $text_stack[0] .= array_shift($text_stack);
                }
                // Closing strong marker:
                array_shift($token_stack);
                $span = array_shift($text_stack);
                $span = $this->runSpanGamut($span);
                $span = "<strong>$span</strong>";
                $text_stack[0] .= $this->hashPart($span);
                $strong = '';
            } else {
                // Opening strong marker.
                array_unshift($token_stack, $token);
                array_unshift($text_stack, '');
                $strong = $token;
            }
        } else {
            // Here $token_len == 1
            if ($em) {
                if (strlen($token_stack[0]) == 1) {
                    // Closing emphasis marker:
                    array_shift($token_stack);
                    $span = array_shift($text_stack);
                    $span = $this->runSpanGamut($span);
                    $span = "<em>$span</em>";
                    $text_stack[0] .= $this->hashPart($span);
                    $em = '';
                } else {
                    // Innermost open token is not an emphasis marker:
                    // keep this one as literal text.
                    $text_stack[0] .= $token;
                }
            } else {
                // Opening emphasis marker.
                array_unshift($token_stack, $token);
                array_unshift($text_stack, '');
                $em = $token;
            }
        }
    }
    return $text_stack[0];
}
/**
 * Parse Markdown blockquotes to HTML.
 *
 * Matches each group of consecutive quoted paragraphs and passes it to
 * `_doBlockQuotes_callback()`.
 *
 * @param string $text
 * @return string
 */
protected function doBlockQuotes($text) {
    $blockquote_re = '/
          (                     # Wrap whole match in $1
            (?>
              ^[ ]*>[ ]?        # ">" at the start of a line
                .+\n            # rest of the first line
              (.+\n)*           # subsequent consecutive lines
              \n*               # blanks
            )+
          )
        /xm';

    return preg_replace_callback(
        $blockquote_re,
        array($this, '_doBlockQuotes_callback'),
        $text
    );
}
/**
 * Blockquote parsing callback.
 *
 * Strips one level of quoting, runs the block gamut on the content
 * (which handles nested quotes), then indents and wraps the result in a
 * `<blockquote>` element.
 *
 * @param array $matches Match array; index 1 holds the quoted text
 * @return string
 */
protected function _doBlockQuotes_callback($matches) {
    // Trim one level of quoting and blank out whitespace-only lines.
    $quoted = preg_replace('/^[ ]*>[ ]?|^[ ]+$/m', '', $matches[1]);

    // Recurse: the quoted content is itself block-level Markdown.
    $inner = $this->runBlockGamut($quoted);

    // Indent every line of the rendered content.
    $inner = preg_replace('/^/m', " ", $inner);

    // The added indentation would corrupt <pre> content, so take it
    // back out of every <pre> element.
    $inner = preg_replace_callback(
        '{(\s*<pre>.+?</pre>)}sx',
        array($this, '_doBlockQuotes_callback2'),
        $inner
    );

    $html = "<blockquote>\n" . $inner . "\n</blockquote>";

    return "\n" . $this->hashBlock($html) . "\n\n";
}
/**
 * Blockquote parsing callback for `<pre>` elements.
 *
 * Removes the line-leading indentation that `_doBlockQuotes_callback()`
 * added inside the matched `<pre>` element.
 *
 * @param array $matches Match array; index 1 holds the `<pre>` element
 * @return string
 */
protected function _doBlockQuotes_callback2($matches) {
    return preg_replace('/^ /m', '', $matches[1]);
}
/**
 * Parse paragraphs.
 *
 * Splits the text on blank lines. A chunk consisting solely of a block
 * hash is replaced by its stored HTML; any other chunk is run through the
 * span gamut, optionally wrapped in `<p>`, and unhashed.
 *
 * @param string $text String to process in paragraphs
 * @param boolean $wrap_in_p Whether paragraphs should be wrapped in <p> tags
 * @return string
 */
protected function formParagraphs($text, $wrap_in_p = true) {
    // Strip leading and trailing lines:
    $text = preg_replace('/\A\n+|\n+\z/', '', $text);

    $grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);

    foreach ($grafs as $key => $value) {
        if (preg_match('/^B\x1A[0-9]+B$/', $value)) {
            // Is a hashed block: swap the stored HTML back in as-is.
            $grafs[$key] = $this->html_hashes[$value];
            continue;
        }

        // Is a paragraph.
        $value = $this->runSpanGamut($value);
        if ($wrap_in_p) {
            // The opening tag replaces any leading spaces.
            $value = preg_replace('/^([ ]*)/', "<p>", $value) . "</p>";
        }
        $grafs[$key] = $this->unhash($value);
    }

    return implode("\n\n", $grafs);
}
/**
 * Encode text for a double-quoted HTML attribute. This function
 * is *not* suitable for attributes enclosed in single quotes.
 *
 * @param string $text
 * @return string
 */
protected function encodeAttribute($text) {
    return str_replace('"', '&quot;', $this->encodeAmpsAndAngles($text));
}
/**
 * Encode text for a double-quoted HTML attribute containing a URL,
 * applying the URL filter if set. Also generates the textual
 * representation for the URL (removing mailto: or tel:) storing it in $text.
 * This function is *not* suitable for attributes enclosed in single quotes.
 *
 * @param string $url
 * @param string &$text Passed by reference
 * @return string URL
 */
protected function encodeURLAttribute($url, &$text = null) {
    if ($this->url_filter_func) {
        $url = call_user_func($this->url_filter_func, $url);
    }

    // Email links are entity-obfuscated; the display text drops the
    // 7-character "mailto:" head.
    if (preg_match('{^mailto:}i', $url)) {
        return $this->encodeEntityObfuscatedAttribute($url, $text, 7);
    }

    $is_tel = preg_match('{^tel:}i', $url);
    $encoded = $this->encodeAttribute($url);

    // Phone links display without the 4-character "tel:" head; every
    // other URL is displayed in full.
    $text = $is_tel ? substr($encoded, 4) : $encoded;

    return $encoded;
}
/**
 * Smart processing for ampersands and angle brackets that need to
 * be encoded. Valid character entities are left alone unless the
 * no-entities mode is set.
 *
 * @param string $text
 * @return string
 */
protected function encodeAmpsAndAngles($text) {
    // Ampersand-encoding based entirely on Nat Irons's Amputator
    // MT plugin: <http://bumppo.net/projects/amputator/>
    $encoded = $this->no_entities
        ? str_replace('&', '&amp;', $text)
        : preg_replace('/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/', '&amp;', $text);

    // Encode remaining <'s
    return str_replace('<', '&lt;', $encoded);
}
/**
 * Parse Markdown automatic links to anchor HTML tags.
 *
 * Handles `<scheme:...>` URLs first, then `<address@domain>` email
 * addresses (with an optional `mailto:` prefix).
 *
 * @param string $text
 * @return string
 */
protected function doAutoLinks($text) {
    $url_re = '{<((https?|ftp|dict|tel):[^\'">\s]+)>}i';

    // Email addresses: <address@domain.foo>
    $email_re = '{
        <
        (?:mailto:)?
        (
            (?:
                [-!#$%&\'*+/=?^_`.{|}~\w\x80-\xFF]+
            |
                ".*?"
            )
            \@
            (?:
                [-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
            |
                \[[\d.a-fA-F:]+\]   # IPv4 & IPv6
            )
        )
        >
        }xi';

    $text = preg_replace_callback(
        $url_re,
        array($this, '_doAutoLinks_url_callback'),
        $text
    );

    return preg_replace_callback(
        $email_re,
        array($this, '_doAutoLinks_email_callback'),
        $text
    );
}
/**
 * Parse URL callback.
 *
 * @param array $matches Match array; index 1 holds the URL
 * @return string
 */
protected function _doAutoLinks_url_callback($matches) {
    // $label is filled in by reference with the link's display text.
    $label = null;
    $href = $this->encodeURLAttribute($matches[1], $label);

    return $this->hashPart('<a href="' . $href . '">' . $label . '</a>');
}
/**
 * Parse email address callback.
 *
 * @param array $matches Match array; index 1 holds the address
 * @return string
 */
protected function _doAutoLinks_email_callback($matches) {
    // $label is filled in by reference with the link's display text.
    $label = null;
    $href = $this->encodeURLAttribute("mailto:" . $matches[1], $label);

    return $this->hashPart('<a href="' . $href . '">' . $label . '</a>');
}
/**
 * Input: some text to obfuscate, e.g. "mailto:foo@example.com"
 *
 * Output: the same text but with most characters encoded as either a
 *         decimal or hex entity, in the hopes of foiling most address
 *         harvesting spam bots. E.g.:
 *
 *        &#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111;
 *        &#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;&#101;&#46;&#x63;&#111;
 *        &#x6d;
 *
 * Note: the additional output $tail is assigned the same value as the
 * output, minus the number of characters specified by $head_length.
 *
 * Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
 * With some optimizations by Milian Wolff. Forced encoding of HTML
 * attribute special characters by Allan Odgaard.
 *
 * @param string $text
 * @param string &$tail
 * @param integer $head_length
 * @return string
 */
protected function encodeEntityObfuscatedAttribute($text, &$tail = null, $head_length = 0) {
    if ($text == "") {
        $tail = "";
        return "";
    }

    // Split into single characters (bytes, as no /u modifier is used).
    $chars = preg_split('/(?<!^)(?!$)/', $text);

    // Deterministic seed, so the same input always encodes the same way.
    $seed = (int)abs(crc32($text) / strlen($text));

    foreach ($chars as $index => $char) {
        $code = ord($char);

        // Ignore non-ascii chars.
        if ($code >= 128) {
            continue;
        }

        // Pseudo-random roll: roughly 10% raw, 45% hex, 45% dec.
        $roll = ($seed * (1 + $index)) % 100;

        // '@' *must* be encoded; '"', '&' and '>' have to be encoded
        // inside the attribute. Anything else may stay raw on a high roll.
        if ($roll > 90 && strpos('@"&>', $char) === false) {
            continue;
        }

        $chars[$index] = $roll < 45
            ? '&#x' . dechex($code) . ';'
            : '&#' . $code . ';';
    }

    $encoded = implode('', $chars);
    $tail = $head_length
        ? implode('', array_slice($chars, $head_length))
        : $encoded;

    return $encoded;
}
/**
 * Take the string $str and parse it into tokens, hashing embedded HTML,
 * escaped characters and handling code spans.
 *
 * @param string $str
 * @return string
 */
protected function parseSpan($str) {
    // Inline markup alternatives, left out entirely in no-markup mode.
    $markup_re = $this->no_markup ? '' : '
            |
                <!--    .*?     -->     # comment
            |
                <\?.*?\?> | <%.*?%>     # processing instruction
            |
                <[!$]?[-a-zA-Z0-9:_]+   # regular tags
                (?>
                    \s
                    (?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
                )?
                >
            |
                <[-a-zA-Z0-9:_]+\s*/> # xml-style empty tag
            |
                </[-a-zA-Z0-9:_]+\s*> # closing tag
        ';

    $span_re = '{
            (
                \\\\' . $this->escape_chars_re . '
            |
                (?<![`\\\\])
                `+                      # code span marker
        ' . $markup_re . '
            )
            }xs';

    $output = '';

    // Each iteration searches for either the next tag, the next opening
    // code span marker, or the next escaped character. Each token is
    // then passed to handleSpanToken.
    do {
        $parts = preg_split($span_re, $str, 2, PREG_SPLIT_DELIM_CAPTURE);

        // Text preceding the token is copied through untouched.
        if ($parts[0] != "") {
            $output .= $parts[0];
        }

        $found_token = isset($parts[1]);
        if ($found_token) {
            // handleSpanToken may consume part of the remaining text
            // through its by-reference second argument.
            $output .= $this->handleSpanToken($parts[1], $parts[2]);
            $str = $parts[2];
        }
    } while ($found_token);

    return $output;
}
/**
 * Handle $token provided by parseSpan by determining its nature and
 * returning the corresponding value that should replace it.
 *
 * For a code span marker, the text up to the matching end marker is
 * consumed from $str (which is why it is passed by reference).
 *
 * @param string $token
 * @param string &$str Text following the token; may be shortened
 * @return string
 */
protected function handleSpanToken($token, &$str) {
    // Square-bracket string offsets: the `$token{0}` form is deprecated
    // in PHP 7.4 and a parse error from PHP 8.0.
    switch ($token[0]) {
        case "\\":
            // Backslash escape: emit the escaped character as an entity.
            return $this->hashPart("&#" . ord($token[1]) . ";");
        case "`":
            // Search for end marker in remaining text.
            if (preg_match('/^(.*?[^`])' . preg_quote($token) . '(?!`)(.*)$/sm',
                $str, $matches))
            {
                $str = $matches[2];
                $codespan = $this->makeCodeSpan($matches[1]);
                return $this->hashPart($codespan);
            }
            return $token; // Return as text since no ending marker found.
        default:
            return $this->hashPart($token);
    }
}
/**
 * Remove one level of line-leading tabs or spaces.
 *
 * Each line loses either one tab or up to `$tab_width` leading spaces.
 *
 * @param string $text
 * @return string
 */
protected function outdent($text) {
    $indent_re = '/^(\t|[ ]{1,' . $this->tab_width . '})/m';

    return preg_replace($indent_re, '', $text);
}
/**
 * String length function for detab. `_initDetab` will create a function to
 * handle UTF-8 if the default function does not exist.
 *
 * Holds either a function name or the fallback closure.
 *
 * @var string
 */
protected $utf8_strlen = 'mb_strlen';
/**
 * Replace tabs with the appropriate amount of spaces.
 *
 * For each line we separate the line in blocks delimited by tab characters.
 * Then we reconstruct every line by adding the appropriate number of spaces
 * between each block.
 *
 * @param string $text
 * @return string
 */
protected function detab($text) {
    // Only lines that actually contain a tab are passed to the callback.
    return preg_replace_callback(
        '/^.*\t.*$/m',
        array($this, '_detab_callback'),
        $text
    );
}
/**
 * Replace tabs callback.
 *
 * Rebuilds one line, padding each tab out to the next multiple of
 * `$tab_width` columns.
 *
 * @param array $matches Match array; index 0 holds the whole line
 * @return string
 */
protected function _detab_callback($matches) {
    $count_chars = $this->utf8_strlen; // strlen function for UTF-8.

    // Split in blocks; the first one starts the rebuilt line.
    $segments = explode("\t", $matches[0]);
    $rebuilt = array_shift($segments);

    foreach ($segments as $segment) {
        // Number of spaces needed to reach the next tab stop.
        $padding = $this->tab_width
            - ($count_chars($rebuilt, 'UTF-8') % $this->tab_width);
        $rebuilt .= str_repeat(" ", $padding) . $segment;
    }

    return $rebuilt;
}
/**
 * Check for the availability of the function in the `utf8_strlen` property
 * (initially `mb_strlen`). If the function is not available, create a
 * function that will loosely count the number of UTF-8 characters with a
 * regular expression.
 *
 * @return void
 */
protected function _initDetab() {
    if (!function_exists($this->utf8_strlen)) {
        // Fallback counter: each match is one single byte below 0xC0 or
        // one lead byte plus its continuation bytes; preg_match_all
        // returns the number of matches.
        $this->utf8_strlen = function($text) {
            return preg_match_all('/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/', $text, $m);
        };
    }
}
/**
 * Swap back in all the tags hashed by _HashHTMLBlocks.
 *
 * A hash is any single character, `\x1A`, a run of digits, then the same
 * character again.
 *
 * @param string $text
 * @return string
 */
protected function unhash($text) {
    $hash_re = '/(.)\x1A[0-9]+\1/';

    return preg_replace_callback($hash_re, array($this, '_unhash_callback'), $text);
}
/**
 * Unhashing callback.
 *
 * @param array $matches Match array; index 0 holds the full hash key
 * @return string The content stored in `$html_hashes` under that key
 */
protected function _unhash_callback($matches) {
    $hash = $matches[0];

    return $this->html_hashes[$hash];
}