Markdown 1.0.3
[markdown.git] / Markdown.pl
blob42a320b33c3c30be3a82c4028855589a77d6f163
1 #!/usr/bin/perl
4 # Markdown -- A text-to-HTML conversion tool for web writers
6 # Copyright (C) 2004 John Gruber
7 # Copyright (C) 2015 Kyle J. McKay
11 package Markdown;
12 require 5.006_000;
13 use strict;
14 use warnings;
16 use Digest::MD5 qw(md5_hex);
17 use vars qw($VERSION);
18 $VERSION = '1.0.3';
19 # Sun 06 Sep 2015
21 ## Disabled; causes problems under Perl 5.6.1:
22 # use utf8;
23 # binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
#
# Global default settings:
#
my $g_empty_element_suffix = " />";    # Change to ">" for HTML output
my $g_url_prefix           = "";       # Prefixed to non-absolute URLs
my $g_img_prefix           = "";       # Prefixed to non-absolute image URLs
my $g_tab_width            = 4;

#
# Globals:
#

# Pattern matching text whose [brackets] are balanced to any depth. The
# pattern refers to itself through (??{ ... }), which is why the variable
# is declared before the pattern is compiled. See Friedl's "Mastering
# Regular Expressions", 2nd Ed., pp. 328-331.
my $g_nested_brackets;
$g_nested_brackets = qr{
    (?>                                 # Atomic matching
        [^\[\]]+                        # Anything other than brackets
      |
        \[
            (??{ $g_nested_brackets })  # Recursive set of nested brackets
        \]
    )*
}x;

# Maps every escapable character to its MD5 digest. The digest stands in
# for the character while the text is processed and is swapped back by
# _UnescapeSpecialChars().
my %g_escape_table =
    map { $_ => md5_hex($_) } split //, '\\`*_{}[]()>#+-.!~';

# Global hashes, used by various utility routines
my %g_urls;
my %g_titles;
my %g_html_blocks;

# Used to track when we're inside an ordered or unordered list
# (see _ProcessListItems() for details):
my $g_list_level = 0;
70 #### Blosxom plug-in interface ##########################################
72 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
73 # which posts Markdown should process, using a "meta-markup: markdown"
74 # header. If it's set to 0 (the default), Markdown will process all
75 # entries.
76 my $g_blosxom_use_meta = 0;
# Blosxom plug-in interface: always answers true.
sub start {
    return 1;
}
# Blosxom plug-in interface: converts the story body in place.
#
# Only $body_ref is used. The body is run through Markdown() unless
# $g_blosxom_use_meta is set and $meta::markup is not "markdown"
# (case-insensitive, surrounding whitespace ignored). Always returns 1.
sub story {
    my ($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;

    my $wants_markdown = !$g_blosxom_use_meta
        || ( defined($meta::markup)
             && $meta::markup =~ /^\s*markdown\s*$/i );

    $$body_ref = Markdown($$body_ref) if $wants_markdown;

    return 1;
}
#### Movable Type plug-in interface #####################################
eval { require MT };    # Test to see if we're running in MT.
unless ($@) {
    require MT;
    MT->import;
    require MT::Template::Context;
    MT::Template::Context->import;

    eval { require MT::Plugin };    # Test to see if we're running >= MT 3.0.
    unless ($@) {
        require MT::Plugin;
        MT::Plugin->import;
        # Arrow call instead of indirect-object "new MT::Plugin(...)".
        my $plugin = MT::Plugin->new({
            name        => "Markdown",
            description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
            doc_link    => 'http://daringfireball.net/projects/markdown/'
        });
        MT->add_plugin( $plugin );
    }

    # <MarkdownOptions output="..."> container tag: stashes the requested
    # output mode (lower-cased) for the text filters below, then builds
    # the enclosed tokens.
    MT::Template::Context->add_container_tag(MarkdownOptions => sub {
        my $ctx     = shift;
        my $args    = shift;
        my $builder = $ctx->stash('builder');
        my $tokens  = $ctx->stash('tokens');

        if (defined ($args->{'output'}) ) {
            $ctx->stash('markdown_output', lc $args->{'output'});
        }

        defined (my $str = $builder->build($ctx, $tokens) )
            or return $ctx->error($builder->errstr);
        $str;       # return value
    });

    MT->add_text_filter('markdown' => {
        label     => 'Markdown',
        docs      => 'http://daringfireball.net/projects/markdown/',
        on_format => sub {
            my $text = shift;
            my $ctx  = shift;
            my $raw  = 0;
            if (defined $ctx) {
                my $output = $ctx->stash('markdown_output');
                if (defined $output  &&  $output =~ m/^html/i) {
                    $g_empty_element_suffix = ">";
                    $ctx->stash('markdown_output', '');
                }
                elsif (defined $output  &&  $output eq 'raw') {
                    $raw = 1;
                    $ctx->stash('markdown_output', '');
                }
                else {
                    $raw = 0;
                    $g_empty_element_suffix = " />";
                }
            }
            $text = $raw ? $text : Markdown($text);
            $text;
        },
    });

    # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
    my $smartypants;

    {
        no warnings "once";
        $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
    }

    if ($smartypants) {
        MT->add_text_filter('markdown_with_smartypants' => {
            label     => 'Markdown With SmartyPants',
            docs      => 'http://daringfireball.net/projects/markdown/',
            on_format => sub {
                my $text = shift;
                my $ctx  = shift;
                if (defined $ctx) {
                    my $output = $ctx->stash('markdown_output');
                    if (defined $output  &&  $output eq 'html') {
                        $g_empty_element_suffix = ">";
                    }
                    else {
                        $g_empty_element_suffix = " />";
                    }
                }
                $text = Markdown($text);
                $text = $smartypants->($text, '1');
            },
        });
    }
}
else {
#### BBEdit/command-line text filter interface ##########################
# Needs to be hidden from MT (and Blosxom when running in static mode).

    # We're only using $blosxom::version once; tell Perl not to warn us:
    no warnings 'once';
    unless ( defined($blosxom::version) ) {
        use warnings;

        #### Check for command-line switches: #################
        my %cli_opts;
        use Getopt::Long;
        Getopt::Long::Configure('pass_through');
        GetOptions(\%cli_opts,
            'help|h',
            'version|V|v',
            'shortversion|short-version|s',
            'html4tags',
            'htmlroot|r=s',
            'imageroot|i=s',
        );
        if ($cli_opts{'help'}) {
            # exec only returns on failure; without the die the script would
            # carry on and wait for input on STDIN instead of showing help.
            exec('perldoc', $0)
                or die "Unable to run perldoc: $!\n";
        }
        if ($cli_opts{'version'}) {         # Version info
            print "\nThis is Markdown, version $VERSION.\n";
            print "Copyright (C) 2004 John Gruber\n";
            print "Copyright (C) 2015 Kyle J. McKay\n";
            exit 0;
        }
        if ($cli_opts{'shortversion'}) {    # Just the version number string.
            print $VERSION;
            exit 0;
        }
        if ($cli_opts{'html4tags'}) {       # Use HTML tag style instead of XHTML
            $g_empty_element_suffix = ">";
        }
        if ($cli_opts{'htmlroot'}) {        # Use URL prefix
            $g_url_prefix = $cli_opts{'htmlroot'};
        }
        if ($cli_opts{'imageroot'}) {       # Use image URL prefix
            $g_img_prefix = $cli_opts{'imageroot'};
        }


        #### Process incoming text: ###########################
        my $text;
        {
            local $/;               # Slurp the whole file
            $text = <>;
        }
        print Markdown($text);
    }
}
sub Markdown {
#
# Main function: converts Markdown-formatted text to HTML.
#
# Parameter: the source text (undef is treated as the empty string).
# Returns:   the converted text, followed by a newline.
#
# The order in which other subs are called here is essential. Link and
# image substitutions need to happen before _EscapeSpecialChars(), so
# that any *'s or _'s in the <a> and <img> tags get encoded.
#
    my $text = shift;
    $text = '' unless defined $text;

    # Clear the global hashes. If we don't clear these, you get conflicts
    # from other articles when generating a page which contains more than
    # one article (e.g. an index page that shows the N most recent
    # articles):
    %g_urls        = ();
    %g_titles      = ();
    %g_html_blocks = ();

    # Every document starts outside any list, even if an earlier
    # conversion died part-way through _ProcessListItems().
    $g_list_level = 0;

    # Standardize line endings (DOS and Mac to Unix):
    $text =~ s{\r\n?}{\n}g;

    # Make sure $text ends with a couple of newlines:
    $text .= "\n\n";

    # Convert all tabs to spaces.
    $text = _Detab($text);

    # Strip any lines consisting only of spaces and tabs.
    # This makes subsequent regexen easier to write, because we can
    # match consecutive blank lines with /\n+/ instead of something
    # contorted like /[ \t]*\n+/ .
    $text =~ s/^[ \t]+$//mg;

    # Turn block-level HTML blocks into hash entries
    $text = _HashHTMLBlocks($text);

    # Strip link definitions, store in hashes.
    $text = _StripLinkDefinitions($text);

    $text = _RunBlockGamut($text);

    $text = _UnescapeSpecialChars($text);

    return $text . "\n";
}
sub _StripLinkDefinitions {
#
# Strips link definitions from text, storing the URLs in %g_urls and the
# titles in %g_titles, both keyed by the lower-cased link id.
#
# Parameter: the document text.
# Returns:   the text with every link definition removed.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Link defs are in the form: ^[id]: url "optional title"
    while ($text =~ s{
            ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
              [ \t]*
              \n?               # maybe *one* newline
              [ \t]*
            <?(\S+?)>?          # url = $2
              [ \t]*
              \n?               # maybe one newline
              [ \t]*
            (?:
                (?<=\s)         # lookbehind for whitespace
                ["(]
                (.+?)           # title = $3
                [")]
                [ \t]*
            )?                  # title is optional
            (?:\n+|\Z)
        }
        {}mx) {
        # Copy the captures before anything else can run a regex.
        my ($id, $url, $title) = (lc $1, $2, $3);   # Link IDs are case-insensitive

        $g_urls{$id} = _EncodeAmpsAndAngles($url);

        # Test definedness, not truth, so that a title of "0" is kept.
        if (defined $title) {
            $title =~ s/"/&quot;/g;
            $g_titles{$id} = $title;
        }
        else {
            # A redefinition without a title must not keep the title of
            # an earlier definition of the same id.
            delete $g_titles{$id};
        }
    }

    return $text;
}
sub _HashHTMLBlocks {
#
# Replaces block-level HTML in the text with MD5 keys.
#
# Each matched block is stored in %g_html_blocks under the MD5 digest of
# its text, and the block is replaced by that digest set off by blank
# lines. Returns the modified text.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Stores one block and returns the placeholder that replaces it.
    my $stash = sub {
        my $block = shift;
        my $key   = md5_hex($block);
        $g_html_blocks{$key} = $block;
        return "\n\n" . $key . "\n\n";
    };

    # Hashify HTML blocks:
    # We only want to do this for block-level HTML tags, such as headers,
    # lists, and tables. That's because we still want to wrap <p>s around
    # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
    # phrase emphasis, and spans. The list of tags we're looking for is
    # hard-coded:
    my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
    my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;

    # First, look for nested blocks, e.g.:
    #   <div>
    #       <div>
    #       tags for inner block must be indented.
    #       </div>
    #   </div>
    #
    # The outermost tags must start at the left margin for this to match, and
    # the inner nested divs must be indented.
    # We need to do this before the next, more liberal match, because the next
    # match will start at the first `<div>` and stop at the first `</div>`.
    $text =~ s{
        (                       # save in $1
            ^                   # start of line (with /m)
            <($block_tags_a)    # start tag = $2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
    }{
        $stash->($1);
    }egmx;

    #
    # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
    #
    $text =~ s{
        (                       # save in $1
            ^                   # start of line (with /m)
            <($block_tags_b)    # start tag = $2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
    }{
        $stash->($1);
    }egmx;

    # Special case just for <hr />. It was easier to make a special case than
    # to make the other regex more complicated.
    $text =~ s{
        (?:
            (?<=\n\n)           # Starting after a blank line
            |                   # or
            \A\n?               # the beginning of the doc
        )
        (                       # save in $1
            [ ]{0,$less_than_tab}
            <(hr)               # start tag = $2
            \b                  # word break
            ([^<>])*?           #
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
    }{
        $stash->($1);
    }egx;

    # Special case for standalone HTML comments:
    $text =~ s{
        (?:
            (?<=\n\n)           # Starting after a blank line
            |                   # or
            \A\n?               # the beginning of the doc
        )
        (                       # save in $1
            [ ]{0,$less_than_tab}
            (?s:
                <!
                (--.*?--\s*)+
                >
            )
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
    }{
        $stash->($1);
    }egx;

    return $text;
}
sub _RunBlockGamut {
#
# Applies, in order, all the transformations that form block-level
# tags like paragraphs, headers, and list items. Returns the result.
#
    my $text = shift;

    $text = _DoHeaders($text);

    # Do Horizontal Rules: three or more *, - or _ on a line of their
    # own, each optionally surrounded by a single space.
    for my $rule_char ('\*', '-', '_') {
        $text =~ s{^[ ]{0,2}(?:[ ]?${rule_char}[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
    }

    $text = _DoLists($text);

    $text = _DoCodeBlocks($text);

    $text = _DoBlockQuotes($text);

    # We already ran _HashHTMLBlocks() before, in Markdown(), but that
    # was to escape raw HTML in the original Markdown source. This time,
    # we're escaping the markup we've just created, so that we don't wrap
    # <p> tags around block-level tags.
    $text = _HashHTMLBlocks($text);

    $text = _FormParagraphs($text);

    return $text;
}
sub _RunSpanGamut {
#
# Applies, in order, all the transformations that occur *within*
# block-level tags like paragraphs, headers, and list items.
#
    my $text = shift;

    my @passes = (
        \&_DoCodeSpans,
        \&_EscapeSpecialChars,

        # Process anchor and image tags. Images must come first,
        # because ![foo][f] looks like an anchor.
        \&_DoImages,
        \&_DoAnchors,

        # Make links out of things like `<http://example.com/>`
        # Must come after _DoAnchors(), because you can use < and >
        # delimiters in inline links like [this](<url>).
        \&_DoAutoLinks,

        \&_EncodeAmpsAndAngles,
        \&_DoItalicsAndBoldAndStrike,
    );

    foreach my $pass (@passes) {
        $text = $pass->($text);
    }

    # Do hard breaks:
    $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;

    return $text;
}
sub _EscapeSpecialChars {
#
# Hides characters that must not be treated as Markdown syntax.
#
# The text is split by _TokenizeHTML(). Inside HTML tags every *, _
# and ~ is replaced by its entry in %g_escape_table, so that it cannot
# conflict with its use for emphasis and strike-through. Text between
# tags goes through _EncodeBackslashEscapes(). Returns the rebuilt text.
#
    my $text   = shift;
    my $tokens = _TokenizeHTML($text);

    my $escaped = '';    # text rebuilt from the tokens
    foreach my $token (@$tokens) {
        my ($type, $chunk) = @$token;
        if ($type eq "tag") {
            # We're replacing each such character with its corresponding
            # MD5 checksum value; this is likely overkill, but it should
            # prevent us from colliding with the escape values by accident.
            $chunk =~ s/([*_~])/$g_escape_table{$1}/g;
        }
        else {
            $chunk = _EncodeBackslashEscapes($chunk);
        }
        $escaped .= $chunk;
    }
    return $escaped;
}
sub _DoAnchors {
#
# Turn Markdown link shortcuts into XHTML <a> tags.
#
    my $text = shift;

    # Replaces *, _ and ~ with their %g_escape_table entries so that they
    # cannot conflict with italics, bold and strike through later on.
    my $mask = sub {
        my $str = shift;
        $str =~ s/([*_~])/$g_escape_table{$1}/g;
        return $str;
    };

    #
    # First, handle reference-style links: [link text] [id]
    #
    $text =~ s{
        (                   # wrap whole match in $1
          \[
            ($g_nested_brackets)    # link text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)           # id = $3
          \]
        )
    }{
        my ($whole_match, $link_text, $link_id) = ($1, $2, lc $3);

        # for shortcut links like [this][].
        $link_id = lc $link_text if $link_id eq "";

        # An unknown id leaves the source text untouched.
        my $result = $whole_match;
        if (defined $g_urls{$link_id}) {
            my $url = _PrefixURL($g_urls{$link_id});
            $url    = $mask->($url);
            $result = "<a href=\"$url\"";
            if ( defined $g_titles{$link_id} ) {
                my $title = $mask->($g_titles{$link_id});
                $result .= " title=\"$title\"";
            }
            $result .= ">$link_text</a>";
        }
        $result;
    }xsge;

    #
    # Next, inline-style links: [link text](url "optional title")
    #
    $text =~ s{
        (               # wrap whole match in $1
          \[
            ($g_nested_brackets)    # link text = $2
          \]
          \(            # literal paren
            [ \t]*
            <?(.*?)>?   # href = $3
            [ \t]*
            (           # $4
              (['"])    # quote char = $5
              (.*?)     # Title = $6
              \5        # matching quote
            )?          # title is optional
          \)
        )
    }{
        my ($link_text, $url, $title) = ($2, $3, $6);

        $url = _PrefixURL($url);
        $url = $mask->($url);
        my $result = "<a href=\"$url\"";

        if (defined $title) {
            $title =~ s/"/&quot;/g;
            $title = $mask->($title);
            $result .= " title=\"$title\"";
        }

        $result . ">$link_text</a>";
    }xsge;

    return $text;
}
sub _DoImages {
#
# Turn Markdown image shortcuts into <img> tags.
#
    my $text = shift;

    # Replaces *, _ and ~ with their %g_escape_table entries so that they
    # cannot conflict with italics, bold and strike through later on.
    my $mask = sub {
        my $str = shift;
        $str =~ s/([*_~])/$g_escape_table{$1}/g;
        return $str;
    };

    #
    # First, handle reference-style labeled images: ![alt text][id]
    #
    $text =~ s{
        (               # wrap whole match in $1
          !\[
            (.*?)       # alt text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)       # id = $3
          \]

        )
    }{
        my ($whole_match, $alt_text, $link_id) = ($1, $2, lc $3);

        # for shortcut links like ![this][]. The id is taken from the
        # alt text before its quotes are encoded.
        $link_id = lc $alt_text if $link_id eq "";

        $alt_text =~ s/"/&quot;/g;

        # If there's no such link ID, leave intact:
        my $result = $whole_match;
        if (defined $g_urls{$link_id}) {
            my $url = _PrefixURL($g_urls{$link_id});
            $url    = $mask->($url);
            $result = "<img src=\"$url\" alt=\"$alt_text\"";
            if (defined $g_titles{$link_id}) {
                my $title = $mask->($g_titles{$link_id});
                $result .= " title=\"$title\"";
            }
            $result .= $g_empty_element_suffix;
        }
        $result;
    }xsge;

    #
    # Next, handle inline images: ![alt text](url "optional title")
    # Don't forget: encode * and _
    #
    $text =~ s{
        (               # wrap whole match in $1
          !\[
            (.*?)       # alt text = $2
          \]
          \(            # literal paren
            [ \t]*
            <?(\S+?)>?  # src url = $3
            [ \t]*
            (           # $4
              (['"])    # quote char = $5
              (.*?)     # title = $6
              \5        # matching quote
              [ \t]*
            )?          # title is optional
          \)
        )
    }{
        my ($alt_text, $url) = ($2, $3);
        # A missing title becomes the empty string, so the title
        # attribute is always written for inline images.
        my $title = defined($6) ? $6 : '';

        $url = _PrefixURL($url);
        $alt_text =~ s/"/&quot;/g;
        $title    =~ s/"/&quot;/g;
        $url   = $mask->($url);
        $title = $mask->($title);

        "<img src=\"$url\" alt=\"$alt_text\" title=\"$title\""
            . $g_empty_element_suffix;
    }xsge;

    return $text;
}
sub _DoHeaders {
#
# Converts setext-style and atx-style headers to <h1> .. <h6> elements.
# Header text is passed through _RunSpanGamut().
#
    my $text = shift;

    # Setext-style headers:
    #     Header 1
    #     ========
    #
    #     Header 2
    #     --------
    #
    #     Header 3
    #     ~~~~~~~~
    #
    # The header may also be preceded by a rule line of the same character.
    my $level = 0;
    for my $rule_char ('=', '-', '~') {
        $level++;
        my $rule = quotemeta $rule_char;
        $text =~ s{ ^(?:$rule+[ \t]*\n)?(.+)[ \t]*\n$rule+[ \t]*\n+ }{
            "<h$level>" . _RunSpanGamut($1) . "</h$level>\n\n";
        }egmx;
    }

    # atx-style headers:
    #   # Header 1
    #   ## Header 2
    #   ## Header 2 with closing hashes ##
    #   ...
    #   ###### Header 6
    #
    $text =~ s{
            ^(\#{1,6})  # $1 = string of #'s
            [ \t]*
            (.+?)       # $2 = Header text
            [ \t]*
            \#*         # optional closing #'s (not counted)
            \n+
        }{
            my $depth = length($1);
            "<h$depth>" . _RunSpanGamut($2) . "</h$depth>\n\n";
        }egmx;

    return $text;
}
sub _DoLists {
#
# Form HTML ordered (numbered) and unordered (bulleted) lists.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Re-usable patterns to match list item bullets and number markers:
    my $marker_ul  = qr/[*+-]/;
    my $marker_ol  = qr/\d+[.]/;
    my $marker_any = qr/(?:$marker_ul|$marker_ol)/;

    # Re-usable pattern to match any entire ul or ol list:
    my $whole_list = qr{
        (                               # $1 = whole list
          (                             # $2
            [ ]{0,$less_than_tab}
            (${marker_any})             # $3 = first list item marker
            [ \t]+
          )
          (?s:.+?)
          (                             # $4
              \z
            |
              \n{2,}
              (?=\S)
              (?!                       # Negative lookahead for another list item marker
                [ \t]*
                ${marker_any}[ \t]+
              )
          )
        )
    }mx;

    # Builds the <ul>/<ol> element for one matched list. The list type is
    # decided by the marker of the first item.
    my $build_list = sub {
        my ($list, $first_marker) = @_;
        my $list_type = ($first_marker =~ m/$marker_ul/) ? "ul" : "ol";
        # Turn double returns into triple returns, so that we can make a
        # paragraph for the last item in a list, if necessary:
        $list =~ s/\n{2,}/\n\n\n/g;
        my $items = _ProcessListItems($list, $marker_any);
        return "<$list_type>\n" . $items . "</$list_type>\n";
    };

    # We use a different prefix before nested lists than top-level lists.
    # See extended comment in _ProcessListItems().
    #
    # Note: There's a bit of duplication here. The original implementation
    # created a scalar regex pattern as the conditional result of the test on
    # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
    # substitution once, using the scalar as the pattern. This worked
    # everywhere except when running under MT on one hosting account, where
    # it caused all rebuilds to be killed by the reaper. The only guess,
    # backed up by the fact that this workaround works, is that Perl
    # optimizes the substitution when it can figure out that the pattern
    # will never change, and when this optimization isn't on, we run afoul
    # of the reaper. Thus the slightly redundant code that uses two static
    # s/// patterns rather than one conditional pattern.

    if ($g_list_level) {
        $text =~ s{
                ^
                $whole_list
            }{
                $build_list->($1, $3);
            }egmx;
    }
    else {
        $text =~ s{
                (?:(?<=\n\n)|\A\n?)
                $whole_list
            }{
                $build_list->($1, $3);
            }egmx;
    }

    return $text;
}
sub _ProcessListItems {
#
# Process the contents of a single ordered or unordered list, splitting it
# into individual list items.
#
# Parameters: the list text, and the pattern matching any list marker.
# Returns:    the text with each item wrapped in <li> ... </li>.
#
    my $list_str   = shift;
    my $marker_any = shift;

    # The $g_list_level global keeps track of when we're inside a list.
    # Each time we enter a list, we increment it; when we leave a list,
    # we decrement. If it's zero, we're not in a list anymore.
    #
    # We do this because when we're not inside a list, we want to treat
    # something like this:
    #
    #       I recommend upgrading to version
    #       8. Oops, now this line is treated
    #       as a sub-list.
    #
    # As a single paragraph, despite the fact that the second line starts
    # with a digit-period-space sequence.
    #
    # Whereas when we're inside a list (or sub-list), that line will be
    # treated as the start of a sub-list. What a kludge, huh? This is
    # an aspect of Markdown's syntax that's hard to parse perfectly
    # without resorting to mind-reading. Perhaps the solution is to
    # change the syntax rules such that sub-lists must start with a
    # starting cardinal number; e.g. "1." or "a.".

    $g_list_level++;

    # trim trailing blank lines:
    $list_str =~ s/\n{2,}\z/\n/;

    $list_str =~ s{
        (\n)?                           # leading line = $1
        (^[ \t]*)                       # leading whitespace = $2
        ($marker_any) [ \t]+            # list marker = $3
        ((?s:.+?)                       # list item text = $4
        (\n{1,2}))
        (?= \n* (\z | \2 ($marker_any) [ \t]+))
    }{
        my ($blank_before, $body) = ($1, $4);

        # An item preceded by a blank line, or containing one, gets the
        # full block treatment; otherwise only sub-lists and span-level
        # markup are processed.
        if ($blank_before or ($body =~ m/\n{2,}/)) {
            $body = _RunBlockGamut(_Outdent($body));
        }
        else {
            # Recursion for sub-lists:
            $body = _DoLists(_Outdent($body));
            chomp $body;
            $body = _RunSpanGamut($body);
        }

        "<li>" . $body . "</li>\n";
    }egmx;

    $g_list_level--;
    return $list_str;
}
sub _DoCodeBlocks {
#
# Process Markdown `<pre><code>` blocks, both the indented form and the
# form fenced by lines of three or more backticks.
#
    my $text = shift;

    # Common tail of both forms: expand tabs, trim, and wrap the already
    # encoded code in <pre><code>.
    my $wrap = sub {
        my $code = _Detab(shift);
        $code =~ s/\A\n+//;     # trim leading newlines
        $code =~ s/\s+\z//;     # trim trailing whitespace
        return "\n\n<pre><code>" . $code . "\n</code></pre>\n\n";
    };

    $text =~ s{
            (?:\n\n|\A)
            (               # $1 = the code block -- one or more lines, starting with a space/tab
              (?:
                (?:[ ]{$g_tab_width} | \t)  # Lines must start with a tab or a tab-width of spaces
                .*\n+
              )+
            )
            ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
        }{
            my $indented = $1;
            $wrap->( _EncodeCode( _Outdent($indented) ) );
        }egmx;

    $text =~ s{
            (?:\n|\A)
            ``(`+)[ \t]*(?:[\w.-]+[ \t]*)?\n
            (               # $2 = the code block -- one or more lines, starting with ```
              (?:
                .*\n+
              )+?
            )
            (?:(?:``\1[ \t]*(?:\n|\Z))|\Z) # and ending with ``` or end of document
        }{
            my $fenced = $2;
            $wrap->( _EncodeCode($fenced) );
        }egmx;

    return $text;
}
sub _DoCodeSpans {
#
#   *   Backtick quotes are used for <code></code> spans.
#
#   *   You can use multiple backticks as the delimiters if you want to
#       include literal backticks in the code span. So, this input:
#
#         Just type ``foo `bar` baz`` at the prompt.
#
#       Will translate to:
#
#         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
#
#       There's no arbitrary limit to the number of backticks you
#       can use as delimiters. If you need three consecutive backticks
#       in your code, use four for delimiters, etc.
#
#   *   You can use spaces to get literal backticks at the edges:
#
#         ... type `` `bar` `` ...
#
#       Turns to:
#
#         ... type <code>`bar`</code> ...
#
    my $text = shift;

    $text =~ s{
            (`+)        # $1 = Opening run of `
            (.+?)       # $2 = The code block
            (?<!`)
            \1          # Matching closer
            (?!`)
        }{
            my $code = "$2";
            $code =~ s/^[ \t]*//g;  # leading whitespace
            $code =~ s/[ \t]*$//g;  # trailing whitespace
            "<code>" . _EncodeCode($code) . "</code>";
        }egsx;

    return $text;
}
sub _EncodeCode {
#
# Encode/escape certain characters inside Markdown code runs.
# The point is that in code, these characters are literals,
# and lose their special Markdown meanings.
#
    my $code = shift;

    # Encode all ampersands; HTML entities are not
    # entities within a Markdown code span.
    $code =~ s/&/&amp;/g;

    # Encode $'s, but only if we're running under Blosxom.
    # (Blosxom interpolates Perl variables in article bodies.)
    {
        no warnings 'once';
        $code =~ s/\$/&#036;/g if defined($blosxom::version);
    }

    # Do the angle bracket song and dance:
    $code =~ s/</&lt;/g;
    $code =~ s/>/&gt;/g;

    # Now, escape characters that are magic in Markdown:
    $code =~ s/([*_~{}\[\]\\])/$g_escape_table{$1}/g;

    return $code;
}
sub _DoItalicsAndBoldAndStrike {
#
# Converts **text** and __text__ to <strong>, ~~text~~ to <strike>, and
# *text* and _text_ to <em>. The underscore forms only apply when the
# underscores are not adjacent to word characters on the outside.
#
    my $text = shift;

    # Applied strictly in this order; <strong> must go first.
    my @rules = (
        [ qr{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }sx,             'strong' ],
        [ qr{ (?<!\w) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\w) }sx,  'strong' ],
        [ qr{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ }sx,                 'strike' ],
        [ qr{ \* (?=\S) (.+?) (?<=\S) \* }sx,                      'em'     ],
        [ qr{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }sx,         'em'     ],
    );

    foreach my $rule (@rules) {
        my ($pattern, $tag) = @$rule;
        $text =~ s{$pattern}{<$tag>$1</$tag>}g;
    }

    return $text;
}
sub _DoBlockQuotes {
#
# Converts runs of lines quoted with '>' into <blockquote> elements.
# The quoted text has one level of quoting removed and is then processed
# recursively by _RunBlockGamut().
#
    my $text = shift;

    $text =~ s{
          (                             # Wrap whole match in $1
            (
              ^[ \t]*>[ \t]?            # '>' at the start of a line
                .+\n                    # rest of the first line
              (.+\n)*                   # subsequent consecutive lines
              \n*                       # blanks
            )+
          )
        }{
            my $bq = $1;
            $bq =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
            $bq =~ s/^[ \t]+$//mg;          # trim whitespace-only lines
            $bq = _RunBlockGamut($bq);      # recurse

            # Indent every line of the quoted content. This needs /m:
            # without it only the first line is indented, and the <pre>
            # fix-up below then strips two spaces from code lines that
            # were never indented.
            $bq =~ s/^/  /mg;
            # These leading spaces screw with <pre> content, so we need to fix that:
            $bq =~ s{
                    (\s*<pre>.+?</pre>)
                }{
                    my $pre = $1;
                    $pre =~ s/^  //mg;
                    $pre;
                }egsx;

            "<blockquote>\n$bq\n</blockquote>\n\n";
        }egmx;

    return $text;
}
sub _FormParagraphs {
#
# Params:
#   $text - string to process with html <p> tags
#
# The text is split on blank lines. A chunk that is a key of
# %g_html_blocks is replaced by the stored HTML block; any other chunk
# is run through _RunSpanGamut() and wrapped in <p> ... </p>.
# Returns the chunks joined by blank lines.
#
    my $text = shift;

    # Strip leading and trailing lines:
    $text =~ s/\A\n+//;
    $text =~ s/\n+\z//;

    my @rendered;
    foreach my $graf (split /\n{2,}/, $text) {
        if (defined $g_html_blocks{$graf}) {
            # Unhashify HTML blocks
            push @rendered, $g_html_blocks{$graf};
            next;
        }

        # Wrap <p> tags, replacing any leading spaces and tabs.
        my $para = _RunSpanGamut($graf);
        $para =~ s/^([ \t]*)/<p>/;
        push @rendered, $para . "</p>";
    }

    return join "\n\n", @rendered;
}
sub _EncodeAmpsAndAngles {
# Smart processing for ampersands and angle brackets that need to be encoded.
#
# An ampersand becomes &amp; unless it already starts something shaped
# like an entity; a '<' becomes &lt; unless it is followed by a letter
# or one of / ? $ !
    my ($encoded) = @_;

    for ($encoded) {
        # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
        #   http://bumppo.net/projects/amputator/
        s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;

        # Encode naked <'s
        s{<(?![a-z/?\$!])}{&lt;}gi;
    }

    return $encoded;
}
sub _EncodeBackslashEscapes {
#
#   Parameter:  String.
#   Returns:    The string, with each backslash escape sequence replaced
#               by the %g_escape_table entry of the escaped character.
#
    my $str = shift;

    # One left-to-right pass. Because matching resumes after each
    # replacement, a doubled backslash is always consumed as a pair
    # before the backslash of a following escape is considered.
    $str =~ s/\\([\\`*_~{}\[\]()>#+\-.!])/$g_escape_table{$1}/g;

    return $str;
}
sub _DoAutoLinks {
#
# Turns <http://...>, <https://...> and <ftp://...> into links, and
# <address@domain.foo> (optionally prefixed by "mailto:") into an
# obfuscated mailto link built by _EncodeEmailAddress().
#
    my $text = shift;

    $text =~ s{<((?:https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;

    # Email addresses: <address@domain.foo>
    $text =~ s{
        <
        (?:mailto:)?
        (
            [-.\w]+
            \@
            [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
        )
        >
    }{
        my $address = _UnescapeSpecialChars($1);
        _EncodeEmailAddress($address);
    }egix;

    return $text;
}
sub _EncodeEmailAddress {
#
#   Input: an email address, e.g. "foo@example.com"
#
#   Output: the email address as a mailto link, with each character
#       of the address encoded as either a decimal or hex entity, in
#       the hopes of foiling most address harvesting spam bots. E.g.:
#
#     <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
#       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
#       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
#
#   Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
#   mailing list: <http://tinyurl.com/yu7ue>
#
    my $addr = shift;

    # No srand here: rand seeds itself on first use, and reseeding on
    # every call only makes the output more predictable.
    my @encode = (
        sub { '&#'  .                 ord(shift)   . ';' },    # decimal entity
        sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },    # hex entity
        sub {                             shift          },    # raw character
    );

    $addr = "mailto:" . $addr;

    $addr =~ s{(.)}{
        my $char = $1;
        if ( $char eq '@' ) {
            # this *must* be encoded. I insist.
            # Pick decimal or hex at random. "int rand 1" is always 0 and
            # would therefore never choose the hex form.
            $char = $encode[int rand 2]->($char);
        } elsif ( $char ne ':' ) {
            # leave ':' alone (to spot mailto: later)
            my $r = rand;
            # roughly 10% raw, 45% hex, 45% dec
            $char = (
                $r > .9   ?  $encode[2]->($char)  :
                $r < .45  ?  $encode[1]->($char)  :
                             $encode[0]->($char)
            );
        }
        $char;
    }gex;

    $addr = qq{<a href="$addr">$addr</a>};
    $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part

    return $addr;
}
sub _UnescapeSpecialChars {
#
# Swap back in all the special characters we've hidden.
#
# Parameter: text in which special characters may have been replaced
#            by their %g_escape_table hash values.
# Returns:   the text with every such hash value replaced by the
#            character it stands for.
#
    my ($text) = @_;

    for my $char (keys %g_escape_table) {
        my $digest = $g_escape_table{$char};
        $text =~ s/$digest/$char/g;
    }

    return $text;
}
sub _TokenizeHTML {
#
# Parameter: String containing HTML markup.
# Returns:   Reference to an array of the tokens comprising the input
#            string. Each token is either a tag (possibly with nested
#            tags contained therein, such as <a href="<MTFoo>">), or a
#            run of text between tags. Each element of the array is a
#            two-element array; the first is either 'tag' or 'text';
#            the second is the actual value.
#
# Comments (<!-- ... -->) and processing instructions (<? ... ?>) are
# returned as 'tag' tokens too.
#
# Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
# <http://www.bradchoate.com/past/mtregex.php>
#
    my ($str) = @_;
    my @tokens;
    my $cursor = 0;    # offset just past the most recently emitted tag

    # Perl regexes cannot recurse portably, so tag nesting is unrolled
    # to a fixed depth.
    my $depth = 6;
    my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
    my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
                   (?s: <\? .*? \?> ) | # processing instruction
                   $nested_tags/ix; # nested tags

    while ($str =~ m/($match)/g) {
        my ($tag_start, $tag_end) = ($-[0], $+[0]);

        # Anything between the previous tag and this one is plain text.
        push @tokens, ['text', substr($str, $cursor, $tag_start - $cursor)]
            if $cursor < $tag_start;

        push @tokens, ['tag', $1];
        $cursor = $tag_end;
    }

    # Trailing text after the last tag (or the whole string if no tags).
    my $total = length $str;
    push @tokens, ['text', substr($str, $cursor, $total - $cursor)]
        if $cursor < $total;

    return \@tokens;
}
sub _Outdent {
#
# Remove one level of line-leading tabs or spaces.
#
# Parameter: text, possibly spanning several lines.
# Returns:   the text with one leading tab, or up to $g_tab_width
#            leading spaces, removed from the start of every line.
#
    my ($text) = @_;

    # The /m flag is kept by the compiled pattern, so ^ still anchors
    # at the start of each line when it is interpolated below.
    my $one_indent = qr/^(\t|[ ]{1,$g_tab_width})/m;
    $text =~ s/$one_indent//g;

    return $text;
}
sub _Detab {
#
# Expand each tab into spaces up to the next multiple of $g_tab_width.
#
# Parameter: text possibly containing tabs.
# Returns:   the text with every tab replaced by one to $g_tab_width
#            spaces.
#
# Cribbed from a post by Bart Lateur:
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
#
    my ($text) = @_;

    $text =~ s{(.*?)\t}{
        # $1 is the tab-free text since the previous match (or since the
        # line start); its length decides how many spaces the tab becomes.
        my $lead = $1;
        my $pad  = $g_tab_width - length($lead) % $g_tab_width;
        $lead . (' ' x $pad);
    }ge;

    return $text;
}
sub _PrefixURL {
#
# Add URL prefix if needed.
#
# Parameter: a URL as written in the document.
# Returns:   the URL unchanged when no prefix is configured, when it is
#            protocol-relative ("//host/...") or when it starts with a
#            URI scheme ("http:", "mailto:", ...). Otherwise the URL
#            with $g_url_prefix prepended -- or $g_img_prefix instead,
#            if that is set and the URL ends in an image suffix
#            (png, gif, jpg, jpeg, svg, svgz). Prefix and URL are
#            joined by exactly one "/".
#
    my $url = shift;

    return $url unless $g_url_prefix ne '' || $g_img_prefix ne '';
    return $url if $url =~ m,^//, || $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/;

    my $prefix = $g_url_prefix;
    # "svgz?" covers both .svg and .svgz; the former "svg?z" matched
    # .svz/.svgz and so skipped plain .svg images.
    $prefix = $g_img_prefix
        if $g_img_prefix ne '' && $url =~ /\.(?:png|gif|jpe?g|svgz?)$/i;

    # Only the image prefix is set and this is not an image URL.
    return $url unless $prefix ne '';

    $prefix .= '/' if substr($prefix, -1, 1) ne '/';
    $prefix .= substr($url, 0, 1) eq '/' ? substr($url, 1) : $url;
    return $prefix;
}
1403 __END__
1406 =pod
1408 =head1 NAME
1410 B<Markdown>
1413 =head1 SYNOPSIS
1415 B<Markdown.pl> [ B<--help> ] [ B<--html4tags> ] [ B<--htmlroot>=I<prefix> ]
1416 [ B<--imageroot>=I<prefix> ] [ B<--version> ] [ B<--shortversion> ]
1417 [ I<file> ... ]
1420 =head1 DESCRIPTION
1422 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1423 easy-to-write structured text format into HTML. Markdown's text format
1424 is most similar to that of plain text email, and supports features such
1425 as headers, *emphasis*, code blocks, blockquotes, and links.
1427 Markdown's syntax is designed not as a generic markup language, but
1428 specifically to serve as a front-end to (X)HTML. You can use span-level
1429 HTML tags anywhere in a Markdown document, and you can use block level
1430 HTML tags (like <div> and <table>) as well.
1432 For more information about Markdown's syntax, see the `basics.text`
1433 and `syntax.text` files included with `Markdown.pl`.
1436 =head1 OPTIONS
1438 Use "--" to end switch parsing. For example, to open a file named "-z", use:
1440 Markdown.pl -- -z
1442 =over 4
1445 =item B<--html4tags>
1447 Use HTML 4 style for empty element tags, e.g.:
1449 <br>
1451 instead of Markdown's default XHTML style tags, e.g.:
1453 <br />
1456 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
1458 Any non-absolute URLs have I<prefix> prepended.
1461 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
1463 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
1464 if any) but only if they end in an image suffix.
1467 =item B<-V>, B<--version>
1469 Display Markdown's version number and copyright information.
1472 =item B<-s>, B<--shortversion>
1474 Display the short-form version number.
1477 =item B<-h>, B<--help>
1479 Display Markdown's help.
1482 =back
1485 =head1 VERSION HISTORY
1487 See the readme file for detailed release notes for this version.
1489 1.0.3 - 06 Sep 2015
1491 1.0.2 - 03 Sep 2015
1493 1.0.1 - 14 Dec 2004
1495 1.0 - 28 Aug 2004
1498 =head1 AUTHORS
1500 John Gruber
1501 http://daringfireball.net
1502 http://daringfireball.net/projects/markdown/
1504 PHP port and other contributions by Michel Fortin
1505 http://michelf.com
1507 Additional enhancements and tweaks by Kyle J. McKay
1508 mackyle<at>gmail.com
1511 =head1 COPYRIGHT AND LICENSE
1513 Copyright (C) 2003-2004 John Gruber
1514 Copyright (C) 2015 Kyle J. McKay
1515 All rights reserved.
1517 Redistribution and use in source and binary forms, with or without
1518 modification, are permitted provided that the following conditions are
1519 met:
1521 * Redistributions of source code must retain the above copyright
1522 notice, this list of conditions and the following disclaimer.
1524 * Redistributions in binary form must reproduce the above copyright
1525 notice, this list of conditions and the following disclaimer in the
1526 documentation and/or other materials provided with the distribution.
1528 * Neither the name "Markdown" nor the names of its contributors may
1529 be used to endorse or promote products derived from this software
1530 without specific prior written permission.
1532 This software is provided by the copyright holders and contributors "as
1533 is" and any express or implied warranties, including, but not limited
1534 to, the implied warranties of merchantability and fitness for a
1535 particular purpose are disclaimed. In no event shall the copyright owner
1536 or contributors be liable for any direct, indirect, incidental, special,
1537 exemplary, or consequential damages (including, but not limited to,
1538 procurement of substitute goods or services; loss of use, data, or
1539 profits; or business interruption) however caused and on any theory of
1540 liability, whether in contract, strict liability, or tort (including
1541 negligence or otherwise) arising in any way out of the use of this
1542 software, even if advised of the possibility of such damage.
1544 =cut