Markdown 1.0.4
[markdown.git] / Markdown.pl
blob685b87b4508f886c41b49fa06cc4a95d0e0bd5d1
#!/usr/bin/env perl

#
# Markdown -- A text-to-HTML conversion tool for web writers
#
# Copyright (C) 2004 John Gruber
# Copyright (C) 2015,2016 Kyle J. McKay
#

package Markdown;

require 5.006_000;
use strict;
use warnings;

# Release the DATA handle if one happens to be open.
close(DATA) if fileno(DATA);

require Exporter;
use Digest::MD5 qw(md5_hex);
use File::Basename qw(basename);
use vars qw($VERSION @ISA @EXPORT_OK);
@ISA = qw(Exporter);
@EXPORT_OK = qw(Markdown);

# Record this file in %INC under "Markdown.pm" as well, unless an entry
# already exists (presumably so a later "use Markdown" does not try to
# load the module a second time -- confirm against callers).
$INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'};

$VERSION = '1.0.4';
# Sun 05 Jun 2016

## Disabled; causes problems under Perl 5.6.1:
# use utf8;
# binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html

#
# Global default settings:
#
my $g_empty_element_suffix = " />"; # Change to ">" for HTML output
my $g_tab_width = 4;                # Legacy even though it's wrong

#
# Globals:
#

# Regex to match balanced [brackets]. See Friedl's
# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
# The variable is declared before it is assigned so that the pattern can
# refer to itself through the postponed subexpression.
my $g_nested_brackets;
$g_nested_brackets = qr{
    (?>                                 # Atomic matching
        [^\[\]]+                        # Anything other than brackets
      |
        \[
          (??{ $g_nested_brackets })    # Recursive set of nested brackets
        \]
    )*
}x;

# Table of hash values for escaped characters:
my %g_escape_table;
foreach my $char (split //, "\\\`*_{}[]()>#+-.!~") {
    $g_escape_table{$char} = md5_hex($char);
}

# Global hashes, used by various utility routines
my %g_urls;
my %g_titles;
my %g_html_blocks;
my %opt;

# Used to track when we're inside an ordered or unordered list
# (see _ProcessListItems() for details):
my $g_list_level = 0;

#### Blosxom plug-in interface ##########################################

# Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
# which posts Markdown should process, using a "meta-markup: markdown"
# header. If it's set to 0 (the default), Markdown will process all
# entries.
my $g_blosxom_use_meta = 0;
sub start {
    # Blosxom plug-in entry point (see the interface banner above).
    # Takes no arguments and always returns a true value.
    return 1;
}
sub story {
    # Blosxom per-story hook.
    #
    # Parameters (positional): $pkg, $path, $filename, $story_ref,
    # $title_ref, $body_ref.  Only $body_ref is used here: the string it
    # refers to is replaced with its Markdown() rendering.
    #
    # The body is converted when $g_blosxom_use_meta is off, or when
    # $meta::markup is set to "markdown" (case-insensitive, surrounding
    # whitespace allowed).  Always returns 1.
    my ($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;

    if ( (! $g_blosxom_use_meta) or
         (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
       ) {
        $$body_ref = Markdown($$body_ref);
    }

    return 1;
}
#### Movable Type plug-in interface #####################################
eval {require MT}; # Test to see if we're running in MT.
unless ($@) {
    require MT;
    MT->import;
    require MT::Template::Context;
    MT::Template::Context->import;

    eval {require MT::Plugin}; # Test to see if we're running >= MT 3.0.
    unless ($@) {
        require MT::Plugin;
        MT::Plugin->import;
        my $plugin = MT::Plugin->new({
            name        => "Markdown",
            description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
            doc_link    => 'http://daringfireball.net/projects/markdown/'
        });
        MT->add_plugin( $plugin );
    }

    # <MTMarkdownOptions output="..."> container tag: stashes the requested
    # output mode (lower-cased) for the text filters below, then builds and
    # returns the enclosed template tokens.
    MT::Template::Context->add_container_tag(MarkdownOptions => sub {
        my $ctx     = shift;
        my $args    = shift;
        my $builder = $ctx->stash('builder');
        my $tokens  = $ctx->stash('tokens');

        if (defined ($args->{'output'}) ) {
            $ctx->stash('markdown_output', lc $args->{'output'});
        }

        defined (my $str = $builder->build($ctx, $tokens) )
            or return $ctx->error($builder->errstr);
        $str; # return value
    });

    MT->add_text_filter('markdown' => {
        label     => 'Markdown',
        docs      => 'http://daringfireball.net/projects/markdown/',
        on_format => sub {
            my $text = shift;
            my $ctx  = shift;
            my $raw  = 0;
            if (defined $ctx) {
                my $output = $ctx->stash('markdown_output');
                if (defined $output && $output =~ m/^html/i) {
                    # HTML (not XHTML) empty-element style requested.
                    $g_empty_element_suffix = ">";
                    $ctx->stash('markdown_output', '');
                }
                elsif (defined $output && $output eq 'raw') {
                    # Pass the text through unconverted.
                    $raw = 1;
                    $ctx->stash('markdown_output', '');
                }
                else {
                    $raw = 0;
                    $g_empty_element_suffix = " />";
                }
            }
            $text = $raw ? $text : Markdown($text);
            $text;
        },
    });

    # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
    my $smartypants;

    {
        no warnings "once";
        $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
    }

    if ($smartypants) {
        MT->add_text_filter('markdown_with_smartypants' => {
            label     => 'Markdown With SmartyPants',
            docs      => 'http://daringfireball.net/projects/markdown/',
            on_format => sub {
                my $text = shift;
                my $ctx  = shift;
                if (defined $ctx) {
                    my $output = $ctx->stash('markdown_output');
                    if (defined $output && $output eq 'html') {
                        $g_empty_element_suffix = ">";
                    }
                    else {
                        $g_empty_element_suffix = " />";
                    }
                }
                $text = Markdown($text);
                $text = $smartypants->($text, '1');
            },
        });
    }
}
elsif (!caller) {
    #### BBEdit/command-line text filter interface ##########################
    # Needs to be hidden from MT (and Blosxom when running in static mode).

    # We're only using $blosxom::version once; tell Perl not to warn us:
    no warnings 'once';
    unless ( defined($blosxom::version) ) {
        use warnings;

        #### Check for command-line switches: #################
        my %options = ();
        my %cli_opts;
        use Getopt::Long;
        Getopt::Long::Configure('pass_through');
        GetOptions(\%cli_opts,
            'help|h',
            'version|V',
            'shortversion|short-version|s',
            'html4tags',
            'htmlroot|r=s',
            'imageroot|i=s',
        );
        if ($cli_opts{'help'}) {
            # exec only returns on failure; report it instead of silently
            # carrying on and waiting for input.
            exec('perldoc', $0)
                or die "Markdown: unable to run perldoc: $!\n";
        }
        if ($cli_opts{'version'}) { # Version info
            print "\nThis is Markdown, version $VERSION.\n";
            print "Copyright (C) 2004 John Gruber\n";
            print "Copyright (C) 2015 Kyle J. McKay\n";
            exit 0;
        }
        if ($cli_opts{'shortversion'}) { # Just the version number string.
            print $VERSION;
            exit 0;
        }
        if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
            $options{empty_element_suffix} = ">";
        }
        if ($cli_opts{'htmlroot'}) { # Use URL prefix
            $options{url_prefix} = $cli_opts{'htmlroot'};
        }
        if ($cli_opts{'imageroot'}) { # Use image URL prefix
            $options{img_prefix} = $cli_opts{'imageroot'};
        }

        #### Process incoming text: ###########################
        my $text;
        {
            local $/; # Slurp the whole file
            $text = <>;
        }
        print Markdown($text, \%options);
    }
}
sub Markdown {
#
# Main function. The order in which other subs are called here is
# essential. Link and image substitutions need to happen before
# _EscapeSpecialChars(), so that any *'s or _'s in the <a>
# and <img> tags get encoded.
#
# Parameters:
#   $text - the Markdown source (undef is treated as the empty string)
#   ...   - options, either a single hashref or a list of name/value
#           pairs; they override the defaults set below
# Returns: the converted text followed by a newline.
#
    my $text = shift;
    defined $text or $text = '';

    # Any remaining arguments after the first are options; either a single
    # hashref or a list of name, value pairs.
    my %args = (ref($_[0]) eq "HASH") ? %{ $_[0] } : @_;
    %opt = (
        # set initial defaults
        empty_element_suffix => $g_empty_element_suffix,
        tab_width            => $g_tab_width,
        url_prefix           => "", # Prefixed to non-absolute URLs
        img_prefix           => "", # Prefixed to non-absolute image URLs
        # caller-supplied values win over the defaults
        %args,
    );

    # Clear the globals. If we don't clear these, you get conflicts
    # from other articles when generating a page which contains more than
    # one article (e.g. an index page that shows the N most recent
    # articles):
    %g_urls        = ();
    %g_titles      = ();
    %g_html_blocks = ();
    $g_list_level  = 0;

    # Standardize line endings:
    $text =~ s{\r\n}{\n}g; # DOS to Unix
    $text =~ s{\r}{\n}g;   # Mac to Unix

    # Make sure $text ends with a couple of newlines:
    $text .= "\n\n";

    # Handle backticks-delimited code blocks
    $text = _HashBTCodeBlocks($text);

    # Convert all tabs to spaces.
    $text = _Detab($text);

    # Strip any lines consisting only of spaces and tabs.
    # This makes subsequent regexen easier to write, because we can
    # match consecutive blank lines with /\n+/ instead of something
    # contorted like /[ \t]*\n+/ .
    $text =~ s/^[ \t]+$//mg;

    # Turn block-level HTML blocks into hash entries
    $text = _HashHTMLBlocks($text);

    # Strip link definitions, store in hashes.
    $text = _StripLinkDefinitions($text);

    $text = _RunBlockGamut($text);

    $text = _UnescapeSpecialChars($text);

    return $text . "\n";
}
sub _HashBTCodeBlocks {
#
# Process Markdown backticks (```) delimited code blocks.
#
# Each fenced block is rendered to <pre><code>...</code></pre>, stored in
# %g_html_blocks under the MD5 of the rendered HTML, and replaced in the
# text by that key surrounded by blank lines.  Returns the modified text.
#
    my $text = shift;

    $text =~ s{
        (?:\n|\A)
        ``(`+)[ \t]*(?:([\w.+-]+)[ \t]*)?\n
        ( # $3 = the code block -- one or more lines, starting with ```
          (?:
            .*\n+
          )+?
        )
        (?:(?:``\1[ \t]*(?:\n|\Z))|\Z) # and ending with ``` or end of document
    }{
        # $2 contains syntax highlighting to use if defined
        my $codeblock = $3;
        $codeblock =~ s/[ \t]+$//mg;          # trim trailing spaces on lines
        $codeblock = _Detab($codeblock, 8);   # physical tab stops are always 8
        $codeblock =~ s/\A\n+//;              # trim leading newlines
        $codeblock =~ s/\s+\z//;              # trim trailing whitespace
        $codeblock = _EncodeCode($codeblock); # or run highlighter here
        $codeblock = "<pre><code>" . $codeblock . "\n</code></pre>";

        my $key = md5_hex($codeblock);
        $g_html_blocks{$key} = $codeblock;
        "\n\n" . $key . "\n\n";
    }egmx;

    return $text;
}
sub _StripLinkDefinitions {
#
# Strips link definitions from text, stores the URLs and titles in
# the %g_urls and %g_titles hashes (keyed by lower-cased link id), and
# returns the text with the definitions removed.
#
    my $text = shift;
    my $less_than_tab = $opt{tab_width} - 1;

    # Link defs are in the form: ^[id]: url "optional title"
    # Each pass of the loop removes one definition.
    while ($text =~ s{
        ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
          [ \t]*
          \n?               # maybe *one* newline
          [ \t]*
        <?(\S+?)>?          # url = $2
          [ \t]*
          \n?               # maybe one newline
          [ \t]*
        (?:
            (?<=\s)         # lookbehind for whitespace
            ["(]
            (.+?)           # title = $3
            [")]
            [ \t]*
        )?                  # title is optional
        (?:\n+|\Z)
    }
    {}mx) {
        # Copy the captures before running any further regex.
        my ($id, $url, $title) = (lc $1, $2, $3); # Link IDs are case-insensitive
        $g_urls{$id} = _EncodeAmpsAndAngles($url);

        # Test definedness rather than truth so that a title of "0" is kept.
        if (defined $title) {
            $title =~ s/\042/&quot;/g;
            $g_titles{$id} = $title;
        }
    }

    return $text;
}
sub _HashHTMLBlocks {
#
# Replaces block-level HTML in $text with placeholder keys.
#
# Every matched block is stored verbatim in %g_html_blocks under its MD5
# hex digest and replaced in the text by that digest surrounded by blank
# lines.  Returns the modified text.
#
    my $text = shift;
    my $less_than_tab = $opt{tab_width} - 1;

    # Shared replacement for all four substitutions below.
    my $stash_block = sub {
        my $html = shift;
        my $key  = md5_hex($html);
        $g_html_blocks{$key} = $html;
        return "\n\n" . $key . "\n\n";
    };

    # Hashify HTML blocks:
    # We only want to do this for block-level HTML tags, such as headers,
    # lists, and tables. That's because we still want to wrap <p>s around
    # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
    # phrase emphasis, and spans. The list of tags we're looking for is
    # hard-coded:
    my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
    my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;

    # First, look for nested blocks, e.g.:
    #   <div>
    #       <div>
    #       tags for inner block must be indented.
    #       </div>
    #   </div>
    #
    # The outermost tags must start at the left margin for this to match, and
    # the inner nested divs must be indented.
    # We need to do this before the next, more liberal match, because the next
    # match will start at the first `<div>` and stop at the first `</div>`.
    $text =~ s{
        (                       # save in $1
            ^                   # start of line (with /m)
            <($block_tags_a)    # start tag = $2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            </\2>               # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
    }{
        $stash_block->($1);
    }egmx;

    #
    # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
    #
    $text =~ s{
        (                       # save in $1
            ^                   # start of line (with /m)
            <($block_tags_b)    # start tag = $2
            \b                  # word break
            (.*\n)*?            # any number of lines, minimally matching
            .*</\2>             # the matching end tag
            [ \t]*              # trailing spaces/tabs
            (?=\n+|\Z)          # followed by a newline or end of document
        )
    }{
        $stash_block->($1);
    }egmx;

    # Special case just for <hr />. It was easier to make a special case than
    # to make the other regex more complicated.
    $text =~ s{
        (?:
            (?<=\n\n)           # Starting after a blank line
            |                   # or
            \A\n?               # the beginning of the doc
        )
        (                       # save in $1
            [ ]{0,$less_than_tab}
            <(hr)               # start tag = $2
            \b                  # word break
            ([^<>])*?           #
            /?>                 # the matching end tag
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
    }{
        $stash_block->($1);
    }egx;

    # Special case for standalone HTML comments:
    $text =~ s{
        (?:
            (?<=\n\n)           # Starting after a blank line
            |                   # or
            \A\n?               # the beginning of the doc
        )
        (                       # save in $1
            [ ]{0,$less_than_tab}
            (?s:
                <!
                (--.*?--\s*)+
                >
            )
            [ \t]*
            (?=\n{2,}|\Z)       # followed by a blank line or end of document
        )
    }{
        $stash_block->($1);
    }egx;

    return $text;
}
sub _RunBlockGamut {
#
# These are all the transformations that form block-level
# tags like paragraphs, headers, and list items.
#
# Takes the text to process and returns the converted text.  The order of
# the steps below is significant.
#
    my $text = shift;

    $text = _DoHeaders($text);

    # Do Horizontal Rules:
    $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$opt{empty_element_suffix}\n}gmx;
    $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$opt{empty_element_suffix}\n}gmx;
    $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$opt{empty_element_suffix}\n}gmx;

    $text = _DoLists($text);

    $text = _DoCodeBlocks($text);

    $text = _DoBlockQuotes($text);

    # We already ran _HashHTMLBlocks() before, in Markdown(), but that
    # was to escape raw HTML in the original Markdown source. This time,
    # we're escaping the markup we've just created, so that we don't wrap
    # <p> tags around block-level tags.
    $text = _HashHTMLBlocks($text);

    $text = _FormParagraphs($text);

    return $text;
}
sub _RunSpanGamut {
#
# These are all the transformations that occur *within* block-level
# tags like paragraphs, headers, and list items.
#
# Takes the text of one block and returns it with span-level markup
# converted.  The order of the steps below is significant.
#
    my $text = shift;

    $text = _DoCodeSpans($text);

    $text = _EscapeSpecialChars($text);

    # Process anchor and image tags. Images must come first,
    # because ![foo][f] looks like an anchor.
    $text = _DoImages($text);
    $text = _DoAnchors($text);

    # Make links out of things like `<http://example.com/>`
    # Must come after _DoAnchors(), because you can use < and >
    # delimiters in inline links like [this](<url>).
    $text = _DoAutoLinks($text);

    $text = _EncodeAmpsAndAngles($text);

    $text = _DoItalicsAndBoldAndStrike($text);

    # Do hard breaks:
    $text =~ s/ {2,}\n/ <br$opt{empty_element_suffix}\n/g;

    return $text;
}
sub _EscapeSpecialChars {
#
# Splits $text into tokens with _TokenizeHTML() and rebuilds it:
#   * inside "tag" tokens, *, _ and ~ are replaced with their entries in
#     %g_escape_table so they cannot be mistaken for emphasis markup;
#   * every other token is passed through _EncodeBackslashEscapes().
# Returns the rebuilt text.
#
    my $text   = shift;
    my $tokens = _TokenizeHTML($text);

    $text = ''; # rebuild $text from the tokens

    foreach my $cur_token (@$tokens) {
        my ($type, $content) = @{$cur_token}[0, 1];
        if ($type eq "tag") {
            # Within tags, encode *, _ and ~ so they don't conflict
            # with their use in Markdown for italics and strong.
            # We're replacing each such character with its
            # corresponding MD5 checksum value; this is likely
            # overkill, but it should prevent us from colliding
            # with the escape values by accident.
            $content =~ s/([*_~])/$g_escape_table{$1}/g;
            # Keep the token itself updated, as the original code did.
            $cur_token->[1] = $content;
        }
        else {
            $content = _EncodeBackslashEscapes($content);
        }
        $text .= $content;
    }
    return $text;
}
sub _DoAnchors {
#
# Turn Markdown link shortcuts into XHTML <a> tags.
#
# Handles reference-style links ([text][id], looked up in %g_urls and
# %g_titles) and then inline links ([text](url "title")).  A reference
# whose id is unknown is left untouched.  Returns the converted text.
#
    my $text = shift;

    # Hide *, _ and ~ behind their %g_escape_table values so that they
    # cannot conflict with italics, bold and strike-through later on.
    my $hide_emphasis = sub {
        my $str = shift;
        $str =~ s/([*_~])/$g_escape_table{$1}/g;
        return $str;
    };

    #
    # First, handle reference-style links: [link text] [id]
    #
    $text =~ s{
        (                           # wrap whole match in $1
          \[
            ($g_nested_brackets)    # link text = $2
          \]

          [ ]?                      # one optional space
          (?:\n[ ]*)?               # one optional newline followed by spaces

          \[
            (.*?)                   # id = $3
          \]
        )
    }{
        my ($whole_match, $link_text, $link_id) = ($1, $2, lc $3);
        my $result;

        if ($link_id eq "") {
            $link_id = lc $link_text; # for shortcut links like [this][].
        }

        if (defined $g_urls{$link_id}) {
            my $url = $hide_emphasis->(_PrefixURL($g_urls{$link_id}));
            $result = "<a href=\"$url\"";
            if ( defined $g_titles{$link_id} ) {
                my $title = $hide_emphasis->($g_titles{$link_id});
                $result .= " title=\"$title\"";
            }
            $result .= ">$link_text</a>";
        }
        else {
            $result = $whole_match;
        }
        $result;
    }xsge;

    #
    # Next, inline-style links: [link text](url "optional title")
    #
    $text =~ s{
        (                           # wrap whole match in $1
          \[
            ($g_nested_brackets)    # link text = $2
          \]
          \(                        # literal paren
            [ \t]*
            <?(.*?)>?               # href = $3
            [ \t]*
            (                       # $4
              (['\042])             # quote char = $5
              (.*?)                 # Title = $6
              \5                    # matching quote
            )?                      # title is optional
          \)
        )
    }{
        my ($whole_match, $link_text, $url, $title) = ($1, $2, $3, $6);

        $url = $hide_emphasis->(_PrefixURL($url));
        my $result = "<a href=\"$url\"";

        if (defined $title) {
            $title =~ s/\042/&quot;/g;
            $title = $hide_emphasis->($title);
            $result .= " title=\"$title\"";
        }

        $result .= ">$link_text</a>";

        $result;
    }xsge;

    return $text;
}
sub _DoImages {
#
# Turn Markdown image shortcuts into <img> tags.
#
# Handles reference-style images (![alt][id], looked up in %g_urls and
# %g_titles) and then inline images (![alt](url "title")).  A reference
# whose id is unknown is left untouched.  A title attribute is emitted
# only when a non-empty title was supplied.  Returns the converted text.
#
    my $text = shift;

    # Hide *, _ and ~ behind their %g_escape_table values so that they
    # cannot conflict with italics, bold and strike-through later on.
    my $hide_emphasis = sub {
        my $str = shift;
        $str =~ s/([*_~])/$g_escape_table{$1}/g;
        return $str;
    };

    #
    # First, handle reference-style labeled images: ![alt text][id]
    #
    $text =~ s{
        (                   # wrap whole match in $1
          !\[
            (.*?)           # alt text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)           # id = $3
          \]

        )
    }{
        my ($whole_match, $alt_text, $link_id) = ($1, $2, lc $3);
        my $result;

        if ($link_id eq "") {
            $link_id = lc $alt_text; # for shortcut links like ![this][].
        }

        $alt_text =~ s/"/&quot;/g;
        if (defined $g_urls{$link_id}) {
            my $url = $hide_emphasis->(_PrefixURL($g_urls{$link_id}));
            $result = "<img src=\"$url\" alt=\"$alt_text\"";
            if (defined $g_titles{$link_id}) {
                my $title = $hide_emphasis->($g_titles{$link_id});
                $result .= " title=\"$title\"";
            }
            $result .= $opt{empty_element_suffix};
        }
        else {
            # If there's no such link ID, leave intact:
            $result = $whole_match;
        }

        $result;
    }xsge;

    #
    # Next, handle inline images: ![alt text](url "optional title")
    # Don't forget: encode * and _
    #
    $text =~ s{
        (                   # wrap whole match in $1
          !\[
            (.*?)           # alt text = $2
          \]
          \(                # literal paren
            [ \t]*
            <?(\S+?)>?      # src url = $3
            [ \t]*
            (               # $4
              (['\042])     # quote char = $5
              (.*?)         # title = $6
              \5            # matching quote
              [ \t]*
            )?              # title is optional
          \)
        )
    }{
        my ($whole_match, $alt_text, $url, $title) = ($1, $2, $3, $6);

        $url = _PrefixURL($url);
        $alt_text =~ s/"/&quot;/g;
        $url = $hide_emphasis->($url);
        my $result = "<img src=\"$url\" alt=\"$alt_text\"";

        # Only emit a title when one was actually given; the reference-style
        # branch above never produces an empty title attribute either.
        if (defined $title && $title ne '') {
            $title =~ s/"/&quot;/g;
            $title = $hide_emphasis->($title);
            $result .= " title=\"$title\"";
        }
        $result .= $opt{empty_element_suffix};

        $result;
    }xsge;

    return $text;
}
sub _DoHeaders {
#
# Converts Setext-style and atx-style headers in $text to <h1>..<h6>
# elements, running the header text through _RunSpanGamut().  Returns the
# converted text.
#
    my $text = shift;

    # Setext-style headers:
    #     Header 1
    #     ========
    #
    #     Header 2
    #     --------
    #
    #     Header 3
    #     ~~~~~~~~
    #
    # Each pattern also accepts an optional line of the same rule character
    # above the header text.
    $text =~ s{ ^(?:=+[ \t]*\n)?(.+)[ \t]*\n=+[ \t]*\n+ }{
        "<h1>" . _RunSpanGamut($1) . "</h1>\n\n";
    }egmx;

    $text =~ s{ ^(?:-+[ \t]*\n)?(.+)[ \t]*\n-+[ \t]*\n+ }{
        "<h2>" . _RunSpanGamut($1) . "</h2>\n\n";
    }egmx;

    $text =~ s{ ^(?:~+[ \t]*\n)?(.+)[ \t]*\n~+[ \t]*\n+ }{
        "<h3>" . _RunSpanGamut($1) . "</h3>\n\n";
    }egmx;

    # atx-style headers:
    #   # Header 1
    #   ## Header 2
    #   ## Header 2 with closing hashes ##
    #   ...
    #   ###### Header 6
    #
    $text =~ s{
        ^(\#{1,6})  # $1 = string of #'s
        [ \t]*
        (.+?)       # $2 = Header text
        [ \t]*
        \#*         # optional closing #'s (not counted)
        \n+
    }{
        my $h_level = length($1);
        "<h$h_level>" . _RunSpanGamut($2) . "</h$h_level>\n\n";
    }egmx;

    return $text;
}
sub _DoLists {
#
# Form HTML ordered (numbered) and unordered (bulleted) lists.
#
# Takes the text to process and returns it with every recognised list
# replaced by <ul>/<ol> markup; the items themselves are rendered by
# _ProcessListItems().
#
    my $text = shift;
    my $less_than_tab = $opt{tab_width} - 1;

    # Re-usable patterns to match list item bullets and number markers:
    my $marker_ul  = qr/[*+-]/;
    my $marker_ol  = qr/\d+[.]/;
    my $marker_any = qr/(?:$marker_ul|$marker_ol)/;

    # Re-usable pattern to match any entire ul or ol list:
    my $whole_list = qr{
        (                               # $1 = whole list
          (                             # $2
            [ ]{0,$less_than_tab}
            (${marker_any})             # $3 = first list item marker
            [ \t]+
          )
          (?s:.+?)
          (                             # $4
              \z
            |
              \n{2,}
              (?=\S)
              (?!                       # Negative lookahead for another list item marker
                [ \t]*
                ${marker_any}[ \t]+
              )
          )
        )
    }mx;

    # Shared replacement for both substitutions below: wraps the rendered
    # items in <ul> or <ol> depending on the first item's marker.
    my $render_list = sub {
        my ($list, $first_marker) = @_; # copy before any further regex runs
        my $list_type = ($first_marker =~ m/$marker_ul/) ? "ul" : "ol";
        # Turn double returns into triple returns, so that we can make a
        # paragraph for the last item in a list, if necessary:
        $list =~ s/\n{2,}/\n\n\n/g;
        my $result = _ProcessListItems($list, $marker_any);
        return "<$list_type>\n" . $result . "</$list_type>\n";
    };

    # We use a different prefix before nested lists than top-level lists.
    # See extended comment in _ProcessListItems().
    #
    # Note: There's a bit of duplication here. My original implementation
    # created a scalar regex pattern as the conditional result of the test on
    # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
    # substitution once, using the scalar as the pattern. This worked,
    # everywhere except when running under MT on my hosting account at Pair
    # Networks. There, this caused all rebuilds to be killed by the reaper (or
    # perhaps they crashed, but that seems incredibly unlikely given that the
    # same script on the same server ran fine *except* under MT. I've spent
    # more time trying to figure out why this is happening than I'd like to
    # admit. My only guess, backed up by the fact that this workaround works,
    # is that Perl optimizes the substitution when it can figure out that the
    # pattern will never change, and when this optimization isn't on, we run
    # afoul of the reaper. Thus, the slightly redundant code that uses two
    # static s/// patterns rather than one conditional pattern.

    if ($g_list_level) {
        $text =~ s{
            ^
            $whole_list
        }{
            $render_list->($1, $3);
        }egmx;
    }
    else {
        $text =~ s{
            (?:(?<=\n\n)|\A\n?)
            $whole_list
        }{
            $render_list->($1, $3);
        }egmx;
    }

    return $text;
}
sub _ProcessListItems {
#
# Process the contents of a single ordered or unordered list, splitting it
# into individual list items.
#
# Parameters:
#   $list_str   - the text of the whole list
#   $marker_any - compiled pattern matching any list item marker
# Returns: the list text with each item wrapped in <li>...</li>.
#
    my $list_str   = shift;
    my $marker_any = shift;

    # The $g_list_level global keeps track of when we're inside a list.
    # Each time we enter a list, we increment it; when we leave a list,
    # we decrement. If it's zero, we're not in a list anymore.
    #
    # We do this because when we're not inside a list, we want to treat
    # something like this:
    #
    #       I recommend upgrading to version
    #       8. Oops, now this line is treated
    #       as a sub-list.
    #
    # As a single paragraph, despite the fact that the second line starts
    # with a digit-period-space sequence.
    #
    # Whereas when we're inside a list (or sub-list), that line will be
    # treated as the start of a sub-list. What a kludge, huh? This is
    # an aspect of Markdown's syntax that's hard to parse perfectly
    # without resorting to mind-reading. Perhaps the solution is to
    # change the syntax rules such that sub-lists must start with a
    # starting cardinal number; e.g. "1." or "a.".

    $g_list_level++;

    # trim trailing blank lines:
    $list_str =~ s/\n{2,}\z/\n/;

    $list_str =~ s{
        (\n)?                           # leading line = $1
        (^[ \t]*)                       # leading whitespace = $2
        ($marker_any) [ \t]+            # list marker = $3
        ((?s:.+?)                       # list item text = $4
        (\n{1,2}))
        (?= \n* (\z | \2 ($marker_any) [ \t]+))
    }{
        # Copy the captures before running any further regex.
        my $item         = $4;
        my $leading_line = $1;

        if ($leading_line or ($item =~ m/\n{2,}/)) {
            # Item is separated by blank lines or contains one: render
            # its contents as blocks.
            $item = _RunBlockGamut(_Outdent($item));
        }
        else {
            # Recursion for sub-lists:
            $item = _DoLists(_Outdent($item));
            chomp $item;
            $item = _RunSpanGamut($item);
        }

        "<li>" . $item . "</li>\n";
    }egmx;

    $g_list_level--;
    return $list_str;
}
sub _DoCodeBlocks {
#
# Process Markdown `<pre><code>` blocks.
#
# Runs of lines indented by a tab or by $opt{tab_width} spaces are
# outdented, encoded with _EncodeCode(), detabbed and wrapped in
# <pre><code>...</code></pre>.  Returns the converted text.
#
    my $text = shift;

    $text =~ s{
        (?:\n\n|\A)
        (               # $1 = the code block -- one or more lines, starting with a space/tab
          (?:
            (?:[ ]{$opt{tab_width}} | \t) # Lines must start with a tab or a tab-width of spaces
            .*\n+
          )+
        )
        ((?=^[ ]{0,$opt{tab_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
    }{
        my $codeblock = $1;

        $codeblock = _EncodeCode(_Outdent($codeblock));
        $codeblock = _Detab($codeblock);
        $codeblock =~ s/\A\n+//; # trim leading newlines
        $codeblock =~ s/\s+\z//; # trim trailing whitespace

        "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
    }egmx;

    return $text;
}
sub _DoCodeSpans {
#
# * Backtick quotes are used for <code></code> spans.
#
# * You can use multiple backticks as the delimiters if you want to
#   include literal backticks in the code span. So, this input:
#
#       Just type ``foo `bar` baz`` at the prompt.
#
#   Will translate to:
#
#       <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
#
#   There's no arbitrary limit to the number of backticks you
#   can use as delimiters. If you need three consecutive backticks
#   in your code, use four for delimiters, etc.
#
# * You can use spaces to get literal backticks at the edges:
#
#       ... type `` `bar` `` ...
#
#   Turns to:
#
#       ... type <code>`bar`</code> ...
#
# Takes the text to process and returns it with code spans converted; the
# span contents are passed through _EncodeCode().
#
    my $text = shift;

    $text =~ s{
        (`+)        # $1 = Opening run of `
        (.+?)       # $2 = The code block
        (?<!`)
        \1          # Matching closer
        (?!`)
    }{
        my $c = "$2";
        $c =~ s/^[ \t]*//g; # leading whitespace
        $c =~ s/[ \t]*$//g; # trailing whitespace
        $c = _EncodeCode($c);
        "<code>$c</code>";
    }egsx;

    return $text;
}
sub _EncodeCode {
#
# Encode/escape certain characters inside Markdown code runs.
# The point is that in code, these characters are literals,
# and lose their special Markdown meanings.
#
# Takes the code text and returns the encoded copy.
#
    my $code = shift;

    # Encode all ampersands; HTML entities are not
    # entities within a Markdown code span.
    $code =~ s/&/&amp;/g;

    # Encode $'s, but only if we're running under Blosxom.
    # (Blosxom interpolates Perl variables in article bodies.)
    {
        no warnings 'once';
        if (defined($blosxom::version)) {
            $code =~ s/\$/&#036;/g;
        }
    }

    # Do the angle bracket song and dance:
    $code =~ s/</&lt;/g;
    $code =~ s/>/&gt;/g;

    # Now, escape characters that are magic in Markdown.  A character class
    # avoids the bare literal "{" that newer perls warn about or reject in
    # a pattern, and one pass is enough because the replacement values are
    # MD5 hex strings that contain none of these characters.
    $code =~ s/([*_~{}\[\]\\])/$g_escape_table{$1}/g;

    return $code;
}
sub _DoItalicsAndBoldAndStrike {
#
# Converts emphasis markup in $text and returns the result:
#   **text** / __text__  =>  <strong>text</strong>
#   ~~text~~             =>  <strike>text</strike>
#   *text*   / _text_    =>  <em>text</em>
# The underscore forms only match when the delimiters are not adjacent
# to a word character on the outside, so snake_case_names are left alone.
#
    my $text = shift;

    # <strong> must go first:
    $text =~ s{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }
        {<strong>$1</strong>}gsx;
    $text =~ s{ (?<!\w) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\w) }
        {<strong>$1</strong>}gsx;

    $text =~ s{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ }
        {<strike>$1</strike>}gsx;

    $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
        {<em>$1</em>}gsx;
    $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
        {<em>$1</em>}gsx;

    return $text;
}
sub _DoBlockQuotes {
#
# Converts runs of lines quoted with a leading '>' into <blockquote>
# elements.  The quoted text has one level of '>' removed and is then
# rendered recursively with _RunBlockGamut().  Returns the converted text.
#
    my $text = shift;

    $text =~ s{
        (                           # Wrap whole match in $1
          (
            ^[ \t]*>[ \t]?          # '>' at the start of a line
              .+\n                  # rest of the first line
            (.+\n)*                 # subsequent consecutive lines
            \n*                     # blanks
          )+
        )
    }{
        my $bq = $1;
        $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
        $bq =~ s/^[ \t]+$//mg;       # trim whitespace-only lines
        $bq = _RunBlockGamut($bq);   # recurse

        # Indent every line of the quoted markup.  The /m flag matters:
        # without it only the first line was indented, yet the <pre>
        # fix-up below removed two spaces from *every* line, eating
        # genuine leading spaces inside quoted code blocks.
        $bq =~ s/^/  /mg;
        # These leading spaces screw with <pre> content, so we need to fix that:
        $bq =~ s{
            (\s*<pre>.+?</pre>)
        }{
            my $pre = $1;
            $pre =~ s/^  //mg;
            $pre;
        }egsx;

        "<blockquote>\n$bq\n</blockquote>\n\n";
    }egmx;

    return $text;
}
sub _FormParagraphs {
#
# Params:
#   $text - string to process with html <p> tags
#
# Splits $text on blank lines.  Chunks that are keys of %g_html_blocks
# are swapped back for the stored HTML; every other chunk is run through
# _RunSpanGamut() and wrapped in <p>...</p>.  Returns the chunks joined
# by blank lines.
#
    my $text = shift;

    # Strip leading and trailing lines:
    $text =~ s/\A\n+//;
    $text =~ s/\n+\z//;

    my @grafs = split(/\n{2,}/, $text);

    #
    # Wrap <p> tags.
    #
    foreach my $graf (@grafs) {
        next if defined $g_html_blocks{$graf};
        $graf = _RunSpanGamut($graf);
        $graf =~ s/^([ \t]*)/<p>/;
        $graf .= "</p>";
    }

    #
    # Unhashify HTML blocks
    #
    foreach my $graf (@grafs) {
        $graf = $g_html_blocks{$graf} if defined $g_html_blocks{$graf};
    }

    return join "\n\n", @grafs;
}
sub _EncodeAmpsAndAngles {
#
# Smart processing for ampersands and angle brackets that need to be encoded.
#
# Takes a string and returns a copy in which
#   * every '&' that does not start something shaped like an entity
#     (&name; &#123; &#x1F;) becomes &amp;
#   * every '<' that is not followed by a letter, '/', '?', '$' or '!'
#     becomes &lt;
#
    my $text = shift;

    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
    # http://bumppo.net/projects/amputator/
    $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;

    # Encode naked <'s
    $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;

    return $text;
}
sub _EncodeBackslashEscapes {
#
# Parameter: String.
# Returns:   The string, after processing the following backslash
#            escape sequences:
#                \\ \` \* \_ \~ \{ \} \[ \] \( \) \> \# \+ \- \. \!
#            Each sequence is replaced by the %g_escape_table value of
#            the escaped character.
#
    my $text = shift;

    # A single left-to-right pass consumes an escaped backslash before the
    # character after it is looked at, which is what processing escaped
    # backslashes first achieves; the replacement values are MD5 hex
    # strings, so they can never form a new escape sequence.
    $text =~ s/\\([\\`*_~{}\[\]()>#+\-.!])/$g_escape_table{$1}/g;

    return $text;
}
sub _DoAutoLinks {
#
# Turns <http://...>, <https://...> and <ftp://...> into links, and
# <address@domain.foo> (optionally prefixed with "mailto:") into an
# obfuscated mailto link built by _EncodeEmailAddress().  Returns the
# converted text.
#
    my $text = shift;

    $text =~ s{<((https?|ftp):[^'\042>\s]+)>}{<a href="$1">$1</a>}gi;

    # Email addresses: <address@domain.foo>
    $text =~ s{
        <
        (?:mailto:)?
        (
            [-.\w]+
            \@
            [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
        )
        >
    }{
        _EncodeEmailAddress( _UnescapeSpecialChars($1) );
    }egix;

    return $text;
}
sub _EncodeEmailAddress {
    #
    # Input:  an email address, e.g. "foo@example.com"
    #
    # Output: the email address as a mailto link, with each character
    #         of the address encoded as either a decimal or hex entity
    #         (or occasionally left raw), in the hopes of foiling most
    #         address harvesting spam bots.  E.g.:
    #
    #   <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;...">
    #       &#102;&#111;&#111;&#64;...</a>
    #
    # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
    # mailing list: <http://tinyurl.com/yu7ue>
    #
    # The random generator is deliberately NOT reseeded here: rand() seeds
    # itself on first use, and reseeding on every call both weakens the
    # sequence and prevents callers from getting reproducible output.
    #
    my $addr = shift;

    my @encode = (
        sub { '&#' . ord(shift) . ';' },                    # decimal entity
        sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },  # hex entity
        sub { shift },                                      # raw character
    );

    my $href = "mailto:" . $addr;

    $href =~ s{(.)}{
        my $char = $1;
        if ( $char eq '@' ) {
            # this *must* be encoded: pick decimal or hex, never raw.
            $char = $encode[int rand 2]->($char);
        } elsif ( $char ne ':' ) {
            # leave ':' alone (it marks the end of "mailto:" below)
            my $r = rand;
            # roughly 10% raw, 45% hex, 45% dec
            $char = (
                $r > .9  ? $encode[2]->($char) :
                $r < .45 ? $encode[1]->($char) :
                           $encode[0]->($char)
            );
        }
        $char;
    }ge;

    # The visible link text is the encoded address without the encoded
    # "mailto:" prefix, i.e. everything after the first (unencoded) colon.
    (my $label = $href) =~ s{\A.+?:}{};

    return qq{<a href="$href">$label</a>};
}
sub _UnescapeSpecialChars {
    #
    # Swap back in all the special characters we've hidden.
    #
    # Parameter:  String in which special characters may have been replaced
    #             by their %g_escape_table values.
    # Returns:    The string with every such value turned back into the
    #             character it stands for.
    #
    my ($text) = @_;

    for my $char (keys %g_escape_table) {
        my $digest = $g_escape_table{$char};
        $text =~ s/$digest/$char/g;
    }

    return $text;
}
sub _TokenizeHTML {
    #
    # Parameter:  String containing HTML markup.
    # Returns:    Reference to an array of the tokens comprising the input
    #             string. Each token is either a tag (possibly with nested
    #             tags contained therein, such as <a href="<MTFoo>">), or a
    #             run of text between tags. Each element of the array is a
    #             two-element array; the first is either 'tag' or 'text';
    #             the second is the actual value.
    #
    # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
    #   <http://www.bradchoate.com/past/mtregex.php>
    #
    my ($html) = @_;

    my @tokens;
    my $cursor = 0;    # offset just past the last tag consumed so far

    # Tags may contain other tags up to $depth levels deep; the pattern is
    # built by repeating the opening half and then closing each level.
    my $depth = 6;
    my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
    my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
                   (?s: <\? .*? \?> ) |              # processing instruction
                   $nested_tags/ix;                  # nested tags

    while ($html =~ m/($match)/g) {
        my $tag       = $1;
        my $tag_end   = pos $html;
        my $tag_start = $tag_end - length $tag;

        # Any text between the previous tag and this one is its own token.
        push @tokens, ['text', substr($html, $cursor, $tag_start - $cursor)]
            if $cursor < $tag_start;

        push @tokens, ['tag', $tag];
        $cursor = $tag_end;
    }

    # Trailing text after the final tag (or the whole string if no tags).
    my $total = length $html;
    push @tokens, ['text', substr($html, $cursor, $total - $cursor)]
        if $cursor < $total;

    return \@tokens;
}
sub _Outdent {
    #
    # Remove one level of line-leading tabs or spaces.
    #
    # Parameter:  String of (possibly multi-line) text.
    # Returns:    The text with, on every line, either one leading tab or up
    #             to $opt{tab_width} leading spaces removed.
    #
    my ($indented) = @_;

    $indented =~ s/^(\t|[ ]{1,$opt{tab_width}})//gm;

    return $indented;
}
sub _Detab {
    #
    # Expand tabs to spaces.
    #
    # Parameters: String of text; optional tab width (a false or missing
    #             value selects $opt{tab_width}).
    # Returns:    The text with each run of tabs replaced by the number of
    #             spaces needed to reach the corresponding tab stop.
    #
    my ($text, $ts) = @_;
    $ts ||= $opt{tab_width};

    # From the Perl camel book "Fluent Perl" section (slightly modified).
    # $lead is the text since the start of the line or the previous run of
    # tabs; its length modulo the tab width is how far past a tab stop the
    # current run begins.
    $text =~ s{(.*?)(\t+)}{
        my ($lead, $tabs) = ($1, $2);
        $lead . ' ' x (length($tabs) * $ts - length($lead) % $ts);
    }ge;

    return $text;
}
sub _PrefixURL {
    #
    # Add URL prefix if needed.
    #
    # Parameter:  A URL taken from a link or image.
    # Returns:    The URL unchanged when no prefix is configured or when the
    #             URL is absolute (starts with "//" or with a scheme such as
    #             "http:"); otherwise the URL with $opt{img_prefix} (for
    #             URLs ending in an image suffix, when that prefix is set)
    #             or $opt{url_prefix} prepended, joined by exactly one '/'.
    #
    my $url = shift;

    # Nothing to do unless at least one prefix is configured.
    return $url unless $opt{url_prefix} ne '' || $opt{img_prefix} ne '';

    # Protocol-relative and scheme-qualified URLs are already absolute.
    return $url if $url =~ m,^//, || $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/;

    my $ans = $opt{url_prefix};

    # The image prefix overrides the general one for image suffixes.  The
    # SVG alternative is "svgz?" so that both ".svg" and ".svgz" qualify.
    $ans = $opt{img_prefix}
        if $opt{img_prefix} ne '' && $url =~ /\.(?:png|gif|jpe?g|svgz?)$/i;

    # Only an image prefix was configured and this is not an image.
    return $url unless $ans ne '';

    # Join prefix and URL with a single slash.
    $ans .= '/' if substr($ans, -1, 1) ne '/';
    $ans .= substr($url, 0, 1) eq '/' ? substr($url, 1) : $url;
    return $ans;
}
1445 __DATA__
1448 =pod
1450 =head1 NAME
1452 B<Markdown>
1455 =head1 SYNOPSIS
1457 B<Markdown.pl> [ B<--help> ] [ B<--html4tags> ] [ B<--htmlroot>=I<prefix> ]
1458 [ B<--imageroot>=I<prefix> ] [ B<--version> ] [ B<--shortversion> ]
1459 [ I<file> ... ]
1462 =head1 DESCRIPTION
1464 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1465 easy-to-write structured text format into HTML. Markdown's text format
1466 is most similar to that of plain text email, and supports features such
1467 as headers, *emphasis*, code blocks, blockquotes, and links.
1469 Markdown's syntax is designed not as a generic markup language, but
1470 specifically to serve as a front-end to (X)HTML. You can use span-level
1471 HTML tags anywhere in a Markdown document, and you can use block level
1472 HTML tags (like <div> and <table>) as well.
1474 For more information about Markdown's syntax, see the `basics.text`
1475 and `syntax.text` files included with `Markdown.pl`.
1478 =head1 OPTIONS
1480 Use "--" to end switch parsing. For example, to open a file named "-z", use:
1482 Markdown.pl -- -z
1484 =over 4
1487 =item B<--html4tags>
1489 Use HTML 4 style for empty element tags, e.g.:
1491 <br>
1493 instead of Markdown's default XHTML style tags, e.g.:
1495 <br />
1498 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
1500 Any non-absolute URLs have I<prefix> prepended.
1503 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
1505 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
1506 if any) but only if they end in an image suffix.
1509 =item B<-V>, B<--version>
1511 Display Markdown's version number and copyright information.
1514 =item B<-s>, B<--shortversion>
1516 Display the short-form version number.
1519 =item B<-h>, B<--help>
1521 Display Markdown's help.
1524 =back
1527 =head1 VERSION HISTORY
1529 See the readme file for detailed release notes for this version.
1531 1.0.4 - 05 Jun 2016
1533 1.0.3 - 06 Sep 2015
1535 1.0.2 - 03 Sep 2015
1537 1.0.1 - 14 Dec 2004
1539 1.0 - 28 Aug 2004
1542 =head1 AUTHORS
1544 John Gruber
1545 http://daringfireball.net
1546 http://daringfireball.net/projects/markdown/
1548 PHP port and other contributions by Michel Fortin
1549 http://michelf.com
1551 Additional enhancements and tweaks by Kyle J. McKay
1552 mackyle<at>gmail.com
1555 =head1 COPYRIGHT AND LICENSE
1557 Copyright (C) 2003-2004 John Gruber
1558 Copyright (C) 2015,2016 Kyle J. McKay
1559 All rights reserved.
1561 Redistribution and use in source and binary forms, with or without
1562 modification, are permitted provided that the following conditions are
1563 met:
1565 * Redistributions of source code must retain the above copyright
1566 notice, this list of conditions and the following disclaimer.
1568 * Redistributions in binary form must reproduce the above copyright
1569 notice, this list of conditions and the following disclaimer in the
1570 documentation and/or other materials provided with the distribution.
1572 * Neither the name "Markdown" nor the names of its contributors may
1573 be used to endorse or promote products derived from this software
1574 without specific prior written permission.
1576 This software is provided by the copyright holders and contributors "as
1577 is" and any express or implied warranties, including, but not limited
1578 to, the implied warranties of merchantability and fitness for a
1579 particular purpose are disclaimed. In no event shall the copyright owner
1580 or contributors be liable for any direct, indirect, incidental, special,
1581 exemplary, or consequential damages (including, but not limited to,
1582 procurement of substitute goods or services; loss of use, data, or
1583 profits; or business interruption) however caused and on any theory of
1584 liability, whether in contract, strict liability, or tort (including
1585 negligence or otherwise) arising in any way out of the use of this
1586 software, even if advised of the possibility of such damage.
1588 =cut