Add entries_cache_meta to general.
[blosxom-plugins.git] / general / Markdown
blob799fde43f463452f8319de9c2921a94ef9c1ab94
1 #!/usr/bin/perl
4 # Markdown -- A text-to-HTML conversion tool for web writers
6 # Copyright (c) 2004 John Gruber
7 # <http://daringfireball.net/projects/markdown/>
9 # With minor modifications made by the Blosxom Plugin Development Team
10 # <blosxom-devel@lists.sourceforge.net>.
14 package Markdown;
15 require 5.006_000;
16 use strict;
17 use warnings;
19 use Digest::MD5 qw(md5_hex);
20 use vars qw($VERSION);
21 $VERSION = '1.0.1b';
22 # Tue 14 Dec 2004
24 ## Disabled; causes problems under Perl 5.6.1:
25 # use utf8;
26 # binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html
30 # Global default settings:
32 my $g_empty_element_suffix = " />"; # Change to ">" for HTML output
33 my $g_tab_width = 4;
37 # Globals:
40 # Regex to match balanced [brackets]. See Friedl's
41 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
42 my $g_nested_brackets;
43 $g_nested_brackets = qr{
44 (?> # Atomic matching
45 [^\[\]]+ # Anything other than brackets
48 (??{ $g_nested_brackets }) # Recursive set of nested brackets
51 }x;
54 # Table of hash values for escaped characters:
55 my %g_escape_table;
56 foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
57 $g_escape_table{$char} = md5_hex($char);
61 # Global hashes, used by various utility routines
62 my %g_urls;
63 my %g_titles;
64 my %g_html_blocks;
66 # Used to track when we're inside an ordered or unordered list
67 # (see _ProcessListItems() for details):
68 my $g_list_level = 0;
71 #### Blosxom plug-in interface ##########################################
73 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
74 # which posts Markdown should process, using a "meta-markup: markdown"
75 # header. If it's set to 0 (the default), Markdown will process all
76 # entries.
77 my $g_blosxom_use_meta = 0;
79 sub start { 1; }
80 sub story {
81 my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
83 if ( (! $g_blosxom_use_meta) or
84 (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
86 $$body_ref = Markdown($$body_ref);
92 #### Movable Type plug-in interface #####################################
93 eval {require MT}; # Test to see if we're running in MT.
94 unless ($@) {
95 require MT;
96 import MT;
97 require MT::Template::Context;
98 import MT::Template::Context;
100 eval {require MT::Plugin}; # Test to see if we're running >= MT 3.0.
101 unless ($@) {
102 require MT::Plugin;
103 import MT::Plugin;
104 my $plugin = new MT::Plugin({
105 name => "Markdown",
106 description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
107 doc_link => 'http://daringfireball.net/projects/markdown/'
109 MT->add_plugin( $plugin );
112 MT::Template::Context->add_container_tag(MarkdownOptions => sub {
113 my $ctx = shift;
114 my $args = shift;
115 my $builder = $ctx->stash('builder');
116 my $tokens = $ctx->stash('tokens');
118 if (defined ($args->{'output'}) ) {
119 $ctx->stash('markdown_output', lc $args->{'output'});
122 defined (my $str = $builder->build($ctx, $tokens) )
123 or return $ctx->error($builder->errstr);
124 $str; # return value
127 MT->add_text_filter('markdown' => {
128 label => 'Markdown',
129 docs => 'http://daringfireball.net/projects/markdown/',
130 on_format => sub {
131 my $text = shift;
132 my $ctx = shift;
133 my $raw = 0;
134 if (defined $ctx) {
135 my $output = $ctx->stash('markdown_output');
136 if (defined $output && $output =~ m/^html/i) {
137 $g_empty_element_suffix = ">";
138 $ctx->stash('markdown_output', '');
140 elsif (defined $output && $output eq 'raw') {
141 $raw = 1;
142 $ctx->stash('markdown_output', '');
144 else {
145 $raw = 0;
146 $g_empty_element_suffix = " />";
149 $text = $raw ? $text : Markdown($text);
150 $text;
154 # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
155 my $smartypants;
158 no warnings "once";
159 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
162 if ($smartypants) {
163 MT->add_text_filter('markdown_with_smartypants' => {
164 label => 'Markdown With SmartyPants',
165 docs => 'http://daringfireball.net/projects/markdown/',
166 on_format => sub {
167 my $text = shift;
168 my $ctx = shift;
169 if (defined $ctx) {
170 my $output = $ctx->stash('markdown_output');
171 if (defined $output && $output eq 'html') {
172 $g_empty_element_suffix = ">";
174 else {
175 $g_empty_element_suffix = " />";
178 $text = Markdown($text);
179 $text = $smartypants->($text, '1');
184 else {
185 #### BBEdit/command-line text filter interface ##########################
186 # Needs to be hidden from MT (and Blosxom when running in static mode).
188 # We're only using $blosxom::version once; tell Perl not to warn us:
189 no warnings 'once';
190 unless ( defined($blosxom::version) ) {
191 use warnings;
193 #### Check for command-line switches: #################
194 my %cli_opts;
195 use Getopt::Long;
196 Getopt::Long::Configure('pass_through');
197 GetOptions(\%cli_opts,
198 'version',
199 'shortversion',
200 'html4tags',
202 if ($cli_opts{'version'}) { # Version info
203 print "\nThis is Markdown, version $VERSION.\n";
204 print "Copyright 2004 John Gruber\n";
205 print "http://daringfireball.net/projects/markdown/\n\n";
206 exit 0;
208 if ($cli_opts{'shortversion'}) { # Just the version number string.
209 print $VERSION;
210 exit 0;
212 if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
213 $g_empty_element_suffix = ">";
217 #### Process incoming text: ###########################
218 my $text;
220 local $/; # Slurp the whole file
221 $text = <>;
223 print Markdown($text);
229 sub Markdown {
231 # Main function. The order in which other subs are called here is
232 # essential. Link and image substitutions need to happen before
233 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
234 # and <img> tags get encoded.
236 my $text = shift;
238 # Clear the global hashes. If we don't clear these, you get conflicts
239 # from other articles when generating a page which contains more than
240 # one article (e.g. an index page that shows the N most recent
241 # articles):
242 %g_urls = ();
243 %g_titles = ();
244 %g_html_blocks = ();
247 # Standardize line endings:
248 $text =~ s{\r\n}{\n}g; # DOS to Unix
249 $text =~ s{\r}{\n}g; # Mac to Unix
251 # Make sure $text ends with a couple of newlines:
252 $text .= "\n\n";
254 # Convert all tabs to spaces.
255 $text = _Detab($text);
257 # Strip any lines consisting only of spaces and tabs.
258 # This makes subsequent regexen easier to write, because we can
259 # match consecutive blank lines with /\n+/ instead of something
260 # contorted like /[ \t]*\n+/ .
261 $text =~ s/^[ \t]+$//mg;
263 # Turn block-level HTML blocks into hash entries
264 $text = _HashHTMLBlocks($text);
266 # Strip link definitions, store in hashes.
267 $text = _StripLinkDefinitions($text);
269 $text = _RunBlockGamut($text);
271 $text = _UnescapeSpecialChars($text);
273 return $text . "\n";
277 sub _StripLinkDefinitions {
279 # Strips link definitions from text, stores the URLs and titles in
280 # hash references.
282 my $text = shift;
283 my $less_than_tab = $g_tab_width - 1;
285 # Link defs are in the form: ^[id]: url "optional title"
286 while ($text =~ s{
287 ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
288 [ \t]*
289 \n? # maybe *one* newline
290 [ \t]*
291 <?(\S+?)>? # url = $2
292 [ \t]*
293 \n? # maybe one newline
294 [ \t]*
296 (?<=\s) # lookbehind for whitespace
297 ["(]
298 (.+?) # title = $3
299 [")]
300 [ \t]*
301 )? # title is optional
302 (?:\n+|\Z)
304 {}mx) {
305 $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
306 if ($3) {
307 $g_titles{lc $1} = $3;
308 $g_titles{lc $1} =~ s/"/&quot;/g;
312 return $text;
316 sub _HashHTMLBlocks {
317 my $text = shift;
318 my $less_than_tab = $g_tab_width - 1;
320 # Hashify HTML blocks:
321 # We only want to do this for block-level HTML tags, such as headers,
322 # lists, and tables. That's because we still want to wrap <p>s around
323 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
324 # phrase emphasis, and spans. The list of tags we're looking for is
325 # hard-coded:
326 my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
327 my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
329 # First, look for nested blocks, e.g.:
330 # <div>
331 # <div>
332 # tags for inner block must be indented.
333 # </div>
334 # </div>
336 # The outermost tags must start at the left margin for this to match, and
337 # the inner nested divs must be indented.
338 # We need to do this before the next, more liberal match, because the next
339 # match will start at the first `<div>` and stop at the first `</div>`.
340 $text =~ s{
341 ( # save in $1
342 ^ # start of line (with /m)
343 <($block_tags_a) # start tag = $2
344 \b # word break
345 (.*\n)*? # any number of lines, minimally matching
346 </\2> # the matching end tag
347 [ \t]* # trailing spaces/tabs
348 (?=\n+|\Z) # followed by a newline or end of document
351 my $key = md5_hex($1);
352 $g_html_blocks{$key} = $1;
353 "\n\n" . $key . "\n\n";
354 }egmx;
358 # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
360 $text =~ s{
361 ( # save in $1
362 ^ # start of line (with /m)
363 <($block_tags_b) # start tag = $2
364 \b # word break
365 (.*\n)*? # any number of lines, minimally matching
366 .*</\2> # the matching end tag
367 [ \t]* # trailing spaces/tabs
368 (?=\n+|\Z) # followed by a newline or end of document
371 my $key = md5_hex($1);
372 $g_html_blocks{$key} = $1;
373 "\n\n" . $key . "\n\n";
374 }egmx;
375 # Special case just for <hr />. It was easier to make a special case than
376 # to make the other regex more complicated.
377 $text =~ s{
379 (?<=\n\n) # Starting after a blank line
380 | # or
381 \A\n? # the beginning of the doc
383 ( # save in $1
384 [ ]{0,$less_than_tab}
385 <(hr) # start tag = $2
386 \b # word break
387 ([^<>])*? #
388 /?> # the matching end tag
389 [ \t]*
390 (?=\n{2,}|\Z) # followed by a blank line or end of document
393 my $key = md5_hex($1);
394 $g_html_blocks{$key} = $1;
395 "\n\n" . $key . "\n\n";
396 }egx;
398 # Special case for standalone HTML comments:
399 $text =~ s{
401 (?<=\n\n) # Starting after a blank line
402 | # or
403 \A\n? # the beginning of the doc
405 ( # save in $1
406 [ ]{0,$less_than_tab}
407 (?s:
409 (--.*?--\s*)+
412 [ \t]*
413 (?=\n{2,}|\Z) # followed by a blank line or end of document
416 my $key = md5_hex($1);
417 $g_html_blocks{$key} = $1;
418 "\n\n" . $key . "\n\n";
419 }egx;
422 return $text;
426 sub _RunBlockGamut {
428 # These are all the transformations that form block-level
429 # tags like paragraphs, headers, and list items.
431 my $text = shift;
433 $text = _DoHeaders($text);
435 # Do Horizontal Rules:
436 $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
437 $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
438 $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
440 $text = _DoLists($text);
442 $text = _DoCodeBlocks($text);
444 $text = _DoBlockQuotes($text);
446 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
447 # was to escape raw HTML in the original Markdown source. This time,
448 # we're escaping the markup we've just created, so that we don't wrap
449 # <p> tags around block-level tags.
450 $text = _HashHTMLBlocks($text);
452 $text = _FormParagraphs($text);
454 return $text;
458 sub _RunSpanGamut {
460 # These are all the transformations that occur *within* block-level
461 # tags like paragraphs, headers, and list items.
463 my $text = shift;
465 $text = _DoCodeSpans($text);
467 $text = _EscapeSpecialChars($text);
469 # Process anchor and image tags. Images must come first,
470 # because ![foo][f] looks like an anchor.
471 $text = _DoImages($text);
472 $text = _DoAnchors($text);
474 # Make links out of things like `<http://example.com/>`
475 # Must come after _DoAnchors(), because you can use < and >
476 # delimiters in inline links like [this](<url>).
477 $text = _DoAutoLinks($text);
479 $text = _EncodeAmpsAndAngles($text);
481 $text = _DoItalicsAndBold($text);
483 # Do hard breaks:
484 $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;
486 return $text;
490 sub _EscapeSpecialChars {
491 my $text = shift;
492 my $tokens ||= _TokenizeHTML($text);
494 $text = ''; # rebuild $text from the tokens
495 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
496 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
498 foreach my $cur_token (@$tokens) {
499 if ($cur_token->[0] eq "tag") {
500 # Within tags, encode * and _ so they don't conflict
501 # with their use in Markdown for italics and strong.
502 # We're replacing each such character with its
503 # corresponding MD5 checksum value; this is likely
504 # overkill, but it should prevent us from colliding
505 # with the escape values by accident.
506 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gx;
507 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gx;
508 $text .= $cur_token->[1];
509 } else {
510 my $t = $cur_token->[1];
511 $t = _EncodeBackslashEscapes($t);
512 $text .= $t;
515 return $text;
519 sub _DoAnchors {
521 # Turn Markdown link shortcuts into XHTML <a> tags.
523 my $text = shift;
526 # First, handle reference-style links: [link text] [id]
528 $text =~ s{
529 ( # wrap whole match in $1
531 ($g_nested_brackets) # link text = $2
534 [ ]? # one optional space
535 (?:\n[ ]*)? # one optional newline followed by spaces
538 (.*?) # id = $3
542 my $result;
543 my $whole_match = $1;
544 my $link_text = $2;
545 my $link_id = lc $3;
547 if ($link_id eq "") {
548 $link_id = lc $link_text; # for shortcut links like [this][].
551 if (defined $g_urls{$link_id}) {
552 my $url = $g_urls{$link_id};
553 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
554 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
555 $result = "<a href=\"$url\"";
556 if ( defined $g_titles{$link_id} ) {
557 my $title = $g_titles{$link_id};
558 $title =~ s! \* !$g_escape_table{'*'}!gx;
559 $title =~ s! _ !$g_escape_table{'_'}!gx;
560 $result .= " title=\"$title\"";
562 $result .= ">$link_text</a>";
564 else {
565 $result = $whole_match;
567 $result;
568 }xsge;
571 # Next, inline-style links: [link text](url "optional title")
573 $text =~ s{
574 ( # wrap whole match in $1
576 ($g_nested_brackets) # link text = $2
578 \( # literal paren
579 [ \t]*
580 <?(.*?)>? # href = $3
581 [ \t]*
582 ( # $4
583 (['"]) # quote char = $5
584 (.*?) # Title = $6
585 \5 # matching quote
586 )? # title is optional
590 my $result;
591 my $whole_match = $1;
592 my $link_text = $2;
593 my $url = $3;
594 my $title = $6;
596 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
597 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
598 $result = "<a href=\"$url\"";
600 if (defined $title) {
601 $title =~ s/"/&quot;/g;
602 $title =~ s! \* !$g_escape_table{'*'}!gx;
603 $title =~ s! _ !$g_escape_table{'_'}!gx;
604 $result .= " title=\"$title\"";
607 $result .= ">$link_text</a>";
609 $result;
610 }xsge;
612 return $text;
616 sub _DoImages {
618 # Turn Markdown image shortcuts into <img> tags.
620 my $text = shift;
623 # First, handle reference-style labeled images: ![alt text][id]
625 $text =~ s{
626 ( # wrap whole match in $1
628 (.*?) # alt text = $2
631 [ ]? # one optional space
632 (?:\n[ ]*)? # one optional newline followed by spaces
635 (.*?) # id = $3
640 my $result;
641 my $whole_match = $1;
642 my $alt_text = $2;
643 my $link_id = lc $3;
645 if ($link_id eq "") {
646 $link_id = lc $alt_text; # for shortcut links like ![this][].
649 $alt_text =~ s/"/&quot;/g;
650 if (defined $g_urls{$link_id}) {
651 my $url = $g_urls{$link_id};
652 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
653 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
654 $result = "<img src=\"$url\" alt=\"$alt_text\"";
655 if (defined $g_titles{$link_id}) {
656 my $title = $g_titles{$link_id};
657 $title =~ s! \* !$g_escape_table{'*'}!gx;
658 $title =~ s! _ !$g_escape_table{'_'}!gx;
659 $result .= " title=\"$title\"";
661 $result .= $g_empty_element_suffix;
663 else {
664 # If there's no such link ID, leave intact:
665 $result = $whole_match;
668 $result;
669 }xsge;
672 # Next, handle inline images: ![alt text](url "optional title")
673 # Don't forget: encode * and _
675 $text =~ s{
676 ( # wrap whole match in $1
678 (.*?) # alt text = $2
680 \( # literal paren
681 [ \t]*
682 <?(\S+?)>? # src url = $3
683 [ \t]*
684 ( # $4
685 (['"]) # quote char = $5
686 (.*?) # title = $6
687 \5 # matching quote
688 [ \t]*
689 )? # title is optional
693 my $result;
694 my $whole_match = $1;
695 my $alt_text = $2;
696 my $url = $3;
697 my $title = '';
698 if (defined($6)) {
699 $title = $6;
702 $alt_text =~ s/"/&quot;/g;
703 $title =~ s/"/&quot;/g;
704 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
705 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
706 $result = "<img src=\"$url\" alt=\"$alt_text\"";
707 if (defined $title) {
708 $title =~ s! \* !$g_escape_table{'*'}!gx;
709 $title =~ s! _ !$g_escape_table{'_'}!gx;
710 $result .= " title=\"$title\"";
712 $result .= $g_empty_element_suffix;
714 $result;
715 }xsge;
717 return $text;
721 sub _DoHeaders {
722 my $text = shift;
724 # Setext-style headers:
725 # Header 1
726 # ========
728 # Header 2
729 # --------
731 $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
732 "<h1>" . _RunSpanGamut($1) . "</h1>\n\n";
733 }egmx;
735 $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
736 "<h2>" . _RunSpanGamut($1) . "</h2>\n\n";
737 }egmx;
740 # atx-style headers:
741 # # Header 1
742 # ## Header 2
743 # ## Header 2 with closing hashes ##
744 # ...
745 # ###### Header 6
747 $text =~ s{
748 ^(\#{1,6}) # $1 = string of #'s
749 [ \t]*
750 (.+?) # $2 = Header text
751 [ \t]*
752 \#* # optional closing #'s (not counted)
755 my $h_level = length($1);
756 "<h$h_level>" . _RunSpanGamut($2) . "</h$h_level>\n\n";
757 }egmx;
759 return $text;
763 sub _DoLists {
765 # Form HTML ordered (numbered) and unordered (bulleted) lists.
767 my $text = shift;
768 my $less_than_tab = $g_tab_width - 1;
770 # Re-usable patterns to match list item bullets and number markers:
771 my $marker_ul = qr/[*+-]/;
772 my $marker_ol = qr/\d+[.]/;
773 my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
775 # Re-usable pattern to match any entirel ul or ol list:
776 my $whole_list = qr{
777 ( # $1 = whole list
778 ( # $2
779 [ ]{0,$less_than_tab}
780 (${marker_any}) # $3 = first list item marker
781 [ \t]+
783 (?s:.+?)
784 ( # $4
787 \n{2,}
788 (?=\S)
789 (?! # Negative lookahead for another list item marker
790 [ \t]*
791 ${marker_any}[ \t]+
795 }mx;
797 # We use a different prefix before nested lists than top-level lists.
798 # See extended comment in _ProcessListItems().
800 # Note: There's a bit of duplication here. My original implementation
801 # created a scalar regex pattern as the conditional result of the test on
802 # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
803 # substitution once, using the scalar as the pattern. This worked,
804 # everywhere except when running under MT on my hosting account at Pair
805 # Networks. There, this caused all rebuilds to be killed by the reaper (or
806 # perhaps they crashed, but that seems incredibly unlikely given that the
807 # same script on the same server ran fine *except* under MT. I've spent
808 # more time trying to figure out why this is happening than I'd like to
809 # admit. My only guess, backed up by the fact that this workaround works,
810 # is that Perl optimizes the substition when it can figure out that the
811 # pattern will never change, and when this optimization isn't on, we run
812 # afoul of the reaper. Thus, the slightly redundant code to that uses two
813 # static s/// patterns rather than one conditional pattern.
815 if ($g_list_level) {
816 $text =~ s{
818 $whole_list
820 my $list = $1;
821 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
822 # Turn double returns into triple returns, so that we can make a
823 # paragraph for the last item in a list, if necessary:
824 $list =~ s/\n{2,}/\n\n\n/g;
825 my $result = _ProcessListItems($list, $marker_any);
826 $result = "<$list_type>\n" . $result . "</$list_type>\n";
827 $result;
828 }egmx;
830 else {
831 $text =~ s{
832 (?:(?<=\n\n)|\A\n?)
833 $whole_list
835 my $list = $1;
836 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
837 # Turn double returns into triple returns, so that we can make a
838 # paragraph for the last item in a list, if necessary:
839 $list =~ s/\n{2,}/\n\n\n/g;
840 my $result = _ProcessListItems($list, $marker_any);
841 $result = "<$list_type>\n" . $result . "</$list_type>\n";
842 $result;
843 }egmx;
847 return $text;
851 sub _ProcessListItems {
853 # Process the contents of a single ordered or unordered list, splitting it
854 # into individual list items.
857 my $list_str = shift;
858 my $marker_any = shift;
861 # The $g_list_level global keeps track of when we're inside a list.
862 # Each time we enter a list, we increment it; when we leave a list,
863 # we decrement. If it's zero, we're not in a list anymore.
865 # We do this because when we're not inside a list, we want to treat
866 # something like this:
868 # I recommend upgrading to version
869 # 8. Oops, now this line is treated
870 # as a sub-list.
872 # As a single paragraph, despite the fact that the second line starts
873 # with a digit-period-space sequence.
875 # Whereas when we're inside a list (or sub-list), that line will be
876 # treated as the start of a sub-list. What a kludge, huh? This is
877 # an aspect of Markdown's syntax that's hard to parse perfectly
878 # without resorting to mind-reading. Perhaps the solution is to
879 # change the syntax rules such that sub-lists must start with a
880 # starting cardinal number; e.g. "1." or "a.".
882 $g_list_level++;
884 # trim trailing blank lines:
885 $list_str =~ s/\n{2,}\z/\n/;
888 $list_str =~ s{
889 (\n)? # leading line = $1
890 (^[ \t]*) # leading whitespace = $2
891 ($marker_any) [ \t]+ # list marker = $3
892 ((?s:.+?) # list item text = $4
893 (\n{1,2}))
894 (?= \n* (\z | \2 ($marker_any) [ \t]+))
896 my $item = $4;
897 my $leading_line = $1;
898 my $leading_space = $2;
900 if ($leading_line or ($item =~ m/\n{2,}/)) {
901 $item = _RunBlockGamut(_Outdent($item));
903 else {
904 # Recursion for sub-lists:
905 $item = _DoLists(_Outdent($item));
906 chomp $item;
907 $item = _RunSpanGamut($item);
910 "<li>" . $item . "</li>\n";
911 }egmx;
913 $g_list_level--;
914 return $list_str;
919 sub _DoCodeBlocks {
921 # Process Markdown `<pre><code>` blocks.
924 my $text = shift;
926 $text =~ s{
927 (?:\n\n|\A)
928 ( # $1 = the code block -- one or more lines, starting with a space/tab
930 (?:[ ]{$g_tab_width} | \t) # Lines must start with a tab or a tab-width of spaces
931 .*\n+
934 ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
936 my $codeblock = $1;
937 my $result; # return value
939 $codeblock = _EncodeCode(_Outdent($codeblock));
940 $codeblock = _Detab($codeblock);
941 $codeblock =~ s/\A\n+//; # trim leading newlines
942 $codeblock =~ s/\s+\z//; # trim trailing whitespace
944 $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
946 $result;
947 }egmx;
949 return $text;
953 sub _DoCodeSpans {
955 # * Backtick quotes are used for <code></code> spans.
957 # * You can use multiple backticks as the delimiters if you want to
958 # include literal backticks in the code span. So, this input:
960 # Just type ``foo `bar` baz`` at the prompt.
962 # Will translate to:
964 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
966 # There's no arbitrary limit to the number of backticks you
967 # can use as delimters. If you need three consecutive backticks
968 # in your code, use four for delimiters, etc.
970 # * You can use spaces to get literal backticks at the edges:
972 # ... type `` `bar` `` ...
974 # Turns to:
976 # ... type <code>`bar`</code> ...
979 my $text = shift;
981 $text =~ s@
982 (`+) # $1 = Opening run of `
983 (.+?) # $2 = The code block
984 (?<!`)
985 \1 # Matching closer
986 (?!`)
988 my $c = "$2";
989 $c =~ s/^[ \t]*//g; # leading whitespace
990 $c =~ s/[ \t]*$//g; # trailing whitespace
991 $c = _EncodeCode($c);
992 "<code>$c</code>";
993 @egsx;
995 return $text;
999 sub _EncodeCode {
1001 # Encode/escape certain characters inside Markdown code runs.
1002 # The point is that in code, these characters are literals,
1003 # and lose their special Markdown meanings.
1005 local $_ = shift;
1007 # Encode all ampersands; HTML entities are not
1008 # entities within a Markdown code span.
1009 s/&/&amp;/g;
1011 # Encode $'s, but only if we're running under Blosxom.
1012 # (Blosxom interpolates Perl variables in article bodies.)
1014 no warnings 'once';
1015 if (defined($blosxom::version)) {
1016 s/\$/&#036;/g;
1021 # Do the angle bracket song and dance:
1022 s! < !&lt;!gx;
1023 s! > !&gt;!gx;
1025 # Now, escape characters that are magic in Markdown:
1026 s! \* !$g_escape_table{'*'}!gx;
1027 s! _ !$g_escape_table{'_'}!gx;
1028 s! { !$g_escape_table{'{'}!gx;
1029 s! } !$g_escape_table{'}'}!gx;
1030 s! \[ !$g_escape_table{'['}!gx;
1031 s! \] !$g_escape_table{']'}!gx;
1032 s! \\ !$g_escape_table{'\\'}!gx;
1034 return $_;
1038 sub _DoItalicsAndBold {
1039 my $text = shift;
1041 # <strong> must go first:
1042 $text =~ s{ (?<!\w) (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 (?!\w) }
1043 {<strong>$2</strong>}gsx;
1045 $text =~ s{ (?<!\w) (\*|_) (?=\S) (.+?) (?<=\S) \1 (?!\w) }
1046 {<em>$2</em>}gsx;
1048 return $text;
1052 sub _DoBlockQuotes {
1053 my $text = shift;
1055 $text =~ s{
1056 ( # Wrap whole match in $1
1058 ^[ \t]*>[ \t]? # '>' at the start of a line
1059 .+\n # rest of the first line
1060 (.+\n)* # subsequent consecutive lines
1061 \n* # blanks
1065 my $bq = $1;
1066 $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
1067 $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
1068 $bq = _RunBlockGamut($bq); # recurse
1070 $bq =~ s/^/ /g;
1071 # These leading spaces screw with <pre> content, so we need to fix that:
1072 $bq =~ s{
1073 (\s*<pre>.+?</pre>)
1075 my $pre = $1;
1076 $pre =~ s/^ //mg;
1077 $pre;
1078 }egsx;
1080 "<blockquote>\n$bq\n</blockquote>\n\n";
1081 }egmx;
1084 return $text;
1088 sub _FormParagraphs {
1090 # Params:
1091 # $text - string to process with html <p> tags
1093 my $text = shift;
1095 # Strip leading and trailing lines:
1096 $text =~ s/\A\n+//;
1097 $text =~ s/\n+\z//;
1099 my @grafs = split(/\n{2,}/, $text);
1102 # Wrap <p> tags.
1104 foreach (@grafs) {
1105 unless (defined( $g_html_blocks{$_} )) {
1106 $_ = _RunSpanGamut($_);
1107 s/^([ \t]*)/<p>/;
1108 $_ .= "</p>";
1113 # Unhashify HTML blocks
1115 foreach (@grafs) {
1116 if (defined( $g_html_blocks{$_} )) {
1117 $_ = $g_html_blocks{$_};
1121 return join "\n\n", @grafs;
1125 sub _EncodeAmpsAndAngles {
1126 # Smart processing for ampersands and angle brackets that need to be encoded.
1128 my $text = shift;
1130 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1131 # http://bumppo.net/projects/amputator/
1132 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
1134 # Encode naked <'s
1135 $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
1137 return $text;
1141 sub _EncodeBackslashEscapes {
1143 # Parameter: String.
1144 # Returns: The string, with after processing the following backslash
1145 # escape sequences.
1147 local $_ = shift;
1149 s! \\\\ !$g_escape_table{'\\'}!gx; # Must process escaped backslashes first.
1150 s! \\` !$g_escape_table{'`'}!gx;
1151 s! \\\* !$g_escape_table{'*'}!gx;
1152 s! \\_ !$g_escape_table{'_'}!gx;
1153 s! \\\{ !$g_escape_table{'{'}!gx;
1154 s! \\\} !$g_escape_table{'}'}!gx;
1155 s! \\\[ !$g_escape_table{'['}!gx;
1156 s! \\\] !$g_escape_table{']'}!gx;
1157 s! \\\( !$g_escape_table{'('}!gx;
1158 s! \\\) !$g_escape_table{')'}!gx;
1159 s! \\> !$g_escape_table{'>'}!gx;
1160 s! \\\# !$g_escape_table{'#'}!gx;
1161 s! \\\+ !$g_escape_table{'+'}!gx;
1162 s! \\\- !$g_escape_table{'-'}!gx;
1163 s! \\\. !$g_escape_table{'.'}!gx;
1164 s{ \\! }{$g_escape_table{'!'}}gx;
1166 return $_;
1170 sub _DoAutoLinks {
1171 my $text = shift;
1173 $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
1175 # Email addresses: <address@domain.foo>
1176 $text =~ s{
1178 (?:mailto:)?
1180 [-.\w]+
1182 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1186 _EncodeEmailAddress( _UnescapeSpecialChars($1) );
1187 }egix;
1189 return $text;
1193 sub _EncodeEmailAddress {
1195 # Input: an email address, e.g. "foo@example.com"
1197 # Output: the email address as a mailto link, with each character
1198 # of the address encoded as either a decimal or hex entity, in
1199 # the hopes of foiling most address harvesting spam bots. E.g.:
1201 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1202 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1203 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1205 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1206 # mailing list: <http://tinyurl.com/yu7ue>
1209 my $addr = shift;
1211 srand;
1212 my @encode = (
1213 sub { '&#' . ord(shift) . ';' },
1214 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1215 sub { shift },
1218 $addr = "mailto:" . $addr;
1220 $addr =~ s{(.)}{
1221 my $char = $1;
1222 if ( $char eq '@' ) {
1223 # this *must* be encoded. I insist.
1224 $char = $encode[int rand 1]->($char);
1225 } elsif ( $char ne ':' ) {
1226 # leave ':' alone (to spot mailto: later)
1227 my $r = rand;
1228 # roughly 10% raw, 45% hex, 45% dec
1229 $char = (
1230 $r > .9 ? $encode[2]->($char) :
1231 $r < .45 ? $encode[1]->($char) :
1232 $encode[0]->($char)
1235 $char;
1236 }gex;
1238 $addr = qq{<a href="$addr">$addr</a>};
1239 $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1241 return $addr;
1245 sub _UnescapeSpecialChars {
1247 # Swap back in all the special characters we've hidden.
1249 my $text = shift;
1251 while( my($char, $hash) = each(%g_escape_table) ) {
1252 $text =~ s/$hash/$char/g;
1254 return $text;
1258 sub _TokenizeHTML {
1260 # Parameter: String containing HTML markup.
1261 # Returns: Reference to an array of the tokens comprising the input
1262 # string. Each token is either a tag (possibly with nested,
1263 # tags contained therein, such as <a href="<MTFoo>">, or a
1264 # run of text between tags. Each element of the array is a
1265 # two-element array; the first is either 'tag' or 'text';
1266 # the second is the actual value.
1269 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1270 # <http://www.bradchoate.com/past/mtregex.php>
1273 my $str = shift;
1274 my $pos = 0;
1275 my $len = length $str;
1276 my @tokens;
1278 my $depth = 6;
1279 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
1280 my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
1281 (?s: <\? .*? \?> ) | # processing instruction
1282 $nested_tags/ix; # nested tags
1284 while ($str =~ m/($match)/g) {
1285 my $whole_tag = $1;
1286 my $sec_start = pos $str;
1287 my $tag_start = $sec_start - length $whole_tag;
1288 if ($pos < $tag_start) {
1289 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1291 push @tokens, ['tag', $whole_tag];
1292 $pos = pos $str;
1294 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1295 \@tokens;
1299 sub _Outdent {
1301 # Remove one level of line-leading tabs or spaces
1303 my $text = shift;
1305 $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
1306 return $text;
1310 sub _Detab {
1312 # Cribbed from a post by Bart Lateur:
1313 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
1315 my $text = shift;
1317 $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
1318 return $text;
1324 __END__
1327 =pod
1329 =head1 NAME
1331 B<Markdown>
1334 =head1 SYNOPSIS
1336 B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ]
1337 [ I<file> ... ]
1340 =head1 DESCRIPTION
1342 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1343 easy-to-write structured text format into HTML. Markdown's text format
1344 is most similar to that of plain text email, and supports features such
1345 as headers, *emphasis*, code blocks, blockquotes, and links.
1347 Markdown's syntax is designed not as a generic markup language, but
1348 specifically to serve as a front-end to (X)HTML. You can use span-level
1349 HTML tags anywhere in a Markdown document, and you can use block level
1350 HTML tags (like <div> and <table> as well).
1352 For more information about Markdown's syntax, see:
1354 http://daringfireball.net/projects/markdown/
1357 =head1 OPTIONS
1359 Use "--" to end switch parsing. For example, to open a file named "-z", use:
1361 Markdown.pl -- -z
1363 =over 4
1366 =item B<--html4tags>
1368 Use HTML 4 style for empty element tags, e.g.:
1370 <br>
1372 instead of Markdown's default XHTML style tags, e.g.:
1374 <br />
1377 =item B<-v>, B<--version>
1379 Display Markdown's version number and copyright information.
1382 =item B<-s>, B<--shortversion>
1384 Display the short-form version number.
1387 =back
1391 =head1 BUGS
1393 To file bug reports or feature requests (other than topics listed in the
1394 Caveats section above) please send email to:
1396 support@daringfireball.net
1398 Please include with your report: (1) the example input; (2) the output
1399 you expected; (3) the output Markdown actually produced.
1402 =head1 VERSION HISTORY
1404 See the readme file for detailed release notes for this version.
1406 1.0.1 - 14 Dec 2004
1408 1.0 - 28 Aug 2004
1411 =head1 AUTHOR
1413 John Gruber
1414 http://daringfireball.net
1416 PHP port and other contributions by Michel Fortin
1417 http://michelf.com
1420 =head1 COPYRIGHT AND LICENSE
1422 Copyright (c) 2003-2004 John Gruber
1423 <http://daringfireball.net/>
1424 All rights reserved.
1426 Redistribution and use in source and binary forms, with or without
1427 modification, are permitted provided that the following conditions are
1428 met:
1430 * Redistributions of source code must retain the above copyright notice,
1431 this list of conditions and the following disclaimer.
1433 * Redistributions in binary form must reproduce the above copyright
1434 notice, this list of conditions and the following disclaimer in the
1435 documentation and/or other materials provided with the distribution.
1437 * Neither the name "Markdown" nor the names of its contributors may
1438 be used to endorse or promote products derived from this software
1439 without specific prior written permission.
1441 This software is provided by the copyright holders and contributors "as
1442 is" and any express or implied warranties, including, but not limited
1443 to, the implied warranties of merchantability and fitness for a
1444 particular purpose are disclaimed. In no event shall the copyright owner
1445 or contributors be liable for any direct, indirect, incidental, special,
1446 exemplary, or consequential damages (including, but not limited to,
1447 procurement of substitute goods or services; loss of use, data, or
1448 profits; or business interruption) however caused and on any theory of
1449 liability, whether in contract, strict liability, or tort (including
1450 negligence or otherwise) arising in any way out of the use of this
1451 software, even if advised of the possibility of such damage.
1453 =cut