1 # my patched markdown version, it basically adds a '  ' after each
3 package Text
::Markdown
;
9 use Digest
::MD5
qw(md5_hex);
14 our $VERSION = '1.0.24';
15 our @EXPORT_OK = qw(markdown);
19 Text::Markdown - Convert Markdown syntax to (X)HTML
23 use Text::Markdown 'markdown';
24 my $html = markdown($text);
26 use Text::Markdown 'markdown';
27 my $html = markdown( $text, {
28 empty_element_suffix => '>',
33 my $m = Text::Markdown->new;
34 my $html = $m->markdown($text);
37 my $m = Text::MultiMarkdown->new(
38 empty_element_suffix => '>',
41 my $html = $m->markdown( $text );
45 Markdown is a text-to-HTML filter; it translates an easy-to-read /
46 easy-to-write structured text format into HTML. Markdown's text format
47 is most similar to that of plain text email, and supports features such
48 as headers, *emphasis*, code blocks, blockquotes, and links.
50 Markdown's syntax is designed not as a generic markup language, but
51 specifically to serve as a front-end to (X)HTML. You can use span-level
52 HTML tags anywhere in a Markdown document, and you can use block level
53 HTML tags (like <div> and <table> as well).
57 This module implements the 'original' Markdown markdown syntax from:
59 http://daringfireball.net/projects/markdown/
63 Text::Markdown supports a number of options to it's processor which control the behaviour of the output document.
65 These options can be supplied to the constructor, on in a hash with the individual calls to the markdown method.
66 See the synopsis for examples of both of the above styles.
68 The options for the processor are:
72 =item empty_element_suffix
74 This option can be used to generate normal HTML output. By default, it is ' />', which is xHTML, change to '>' for normal HTML.
78 Controls indent width in the generated markup, defaults to 4
80 =item markdown_in_html_blocks
82 Controls if Markdown is processed when inside HTML blocks. Defaults to 0.
84 =item trust_list_start_value
86 If true, ordered lists will use the first number as the starting point for
87 numbering. This will let you pick up where you left off by writing:
97 (Note that in the above, quux will be numbered 4.)
105 # Regex to match balanced [brackets]. See Friedl's
106 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
107 our ($g_nested_brackets, $g_nested_parens);
108 $g_nested_brackets = qr{
109 (?
> # Atomic matching
110 [^\
[\
]]+ # Anything other than brackets
113 (??
{ $g_nested_brackets }) # Recursive set of nested brackets
117 # Doesn't allow for whitespace, because we're using it to match URLs:
118 $g_nested_parens = qr{
119 (?
> # Atomic matching
120 [^()\s
]+ # Anything other than parens or whitespace
123 (??
{ $g_nested_parens }) # Recursive set of nested brackets
128 # Table of hash values for escaped characters:
130 foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
131 $g_escape_table{$char} = md5_hex
($char);
138 A simple constructor, see the SYNTAX and OPTIONS sections for more information.
143 my ($class, %p) = @_;
145 $p{base_url
} ||= ''; # This is the base url to be used for WikiLinks
147 $p{tab_width
} = 4 unless (defined $p{tab_width
} and $p{tab_width
} =~ m/^\d+$/);
149 $p{empty_element_suffix
} ||= ' />'; # Change to ">" for HTML output
151 # Is markdown processed in HTML blocks? See t/15inlinehtmldonotturnoffmarkdown.t
152 $p{markdown_in_html_blocks
} = $p{markdown_in_html_blocks
} ?
1 : 0;
154 $p{trust_list_start_value
} = $p{trust_list_start_value
} ?
1 : 0;
156 my $self = { params
=> \
%p };
157 bless $self, ref($class) || $class;
163 The main function as far as the outside world is concerned. See the SYNOPSIS
169 my ( $self, $text, $options ) = @_;
171 # Detect functional mode, and create an instance for this run..
173 if ( $self ne __PACKAGE__
) {
174 my $ob = __PACKAGE__
->new();
175 # $self is text, $text is options
176 return $ob->markdown($self, $text);
179 croak
('Calling ' . $self . '->markdown (as a class method) is not supported.');
185 %$self = (%{ $self->{params
} }, %$options, params
=> $self->{params
});
187 $self->_CleanUpRunData($options);
189 return $self->_Markdown($text);
192 sub _CleanUpRunData
{
193 my ($self, $options) = @_;
194 # Clear the global hashes. If we don't clear these, you get conflicts
195 # from other articles when generating a page which contains more than
196 # one article (e.g. an index page that shows the N most recent
198 $self->{_urls
} = $options->{urls
} ?
$options->{urls
} : {}; # FIXME - document passing this option (tested in 05options.t).
199 $self->{_titles
} = {};
200 $self->{_html_blocks
} = {};
201 # Used to track when we're inside an ordered or unordered list
202 # (see _ProcessListItems() for details)
203 $self->{_list_level
} = 0;
209 # Main function. The order in which other subs are called here is
210 # essential. Link and image substitutions need to happen before
211 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
212 # and <img> tags get encoded.
214 my ($self, $text, $options) = @_;
216 $text = $self->_CleanUpDoc($text);
218 # Turn block-level HTML blocks into hash entries
219 $text = $self->_HashHTMLBlocks($text) unless $self->{markdown_in_html_blocks
};
221 $text = $self->_StripLinkDefinitions($text);
223 $text = $self->_RunBlockGamut($text);
225 $text = $self->_UnescapeSpecialChars($text);
227 $text = $self->_ConvertCopyright($text);
234 Returns a reference to a hash with the key being the markdown reference and the value being the URL.
236 Useful for building scripts which preprocess a list of links before the main content. See t/05options.t
237 for an example of this hashref being passed back into the markdown method to create links.
244 return $self->{_urls
};
248 my ($self, $text) = @_;
250 # Standardize line endings:
251 $text =~ s{\r\n}{\n}g; # DOS to Unix
252 $text =~ s{\r}{\n}g; # Mac to Unix
254 # Make sure $text ends with a couple of newlines:
257 # Convert all tabs to spaces.
258 $text = $self->_Detab($text);
260 # Strip any lines consisting only of spaces and tabs.
261 # This makes subsequent regexen easier to write, because we can
262 # match consecutive blank lines with /\n+/ instead of something
263 # contorted like /[ \t]*\n+/ .
264 $text =~ s/^[ \t]+$//mg;
269 sub _StripLinkDefinitions
{
271 # Strips link definitions from text, stores the URLs and titles in
274 my ($self, $text) = @_;
275 my $less_than_tab = $self->{tab_width
} - 1;
277 # Link defs are in the form: ^[id]: url "optional title"
279 ^[ ]{0,$less_than_tab}\
[(.+)\
]: # id = \$1
281 \n?
# maybe *one* newline
283 <?
(\S
+?
)>?
# url = \$2
285 \n?
# maybe one newline
288 (?
<=\s
) # lookbehind for whitespace
293 )?
# title is optional
296 $self->{_urls
}{lc $1} = $self->_EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
298 $self->{_titles
}{lc $1} = $3;
299 $self->{_titles
}{lc $1} =~ s/"/"/g;
308 # Internal function used to safely MD5sum chunks of the input, which might be Unicode in Perl's internal representation.
310 return unless defined $input;
311 if (Encode
::is_utf8
$input) {
312 return md5_hex
(Encode
::encode
('utf8', $input));
315 return md5_hex
($input);
319 sub _HashHTMLBlocks
{
320 my ($self, $text) = @_;
321 my $less_than_tab = $self->{tab_width
} - 1;
323 # Hashify HTML blocks:
324 # We only want to do this for block-level HTML tags, such as headers,
325 # lists, and tables. That's because we still want to wrap <p>s around
326 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
327 # phrase emphasis, and spans. The list of tags we're looking for is
331 p
| div
| h
[1-6] | blockquote
| pre
| table
|
332 dl
| ol
| ul
| script
| noscript
| form
|
333 fieldset
| iframe
| math
| ins
| del
338 (?
: # Match one attr name/value pair
339 \s
+ # There needs to be at least some whitespace
340 # before each attribute name.
341 [\w
.:_
-]+ # Attribute name
344 ".+?" # "Attribute value"
346 '.+?' # 'Attribute value'
351 my $empty_tag = qr{< \w+ $tag_attrs \s* />}oxms;
352 my $open_tag = qr{< $block_tags $tag_attrs \s* >}oxms;
353 my $close_tag = undef; # let Text::Balanced handle this
355 use Text
::Balanced
qw(gen_extract_tagged);
356 my $extract_block = gen_extract_tagged
($open_tag, $close_tag, undef, { ignore
=> [$empty_tag] });
359 while ($text =~ s{^(([ ]{0,$less_than_tab}<)?.*\n)}{}m) {
362 # current line could be start of code block
364 my ($tag, $remainder) = $extract_block->($cur_line . $text);
366 my $key = _md5_utf8
($tag);
367 $self->{_html_blocks
}{$key} = $tag;
368 push @chunks, "\n\n" . $key . "\n\n";
372 # No tag match, so toss $cur_line into @chunks
373 push @chunks, $cur_line;
377 # current line could NOT be start of code block
378 push @chunks, $cur_line;
382 push @chunks, $text; # Whatever is left.
384 $text = join '', @chunks;
386 # Special case just for <hr />. It was easier to make a special case than
387 # to make the other regex more complicated.
388 $text = $self->_HashHR($text);
390 $text = $self->_HashHTMLComments($text);
392 $text = $self->_HashPHPASPBlocks($text);
398 my ($self, $text) = @_;
399 my $less_than_tab = $self->{tab_width
} - 1;
403 (?
<=\n\n) # Starting after a blank line
405 \A
\n?
# the beginning of the doc
408 [ ]{0,$less_than_tab}
409 <(hr
) # start tag = $2
412 /?
> # the matching end tag
414 (?
=\n{2,}|\Z
) # followed by a blank line or end of document
417 my $key = _md5_utf8
($1);
418 $self->{_html_blocks
}{$key} = $1;
419 "\n\n" . $key . "\n\n";
425 sub _HashHTMLComments
{
426 my ($self, $text) = @_;
427 my $less_than_tab = $self->{tab_width
} - 1;
429 # Special case for standalone HTML comments:
432 (?
<=\n\n) # Starting after a blank line
434 \A
\n?
# the beginning of the doc
437 [ ]{0,$less_than_tab}
444 (?
=\n{2,}|\Z
) # followed by a blank line or end of document
447 my $key = _md5_utf8
($1);
448 $self->{_html_blocks
}{$key} = $1;
449 "\n\n" . $key . "\n\n";
455 sub _HashPHPASPBlocks
{
456 my ($self, $text) = @_;
457 my $less_than_tab = $self->{tab_width
} - 1;
459 # PHP and ASP-style processor instructions (<?…?> and <%…%>)
462 (?
<=\n\n) # Starting after a blank line
464 \A
\n?
# the beginning of the doc
467 [ ]{0,$less_than_tab}
474 (?
=\n{2,}|\Z
) # followed by a blank line or end of document
477 my $key = _md5_utf8
($1);
478 $self->{_html_blocks
}{$key} = $1;
479 "\n\n" . $key . "\n\n";
486 # These are all the transformations that form block-level
487 # tags like paragraphs, headers, and list items.
489 my ($self, $text) = @_;
491 # Do headers first, as these populate cross-refs
492 $text = $self->_DoHeaders($text);
494 # And now, protect our tables
495 $text = $self->_HashHTMLBlocks($text) unless $self->{markdown_in_html_blocks
};
497 # Do Horizontal Rules:
498 my $less_than_tab = $self->{tab_width
} - 1;
499 $text =~ s{^[ ]{0,$less_than_tab}(\*[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx
;
500 $text =~ s{^[ ]{0,$less_than_tab}(-[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx
;
501 $text =~ s{^[ ]{0,$less_than_tab}(_[ ]?){3,}[ \t]*$}{\n<hr$self->{empty_element_suffix}\n}gmx
;
503 $text = $self->_DoLists($text);
505 $text = $self->_DoCodeBlocks($text);
507 $text = $self->_DoBlockQuotes($text);
509 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
510 # was to escape raw HTML in the original Markdown source. This time,
511 # we're escaping the markup we've just created, so that we don't wrap
512 # <p> tags around block-level tags.
513 $text = $self->_HashHTMLBlocks($text);
515 $text = $self->_FormParagraphs($text);
522 # These are all the transformations that occur *within* block-level
523 # tags like paragraphs, headers, and list items.
525 my ($self, $text) = @_;
527 $text = $self->_DoCodeSpans($text);
528 $text = $self->_EscapeSpecialCharsWithinTagAttributes($text);
529 $text = $self->_EscapeSpecialChars($text);
531 # Process anchor and image tags. Images must come first,
532 # because ![foo][f] looks like an anchor.
533 $text = $self->_DoImages($text);
534 $text = $self->_DoAnchors($text);
536 # Make links out of things like `<http://example.com/>`
537 # Must come after _DoAnchors(), because you can use < and >
538 # delimiters in inline links like [this](<url>).
539 $text = $self->_DoAutoLinks($text);
541 $text = $self->_EncodeAmpsAndAngles($text);
543 $text = $self->_DoItalicsAndBold($text);
545 # FIXME - Is hard coding space here sane, or does this want to be related to tab width?
547 $text =~ s/ {2,}\n/ <br$self->{empty_element_suffix}\n/g;
552 sub _EscapeSpecialChars
{
553 my ($self, $text) = @_;
554 my $tokens ||= $self->_TokenizeHTML($text);
556 $text = ''; # rebuild $text from the tokens
557 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
558 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
560 foreach my $cur_token (@
$tokens) {
561 if ($cur_token->[0] eq "tag") {
562 # Within tags, encode * and _ so they don't conflict
563 # with their use in Markdown for italics and strong.
564 # We're replacing each such character with its
565 # corresponding MD5 checksum value; this is likely
566 # overkill, but it should prevent us from colliding
567 # with the escape values by accident.
568 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!ogx;
569 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!ogx;
570 $text .= $cur_token->[1];
572 my $t = $cur_token->[1];
573 $t = $self->_EncodeBackslashEscapes($t);
580 sub _EscapeSpecialCharsWithinTagAttributes
{
582 # Within tags -- meaning between < and > -- encode [\ ` * _] so they
583 # don't conflict with their use in Markdown for code, italics and strong.
584 # We're replacing each such character with its corresponding MD5 checksum
585 # value; this is likely overkill, but it should prevent us from colliding
586 # with the escape values by accident.
588 my ($self, $text) = @_;
589 my $tokens ||= $self->_TokenizeHTML($text);
590 $text = ''; # rebuild $text from the tokens
592 foreach my $cur_token (@
$tokens) {
593 if ($cur_token->[0] eq "tag") {
594 $cur_token->[1] =~ s! \\ !$g_escape_table{'\\'}!gox;
595 $cur_token->[1] =~ s{ (?<=.)</?code>(?=.) }{$g_escape_table{'`'}}gox
;
596 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gox;
597 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gox;
599 $text .= $cur_token->[1];
606 # Turn Markdown link shortcuts into XHTML <a> tags.
608 my ($self, $text) = @_;
611 # First, handle reference-style links: [link text] [id]
614 ( # wrap whole match in $1
616 ($g_nested_brackets) # link text = $2
619 [ ]?
# one optional space
620 (?
:\n[ ]*)?
# one optional newline followed by spaces
627 my $whole_match = $1;
631 if ($link_id eq "") {
632 $link_id = lc $link_text; # for shortcut links like [this][].
635 $link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces
637 $self->_GenerateAnchor($whole_match, $link_text, $link_id);
641 # Next, inline-style links: [link text](url "optional title")
644 ( # wrap whole match in $1
646 ($g_nested_brackets) # link text = $2
650 ($g_nested_parens) # href = $3
653 (['"]) # quote char = $5
656 [ \t]* # ignore any spaces/tabs between closing quote and )
657 )? # title is optional
662 my $whole_match = $1;
667 $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
671 # Last, handle reference-style shortcuts: [link text]
672 # These must come last in case you've also got
[link test
][1]
673 # or [link test](/foo)
676 ( # wrap whole match in $1
678 ([^\
[\
]]+) # link text = $2; can't contain '[' or ']'
683 my $whole_match = $1;
685 (my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces
687 $self->_GenerateAnchor($whole_match, $link_text, $link_id);
693 sub _GenerateAnchor
{
694 # FIXME - Fugly, change to named params?
695 my ($self, $whole_match, $link_text, $link_id, $url, $title, $attributes) = @_;
699 $attributes = '' unless defined $attributes;
701 if ( !defined $url && defined $self->{_urls
}{$link_id}) {
702 $url = $self->{_urls
}{$link_id};
709 $url =~ s! \* !$g_escape_table{'*'}!gox; # We've got to encode these to avoid
710 $url =~ s! _ !$g_escape_table{'_'}!gox; # conflicting with italics/bold.
711 $url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
713 $result = qq{<a href
="$url"};
715 if ( !defined $title && defined $link_id && defined $self->{_titles
}{$link_id} ) {
716 $title = $self->{_titles
}{$link_id};
719 if ( defined $title ) {
720 $title =~ s/"/"/g;
721 $title =~ s! \* !$g_escape_table{'*'}!gox;
722 $title =~ s! _ !$g_escape_table{'_'}!gox;
723 $result .= qq{ title
="$title"};
726 $result .= "$attributes>$link_text</a>";
733 # Turn Markdown image shortcuts into <img> tags.
735 my ($self, $text) = @_;
738 # First, handle reference-style labeled images: ![alt text][id]
741 ( # wrap whole match in $1
743 (.*?
) # alt text = $2
746 [ ]?
# one optional space
747 (?
:\n[ ]*)?
# one optional newline followed by spaces
756 my $whole_match = $1;
760 if ($link_id eq '') {
761 $link_id = lc $alt_text; # for shortcut links like ![this][].
764 $self->_GenerateImage($whole_match, $alt_text, $link_id);
768 # Next, handle inline images: ![alt text](url "optional title")
769 # Don't forget: encode * and _
772 ( # wrap whole match in $1
774 (.*?
) # alt text = $2
778 ($g_nested_parens) # src url - href = $3
781 (['"]) # quote char = $5
785 )? # title is optional
790 my $whole_match = $1;
798 $self->_GenerateImage($whole_match, $alt_text, undef, $url, $title);
805 # FIXME - Fugly, change to named params?
806 my ($self, $whole_match, $alt_text, $link_id, $url, $title, $attributes) = @_;
810 $attributes = '' unless defined $attributes;
813 $alt_text =~ s/"/"/g;
814 # FIXME - how about >
816 if ( !defined $url && defined $self->{_urls}{$link_id}) {
817 $url = $self->{_urls}{$link_id};
820 # If there's
no such
link ID
, leave intact
:
821 return $whole_match unless defined $url;
823 $url =~ s! \* !$g_escape_table{'*'}!ogx; # We've got to encode these to avoid
824 $url =~ s! _ !$g_escape_table{'_'}!ogx; # conflicting with italics/bold.
825 $url =~ s{^<(.*)>$}{$1}; # Remove <>'s surrounding URL, if present
827 if (!defined $title && length $link_id && defined $self->{_titles
}{$link_id} && length $self->{_titles
}{$link_id}) {
828 $title = $self->{_titles
}{$link_id};
831 $result = qq{<img src
="$url" alt
="$alt_text"};
832 if (defined $title && length $title) {
833 $title =~ s! \* !$g_escape_table{'*'}!ogx;
834 $title =~ s! _ !$g_escape_table{'_'}!ogx;
835 $title =~ s/"/"/g;
836 $result .= qq{ title
="$title"};
838 $result .= $attributes . $self->{empty_element_suffix
};
844 my ($self, $text) = @_;
846 # Setext-style headers:
853 $text =~ s
{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
854 $self->_GenerateHeader('1', $1);
857 $text =~ s
{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
858 $self->_GenerateHeader('2', $1);
865 # ## Header 2 with closing hashes ##
871 ^(\#
{1,6}) # $1 = string of #'s
873 (.+?
) # $2 = Header text
875 \#
* # optional closing #'s (not counted)
878 my $h_level = length($1);
879 $self->_GenerateHeader($h_level, $2);
885 sub _GenerateHeader
{
886 my ($self, $level, $id) = @_;
888 return "<h$level>" . $self->_RunSpanGamut($id) . "</h$level>\n\n";
893 # Form HTML ordered (numbered) and unordered (bulleted) lists.
895 my ($self, $text) = @_;
896 my $less_than_tab = $self->{tab_width
} - 1;
898 # Re-usable patterns to match list item bullets and number markers:
899 my $marker_ul = qr/[*+-]/;
900 my $marker_ol = qr/\d+[.]/;
901 my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
903 # Re-usable pattern to match any entirel ul or ol list:
907 [ ]{0,$less_than_tab}
908 (${marker_any
}) # $3 = first list item marker
917 (?
! # Negative lookahead for another list item marker
925 # We use a different prefix before nested lists than top-level lists.
926 # See extended comment in _ProcessListItems().
928 # Note: There's a bit of duplication here. My original implementation
929 # created a scalar regex pattern as the conditional result of the test on
930 # $self->{_list_level}, and then only ran the $text =~ s{...}{...}egmx
931 # substitution once, using the scalar as the pattern. This worked,
932 # everywhere except when running under MT on my hosting account at Pair
933 # Networks. There, this caused all rebuilds to be killed by the reaper (or
934 # perhaps they crashed, but that seems incredibly unlikely given that the
935 # same script on the same server ran fine *except* under MT. I've spent
936 # more time trying to figure out why this is happening than I'd like to
937 # admit. My only guess, backed up by the fact that this workaround works,
938 # is that Perl optimizes the substition when it can figure out that the
939 # pattern will never change, and when this optimization isn't on, we run
940 # afoul of the reaper. Thus, the slightly redundant code to that uses two
941 # static s/// patterns rather than one conditional pattern.
943 if ($self->{_list_level
}) {
950 my $list_type = ($marker =~ m/$marker_ul/) ?
"ul" : "ol";
951 # Turn double returns into triple returns, so that we can make a
952 # paragraph for the last item in a list, if necessary:
953 $list =~ s/\n{2,}/\n\n\n/g;
954 my $result = ( $list_type eq 'ul' ) ?
955 $self->_ProcessListItemsUL($list, $marker_ul)
956 : $self->_ProcessListItemsOL($list, $marker_ol);
958 $result = $self->_MakeList($list_type, $result, $marker);
969 my $list_type = ($marker =~ m/$marker_ul/) ?
"ul" : "ol";
970 # Turn double returns into triple returns, so that we can make a
971 # paragraph for the last item in a list, if necessary:
972 $list =~ s/\n{2,}/\n\n\n/g;
973 my $result = ( $list_type eq 'ul' ) ?
974 $self->_ProcessListItemsUL($list, $marker_ul)
975 : $self->_ProcessListItemsOL($list, $marker_ol);
976 $result = $self->_MakeList($list_type, $result, $marker);
986 my ($self, $list_type, $content, $marker) = @_;
988 if ($list_type eq 'ol' and $self->{trust_list_start_value
}) {
989 my ($num) = $marker =~ /^(\d+)[.]/;
990 return "<ol start='$num'>\n" . $content . "</ol>\n";
993 return "<$list_type>\n" . $content . "</$list_type>\n";
996 sub _ProcessListItemsOL
{
998 # Process the contents of a single ordered list, splitting it
999 # into individual list items.
1002 my ($self, $list_str, $marker_any) = @_;
1005 # The $self->{_list_level} global keeps track of when we're inside a list.
1006 # Each time we enter a list, we increment it; when we leave a list,
1007 # we decrement. If it's zero, we're not in a list anymore.
1009 # We do this because when we're not inside a list, we want to treat
1010 # something like this:
1012 # I recommend upgrading to version
1013 # 8. Oops, now this line is treated
1016 # As a single paragraph, despite the fact that the second line starts
1017 # with a digit-period-space sequence.
1019 # Whereas when we're inside a list (or sub-list), that line will be
1020 # treated as the start of a sub-list. What a kludge, huh? This is
1021 # an aspect of Markdown's syntax that's hard to parse perfectly
1022 # without resorting to mind-reading. Perhaps the solution is to
1023 # change the syntax rules such that sub-lists must start with a
1024 # starting cardinal number; e.g. "1." or "a.".
1026 $self->{_list_level
}++;
1028 # trim trailing blank lines:
1029 $list_str =~ s/\n{2,}\z/\n/;
1033 (\n)?
# leading line = $1
1034 (^[ \t]*) # leading whitespace = $2
1035 ($marker_any) [ \t]+ # list marker = $3
1036 ((?s
:.+?
) # list item text = $4
1038 (?
= \n* (\z
| \
2 ($marker_any) [ \t]+))
1041 my $leading_line = $1;
1042 my $leading_space = $2;
1044 if ($leading_line or ($item =~ m/\n{2,}/)) {
1045 $item = $self->_RunBlockGamut($self->_Outdent($item));
1048 # Recursion for sub-lists:
1049 $item = $self->_DoLists($self->_Outdent($item));
1051 $item = $self->_RunSpanGamut($item);
1054 "<li>" . $item . "</li>\n";
1057 $self->{_list_level
}--;
1061 sub _ProcessListItemsUL
{
1063 # Process the contents of a single unordered list, splitting it
1064 # into individual list items.
1067 my ($self, $list_str, $marker_any) = @_;
1070 # The $self->{_list_level} global keeps track of when we're inside a list.
1071 # Each time we enter a list, we increment it; when we leave a list,
1072 # we decrement. If it's zero, we're not in a list anymore.
1074 # We do this because when we're not inside a list, we want to treat
1075 # something like this:
1077 # I recommend upgrading to version
1078 # 8. Oops, now this line is treated
1081 # As a single paragraph, despite the fact that the second line starts
1082 # with a digit-period-space sequence.
1084 # Whereas when we're inside a list (or sub-list), that line will be
1085 # treated as the start of a sub-list. What a kludge, huh? This is
1086 # an aspect of Markdown's syntax that's hard to parse perfectly
1087 # without resorting to mind-reading. Perhaps the solution is to
1088 # change the syntax rules such that sub-lists must start with a
1089 # starting cardinal number; e.g. "1." or "a.".
1091 $self->{_list_level
}++;
1093 # trim trailing blank lines:
1094 $list_str =~ s/\n{2,}\z/\n/;
1098 (\n)?
# leading line = $1
1099 (^[ \t]*) # leading whitespace = $2
1100 ($marker_any) [ \t]+ # list marker = $3
1101 ((?s
:.+?
) # list item text = $4
1103 (?
= \n* (\z
| \
2 ($marker_any) [ \t]+))
1106 my $leading_line = $1;
1107 my $leading_space = $2;
1109 if ($leading_line or ($item =~ m/\n{2,}/)) {
1110 $item = $self->_RunBlockGamut($self->_Outdent($item));
1113 # Recursion for sub-lists:
1114 $item = $self->_DoLists($self->_Outdent($item));
1116 $item = $self->_RunSpanGamut($item);
1119 "<li>" . $item . "</li>\n";
1122 $self->{_list_level
}--;
1128 # Process Markdown `<pre><code>` blocks.
1131 my ($self, $text) = @_;
1135 ( # $1 = the code block -- one or more lines, starting with a space/tab
1137 (?
:[ ]{$self->{tab_width
}} | \t) # Lines must start with a tab or a tab-width of spaces
1141 ((?
=^[ ]{0,$self->{tab_width
}}\S
)|\Z
) # Lookahead for non-space at line-start, or end of doc
1144 my $result; # return value
1146 $codeblock = $self->_EncodeCode($self->_Outdent($codeblock));
1147 $codeblock = $self->_Detab($codeblock);
1148 $codeblock =~ s/\A\n+//; # trim leading newlines
1149 $codeblock =~ s/\n+\z//; # trim trailing newlines
1151 $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
1161 # * Backtick quotes are used for <code></code> spans.
1163 # * You can use multiple backticks as the delimiters if you want to
1164 # include literal backticks in the code span. So, this input:
1166 # Just type ``foo `bar` baz`` at the prompt.
1168 # Will translate to:
1170 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1172 # There's no arbitrary limit to the number of backticks you
1173 # can use as delimters. If you need three consecutive backticks
1174 # in your code, use four for delimiters, etc.
1176 # * You can use spaces to get literal backticks at the edges:
1178 # ... type `` `bar` `` ...
1182 # ... type <code>`bar`</code> ...
1185 my ($self, $text) = @_;
1188 (?
<!\\) # Character before opening ` can't be a backslash
1189 (`+) # $1 = Opening run of `
1190 (.+?
) # $2 = The code block
1192 \1 # Matching closer
1196 $c =~ s/^[ \t]*//g; # leading whitespace
1197 $c =~ s/[ \t]*$//g; # trailing whitespace
1198 $c = $self->_EncodeCode($c);
1207 # Encode/escape certain characters inside Markdown code runs.
1208 # The point is that in code, these characters are literals,
1209 # and lose their special Markdown meanings.
1214 # Encode all ampersands; HTML entities are not
1215 # entities within a Markdown code span.
1218 # Encode $'s, but only if we're running under Blosxom.
1219 # (Blosxom interpolates Perl variables in article bodies.)
1222 if (defined($blosxom::version
)) {
1228 # Do the angle bracket song and dance:
1232 # Now, escape characters that are magic in Markdown:
1233 s! \* !$g_escape_table{'*'}!ogx;
1234 s! _ !$g_escape_table{'_'}!ogx;
1235 s! { !$g_escape_table{'{'}!ogx;
1236 s! } !$g_escape_table{'}'}!ogx;
1237 s! \[ !$g_escape_table{'['}!ogx;
1238 s! \] !$g_escape_table{']'}!ogx;
1239 s! \\ !$g_escape_table{'\\'}!ogx;
1244 sub _DoItalicsAndBold
{
1245 my ($self, $text) = @_;
1247 # Handle at beginning of lines:
1248 $text =~ s
{ ^(\
*\
*|__
) (?
=\S
) (.+?
[*_
]*) (?
<=\S
) \
1 }
1249 {<strong
>$2</strong
>}gsx
;
1251 $text =~ s
{ ^(\
*|_
) (?
=\S
) (.+?
) (?
<=\S
) \
1 }
1254 # <strong> must go first:
1255 $text =~ s
{ (?
<=\W
) (\
*\
*|__
) (?
=\S
) (.+?
[*_
]*) (?
<=\S
) \
1 }
1256 {<strong
>$2</strong
>}gsx
;
1258 $text =~ s
{ (?
<=\W
) (\
*|_
) (?
=\S
) (.+?
) (?
<=\S
) \
1 }
1261 # And now, a second pass to catch nested strong and emphasis special cases
1262 $text =~ s
{ (?
<=\W
) (\
*\
*|__
) (?
=\S
) (.+?
[*_
]*) (?
<=\S
) \
1 }
1263 {<strong
>$2</strong
>}gsx
;
1265 $text =~ s
{ (?
<=\W
) (\
*|_
) (?
=\S
) (.+?
) (?
<=\S
) \
1 }
1271 sub _DoBlockQuotes
{
1272 my ($self, $text) = @_;
1275 ( # Wrap whole match in $1
1277 ^[ \t]*>[ \t]?
# '>' at the start of a line
1278 .+\n # rest of the first line
1279 (.+\n)* # subsequent consecutive lines
1285 $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
1286 $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
1287 $bq = $self->_RunBlockGamut($bq); # recurse
1290 # These leading spaces screw with <pre> content, so we need to fix that:
1299 "<blockquote>\n$bq\n</blockquote>\n\n";
1306 sub _FormParagraphs
{
1309 # $text - string to process with html <p> tags
1311 my ($self, $text) = @_;
1313 # Strip leading and trailing lines:
1317 my @grafs = split(/\n{2,}/, $text);
1323 unless (defined( $self->{_html_blocks
}{$_} )) {
1324 $_ = $self->_RunSpanGamut($_);
1325 s/^([ \t]*)/<p> /;
1331 # Unhashify HTML blocks
1334 if (defined( $self->{_html_blocks
}{$_} )) {
1335 $_ = $self->{_html_blocks
}{$_};
1339 return join "\n\n", @grafs;
1342 sub _EncodeAmpsAndAngles
{
1343 # Smart processing for ampersands and angle brackets that need to be encoded.
1345 my ($self, $text) = @_;
1346 return '' if (!defined $text or !length $text);
1348 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1349 # http://bumppo.net/projects/amputator/
1350 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g;
1353 $text =~ s{<(?![a-z/?\$!])}{<}gi;
1355 # And >'s - added by Fletcher Penney
1356 # $text =~ s{>(?![a-z/?\$!])}{>}gi;
1357 # Causes problems...
1359 # Remove encoding inside comments
1361 (?
<=<!--) # Begin comment
1362 (.*?
) # Anything inside
1363 (?
=-->) # End comments
1374 sub _EncodeBackslashEscapes
{
1376 # Parameter: String.
1377 # Returns: The string, with after processing the following backslash
1383 s! \\\\ !$g_escape_table{'\\'}!ogx; # Must process escaped backslashes first.
1384 s! \\` !$g_escape_table{'`'}!ogx;
1385 s! \\\* !$g_escape_table{'*'}!ogx;
1386 s! \\_ !$g_escape_table{'_'}!ogx;
1387 s! \\\{ !$g_escape_table{'{'}!ogx;
1388 s! \\\} !$g_escape_table{'}'}!ogx;
1389 s! \\\[ !$g_escape_table{'['}!ogx;
1390 s! \\\] !$g_escape_table{']'}!ogx;
1391 s! \\\( !$g_escape_table{'('}!ogx;
1392 s! \\\) !$g_escape_table{')'}!ogx;
1393 s! \\> !$g_escape_table{'>'}!ogx;
1394 s! \\\# !$g_escape_table{'#'}!ogx;
1395 s! \\\+ !$g_escape_table{'+'}!ogx;
1396 s! \\\- !$g_escape_table{'-'}!ogx;
1397 s! \\\. !$g_escape_table{'.'}!ogx;
1398 s{ \\! }{$g_escape_table{'!'}}ogx
;
1404 my ($self, $text) = @_;
1406 $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
1408 # Email addresses: <address@domain.foo>
1415 [-a
-z0
-9]+(\
.[-a
-z0
-9]+)*\
.[a
-z
]+
1419 $self->_EncodeEmailAddress( $self->_UnescapeSpecialChars($1) );
1425 sub _EncodeEmailAddress
{
1427 # Input: an email address, e.g. "foo@example.com"
1429 # Output: the email address as a mailto link, with each character
1430 # of the address encoded as either a decimal or hex entity, in
1431 # the hopes of foiling most address harvesting spam bots. E.g.:
1433 # <a href="mailto:foo@e
1434 # xample.com">foo
1435 # @example.com</a>
1437 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1438 # mailing list: <http://tinyurl.com/yu7ue>
1441 my ($self, $addr) = @_;
1444 sub { '&#' . ord(shift) . ';' },
1445 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1449 $addr = "mailto:" . $addr;
1453 if ( $char eq '@' ) {
1454 # this *must* be encoded. I insist.
1455 $char = $encode[int rand 1]->($char);
1457 elsif ( $char ne ':' ) {
1458 # leave ':' alone (to spot mailto: later)
1460 # roughly 10% raw, 45% hex, 45% dec
1462 $r > .9 ?
$encode[2]->($char) :
1463 $r < .45 ?
$encode[1]->($char) :
1470 $addr = qq{<a href
="$addr">$addr</a
>};
1471 $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1476 sub _UnescapeSpecialChars
{
1478 # Swap back in all the special characters we've hidden.
1480 my ($self, $text) = @_;
1482 while( my($char, $hash) = each(%g_escape_table) ) {
1483 $text =~ s/$hash/$char/g;
1490 # Parameter: String containing HTML markup.
1491 # Returns: Reference to an array of the tokens comprising the input
1492 # string. Each token is either a tag (possibly with nested,
1493 # tags contained therein, such as <a href="<MTFoo>">, or a
1494 # run of text between tags. Each element of the array is a
1495 # two-element array; the first is either 'tag' or 'text';
1496 # the second is the actual value.
1499 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1500 # <http://www.bradchoate.com/past/mtregex.php>
1503 my ($self, $str) = @_;
1505 my $len = length $str;
1509 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x
$depth) . (')*>)' x
$depth);
1510 my $match = qr
/(?s
: <! ( -- .*?
-- \s
* )+ > ) | # comment
1511 (?s
: <\? .*?
\?> ) | # processing instruction
1512 $nested_tags/iox
; # nested tags
1514 while ($str =~ m/($match)/og) {
1516 my $sec_start = pos $str;
1517 my $tag_start = $sec_start - length $whole_tag;
1518 if ($pos < $tag_start) {
1519 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1521 push @tokens, ['tag', $whole_tag];
1524 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1530 # Remove one level of line-leading tabs or spaces
1532 my ($self, $text) = @_;
1534 $text =~ s/^(\t|[ ]{1,$self->{tab_width}})//gm;
1540 # Cribbed from a post by Bart Lateur:
1541 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
1543 my ($self, $text) = @_;
1545 # FIXME - Better anchor/regex would be quicker.
1548 #$text =~ s{(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width}))}ge;
1550 # Much swifter, but pretty hateful:
1551 do {} while ($text =~ s{^(.*?)\t}{$1.(' ' x ($self->{tab_width} - length($1) % $self->{tab_width
}))}mge
);
1555 sub _ConvertCopyright
{
1556 my ($self, $text) = @_;
1557 # Convert to an XML compatible form of copyright symbol
1559 $text =~ s/©/©/gi;
1568 =head1 OTHER IMPLEMENTATIONS
1570 Markdown has been re-implemented in a number of languages, and with a number of additions.
1572 Those that I have found are listed below:
1576 =item C - <http://www.pell.portland.or.us/~orc/Code/discount>
1578 Discount - Original Markdown, but in C. Fastest implementation available, and passes MDTest.
1579 Adds it's own set of custom features.
1581 =item python - <http://www.freewisdom.org/projects/python-markdown/>
1583 Python Markdown which is mostly compatible with the original, with an interesting extension API.
1585 =item ruby (maruku) - <http://maruku.rubyforge.org/>
1587 One of the nicest implementations out there. Builds a parse tree internally so very flexible.
1589 =item php - <http://michelf.com/projects/php-markdown/>
1591 A direct port of Markdown.pl, also has a separately maintained 'extra' version,
1592 which adds a number of features that were borrowed by MultiMarkdown.
1594 =item lua - <http://www.frykholm.se/files/markdown.lua>
1596 Port to lua. Simple and lightweight (as lua is).
1598 =item haskell - <http://johnmacfarlane.net/pandoc/>
1600 Pandoc is a more general library, supporting Markdown, reStructuredText, LaTeX and more.
1602 =item javascript - <http://www.attacklab.net/showdown-gui.html>
1604 Direct(ish) port of Markdown.pl to JavaScript
1610 To file bug reports or feature requests please send email to:
1612 bug-Text-Markdown@rt.cpan.org
1614 Please include with your report: (1) the example input; (2) the output
1615 you expected; (3) the output Markdown actually produced.
1617 =head1 VERSION HISTORY
1619 See the Changes file for detailed release notes for this version.
1624 http://daringfireball.net/
1626 PHP port and other contributions by Michel Fortin
1629 MultiMarkdown changes by Fletcher Penney
1630 http://fletcher.freeshell.org/
1632 CPAN Module Text::MultiMarkdown (based on Text::Markdown by Sebastian
1633 Riedel) originally by Darren Kulp (http://kulp.ch/)
1635 This module is maintained by: Tomas Doran http://www.bobtfish.net/
1637 =head1 THIS DISTRIBUTION
1639 Please note that this distribution is a fork of John Gruber's original Markdown project,
1640 and it *is not* in any way blessed by him.
1642 Whilst this code aims to be compatible with the original Markdown.pl (and incorporates
1643 and passes the Markdown test suite) whilst fixing a number of bugs in the original -
1644 there may be differences between the behaviour of this module and Markdown.pl. If you find
1645 any differences where you believe Text::Markdown behaves contrary to the Markdown spec,
1646 please report them as bugs.
1648 Text::Markdown *does not* extend the markdown dialect in any way from that which is documented at
1649 daringfireball. If you want additional features, you should look at L<Text::MultiMarkdown>.
1651 =head1 COPYRIGHT AND LICENSE
1653 Original Code Copyright (c) 2003-2004 John Gruber
1654 <http://daringfireball.net/>
1655 All rights reserved.
1657 MultiMarkdown changes Copyright (c) 2005-2006 Fletcher T. Penney
1658 <http://fletcher.freeshell.org/>
1659 All rights reserved.
1661 Text::MultiMarkdown changes Copyright (c) 2006-2008 Darren Kulp
1662 <http://kulp.ch> and Tomas Doran <http://www.bobtfish.net>
1664 Redistribution and use in source and binary forms, with or without
1665 modification, are permitted provided that the following conditions are
1668 * Redistributions of source code must retain the above copyright notice,
1669 this list of conditions and the following disclaimer.
1671 * Redistributions in binary form must reproduce the above copyright
1672 notice, this list of conditions and the following disclaimer in the
1673 documentation and/or other materials provided with the distribution.
1675 * Neither the name "Markdown" nor the names of its contributors may
1676 be used to endorse or promote products derived from this software
1677 without specific prior written permission.
1679 This software is provided by the copyright holders and contributors "as
1680 is" and any express or implied warranties, including, but not limited
1681 to, the implied warranties of merchantability and fitness for a
1682 particular purpose are disclaimed. In no event shall the copyright owner
1683 or contributors be liable for any direct, indirect, incidental, special,
1684 exemplary, or consequential damages (including, but not limited to,
1685 procurement of substitute goods or services; loss of use, data, or
1686 profits; or business interruption) however caused and on any theory of
1687 liability, whether in contract, strict liability, or tort (including
1688 negligence or otherwise) arising in any way out of the use of this
1689 software, even if advised of the possibility of such damage.