Markdown.pl: allow documented single-quote ref titles
[markdown.git] / Markdown.pl
blob3813b278105cee92df5074ab5f43a561da25ed5c
1 #!/usr/bin/env perl
4 # Markdown -- A text-to-HTML conversion tool for web writers
6 # Copyright (C) 2004 John Gruber
7 # Copyright (C) 2015,2016,2017,2018,2019 Kyle J. McKay
8 # All rights reserved.
9 # License is Modified BSD (aka 3-clause BSD) License\n";
10 # See LICENSE file (or <https://opensource.org/licenses/BSD-3-Clause>)
13 package Markdown;
15 require 5.008;
16 use strict;
17 use warnings;
19 use Encode;
21 use vars qw($COPYRIGHT $VERSION @ISA @EXPORT_OK);
23 BEGIN {*COPYRIGHT =
24 \"Copyright (C) 2004 John Gruber
25 Copyright (C) 2015,2016,2017,2018,2019 Kyle J. McKay
26 All rights reserved.
28 *VERSION = \"1.1.8-PRE"
31 require Exporter;
32 use Digest::MD5 qw(md5 md5_hex);
33 use File::Basename qw(basename);
34 use Scalar::Util qw(refaddr looks_like_number);
35 use Pod::Usage;
36 @ISA = qw(Exporter);
37 @EXPORT_OK = qw(Markdown);
38 $INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'};
40 close(DATA) if fileno(DATA);
41 exit(&_main(@ARGV)||0) unless caller;
43 my $encoder;
44 BEGIN {
45 $encoder = Encode::find_encoding('Windows-1252') ||
46 Encode::find_encoding('ISO-8859-1') or
47 die "failed to load ISO-8859-1 encoder\n";
51 # Global default settings:
53 my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width, $g_tab_width);
54 BEGIN {
55 $g_style_prefix = "_markdown-"; # Prefix for markdown css class styles
56 $g_empty_element_suffix = " />"; # Change to ">" for HTML output
57 $g_indent_width = 4; # Number of spaces considered new level
58 $g_tab_width = 4; # Legacy even though it's wrong
63 # Globals:
66 # Style sheet template
67 my $g_style_sheet;
69 # Permanent block id table
70 my %g_perm_block_ids;
72 # Global hashes, used by various utility routines
73 my %g_urls;
74 my %g_titles;
75 my %g_anchors;
76 my %g_anchors_id;
77 my %g_block_ids;
78 my %g_code_block_ids;
79 my %g_html_blocks;
80 my %g_code_blocks;
81 my %opt;
83 # Return a "block id" to use to identify the block that does not contain
84 # any characters that could be misinterpreted by the rest of the code
85 # Originally this used md5_hex but that's unnecessarily slow
86 # Instead just use the refaddr of the scalar ref of the entry for that
87 # key in either the global or, if the optional second argument is true,
88 # permanent table. To avoid the result being confused with anything
89 # else, it's prefixed with a control character and suffixed with another
90 # both of which are not allowed by the XML standard or Unicode.
91 sub block_id {
92 $_[1] or return "\5".refaddr(\$g_block_ids{$_[0]})."\6";
93 $_[1] == 1 and return "\2".refaddr(\$g_perm_block_ids{$_[0]})."\3";
94 $_[1] == 2 and return "\25".refaddr(\$g_code_block_ids{$_[0]})."\26";
95 die "programmer error: bad block_id type $_[1]";
98 # Regex to match balanced [brackets]. See Friedl's
99 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
100 my $g_nested_brackets;
101 BEGIN {
102 $g_nested_brackets = qr{
103 (?> # Atomic matching
104 [^\[\]]+ # Anything other than brackets
107 (??{ $g_nested_brackets }) # Recursive set of nested brackets
114 # Table of hash values for escaped characters:
115 my %g_escape_table;
116 BEGIN {
117 $g_escape_table{""} = "\2\3";
118 foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:") {
119 $g_escape_table{$char} = block_id($char,1);
123 # Used to track when we're inside an ordered or unordered list
124 # (see _ProcessListItems() for details):
125 my $g_list_level;
126 BEGIN {
127 $g_list_level = 0;
131 #### Blosxom plug-in interface ##########################################
132 my $_haveBX;
133 BEGIN {
134 no warnings 'once';
135 $_haveBX = defined($blosxom::version);
138 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
139 # which posts Markdown should process, using a "meta-markup: markdown"
140 # header. If it's set to 0 (the default), Markdown will process all
141 # entries.
142 my $g_blosxom_use_meta;
143 BEGIN {
144 $g_blosxom_use_meta = 0;
147 sub start { 1; }
148 sub story {
149 my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
151 if ((! $g_blosxom_use_meta) or
152 (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
154 $$body_ref = Markdown($$body_ref);
160 #### Movable Type plug-in interface #####################################
161 my $_haveMT = eval {require MT; 1;}; # Test to see if we're running in MT
162 my $_haveMT3 = $_haveMT && eval {require MT::Plugin; 1;}; # and MT >= MT 3.0.
164 unless ($_haveMT) {
165 require MT;
166 import MT;
167 require MT::Template::Context;
168 import MT::Template::Context;
170 unless ($_haveMT3) {
171 require MT::Plugin;
172 import MT::Plugin;
173 my $plugin = new MT::Plugin({
174 name => "Markdown",
175 description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
176 doc_link => 'http://daringfireball.net/projects/markdown/'
178 MT->add_plugin( $plugin );
181 MT::Template::Context->add_container_tag(MarkdownOptions => sub {
182 my $ctx = shift;
183 my $args = shift;
184 my $builder = $ctx->stash('builder');
185 my $tokens = $ctx->stash('tokens');
187 if (defined ($args->{'output'}) ) {
188 $ctx->stash('markdown_output', lc $args->{'output'});
191 defined (my $str = $builder->build($ctx, $tokens) )
192 or return $ctx->error($builder->errstr);
193 $str; # return value
196 MT->add_text_filter('markdown' => {
197 label => 'Markdown',
198 docs => 'http://daringfireball.net/projects/markdown/',
199 on_format => sub {
200 my $text = shift;
201 my $ctx = shift;
202 my $raw = 0;
203 if (defined $ctx) {
204 my $output = $ctx->stash('markdown_output');
205 if (defined $output && $output =~ m/^html/i) {
206 $g_empty_element_suffix = ">";
207 $ctx->stash('markdown_output', '');
209 elsif (defined $output && $output eq 'raw') {
210 $raw = 1;
211 $ctx->stash('markdown_output', '');
213 else {
214 $raw = 0;
215 $g_empty_element_suffix = " />";
218 $text = $raw ? $text : Markdown($text);
219 $text;
223 # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
224 my $smartypants;
227 no warnings "once";
228 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
231 if ($smartypants) {
232 MT->add_text_filter('markdown_with_smartypants' => {
233 label => 'Markdown With SmartyPants',
234 docs => 'http://daringfireball.net/projects/markdown/',
235 on_format => sub {
236 my $text = shift;
237 my $ctx = shift;
238 if (defined $ctx) {
239 my $output = $ctx->stash('markdown_output');
240 if (defined $output && $output eq 'html') {
241 $g_empty_element_suffix = ">";
243 else {
244 $g_empty_element_suffix = " />";
247 $text = Markdown($text);
248 $text = $smartypants->($text, '1');
254 sub _strip {
255 my $str = shift;
256 defined($str) or return undef;
257 $str =~ s/^\s+//;
258 $str =~ s/\s+$//;
259 $str =~ s/\s+/ /g;
260 $str;
263 #### BBEdit/command-line text filter interface ##########################
264 sub _main {
265 local *ARGV = \@_;
268 #### Check for command-line switches: #################
269 my %options = ();
270 my %cli_opts;
271 use Getopt::Long;
272 Getopt::Long::Configure(qw(bundling require_order pass_through));
273 GetOptions(\%cli_opts,
274 'help','h',
275 'version|V',
276 'shortversion|short-version|s',
277 'html4tags',
278 'deprecated',
279 'htmlroot|r=s',
280 'imageroot|i=s',
281 'tabwidth|tab-width=s',
282 'stylesheet|style-sheet',
283 'no-stylesheet|no-style-sheet',
284 'stub',
286 if ($cli_opts{'help'}) {
287 pod2usage(-verbose => 2, -exitval => 0);
289 if ($cli_opts{'h'}) {
290 pod2usage(-verbose => 0, -exitval => 0);
292 if ($cli_opts{'version'}) { # Version info
293 print "\nThis is Markdown, version $VERSION.\n", $COPYRIGHT;
294 print "License is Modified BSD (aka 3-clause BSD) License\n";
295 print "<https://opensource.org/licenses/BSD-3-Clause>\n";
296 exit 0;
298 if ($cli_opts{'shortversion'}) { # Just the version number string.
299 print $VERSION;
300 exit 0;
302 my $stub = 0;
303 if ($cli_opts{'stub'}) {
304 $stub = 1;
306 if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
307 $options{empty_element_suffix} = ">";
308 $stub = -$stub;
310 if ($cli_opts{'deprecated'}) { # Allow <dir> and <menu> tags to pass through
311 _SetAllowedTag("dir");
312 _SetAllowedTag("menu");
314 if ($cli_opts{'tabwidth'}) {
315 my $tw = $cli_opts{'tabwidth'};
316 die "invalid tab width (must be integer)\n" unless looks_like_number $tw;
317 die "invalid tab width (must be >= 2 and <= 32)\n" unless $tw >= 2 && $tw <= 32;
318 $options{tab_width} = int(0+$tw);
320 if ($cli_opts{'htmlroot'}) { # Use URL prefix
321 $options{url_prefix} = $cli_opts{'htmlroot'};
323 if ($cli_opts{'imageroot'}) { # Use image URL prefix
324 $options{img_prefix} = $cli_opts{'imageroot'};
326 if ($cli_opts{'stylesheet'}) { # Display the style sheet
327 $options{show_styles} = 1;
329 if ($cli_opts{'no-stylesheet'}) { # Do not display the style sheet
330 $options{show_styles} = 0;
332 $options{show_styles} = 1 if $stub && !defined($options{show_styles});
333 $options{tab_width} = 8 unless defined($options{tab_width});
335 my $hdr = sub {
336 if ($stub > 0) {
337 print <<'HTML5';
338 <!DOCTYPE html>
339 <html xmlns="http://www.w3.org/1999/xhtml">
340 <head>
341 <meta charset="utf-8" />
342 <meta http-equiv="content-type" content="text/html; charset=utf-8" />
343 HTML5
344 } elsif ($stub < 0) {
345 print <<'HTML4';
346 <html>
347 <head>
348 <meta charset="utf-8">
349 <meta http-equiv="content-type" content="text/html; charset=utf-8">
350 HTML4
352 if ($stub && ($options{title} || $options{h1})) {
353 my $title = $options{title};
354 defined($title) && $title ne "" or $title = $options{h1};
355 if (defined($title) && $title ne "") {
356 $title =~ s/&/&amp;/g;
357 $title =~ s/</&lt;/g;
358 print "<title>$title</title>\n";
361 if ($options{show_styles}) {
362 my $stylesheet = $g_style_sheet;
363 $stylesheet =~ s/%\(base\)/$g_style_prefix/g;
364 print $stylesheet;
366 if ($stub) {
367 print "</head>\n<body style=\"text-align:center\">\n",
368 "<div style=\"display:inline-block;text-align:left;max-width:42pc\">\n";
372 #### Process incoming text: ###########################
373 my $didhdr;
374 for (;;) {
375 local $_;
377 local $/; # Slurp the whole file
378 $_ = <>;
380 defined($_) or last;
381 my $result = Markdown($_, \%options);
382 if ($result ne "") {
383 if (!$didhdr) {
384 &$hdr();
385 $didhdr = 1;
387 print $result;
390 &$hdr() unless $didhdr;
391 print "</div>\n</body>\n</html>\n" if $stub;
393 exit 0;
397 sub Markdown {
399 # Primary function. The order in which other subs are called here is
400 # essential. Link and image substitutions need to happen before
401 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
402 # and <img> tags get encoded.
404 my $_text = shift;
405 defined $_text or $_text='';
407 my $text;
408 if (Encode::is_utf8($_text) || utf8::decode($_text)) {
409 $text = $_text;
410 } else {
411 $text = $encoder->decode($_text, Encode::FB_DEFAULT);
413 $_text = undef;
415 # Any remaining arguments after the first are options; either a single
416 # hashref or a list of name, value paurs.
417 %opt = (
418 # set initial defaults
419 style_prefix => $g_style_prefix,
420 empty_element_suffix => $g_empty_element_suffix,
421 tab_width => $g_tab_width,
422 indent_width => $g_indent_width,
423 url_prefix => "", # Prefixed to non-absolute URLs
424 img_prefix => "", # Prefixed to non-absolute image URLs
426 my %args = ();
427 if (ref($_[0]) eq "HASH") {
428 %args = %{$_[0]};
429 } else {
430 %args = @_;
432 while (my ($k,$v) = each %args) {
433 $opt{$k} = $v;
436 # Clear the globals. If we don't clear these, you get conflicts
437 # from other articles when generating a page which contains more than
438 # one article (e.g. an index page that shows the N most recent
439 # articles):
440 %g_urls = ();
441 %g_titles = ();
442 %g_anchors = ();
443 %g_block_ids = ();
444 %g_code_block_ids = ();
445 %g_html_blocks = ();
446 %g_code_blocks = ();
447 $g_list_level = 0;
449 # Standardize line endings:
450 $text =~ s{\r\n}{\n}g; # DOS to Unix
451 $text =~ s{\r}{\n}g; # Mac to Unix
453 # Make sure $text ends with a couple of newlines:
454 $text .= "\n\n";
456 # Handle backticks-delimited code blocks
457 $text = _HashBTCodeBlocks($text);
459 # Convert all tabs to spaces.
460 $text = _Detab($text);
462 # Strip any lines consisting only of spaces.
463 # This makes subsequent regexen easier to write, because we can
464 # match consecutive blank lines with /\n+/ instead of something
465 # contorted like / *\n+/ .
466 $text =~ s/^ +$//mg;
468 # Turn block-level HTML blocks into hash entries
469 $text = _HashHTMLBlocks($text);
471 # Strip link definitions, store in hashes.
472 $text = _StripLinkDefinitions($text);
474 $text = _RunBlockGamut($text, 1);
476 # Remove indentation markers
477 $text =~ s/\027+//gs;
479 # Unhashify code blocks
480 $text =~ s/(\025\d+\026)/$g_code_blocks{$1}/g;
482 $text = _UnescapeSpecialChars($text);
484 $text .= "\n" unless $text eq "";
486 utf8::encode($text);
487 if (defined($opt{h1}) && $opt{h1} ne "" && ref($_[0]) eq "HASH") {
488 utf8::encode($opt{h1});
489 ${$_[0]}{h1} = $opt{h1}
491 return $text;
495 sub _HashBTCodeBlocks {
497 # Process Markdown backticks (```) delimited code blocks
499 my $text = shift;
500 my $less_than_indent = $opt{indent_width} - 1;
502 $text =~ s{
503 (?:(?<=\n)|\A)
504 ([ ]{0,$less_than_indent})``(`+)[ \t]*(?:([\w.+-]+)[ \t]*)?\n
505 ( # $4 = the code block -- one or more lines, starting with ```
507 .*\n+
510 # and ending with ``` or end of document
511 (?:(?:[ ]{0,$less_than_indent}``\2[ \t]*(?:\n|\Z))|\Z)
513 # $2 contains syntax highlighting to use if defined
514 my $leadsp = length($1);
515 my $codeblock = $4;
516 $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
517 $codeblock = _Detab($codeblock, 8); # physical tab stops are always 8
518 $codeblock =~ s/\A\n+//; # trim leading newlines
519 $codeblock =~ s/\s+\z//; # trim trailing whitespace
520 $codeblock =~ s/^ {1,$leadsp}//mg if $leadsp; # trim leading space(s)
521 $codeblock = _EncodeCode($codeblock); # or run highlighter here
522 $codeblock = "<div class=\"$opt{style_prefix}code-bt\"><pre style=\"display:none\"></pre><pre><code>"
523 . $codeblock . "\n</code></pre></div>";
525 my $key = block_id($codeblock);
526 $g_html_blocks{$key} = $codeblock;
527 "\n\n" . $key . "\n\n";
528 }egmx;
530 return $text;
534 sub _StripLinkDefinitions {
536 # Strips link definitions from text, stores the URLs and titles in
537 # hash references.
539 my $text = shift;
540 my $less_than_indent = $opt{indent_width} - 1;
542 # Link defs are in the form: ^[id]: url "optional title"
543 while ($text =~ s{
544 ^[ ]{0,$less_than_indent}\[(.+)\]: # id = $1
545 [ ]*
546 \n? # maybe *one* newline
547 [ ]*
548 <?((?:\S(?:\\\n\s*[^\s"(])?)+?)>? # url = $2
549 [ ]*
550 \n? # maybe one newline
551 [ ]*
553 (?<=\s) # lookbehind for whitespace
554 (?:(['"])|(\()) # title quote char
555 (.+?) # title = $5
556 (?(4)\)|\3) # match same quote
557 [ ]*
558 )? # title is optional
559 (?:\n+|\Z)
561 {}mx) {
562 my $id = _strip(lc $1); # Link IDs are case-insensitive
563 my $url = $2;
564 my $title = _strip($5);
565 $url =~ s/\\\n\s*//gs;
566 if ($id ne "") {
567 $g_urls{$id} = _EncodeAmpsAndAngles($url);
568 if (defined($title) && $title ne "") {
569 $g_titles{$id} = $title;
570 $g_titles{$id} =~ s/\042/&quot;/g;
575 return $text;
578 my ($block_tags_a, $block_tags_b);
579 BEGIN {
580 $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/o;
581 $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/o;
584 sub _HashHTMLBlocks {
585 my $text = shift;
586 my $less_than_indent = $opt{indent_width} - 1;
587 my $idt = "\027" x $g_list_level;
589 # Hashify HTML blocks:
590 # We only want to do this for block-level HTML tags, such as headers,
591 # lists, and tables. That's because we still want to wrap <p>s around
592 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
593 # phrase emphasis, and spans. The list of tags we're looking for is
594 # hard-coded:
596 # First, look for nested blocks, e.g.:
597 # <div>
598 # <div>
599 # tags for inner block must be indented.
600 # </div>
601 # </div>
603 # The outermost tags must start at the left margin for this to match, and
604 # the inner nested divs must be indented.
605 # We need to do this before the next, more liberal match, because the next
606 # match will start at the first `<div>` and stop at the first `</div>`.
607 $text =~ s{
608 ( # save in $1
609 ^ # start of line (with /m)
610 ((?:\Q$idt\E)?) # optional lead in = $2
611 <($block_tags_a) # start tag = $3
612 \b # word break
613 (?:.*\n)*? # any number of lines, minimally matching
614 \2</\3> # the matching end tag
615 [ ]* # trailing spaces
616 (?=\n+|\Z) # followed by a newline or end of document
619 my $key = block_id($1);
620 $g_html_blocks{$key} = $1;
621 "\n\n" . $key . "\n\n";
622 }egmx;
626 # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
628 $text =~ s{
629 ( # save in $1
630 ^ # start of line (with /m)
631 (?:\Q$idt\E)? # optional lead in
632 <($block_tags_b) # start tag = $2
633 \b # word break
634 (?:.*\n)*? # any number of lines, minimally matching
635 .*</\2> # the matching end tag
636 [ ]* # trailing spaces
637 (?=\n+|\Z) # followed by a newline or end of document
640 my $key = block_id($1);
641 $g_html_blocks{$key} = $1;
642 "\n\n" . $key . "\n\n";
643 }egmx;
644 # Special case just for <hr />. It was easier to make a special case than
645 # to make the other regex more complicated.
646 $text =~ s{
648 (?<=\n\n) # Starting after a blank line
649 | # or
650 \A\n? # the beginning of the doc
652 ( # save in $1
653 [ ]{0,$less_than_indent}
654 <(?:hr) # start tag
655 \b # word break
656 (?:[^<>])*? #
657 /?> # the matching end tag
658 [ ]*
659 (?=\n{2,}|\Z) # followed by a blank line or end of document
662 my $key = block_id($1);
663 $g_html_blocks{$key} = $1;
664 "\n\n" . $key . "\n\n";
665 }egx;
667 # Special case for standalone HTML comments:
668 $text =~ s{
670 (?<=\n\n) # Starting after a blank line
671 | # or
672 \A\n? # the beginning of the doc
674 ( # save in $1
675 [ ]{0,$less_than_indent}
676 (?s:
677 <!--
678 (?:[^-]|(?:-(?!-)))*
681 [ ]*
682 (?=\n{1,}|\Z) # followed by end of line or end of document
685 my $key = block_id($1);
686 $g_html_blocks{$key} = $1;
687 "\n\n" . $key . "\n\n";
688 }egx;
691 return $text;
695 sub _RunBlockGamut {
697 # These are all the transformations that form block-level
698 # tags like paragraphs, headers, and list items.
700 my ($text, $anchors) = @_;
702 $text = _DoHeaders($text, $anchors);
704 # Do Horizontal Rules:
705 $text =~ s{^ {0,3}\*(?: {0,2}\*){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
706 $text =~ s{^ {0,3}\_(?: {0,2}\_){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
707 $text =~ s{^ {0,3}\-(?: {0,2}\-){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
709 $text = _DoLists($text);
711 $text = _DoCodeBlocks($text);
713 $text = _DoBlockQuotes($text);
715 $text = _DoTables($text);
717 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
718 # was to escape raw HTML in the original Markdown source. This time,
719 # we're escaping the markup we've just created, so that we don't wrap
720 # <p> tags around block-level tags.
721 $text = _HashHTMLBlocks($text);
723 $text = _FormParagraphs($text);
725 return $text;
729 sub _RunSpanGamut {
731 # These are all the transformations that occur *within* block-level
732 # tags like paragraphs, headers, and list items.
734 my $text = shift;
736 $text = _DoCodeSpans($text);
738 $text = _EscapeSpecialChars($text);
740 # Process anchor and image tags. Images must come first,
741 # because ![foo][f] looks like an anchor.
742 $text = _DoImages($text);
743 $text = _DoAnchors($text);
745 # Make links out of things like `<http://example.com/>`
746 # Must come after _DoAnchors(), because you can use < and >
747 # delimiters in inline links like [this](<url>).
748 $text = _DoAutoLinks($text);
750 $text = _EncodeAmpsAndAngles($text);
752 $text = _DoItalicsAndBoldAndStrike($text);
754 # Do hard breaks:
755 $text =~ s/ {2,}\n/<br$opt{empty_element_suffix}\n/g;
757 return $text;
761 sub _EscapeSpecialChars {
762 my $text = shift;
763 my $tokens ||= _TokenizeHTML($text);
765 $text = ''; # rebuild $text from the tokens
766 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
767 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
769 foreach my $cur_token (@$tokens) {
770 if ($cur_token->[0] eq "tag") {
771 # Within tags, encode *, _ and ~ so they don't conflict
772 # with their use in Markdown for italics and strong.
773 # We're replacing each such character with its
774 # corresponding block id value; this is likely
775 # overkill, but it should prevent us from colliding
776 # with the escape values by accident.
777 $cur_token->[1] =~ s!([*_~])!$g_escape_table{$1}!g;
778 $text .= $cur_token->[1];
779 } else {
780 my $t = $cur_token->[1];
781 $t = _EncodeBackslashEscapes($t);
782 $text .= $t;
785 return $text;
789 sub _ProcessWikiLink {
790 my ($link_text, $link_loc) = @_;
791 if (defined($link_loc) && $link_loc =~ m{^(?:http|ftp)s?://\S+$}i) {
792 # Just rewrite it to [...](...) form
793 return "[".$link_text."](".$link_loc.")";
795 if (defined($link_loc)) {
796 # We don't handle any other kind of "bar" links yet
797 return undef;
799 if ($link_text =~ m{^(?:http|ftp)s?://\S+$}i) {
800 # Just rewrite it to [...](...) form
801 return "[".$link_text."](".$link_text.")";
803 # We don't handle any other wiki-style links yet
804 return undef;
808 # Return a suitably encoded <a...> tag string
809 # On input NONE of $url, $text or $title should be xmlencoded
810 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
811 sub _MakeATag {
812 my ($url, $text, $title) = @_;
813 defined($url) or $url="";
814 defined($text) or $text="";
815 defined($title) or $title="";
817 my $result = "<a href=\"" . _EncodeAttText($url) . "\"";
818 $title = _strip($title);
819 $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne "";
820 return $result . ">" . $text . "</a>";
824 sub _DoAnchors {
826 # Turn Markdown link shortcuts into XHTML <a> tags.
828 my $text = shift;
831 # First, handle wiki-style links: [[wiki style link]]
833 $text =~ s{
834 ( # wrap whole match in $1
835 \[\[
836 ($g_nested_brackets) # link text and id = $2
837 \]\]
840 my $result;
841 my $whole_match = $1;
842 my $link_text = $2;
843 my $link_loc = undef;
845 if ($link_text =~ /^(.*)\|(.*)$/s) {
846 $link_text = $1;
847 $link_loc = $2;
850 $result = _ProcessWikiLink($link_text, $link_loc);
851 defined($result) or $result = $whole_match;
852 $result;
853 }xsge;
856 # Next, handle reference-style links: [link text] [id]
858 $text =~ s{
859 ( # wrap whole match in $1
861 ($g_nested_brackets) # link text = $2
864 [ ]? # one optional space
865 (?:\n[ ]*)? # one optional newline followed by spaces
868 (.*?) # id = $3
872 my $result;
873 my $whole_match = $1;
874 my $link_text = $2;
875 my $link_id = $3;
877 $link_id ne "" or $link_id = $link_text; # for shortcut links like [this][].
878 $link_id = _strip(lc $link_id);
880 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
881 my $url = $g_urls{$link_id};
882 $url = defined($url) ? _PrefixURL($url) : $g_anchors{$link_id};
883 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
884 $result = _MakeATag($url, $link_text, $g_titles{$link_id});
886 else {
887 $result = $whole_match;
889 $result;
890 }xsge;
893 # Subsequently, inline-style links: [link text](url "optional title")
895 $text =~ s{
896 ( # wrap whole match in $1
898 ($g_nested_brackets) # link text = $2
900 \( # literal paren
901 [ ]*
902 <?(.*?)>? # href = $3
903 [ ]*
904 ( # $4
905 (['\042]) # quote char = $5
906 (.*?) # Title = $6
907 \5 # matching quote
908 )? # title is optional
912 #my $result;
913 my $whole_match = $1;
914 my $link_text = $2;
915 my $url = $3;
916 my $title = $6;
918 if ($url =~ /^#\S/) {
919 my $idbase = _strip(lc(substr($url, 1)));
920 my $id = _MakeAnchorId($idbase);
921 if (defined($g_anchors_id{$id})) {
922 $url = $g_anchors_id{$id};
923 } else {
924 $idbase =~ s/-/_/gs;
925 $id = _MakeAnchorId($idbase);
926 if (defined($g_anchors_id{$id})) {
927 $url = $g_anchors_id{$id};
931 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
932 _MakeATag(_PrefixURL($url), $link_text, $title);
933 }xsge;
936 # Finally, handle reference-style implicit shortcut links: [link text]
938 $text =~ s{
939 ( # wrap whole match in $1
941 ($g_nested_brackets) # link text = $2
945 my $result;
946 my $whole_match = $1;
947 my $link_text = $2;
948 my $link_id = _strip(lc $2);
950 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
951 my $url = $g_urls{$link_id};
952 $url = defined($url) ? _PrefixURL($url) : $g_anchors{$link_id};
953 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
954 $result = _MakeATag($url, $link_text, $g_titles{$link_id});
956 else {
957 $result = $whole_match;
959 $result;
960 }xsge;
962 return $text;
966 # Return a suitably encoded <img...> tag string
967 # On input NONE of $url, $alt or $title should be xmlencoded
968 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
969 sub _MakeIMGTag {
970 my ($url, $alt, $title) = @_;
971 defined($url) or $url="";
972 defined($alt) or $alt="";
973 defined($title) or $title="";
974 return "" unless $url ne "";
976 my $result = "<img src=\"" . _EncodeAttText($url) . "\"";
977 my ($w, $h) = (0, 0);
978 ($alt, $title) = (_strip($alt), _strip($title));
979 if ($title =~ /^(.*)\(([1-9][0-9]*)[xX]([1-9][0-9]*)\)$/os) {
980 ($title, $w, $h) = (_strip($1), $2, $3);
981 } elsif ($title =~ /^(.*)\(\?[xX]([1-9][0-9]*)\)$/os) {
982 ($title, $h) = (_strip($1), $2);
983 } elsif ($title =~ /^(.*)\(([1-9][0-9]*)[xX]\?\)$/os) {
984 ($title, $w) = (_strip($1), $2);
986 $result .= " alt=\"" . _EncodeAttText($alt) . "\"" if $alt ne "";
987 $result .= " width=\"$w\"" if $w != 0;
988 $result .= " height=\"$h\"" if $h != 0;
989 $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne "";
990 $result .= $opt{empty_element_suffix};
991 return $result;
995 sub _DoImages {
997 # Turn Markdown image shortcuts into <img> tags.
999 my $text = shift;
1002 # First, handle reference-style labeled images: ![alt text][id]
1004 $text =~ s{
1005 ( # wrap whole match in $1
1007 (.*?) # alt text = $2
1010 [ ]? # one optional space
1011 (?:\n[ ]*)? # one optional newline followed by spaces
1014 (.*?) # id = $3
1019 my $result;
1020 my $whole_match = $1;
1021 my $alt_text = $2;
1022 my $link_id = $3;
1024 $link_id ne "" or $link_id = $alt_text; # for shortcut links like ![this][].
1025 $link_id = _strip(lc $link_id);
1027 if (defined $g_urls{$link_id}) {
1028 $result = _MakeIMGTag(
1029 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1031 else {
1032 # If there's no such link ID, leave intact:
1033 $result = $whole_match;
1036 $result;
1037 }xsge;
1040 # Next, handle inline images: ![alt text](url "optional title")
1041 # Don't forget: encode * and _
1043 $text =~ s{
1044 ( # wrap whole match in $1
1046 (.*?) # alt text = $2
1048 \( # literal paren
1049 [ ]*
1050 <?(\S+?)>? # src url = $3
1051 [ ]*
1052 ( # $4
1053 (['\042]) # quote char = $5
1054 (.*?) # title = $6
1055 \5 # matching quote
1056 [ ]*
1057 )? # title is optional
1061 #my $whole_match = $1;
1062 my $alt_text = $2;
1063 my $url = $3;
1064 my $title = '';
1065 if (defined($6)) {
1066 $title = $6;
1069 _MakeIMGTag(_PrefixURL($url), $alt_text, $title);
1070 }xsge;
1073 # Finally, handle reference-style implicitly labeled links: ![alt text]
1075 $text =~ s{
1076 ( # wrap whole match in $1
1078 (.*?) # alt text = $2
1082 my $result;
1083 my $whole_match = $1;
1084 my $alt_text = $2;
1085 my $link_id = lc(_strip($alt_text));
1087 if (defined $g_urls{$link_id}) {
1088 $result = _MakeIMGTag(
1089 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1091 else {
1092 # If there's no such link ID, leave intact:
1093 $result = $whole_match;
1096 $result;
1097 }xsge;
1099 return $text;
1102 sub _EncodeAttText {
1103 my $text = shift;
1104 defined($text) or return undef;
1105 $text = _HTMLEncode(_strip($text));
1106 # We've got to encode these to avoid conflicting
1107 # with italics, bold and strike through.
1108 $text =~ s!([*_~:])!$g_escape_table{$1}!g;
1109 return $text;
1113 sub _MakeAnchorId {
1114 use bytes;
1115 my $link = shift;
1116 $link = lc($link);
1117 $link =~ tr/-a-z0-9_/_/cs;
1118 return '' unless $link ne '';
1119 $link = "_".$link."_";
1120 $link =~ s/__+/_/gs;
1121 $link = "_".md5_hex($link)."_" if length($link) > 66;
1122 return $link;
1126 sub _GetNewAnchorId {
1127 my $link = _strip(lc(shift));
1128 return '' if defined($g_anchors{$link});
1129 my $id = _MakeAnchorId($link);
1130 return '' unless $id;
1131 $g_anchors{$link} = '#'.$id;
1132 $g_anchors_id{$id} = $g_anchors{$link};
1133 if ($id =~ /-/) {
1134 my $id2 = $id;
1135 $id2 =~ s/-/_/gs;
1136 $id2 =~ s/__+/_/gs;
1137 defined($g_anchors_id{$id2}) or $g_anchors_id{$id2} = $g_anchors{$link};
1139 $id;
1143 sub _DoHeaders {
1144 my ($text, $anchors) = @_;
1145 my $h1;
1146 my $geth1 = $anchors && !defined($opt{h1}) ? sub {
1147 return unless !defined($h1);
1148 my $h = shift;
1149 $h =~ s/^\s+//;
1150 $h =~ s/\s+$//;
1151 $h =~ s/\s+/ /g;
1152 $h1 = $h if $h ne "";
1153 } : sub {};
1155 # Setext-style headers:
1156 # Header 1
1157 # ========
1159 # Header 2
1160 # --------
1162 # Header 3
1163 # ~~~~~~~~
1165 $text =~ s{ ^(?:=+[ ]*\n)?[ ]*(.+?)[ ]*\n=+[ ]*\n+ }{
1166 my $h = $1;
1167 my $id = _GetNewAnchorId($h);
1168 &$geth1($h);
1169 $id = " id=\"$id\"" if $id ne "";
1170 "<h1$id>" . _RunSpanGamut($h) . "</h1>\n\n";
1171 }egmx;
1173 $text =~ s{ ^(?:-+[ ]*\n)?[ ]*(.+?)[ ]*\n-+[ ]*\n+ }{
1174 my $h = $1;
1175 my $id = _GetNewAnchorId($h);
1176 $id = " id=\"$id\"" if $id ne "";
1177 "<h2$id>" . _RunSpanGamut($h) . "</h2>\n\n";
1178 }egmx;
1180 $text =~ s{ ^(?:~+[ ]*\n)?[ ]*(.+?)[ ]*\n~+[ ]*\n+ }{
1181 my $h = $1;
1182 my $id = _GetNewAnchorId($h);
1183 $id = " id=\"$id\"" if $id ne "";
1184 "<h3$id>" . _RunSpanGamut($h) . "</h3>\n\n";
1185 }egmx;
1188 # atx-style headers:
1189 # # Header 1
1190 # ## Header 2
1191 # ## Header 2 with closing hashes ##
1192 # ...
1193 # ###### Header 6
1195 $text =~ s{
1196 ^(\#{1,6}) # $1 = string of #'s
1197 [ ]*
1198 (.+?) # $2 = Header text
1199 [ ]*
1200 \#* # optional closing #'s (not counted)
1203 my $h = $2;
1204 my $h_level = length($1);
1205 my $id = $h_level <= 3 ? _GetNewAnchorId($h) : '';
1206 &$geth1($h) if $h_level == 1;
1207 $id = " id=\"$id\"" if $id ne "";
1208 "<h$h_level$id>" . _RunSpanGamut($h) . "</h$h_level>\n\n";
1209 }egmx;
1211 $opt{h1} = $h1 if defined($h1) && $h1 ne "";
1212 return $text;
1216 my ($marker_ul, $marker_ol, $marker_any, $roman_numeral, $greek_lower);
1217 BEGIN {
1218 # Re-usable patterns to match list item bullets and number markers:
1219 $roman_numeral = qr/(?:
1220 [IiVvXx]|[Ii]{2,3}|[Ii][VvXx]|[VvXx][Ii]{1,3}|[Xx][Vv][Ii]{0,3}|
1221 [Xx][Ii][VvXx]|[Xx]{2}[Ii]{0,3}|[Xx]{2}[Ii]?[Vv]|[Xx]{2}[Vv][Ii]{1,2})/ox;
1222 $greek_lower = qr/(?:[\x{03b1}-\x{03c9}])/o;
1223 $marker_ul = qr/[*+-]/o;
1224 $marker_ol = qr/(?:\d+|[A-Za-z]|$roman_numeral|$greek_lower)[.\)]/o;
1225 $marker_any = qr/(?:$marker_ul|$marker_ol)/o;
1229 sub _GetListMarkerType {
1230 my ($list_type, $list_marker, $last_marker) = @_;
1231 return "" unless $list_type && $list_marker && lc($list_type) eq "ol";
1232 my $last_marker_type = '';
1233 $last_marker_type = _GetListMarkerType($list_type, $last_marker)
1234 if defined($last_marker) &&
1235 # these are roman unless $last_marker type case matches and is 'a' or 'A'
1236 $list_marker =~ /^[IiVvXx][.\)]?$/;
1237 return "I" if $list_marker =~ /^[IVX]/ && $last_marker_type ne 'A';
1238 return "i" if $list_marker =~ /^[ivx]/ && $last_marker_type ne 'a';
1239 return "A" if $list_marker =~ /^[A-Z]/;
1240 return "a" if $list_marker =~ /^[a-z]/ || $list_marker =~ /^$greek_lower/o;
1241 return "1";
1245 sub _GetListItemTypeClass {
1246 my ($list_type, $list_marker, $last_marker) = @_;
1247 my $list_marker_type = _GetListMarkerType($list_type, $list_marker, $last_marker);
1248 my $ans = &{sub{
1249 return "" unless length($list_marker) >= 2 && $list_marker_type =~ /^[IiAa1]$/;
1250 return "lower-greek" if $list_marker_type eq "a" && $list_marker =~ /^$greek_lower/o;
1251 return "" unless $list_marker =~ /\)$/;
1252 return "upper-roman" if $list_marker_type eq "I";
1253 return "lower-roman" if $list_marker_type eq "i";
1254 return "upper-alpha" if $list_marker_type eq "A";
1255 return "lower-alpha" if $list_marker_type eq "a";
1256 return "decimal";
1258 return ($list_marker_type, $ans);
1262 my %_roman_number_table;
1263 BEGIN {
1264 %_roman_number_table = (
1265 i => 1,
1266 ii => 2,
1267 iii => 3,
1268 iv => 4,
1269 v => 5,
1270 vi => 6,
1271 vii => 7,
1272 viii => 8,
1273 ix => 9,
1274 x => 10,
1275 xi => 11,
1276 xii => 12,
1277 xiii => 13,
1278 xiv => 14,
1279 xv => 15,
1280 xvi => 16,
1281 xvii => 17,
1282 xviii => 18,
1283 xix => 19,
1284 xx => 20,
1285 xxi => 21,
1286 xxii => 22,
1287 xxiii => 23,
1288 xxiv => 24,
1289 xxv => 25,
1290 xxvi => 26,
1291 xxvii => 27
1296 # Necessary because ς and σ are the same value grrr
1297 my %_greek_number_table;
1298 BEGIN {
1299 %_greek_number_table = (
1300 "\x{03b1}" => 1, # α
1301 "\x{03b2}" => 2, # β
1302 "\x{03b3}" => 3, # γ
1303 "\x{03b4}" => 4, # δ
1304 "\x{03b5}" => 5, # ε
1305 "\x{03b6}" => 6, # ζ
1306 "\x{03b7}" => 7, # η
1307 "\x{03b8}" => 8, # θ
1308 "\x{03b9}" => 9, # ι
1309 "\x{03ba}" => 10, # κ
1310 "\x{03bb}" => 11, # λ
1311 #"\x{00b5}"=> 12, # µ is "micro" not "mu"
1312 "\x{03bc}" => 12, # μ
1313 "\x{03bd}" => 13, # ν
1314 "\x{03be}" => 14, # ξ
1315 "\x{03bf}" => 15, # ο
1316 "\x{03c0}" => 16, # π
1317 "\x{03c1}" => 17, # ρ
1318 "\x{03c2}" => 18, # ς
1319 "\x{03c3}" => 18, # σ
1320 "\x{03c4}" => 19, # τ
1321 "\x{03c5}" => 20, # υ
1322 "\x{03c6}" => 21, # φ
1323 "\x{03c7}" => 22, # χ
1324 "\x{03c8}" => 23, # ψ
1325 "\x{03c9}" => 24 # ω
1330 sub _GetMarkerIntegerNum {
1331 my ($list_marker_type, $marker_val) = @_;
1332 my $ans = &{sub{
1333 return 0 + $marker_val if $list_marker_type eq "1";
1334 $list_marker_type = lc($list_marker_type);
1335 return $_greek_number_table{$marker_val}
1336 if $list_marker_type eq "a" &&
1337 defined($_greek_number_table{$marker_val});
1338 $marker_val = lc($marker_val);
1339 return ord($marker_val) - ord("a") + 1 if $list_marker_type eq "a";
1340 return 1 unless $list_marker_type eq "i";
1341 defined($_roman_number_table{$marker_val}) and
1342 return $_roman_number_table{$marker_val};
1343 return 1;
1345 return $ans if $ans == 0 && $list_marker_type eq "1";
1346 return $ans >= 1 ? $ans : 1;
1350 sub _IncrList {
1351 my ($from, $to, $extra) = @_;
1352 $extra = defined($extra) ? " $extra" : "";
1353 my $result = "";
1354 while ($from + 10 <= $to) {
1355 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-10\"></span>\n";
1356 $from += 10;
1358 while ($from + 5 <= $to) {
1359 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-5\"></span>\n";
1360 $from += 5;
1362 while ($from + 2 <= $to) {
1363 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-2\"></span>\n";
1364 $from += 2;
1366 while ($from < $to) {
1367 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr\"></span>\n";
1368 ++$from;
1370 return $result;
1374 sub _DoLists {
1376 # Form HTML ordered (numbered) and unordered (bulleted) lists.
1378 my $text = shift;
1379 my $indent = $opt{indent_width};
1380 my $less_than_indent = $indent - 1;
1381 my $less_than_double_indent = 2 * $indent - 1;
1383 # Re-usable pattern to match any entire ul or ol list:
1384 my $whole_list = qr{
1385 ( # $1 (or $_[0]) = whole list
1386 ( # $2 (or $_[1])
1387 (?:(?<=\n)|\A)
1388 [ ]{0,$less_than_indent}
1389 (${marker_any}) # $3 (or $_[2]) = first list item marker
1390 [ ]+
1392 (?s:.+?)
1393 ( # $4 (or $_[3])
1396 \n{2,}
1397 (?=\S)
1398 (?! # Negative lookahead for another list item marker
1399 ${marker_any}[ ]
1403 }mx;
1405 my $list_item_sub = sub {
1406 my $list = $_[0];
1407 my $list_type = ($_[2] =~ m/$marker_ul/) ? "ul" : "ol";
1408 my $list_att = "";
1409 my $list_class = "";
1410 my $list_incr = "";
1411 # Turn double returns into triple returns, so that we can make a
1412 # paragraph for the last item in a list, if necessary:
1413 $list =~ s/\n\n/\n\n\n/g;
1414 my ($result, $first_marker, $fancy) = _ProcessListItems($list_type, $list);
1415 my $list_marker_type = _GetListMarkerType($list_type, $first_marker);
1416 if ($list_marker_type) {
1417 $first_marker =~ s/[.\)]$//;
1418 my $first_marker_num = _GetMarkerIntegerNum($list_marker_type, $first_marker);
1419 $list_att = $list_marker_type eq "1" ? "" : " type=\"$list_marker_type\"";
1420 if ($fancy) {
1421 $list_class = " class=\"$opt{style_prefix}ol\"";
1422 my $start = $first_marker_num;
1423 $start = 10 if $start > 10;
1424 $start = 5 if $start > 5 && $start < 10;
1425 $start = 1 if $start > 1 && $start < 5;
1426 $list_att .= " start=\"$start\"" unless $start == 1;
1427 $list_incr = _IncrList($start, $first_marker_num);
1428 } else {
1429 $list_class = " class=\"$opt{style_prefix}lc-greek\""
1430 if $list_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
1431 $list_att .= " start=\"$first_marker_num\"" unless $first_marker_num == 1;
1434 my $idt = "\027" x $g_list_level;
1435 $result = "$idt<$list_type$list_att$list_class>\n$list_incr" . $result . "$idt</$list_type>\n\n";
1436 $result;
1439 # We use a different prefix before nested lists than top-level lists.
1440 # See extended comment in _ProcessListItems().
1442 # Note: (jg) There's a bit of duplication here. My original implementation
1443 # created a scalar regex pattern as the conditional result of the test on
1444 # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
1445 # substitution once, using the scalar as the pattern. This worked,
1446 # everywhere except when running under MT on my hosting account at Pair
1447 # Networks. There, this caused all rebuilds to be killed by the reaper (or
1448 # perhaps they crashed, but that seems incredibly unlikely given that the
1449 # same script on the same server ran fine *except* under MT. I've spent
1450 # more time trying to figure out why this is happening than I'd like to
1451 # admit. My only guess, backed up by the fact that this workaround works,
1452 # is that Perl optimizes the substition when it can figure out that the
1453 # pattern will never change, and when this optimization isn't on, we run
1454 # afoul of the reaper. Thus, the slightly redundant code to that uses two
1455 # static s/// patterns rather than one conditional pattern.
1457 # Note: (kjm) With the addition of the two-of-the-same-kind-in-a-row-
1458 # starts-a-list-at-the-top-level rule the two patterns really are somewhat
1459 # different now, but the duplication has pretty much been eliminated via
1460 # use of a separate sub which has the side-effect of making the below
1461 # two cases much easier to grok all at once.
1463 if ($g_list_level) {
1464 $text =~ s{
1466 $whole_list
1468 &$list_item_sub($1, $2, $3, $4);
1469 }egmx;
1471 else {
1472 $text =~ s{
1473 (?: (?<=\n\n) |
1474 \A\n? |
1475 (?:(?<=\n) # two of the same kind of marker lines
1476 (?=[ ]{0,$less_than_indent}$marker_ul[ ].*\n
1477 [ ]{0,$less_than_indent}$marker_ul[ ])) |
1478 (?:(?<=\n) # in a row will start a list
1479 (?=[ ]{0,$less_than_indent}$marker_ol[ ].*\n
1480 [ ]{0,$less_than_indent}$marker_ol[ ])) |
1481 (?:(?<=\n) # or any marker and a sublist marker
1482 (?=[ ]{0,$less_than_indent}$marker_any[ ].*\n
1483 [ ]{$indent,$less_than_double_indent}$marker_any[ ]))
1485 $whole_list
1487 &$list_item_sub($1, $2, $3, $4);
1488 }egmx;
1491 return $text;
1495 sub _ProcessListItems {
1497 # Process the contents of a single ordered or unordered list, splitting it
1498 # into individual list items.
1501 my $list_type = shift;
1502 my $list_str = shift;
1504 # The $g_list_level global keeps track of when we're inside a list.
1505 # Each time we enter a list, we increment it; when we leave a list,
1506 # we decrement. If it's zero, we're not in a list anymore.
1508 # We do this because when we're not inside a list, we want to treat
1509 # something like this:
1511 # I recommend upgrading to version
1512 # 8. Oops, now this line is treated
1513 # as a sub-list.
1515 # As a single paragraph, despite the fact that the second line starts
1516 # with a digit-period-space sequence.
1518 # Whereas when we're inside a list (or sub-list), that line will be
1519 # treated as the start of a sub-list. What a kludge, huh? This is
1520 # an aspect of Markdown's syntax that's hard to parse perfectly
1521 # without resorting to mind-reading. Perhaps the solution is to
1522 # change the syntax rules such that sub-lists must start with a
1523 # starting cardinal number; e.g. "1." or "a.".
1525 $g_list_level++;
1526 my $idt = "\027" x $g_list_level;
1527 my $marker_kind = $list_type eq "ul" ? $marker_ul : $marker_ol;
1528 my $first_marker;
1529 my $first_marker_type;
1530 my $first_marker_num;
1531 my $last_marker;
1532 my $fancy;
1533 my $skipped;
1534 my $typechanged;
1535 my $next_num = 1;
1537 # trim trailing blank lines:
1538 $list_str =~ s/\n{2,}\z/\n/;
1540 my $result = "";
1541 my $oldpos = 0;
1542 pos($list_str) = 0;
1543 while ($list_str =~ m{\G # start where we left off
1544 (\n+)? # leading line = $1
1545 (^[ ]*) # leading whitespace = $2
1546 ($marker_any) [ ] ([ ]*) # list marker = $3 leading item space = $4
1547 }cgmx) {
1548 my $leading_line = $1;
1549 my $leading_space = $2;
1550 my $list_marker = $3;
1551 my $list_marker_len = length($list_marker);
1552 my $leading_item_space = $4;
1553 if ($-[0] > $oldpos) {
1554 $result .= substr($list_str, $oldpos, $-[0] - $oldpos); # Sort-of $`
1555 $oldpos = $-[0]; # point at start of this entire match
1557 if (!defined($first_marker)) {
1558 $first_marker = $list_marker;
1559 $first_marker_type = _GetListMarkerType($list_type, $first_marker);
1560 if ($first_marker_type) {
1561 (my $marker_val = $first_marker) =~ s/[.\)]$//;
1562 $first_marker_num = _GetMarkerIntegerNum($first_marker_type, $marker_val);
1563 $next_num = $first_marker_num;
1564 $skipped = 1 if $next_num != 1;
1566 } elsif ($list_marker !~ /$marker_kind/) {
1567 # Wrong marker kind, "fix up" the marker to a correct "lazy" marker
1568 # But keep the old length in $list_marker_len
1569 $list_marker = $last_marker;
1572 # Now grab the rest of this item's data upto but excluding the next
1573 # list marker at the SAME indent level, but sublists must be INCLUDED
1575 my $item = "";
1576 while ($list_str =~ m{\G
1577 ((?:.+?)(?:\n{1,2})) # list item text = $1
1578 (?= \n* (?: \z | # end of string OR
1579 (^[ ]*) # leading whitespace = $2
1580 ($marker_any) # next list marker = $3
1581 ([ ]+) )) # one or more spaces after marker = $4
1582 }cgmxs) {
1584 # If $3 has a left edge that is at the left edge of the previous
1585 # marker OR $3 has a right edge that is at the right edge of the
1586 # previous marker then we stop; otherwise we go on
1588 $item .= substr($list_str, $-[0], $+[0] - $-[0]); # $&
1589 last if !defined($4) || length($2) == length($leading_space) ||
1590 length($2) + length($3) == length($leading_space) + $list_marker_len;
1591 # move along, you're not the marker droid we're looking for...
1592 $item .= substr($list_str, $+[0], $+[4] - $+[0]);
1593 pos($list_str) = $+[4]; # ...move along over the marker droid
1595 # Remember where we parked
1596 $oldpos = pos($list_str);
1598 # Process the $list_marker $item
1600 my $liatt = '';
1601 my $checkbox = '';
1602 my $incr = '';
1604 if ($list_type eq "ul" && !$leading_item_space && $item =~ /^\[([ xX])\] +(.*)$/s) {
1605 my $checkmark = lc $1;
1606 $item = $2;
1607 my ($checkbox_class, $checkbox_val);
1608 if ($checkmark eq "x") {
1609 ($checkbox_class, $checkbox_val) = ("checkbox-on", "x");
1610 } else {
1611 ($checkbox_class, $checkbox_val) = ("checkbox-off", "&#160;");
1613 $liatt = " class=\"$opt{style_prefix}$checkbox_class\"";
1614 $checkbox = "<span><span></span></span><span></span><span>[<tt>$checkbox_val</tt>]&#160;</span>";
1615 } else {
1616 my $list_marker_type;
1617 ($list_marker_type, $liatt) = _GetListItemTypeClass($list_type, $list_marker, $last_marker);
1618 if ($list_type eq "ol" && defined($first_marker)) {
1619 my $styled = $fancy = 1 if $liatt && $list_marker =~ /\)$/;
1620 my ($sfx, $dash) = ("", "");
1621 ($sfx, $dash) = ("li", "-") if $styled;
1622 if ($liatt =~ /lower/) {
1623 $sfx .= "${dash}lc";
1624 } elsif ($liatt =~ /upper/) {
1625 $sfx .= "${dash}uc";
1627 $sfx .= "-greek" if $liatt =~ /greek/;
1628 $liatt = " class=\"$opt{style_prefix}$sfx\"" if $sfx;
1629 $typechanged = 1 if $list_marker_type ne $first_marker_type;
1630 (my $marker_val = $list_marker) =~ s/[.\)]$//;
1631 my $marker_num = _GetMarkerIntegerNum($list_marker_type, $marker_val);
1632 $marker_num = $next_num if $marker_num < $next_num;
1633 $skipped = 1 if $next_num < $marker_num;
1634 $incr = _IncrList($next_num, $marker_num, "incrlevel=$g_list_level");
1635 $liatt = " value=\"$marker_num\"$liatt" if $fancy || $skipped;
1636 $liatt = " type=\"$list_marker_type\"$liatt" if $styled || $typechanged;
1637 $next_num = $marker_num + 1;
1640 $last_marker = $list_marker;
1642 if ($leading_line or ($item =~ m/\n{2,}/)) {
1643 $item = _RunBlockGamut(_Outdent($item));
1645 else {
1646 # Recursion for sub-lists:
1647 $item = _DoLists(_Outdent($item));
1648 chomp $item;
1649 $item = _RunSpanGamut($item);
1652 # Append to $result
1653 $result .= "$incr$idt<li$liatt>" . $checkbox . $item . "$idt</li>\n";
1655 if ($fancy) {
1656 # remove "incrlevel=$g_list_level " parts
1657 $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr((?:-\d{1,2})?)">}
1658 {$idt<span class="$opt{style_prefix}ol-incr$1">}g;
1659 } else {
1660 # remove the $g_list_level incr spans entirely
1661 $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr(?:-\d{1,2})?"></span>\n}{}g;
1662 # remove the class="$opt{style_prefix}lc-greek" if first_marker is greek
1663 $result =~ s{(<li[^>]*?) class="$opt{style_prefix}lc-greek">}{$1>}g
1664 if defined($first_marker_type) && $first_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
1667 # Anything left over (similar to $') goes into result, but this should always be empty
1668 $result .= _RunBlockGamut(substr($list_str, pos($list_str)));
1670 $g_list_level--;
1671 return ($result, $first_marker, $fancy);
1675 sub _DoCodeBlocks {
1677 # Process Markdown `<pre><code>` blocks.
1680 my $text = shift;
1682 $text =~ s{
1683 (?:\n\n|\A\n?)
1684 ( # $1 = the code block -- one or more lines, starting with indent_width spaces
1686 (?:[ ]{$opt{indent_width}}) # Lines must start with indent_width of spaces
1687 .*\n+
1690 ((?=^[ ]{0,$opt{indent_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1692 my $codeblock = $1;
1694 $codeblock =~ s/\n\n\n/\n\n/g; # undo "paragraph for last list item" change
1695 $codeblock = _EncodeCode(_Outdent($codeblock));
1696 $codeblock =~ s/\A\n+//; # trim leading newlines
1697 $codeblock =~ s/\s+\z//; # trim trailing whitespace
1699 my $result = "<div class=\"$opt{style_prefix}code\"><pre style=\"display:none\"></pre><pre><code>"
1700 . $codeblock . "\n</code></pre></div>";
1701 my $key = block_id($result, 2);
1702 $g_code_blocks{$key} = $result;
1703 "\n\n" . $key . "\n\n";
1704 }egmx;
1706 return $text;
1710 sub _DoCodeSpans {
1712 # * Backtick quotes are used for <code></code> spans.
1714 # * You can use multiple backticks as the delimiters if you want to
1715 # include literal backticks in the code span. So, this input:
1717 # Just type ``foo `bar` baz`` at the prompt.
1719 # Will translate to:
1721 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1723 # There's no arbitrary limit to the number of backticks you
1724 # can use as delimters. If you need three consecutive backticks
1725 # in your code, use four for delimiters, etc.
1727 # * You can use spaces to get literal backticks at the edges:
1729 # ... type `` `bar` `` ...
1731 # Turns to:
1733 # ... type <code>`bar`</code> ...
1736 my $text = shift;
1738 $text =~ s@
1739 (`+) # $1 = Opening run of `
1740 (.+?) # $2 = The code block
1741 (?<!`)
1742 \1 # Matching closer
1743 (?!`)
1745 my $c = "$2";
1746 $c =~ s/^[ ]+//g; # leading whitespace
1747 $c =~ s/[ ]+$//g; # trailing whitespace
1748 $c = _EncodeCode($c);
1749 "<code>$c</code>";
1750 @egsx;
1752 return $text;
1756 sub _EncodeCode {
1758 # Encode/escape certain characters inside Markdown code runs.
1759 # The point is that in code, these characters are literals,
1760 # and lose their special Markdown meanings.
1762 local $_ = shift;
1764 # Encode all ampersands; HTML entities are not
1765 # entities within a Markdown code span.
1766 s/&/&amp;/g;
1768 # Encode $'s, but only if we're running under Blosxom.
1769 # (Blosxom interpolates Perl variables in article bodies.)
1770 s/\$/&#036;/g if $_haveBX;
1772 # Do the angle bracket song and dance:
1773 s! < !&lt;!gx;
1774 s! > !&gt;!gx;
1776 # Now, escape characters that are magic in Markdown:
1777 s!([*_~{}\[\]\\])!$g_escape_table{$1}!g;
1779 return $_;
1783 sub _DoItalicsAndBoldAndStrike {
1784 my $text = shift;
1786 # <strong> must go first:
1787 $text =~ s{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }
1788 {<strong>$1</strong>}gsx;
1789 $text =~ s{ (?<!\w) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\w) }
1790 {<strong>$1</strong>}gsx;
1792 $text =~ s{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ }
1793 {<strike>$1</strike>}gsx;
1795 $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
1796 {<em>$1</em>}gsx;
1797 $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
1798 {<em>$1</em>}gsx;
1800 return $text;
1804 sub _DoBlockQuotes {
1805 my $text = shift;
1807 $text =~ s{
1808 ( # Wrap whole match in $1
1810 ^[ ]*>[ ]? # '>' at the start of a line
1811 .*\n # rest of the first line
1812 (.+\n)* # subsequent consecutive lines
1813 \n* # blanks
1817 my $bq = $1;
1818 $bq =~ s/^[ ]*>[ ]?//gm; # trim one level of quoting
1819 $bq =~ s/^[ ]+$//mg; # trim whitespace-only lines
1820 $bq = _RunBlockGamut($bq); # recurse
1822 $bq =~ s/^/ /mg;
1823 "<blockquote>\n$bq\n</blockquote>\n\n";
1824 }egmx;
1827 return $text;
1831 my ($LEAD, $TRAIL, $LEADBAR, $LEADSP, $COLPL, $SEP);
1832 BEGIN {
1833 $LEAD = qr/(?>[ ]*(?:\|[ ]*)?)/o;
1834 $TRAIL = qr/\|[ ]*/o;
1835 $LEADBAR = qr/(?>[ ]*\|[ ]*)/o;
1836 $LEADSP = qr/(?>[ ]*)/o;
1837 $COLPL = qr/(?:[^\n|\\]|\\[^\n])+/o;
1838 $SEP = qr/[ ]*:?-+:?[ ]*/o;
1841 sub _DoTables {
1842 my $text = shift;
1844 $text =~ s{
1845 ( # Wrap whole thing to avoid $&
1846 (?: (?<=\n\n) | \A\n? ) # Preceded by blank line or beginning of string
1847 ^( # Header line
1848 $LEADBAR \| [^\n]* |
1849 $LEADBAR $COLPL [^\n]* |
1850 $LEADSP $COLPL \| [^\n]*
1852 ( # Separator line
1853 $LEADBAR $SEP (?: \| $SEP )* (?: \| [ ]*)? |
1854 $SEP (?: \| $SEP )+ (?: \| [ ]*)? |
1855 $SEP \| [ ]*
1857 ((?: # Rows (0+)
1858 $LEADBAR \| [^\n]* \n |
1859 $LEADBAR $COLPL [^\n]* \n |
1860 $LEADSP $COLPL \| [^\n]* \n
1864 my ($w, $h, $s, $rows) = ($1, $2, $3, $4);
1865 my @heads = _SplitTableRow($h);
1866 my @seps = _SplitTableRow($s);
1867 if (@heads == @seps) {
1868 my @align = map {
1869 if (/^:-+:$/) {" align=\"center\""}
1870 elsif (/^:/) {" align=\"left\""}
1871 elsif (/:$/) {" align=\"right\""}
1872 else {""}
1873 } @seps;
1874 my $headers = _MakeTableRow("th", \@align, @heads);
1875 my $tab ="\n<table border=\"1\" cellspacing=\"0\" cellpadding=\"2\" class=\"$opt{style_prefix}table\">\n" .
1876 " <tr class=\"$opt{style_prefix}row-hdr\">" . _MakeTableRow("th", \@align, @heads) . "</tr>\n";
1877 my $cnt = 0;
1878 my @classes = ("class=\"$opt{style_prefix}row-even\"", "class=\"$opt{style_prefix}row-odd\"");
1879 $tab .= " <tr " . $classes[++$cnt % 2] . ">" . _MakeTableRow("td", \@align, _SplitTableRow($_)) . "</tr>\n"
1880 foreach split(/\n/, $rows);
1881 $tab .= "</table>\n\n";
1882 } else {
1885 }egmx;
1887 return $text;
1891 sub _SplitTableRow {
1892 my $row = shift;
1893 $row =~ s/^$LEAD//;
1894 $row =~ s/$TRAIL$//;
1895 $row =~ s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
1896 $row =~ s!\\\|!$g_escape_table{'|'}!go; # Then do \|
1897 my @elems = map {
1898 s!$g_escape_table{'|'}!|!go;
1899 s!$g_escape_table{'\\'}!\\!go;
1900 s/^[ ]+//;
1901 s/[ ]+$//;
1903 } split(/[ ]*\|[ ]*/, $row, -1);
1904 @elems or push(@elems, "");
1905 return @elems;
1909 sub _MakeTableRow {
1910 my $etype = shift;
1911 my $align = shift;
1912 my $row = "";
1913 for (my $i = 0; $i < @$align; ++$i) {
1914 my $data = $_[$i];
1915 defined($data) or $data = "";
1916 $row .= "<" . $etype . $$align[$i] . ">" .
1917 _RunSpanGamut($data) . "</" . $etype . ">";
1919 return $row;
1923 sub _FormParagraphs {
1925 # Params:
1926 # $text - string to process with html <p> tags
1928 my $text = shift;
1930 # Strip leading and trailing lines:
1931 $text =~ s/\A\n+//;
1932 $text =~ s/\n+\z//;
1934 my @grafs = split(/\n{2,}/, $text);
1937 # Wrap <p> tags.
1939 foreach (@grafs) {
1940 unless (defined($g_html_blocks{$_}) || defined($g_code_blocks{$_})) {
1941 $_ = _RunSpanGamut($_);
1942 s/^([ ]*)/<p>/;
1943 $_ .= "</p>";
1948 # Unhashify HTML blocks
1950 foreach (@grafs) {
1951 if (defined( $g_html_blocks{$_} )) {
1952 $_ = $g_html_blocks{$_};
1956 return join "\n\n", @grafs;
1960 my $g_possible_tag_name;
1961 my %ok_tag_name;
1962 BEGIN {
1963 # note: length("blockquote") == 10
1964 $g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6])/o;
1965 %ok_tag_name = map({$_ => 1} qw(
1966 a abbr acronym address
1967 b basefont bdo big blockquote br
1968 caption center cite code col colgroup
1969 dd del dfn div dl dt
1971 font
1972 h1 h2 h3 h4 h5 h6 hr
1973 i img ins
1977 p pre
1979 s samp small span strike strong sub sup
1980 table tbody td tfoot th thead tr tt
1981 u ul
1984 $ok_tag_name{$_} = 0 foreach (qw(
1985 dir menu
1990 sub _SetAllowedTag {
1991 my ($tag, $forbid) = @_;
1992 $ok_tag_name{$tag} = $forbid ? 0 : 1
1993 if defined($tag) && exists($ok_tag_name{$tag});
1997 # Encode leading '<' of any non-tags
1998 # However, "<?", "<!" and "<$" are passed through (legacy on that "<$" thing)
1999 sub _DoTag {
2000 my $tag = shift;
2001 return $tag if $tag =~ /^<[?\$!]/;
2002 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
2003 $ok_tag_name{lc($1)}) {
2005 return _ProcessURLTag("href", $tag) if $tag =~ /^<a\s/i;
2006 return _ProcessURLTag("src", $tag) if $tag =~ /^<img\s/i;
2007 return $tag;
2009 $tag =~ s/</&lt;/g;
2010 return $tag;
2014 sub _ProcessURLTag {
2015 my $att = shift;
2016 my $tag = shift;
2018 $att = lc($att) . "=";
2019 if ($tag =~ /^(<[^\s>]+\s+)/g) {
2020 my $out = $1;
2021 while ($tag =~ /\G([^\s\042\047>]+=)([\042\047])((?:(?!\2)(?!>).)*)(\2\s*)/gc) {
2022 my ($p, $q, $v, $s) = ($1, $2, $3, $4);
2023 if (lc($p) eq $att && $v ne "") {
2024 $v = _HTMLEncode(_PrefixURL($v));
2026 $out .= $p . $q . $v . $s;
2028 $out .= substr($tag, pos($tag));
2029 return $out;
2032 return $tag;
2036 sub _HTMLEncode {
2037 my $text = shift;
2039 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
2040 # http://bumppo.net/projects/amputator/
2041 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
2043 # Remaining entities now
2044 $text =~ s/\042/&quot;/g;
2045 $text =~ s/\047/&apos;/g;
2046 $text =~ s/</&lt;/g;
2047 $text =~ s/>/&gt;/g;
2049 return $text;
2053 sub _EncodeAmps {
2054 my $text = shift;
2056 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
2057 # http://bumppo.net/projects/amputator/
2058 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
2060 return $text;
2064 sub _EncodeAmpsAndAngles {
2065 # Smart processing for ampersands and angle brackets that need to be encoded.
2067 my $text = shift;
2069 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
2070 # http://bumppo.net/projects/amputator/
2071 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
2073 # Encode naked <'s
2074 $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
2075 $text =~ s{<(?=[^>]*$)}{&lt;}g;
2077 # Encode <'s that cannot possibly be a start or end tag
2078 $text =~ s{(<[^>]*>)}{_DoTag($1)}ige;
2080 return $text;
2084 sub _EncodeBackslashEscapes {
2086 # Parameter: String.
2087 # Returns: String after processing the following backslash escape sequences.
2089 local $_ = shift;
2091 s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
2092 s{\\([`*_~{}\[\]()>#+\-.!`])}{$g_escape_table{$1}}g;
2094 return $_;
2098 sub _DoAutoLinks {
2099 local $_ = shift;
2101 s{<((https?|ftps?):[^'\042>\s]+)>}{<a href="$1">&lt;$1&gt;</a>}gi;
2103 # Email addresses: <address@domain.foo>
2106 (?:mailto:)?
2108 [-.\w]+
2110 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
2114 _EncodeEmailAddress(_UnescapeSpecialChars($1), "&#x3c;", "&#62;");
2115 }egix;
2117 # (kjm) I don't do "x" patterns
2118 s{(?<![\042'<>])(?<!&[Ll][Tt];)(?<!&#60;)(?<!&#x3[Cc];)\b((?:https?|ftps?)://(?:[-a-zA-Z0-9./?\&\%=_~!*;:\@+\$,\x23](?:(?<![.,:;])|(?=[^\s])))+)}
2119 {<a href="$1">$1</a>}sog;
2120 s{(?<![][])(?<!\] )\[RFC( ?)([0-9]{1,5})\](?![][])(?! \[)}
2121 {[<a href="http://tools.ietf.org/html/rfc$2">RFC$1$2</a>]}sog;
2123 return $_;
2127 sub _EncodeEmailAddress {
2129 # Input: an email address, e.g. "foo@example.com"
2131 # Output: the email address as a mailto link, with each character
2132 # of the address encoded as either a decimal or hex entity, in
2133 # the hopes of foiling most address harvesting spam bots. E.g.:
2135 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
2136 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
2137 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
2139 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
2140 # mailing list: <http://tinyurl.com/yu7ue>
2143 my ($addr, $prefix, $suffix) = @_;
2144 $prefix = "" unless defined($prefix);
2145 $suffix = "" unless defined($suffix);
2147 srand(unpack('N',md5($addr)));
2148 my @encode = (
2149 sub { '&#' . ord(shift) . ';' },
2150 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
2151 sub { shift },
2154 $addr = "mailto:" . $addr;
2156 $addr =~ s{(.)}{
2157 my $char = $1;
2158 if ( $char eq '@' ) {
2159 # this *must* be encoded. I insist.
2160 $char = $encode[int rand 1]->($char);
2161 } elsif ( $char ne ':' ) {
2162 # leave ':' alone (to spot mailto: later)
2163 my $r = rand;
2164 # roughly 10% raw, 45% hex, 45% dec
2165 $char = (
2166 $r > .9 ? $encode[2]->($char) :
2167 $r < .45 ? $encode[1]->($char) :
2168 $encode[0]->($char)
2171 $char;
2172 }gex;
2174 # strip the mailto: from the visible part
2175 (my $bareaddr = $addr) =~ s/^.+?://;
2176 $addr = qq{<a href="$addr">$prefix$bareaddr$suffix</a>};
2178 return $addr;
2182 sub _UnescapeSpecialChars {
2184 # Swap back in all the special characters we've hidden.
2186 my $text = shift;
2188 while( my($char, $hash) = each(%g_escape_table) ) {
2189 $text =~ s/$hash/$char/g;
2191 return $text;
2195 sub _TokenizeHTML {
2197 # Parameter: String containing HTML markup.
2198 # Returns: Reference to an array of the tokens comprising the input
2199 # string. Each token is either a tag (possibly with nested,
2200 # tags contained therein, such as <a href="<MTFoo>">, or a
2201 # run of text between tags. Each element of the array is a
2202 # two-element array; the first is either 'tag' or 'text';
2203 # the second is the actual value.
2206 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
2207 # <http://www.bradchoate.com/past/mtregex.php>
2210 my $str = shift;
2211 my $pos = 0;
2212 my $len = length $str;
2213 my @tokens;
2215 my $depth = 6;
2216 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
2217 my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
2218 (?s: <\? .*? \?> ) | # processing instruction
2219 $nested_tags/iox; # nested tags
2221 while ($str =~ m/($match)/g) {
2222 my $whole_tag = $1;
2223 my $sec_start = pos $str;
2224 my $tag_start = $sec_start - length $whole_tag;
2225 if ($pos < $tag_start) {
2226 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
2228 push @tokens, ['tag', $whole_tag];
2229 $pos = pos $str;
2231 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
2232 \@tokens;
2236 sub _Outdent {
2238 # Remove one level of line-leading indent_width of spaces
2240 my $text = shift;
2242 $text =~ s/^ {1,$opt{indent_width}}//gm;
2243 return $text;
2247 sub _Detab {
2249 # Expand tabs to spaces using $opt{tab_width} if no second argument
2251 my $text = shift;
2252 my $ts = shift || $opt{tab_width};
2253 # From the Perl camel book "Fluent Perl" section (slightly modified)
2254 $text =~ s/(.*?)(\t+)/$1 . ' ' x (length($2) * $ts - length($1) % $ts)/ge;
2255 return $text;
2259 sub _PrefixURL {
2261 # Add URL prefix if needed
2263 my $url = shift;
2264 $url =~ s/^\s+//;
2265 $url =~ s/\s+$//;
2266 $url = "#" unless $url ne "";
2268 return $url unless $opt{url_prefix} ne '' || $opt{img_prefix} ne '';
2269 return $url if $url =~ m"^\002\003" || $url =~ m"^#" ||
2270 $url =~ m,^//, || $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/;
2271 my $ans = $opt{url_prefix};
2272 $ans = $opt{img_prefix}
2273 if $opt{img_prefix} ne '' && $url =~ m"^[^#?]*\.(?:png|gif|jpe?g|svgz?)(?:[#?]|$)"i;
2274 return $url unless $ans ne '';
2275 $ans .= '/' if substr($ans, -1, 1) ne '/';
2276 $ans .= substr($url, 0, 1) eq '/' ? substr($url, 1) : $url;
2277 return "\2\3".$ans;
2281 BEGIN {
2282 $g_style_sheet = <<'STYLESHEET';
2284 <style type="text/css">
2285 /* <![CDATA[ */
2287 /* Markdown.pl fancy style sheet
2288 ** Copyright (C) 2017,2018,2019 Kyle J. McKay.
2289 ** All rights reserved.
2291 ** Redistribution and use in source and binary forms, with or without
2292 ** modification, are permitted provided that the following conditions are met:
2294 ** 1. Redistributions of source code must retain the above copyright notice,
2295 ** this list of conditions and the following disclaimer.
2297 ** 2. Redistributions in binary form must reproduce the above copyright
2298 ** notice, this list of conditions and the following disclaimer in the
2299 ** documentation and/or other materials provided with the distribution.
2301 ** 3. Neither the name of the copyright holder nor the names of its
2302 ** contributors may be used to endorse or promote products derived from
2303 ** this software without specific prior written permission.
2305 ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
2306 ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2307 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2308 ** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
2309 ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
2310 ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
2311 ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
2312 ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
2313 ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
2314 ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
2315 ** POSSIBILITY OF SUCH DAMAGE.
2318 div.%(base)code-bt > pre, div.%(base)code > pre {
2319 margin: 0;
2320 padding: 0;
2321 overflow: auto;
2324 div.%(base)code-bt > pre > code, div.%(base)code > pre > code {
2325 display: inline-block;
2326 margin: 0;
2327 padding: 0.5em 0;
2328 border-top: thin dotted;
2329 border-bottom: thin dotted;
2332 table.%(base)table {
2333 margin-bottom: 0.5em;
2335 table.%(base)table, table.%(base)table th, table.%(base)table td {
2336 border-collapse: collapse;
2337 border-spacing: 0;
2338 border: thin solid;
2341 ol.%(base)ol {
2342 counter-reset: %(base)item;
2344 ol.%(base)ol[start="0"] {
2345 counter-reset: %(base)item -1;
2347 ol.%(base)ol[start="5"] {
2348 counter-reset: %(base)item 4;
2350 ol.%(base)ol[start="10"] {
2351 counter-reset: %(base)item 9;
2353 ol.%(base)ol > span.%(base)ol-incr {
2354 counter-increment: %(base)item;
2356 ol.%(base)ol > span.%(base)ol-incr-2 {
2357 counter-increment: %(base)item 2;
2359 ol.%(base)ol > span.%(base)ol-incr-5 {
2360 counter-increment: %(base)item 5;
2362 ol.%(base)ol > span.%(base)ol-incr-10 {
2363 counter-increment: %(base)item 10;
2365 ol.%(base)lc-greek, li.%(base)lc-greek {
2366 list-style-type: lower-greek;
2368 ol.%(base)ol > li {
2369 counter-increment: %(base)item;
2371 ol.%(base)ol > li.%(base)li,
2372 ol.%(base)ol > li.%(base)li-lc,
2373 ol.%(base)ol > li.%(base)li-lc-greek,
2374 ol.%(base)ol > li.%(base)li-uc {
2375 list-style-type: none;
2376 display: block;
2378 ol.%(base)ol > li.%(base)li:before,
2379 ol.%(base)ol > li.%(base)li-lc:before,
2380 ol.%(base)ol > li.%(base)li-lc-greek:before,
2381 ol.%(base)ol > li.%(base)li-uc:before {
2382 position: absolute;
2383 text-align: right;
2384 white-space: nowrap;
2385 margin-left: -9ex;
2386 width: 9ex;
2388 ol.%(base)ol > li.%(base)li[type="1"]:before {
2389 content: counter(%(base)item, decimal) ")\A0 \A0 ";
2391 ol.%(base)ol > li.%(base)li-lc[type="i"]:before,
2392 ol.%(base)ol > li.%(base)li-lc[type="I"]:before {
2393 content: counter(%(base)item, lower-roman) ")\A0 \A0 ";
2395 ol.%(base)ol > li.%(base)li-uc[type="I"]:before,
2396 ol.%(base)ol > li.%(base)li-uc[type="i"]:before {
2397 content: counter(%(base)item, upper-roman) ")\A0 \A0 ";
2399 ol.%(base)ol > li.%(base)li-lc[type="a"]:before,
2400 ol.%(base)ol > li.%(base)li-lc[type="A"]:before {
2401 content: counter(%(base)item, lower-alpha) ")\A0 \A0 ";
2403 ol.%(base)ol > li.%(base)li-lc-greek[type="a"]:before,
2404 ol.%(base)ol > li.%(base)li-lc-greek[type="A"]:before {
2405 content: counter(%(base)item, lower-greek) ")\A0 \A0 ";
2407 ol.%(base)ol > li.%(base)li-uc[type="A"]:before,
2408 ol.%(base)ol > li.%(base)li-uc[type="a"]:before {
2409 content: counter(%(base)item, upper-alpha) ")\A0 \A0 ";
2412 li.%(base)checkbox-on,
2413 li.%(base)checkbox-off {
2414 list-style-type: none;
2415 display: block;
2417 li.%(base)checkbox-on > span:first-child + span + span,
2418 li.%(base)checkbox-off > span:first-child + span + span {
2419 position: absolute;
2420 clip: rect(0,0,0,0);
2422 li.%(base)checkbox-on > span:first-child,
2423 li.%(base)checkbox-off > span:first-child,
2424 li.%(base)checkbox-on > span:first-child + span,
2425 li.%(base)checkbox-off > span:first-child + span {
2426 display: block;
2427 position: absolute;
2428 margin-left: -3ex;
2429 width: 1em;
2430 height: 1em;
2432 li.%(base)checkbox-on > span:first-child > span:first-child,
2433 li.%(base)checkbox-off > span:first-child > span:first-child {
2434 display: block;
2435 position: absolute;
2436 left: 0.75pt; top: 0.75pt; right: 0.75pt; bottom: 0.75pt;
2438 li.%(base)checkbox-on > span:first-child > span:first-child:before,
2439 li.%(base)checkbox-off > span:first-child > span:first-child:before {
2440 display: inline-block;
2441 position: relative;
2442 right: 1pt;
2443 width: 100%;
2444 height: 100%;
2445 border: 1pt solid;
2446 content: "";
2448 li.%(base)checkbox-on > span:first-child + span:before {
2449 position: relative;
2450 left: 2pt;
2451 bottom: 1pt;
2452 font-size: 125%;
2453 line-height: 80%;
2454 content: "\2713";
2457 /* ]]> */
2458 </style>
2460 STYLESHEET
2461 $g_style_sheet =~ s/^\s+//g;
2462 $g_style_sheet =~ s/\s+$//g;
2463 $g_style_sheet .= "\n";
2468 __DATA__
2470 =head1 NAME
2472 Markdown.pl - convert Markdown format text files to HTML
2474 =head1 SYNOPSIS
2476 B<Markdown.pl> [B<--help>] [B<--html4tags>] [B<--htmlroot>=I<prefix>]
2477 [B<--imageroot>=I<prefix>] [B<--version>] [B<--shortversion>]
2478 [B<--tabwidth>=I<num>] [B<--stylesheet>] [B<--stub>] [--]
2479 [I<file>...]
2481 Options:
2482 -h show short usage help
2483 --help show long detailed help
2484 --html4tags use <br> instead of <br />
2485 --deprecated allow <dir> and <menu> tags
2486 --tabwidth=num expand tabs to num instead of 8
2487 -r prefix | --htmlroot=prefix append relative non-img URLs
2488 to prefix
2489 -i prefix | --imageroot=prefix append relative img URLs to
2490 prefix
2491 -V | --version show version, authors, license
2492 and copyright
2493 -s | --shortversion show just the version number
2494 --stylesheet output the fancy style sheet
2495 --no-stylesheet do not output fancy style sheet
2496 --stub wrap output in stub document
2497 implies --stylesheet
2498 -- end options and treat next
2499 argument as file
2501 =head1 DESCRIPTION
2503 Markdown is a text-to-HTML filter; it translates an easy-to-read /
2504 easy-to-write structured text format into HTML. Markdown's text format
2505 is most similar to that of plain text email, and supports features such
2506 as headers, *emphasis*, code blocks, blockquotes, and links.
2508 Markdown's syntax is designed not as a generic markup language, but
2509 specifically to serve as a front-end to (X)HTML. You can use span-level
2510 HTML tags anywhere in a Markdown document, and you can use block level
2511 HTML tags (like <div> and <table> as well).
2513 For more information about Markdown's syntax, see the F<basics.md>
2514 and F<syntax.md> files included with F<Markdown.pl>.
2516 Input (auto-detected) may be either ISO-8859-1 or UTF-8. Output is always
2517 converted to the UTF-8 character set.
2520 =head1 OPTIONS
2522 Use "--" to end switch parsing. For example, to open a file named "-z", use:
2524 Markdown.pl -- -z
2526 =over
2529 =item B<--html4tags>
2531 Use HTML 4 style for empty element tags, e.g.:
2533 <br>
2535 instead of Markdown's default XHTML style tags, e.g.:
2537 <br />
2540 =item B<--deprecated>
2542 Both "<dir>" and "<menu>" are normally taken as literal text and the leading
2543 "<" will be automatically escaped.
2545 If this option is used, they are recognized as valid tags and passed through
2546 without being escaped.
2548 When dealing with program argument descriptions "<dir>" can be particularly
2549 problematic therefore use of this option is not recommended.
2551 Other deprecated tags (such as "<font>" and "<center>" for example) continue
2552 to be recognized and passed through even without using this option.
2555 =item B<--tabwidth>=I<num>
2557 Expand tabs to I<num> character wide tab stop positions instead of the default
2558 8. Don't use this; physical tabs should always be expanded to 8-character
2559 positions. This option does I<not> affect the number of spaces needed to
2560 start a new "indent level". That will always be 4 no matter what value is
2561 used (or implied by default) with this option. Also note that tabs inside
2562 backticks-delimited code blocks will always be expanded to 8-character tab
2563 stop positions no matter what value is used for this option.
2565 The value must be S<2 <= I<num> <= 32>.
2568 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
2570 Any non-absolute URLs have I<prefix> prepended.
2573 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
2575 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
2576 if any) but only if they end in an image suffix.
2579 =item B<-V>, B<--version>
2581 Display Markdown's version number and copyright information.
2584 =item B<-s>, B<--shortversion>
2586 Display the short-form version number.
2589 =item B<--stylesheet>
2591 Include the fancy style sheet at the beginning of the output (or in the
2592 C<head> section with B<--stub>). This style sheet makes fancy checkboxes
2593 and makes a right parenthesis C<)> show instead of a C<.> for ordered lists
2594 that use them. Without it things will still look fine except that the
2595 fancy stuff won't be there.
2597 Use this option with no other arguments and redirect standard input to
2598 /dev/null to get just the style sheet and nothing else.
2601 =item B<--no-stylesheet>
2603 Overrides a previous B<--stylesheet> and disables implicit inclusion
2604 of the style sheet by the B<--stub> option.
2607 =item B<--stub>
2609 Wrap the output in a full document stub (i.e. has C<html>, C<head> and C<body>
2610 tags). The style sheet I<will> be included in the C<head> section unless the
2611 B<--no-stylesheet> option is also used.
2614 =item B<-h>, B<--help>
2616 Display Markdown's help. With B<--help> full help is shown, with B<-h> only
2617 the usage and options are shown.
2620 =back
2623 =head1 VERSION HISTORY
2625 Z<> See the F<README> file for detailed release notes for this version.
2627 =over
2629 =item Z<> 1.1.7 - 14 Feb 2018
2631 =item Z<> 1.1.6 - 03 Jan 2018
2633 =item Z<> 1.1.5 - 07 Dec 2017
2635 =item Z<> 1.1.4 - 24 Jun 2017
2637 =item Z<> 1.1.3 - 13 Feb 2017
2639 =item Z<> 1.1.2 - 19 Jan 2017
2641 =item Z<> 1.1.1 - 12 Jan 2017
2643 =item Z<> 1.1.0 - 11 Jan 2017
2645 =item Z<> 1.0.4 - 05 Jun 2016
2647 =item Z<> 1.0.3 - 06 Sep 2015
2649 =item Z<> 1.0.2 - 03 Sep 2015
2651 =item Z<> 1.0.1 - 14 Dec 2004
2653 =item Z<> 1.0.0 - 28 Aug 2004
2655 =back
2657 =head1 AUTHORS
2659 =over
2661 =item John Gruber
2663 =item L<http://daringfireball.net>
2665 =item L<http://daringfireball.net/projects/markdown/>
2667 =item E<160>
2669 =back
2671 =over
2673 =item PHP port and other contributions by Michel Fortin
2675 =item L<http://michelf.com>
2677 =item E<160>
2679 =back
2681 =over
2683 =item Additional enhancements and tweaks by Kyle J. McKay
2685 =item mackyle<at>gmail.com
2687 =back
2689 =head1 COPYRIGHT AND LICENSE
2691 =over
2693 =item Copyright (C) 2003-2004 John Gruber
2695 =item Copyright (C) 2015-2019 Kyle J. McKay
2697 =item All rights reserved.
2699 =back
2701 Redistribution and use in source and binary forms, with or without
2702 modification, are permitted provided that the following conditions are
2703 met:
2705 =over
2707 =item *
2709 Redistributions of source code must retain the above copyright
2710 notice, this list of conditions and the following disclaimer.
2712 =item *
2714 Redistributions in binary form must reproduce the above copyright
2715 notice, this list of conditions and the following disclaimer in the
2716 documentation and/or other materials provided with the distribution.
2718 =item *
2720 Neither the name "Markdown" nor the names of its contributors may
2721 be used to endorse or promote products derived from this software
2722 without specific prior written permission.
2724 =back
2726 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
2727 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
2728 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
2729 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
2730 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
2731 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
2732 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
2733 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
2734 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
2735 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
2736 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2738 =cut