various: update links to https where possible
[markdown.git] / Markdown.pl
blob73e796a41287371951f898b4a0e78862d34cec16
1 #!/usr/bin/env perl
4 # Markdown -- A text-to-HTML conversion tool for web writers
6 # Copyright (C) 2004 John Gruber
7 # Copyright (C) 2015,2016,2017,2018,2019 Kyle J. McKay
8 # All rights reserved.
9 # License is Modified BSD (aka 3-clause BSD) License\n";
10 # See LICENSE file (or <https://opensource.org/licenses/BSD-3-Clause>)
13 package Markdown;
15 require 5.008;
16 use strict;
17 use warnings;
19 use Encode;
21 use vars qw($COPYRIGHT $VERSION @ISA @EXPORT_OK);
23 BEGIN {*COPYRIGHT =
24 \"Copyright (C) 2004 John Gruber
25 Copyright (C) 2015,2016,2017,2018,2019 Kyle J. McKay
26 All rights reserved.
28 *VERSION = \"1.1.10-PRE"
31 require Exporter;
32 use Digest::MD5 qw(md5 md5_hex);
33 use File::Basename qw(basename);
34 use Scalar::Util qw(refaddr looks_like_number);
35 my ($hasxml, $hasxml_err); BEGIN { ($hasxml, $hasxml_err) = (0, "") }
36 my ($hasxmlp, $hasxmlp_err); BEGIN { ($hasxmlp, $hasxmlp_err) = (0, "") }
37 @ISA = qw(Exporter);
38 @EXPORT_OK = qw(Markdown);
39 $INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'};
41 close(DATA) if fileno(DATA);
42 exit(&_main(@ARGV)||0) unless caller;
44 sub fauxdie($) {
45 my $msg = join(" ", @_);
46 $msg =~ s/\s+$//os;
47 printf STDERR "%s: fatal: %s\n", basename($0), $msg;
48 exit 1;
51 my $encoder;
52 BEGIN {
53 $encoder = Encode::find_encoding('Windows-1252') ||
54 Encode::find_encoding('ISO-8859-1') or
55 die "failed to load ISO-8859-1 encoder\n";
59 # Global default settings:
61 my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width, $g_tab_width);
62 BEGIN {
63 $g_style_prefix = "_markdown-"; # Prefix for markdown css class styles
64 $g_empty_element_suffix = " />"; # Change to ">" for HTML output
65 $g_indent_width = 4; # Number of spaces considered new level
66 $g_tab_width = 4; # Legacy even though it's wrong
71 # Globals:
74 # Style sheet template
75 my $g_style_sheet;
77 # Permanent block id table
78 my %g_perm_block_ids;
80 # Global hashes, used by various utility routines
81 my %g_urls;
82 my %g_titles;
83 my %g_anchors;
84 my %g_anchors_id;
85 my %g_block_ids;
86 my %g_code_block_ids;
87 my %g_html_blocks;
88 my %g_code_blocks;
89 my %opt;
91 # Return a "block id" to use to identify the block that does not contain
92 # any characters that could be misinterpreted by the rest of the code
93 # Originally this used md5_hex but that's unnecessarily slow
94 # Instead just use the refaddr of the scalar ref of the entry for that
95 # key in either the global or, if the optional second argument is true,
96 # permanent table. To avoid the result being confused with anything
97 # else, it's prefixed with a control character and suffixed with another
98 # both of which are not allowed by the XML standard or Unicode.
99 sub block_id {
100 $_[1] or return "\5".refaddr(\$g_block_ids{$_[0]})."\6";
101 $_[1] == 1 and return "\2".refaddr(\$g_perm_block_ids{$_[0]})."\3";
102 $_[1] == 2 and return "\25".refaddr(\$g_code_block_ids{$_[0]})."\26";
103 die "programmer error: bad block_id type $_[1]";
106 # Regex to match balanced [brackets]. See Friedl's
107 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
108 my $g_nested_brackets;
109 BEGIN {
110 $g_nested_brackets = qr{
111 (?> # Atomic matching
112 [^\[\]]+ # Anything other than brackets
115 (??{ $g_nested_brackets }) # Recursive set of nested brackets
121 # Regex to match balanced (parentheses)
122 my $g_nested_parens;
123 BEGIN {
124 $g_nested_parens = qr{
125 (?> # Atomic matching
126 [^\(\)]+ # Anything other than parentheses
129 (??{ $g_nested_parens }) # Recursive set of nested parentheses
135 # Table of hash values for escaped characters:
136 my %g_escape_table;
137 BEGIN {
138 $g_escape_table{""} = "\2\3";
139 foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:<") {
140 $g_escape_table{$char} = block_id($char,1);
144 # Used to track when we're inside an ordered or unordered list
145 # (see _ProcessListItems() for details):
146 my $g_list_level;
147 BEGIN {
148 $g_list_level = 0;
152 #### Blosxom plug-in interface ##########################################
153 my $_haveBX;
154 BEGIN {
155 no warnings 'once';
156 $_haveBX = defined($blosxom::version);
159 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
160 # which posts Markdown should process, using a "meta-markup: markdown"
161 # header. If it's set to 0 (the default), Markdown will process all
162 # entries.
163 my $g_blosxom_use_meta;
164 BEGIN {
165 $g_blosxom_use_meta = 0;
168 sub start { 1; }
169 sub story {
170 my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
172 if ((! $g_blosxom_use_meta) or
173 (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
175 $$body_ref = Markdown($$body_ref);
181 #### Movable Type plug-in interface #####################################
182 my $_haveMT = eval {require MT; 1;}; # Test to see if we're running in MT
183 my $_haveMT3 = $_haveMT && eval {require MT::Plugin; 1;}; # and MT >= MT 3.0.
185 if ($_haveMT) {
186 require MT;
187 import MT;
188 require MT::Template::Context;
189 import MT::Template::Context;
191 if ($_haveMT3) {
192 require MT::Plugin;
193 import MT::Plugin;
194 my $plugin = new MT::Plugin({
195 name => "Markdown",
196 description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
197 doc_link => 'https://daringfireball.net/projects/markdown/'
199 MT->add_plugin( $plugin );
202 MT::Template::Context->add_container_tag(MarkdownOptions => sub {
203 my $ctx = shift;
204 my $args = shift;
205 my $builder = $ctx->stash('builder');
206 my $tokens = $ctx->stash('tokens');
208 if (defined ($args->{'output'}) ) {
209 $ctx->stash('markdown_output', lc $args->{'output'});
212 defined (my $str = $builder->build($ctx, $tokens) )
213 or return $ctx->error($builder->errstr);
214 $str; # return value
217 MT->add_text_filter('markdown' => {
218 label => 'Markdown',
219 docs => 'https://daringfireball.net/projects/markdown/',
220 on_format => sub {
221 my $text = shift;
222 my $ctx = shift;
223 my $raw = 0;
224 if (defined $ctx) {
225 my $output = $ctx->stash('markdown_output');
226 if (defined $output && $output =~ m/^html/i) {
227 $g_empty_element_suffix = ">";
228 $ctx->stash('markdown_output', '');
230 elsif (defined $output && $output eq 'raw') {
231 $raw = 1;
232 $ctx->stash('markdown_output', '');
234 else {
235 $raw = 0;
236 $g_empty_element_suffix = " />";
239 $text = $raw ? $text : Markdown($text);
240 $text;
244 # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
245 my $smartypants;
248 no warnings "once";
249 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
252 if ($smartypants) {
253 MT->add_text_filter('markdown_with_smartypants' => {
254 label => 'Markdown With SmartyPants',
255 docs => 'https://daringfireball.net/projects/markdown/',
256 on_format => sub {
257 my $text = shift;
258 my $ctx = shift;
259 if (defined $ctx) {
260 my $output = $ctx->stash('markdown_output');
261 if (defined $output && $output eq 'html') {
262 $g_empty_element_suffix = ">";
264 else {
265 $g_empty_element_suffix = " />";
268 $text = Markdown($text);
269 $text = $smartypants->($text, '1');
275 sub _strip {
276 my $str = shift;
277 defined($str) or return undef;
278 $str =~ s/^\s+//;
279 $str =~ s/\s+$//;
280 $str =~ s/\s+/ /g;
281 $str;
284 #### BBEdit/command-line text filter interface ##########################
285 sub _main {
286 local *ARGV = \@_;
289 #### Check for command-line switches: #################
290 my %options = ();
291 my %cli_opts;
292 my $raw = 0;
293 use Getopt::Long;
294 Getopt::Long::Configure(qw(bundling require_order pass_through));
295 GetOptions(\%cli_opts,
296 'help','h',
297 'version|V',
298 'shortversion|short-version|s',
299 'html4tags',
300 'deprecated',
301 'sanitize',
302 'no-sanitize',
303 'validate-xml',
304 'validate-xml-internal',
305 'no-validate-xml',
306 'absroot|a=s',
307 'base|b=s',
308 'htmlroot|r=s',
309 'imageroot|i=s',
310 'wiki|w:s',
311 'tabwidth|tab-width=s',
312 'raw',
313 'stylesheet|style-sheet',
314 'no-stylesheet|no-style-sheet',
315 'stub',
317 if ($cli_opts{'help'}) {
318 require Pod::Usage;
319 Pod::Usage::pod2usage(-verbose => 2, -exitval => 0);
321 if ($cli_opts{'h'}) {
322 require Pod::Usage;
323 Pod::Usage::pod2usage(-verbose => 0, -exitval => 0);
325 if ($cli_opts{'version'}) { # Version info
326 print "\nThis is Markdown, version $VERSION.\n", $COPYRIGHT;
327 print "License is Modified BSD (aka 3-clause BSD) License\n";
328 print "<https://opensource.org/licenses/BSD-3-Clause>\n";
329 exit 0;
331 if ($cli_opts{'shortversion'}) { # Just the version number string.
332 print $VERSION;
333 exit 0;
335 my $stub = 0;
336 if ($cli_opts{'stub'}) {
337 $stub = 1;
339 if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
340 $options{empty_element_suffix} = ">";
341 $stub = -$stub;
343 if ($cli_opts{'deprecated'}) { # Allow <dir> and <menu> tags to pass through
344 _SetAllowedTag("dir");
345 _SetAllowedTag("menu");
347 $options{sanitize} = 1; # sanitize by default
348 if ($cli_opts{'no-sanitize'}) { # Do not sanitize
349 $options{sanitize} = 0;
351 if ($cli_opts{'sanitize'}) { # --sanitize always wins
352 $options{sanitize} = 1;
354 $options{xmlcheck} = $options{sanitize} ? 2 : 0;
355 if ($cli_opts{'no-validate-xml'}) { # Do not validate XML
356 $options{xmlcheck} = 0;
358 if ($cli_opts{'validate-xml'}) { # Validate XML output
359 $options{xmlcheck} = 1;
361 if ($cli_opts{'validate-xml-internal'}) { # Validate XML output internally
362 $options{xmlcheck} = 2;
364 die "--html4tags and --validate-xml are incompatible\n"
365 if $cli_opts{'html4tags'} && $options{xmlcheck} == 1;
366 die "--no-sanitize and --validate-xml-internal are incompatible\n"
367 if !$options{'sanitize'} && $options{xmlcheck} == 2;
368 if ($options{xmlcheck} == 1) {
369 eval { require XML::Simple; 1 } and $hasxml = 1 or $hasxml_err = $@;
370 eval { require XML::Parser; 1 } and $hasxmlp = 1 or $hasxmlp_err = $@ unless $hasxml;
371 die "$hasxml_err$hasxmlp_err" unless $hasxml || $hasxmlp;
373 if ($cli_opts{'tabwidth'}) {
374 my $tw = $cli_opts{'tabwidth'};
375 die "invalid tab width (must be integer)\n" unless looks_like_number $tw;
376 die "invalid tab width (must be >= 2 and <= 32)\n" unless $tw >= 2 && $tw <= 32;
377 $options{tab_width} = int(0+$tw);
379 $options{abs_prefix} = ""; # no abs prefix by default
380 if ($cli_opts{'absroot'}) { # Use abs prefix for absolute path URLs
381 my $abs = $cli_opts{'absroot'};
382 $abs =~ s{/+$}{};
383 $options{abs_prefix} = $abs;
385 $options{base_prefix} = ""; # no base prefix by default
386 if ($cli_opts{'base'}) { # Use base prefix for fragment URLs
387 $options{base_prefix} = $cli_opts{'base'};
389 if ($cli_opts{'htmlroot'}) { # Use URL prefix
390 $options{url_prefix} = $cli_opts{'htmlroot'};
392 if ($cli_opts{'imageroot'}) { # Use image URL prefix
393 $options{img_prefix} = $cli_opts{'imageroot'};
395 if (exists $cli_opts{'wiki'}) { # Enable wiki links
396 my $wpat = $cli_opts{'wiki'};
397 defined($wpat) or $wpat = "";
398 my $wopt = "s(:md)";
399 if ($wpat =~ /^(.*?)%\{((?:[0-9A-Za-z]|[Ss]\([^)]*\))*)\}(.*)$/) {
400 $options{wikipat} = $1 . "%{}" . $3;
401 $wopt = $2;
402 } else {
403 $options{wikipat} = $wpat . "%{}.html";
405 my $sval = 1;
406 while ($wopt =~ /^(.*?)s\(([^)]*)\)(.*)$/i) {
407 my $sarg = $2;
408 $wopt = $1 . "s" . $3;
409 $sarg =~ s/^\s+//; $sarg =~ s/\s+$//;
410 $sval = {} unless ref($sval) eq "HASH";
411 s/^\.//, $sval->{lc($_)}=1 foreach split(/(?:\s*,\s*)|(?:(?<!,)\s+(?!,))/, $sarg);
412 $sval = 1 unless scalar(keys(%$sval));
414 $options{wikiopt} = { map({$_ => 1} split(//,lc($wopt))) };
415 if (ref($sval) eq "HASH" && $sval->{':md'}) {
416 delete $sval->{':md'};
417 $sval->{$_} = 1 foreach qw(md rmd mkd mkdn mdwn mdown markdown litcoffee);
419 $options{wikiopt}->{'s'} = $sval if $options{wikiopt}->{'s'};
421 if ($cli_opts{'raw'}) {
422 $raw = 1;
424 if ($cli_opts{'stylesheet'}) { # Display the style sheet
425 $options{show_styles} = 1;
427 if ($cli_opts{'no-stylesheet'}) { # Do not display the style sheet
428 $options{show_styles} = 0;
430 $options{show_styles} = 1 if $stub && !defined($options{show_styles});
431 $options{tab_width} = 8 unless defined($options{tab_width});
433 my $hdrf = sub {
434 my $out = "";
435 if ($stub > 0) {
436 $out .= <<'HTML5';
437 <!DOCTYPE html>
438 <html xmlns="http://www.w3.org/1999/xhtml">
439 <head>
440 <meta charset="utf-8" />
441 <meta http-equiv="content-type" content="text/html; charset=utf-8" />
442 HTML5
443 } elsif ($stub < 0) {
444 $out .= <<'HTML4';
445 <html>
446 <head>
447 <meta charset="utf-8">
448 <meta http-equiv="content-type" content="text/html; charset=utf-8">
449 HTML4
451 if ($stub && ($options{title} || $options{h1})) {
452 my $title = $options{title};
453 defined($title) && $title ne "" or $title = $options{h1};
454 if (defined($title) && $title ne "") {
455 $title =~ s/&/&amp;/g;
456 $title =~ s/</&lt;/g;
457 $out .= "<title>$title</title>\n";
460 if ($options{show_styles}) {
461 my $stylesheet = $g_style_sheet;
462 $stylesheet =~ s/%\(base\)/$g_style_prefix/g;
463 $out .= $stylesheet;
465 if ($stub) {
466 $out .= "</head>\n<body style=\"text-align:center\">\n" .
467 "<div style=\"display:inline-block;text-align:left;max-width:42pc\">\n";
469 $out;
472 #### Process incoming text: ###########################
473 my ($didhdr, $hdr, $result, $ftr) = (0, "", "", "");
474 @ARGV or push(@ARGV, "-");
475 foreach (@ARGV) {
476 my ($fh, $contents, $oneresult);
477 $_ eq "-" or open $fh, '<', $_ or fauxdie "could not open \"$_\": $!\n";
479 local $/; # Slurp the whole file
480 $_ eq "-" and $contents = <STDIN>;
481 $_ ne "-" and $contents = <$fh>;
483 defined($contents) or fauxdie "could not read \"$_\": $!\n";
484 $_ eq "-" or close($fh);
485 $oneresult = $raw ? ProcessRaw($contents, \%options) : Markdown($contents, \%options);
486 $oneresult =~ s/\s+$//os;
487 if ($oneresult ne "") {
488 if (!$didhdr && !$raw) {
489 $hdr = &$hdrf();
490 $didhdr = 1;
492 $result .= $oneresult . "\n";
495 $hdr = &$hdrf() unless $didhdr || $raw;
496 $ftr = "</div>\n</body>\n</html>\n" if $stub && !$raw;
497 if ($options{xmlcheck} == 1) {
498 my ($good, $errs);
499 if ($stub && !$raw) {
500 ($good, $errs) = _xmlcheck($hdr.$result.$ftr);
501 } else {
502 ($good, $errs) = _xmlcheck("<div>".$result."</div>");
504 $good or die $errs;
506 print $hdr, $result, $ftr;
508 exit 0;
512 sub _xmlcheck {
513 my $text = shift;
514 my ($good, $errs);
515 ($hasxml ? eval { XML::Simple::XMLin($text, KeepRoot => 1) && 1 } :
516 eval {
517 my $p = XML::Parser->new(Style => 'Tree', ErrorContext => 1);
518 $p->parse($text) && 1;
519 }) and $good = 1 or $errs = _trimerr($@);
520 ($good, $errs);
524 sub _trimerr {
525 my $err = shift;
526 1 while $err =~ s{\s+at\s+\.?/[^,\s\n]+\sline\s+[0-9]+\.?(\n|$)}{$1}is;
527 $err =~ s/\s+$//os;
528 $err . "\n";
532 sub _PrepareInput {
533 my $input = shift;
534 defined $input or $input = "";
536 use bytes;
537 $input =~ s/[\x00-\x08\x0B\x0E-\x1F\x7F]+//gso;
539 my $output;
540 if (Encode::is_utf8($input) || utf8::decode($input)) {
541 $output = $input;
542 } else {
543 $output = $encoder->decode($input, Encode::FB_DEFAULT);
545 # Standardize line endings:
546 $output =~ s{\r\n}{\n}g; # DOS to Unix
547 $output =~ s{\r}{\n}g; # Mac to Unix
548 return $output;
552 sub ProcessRaw {
553 my $text = _PrepareInput(shift);
555 %opt = (
556 empty_element_suffix => $g_empty_element_suffix,
558 my %args = ();
559 if (ref($_[0]) eq "HASH") {
560 %args = %{$_[0]};
561 } else {
562 %args = @_;
564 while (my ($k,$v) = each %args) {
565 $opt{$k} = $v;
567 $opt{xmlcheck} = 0 unless looks_like_number($opt{xmlcheck});
569 # Sanitize all '<'...'>' tags if requested
570 $text = _SanitizeTags($text, $opt{xmlcheck} == 2) if $opt{sanitize};
572 utf8::encode($text);
573 return $text;
577 sub Markdown {
579 # Primary function. The order in which other subs are called here is
580 # essential. Link and image substitutions need to happen before
581 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
582 # and <img> tags get encoded.
584 my $text = _PrepareInput(shift);
586 # Any remaining arguments after the first are options; either a single
587 # hashref or a list of name, value paurs.
588 %opt = (
589 # set initial defaults
590 style_prefix => $g_style_prefix,
591 empty_element_suffix => $g_empty_element_suffix,
592 tab_width => $g_tab_width,
593 indent_width => $g_indent_width,
594 url_prefix => "", # Prefixed to non-absolute URLs
595 img_prefix => "", # Prefixed to non-absolute image URLs
597 my %args = ();
598 if (ref($_[0]) eq "HASH") {
599 %args = %{$_[0]};
600 } else {
601 %args = @_;
603 while (my ($k,$v) = each %args) {
604 $opt{$k} = $v;
606 $opt{xmlcheck} = 0 unless looks_like_number($opt{xmlcheck});
608 # Clear the globals. If we don't clear these, you get conflicts
609 # from other articles when generating a page which contains more than
610 # one article (e.g. an index page that shows the N most recent
611 # articles):
612 %g_urls = ();
613 %g_titles = ();
614 %g_anchors = ();
615 %g_block_ids = ();
616 %g_code_block_ids = ();
617 %g_html_blocks = ();
618 %g_code_blocks = ();
619 $g_list_level = 0;
621 # Make sure $text ends with a couple of newlines:
622 $text .= "\n\n";
624 # Handle backticks-delimited code blocks
625 $text = _HashBTCodeBlocks($text);
627 # Convert all tabs to spaces.
628 $text = _DeTab($text);
630 # Strip any lines consisting only of spaces.
631 # This makes subsequent regexen easier to write, because we can
632 # match consecutive blank lines with /\n+/ instead of something
633 # contorted like / *\n+/ .
634 $text =~ s/^ +$//mg;
636 # Turn block-level HTML blocks into hash entries
637 $text = _HashHTMLBlocks($text);
639 # Strip link definitions, store in hashes.
640 $text = _StripLinkDefinitions($text);
642 $text = _RunBlockGamut($text, 1);
644 # Remove indentation markers
645 $text =~ s/\027+//gs;
647 # Unhashify code blocks
648 $text =~ s/(\025\d+\026)/$g_code_blocks{$1}/g;
650 $text = _UnescapeSpecialChars($text);
652 $text .= "\n" unless $text eq "";
654 # Sanitize all '<'...'>' tags if requested
655 $text = _SanitizeTags($text, $opt{xmlcheck} == 2) if $opt{sanitize};
657 utf8::encode($text);
658 if (defined($opt{h1}) && $opt{h1} ne "" && ref($_[0]) eq "HASH") {
659 utf8::encode($opt{h1});
660 ${$_[0]}{h1} = $opt{h1}
662 return $text;
666 sub _HashBTCodeBlocks {
668 # Process Markdown backticks (```) delimited code blocks
670 my $text = shift;
671 my $less_than_indent = $opt{indent_width} - 1;
673 $text =~ s{
674 (?:(?<=\n)|\A)
675 ([ ]{0,$less_than_indent})``(`+)[ \t]*(?:([\w.+-]+[#]?)[ \t]*)?\n
676 ( # $4 = the code block -- one or more lines, starting with ```
678 .*\n+
681 # and ending with ``` or end of document
682 (?:(?:[ ]{0,$less_than_indent}``\2[ \t]*(?:\n|\Z))|\Z)
684 # $2 contains syntax highlighting to use if defined
685 my $leadsp = length($1);
686 my $codeblock = $4;
687 $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
688 $codeblock = _DeTab($codeblock, 8, $leadsp); # physical tab stops are always 8
689 $codeblock =~ s/\A\n+//; # trim leading newlines
690 $codeblock =~ s/\s+\z//; # trim trailing whitespace
691 $codeblock = _EncodeCode($codeblock); # or run highlighter here
692 $codeblock = "<div class=\"$opt{style_prefix}code-bt\"><pre style=\"display:none\"></pre><pre><code>"
693 . $codeblock . "\n</code></pre></div>";
695 my $key = block_id($codeblock);
696 $g_html_blocks{$key} = $codeblock;
697 "\n\n" . $key . "\n\n";
698 }egmx;
700 return $text;
704 sub _StripLinkDefinitions {
706 # Strips link definitions from text, stores the URLs and titles in
707 # hash references.
709 my $text = shift;
710 my $less_than_indent = $opt{indent_width} - 1;
712 # Link defs are in the form: ^[id]: url "optional title"
713 while ($text =~ s{
714 ^[ ]{0,$less_than_indent}\[(.+)\]: # id = $1
715 [ ]*
716 \n? # maybe *one* newline
717 [ ]*
718 <?((?:\S(?:\\\n\s*[^\s"(])?)+?)>? # url = $2
719 [ ]*
720 \n? # maybe one newline
721 [ ]*
723 (?<=\s) # lookbehind for whitespace
724 (?:(['"])|(\()) # title quote char
725 (.+?) # title = $5
726 (?(4)\)|\3) # match same quote
727 [ ]*
728 )? # title is optional
729 (?:\n+|\Z)
731 {}mx) {
732 my $id = _strip(lc $1); # Link IDs are case-insensitive
733 my $url = $2;
734 my $title = _strip($5);
735 $url =~ s/\\\n\s*//gs;
736 if ($id ne "") {
737 # These values always get passed through _MakeATag or _MakeIMGTag later
738 $g_urls{$id} = $url;
739 if (defined($title) && $title ne "") {
740 $g_titles{$id} = $title;
745 return $text;
748 my ($block_tags_a, $block_tags_b);
749 BEGIN {
750 $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/io;
751 $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
754 sub _HashHTMLBlocks {
755 my $text = shift;
756 my $less_than_indent = $opt{indent_width} - 1;
757 my $idt = "\027" x $g_list_level;
759 # Hashify HTML blocks:
760 # We only want to do this for block-level HTML tags, such as headers,
761 # lists, and tables. That's because we still want to wrap <p>s around
762 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
763 # phrase emphasis, and spans. The list of tags we're looking for is
764 # hard-coded:
766 # First, look for nested blocks, e.g.:
767 # <div>
768 # <div>
769 # tags for inner block must be indented.
770 # </div>
771 # </div>
773 # The outermost tags must start at the left margin for this to match, and
774 # the inner nested divs must be indented.
775 # We need to do this before the next, more liberal match, because the next
776 # match will start at the first `<div>` and stop at the first `</div>`.
777 $text =~ s{
778 ( # save in $1
779 ^ # start of line (with /m)
780 ((?:\Q$idt\E)?) # optional lead in = $2
781 <($block_tags_a) # start tag = $3
782 \b # word break
783 (?:.*\n)*? # any number of lines, minimally matching
784 \2</\3> # the matching end tag
785 [ ]* # trailing spaces
786 (?=\n+|\Z) # followed by a newline or end of document
789 my $key = block_id($1);
790 $g_html_blocks{$key} = $1;
791 "\n\n" . $key . "\n\n";
792 }eigmx;
796 # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
798 $text =~ s{
799 ( # save in $1
800 ^ # start of line (with /m)
801 (?:\Q$idt\E)? # optional lead in
802 <($block_tags_b) # start tag = $2
803 \b # word break
804 (?:.*\n)*? # any number of lines, minimally matching
805 .*</\2> # the matching end tag
806 [ ]* # trailing spaces
807 (?=\n+|\Z) # followed by a newline or end of document
810 my $key = block_id($1);
811 $g_html_blocks{$key} = $1;
812 "\n\n" . $key . "\n\n";
813 }eigmx;
814 # Special case just for <hr />. It was easier to make a special case than
815 # to make the other regex more complicated.
816 $text =~ s{
818 (?<=\n) # Starting after end of line
819 | # or
820 \A # the beginning of the doc
822 ( # save in $1
823 [ ]{0,$less_than_indent}
824 <(?:hr) # start tag
825 \b # word break
826 (?:[^<>])*? #
827 /?> # the matching end tag
828 [ ]*
829 (?=\n{1,}|\Z) # followed by end of line or end of document
832 my $key = block_id($1);
833 $g_html_blocks{$key} = $1;
834 "\n\n" . $key . "\n\n";
835 }eigx;
837 # Special case for standalone HTML comments:
838 $text =~ s{
840 (?<=\n\n) # Starting after a blank line
841 | # or
842 \A\n? # the beginning of the doc
844 ( # save in $1
845 [ ]{0,$less_than_indent}
846 (?s:
847 <!--
848 (?:[^-]|(?:-(?!-)))*
851 [ ]*
852 (?=\n{1,}|\Z) # followed by end of line or end of document
855 my $key = block_id($1);
856 $g_html_blocks{$key} = $1;
857 "\n\n" . $key . "\n\n";
858 }egx;
861 return $text;
865 sub _RunBlockGamut {
867 # These are all the transformations that form block-level
868 # tags like paragraphs, headers, and list items.
870 my ($text, $anchors) = @_;
872 $text = _DoHeaders($text, $anchors);
874 # Do Horizontal Rules:
875 $text =~ s{^ {0,3}\*(?: {0,2}\*){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
876 $text =~ s{^ {0,3}\_(?: {0,2}\_){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
877 $text =~ s{^ {0,3}\-(?: {0,2}\-){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
879 $text = _DoListsAndBlocks($text);
881 $text = _DoTables($text);
883 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
884 # was to escape raw HTML in the original Markdown source. This time,
885 # we're escaping the markup we've just created, so that we don't wrap
886 # <p> tags around block-level tags.
887 $text = _HashHTMLBlocks($text);
889 $text = _FormParagraphs($text);
891 return $text;
895 sub _DoListBlocks {
896 return _DoBlockQuotes(_DoCodeBlocks($_[0])) if $_[0] ne "";
900 sub _RunSpanGamut {
902 # These are all the transformations that occur *within* block-level
903 # tags like paragraphs, headers, and list items.
905 my $text = shift;
907 $text = _DoCodeSpans($text);
909 $text = _EscapeSpecialChars($text);
911 # Process anchor and image tags. Images must come first,
912 # because ![foo][f] looks like an anchor.
913 $text = _DoImages($text);
914 $text = _DoAnchors($text);
916 # Make links out of things like `<http://example.com/>`
917 # Must come after _DoAnchors(), because you can use < and >
918 # delimiters in inline links like [this](<url>).
919 $text = _DoAutoLinks($text);
921 $text = _EncodeAmpsAndAngles($text);
923 $text = _DoItalicsAndBoldAndStrike($text);
925 # Do hard breaks:
926 $text =~ s/ {2,}\n/<br$opt{empty_element_suffix}\n/g;
928 return $text;
932 sub _EscapeSpecialChars {
933 my $text = shift;
934 my $tokens ||= _TokenizeHTML($text);
936 $text = ''; # rebuild $text from the tokens
937 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
938 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
940 foreach my $cur_token (@$tokens) {
941 if ($cur_token->[0] eq "tag") {
942 # Within tags, encode *, _ and ~ so they don't conflict
943 # with their use in Markdown for italics and strong.
944 # We're replacing each such character with its
945 # corresponding block id value; this is likely
946 # overkill, but it should prevent us from colliding
947 # with the escape values by accident.
948 $cur_token->[1] =~ s!([*_~])!$g_escape_table{$1}!g;
949 $text .= $cur_token->[1];
950 } else {
951 my $t = $cur_token->[1];
952 $t = _EncodeBackslashEscapes($t);
953 $text .= $t;
956 return $text;
960 sub _ProcessWikiLink {
961 my ($link_text, $link_loc) = @_;
962 if (defined($link_loc) &&
963 ($link_loc =~ m{^#\S*$} || $link_loc =~ m{^(?:http|ftp)s?://\S+$}i)) {
964 # Return the new link
965 return _MakeATag(_FindFragmentMatch($link_loc), $link_text);
967 if (!defined($link_loc) &&
968 ($link_loc = _strip($link_text)) =~ m{^(?:http|ftp)s?://\S+$}i) {
969 # Return the new link
970 return _MakeATag($link_loc, $link_text);
972 return undef if $link_loc eq "" || $link_text eq "";
973 if ($link_loc =~ /^[A-Za-z][A-Za-z0-9+.-]*:/os) {
974 # Unrecognized scheme
975 return undef;
977 if ($opt{wikipat}) {
978 my $o = $opt{wikiopt};
979 my $qsfrag = "";
980 my $base = $link_loc;
981 if ($link_loc =~ /^(.*?)([?#].*)$/os) {
982 ($base, $qsfrag) = ($1, $2);
984 $base = _wxform($base);
985 my $result = $opt{wikipat};
986 $result =~ s/%\{\}/$base/;
987 if ($qsfrag =~ /^([^#]*)(#.+)$/os) {
988 my ($q,$f) = ($1,$2);
989 #$f = _wxform($f) if $f =~ / /;
990 $qsfrag = $q . $f;
992 $result .= $qsfrag;
994 use bytes;
995 $result =~ s/%(?![0-9A-Fa-f]{2})/%25/sog;
996 if ($o->{r}) {
997 $result =~
998 s/([\x00-\x1F <>"{}|\\^`x7F])/sprintf("%%%02X",ord($1))/soge;
999 } else {
1000 $result =~
1001 s/([\x00-\x1F <>"{}|\\^`\x7F-\xFF])/sprintf("%%%02X",ord($1))/soge;
1003 $result =~ s/(%(?![0-9A-F]{2})[0-9A-Fa-f]{2})/uc($1)/soge;
1005 # Return the new link
1006 return _MakeATag($result, $link_text);
1008 # leave it alone
1009 return undef;
1013 sub _wxform {
1014 my $w = shift;
1015 my $o = $opt{wikiopt};
1016 my $opt_s = $o->{s};
1017 if ($opt_s) {
1018 if (ref($opt_s)) {
1019 if ($w =~ m{^(.*)[.]([^./]*)$}) {
1020 my ($base, $ext) = ($1, $2);
1021 $w = $base if $opt_s->{lc($ext)};
1023 } else {
1024 $w =~ s{[.][^./]*$}{};
1027 $w =~ tr{/}{ } if $o->{f};
1028 $w =~ s{/+}{/}gos if !$o->{f} && !$o->{v};
1029 if ($o->{d}) {
1030 $w =~ tr{ }{-};
1031 $w =~ s/-+/-/gos unless $o->{v};
1032 } else {
1033 $w =~ tr{ }{_};
1034 $w =~ s/_+/_/gos unless $o->{v};
1036 $w = uc($w) if $o->{u};
1037 $w = lc($w) if $o->{l};
1038 return $w;
1042 # Return a suitably encoded <a...> tag string
1043 # On input NONE of $url, $text or $title should be xmlencoded
1044 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1045 sub _MakeATag {
1046 my ($url, $text, $title) = @_;
1047 defined($url) or $url="";
1048 defined($text) or $text="";
1049 defined($title) or $title="";
1051 $url =~ m"^#" and $url = $opt{base_prefix} . $url;
1052 my $result = $g_escape_table{'<'}."a href=\"" . _EncodeAttText($url) . "\"";
1053 $title = _strip($title);
1054 $text =~ s{<(/?a)}{&lt;$1}sogi;
1055 $text = _DoItalicsAndBoldAndStrike($text);
1056 # We've got to encode any of these remaining to avoid
1057 # conflicting with other italics, bold and strike through.
1058 $text =~ s!([*_~])!$g_escape_table{$1}!g;
1059 $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne "";
1060 return $result . $g_escape_table{'>'} .
1061 $text . $g_escape_table{'<'}."/a".$g_escape_table{'>'};
1065 sub _DoAnchors {
1067 # Turn Markdown link shortcuts into XHTML <a> tags.
1069 my $text = shift;
1072 # First, handle wiki-style links: [[wiki style link]]
1074 $text =~ s{
1075 ( # wrap whole match in $1
1076 \[\[
1077 ($g_nested_brackets) # link text and id = $2
1078 \]\]
1081 my $result;
1082 my $whole_match = $1;
1083 my $link_text = $2;
1084 my $link_loc = undef;
1086 if ($link_text =~ /^(.*)\|(.*)$/s) {
1087 $link_text = $1;
1088 $link_loc = _strip($2);
1091 $result = _ProcessWikiLink($link_text, $link_loc);
1092 defined($result) or $result = $whole_match;
1093 $result;
1094 }xsge;
1097 # Next, handle reference-style links: [link text] [id]
1099 $text =~ s{
1100 ( # wrap whole match in $1
1102 ($g_nested_brackets) # link text = $2
1105 [ ]? # one optional space
1106 (?:\n[ ]*)? # one optional newline followed by spaces
1109 ($g_nested_brackets) # id = $3
1113 my $result;
1114 my $whole_match = $1;
1115 my $link_text = $2;
1116 my $link_id = $3;
1118 $link_id ne "" or $link_id = $link_text; # for shortcut links like [this][].
1119 $link_id = _strip(lc $link_id);
1121 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
1122 my $url = $g_urls{$link_id};
1123 defined($url) or $url = $g_anchors{$link_id};
1124 $url = _FindFragmentMatch($url);
1125 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1126 $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
1128 else {
1129 $result = $whole_match;
1131 $result;
1132 }xsge;
1135 # Subsequently, inline-style links: [link text](url "optional title")
1137 $text =~ s{
1138 ( # wrap whole match in $1
1140 ($g_nested_brackets) # link text = $2
1142 \( # literal paren
1143 ($g_nested_parens) # href and optional title = $3
1147 #my $result;
1148 my $whole_match = $1;
1149 my $link_text = $2;
1150 my ($url, $title) = _SplitUrlTitlePart($3);
1152 if (defined($url)) {
1153 $url = _FindFragmentMatch($url);
1154 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1155 _MakeATag(_PrefixURL($url), $link_text, $title);
1156 } else {
1157 # The href/title part didn't match the pattern
1158 $whole_match;
1160 }xsge;
1163 # Finally, handle reference-style implicit shortcut links: [link text]
1165 $text =~ s{
1166 ( # wrap whole match in $1
1168 ($g_nested_brackets) # link text = $2
1172 my $result;
1173 my $whole_match = $1;
1174 my $link_text = $2;
1175 my $link_id = _strip(lc $2);
1177 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
1178 my $url = $g_urls{$link_id};
1179 defined($url) or $url = $g_anchors{$link_id};
1180 $url = _FindFragmentMatch($url);
1181 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
1182 $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
1184 else {
1185 $result = $whole_match;
1187 $result;
1188 }xsge;
1190 return $text;
1194 sub _PeelWrapped {
1195 defined($_[0]) or return undef;
1196 if (substr($_[0],0,1) eq "(") {
1197 return substr($_[0], 1, length($_[0]) - (substr($_[0], -1, 1) eq ")" ? 2 : 1));
1199 return $_[0];
1203 sub _SplitUrlTitlePart {
1204 return ("", undef) if $_[0] =~ m{^\s*$}; # explicitly allowed
1205 my $u = $_[0];
1206 $u =~ s/^\s*(['\042])/# $1/;
1207 if ($u =~ m{
1208 ^ # match beginning
1209 \s*?
1210 <?([^\s'\042]\S*?)>? # URL = $1
1211 (?: # optional grouping
1212 \s+ # must be distinct from URL
1213 (['\042]?) # quote char = $2
1214 (.*?) # Title = $3
1215 \2? # matching quote
1216 )? # title is optional
1218 \z # match end
1219 }osx) {
1220 return (undef, undef) if $_[1] && ($1 eq "" || $1 eq "#");
1221 return (_PeelWrapped($1), $2 ? $3 : _PeelWrapped($3));
1222 } else {
1223 return (undef, undef);
1228 sub _FindFragmentMatch {
1229 my $url = shift;
1230 if (defined($url) && $url =~ /^#\S/) {
1231 # try very hard to find a match
1232 my $idbase = _strip(lc(substr($url, 1)));
1233 my $idbase0 = $idbase;
1234 my $id = _MakeAnchorId($idbase);
1235 if (defined($g_anchors_id{$id})) {
1236 $url = $g_anchors_id{$id};
1237 } else {
1238 $idbase =~ s/-/_/gs;
1239 $id = _MakeAnchorId($idbase);
1240 if (defined($g_anchors_id{$id})) {
1241 $url = $g_anchors_id{$id};
1242 } else {
1243 $id = _MakeAnchorId($idbase0, 1);
1244 if (defined($g_anchors_id{$id})) {
1245 $url = $g_anchors_id{$id};
1246 } else {
1247 $id = _MakeAnchorId($idbase, 1);
1248 if (defined($g_anchors_id{$id})) {
1249 $url = $g_anchors_id{$id};
1255 return $url;
1259 # Return a suitably encoded <img...> tag string
1260 # On input NONE of $url, $alt or $title should be xmlencoded
1261 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1262 sub _MakeIMGTag {
1263 my ($url, $alt, $title) = @_;
1264 defined($url) or $url="";
1265 defined($alt) or $alt="";
1266 defined($title) or $title="";
1267 return "" unless $url ne "";
1269 my $result = $g_escape_table{'<'}."img src=\"" . _EncodeAttText($url) . "\"";
1270 my ($w, $h) = (0, 0);
1271 ($alt, $title) = (_strip($alt), _strip($title));
1272 if ($title =~ /^(.*)\(([1-9][0-9]*)[xX]([1-9][0-9]*)\)$/os) {
1273 ($title, $w, $h) = (_strip($1), $2, $3);
1274 } elsif ($title =~ /^(.*)\(\?[xX]([1-9][0-9]*)\)$/os) {
1275 ($title, $h) = (_strip($1), $2);
1276 } elsif ($title =~ /^(.*)\(([1-9][0-9]*)[xX]\?\)$/os) {
1277 ($title, $w) = (_strip($1), $2);
1279 $result .= " alt=\"" . _EncodeAttText($alt) . "\"" if $alt ne "";
1280 $result .= " width=\"$w\"" if $w != 0;
1281 $result .= " height=\"$h\"" if $h != 0;
1282 $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne "";
1283 $result .= " /" unless $opt{empty_element_suffix} eq ">";
1284 $result .= $g_escape_table{'>'};
1285 return $result;
1289 sub _DoImages {
1291 # Turn Markdown image shortcuts into <img> tags.
1293 my $text = shift;
1296 # First, handle reference-style labeled images: ![alt text][id]
1298 $text =~ s{
1299 ( # wrap whole match in $1
1301 ($g_nested_brackets) # alt text = $2
1304 [ ]? # one optional space
1305 (?:\n[ ]*)? # one optional newline followed by spaces
1308 ($g_nested_brackets) # id = $3
1313 my $result;
1314 my $whole_match = $1;
1315 my $alt_text = $2;
1316 my $link_id = $3;
1318 $link_id ne "" or $link_id = $alt_text; # for shortcut links like ![this][].
1319 $link_id = _strip(lc $link_id);
1321 if (defined $g_urls{$link_id}) {
1322 $result = _MakeIMGTag(
1323 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1325 else {
1326 # If there's no such link ID, leave intact:
1327 $result = $whole_match;
1330 $result;
1331 }xsge;
1334 # Next, handle inline images: ![alt text](url "optional title")
1335 # Don't forget: encode * and _
1337 $text =~ s{
1338 ( # wrap whole match in $1
1340 ($g_nested_brackets) # alt text = $2
1342 \( # literal paren
1343 ($g_nested_parens) # src and optional title = $3
1347 my $whole_match = $1;
1348 my $alt_text = $2;
1349 my ($url, $title) = _SplitUrlTitlePart($3, 1);
1350 defined($url) ? _MakeIMGTag(_PrefixURL($url), $alt_text, $title) : $whole_match;
1351 }xsge;
1354 # Finally, handle reference-style implicitly labeled links: ![alt text]
1356 $text =~ s{
1357 ( # wrap whole match in $1
1359 ($g_nested_brackets) # alt text = $2
1363 my $result;
1364 my $whole_match = $1;
1365 my $alt_text = $2;
1366 my $link_id = lc(_strip($alt_text));
1368 if (defined $g_urls{$link_id}) {
1369 $result = _MakeIMGTag(
1370 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
1372 else {
1373 # If there's no such link ID, leave intact:
1374 $result = $whole_match;
1377 $result;
1378 }xsge;
1380 return $text;
1383 sub _EncodeAttText {
1384 my $text = shift;
1385 defined($text) or return undef;
1386 $text = _HTMLEncode(_strip($text));
1387 # We've got to encode these to avoid conflicting
1388 # with italics, bold and strike through.
1389 $text =~ s!([*_~:])!$g_escape_table{$1}!g;
1390 return $text;
1394 sub _MakeAnchorId {
1395 use bytes;
1396 my ($link, $strip) = @_;
1397 $link = lc($link);
1398 if ($strip) {
1399 $link =~ s/\s+/_/gs;
1400 $link =~ tr/-a-z0-9_//cd;
1401 } else {
1402 $link =~ tr/-a-z0-9_/_/cs;
1404 return '' unless $link ne '';
1405 $link = "_".$link."_";
1406 $link =~ s/__+/_/gs;
1407 $link = "_".md5_hex($link)."_" if length($link) > 66;
1408 return $link;
1412 sub _GetNewAnchorId {
1413 my $link = _strip(lc(shift));
1414 return '' if $link eq "" || defined($g_anchors{$link});
1415 my $id = _MakeAnchorId($link);
1416 return '' unless $id;
1417 $g_anchors{$link} = '#'.$id;
1418 $g_anchors_id{$id} = $g_anchors{$link};
1419 if ($id =~ /-/) {
1420 my $id2 = $id;
1421 $id2 =~ s/-/_/gs;
1422 $id2 =~ s/__+/_/gs;
1423 defined($g_anchors_id{$id2}) or $g_anchors_id{$id2} = $g_anchors{$link};
1425 my $idd = _MakeAnchorId($link, 1);
1426 if ($idd) {
1427 defined($g_anchors_id{$idd}) or $g_anchors_id{$idd} = $g_anchors{$link};
1428 if ($idd =~ /-/) {
1429 my $idd2 = $idd;
1430 $idd2 =~ s/-/_/gs;
1431 $idd2 =~ s/__+/_/gs;
1432 defined($g_anchors_id{$idd2}) or $g_anchors_id{$idd2} = $g_anchors{$link};
1435 $id;
1439 sub _DoHeaders {
1440 my ($text, $anchors) = @_;
1441 my $h1;
1442 my $geth1 = $anchors && !defined($opt{h1}) ? sub {
1443 return unless !defined($h1);
1444 my $h = shift;
1445 return unless defined($h) && $h !~ /^\s*$/;
1446 $h = _StripTags(_UnescapeSpecialChars($h));
1447 $h =~ s/^\s+//;
1448 $h =~ s/\s+$//;
1449 $h =~ s/\s+/ /g;
1450 $h1 = $h if $h ne "";
1451 } : sub {};
1453 # atx-style headers:
1454 # # Header 1
1455 # ## Header 2
1456 # ## Header 2 with closing hashes ##
1457 # ...
1458 # ###### Header 6
1460 $text =~ s{
1461 ^(\#{1,6}) # $1 = string of #'s
1462 [ ]*
1463 ((?:(?:(?<![#])[^\s]|[^#\s]).*?)?) # $2 = Header text
1464 [ ]*
1467 my $h_level = length($1);
1468 my $h = $2;
1469 $h =~ s/#+$//;
1470 $h =~ s/\s+$//;
1471 my $id = $h eq "" ? "" : _GetNewAnchorId($h);
1472 $id = " id=\"$id\"" if $id ne "";
1473 my $rsg = _RunSpanGamut($h);
1474 &$geth1($rsg) if $h_level == 1 && $h ne "";
1475 "<h$h_level$id>" . $rsg . "</h$h_level>\n\n";
1476 }egmx;
1478 # Setext-style headers:
1479 # Header 1
1480 # ========
1482 # Header 2
1483 # --------
1485 # Header 3
1486 # ~~~~~~~~
1488 $text =~ s{ ^(?:=+[ ]*\n)?[ ]*(.+?)[ ]*\n=+[ ]*\n+ }{
1489 my $h = $1;
1490 my $id = _GetNewAnchorId($h);
1491 $id = " id=\"$id\"" if $id ne "";
1492 my $rsg = _RunSpanGamut($h);
1493 &$geth1($rsg);
1494 "<h1$id>" . $rsg . "</h1>\n\n";
1495 }egmx;
1497 $text =~ s{ ^(?:-+[ ]*\n)?[ ]*(.+?)[ ]*\n-+[ ]*\n+ }{
1498 my $h = $1;
1499 my $id = _GetNewAnchorId($h);
1500 $id = " id=\"$id\"" if $id ne "";
1501 "<h2$id>" . _RunSpanGamut($h) . "</h2>\n\n";
1502 }egmx;
1504 $text =~ s{ ^(?:~+[ ]*\n)?[ ]*(.+?)[ ]*\n~+[ ]*\n+ }{
1505 my $h = $1;
1506 my $id = _GetNewAnchorId($h);
1507 $id = " id=\"$id\"" if $id ne "";
1508 "<h3$id>" . _RunSpanGamut($h) . "</h3>\n\n";
1509 }egmx;
1511 $opt{h1} = $h1 if defined($h1) && $h1 ne "";
1512 return $text;
1516 my ($marker_ul, $marker_ol, $marker_any, $roman_numeral, $greek_lower);
1517 BEGIN {
1518 # Re-usable patterns to match list item bullets and number markers:
1519 $roman_numeral = qr/(?:
1520 [IiVvXx]|[Ii]{2,3}|[Ii][VvXx]|[VvXx][Ii]{1,3}|[Xx][Vv][Ii]{0,3}|
1521 [Xx][Ii][VvXx]|[Xx]{2}[Ii]{0,3}|[Xx]{2}[Ii]?[Vv]|[Xx]{2}[Vv][Ii]{1,2})/ox;
1522 $greek_lower = qr/(?:[\x{03b1}-\x{03c9}])/o;
1523 $marker_ul = qr/[*+-]/o;
1524 $marker_ol = qr/(?:\d+|[A-Za-z]|$roman_numeral|$greek_lower)[.\)]/o;
1525 $marker_any = qr/(?:$marker_ul|$marker_ol)/o;
1529 sub _GetListMarkerType {
1530 my ($list_type, $list_marker, $last_marker) = @_;
1531 return "" unless $list_type && $list_marker && lc($list_type) eq "ol";
1532 my $last_marker_type = '';
1533 $last_marker_type = _GetListMarkerType($list_type, $last_marker)
1534 if defined($last_marker) &&
1535 # these are roman unless $last_marker type case matches and is 'a' or 'A'
1536 $list_marker =~ /^[IiVvXx][.\)]?$/;
1537 return "I" if $list_marker =~ /^[IVX]/ && $last_marker_type ne 'A';
1538 return "i" if $list_marker =~ /^[ivx]/ && $last_marker_type ne 'a';
1539 return "A" if $list_marker =~ /^[A-Z]/;
1540 return "a" if $list_marker =~ /^[a-z]/ || $list_marker =~ /^$greek_lower/o;
1541 return "1";
1545 sub _GetListItemTypeClass {
1546 my ($list_type, $list_marker, $last_marker) = @_;
1547 my $list_marker_type = _GetListMarkerType($list_type, $list_marker, $last_marker);
1548 my $ans = &{sub{
1549 return "" unless length($list_marker) >= 2 && $list_marker_type =~ /^[IiAa1]$/;
1550 return "lower-greek" if $list_marker_type eq "a" && $list_marker =~ /^$greek_lower/o;
1551 return "" unless $list_marker =~ /\)$/;
1552 return "upper-roman" if $list_marker_type eq "I";
1553 return "lower-roman" if $list_marker_type eq "i";
1554 return "upper-alpha" if $list_marker_type eq "A";
1555 return "lower-alpha" if $list_marker_type eq "a";
1556 return "decimal";
1558 return ($list_marker_type, $ans);
1562 my %_roman_number_table;
1563 BEGIN {
1564 %_roman_number_table = (
1565 i => 1,
1566 ii => 2,
1567 iii => 3,
1568 iv => 4,
1569 v => 5,
1570 vi => 6,
1571 vii => 7,
1572 viii => 8,
1573 ix => 9,
1574 x => 10,
1575 xi => 11,
1576 xii => 12,
1577 xiii => 13,
1578 xiv => 14,
1579 xv => 15,
1580 xvi => 16,
1581 xvii => 17,
1582 xviii => 18,
1583 xix => 19,
1584 xx => 20,
1585 xxi => 21,
1586 xxii => 22,
1587 xxiii => 23,
1588 xxiv => 24,
1589 xxv => 25,
1590 xxvi => 26,
1591 xxvii => 27
1596 # Necessary because ς and σ are the same value grrr
1597 my %_greek_number_table;
1598 BEGIN {
1599 %_greek_number_table = (
1600 "\x{03b1}" => 1, # α
1601 "\x{03b2}" => 2, # β
1602 "\x{03b3}" => 3, # γ
1603 "\x{03b4}" => 4, # δ
1604 "\x{03b5}" => 5, # ε
1605 "\x{03b6}" => 6, # ζ
1606 "\x{03b7}" => 7, # η
1607 "\x{03b8}" => 8, # θ
1608 "\x{03b9}" => 9, # ι
1609 "\x{03ba}" => 10, # κ
1610 "\x{03bb}" => 11, # λ
1611 #"\x{00b5}"=> 12, # µ is "micro" not "mu"
1612 "\x{03bc}" => 12, # μ
1613 "\x{03bd}" => 13, # ν
1614 "\x{03be}" => 14, # ξ
1615 "\x{03bf}" => 15, # ο
1616 "\x{03c0}" => 16, # π
1617 "\x{03c1}" => 17, # ρ
1618 "\x{03c2}" => 18, # ς
1619 "\x{03c3}" => 18, # σ
1620 "\x{03c4}" => 19, # τ
1621 "\x{03c5}" => 20, # υ
1622 "\x{03c6}" => 21, # φ
1623 "\x{03c7}" => 22, # χ
1624 "\x{03c8}" => 23, # ψ
1625 "\x{03c9}" => 24 # ω
1630 sub _GetMarkerIntegerNum {
1631 my ($list_marker_type, $marker_val) = @_;
1632 my $ans = &{sub{
1633 return 0 + $marker_val if $list_marker_type eq "1";
1634 $list_marker_type = lc($list_marker_type);
1635 return $_greek_number_table{$marker_val}
1636 if $list_marker_type eq "a" &&
1637 defined($_greek_number_table{$marker_val});
1638 $marker_val = lc($marker_val);
1639 return ord($marker_val) - ord("a") + 1 if $list_marker_type eq "a";
1640 return 1 unless $list_marker_type eq "i";
1641 defined($_roman_number_table{$marker_val}) and
1642 return $_roman_number_table{$marker_val};
1643 return 1;
1645 return $ans if $ans == 0 && $list_marker_type eq "1";
1646 return $ans >= 1 ? $ans : 1;
1650 sub _IncrList {
1651 my ($from, $to, $extra) = @_;
1652 $extra = defined($extra) ? " $extra" : "";
1653 my $result = "";
1654 while ($from + 10 <= $to) {
1655 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-10\"></span>\n";
1656 $from += 10;
1658 while ($from + 5 <= $to) {
1659 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-5\"></span>\n";
1660 $from += 5;
1662 while ($from + 2 <= $to) {
1663 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-2\"></span>\n";
1664 $from += 2;
1666 while ($from < $to) {
1667 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr\"></span>\n";
1668 ++$from;
1670 return $result;
1674 sub _DoListsAndBlocks {
1676 # Form HTML ordered (numbered) and unordered (bulleted) lists.
1678 my $text = shift;
1679 my $indent = $opt{indent_width};
1680 my $less_than_indent = $indent - 1;
1681 my $less_than_double_indent = 2 * $indent - 1;
1683 # Re-usable pattern to match any entire ul or ol list:
1684 my $whole_list = qr{
1685 ( # $1 (or $_[0]) = whole list
1686 ( # $2 (or $_[1])
1687 (?:(?<=\n)|\A)
1688 [ ]{0,$less_than_indent}
1689 (${marker_any}) # $3 (or $_[2]) = first list item marker
1690 [ ]+
1692 (?s:.+?)
1693 ( # $4 (or $_[3])
1696 \n{2,}
1697 (?=\S)
1698 (?! # Negative lookahead for another list item marker
1699 ${marker_any}[ ]
1703 }mx;
1705 my $list_item_sub = sub {
1706 my $list = $_[0];
1707 my $list_type = ($_[2] =~ m/$marker_ul/) ? "ul" : "ol";
1708 my $list_att = "";
1709 my $list_class = "";
1710 my $list_incr = "";
1711 # Turn double returns into triple returns, so that we can make a
1712 # paragraph for the last item in a list, if necessary:
1713 $list =~ s/\n\n/\n\n\n/g;
1714 my ($result, $first_marker, $fancy) = _ProcessListItems($list_type, $list);
1715 defined($first_marker) or return $list;
1716 my $list_marker_type = _GetListMarkerType($list_type, $first_marker);
1717 if ($list_marker_type) {
1718 $first_marker =~ s/[.\)]$//;
1719 my $first_marker_num = _GetMarkerIntegerNum($list_marker_type, $first_marker);
1720 $list_att = $list_marker_type eq "1" ? "" : " type=\"$list_marker_type\"";
1721 if ($fancy) {
1722 $list_class = " class=\"$opt{style_prefix}ol\"";
1723 my $start = $first_marker_num;
1724 $start = 10 if $start > 10;
1725 $start = 5 if $start > 5 && $start < 10;
1726 $start = 1 if $start > 1 && $start < 5;
1727 $list_att .= " start=\"$start\"" unless $start == 1;
1728 $list_incr = _IncrList($start, $first_marker_num);
1729 } else {
1730 $list_class = " class=\"$opt{style_prefix}lc-greek\""
1731 if $list_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
1732 $list_att .= " start=\"$first_marker_num\"" unless $first_marker_num == 1;
1735 my $idt = "\027" x $g_list_level;
1736 $result = "$idt<$list_type$list_att$list_class>\n$list_incr" . $result . "$idt</$list_type>\n\n";
1737 $result;
1740 # We use a different prefix before nested lists than top-level lists.
1741 # See extended comment in _ProcessListItems().
1743 # Note: (jg) There's a bit of duplication here. My original implementation
1744 # created a scalar regex pattern as the conditional result of the test on
1745 # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
1746 # substitution once, using the scalar as the pattern. This worked,
1747 # everywhere except when running under MT on my hosting account at Pair
1748 # Networks. There, this caused all rebuilds to be killed by the reaper (or
1749 # perhaps they crashed, but that seems incredibly unlikely given that the
1750 # same script on the same server ran fine *except* under MT. I've spent
1751 # more time trying to figure out why this is happening than I'd like to
1752 # admit. My only guess, backed up by the fact that this workaround works,
1753 # is that Perl optimizes the substition when it can figure out that the
1754 # pattern will never change, and when this optimization isn't on, we run
1755 # afoul of the reaper. Thus, the slightly redundant code to that uses two
1756 # static s/// patterns rather than one conditional pattern.
1758 # Note: (kjm) With the addition of the two-of-the-same-kind-in-a-row-
1759 # starts-a-list-at-the-top-level rule the two patterns really are somewhat
1760 # different now, but the duplication has pretty much been eliminated via
1761 # use of a separate sub which has the side-effect of making the below
1762 # two cases much easier to grok all at once.
1764 if ($g_list_level) {
1765 my $parse = $text;
1766 $text = "";
1767 pos($parse) = 0;
1768 while ($parse =~ /\G(?s:.)*?^$whole_list/gmc) {
1769 my @captures = ($1, $2, $3, $4);
1770 if ($-[1] > $-[0]) {
1771 $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
1773 $text .= &$list_item_sub(@captures);
1775 $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
1777 else {
1778 my $parse = $text;
1779 $text = "";
1780 pos($parse) = 0;
1781 while ($parse =~ m{\G(?s:.)*?
1782 (?: (?<=\n\n) |
1783 \A\n? |
1784 (?<=:\n) |
1785 (?:(?<=\n) # a list starts with one unordered marker line
1786 (?=[ ]{0,$less_than_indent}$marker_ul[ ])) |
1787 (?:(?<=\n) # or two ordered marker lines in a row
1788 (?=[ ]{0,$less_than_indent}$marker_ol[ ].*\n\n?
1789 [ ]{0,$less_than_indent}$marker_ol[ ])) |
1790 (?:(?<=\n) # or any marker and a sublist marker
1791 (?=[ ]{0,$less_than_indent}$marker_any[ ].*\n\n?
1792 [ ]{$indent,$less_than_double_indent}$marker_any[ ]))
1794 $whole_list
1795 }gmcx) {
1796 my @captures = ($1, $2, $3, $4);
1797 if ($-[1] > $-[0]) {
1798 $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
1800 $text .= &$list_item_sub(@captures);
1802 $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
1805 return $text;
1809 sub _ProcessListItems {
1811 # Process the contents of a single ordered or unordered list, splitting it
1812 # into individual list items.
1815 my $list_type = shift;
1816 my $list_str = shift;
1818 # The $g_list_level global keeps track of when we're inside a list.
1819 # Each time we enter a list, we increment it; when we leave a list,
1820 # we decrement. If it's zero, we're not in a list anymore.
1822 # We do this because when we're not inside a list, we want to treat
1823 # something like this:
1825 # I recommend upgrading to version
1826 # 8. Oops, now this line is treated
1827 # as a sub-list.
1829 # As a single paragraph, despite the fact that the second line starts
1830 # with a digit-period-space sequence.
1832 # Whereas when we're inside a list (or sub-list), that line will be
1833 # treated as the start of a sub-list. What a kludge, huh? This is
1834 # an aspect of Markdown's syntax that's hard to parse perfectly
1835 # without resorting to mind-reading. Perhaps the solution is to
1836 # change the syntax rules such that sub-lists must start with a
1837 # starting cardinal number; e.g. "1." or "a.".
1839 $g_list_level++;
1840 my $idt = "\027" x $g_list_level;
1841 my $marker_kind = $list_type eq "ul" ? $marker_ul : $marker_ol;
1842 my $first_marker;
1843 my $first_marker_type;
1844 my $first_marker_num;
1845 my $last_marker;
1846 my $fancy;
1847 my $skipped;
1848 my $typechanged;
1849 my $next_num = 1;
1851 # trim trailing blank lines:
1852 $list_str =~ s/\n{2,}\z/\n/;
1854 my $result = "";
1855 my $oldpos = 0;
1856 pos($list_str) = 0;
1857 while ($list_str =~ m{\G # start where we left off
1858 (\n+)? # leading line = $1
1859 (^[ ]*) # leading whitespace = $2
1860 ($marker_any) [ ] ([ ]*) # list marker = $3 leading item space = $4
1861 }cgmx) {
1862 my $leading_line = $1;
1863 my $leading_space = $2;
1864 my $list_marker = $3;
1865 my $list_marker_len = length($list_marker);
1866 my $leading_item_space = $4;
1867 if ($-[0] > $oldpos) {
1868 $result .= substr($list_str, $oldpos, $-[0] - $oldpos); # Sort-of $`
1869 $oldpos = $-[0]; # point at start of this entire match
1871 if (!defined($first_marker)) {
1872 $first_marker = $list_marker;
1873 $first_marker_type = _GetListMarkerType($list_type, $first_marker);
1874 if ($first_marker_type) {
1875 (my $marker_val = $first_marker) =~ s/[.\)]$//;
1876 $first_marker_num = _GetMarkerIntegerNum($first_marker_type, $marker_val);
1877 $next_num = $first_marker_num;
1878 $skipped = 1 if $next_num != 1;
1880 } elsif ($list_marker !~ /$marker_kind/) {
1881 # Wrong marker kind, "fix up" the marker to a correct "lazy" marker
1882 # But keep the old length in $list_marker_len
1883 $list_marker = $last_marker;
1886 # Now grab the rest of this item's data upto but excluding the next
1887 # list marker at the SAME indent level, but sublists must be INCLUDED
1889 my $item = "";
1890 while ($list_str =~ m{\G
1891 ((?:.+?)(?:\n{1,2})) # list item text = $1
1892 (?= \n* (?: \z | # end of string OR
1893 (^[ ]*) # leading whitespace = $2
1894 ($marker_any) # next list marker = $3
1895 ([ ]+) )) # one or more spaces after marker = $4
1896 }cgmxs) {
1898 # If $3 has a left edge that is at the left edge of the previous
1899 # marker OR $3 has a right edge that is at the right edge of the
1900 # previous marker then we stop; otherwise we go on
1902 $item .= substr($list_str, $-[0], $+[0] - $-[0]); # $&
1903 last if !defined($4) || length($2) == length($leading_space) ||
1904 length($2) + length($3) == length($leading_space) + $list_marker_len;
1905 # move along, you're not the marker droid we're looking for...
1906 $item .= substr($list_str, $+[0], $+[4] - $+[0]);
1907 pos($list_str) = $+[4]; # ...move along over the marker droid
1909 # Remember where we parked
1910 $oldpos = pos($list_str);
1912 # Process the $list_marker $item
1914 my $liatt = '';
1915 my $checkbox = '';
1916 my $incr = '';
1918 if ($list_type eq "ul" && !$leading_item_space && $item =~ /^\[([ xX])\] +(.*)$/s) {
1919 my $checkmark = lc $1;
1920 $item = $2;
1921 my ($checkbox_class, $checkbox_val);
1922 if ($checkmark eq "x") {
1923 ($checkbox_class, $checkbox_val) = ("checkbox-on", "x");
1924 } else {
1925 ($checkbox_class, $checkbox_val) = ("checkbox-off", "&#160;");
1927 $liatt = " class=\"$opt{style_prefix}$checkbox_class\"";
1928 $checkbox = "<span><span></span></span><span></span><span>[<tt>$checkbox_val</tt>]&#160;</span>";
1929 } else {
1930 my $list_marker_type;
1931 ($list_marker_type, $liatt) = _GetListItemTypeClass($list_type, $list_marker, $last_marker);
1932 if ($list_type eq "ol" && defined($first_marker)) {
1933 my $styled = $fancy = 1 if $liatt && $list_marker =~ /\)$/;
1934 my ($sfx, $dash) = ("", "");
1935 ($sfx, $dash) = ("li", "-") if $styled;
1936 if ($liatt =~ /lower/) {
1937 $sfx .= "${dash}lc";
1938 } elsif ($liatt =~ /upper/) {
1939 $sfx .= "${dash}uc";
1941 $sfx .= "-greek" if $liatt =~ /greek/;
1942 $liatt = " class=\"$opt{style_prefix}$sfx\"" if $sfx;
1943 $typechanged = 1 if $list_marker_type ne $first_marker_type;
1944 (my $marker_val = $list_marker) =~ s/[.\)]$//;
1945 my $marker_num = _GetMarkerIntegerNum($list_marker_type, $marker_val);
1946 $marker_num = $next_num if $marker_num < $next_num;
1947 $skipped = 1 if $next_num < $marker_num;
1948 $incr = _IncrList($next_num, $marker_num, "incrlevel=$g_list_level");
1949 $liatt = " value=\"$marker_num\"$liatt" if $fancy || $skipped;
1950 $liatt = " type=\"$list_marker_type\"$liatt" if $styled || $typechanged;
1951 $next_num = $marker_num + 1;
1954 $last_marker = $list_marker;
1956 if ($leading_line or ($item =~ m/\n{2,}/)) {
1957 $item = _RunBlockGamut(_Outdent($item));
1958 $item =~ s{(</[OUou][Ll]>)\s*\z}{$1} and $item .= "\n$idt<span style=\"display:none\">&#160;</span>";
1960 else {
1961 # Recursion for sub-lists:
1962 $item = _DoListsAndBlocks(_Outdent($item));
1963 chomp $item;
1964 $item = _RunSpanGamut($item);
1967 # Append to $result
1968 $result .= "$incr$idt<li$liatt>" . $checkbox . $item . "$idt</li>\n";
1970 if ($fancy) {
1971 # remove "incrlevel=$g_list_level " parts
1972 $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr((?:-\d{1,2})?)">}
1973 {$idt<span class="$opt{style_prefix}ol-incr$1">}g;
1974 } else {
1975 # remove the $g_list_level incr spans entirely
1976 $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr(?:-\d{1,2})?"></span>\n}{}g;
1977 # remove the class="$opt{style_prefix}lc-greek" if first_marker is greek
1978 $result =~ s{(<li[^>]*?) class="$opt{style_prefix}lc-greek">}{$1>}g
1979 if defined($first_marker_type) && $first_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
1982 # Anything left over (similar to $') goes into result, but this should always be empty
1983 $result .= _RunBlockGamut(substr($list_str, pos($list_str))) if pos($list_str) < length($list_str);
1985 $g_list_level--;
1987 # After all that, if we only got an ordered list with a single item
1988 # and its first marker is a four-digit number >= 1492 and <= 2999
1989 # or an UPPERCASE letter, then pretend we didn't see any list at all.
1991 if ($first_marker_type && $first_marker_num + 1 == $next_num) {
1992 if (($first_marker_type eq "1" && $first_marker_num >= 1492 && $first_marker_num <= 2999) ||
1993 ($first_marker_type eq "A" && !$fancy)) {
1994 return (undef, undef, undef);
1998 return ($result, $first_marker, $fancy);
2002 sub _DoCodeBlocks {
2004 # Process Markdown `<pre><code>` blocks.
2007 my $text = shift;
2009 $text =~ s{
2010 (?:\n\n|\A\n?)
2011 ( # $1 = the code block -- one or more lines, starting with indent_width spaces
2013 (?:[ ]{$opt{indent_width}}) # Lines must start with indent_width of spaces
2014 .*\n+
2017 ((?=^[ ]{0,$opt{indent_width}}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
2019 my $codeblock = $1;
2021 $codeblock =~ s/\n\n\n/\n\n/g; # undo "paragraph for last list item" change
2022 $codeblock = _EncodeCode(_Outdent($codeblock));
2023 $codeblock =~ s/\A\n+//; # trim leading newlines
2024 $codeblock =~ s/\s+\z//; # trim trailing whitespace
2026 my $result = "<div class=\"$opt{style_prefix}code\"><pre style=\"display:none\"></pre><pre><code>"
2027 . $codeblock . "\n</code></pre></div>";
2028 my $key = block_id($result, 2);
2029 $g_code_blocks{$key} = $result;
2030 "\n\n" . $key . "\n\n";
2031 }egmx;
2033 return $text;
2037 sub _DoCodeSpans {
2039 # * Backtick quotes are used for <code></code> spans.
2041 # * You can use multiple backticks as the delimiters if you want to
2042 # include literal backticks in the code span. So, this input:
2044 # Just type ``foo `bar` baz`` at the prompt.
2046 # Will translate to:
2048 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
2050 # There's no arbitrary limit to the number of backticks you
2051 # can use as delimters. If you need three consecutive backticks
2052 # in your code, use four for delimiters, etc.
2054 # * You can use spaces to get literal backticks at the edges:
2056 # ... type `` `bar` `` ...
2058 # Turns to:
2060 # ... type <code>`bar`</code> ...
2063 my $text = shift;
2065 $text =~ s@
2066 (`+) # $1 = Opening run of `
2067 (.+?) # $2 = The code block
2068 (?<!`)
2069 \1 # Matching closer
2070 (?!`)
2072 my $c = "$2";
2073 $c =~ s/^[ ]+//g; # leading whitespace
2074 $c =~ s/[ ]+$//g; # trailing whitespace
2075 $c = _EncodeCode($c);
2076 "<code>$c</code>";
2077 @egsx;
2079 return $text;
2083 sub _EncodeCode {
2085 # Encode/escape certain characters inside Markdown code runs.
2086 # The point is that in code, these characters are literals,
2087 # and lose their special Markdown meanings.
2089 local $_ = shift;
2091 # Encode all ampersands; HTML entities are not
2092 # entities within a Markdown code span.
2093 s/&/&amp;/g;
2095 # Encode $'s, but only if we're running under Blosxom.
2096 # (Blosxom interpolates Perl variables in article bodies.)
2097 s/\$/&#036;/g if $_haveBX;
2099 # Do the angle bracket song and dance:
2100 s! < !&lt;!gx;
2101 s! > !&gt;!gx;
2103 # Now, escape characters that are magic in Markdown:
2104 s!([*_~{}\[\]\\])!$g_escape_table{$1}!g;
2106 return $_;
2110 sub _DoItalicsAndBoldAndStrike {
2111 my $text = shift;
2113 my $doital1 = sub {
2114 my $text = shift;
2115 $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
2116 {<em>$1</em>}gsx;
2117 # We've got to encode any of these remaining to
2118 # avoid conflicting with other italics and bold.
2119 $text =~ s!([*])!$g_escape_table{$1}!g;
2120 $text;
2122 my $doital2 = sub {
2123 my $text = shift;
2124 $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
2125 {<em>$1</em>}gsx;
2126 # We've got to encode any of these remaining to
2127 # avoid conflicting with other italics and bold.
2128 $text =~ s!([_])!$g_escape_table{$1}!g;
2129 $text;
2132 # <strong> must go first:
2133 $text =~ s{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }
2134 {"<strong>".&$doital1($1)."</strong>"}gsex;
2135 $text =~ s{ (?<!\w) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\w) }
2136 {"<strong>".&$doital2($1)."</strong>"}gsex;
2138 $text =~ s{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ }
2139 {<strike>$1</strike>}gsx;
2141 $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
2142 {<em>$1</em>}gsx;
2143 $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
2144 {<em>$1</em>}gsx;
2146 return $text;
2150 sub _DoBlockQuotes {
2151 my $text = shift;
2153 $text =~ s{
2154 ( # Wrap whole match in $1
2156 ^[ ]*>[ ]? # '>' at the start of a line
2157 .*\n # rest of the first line
2158 (.+\n)* # subsequent consecutive lines
2159 \n* # blanks
2163 my $bq = $1;
2164 $bq =~ s/^[ ]*>[ ]?//gm; # trim one level of quoting
2165 $bq =~ s/^[ ]+$//mg; # trim whitespace-only lines
2166 $bq = _RunBlockGamut($bq); # recurse
2168 $bq =~ s/^/\027/mg;
2169 "<blockquote>\n$bq\n</blockquote>\n\n";
2170 }egmx;
2173 return $text;
2177 my ($LEAD, $TRAIL, $LEADBAR, $LEADSP, $COLPL, $SEP);
2178 BEGIN {
2179 $LEAD = qr/(?>[ ]*(?:\|[ ]*)?)/o;
2180 $TRAIL = qr/[ ]*(?<!\\)\|[ ]*/o;
2181 $LEADBAR = qr/(?>[ ]*\|[ ]*)/o;
2182 $LEADSP = qr/(?>[ ]*)/o;
2183 $COLPL = qr/(?:[^\n|\\]|\\(?:(?>[^\n])|(?=\n|$)))+/o;
2184 $SEP = qr/[ ]*:?-+:?[ ]*/o;
2187 sub _DoTables {
2188 my $text = shift;
2190 $text =~ s{
2191 ( # Wrap whole thing to avoid $&
2192 (?: (?<=\n\n) | \A\n? ) # Preceded by blank line or beginning of string
2193 ^( # Header line
2194 $LEADBAR \| [^\n]* |
2195 $LEADBAR $COLPL [^\n]* |
2196 $LEADSP $COLPL \| [^\n]*
2198 ( # Separator line
2199 $LEADBAR $SEP (?: \| $SEP )* (?: \| [ ]*)? |
2200 $SEP (?: \| $SEP )+ (?: \| [ ]*)? |
2201 $SEP \| [ ]*
2203 ((?: # Rows (0+)
2204 $LEADBAR \| [^\n]* \n |
2205 $LEADBAR $COLPL [^\n]* \n |
2206 $LEADSP $COLPL \| [^\n]* \n
2210 my ($w, $h, $s, $rows) = ($1, $2, $3, $4);
2211 my @heads = _SplitTableRow($h);
2212 my @seps = _SplitTableRow($s);
2213 if (@heads == @seps) {
2214 my @align = map {
2215 if (/^:-+:$/) {" align=\"center\""}
2216 elsif (/^:/) {" align=\"left\""}
2217 elsif (/:$/) {" align=\"right\""}
2218 else {""}
2219 } @seps;
2220 my $nohdr = "";
2221 $nohdr = " $opt{style_prefix}table-nohdr" if join("", @heads) eq "";
2222 my $tab ="\n<table border=\"1\" cellspacing=\"0\" cellpadding=\"2\" class=\"$opt{style_prefix}table$nohdr\">\n";
2223 $tab .=
2224 " <tr class=\"$opt{style_prefix}row-hdr\">" . _MakeTableRow("th", \@align, @heads) . "</tr>\n"
2225 unless $nohdr;
2226 my $cnt = 0;
2227 my @classes = ("class=\"$opt{style_prefix}row-even\"", "class=\"$opt{style_prefix}row-odd\"");
2228 $tab .= " <tr " . $classes[++$cnt % 2] . ">" . _MakeTableRow("td", \@align, @$_) . "</tr>\n"
2229 foreach (_SplitMergeRows($rows));
2230 $tab .= "</table>\n\n";
2231 } else {
2234 }egmx;
2236 return $text;
2240 sub _SplitMergeRows {
2241 my @rows = ();
2242 my ($mergeprev, $mergenext) = (0,0);
2243 foreach (split(/\n/, $_[0])) {
2244 $mergeprev = $mergenext;
2245 $mergenext = 0;
2246 my @cols = _SplitTableRow($_);
2247 if (_endswithbareslash($cols[$#cols])) {
2248 my $last = $cols[$#cols];
2249 substr($last, -1, 1) = "";
2250 $last =~ s/[ ]+$//;
2251 $cols[$#cols] = $last;
2252 $mergenext = 1;
2254 if ($mergeprev) {
2255 for (my $i = 0; $i <= $#cols; ++$i) {
2256 my $cell = $rows[$#rows]->[$i];
2257 defined($cell) or $cell = "";
2258 $rows[$#rows]->[$i] = _MergeCells($cell, $cols[$i]);
2260 } else {
2261 push(@rows, [@cols]);
2264 return @rows;
2268 sub _endswithbareslash {
2269 return 0 unless substr($_[0], -1, 1) eq "\\";
2270 my @parts = split(/\\\\/, $_[0], -1);
2271 return substr($parts[$#parts], -1, 1) eq "\\";
2275 sub _MergeCells {
2276 my ($c1, $c2) = @_;
2277 return $c1 if $c2 eq "";
2278 return $c2 if $c1 eq "";
2279 return $c1 . " " . $c2;
2283 sub _SplitTableRow {
2284 my $row = shift;
2285 $row =~ s/^$LEAD//;
2286 $row =~ s/$TRAIL$//;
2287 $row =~ s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
2288 $row =~ s!\\\|!$g_escape_table{'|'}!go; # Then do \|
2289 my @elems = map {
2290 s!$g_escape_table{'|'}!|!go;
2291 s!$g_escape_table{'\\'}!\\\\!go;
2292 s/^[ ]+//;
2293 s/[ ]+$//;
2295 } split(/[ ]*\|[ ]*/, $row, -1);
2296 @elems or push(@elems, "");
2297 return @elems;
2301 sub _MakeTableRow {
2302 my $etype = shift;
2303 my $align = shift;
2304 my $row = "";
2305 for (my $i = 0; $i < @$align; ++$i) {
2306 my $data = $_[$i];
2307 defined($data) or $data = "";
2308 $row .= "<" . $etype . $$align[$i] . ">" .
2309 _RunSpanGamut($data) . "</" . $etype . ">";
2311 return $row;
2315 sub _FormParagraphs {
2317 # Params:
2318 # $text - string to process with html <p> tags
2320 my $text = shift;
2322 # Strip leading and trailing lines:
2323 $text =~ s/\A\n+//;
2324 $text =~ s/\n+\z//;
2326 my @grafs = split(/\n{2,}/, $text);
2329 # Wrap <p> tags.
2331 foreach (@grafs) {
2332 unless (defined($g_html_blocks{$_}) || defined($g_code_blocks{$_})) {
2333 $_ = _RunSpanGamut($_);
2334 s/^([ ]*)/<p>/;
2335 $_ .= "</p>";
2340 # Unhashify HTML blocks
2342 foreach (@grafs) {
2343 if (defined( $g_html_blocks{$_} )) {
2344 $_ = $g_html_blocks{$_};
2348 return join "\n\n", @grafs;
2352 my $g_possible_tag_name;
2353 my %ok_tag_name;
2354 BEGIN {
2355 # note: length("blockquote") == 10
2356 $g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6])/o;
2357 %ok_tag_name = map({$_ => 1} qw(
2358 a abbr acronym address area
2359 b basefont bdo big blockquote br
2360 caption center cite code col colgroup
2361 dd del dfn div dl dt
2363 font
2364 h1 h2 h3 h4 h5 h6 hr
2365 i img ins
2370 p pre
2372 s samp small span strike strong sub sup
2373 table tbody td tfoot th thead tr tt
2374 u ul
2377 $ok_tag_name{$_} = 0 foreach (qw(
2378 dir menu
2383 sub _SetAllowedTag {
2384 my ($tag, $forbid) = @_;
2385 $ok_tag_name{$tag} = $forbid ? 0 : 1
2386 if defined($tag) && exists($ok_tag_name{$tag});
2390 # Encode leading '<' of any non-tags
2391 # However, "<?", "<!" and "<$" are passed through (legacy on that "<$" thing)
2392 sub _DoTag {
2393 my $tag = shift;
2394 return $tag if $tag =~ /^<[?\$!]/;
2395 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
2396 $ok_tag_name{lc($1)}) {
2398 return _ProcessURLTag("href", $tag, 1) if $tag =~ /^<a\s/i;
2399 return _ProcessURLTag("src", $tag) if $tag =~ /^<img\s/i;
2400 return $tag;
2402 $tag =~ s/^</&lt;/;
2403 return $tag;
2406 # Strip out all tags that _DoTag would match
2407 sub _StripTags {
2408 my $text = shift;
2409 my $_StripTag = sub {
2410 my $tag = shift;
2411 return $tag if $tag =~ /^<[?\$!]/;
2412 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
2413 $ok_tag_name{lc($1)}) {
2415 return ""; # strip it out
2417 return $tag;
2419 $text =~ s{(<[^>]*>)}{&$_StripTag($1)}ige;
2420 return $text;
2423 my %univatt; # universally allowed attribute names
2424 my %tagatt; # per-element allowed attribute names
2425 my %tagmt; # empty element tags
2426 my %tagocl; # non-empty elements with optional closing tag
2427 my %tagacl; # which %tagocl an opening %tagocl will close
2428 my %tagblk; # block elements
2429 my %taga1p; # open tags which require at least one attribute
2430 my %lcattval; # names of attribute values to lowercase
2431 my %impatt; # names of "implied" attributes
2432 BEGIN {
2433 %univatt = map({$_ => 1} qw(class dir id lang style title xml:lang));
2434 %tagatt = (
2435 'a' => { map({$_ => 1} qw(href name)) },
2436 'area' => { map({$_ => 1} qw(alt coords href nohref shape)) },
2437 'basefont' => { map({$_ => 1} qw(color face size)) },
2438 'br' => { map({$_ => 1} qw(clear)) },
2439 'caption' => { map({$_ => 1} qw(align)) },
2440 'col' => { map({$_ => 1} qw(align span width valign)) },
2441 'colgroup' => { map({$_ => 1} qw(align span width valign)) },
2442 'dir' => { map({$_ => 1} qw(compact)) },
2443 'div' => { map({$_ => 1} qw(align)) },
2444 'dl' => { map({$_ => 1} qw(compact)) },
2445 'font' => { map({$_ => 1} qw(color face size)) },
2446 'h1' => { map({$_ => 1} qw(align)) },
2447 'h2' => { map({$_ => 1} qw(align)) },
2448 'h3' => { map({$_ => 1} qw(align)) },
2449 'h4' => { map({$_ => 1} qw(align)) },
2450 'h5' => { map({$_ => 1} qw(align)) },
2451 'h6' => { map({$_ => 1} qw(align)) },
2452 'hr' => { map({$_ => 1} qw(align noshade size width)) },
2453 # NO server-side image maps, therefore NOT ismap !
2454 'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) },
2455 'li' => { map({$_ => 1} qw(compact type value)) },
2456 'map' => { map({$_ => 1} qw(name)) },
2457 'menu' => { map({$_ => 1} qw(compact)) },
2458 'ol' => { map({$_ => 1} qw(compact start type)) },
2459 'p' => { map({$_ => 1} qw(align)) },
2460 'pre' => { map({$_ => 1} qw(width)) },
2461 'table' => { map({$_ => 1} qw(align border cellpadding cellspacing summary width)) },
2462 'tbody' => { map({$_ => 1} qw(align valign)) },
2463 'tfoot' => { map({$_ => 1} qw(align valign)) },
2464 'thead' => { map({$_ => 1} qw(align valign)) },
2465 'td' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) },
2466 'th' => { map({$_ => 1} qw(align colspan height nowrap rowspan valign width)) },
2467 'tr' => { map({$_ => 1} qw(align valign)) },
2468 'ul' => { map({$_ => 1} qw(compact type)) }
2470 %tagmt = map({$_ => 1} qw(area basefont br col hr img));
2471 %tagocl = map({$_ => 1} qw(colgroup dd dt li p tbody td tfoot th thead tr));
2472 %tagacl = (
2473 'colgroup' => \%tagocl,
2474 'dd' => \%tagocl,
2475 'dt' => \%tagocl,
2476 'li' => \%tagocl,
2477 'tbody' => \%tagocl,
2478 'td' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead)) },
2479 'tfoot' => \%tagocl,
2480 'th' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead)) },
2481 'thead' => \%tagocl,
2482 'tr' => { map({$_ => 1} qw(colgroup dd dt li p td tfoot th thead tr)) },
2484 %tagblk = map({$_ => 1} qw(address blockquote div dl h1 h2 h3 h4 h5 h6 hr ol p pre table));
2485 %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap));
2486 %lcattval = map({$_ => 1} qw(
2487 align border cellpadding cellspacing checked clear color colspan
2488 compact coords height hspace ismap nohref noshade nowrap rowspan size
2489 span shape valign vspace width
2491 %taga1p = map({$_ => 1} qw(a area img map));
2495 # _SanitizeTags
2497 # Inspect all '<'...'>' tags in the input and HTML encode those things
2498 # that cannot possibly be tags and at the same time sanitize them.
2500 # $1 => text to process
2501 # <= sanitized text
2502 sub _SanitizeTags {
2503 my ($text, $validate) = @_;
2504 $text =~ s/\s+$//;
2505 $text ne "" or return "";
2506 my @stack = ();
2507 my $ans = "";
2508 my $end = length($text);
2509 pos($text) = 0;
2510 my ($autoclose, $autoclopen);
2511 my $lastmt = "";
2512 $autoclose = sub {
2513 my $s = $_[0] || "";
2514 while (@stack && $stack[$#stack]->[0] ne $s &&
2515 $tagocl{$stack[$#stack]->[0]}) {
2516 $ans .= "</" . $stack[$#stack]->[0] . ">";
2517 pop(@stack);
2519 } if $validate;
2520 $autoclopen = sub {
2521 my $s = $_[0] || "";
2522 my $c;
2523 if ($tagblk{$s}) {$c = {p=>1}}
2524 elsif ($tagocl{$s}) {$c = $tagacl{$s}}
2525 else {return}
2526 while (@stack && $c->{$stack[$#stack]->[0]}) {
2527 $ans .= "</" . $stack[$#stack]->[0] . ">";
2528 pop(@stack);
2530 } if $validate;
2531 while (pos($text) < $end) {
2532 if ($text =~ /\G([^<]+)/gc) {
2533 $ans .= $1;
2534 $lastmt = "" if $1 =~ /\S/;
2535 next;
2537 my $tstart = pos($text);
2538 if ($text =~ /\G(<[^>]*>)/gc) {
2539 my $tag = $1;
2540 if ($tag =~ /^<!--/) { # pass "comments" through
2541 $ans .= $tag;
2542 next;
2544 my $tt;
2545 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} ||
2546 $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
2547 $ok_tag_name{$tt=lc($1)})
2549 my ($stag, $styp) = _Sanitize($tag);
2550 if ($styp == 2 && $lastmt eq $tt) {
2551 $lastmt = "";
2552 next;
2554 $lastmt = $styp == 3 ? $tt : "";
2555 if ($validate && $styp) {
2556 &$autoclopen($tt) if $styp == 1 || $styp == 3;
2557 if ($styp == 1) {
2558 push(@stack,[$tt,$tstart]);
2559 } elsif ($styp == 2) {
2560 &$autoclose($tt) unless $tt eq "p";
2561 !@stack and _xmlfail("closing tag $tt without matching open at " .
2562 _linecol($tstart, $text));
2563 if ($stack[$#stack]->[0] eq $tt) {
2564 pop(@stack);
2565 } else {
2566 my @i = @{$stack[$#stack]};
2567 _xmlfail("opening tag $i[0] at " . _linecol($i[1], $text) .
2568 " mismatch with closing tag $tt at " . _linecol($tstart, $text));
2572 $ans .= $stag;
2573 next;
2574 } else {
2575 $tag =~ s/^</&lt;/;
2576 $ans .= $tag;
2577 $lastmt = "";
2578 next;
2581 # can only get here if "\G" char is an unmatched "<"
2582 pos($text) += 1;
2583 $ans .= "&lt;";
2584 $lastmt = "";
2586 &$autoclose if $validate;
2587 if ($validate && @stack) {
2588 my @errs;
2589 my $j;
2590 for ($j = 0; $j <= $#stack; ++$j) {
2591 my @i = @{$stack[$j]};
2592 unshift(@errs, "opening tag $i[0] without matching close at " .
2593 _linecol($i[1], $text));
2595 _xmlfail(@errs);
2597 return $ans."\n";
2601 sub _linecol {
2602 my ($pos, $txt) = @_;
2603 pos($txt) = 0;
2604 my ($l, $p);
2605 $l = 1;
2606 ++$l while ($p = pos($txt)), $txt =~ /\G[^\n]*\n/gc && pos($txt) <= $pos;
2607 return "line $l col " . (1 + ($pos - $p));
2611 sub _xmlfail {
2612 die join("", map("$_\n", @_));
2616 sub _Sanitize {
2617 my $tag = shift;
2618 my $seenatt = {};
2619 if ($tag =~ m{^</}) {
2620 $tag =~ s/\s+>$/>/;
2621 return (lc($tag),2);
2623 if ($tag =~ /^<([^\s<\/>]+)\s+/gs) {
2624 my $tt = lc($1);
2625 my $out = "<" . $tt . " ";
2626 my $ok = $tagatt{$tt};
2627 ref($ok) eq "HASH" or $ok = {};
2628 while ($tag =~ /\G\s*([^\s\042\047<\/>=]+)((?>=)|\s*)/gcs) {
2629 my ($a,$s) = ($1, $2);
2630 if ($s eq "" && substr($tag, pos($tag), 1) =~ /^[\042\047]/) {
2631 # pretend the "=" sign wasn't overlooked
2632 $s = "=";
2634 if (substr($s,0,1) ne "=") {
2635 # it's one of "those" attributes (e.g. compact) or not
2636 # _SanitizeAtt will fix it up if it is
2637 $out .= _SanitizeAtt($a, '""', $ok, $seenatt);
2638 next;
2640 if ($tag =~ /\G([\042\047])((?:(?!\1)(?!<).)*)\1\s*/gcs) {
2641 $out .= _SanitizeAtt($a, $1.$2.$1, $ok, $seenatt);
2642 next;
2644 if ($tag =~ /\G([\042\047])((?:(?!\1)(?![<>])(?![\/][>]).)*)/gcs) {
2645 # what to do what to do what to do
2646 # trim trailing \s+ and magically add the missing quote
2647 my ($q, $v) = ($1, $2);
2648 $v =~ s/\s+$//;
2649 $out .= _SanitizeAtt($a, $q.$v.$q, $ok, $seenatt);
2650 next;
2652 if ($tag =~ /\G([^\s<\/>]+)\s*/gcs) {
2653 # auto quote it
2654 my $v = $1;
2655 $v =~ s/\042/&quot;/go;
2656 $out .= _SanitizeAtt($a, '"'.$v.'"', $ok, $seenatt);
2657 next;
2659 # give it an empty value
2660 $out .= _SanitizeAtt($a, '""', $ok, $seenatt);
2662 my $sfx = substr($tag, pos($tag));
2663 $out =~ s/\s+$//;
2664 my $typ = 1;
2665 if ($tagmt{$tt}) {
2666 $typ = 3;
2667 $out .= $opt{empty_element_suffix};
2668 } else {
2669 $out .= ">";
2670 $out .= "</$tt>" and $typ = 3 if $tag =~ m,/>$,;
2672 return ($out,$typ);
2673 } elsif ($tag =~ /^<([^\s<\/>]+)/s) {
2674 my $tt = lc($1);
2675 return ("&lt;" . substr($tag,1), 0) if $taga1p{$tt};
2676 if ($tagmt{$tt}) {
2677 return ("<" . $tt . $opt{empty_element_suffix}, 3);
2678 } elsif ($tag =~ m,/>$,) {
2679 return ("<" . $tt . "></" . $tt . ">", 3);
2680 } else {
2681 return ("<" . $tt . ">", 1);
2684 return (lc($tag),0);
2688 sub _SanitizeAtt {
2689 my $att = lc($_[0]);
2690 return "" unless $att =~ /^[_a-z:][_a-z:0-9.-]*$/; # no weirdo char att names
2691 return "" unless $univatt{$att} || $_[2]->{$att};
2692 return "" if $_[3]->{$att}; # no repeats
2693 $_[3]->{$att} = 1;
2694 $impatt{$att} and return $att."=".'"'.$att.'"';
2695 if ($lcattval{$att}) {
2696 return $att."=".lc($_[1])." ";
2697 } else {
2698 return $att."=".$_[1]." ";
2703 sub _ProcessURLTag {
2704 my ($att, $tag, $dofrag) = @_;
2706 $att = lc($att) . "=";
2707 if ($tag =~ /^(<[^\s>]+\s+)/g) {
2708 my $out = $1;
2709 while ($tag =~ /\G([^\s\042\047<\/>=]+=)([\042\047])((?:(?!\2)(?!<).)*)(\2\s*)/gcs) {
2710 my ($p, $q, $v, $s) = ($1, $2, $3, $4);
2711 if (lc($p) eq $att && $v ne "") {
2712 if ($dofrag && $v =~ m"^#") {
2713 $v = _FindFragmentMatch($v);
2714 my $bp;
2715 if (($bp = $opt{base_prefix}) ne "") {
2716 $v = "\2\3" . $bp . $v;
2718 } else {
2719 $v = _PrefixURL($v);
2721 $v = _EncodeAttText($v);
2723 $out .= $p . $q . $v . $s;
2725 $out .= substr($tag, pos($tag));
2726 substr($out,0,1) = $g_escape_table{'<'};
2727 substr($out,-1,1) = $g_escape_table{'>'};
2728 return $out;
2731 return $tag;
2735 sub _HTMLEncode {
2736 my $text = shift;
2738 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
2739 # http://bumppo.net/projects/amputator/
2740 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
2742 # Remaining entities now
2743 $text =~ s/\042/&quot;/g;
2744 $text =~ s/\047/&#39;/g; # Some older browsers do not grok &apos;
2745 $text =~ s/</&lt;/g;
2746 $text =~ s/>/&gt;/g;
2748 return $text;
2752 sub _EncodeAmps {
2753 my $text = shift;
2755 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
2756 # http://bumppo.net/projects/amputator/
2757 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
2759 return $text;
2763 sub _EncodeAmpsAndAngles {
2764 # Smart processing for ampersands and angle brackets that need to be encoded.
2766 my $text = shift;
2768 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
2769 # http://bumppo.net/projects/amputator/
2770 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
2772 # Encode naked <'s
2773 $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
2774 $text =~ s{<(?=[^>]*$)}{&lt;}g;
2776 # Encode <'s that cannot possibly be a start or end tag
2777 $text =~ s{(<[^>]*>)}{_DoTag($1)}ige;
2779 return $text;
2783 sub _EncodeBackslashEscapes {
2785 # Parameter: String.
2786 # Returns: String after processing the following backslash escape sequences.
2788 local $_ = shift;
2790 s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
2791 s{\\([`*_~{}\[\]()>#+\-.!`])}{$g_escape_table{$1}}g;
2793 return $_;
2797 sub _DoAutoLinks {
2798 local $_ = shift;
2800 s{<((https?|ftps?):[^'\042>\s]+)>(?!\s*</a>)}{_MakeATag($1, "&lt;".$1."&gt;")}gise;
2802 # Email addresses: <address@domain.foo>
2805 (?:mailto:)?
2807 [-.\w]+
2809 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
2813 _EncodeEmailAddress(_UnescapeSpecialChars($1), "&#x3c;", "&#62;");
2814 }egix;
2816 # (kjm) I don't do "x" patterns
2817 s{(?:^|(?<=\s))((?:https?|ftps?)://(?:[-a-zA-Z0-9./?\&\%=_~!*;:\@+\$,\x23](?:(?<![.,:;])|(?=[^\s])))+)}
2818 {_MakeATag($1, $1)}soge;
2819 s{(?<![][])(?<!\] )\[RFC( ?)([0-9]{1,5})\](?![][])(?! \[)}
2820 {"["._MakeATag("https://tools.ietf.org/html/rfc$2", "RFC$1$2", "RFC $2")."]"}soge;
2822 return $_;
2826 sub _EncodeEmailAddress {
2828 # Input: an email address, e.g. "foo@example.com"
2830 # Output: the email address as a mailto link, with each character
2831 # of the address encoded as either a decimal or hex entity, in
2832 # the hopes of foiling most address harvesting spam bots. E.g.:
2834 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
2835 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
2836 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
2838 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
2839 # mailing list: <https://tinyurl.com/yu7ue>
2842 my ($addr, $prefix, $suffix) = @_;
2843 $prefix = "" unless defined($prefix);
2844 $suffix = "" unless defined($suffix);
2846 srand(unpack('N',md5($addr)));
2847 my @encode = (
2848 sub { '&#' . ord(shift) . ';' },
2849 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
2850 sub { shift },
2853 $addr = "mailto:" . $addr;
2855 $addr =~ s{(.)}{
2856 my $char = $1;
2857 if ( $char eq '@' ) {
2858 # this *must* be encoded. I insist.
2859 $char = $encode[int rand 1]->($char);
2860 } elsif ( $char ne ':' ) {
2861 # leave ':' alone (to spot mailto: later)
2862 my $r = rand;
2863 # roughly 10% raw, 45% hex, 45% dec
2864 $char = (
2865 $r > .9 ? $encode[2]->($char) :
2866 $r < .45 ? $encode[1]->($char) :
2867 $encode[0]->($char)
2870 $char;
2871 }gex;
2873 # strip the mailto: from the visible part
2874 (my $bareaddr = $addr) =~ s/^.+?://;
2875 $addr = _MakeATag("$addr", $prefix.$bareaddr.$suffix);
2877 return $addr;
2881 sub _UnescapeSpecialChars {
2883 # Swap back in all the special characters we've hidden.
2885 my $text = shift;
2887 while( my($char, $hash) = each(%g_escape_table) ) {
2888 $text =~ s/$hash/$char/g;
2890 return $text;
2894 sub _TokenizeHTML {
2896 # Parameter: String containing HTML markup.
2897 # Returns: Reference to an array of the tokens comprising the input
2898 # string. Each token is either a tag (possibly with nested,
2899 # tags contained therein, such as <a href="<MTFoo>">, or a
2900 # run of text between tags. Each element of the array is a
2901 # two-element array; the first is either 'tag' or 'text';
2902 # the second is the actual value.
2905 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
2906 # <https://web.archive.org/web/20041215155739/http://bradchoate.com/weblog/2002/07/27/mtregex>
2909 my $str = shift;
2910 my $pos = 0;
2911 my $len = length $str;
2912 my @tokens;
2914 my $depth = 6;
2915 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
2916 my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
2917 (?s: <\? .*? \?> ) | # processing instruction
2918 $nested_tags/iox; # nested tags
2920 while ($str =~ m/($match)/g) {
2921 my $whole_tag = $1;
2922 my $sec_start = pos $str;
2923 my $tag_start = $sec_start - length $whole_tag;
2924 if ($pos < $tag_start) {
2925 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
2927 push @tokens, ['tag', $whole_tag];
2928 $pos = pos $str;
2930 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
2931 \@tokens;
2935 sub _Outdent {
2937 # Remove one level of line-leading indent_width of spaces
2939 my $text = shift;
2941 $text =~ s/^ {1,$opt{indent_width}}//gm;
2942 return $text;
2946 # _DeTab
2948 # $1 => input text
2949 # $2 => optional tab width (default is $opt{tab_width})
2950 # $3 => leading spaces to strip off each line first (default is 0 aka none)
2951 # <= result with tabs expanded
2952 sub _DeTab {
2953 my $text = shift;
2954 my $ts = shift || $opt{tab_width};
2955 my $leadsp = shift || 0;
2956 my $spr = qr/^ {1,$leadsp}/ if $leadsp;
2957 pos($text) = 0;
2958 my $end = length($text);
2959 my $ans = "";
2960 while (pos($text) < $end) {
2961 my $line;
2962 if ($text =~ /\G(.*?\n)/gcs) {
2963 $line = $1;
2964 } else {
2965 $line = substr($text, pos($text));
2966 pos($text) = $end;
2968 $line =~ s/$spr// if $leadsp;
2969 # From the Perl camel book section "Fluent Perl" but modified a bit
2970 $line =~ s/(.*?)(\t+)/$1 . ' ' x (length($2) * $ts - length($1) % $ts)/ges;
2971 $ans .= $line;
2973 return $ans;
2977 sub _PrefixURL {
2979 # Add URL prefix if needed
2981 my $url = shift;
2982 $url =~ s/^\s+//;
2983 $url =~ s/\s+$//;
2984 $url = "#" unless $url ne "";
2986 return $url unless $opt{abs_prefix} ne '' || $opt{url_prefix} ne '' || $opt{img_prefix} ne '';
2987 return $url if $url =~ m"^\002\003" || $url =~ m"^#" || $url =~ m,^//,;
2988 $url = $opt{abs_prefix} . $url if $url =~ m,^/, && $opt{abs_prefix} ne '';
2989 return $url if $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/ || $url =~ m,^//,;
2990 my $ans = $opt{url_prefix};
2991 $ans = $opt{img_prefix}
2992 if $opt{img_prefix} ne '' && $url =~ m"^[^#?]*\.(?:png|gif|jpe?g|svgz?)(?:[#?]|$)"i;
2993 return $url unless $ans ne '';
2994 $ans .= '/' if substr($ans, -1, 1) ne '/';
2995 $ans .= substr($url, 0, 1) eq '/' ? substr($url, 1) : $url;
2996 return "\2\3".$ans;
3000 BEGIN {
3001 $g_style_sheet = <<'STYLESHEET';
3003 <style type="text/css">
3004 /* <![CDATA[ */
3006 /* Markdown.pl fancy style sheet
3007 ** Copyright (C) 2017,2018,2019 Kyle J. McKay.
3008 ** All rights reserved.
3010 ** Redistribution and use in source and binary forms, with or without
3011 ** modification, are permitted provided that the following conditions are met:
3013 ** 1. Redistributions of source code must retain the above copyright notice,
3014 ** this list of conditions and the following disclaimer.
3016 ** 2. Redistributions in binary form must reproduce the above copyright
3017 ** notice, this list of conditions and the following disclaimer in the
3018 ** documentation and/or other materials provided with the distribution.
3020 ** 3. Neither the name of the copyright holder nor the names of its
3021 ** contributors may be used to endorse or promote products derived from
3022 ** this software without specific prior written permission.
3024 ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
3025 ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
3026 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
3027 ** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
3028 ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
3029 ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
3030 ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
3031 ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
3032 ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
3033 ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
3034 ** POSSIBILITY OF SUCH DAMAGE.
3037 div.%(base)code-bt > pre, div.%(base)code > pre {
3038 margin: 0;
3039 padding: 0;
3040 overflow: auto;
3043 div.%(base)code-bt > pre > code, div.%(base)code > pre > code {
3044 display: inline-block;
3045 margin: 0;
3046 padding: 0.5em 0;
3047 border-top: thin dotted;
3048 border-bottom: thin dotted;
3051 table.%(base)table {
3052 margin-bottom: 0.5em;
3054 table.%(base)table, table.%(base)table th, table.%(base)table td {
3055 border-collapse: collapse;
3056 border-spacing: 0;
3057 border: thin solid;
3060 ol.%(base)ol {
3061 counter-reset: %(base)item;
3063 ol.%(base)ol[start="0"] {
3064 counter-reset: %(base)item -1;
3066 ol.%(base)ol[start="5"] {
3067 counter-reset: %(base)item 4;
3069 ol.%(base)ol[start="10"] {
3070 counter-reset: %(base)item 9;
3072 ol.%(base)ol > span.%(base)ol-incr {
3073 counter-increment: %(base)item;
3075 ol.%(base)ol > span.%(base)ol-incr-2 {
3076 counter-increment: %(base)item 2;
3078 ol.%(base)ol > span.%(base)ol-incr-5 {
3079 counter-increment: %(base)item 5;
3081 ol.%(base)ol > span.%(base)ol-incr-10 {
3082 counter-increment: %(base)item 10;
3084 ol.%(base)lc-greek, li.%(base)lc-greek {
3085 list-style-type: lower-greek;
3087 ol.%(base)ol > li {
3088 counter-increment: %(base)item;
3090 ol.%(base)ol > li.%(base)li,
3091 ol.%(base)ol > li.%(base)li-lc,
3092 ol.%(base)ol > li.%(base)li-lc-greek,
3093 ol.%(base)ol > li.%(base)li-uc {
3094 list-style-type: none;
3095 display: block;
3097 ol.%(base)ol > li.%(base)li:before,
3098 ol.%(base)ol > li.%(base)li-lc:before,
3099 ol.%(base)ol > li.%(base)li-lc-greek:before,
3100 ol.%(base)ol > li.%(base)li-uc:before {
3101 position: absolute;
3102 text-align: right;
3103 white-space: nowrap;
3104 margin-left: -9ex;
3105 width: 9ex;
3107 ol.%(base)ol > li.%(base)li[type="1"]:before {
3108 content: counter(%(base)item, decimal) ")\A0 \A0 ";
3110 ol.%(base)ol > li.%(base)li-lc[type="i"]:before,
3111 ol.%(base)ol > li.%(base)li-lc[type="I"]:before {
3112 content: counter(%(base)item, lower-roman) ")\A0 \A0 ";
3114 ol.%(base)ol > li.%(base)li-uc[type="I"]:before,
3115 ol.%(base)ol > li.%(base)li-uc[type="i"]:before {
3116 content: counter(%(base)item, upper-roman) ")\A0 \A0 ";
3118 ol.%(base)ol > li.%(base)li-lc[type="a"]:before,
3119 ol.%(base)ol > li.%(base)li-lc[type="A"]:before {
3120 content: counter(%(base)item, lower-alpha) ")\A0 \A0 ";
3122 ol.%(base)ol > li.%(base)li-lc-greek[type="a"]:before,
3123 ol.%(base)ol > li.%(base)li-lc-greek[type="A"]:before {
3124 content: counter(%(base)item, lower-greek) ")\A0 \A0 ";
3126 ol.%(base)ol > li.%(base)li-uc[type="A"]:before,
3127 ol.%(base)ol > li.%(base)li-uc[type="a"]:before {
3128 content: counter(%(base)item, upper-alpha) ")\A0 \A0 ";
3131 li.%(base)checkbox-on,
3132 li.%(base)checkbox-off {
3133 list-style-type: none;
3134 display: block;
3136 li.%(base)checkbox-on > span:first-child + span + span,
3137 li.%(base)checkbox-off > span:first-child + span + span {
3138 position: absolute;
3139 clip: rect(0,0,0,0);
3141 li.%(base)checkbox-on > span:first-child,
3142 li.%(base)checkbox-off > span:first-child,
3143 li.%(base)checkbox-on > span:first-child + span,
3144 li.%(base)checkbox-off > span:first-child + span {
3145 display: block;
3146 position: absolute;
3147 margin-left: -3ex;
3148 width: 1em;
3149 height: 1em;
3151 li.%(base)checkbox-on > span:first-child > span:first-child,
3152 li.%(base)checkbox-off > span:first-child > span:first-child {
3153 display: block;
3154 position: absolute;
3155 left: 0.75pt; top: 0.75pt; right: 0.75pt; bottom: 0.75pt;
3157 li.%(base)checkbox-on > span:first-child > span:first-child:before,
3158 li.%(base)checkbox-off > span:first-child > span:first-child:before {
3159 display: inline-block;
3160 position: relative;
3161 right: 1pt;
3162 width: 100%;
3163 height: 100%;
3164 border: 1pt solid;
3165 content: "";
3167 li.%(base)checkbox-on > span:first-child + span:before {
3168 position: relative;
3169 left: 2pt;
3170 bottom: 1pt;
3171 font-size: 125%;
3172 line-height: 80%;
3173 vertical-align: text-top;
3174 content: "\2713";
3177 /* ]]> */
3178 </style>
3180 STYLESHEET
3181 $g_style_sheet =~ s/^\s+//g;
3182 $g_style_sheet =~ s/\s+$//g;
3183 $g_style_sheet .= "\n";
3188 __DATA__
3190 =head1 NAME
3192 Markdown.pl - convert Markdown format text files to HTML
3194 =head1 SYNOPSIS
3196 B<Markdown.pl> [B<--help>] [B<--html4tags>] [B<--htmlroot>=I<prefix>]
3197 [B<--imageroot>=I<prefix>] [B<--version>] [B<--shortversion>]
3198 [B<--tabwidth>=I<num>] [B<--stylesheet>] [B<--stub>] [--]
3199 [I<file>...]
3201 Options:
3202 -h show short usage help
3203 --help show long detailed help
3204 --html4tags use <br> instead of <br />
3205 --deprecated allow <dir> and <menu> tags
3206 --sanitize sanitize tag attributes
3207 --no-sanitize do not sanitize tag attributes
3208 --validate-xml check if output is valid XML
3209 --validate-xml-internal fast basic check if output is valid XML
3210 --no-validate-xml do not check output for valid XML
3211 --tabwidth=num expand tabs to num instead of 8
3212 -a prefix | --absroot=prefix append abspath URLs to prefix
3213 -b prefix | --base=prefix prepend prefix to fragment-only URLs
3214 -r prefix | --htmlroot=prefix append relative non-img URLs to prefix
3215 -i prefix | --imageroot=prefix append relative img URLs to prefix
3216 -w [wikipat] | --wiki[=wikipat] activate wiki links using wikipat
3217 -V | --version show version, authors, license
3218 and copyright
3219 -s | --shortversion show just the version number
3220 --raw input contains only raw html
3221 --stylesheet output the fancy style sheet
3222 --no-stylesheet do not output fancy style sheet
3223 --stub wrap output in stub document
3224 implies --stylesheet
3225 -- end options and treat next
3226 argument as file
3228 =head1 DESCRIPTION
3230 Markdown is a text-to-HTML filter; it translates an easy-to-read /
3231 easy-to-write structured text format into HTML. Markdown's text format
3232 is most similar to that of plain text email, and supports features such
3233 as headers, *emphasis*, code blocks, blockquotes, and links.
3235 Markdown's syntax is designed not as a generic markup language, but
3236 specifically to serve as a front-end to (X)HTML. You can use span-level
3237 HTML tags anywhere in a Markdown document, and you can use block level
3238 HTML tags (like <div> and <table> as well).
3240 For more information about Markdown's syntax, see the F<basics.md>
3241 and F<syntax.md> files included with F<Markdown.pl>.
3243 Input (auto-detected) may be either ISO-8859-1 or UTF-8. Output is always
3244 converted to the UTF-8 character set.
3247 =head1 OPTIONS
3249 Use "--" to end switch parsing. For example, to open a file named "-z", use:
3251 Markdown.pl -- -z
3253 =over
3256 =item B<--html4tags>
3258 Use HTML 4 style for empty element tags, e.g.:
3260 <br>
3262 instead of Markdown's default XHTML style tags, e.g.:
3264 <br />
3266 This option is I<NOT compatible> with the B<--validate-xml> option
3267 and will produce an immediate error if both are given.
3270 =item B<--deprecated>
3272 Both "<dir>" and "<menu>" are normally taken as literal text and the leading
3273 "<" will be automatically escaped.
3275 If this option is used, they are recognized as valid tags and passed through
3276 without being escaped.
3278 When dealing with program argument descriptions "<dir>" can be particularly
3279 problematic therefore use of this option is not recommended.
3281 Other deprecated tags (such as "<font>" and "<center>" for example) continue
3282 to be recognized and passed through even without using this option.
3285 =item B<--sanitize>
3287 Removes troublesome tag attributes from embedded tags. Only a very strictly
3288 limited set of tag attributes will be permitted, other attributes will be
3289 silently discarded. The set of allowed attributes varies by tag.
3291 Splits empty minimized elements that are not one of the HTML allowed empty
3292 elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate begin
3293 and end tags. For example, C<< <p/> >> or C<< <p /> >> will be split into
3294 C<< <p></p> >>.
3296 Combines adjacent (whitespace separated only) opening and closing tags for
3297 the same HTML empty element into a single minimized tag. For example,
3298 C<< <br></br> >> will become C<< <br /> >>.
3300 This is enabled by default.
3303 =item B<--no-sanitize>
3305 Do not sanitize tag attributes. This option does not allow any tags that would
3306 not be allowed without this option, but it does completely suppress the
3307 attribute sanitation process. If this option is specified, no attributes will
3308 be removed from any tag (although C<img> and C<a> tags will still be affected
3309 by B<--imageroot>, B<--htmlroot>, B<--absroot> and/or B<--base> options).
3310 Use of this option is I<NOT RECOMMENDED>.
3313 =item B<--validate-xml>
3315 Perform XML validation on the output before it's output and die if
3316 it fails validation. This requires the C<XML::Simple> or C<XML::Parser>
3317 module be present (one is only required if this option is given).
3319 Any errors are reported to STDERR and the exit status will be
3320 non-zero on XML validation failure. Note that all line and column
3321 numbers in the error output refer to the entire output that would
3322 have been produced. Re-run with B<--no-validate-xml> to see what's
3323 actually present at those line and column positions.
3325 If the B<--stub> option has also been given, then the entire output is
3326 validated as-is. Without the B<--stub> option, the output will be wrapped
3327 in C<< <div>...</div> >> for validation purposes but that extra "div" added
3328 for validation will not be added to the final output.
3330 This option is I<NOT enabled by default>.
3332 This option is I<NOT compatible> with the B<--html4tags> option and will
3333 produce an immediate error if both are given.
3336 =item B<--validate-xml-internal>
3338 Perform XML validation on the output before it's output and die if
3339 it fails validation. This uses a simple internal consistency checker
3340 that finds unmatched and mismatched open/close tags.
3342 Non-empty elements that in HTML have optional closing tags (C<colgroup>
3343 C<dd> C<dt> C<li> C<p> C<tbody> C<td> C<tfoot> C<th> C<thead> C<tr>)
3344 will automatically have any omitted end tags inserted during the
3345 `--validate-xml-internal` process.
3347 Any errors are reported to STDERR and the exit status will be
3348 non-zero on XML validation failure. Note that all line and column
3349 numbers in the error output refer to the entire output that would
3350 have been produced before sanitization without any B<--stub> or
3351 B<--stylesheet> options. Re-run with B<--no-sanitize> and
3352 B<--no-validate-xml> and I<without> any B<--stub> or B<--stylesheet>
3353 options to see what's actually present at those line and column
3354 positions.
3356 This option validates the output I<prior to> adding any requested
3357 B<--stub> or B<--stylesheet>. As the built-in stub and stylesheet
3358 have already been validated that speeds things up. The output is
3359 I<NOT> wrapped (in a C<< <div>...</div> >>) for validation as that's
3360 not required for the internal checker.
3362 This option is I<IS enabled by default> unless B<--no-sanitize> is
3363 active.
3365 This option is I<IS compatible> with the B<--html4tags> option.
3367 This option requires the B<--sanitize> option and will produce an
3368 immediate error if both B<--no-sanitize> and B<--validate-xml-internal>
3369 are given.
3371 Note that B<--validate-xml-internal> is I<MUCH faster> than
3372 B<--validate-xml> and I<does NOT> require any extra XML modules to
3373 be present.
3376 =item B<--no-validate-xml>
3378 Do not perform XML validation on the output. Markdown.pl itself will
3379 normally generate valid XML sequences (unless B<--html4tags> has been
3380 used). However, any raw tags in the input (that are on the "approved"
3381 list), could potentially result in invalid XML output (i.e. mismatched
3382 start and end tags, missing start or end tag etc.).
3384 Markdown.pl will I<NOT check> for these issues itself. But with
3385 the B<--validate-xml> option will use C<XML::Simple> or C<XML::Parser>
3386 to do so.
3388 Note that B<--validate-xml-internal> is the default option unless
3389 B<--no-sanitize> is used in which case B<--no-validate-xml> is the
3390 default option.
3393 =item B<--tabwidth>=I<num>
3395 Expand tabs to I<num> character wide tab stop positions instead of the default
3396 8. Don't use this; physical tabs should always be expanded to 8-character
3397 positions. This option does I<not> affect the number of spaces needed to
3398 start a new "indent level". That will always be 4 no matter what value is
3399 used (or implied by default) with this option. Also note that tabs inside
3400 backticks-delimited code blocks will always be expanded to 8-character tab
3401 stop positions no matter what value is used for this option.
3403 The value must be S<2 <= I<num> <= 32>.
3406 =item B<-a> I<prefix>, B<--absroot>=I<prefix>
3408 Any absolute path URLs (i.e. URLs without a scheme starting with "/" but not
3409 "//") have I<prefix> prepended which prevents them from being acted upon by the
3410 B<--htmlroot> and/or B<--imageroot> options provided the result is a full
3411 absolute URL. The default is to prepend nothing and leave them as absolute
3412 path URLs which will allow them to be processed by any B<--htmlroot> and/or
3413 B<--imageroot> options.
3415 This option can be helpful when documents are being formatted for display on a
3416 different system and the absolute path URLs need to be "fixed up".
3419 =item B<-b> I<prefix>, B<--base>=I<prefix>
3421 Any fragment-only URLs have I<prefix> prepended. The default is to prepend
3422 nothing and leave them as bare fragment URLs. Use of this option may be
3423 necessary when embedding the output of Markdown.pl into a document that makes
3424 use of the C<< <base> >> tag in order for intra-document fragment URL links to
3425 work properly in such a document.
3428 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
3430 Any non-absolute URLs have I<prefix> prepended.
3433 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
3435 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
3436 if any) but only if they end in an image suffix.
3439 =item B<-w> [I<wikipat>], B<--wiki>[=I<wikipat>]
3441 Activate wiki links. Any link enclosed in double brackets (e.g. "[[link]]") is
3442 considered a wiki link. By default only absolute URL and fragment links are
3443 allowed in the "wiki link style" format. Any other double-bracketed strings
3444 are left unmolested.
3446 If this option is given, all other wiki links are enabled as well. Any
3447 non-absolute URL or fragment links will be transformed into a link using
3448 I<wikipat> where the default I<wikipat> if none is given is C<%{s(:md)}.html>.
3450 If the given I<wikipat> does not contain a C<%{...}> placeholder sequence
3451 then it will automatically have C<%{s(:md)}.html> suffixed to it.
3453 The C<...> part of the C<%{...}> sequence specifies zero or more case-insensitive
3454 single-letter options with the following effects:
3456 =over
3458 =item B<d>
3460 Convert spaces to dashes (ASCII 0x2D) instead of underscore (ASCII 0x5F). Note
3461 that if this option is given then runs of multiple dashes will be converted to
3462 a single dash I<instead> but runs of multiple underscores will be left untouched.
3464 =item B<f>
3466 Flatten the resulting name by replacing forward slashes (ASCII 0x2F) as well.
3467 They will be converted to underscores unless the C<d> option is given (in which
3468 case they will be converted to dashes). This conversion takes place before
3469 applying the runs-of-multiple reduction.
3471 =item B<l>
3473 Convert link target (excluding any query string and/or fragment) to lowercase.
3474 Takes precedence over any C<u> option, but specifically excludes C<%>-escapes
3475 which are always UPPERCASE hexadecimal.
3477 =item B<r>
3479 Leave raw UTF-8 characters in the result. Normally anything not allowed
3480 directly in a URL ends up URL-encoded. With this option, raw valid UTF-8
3481 sequences will be left untouched. Use with care.
3483 =item B<s> or B<s(>I<< <ext> >>[B<,>I<< <ext> >>]...B<)>
3485 After (temporarily) removing any query string and/or fragment, strip any final
3486 "dot" suffix so long as it occurs after the last slash (if any slash was
3487 present before applying the C<f> option). The "dot" (ASCII 0x2E) and all
3488 following characters (if any) are removed. If the optional C<< (<ext>,...) >>
3489 part is present then only strip the extension if it consists of a "dot"
3490 followed by one of the case-insensitive I<< <ext> >> values. As a special
3491 case, using the value C<:md> for one of the I<< <ext> >> values causes that
3492 value to be expanded to all known markdown extensions.
3494 =item B<u>
3496 Convert link target (excluding any query string and/or fragment) to UPPERCASE.
3498 =item B<v>
3500 Leave runs-of-multiple characters alone (aka "verbatim"). Does not affect
3501 any of the other options except by eliminating the runs-of-multple reduction
3502 step. Also does I<not> inhibit the initial whitespace trimming.
3504 =back
3506 The URL target of the wiki link is created by first trimming whitespace
3507 (starting and ending whitespace is removed and all other runs of consecutive
3508 whitespace are replaced with a single space) from the wiki link target,
3509 removing (temporarily) any query string and/or fragment, if no options are
3510 present, spaces are converted to underscores (C<_>) and runs of multiple
3511 consecutive underscores are replaced with a single underscore (ASCII 0x5F).
3512 Finally, the I<wikipat> string gets its first placeholder (the C<%{...}>
3513 sequence) replaced with this computed value and the original query string
3514 and/or fragment is re-appended (if any were originally present) and
3515 URL-encoding is applied as needed to produce the actual final target URL.
3517 See above option descriptions for possible available modifications.
3519 One of the commonly used hosting platforms does something substantially similar
3520 to using C<%{dfv}> as the placeholder.
3523 =item B<-V>, B<--version>
3525 Display Markdown's version number and copyright information.
3528 =item B<-s>, B<--shortversion>
3530 Display the short-form version number.
3533 =item B<--raw>
3535 Input contains only raw HTML/XHTML. All options other than
3536 B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
3537 B<--validate-xml> and B<--validate-xml-internal> (and their B<--no-...>
3538 variants) are ignored.
3540 With this option, arbitrary HTML/XHTML input can be passed through
3541 the sanitizer and/or validator. If sanitation is requested (the
3542 default), input must only contain the contents of the "<body>"
3543 section (i.e. no "<head>" or "<html>"). Output I<will> be converted
3544 to UTF-8 regardless of the input encoding. All line endings will
3545 be normalized to C<\n> and input encodings other than UTF-8 or
3546 ISO-8859-1 or US-ASCII will end up mangled.
3548 Remember that any B<--stub> and/or B<--stylesheet> options are
3549 I<completely ignored> when B<--raw> is given.
3552 =item B<--stylesheet>
3554 Include the fancy style sheet at the beginning of the output (or in the
3555 C<head> section with B<--stub>). This style sheet makes fancy checkboxes
3556 and makes a right parenthesis C<)> show instead of a C<.> for ordered lists
3557 that use them. Without it things will still look fine except that the
3558 fancy stuff won't be there.
3560 Use this option with no other arguments and redirect standard input to
3561 /dev/null to get just the style sheet and nothing else.
3564 =item B<--no-stylesheet>
3566 Overrides a previous B<--stylesheet> and disables implicit inclusion
3567 of the style sheet by the B<--stub> option.
3570 =item B<--stub>
3572 Wrap the output in a full document stub (i.e. has C<html>, C<head> and C<body>
3573 tags). The style sheet I<will> be included in the C<head> section unless the
3574 B<--no-stylesheet> option is also used.
3577 =item B<-h>, B<--help>
3579 Display Markdown's help. With B<--help> full help is shown, with B<-h> only
3580 the usage and options are shown.
3583 =back
3586 =head1 VERSION HISTORY
3588 Z<> See the F<README> file for detailed release notes for this version.
3590 =over
3592 =item Z<> 1.1.9 - 15 Dec 2019
3594 =item Z<> 1.1.8 - 22 Nov 2019
3596 =item Z<> 1.1.7 - 14 Feb 2018
3598 =item Z<> 1.1.6 - 03 Jan 2018
3600 =item Z<> 1.1.5 - 07 Dec 2017
3602 =item Z<> 1.1.4 - 24 Jun 2017
3604 =item Z<> 1.1.3 - 13 Feb 2017
3606 =item Z<> 1.1.2 - 19 Jan 2017
3608 =item Z<> 1.1.1 - 12 Jan 2017
3610 =item Z<> 1.1.0 - 11 Jan 2017
3612 =item Z<> 1.0.4 - 05 Jun 2016
3614 =item Z<> 1.0.3 - 06 Sep 2015
3616 =item Z<> 1.0.2 - 03 Sep 2015
3618 =item Z<> 1.0.1 - 14 Dec 2004
3620 =item Z<> 1.0.0 - 28 Aug 2004
3622 =back
3624 =head1 AUTHORS
3626 =over
3628 =item John Gruber
3630 =item L<https://daringfireball.net>
3632 =item L<https://daringfireball.net/projects/markdown/>
3634 =item E<160>
3636 =back
3638 =over
3640 =item PHP port and other contributions by Michel Fortin
3642 =item L<https://michelf.ca>
3644 =item E<160>
3646 =back
3648 =over
3650 =item Additional enhancements and tweaks by Kyle J. McKay
3652 =item mackyle<at>gmail.com
3654 =back
3656 =head1 COPYRIGHT AND LICENSE
3658 =over
3660 =item Copyright (C) 2003-2004 John Gruber
3662 =item Copyright (C) 2015-2019 Kyle J. McKay
3664 =item All rights reserved.
3666 =back
3668 Redistribution and use in source and binary forms, with or without
3669 modification, are permitted provided that the following conditions are
3670 met:
3672 =over
3674 =item *
3676 Redistributions of source code must retain the above copyright
3677 notice, this list of conditions and the following disclaimer.
3679 =item *
3681 Redistributions in binary form must reproduce the above copyright
3682 notice, this list of conditions and the following disclaimer in the
3683 documentation and/or other materials provided with the distribution.
3685 =item *
3687 Neither the name "Markdown" nor the names of its contributors may
3688 be used to endorse or promote products derived from this software
3689 without specific prior written permission.
3691 =back
3693 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
3694 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
3695 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
3696 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
3697 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
3698 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
3699 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
3700 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
3701 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
3702 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
3703 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3705 =cut