Markdown version 1.1.15
[markdown.git] / Markdown.pl
blob72ab205f6a8fe2138733466d66d6c4c403de02a0
1 #!/usr/bin/env perl
4 # Markdown -- A text-to-HTML conversion tool for web writers
6 # Copyright (C) 2004 John Gruber
7 # Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
8 # All rights reserved.
9 # License is Modified BSD (aka 3-clause BSD) License\n";
10 # See LICENSE file (or <https://opensource.org/licenses/BSD-3-Clause>)
13 package Markdown;
15 use 5.008;
16 use strict;
17 use warnings;
19 use Encode;
21 use vars qw($COPYRIGHT $DATE $VERSION @ISA @EXPORT_OK);
23 BEGIN {*COPYRIGHT =
24 \"Copyright (C) 2004 John Gruber
25 Copyright (C) 2015,2016,2017,2018,2019,2020,2021 Kyle J. McKay
26 All rights reserved.
28 *VERSION = \"1.1.15";
29 *DATE = \"2021-08-15";
32 use Exporter ();
33 use Digest::MD5 qw(md5 md5_hex);
34 use File::Basename qw(basename);
35 use Scalar::Util qw(refaddr looks_like_number);
36 my ($hasxml, $hasxml_err); BEGIN { ($hasxml, $hasxml_err) = (0, "") }
37 my ($hasxmlp, $hasxmlp_err); BEGIN { ($hasxmlp, $hasxmlp_err) = (0, "") }
38 BEGIN {
39 @ISA = qw(Exporter);
40 @EXPORT_OK = qw(Markdown ProcessRaw GenerateStyleSheet SetWikiOpts SplitURL
41 escapeXML unescapeXML ResolveFragment ConvertNamedCharacterEntities);
42 $INC{__PACKAGE__.'.pm'} = $INC{basename(__FILE__)} unless exists $INC{__PACKAGE__.'.pm'};
45 close(DATA) if fileno(DATA);
46 exit(&_main(@ARGV)||0) unless caller;
48 sub fauxdie($) {
49 my $msg = join(" ", @_);
50 $msg =~ s/\s+$//os;
51 printf STDERR "%s: fatal: %s\n", basename($0), $msg;
52 exit 1;
55 my $encoder;
56 BEGIN {
57 $encoder = Encode::find_encoding('Windows-1252') ||
58 Encode::find_encoding('ISO-8859-1') or
59 die "failed to load ISO-8859-1 encoder\n";
63 # Global default settings:
65 my ($g_style_prefix, $g_empty_element_suffix, $g_indent_width,
66 $g_start_p, $g_close_p);
67 BEGIN {
68 $g_style_prefix = "_markdown-"; # Prefix for markdown css class styles
69 $g_empty_element_suffix = " />"; # Change to ">" for HTML output
70 $g_indent_width = 4; # Number of spaces considered new level
71 $g_start_p = "<p>"; # _FormParagraphs open paragraph tag
72 $g_close_p = "</p>"; # _FormParagraphs close paragraph tag
77 # Globals:
80 # Style sheet template
81 my $g_style_sheet;
83 # Permanent block id table
84 my %g_perm_block_ids;
86 # Global hashes, used by various utility routines
87 my %g_urls;
88 my %g_titles;
89 my %g_anchors;
90 my %g_anchors_id;
91 my %g_block_ids;
92 my %g_code_block_ids;
93 my %g_html_blocks;
94 my %g_code_blocks;
95 my @g_xml_comments;
96 my %opt;
97 my @autonum;
99 # Return a "block id" to use to identify the block that does not contain
100 # any characters that could be misinterpreted by the rest of the code
101 # Originally this used md5_hex but that's unnecessarily slow
102 # Instead just use the refaddr of the scalar ref of the entry for that
103 # key in either the global or, if the optional second argument is true,
104 # permanent table. To avoid the result being confused with anything
105 # else, it's prefixed with a control character and suffixed with another
106 # both of which are not allowed by the XML standard or Unicode.
107 sub block_id {
108 $_[1] or return "\5".refaddr(\$g_block_ids{$_[0]})."\6";
109 $_[1] == 1 and return "\2".refaddr(\$g_perm_block_ids{$_[0]})."\3";
110 $_[1] == 2 and return "\25".refaddr(\$g_code_block_ids{$_[0]})."\26";
111 die "programmer error: bad block_id type $_[1]";
114 # Regex to match balanced [brackets]. See Friedl's
115 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
116 my $g_nested_brackets;
117 BEGIN {
118 $g_nested_brackets = qr{
119 (?> # Atomic matching
120 [^\[\]]+ # Anything other than brackets
123 (??{ $g_nested_brackets }) # Recursive set of nested brackets
129 # Regex to match balanced (parentheses)
130 my $g_nested_parens;
131 BEGIN {
132 $g_nested_parens = qr{
133 (?> # Atomic matching
134 [^\(\)]+ # Anything other than parentheses
137 (??{ $g_nested_parens }) # Recursive set of nested parentheses
143 # Table of hash values for escaped characters:
144 my %g_escape_table;
145 BEGIN {
146 $g_escape_table{""} = "\2\3";
147 foreach my $char (split //, "\\\`*_~{}[]()>#+-.!|:<") {
148 $g_escape_table{$char} = block_id($char,1);
152 # Used to track when we're inside an ordered or unordered list
153 # (see _ProcessListItems() for details):
154 my $g_list_level;
155 BEGIN {
156 $g_list_level = 0;
159 # Entity conversion table
160 my %named_character_entity;
161 BEGIN { %named_character_entity = (
162 'Aacute' => '193',
163 'aacute' => '225',
164 'Acirc' => '194',
165 'acirc' => '226',
166 'acute' => '180',
167 'AElig' => '198',
168 'aelig' => '230',
169 'Agrave' => '192',
170 'agrave' => '224',
171 'alefsym' => 'x2135',
172 'Alpha' => '913',
173 'alpha' => '945',
174 'and' => 'x2227',
175 'ang' => 'x2220',
176 'apos' => '39',
177 'Aring' => '197',
178 'aring' => '229',
179 'asymp' => 'x2248',
180 'Atilde' => '195',
181 'atilde' => '227',
182 'Auml' => '196',
183 'auml' => '228',
184 'bdquo' => 'x201e',
185 'Beta' => '914',
186 'beta' => '946',
187 'brvbar' => '166',
188 'bull' => 'x2022',
189 'cap' => 'x2229',
190 'Ccedil' => '199',
191 'ccedil' => '231',
192 'cedil' => '184',
193 'cent' => '162',
194 'Chi' => '935',
195 'chi' => '967',
196 'circ' => '710',
197 'clubs' => 'x2663',
198 'cong' => 'x2245',
199 'copy' => '169',
200 'crarr' => 'x21b5',
201 'cup' => 'x222a',
202 'curren' => '164',
203 'Dagger' => 'x2021',
204 'dagger' => 'x2020',
205 'dArr' => 'x21d3',
206 'darr' => 'x2193',
207 'deg' => '176',
208 'Delta' => '916',
209 'delta' => '948',
210 'diams' => 'x2666',
211 'divide' => '247',
212 'Eacute' => '201',
213 'eacute' => '233',
214 'Ecirc' => '202',
215 'ecirc' => '234',
216 'Egrave' => '200',
217 'egrave' => '232',
218 'empty' => 'x2205',
219 'emsp' => 'x2003',
220 'ensp' => 'x2002',
221 'Epsilon' => '917',
222 'epsilon' => '949',
223 'equiv' => 'x2261',
224 'Eta' => '919',
225 'eta' => '951',
226 'ETH' => '208',
227 'eth' => '240',
228 'Euml' => '203',
229 'euml' => '235',
230 'euro' => 'x20ac',
231 'exist' => 'x2203',
232 'fnof' => '402',
233 'forall' => 'x2200',
234 'frac12' => '189',
235 'frac14' => '188',
236 'frac34' => '190',
237 'frasl' => 'x2044',
238 'Gamma' => '915',
239 'gamma' => '947',
240 'ge' => 'x2265',
241 'hArr' => 'x21d4',
242 'harr' => 'x2194',
243 'hearts' => 'x2665',
244 'hellip' => 'x2026',
245 'Iacute' => '205',
246 'iacute' => '237',
247 'Icirc' => '206',
248 'icirc' => '238',
249 'iexcl' => '161',
250 'Igrave' => '204',
251 'igrave' => '236',
252 'image' => 'x2111',
253 'infin' => 'x221e',
254 'int' => 'x222b',
255 'Iota' => '921',
256 'iota' => '953',
257 'iquest' => '191',
258 'isin' => 'x2208',
259 'Iuml' => '207',
260 'iuml' => '239',
261 'Kappa' => '922',
262 'kappa' => '954',
263 'Lambda' => '923',
264 'lambda' => '955',
265 'lang' => 'x2329',
266 'laquo' => '171',
267 'lArr' => 'x21d0',
268 'larr' => 'x2190',
269 'lceil' => 'x2308',
270 'ldquo' => 'x201c',
271 'le' => 'x2264',
272 'lfloor' => 'x230a',
273 'lowast' => 'x2217',
274 'loz' => 'x25ca',
275 'lrm' => 'x200e',
276 'lsaquo' => 'x2039',
277 'lsquo' => 'x2018',
278 'macr' => '175',
279 'mdash' => 'x2014',
280 'micro' => '181',
281 'middot' => '183',
282 'minus' => 'x2212',
283 'Mu' => '924',
284 'mu' => '956',
285 'nabla' => 'x2207',
286 'nbsp' => '160',
287 'ndash' => 'x2013',
288 'ne' => 'x2260',
289 'ni' => 'x220b',
290 'not' => '172',
291 'notin' => 'x2209',
292 'nsub' => 'x2284',
293 'Ntilde' => '209',
294 'ntilde' => '241',
295 'Nu' => '925',
296 'nu' => '957',
297 'Oacute' => '211',
298 'oacute' => '243',
299 'Ocirc' => '212',
300 'ocirc' => '244',
301 'OElig' => '338',
302 'oelig' => '339',
303 'Ograve' => '210',
304 'ograve' => '242',
305 'oline' => 'x203e',
306 'Omega' => '937',
307 'omega' => '969',
308 'Omicron' => '927',
309 'omicron' => '959',
310 'oplus' => 'x2295',
311 'or' => 'x2228',
312 'ordf' => '170',
313 'ordm' => '186',
314 'Oslash' => '216',
315 'oslash' => '248',
316 'Otilde' => '213',
317 'otilde' => '245',
318 'otimes' => 'x2297',
319 'Ouml' => '214',
320 'ouml' => '246',
321 'para' => '182',
322 'part' => 'x2202',
323 'permil' => 'x2030',
324 'perp' => 'x22a5',
325 'Phi' => '934',
326 'phi' => '966',
327 'Pi' => '928',
328 'pi' => '960',
329 'piv' => '982',
330 'plusmn' => '177',
331 'pound' => '163',
332 'Prime' => 'x2033',
333 'prime' => 'x2032',
334 'prod' => 'x220f',
335 'prop' => 'x221d',
336 'Psi' => '936',
337 'psi' => '968',
338 'radic' => 'x221a',
339 'rang' => 'x232a',
340 'raquo' => '187',
341 'rArr' => 'x21d2',
342 'rarr' => 'x2192',
343 'rceil' => 'x2309',
344 'rdquo' => 'x201d',
345 'real' => 'x211c',
346 'reg' => '174',
347 'rfloor' => 'x230b',
348 'Rho' => '929',
349 'rho' => '961',
350 'rlm' => 'x200f',
351 'rsaquo' => 'x203a',
352 'rsquo' => 'x2019',
353 'sbquo' => 'x201a',
354 'Scaron' => '352',
355 'scaron' => '353',
356 'sdot' => 'x22c5',
357 'sect' => '167',
358 'shy' => '173',
359 'Sigma' => '931',
360 'sigma' => '963',
361 'sigmaf' => '962',
362 'sim' => 'x223c',
363 'spades' => 'x2660',
364 'sub' => 'x2282',
365 'sube' => 'x2286',
366 'sum' => 'x2211',
367 'sup' => 'x2283',
368 'sup1' => '185',
369 'sup2' => '178',
370 'sup3' => '179',
371 'supe' => 'x2287',
372 'szlig' => '223',
373 'Tau' => '932',
374 'tau' => '964',
375 'there4' => 'x2234',
376 'Theta' => '920',
377 'theta' => '952',
378 'thetasym' => '977',
379 'thinsp' => 'x2009',
380 'THORN' => '222',
381 'thorn' => '254',
382 'tilde' => '732',
383 'times' => '215',
384 'trade' => 'x2122',
385 'Uacute' => '218',
386 'uacute' => '250',
387 'uArr' => 'x21d1',
388 'uarr' => 'x2191',
389 'Ucirc' => '219',
390 'ucirc' => '251',
391 'Ugrave' => '217',
392 'ugrave' => '249',
393 'uml' => '168',
394 'upsih' => '978',
395 'Upsilon' => '933',
396 'upsilon' => '965',
397 'Uuml' => '220',
398 'uuml' => '252',
399 'weierp' => 'x2118',
400 'Xi' => '926',
401 'xi' => '958',
402 'Yacute' => '221',
403 'yacute' => '253',
404 'yen' => '165',
405 'Yuml' => '376',
406 'yuml' => '255',
407 'Zeta' => '918',
408 'zeta' => '950',
409 'zwj' => 'x200d',
410 'zwnj' => 'x200c'
414 #### Blosxom plug-in interface ##########################################
415 my $_haveBX;
416 BEGIN {
417 no warnings 'once';
418 $_haveBX = defined($blosxom::version);
421 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
422 # which posts Markdown should process, using a "meta-markup: markdown"
423 # header. If it's set to 0 (the default), Markdown will process all
424 # entries.
425 my $g_blosxom_use_meta;
426 BEGIN {
427 $g_blosxom_use_meta = 0;
430 sub start { 1; }
431 sub story {
432 my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
434 if ((! $g_blosxom_use_meta) or
435 (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
437 $$body_ref = Markdown($$body_ref);
443 #### Movable Type plug-in interface #####################################
444 my $_haveMT = eval {require MT; 1;}; # Test to see if we're running in MT
445 my $_haveMT3 = $_haveMT && eval {require MT::Plugin; 1;}; # and MT >= MT 3.0.
447 if ($_haveMT) {
448 require MT;
449 import MT;
450 require MT::Template::Context;
451 import MT::Template::Context;
453 if ($_haveMT3) {
454 require MT::Plugin;
455 import MT::Plugin;
456 my $plugin = new MT::Plugin({
457 name => "Markdown",
458 description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
459 doc_link => 'https://daringfireball.net/projects/markdown/'
461 MT->add_plugin( $plugin );
464 MT::Template::Context->add_container_tag(MarkdownOptions => sub {
465 my $ctx = shift;
466 my $args = shift;
467 my $builder = $ctx->stash('builder');
468 my $tokens = $ctx->stash('tokens');
470 if (defined ($args->{'output'}) ) {
471 $ctx->stash('markdown_output', lc $args->{'output'});
474 defined (my $str = $builder->build($ctx, $tokens) )
475 or return $ctx->error($builder->errstr);
476 $str; # return value
479 MT->add_text_filter('markdown' => {
480 label => 'Markdown',
481 docs => 'https://daringfireball.net/projects/markdown/',
482 on_format => sub {
483 my $text = shift;
484 my $ctx = shift;
485 my $raw = 0;
486 if (defined $ctx) {
487 my $output = $ctx->stash('markdown_output');
488 if (defined $output && $output =~ m/^html/i) {
489 $g_empty_element_suffix = ">";
490 $ctx->stash('markdown_output', '');
492 elsif (defined $output && $output eq 'raw') {
493 $raw = 1;
494 $ctx->stash('markdown_output', '');
496 else {
497 $raw = 0;
498 $g_empty_element_suffix = " />";
501 $text = $raw ? $text : Markdown($text);
502 $text;
506 # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
507 my $smartypants;
510 no warnings "once";
511 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
514 if ($smartypants) {
515 MT->add_text_filter('markdown_with_smartypants' => {
516 label => 'Markdown With SmartyPants',
517 docs => 'https://daringfireball.net/projects/markdown/',
518 on_format => sub {
519 my $text = shift;
520 my $ctx = shift;
521 if (defined $ctx) {
522 my $output = $ctx->stash('markdown_output');
523 if (defined $output && $output eq 'html') {
524 $g_empty_element_suffix = ">";
526 else {
527 $g_empty_element_suffix = " />";
530 $text = Markdown($text);
531 $text = $smartypants->($text, '1');
537 sub _tabDefault {
538 return $_haveBX || $_haveMT ? 4 : 8;
541 sub _strip {
542 my $str = shift;
543 defined($str) or return undef;
544 $str =~ s/^\s+//;
545 $str =~ s/\s+$//;
546 $str =~ s/\s+/ /g;
547 $str;
550 my %_yamlmode;
551 BEGIN {%_yamlmode = (
552 disable => 0,
553 reveal => 1,
554 enable => 1,
555 conceal => 1,
556 show => -1,
557 unknown => -1,
558 strip => -1
560 my %_yamlvis;
561 BEGIN {%_yamlvis = (
562 disable => 0,
563 reveal => 1,
564 enable => -1,
565 conceal => 0,
566 show => 1,
567 unknown => -1,
568 strip => 0
571 sub _require_pod_usage() {
572 require Pod::Usage;
573 eval 'require Pod::Text::Termcap; 1;' and
574 @Pod::Usage::ISA = (qw( Pod::Text::Termcap ));
575 defined($ENV{PERLDOC}) && $ENV{PERLDOC} ne "" or
576 $ENV{PERLDOC} = "-oterm -oman";
579 #### BBEdit/command-line text filter interface ##########################
580 sub _main {
581 local *ARGV = \@_;
584 #### Check for command-line switches: #################
585 my %options = ();
586 my %cli_opts;
587 my $raw = 0;
588 use Getopt::Long;
589 Getopt::Long::Configure(qw(bundling require_order pass_through));
590 GetOptions(
591 'help' => sub {
592 _require_pod_usage;
593 Pod::Usage::pod2usage(-verbose => 2, -exitval => 0)},
594 'h' => sub {
595 _require_pod_usage;
596 Pod::Usage::pod2usage(-verbose => 0, -exitval => 0)},
597 'version|V' => sub { # Version info
598 print "\nThis is Markdown, version $VERSION $DATE.\n", $COPYRIGHT;
599 print "License is Modified BSD (aka 3-clause BSD) License\n";
600 print "<https://opensource.org/licenses/BSD-3-Clause>\n";
601 exit 0},
602 'shortversion|short-version|s' => sub { # Just the version number string
603 print $VERSION;
604 exit 0},
605 'html4tags' => \$cli_opts{'html4tags'},
606 'deprecated' => \$cli_opts{'deprecated'},
607 'sanitize' => \$cli_opts{'sanitize'},
608 'no-sanitize' => sub {$cli_opts{'sanitize'} = 0},
609 'validate-xml' => sub {$cli_opts{'validate-xml'} = 1},
610 'validate-xml-internal' => sub {$cli_opts{'validate-xml'} = 2},
611 'no-validate-xml' => sub {$cli_opts{'validate-xml'} = 0},
612 'stripcommentsstrict|stripcomments-strict|strip-comments-strict' =>
613 sub {$cli_opts{'stripcomments'} = 1},
614 'stripcomments|stripcommentslax|stripcomments-lax|strip-comments|strip-comments-lax' =>
615 sub {$cli_opts{'stripcomments'} = 2},
616 'stripcommentslaxonly|stripcomments-laxonly|stripcomments-lax-only|strip-comments-lax-only' =>
617 sub {$cli_opts{'stripcomments'} = 3},
618 'nostripcomments|no-stripcomments|no-strip-comments' => sub {$cli_opts{'stripcomments'} = 0},
619 'keepabs|keep-abs|k' => \$cli_opts{'keepabs'},
620 'absroot|a=s' => \$cli_opts{'absroot'},
621 'base|b=s' => \$cli_opts{'base'},
622 'htmlroot|r=s' => \$cli_opts{'htmlroot'},
623 'imageroot|i=s' => \$cli_opts{'imageroot'},
624 'div:s' => \$cli_opts{'divname'},
625 'wiki|w:s' => \$cli_opts{'wiki'},
626 'tabwidth|tab-width=s' => \$cli_opts{'tabwidth'},
627 'autonumber|auto-number' => \$cli_opts{'autonumber'},
628 'raw' => sub { $cli_opts{'raw'} = 1 },
629 'raw-xml' => sub { $cli_opts{'raw'} = 1 },
630 'raw-html' => sub { $cli_opts{'raw'} = 2 },
631 'stylesheet|style-sheet' => \$cli_opts{'stylesheet'},
632 'no-stylesheet|no-style-sheet' => sub {$cli_opts{'stylesheet'} = 0},
633 'keep-named-character-entities' => \$cli_opts{'keepcharents'},
634 'no-keep-named-character-entities' => sub {$cli_opts{'keepcharents'} = 0},
635 'us-ascii|ascii' => \$cli_opts{'us_ascii'},
636 'no-us-ascii|no-ascii' => sub {$cli_opts{'us_ascii'} = 0},
637 'stub' => \$cli_opts{'stub'},
638 'yaml:s' => \$cli_opts{'yaml'},
640 defined($cli_opts{'raw'}) or $cli_opts{'raw'} = 0;
641 my $stub = 0;
642 if ($cli_opts{'stub'}) {
643 $stub = 1;
645 if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
646 $options{empty_element_suffix} = ">";
647 $stub = -$stub;
649 if ($cli_opts{'deprecated'}) { # Allow <dir> and <menu> tags to pass through
650 _SetAllowedTag("dir");
651 _SetAllowedTag("menu");
653 my $xmlcheck;
654 $options{'keep_named_character_entities'} = $cli_opts{'keepcharents'} ? "1" : 0;
655 $options{'us_ascii'} = $cli_opts{'us_ascii'} ? "1" : 0;
656 $options{divwrap} = defined($cli_opts{'divname'});
657 $options{divname} = defined($cli_opts{'divname'}) ? $cli_opts{'divname'} : "";
658 $options{sanitize} = 1; # sanitize by default
659 $options{sanitize} = $cli_opts{'sanitize'} if defined($cli_opts{'sanitize'});
660 $xmlcheck = $options{sanitize} ? 2 : 0;
661 $xmlcheck = $cli_opts{'validate-xml'} if defined($cli_opts{'validate-xml'});
662 $options{stripcomments} = $cli_opts{'stripcomments'} if defined($cli_opts{'stripcomments'});
663 die "--html4tags and --validate-xml are incompatible\n"
664 if $cli_opts{'html4tags'} && $xmlcheck == 1;
665 die "--no-sanitize and --validate-xml-internal are incompatible\n"
666 if !$options{'sanitize'} && $xmlcheck == 2;
667 die "--no-sanitize and --strip-comments are incompatible\n"
668 if !$options{'sanitize'} && $options{stripcomments};
669 die "--raw-html requires --validate-xml-internal\n"
670 if $cli_opts{'raw'} == 2 && $xmlcheck != 2;
671 if ($xmlcheck == 1) {
672 eval { require XML::Simple; 1 } and $hasxml = 1 or $hasxml_err = $@;
673 eval { require XML::Parser; 1 } and $hasxmlp = 1 or $hasxmlp_err = $@ unless $hasxml;
674 die "$hasxml_err$hasxmlp_err" unless $hasxml || $hasxmlp;
676 if ($cli_opts{'tabwidth'}) {
677 my $tw = $cli_opts{'tabwidth'};
678 die "invalid tab width (must be integer)\n" unless looks_like_number $tw;
679 die "invalid tab width (must be >= 2 and <= 32)\n" unless $tw >= 2 && $tw <= 32;
680 $options{tab_width} = int(0+$tw);
682 $options{auto_number} = 6 if $cli_opts{'autonumber'};
683 $options{keepabs} = $cli_opts{'keepabs'};
684 $options{abs_prefix} = ""; # no abs prefix by default
685 if ($cli_opts{'absroot'}) { # Use abs prefix for absolute path URLs
686 my $abs = $cli_opts{'absroot'};
687 $abs =~ s{/+$}{};
688 $options{abs_prefix} = $abs;
690 $options{base_prefix} = ""; # no base prefix by default
691 if ($cli_opts{'base'}) { # Use base prefix for fragment URLs
692 $options{base_prefix} = $cli_opts{'base'};
694 if ($cli_opts{'htmlroot'}) { # Use URL prefix
695 $options{url_prefix} = $cli_opts{'htmlroot'};
697 if ($cli_opts{'imageroot'}) { # Use image URL prefix
698 $options{img_prefix} = $cli_opts{'imageroot'};
700 SetWikiOpts(\%options, $cli_opts{'wiki'}); # Set wiki links options
701 if (ref($options{wikiopt}) eq 'HASH') {
702 my $o = $options{wikiopt};
703 $o->{"f"} && $o->{"%"} and
704 die "--wiki sub-options 'f' and '%' are mutually exclusive\n"
706 if ($cli_opts{'raw'}) {
707 $raw = 1;
708 $options{htmlauto} = 1 if $cli_opts{'raw'} == 2;
710 $options{show_styles} = $cli_opts{'stylesheet'} if defined($cli_opts{'stylesheet'});
711 $options{show_styles} = 1 if $stub && !defined($options{show_styles});
712 $options{tab_width} = 8 unless defined($options{tab_width});
713 my $ym = $cli_opts{'yaml'};
714 defined($ym) && $ym ne "" or $ym = "enable";
715 my $lcym = lc($ym);
716 exists($_yamlmode{$lcym}) or die "invalid --yaml= value '$ym'\n";
717 $options{yamlmode} = $_yamlmode{$lcym};
718 $options{yamlvis} = $_yamlvis{$lcym};
720 my $hdrf = sub {
721 my $out = "";
722 if ($stub > 0) {
723 $out .= <<'HTML5';
724 <!DOCTYPE html>
725 <html xmlns="http://www.w3.org/1999/xhtml">
726 <head>
727 <meta charset="utf-8" />
728 <meta http-equiv="content-type" content="text/html; charset=utf-8" />
729 HTML5
730 } elsif ($stub < 0) {
731 $out .= <<'HTML4';
732 <html>
733 <head>
734 <meta charset="utf-8">
735 <meta http-equiv="content-type" content="text/html; charset=utf-8">
736 HTML4
738 if ($stub && ($options{title} || $options{h1})) {
739 my $title = $options{title};
740 defined($title) && $title ne "" or $title = $options{h1};
741 if (defined($title) && $title ne "") {
742 $title =~ s/&/&amp;/g;
743 $title =~ s/</&lt;/g;
744 $out .= "<title>$title</title>\n";
747 $out .= GenerateStyleSheet($g_style_prefix) if $options{show_styles};
748 if ($stub) {
749 $out .= "</head>\n<body style=\"text-align:center\">\n" .
750 "<div style=\"display:inline-block;text-align:left;max-width:42pc\">\n";
752 $out;
755 #### Process incoming text: ###########################
756 my ($didhdr, $hdr, $result, $ftr) = (0, "", "", "");
757 @ARGV or push(@ARGV, "-");
758 foreach (@ARGV) {
759 my ($fh, $contents, $oneresult);
760 $_ eq "-" or open $fh, '<', $_ or fauxdie "could not open \"$_\": $!\n";
762 local $/; # Slurp the whole file
763 $_ eq "-" and $contents = <STDIN>;
764 $_ ne "-" and $contents = <$fh>;
766 defined($contents) or fauxdie "could not read \"$_\": $!\n";
767 $_ eq "-" or close($fh);
768 $options{xmlcheck} = ($xmlcheck == 2) ? 2 : 0;
769 $oneresult = $raw ? ProcessRaw($contents, \%options) : Markdown($contents, \%options);
770 $oneresult =~ s/\s+$//os;
771 if ($oneresult ne "") {
772 if (!$didhdr && !$raw) {
773 $hdr = &$hdrf();
774 $didhdr = 1;
776 $result .= $oneresult . "\n";
779 $hdr = &$hdrf() unless $didhdr || $raw;
780 $ftr = "</div>\n</body>\n</html>\n" if $stub && !$raw;
781 if ($xmlcheck == 1) {
782 my ($good, $errs);
783 if ($stub && !$raw) {
784 ($good, $errs) = _xmlcheck($hdr.$result.$ftr);
785 } else {
786 ($good, $errs) = _xmlcheck("<div>".$result."</div>");
788 $good or die $errs;
790 print $hdr, $result, $ftr;
792 exit 0;
796 # INPUT
797 # $1: HASH ref
798 # $2: value of --wiki= option (see docs) except
799 # that a value of undef turns off wiki links
800 # OUTPUT
801 # $1->{wikipat}
802 # $1->{wikiopt}
804 sub SetWikiOpts {
805 my ($o, $wpat) = @_;
806 ref($o) eq "HASH" or die "internal error: first arg to SetWikiOpts must be HASH ref";
807 delete $o->{wikipat};
808 delete $o->{wikiopt};
809 defined($wpat) or return;
810 # Parse wiki links option setting
811 my $wopt = "s(:md)";
812 if ($wpat =~ /^(.*?)%\{((?:[%0-9A-Za-z]|[Ss]\([^)]*\))*)\}(.*)$/) {
813 $o->{wikipat} = $1 . "%{}" . $3;
814 $wopt = $2;
815 } else {
816 $o->{wikipat} = $wpat . "%{}.html";
818 my $sval = 1;
819 while ($wopt =~ /^(.*?)s\(([^)]*)\)(.*)$/i) {
820 my $sarg = $2;
821 $wopt = $1 . "s" . $3;
822 $sarg =~ s/^\s+//; $sarg =~ s/\s+$//;
823 $sval = {} unless ref($sval) eq "HASH";
824 s/^\.//, $sval->{lc($_)}=1 foreach split(/(?:\s*,\s*)|(?:(?<!,)\s+(?!,))/, $sarg);
825 $sval = 1 unless scalar(keys(%$sval));
827 $o->{wikiopt} = { map({$_ => 1} split(//,lc($wopt))) };
828 if (ref($sval) eq "HASH" && $sval->{':md'}) {
829 delete $sval->{':md'};
830 $sval->{$_} = 1 foreach qw(md rmd mkd mkdn mdwn mdown markdown litcoffee);
832 $o->{wikiopt}->{'s'} = $sval if $o->{wikiopt}->{'s'};
836 # Return a copy of the fancy CSS style sheet that uses the
837 # passed in prefix as a prefix for the CSS style names.
838 # If no argument is passed in, use $g_style_prefix
839 # as the CSS style name prefix.
840 sub GenerateStyleSheet {
841 my $prefix = shift;
842 defined($prefix) or $prefix = $g_style_prefix;
843 my $stylesheet = $g_style_sheet;
844 $stylesheet =~ s/%\(base\)/$prefix/g;
845 return $stylesheet;
849 sub _xmlcheck {
850 my $text = shift;
851 my ($good, $errs);
852 ($hasxml ? eval { XML::Simple::XMLin($text, KeepRoot => 1) && 1 } :
853 eval {
854 my $p = XML::Parser->new(Style => 'Tree', ErrorContext => 1);
855 $p->parse($text) && 1;
856 }) and $good = 1 or $errs = _trimerr($@);
857 ($good, $errs);
861 sub _trimerr {
862 my $err = shift;
863 1 while $err =~ s{\s+at\s+\.?/[^,\s\n]+\sline\s+[0-9]+\.?(\n|$)}{$1}is;
864 $err =~ s/\s+$//os;
865 $err . "\n";
869 sub _PrepareInput {
870 my ($input,$parseyaml) = @_;
871 defined $input or $input = "";
873 use bytes;
874 $input =~ s/[\x00-\x08\x0B\x0E-\x1F\x7F]+//gso;
876 my $output;
877 if (Encode::is_utf8($input) || utf8::decode($input)) {
878 $output = $input;
879 } else {
880 $output = $encoder->decode($input, Encode::FB_DEFAULT);
882 # Standardize line endings:
883 $output =~ s{\r\n}{\n}g; # DOS to Unix
884 $output =~ s{\r}{\n}g; # Mac to Unix
886 # Extract YAML front matter if requested
887 my $yaml = undef;
888 if ($parseyaml) {
889 $yaml = {};
890 if ($output =~ /^---[ \t]*(?:\n|\z)/g) {
891 until ($output =~ /\G(?:(?:(?:---)|(?:\.\.\.))[ \t]*(?:\n|\z)|\z)/gc) {
892 next if $output =~ m"\G[ \t]*(?:#[^\n]*)?\n"gc; # skip comment lines
893 next if $output =~ m"\G[ \t]*(?:#[^\n]*)\z"gc; # skip final no EOL comment
894 last unless $output =~ /\G([^\n]+)(?:\n|\z)/gc;
895 my $yl = $1;
896 if ($yl =~ /^([A-Za-z_][A-Za-z_0-9.-]*):[ \t]+(.*)$/os) {
897 my ($k, $v) = ($1, $2);
898 $yaml->{lc($k)} = _YAMLvalue($2);
901 $output = substr($output, pos($output));
904 return wantarray ? ($output, $yaml) : $output;
908 sub _YAMLvalue {
909 my $v = shift;
910 $v =~ s/^\s+//;
911 if (substr($v, 0, 1) eq '"') {
912 # only $ and/or @ present issues, map them
913 $v =~ tr/\@\$/\036\037/;
914 eval '{$v='.$v."\n}1" or $v = undef;
915 $v =~ tr/\036\037/\@\$/ if defined($v);
916 } else {
917 $v =~ s"#.*$""os;
918 $v =~ s/\s+$//os;
919 $v ne "" or $v = undef;
921 return $v;
925 sub ProcessRaw {
926 my $text = _PrepareInput(shift);
928 # Any remaining arguments after the first are options; either a single
929 # hashref or a list of name, value pairs. See _SanitizeOpts comments.
931 %opt = (
932 empty_element_suffix => $g_empty_element_suffix,
934 my %args = ();
935 if (ref($_[0]) eq "HASH") {
936 %args = %{$_[0]};
937 } else {
938 %args = @_;
940 while (my ($k,$v) = each %args) {
941 $opt{$k} = $v;
943 _SanitizeOpts(\%opt);
945 # Sanitize all '<'...'>' tags if requested
946 $text = _SanitizeTags($text, $opt{xmlcheck}, $opt{htmlauto}) if $opt{sanitize};
948 # Eliminate known named character entities
949 $opt{keep_named_character_entities} or
950 $text = ConvertNamedCharacterEntities($text);
952 # Convert to US-ASCII only if requested
953 $opt{us_ascii} and
954 $text = ConvertToASCII($text);
956 utf8::encode($text);
957 if ($opt{divwrap}) {
958 my $id = $opt{divname};
959 defined($id) or $id = "";
960 $id eq "" or $id = ' id="'.escapeXML($id).'"';
961 chomp($text);
962 return "<div$id>\n".$text."\n</div>\n";
964 return $text;
968 # $1: HASH ref with the following key value semantics
970 # sanitize => any-false-value (no action), any-true-value (sanitize).
971 # note that a true value of xmlcheck or a true value of
972 # stripcomments or a urlfunc value that is a CODE ref
973 # always forces sanitize to activate.
974 # tag attributes are sanitized by removing all "questionable"
975 # attributes (such as script attributes, unknown attributes
976 # and so forth) and normalizing the remaining ones (i.e.
977 # adding missing quotes and/or values etc.).
978 # effective for both ProcessRaw and Markdown.
979 # xmlcheck => 0 (no check), any-true-value (internal check).
980 # note that the default if xmlcheck is not set/valid is 2.
981 # note that a true value is effective for both ProcessRaw
982 # and Markdown. note that a true value automatically inserts
983 # the closing tag for auto-closing tags and converts empty tags
984 # to the correct format converting empty tags that shouldn't be
985 # to an open and close pair; since xmlcheck is a function of the
986 # sanitizer, tag attributes are also always sanitized whenever
987 # xmlcheck has a true value.
988 # note that a true xmlcheck value WILL call "die" with a
989 # detailed indication of the error(s) if xml validation fails
990 # in which case any line/column numbers refer to the text that
991 # would be produced by a sanitize=>0, xmlcheck=>0 call to
992 # either ProcessRaw or Markdown, NOT the original input text.
993 # htmlauto => any-false-value (no auto close), any-true-value (auto-close)
994 # only effective for ProcessRaw; always enabled for Markdown.
995 # when xmlcheck is set to 2 provide html automatic closing tag
996 # and optional closing tag semantics where closing tags are
997 # automatically inserted when encountering an opening tag that
998 # auto closes a currently open tag and tags with an optional
999 # closing tag that's missing have that inserted as appropriate.
1000 # a true value may result in some texts being rejected that
1001 # would be otherwise be accepted (e.g. "<p><pre></pre></p>"
1002 # which gets turned into "<p></p><pre></pre></p>" which then
1003 # no longer validates).
1004 # stripcomments => any-false-value (no action), any-true-value (strip).
1005 # => 1 (strip-strict), 2 (strip-lax), 3 (strip-lax-only)
1006 # a non-numeric true value will be forced to 2.
1007 # a numeric value < 0 will be forced to 2.
1008 # a numeric value > 0 and < 1 will be forced to 2.
1009 # a numeric value > 3 will be forced to 3.
1010 # a non-integer value will forced to an integral value.
1011 # 1, 2, and 3 correspond to the command line options
1012 # --strip-comments-strict, --strip-comments-lax and
1013 # --strip-comments-lax-only respectively.
1014 # since the strip comments mechanism is a function of the
1015 # sanitizer, if stripcomments is set to any-true-value then
1016 # tag attributes will also always be sanitized.
1017 # if stripcomments is not set or is set to the empty string,
1018 # then it will be set to 3 if sanitize is true and 0 otherwise.
1019 # effective for both ProcessRaw and Markdown.
1020 # empty_element_suffix => " />" or ">"
1021 # will be forced to " />" if not valid or defined.
1022 # effective for both ProcessRaw and Markdown.
1023 # keep_named_character_entities => "1" (keep them), any-other-value (convert).
1024 # unless this option is present and has exactly the value "1"
1025 # then known named character entities will be converted to
1026 # their equivalent numerical entity. Use of this option is
1027 # strongly discouraged to avoid strict XML validation failures.
1028 # us_ascii => if true, non-US-ASCII characters will be converted to
1029 # numerical character entities making the output US-ASCII only.
1030 # divwrap => if true, wrap output contents in <div>...</div>
1031 # divname => if defined and non-empty will be id of divwrap div tag
1032 # urlfunc => if set to a CODE ref, the function will be called with
1033 # seven arguments like so:
1034 # $result = &$urlfunc($iresult, \%opts, $tag, $uhost, $uabs, $q, $f)
1035 # where on input $iresult is the result that would be produced
1036 # if no urlfunc was provided and on output $result will be
1037 # used as url value. $tag is either "img" or "a" to indicate the
1038 # source of the url. $uhost.$uabs is the result of stripping off
1039 # any query string and/or fragment from $iresult. $q contains either
1040 # an empty string or the stripped off query string and $f contains
1041 # an empty string or the stripped off fragment if they were originally
1042 # present where a non-empty $q always starts with '?' and a non-empty
1043 # $f always starts with '#'. $uhost contains the scheme and
1044 # host+port if present (it may be the empty string). $uabs contains
1045 # the path portion which may or may not start with a "/" depending on
1046 # whether or not it's a relative path. The $iresult value is related
1047 # to the other arguments like so:
1048 # $iresult = $uhost . $uabs . $q . $f;
1049 # All values passed to the urlfunc function have already been HTML
1050 # unescaped and the returned value will be automatically HTML escaped.
1051 # Any provided urlfunc should treat the %opts HASH as
1052 # read-only. Modifying the %opts HASH in urlfunc will
1053 # likely result in unpredictable behavior! Don't do it!
1054 # If urlfunc is set to a CODE ref then tags will also always be
1055 # sanitized.
1057 # The remaining key value pairs are ignored by ProcessRaw and are only
1058 # effective when using Markdown or _main
1060 # tab_width => 1..32 which is how many spaces tabs are expanded to.
1061 # will be forced to 8 if not in range.
1062 # indent_width => 1..32 how many spaces make a new "indent" level.
1063 # will be forced to 4 if not in range.
1064 # style_prefix => prefix to prepend to all CSS style names in the
1065 # fancy CSS style sheet.
1066 # defaults to $g_style_prefix if not defined.
1067 # note that _main actually adds the style sheet (when
1068 # requested); use GenerateStyleSheet to retrieve the
1069 # fancy style sheet when calling Markdown directly.
1070 # auto_number => <= 0 (default) no numbering, 1 number h1s,
1071 # 2 number h1s, h2s, 3 number h1-h3s, ... >= 6 number h1-h6s
1072 # anchors => existence of this key triggers return of anchors HASH
1073 # yamlmode => 0 (no YAML processing), > 0 (YAML on), < 0 (YAML ignore)
1074 # if 0, the YAML front matter processor is completely
1075 # disabled and any YAML front matter that might be present
1076 # will be treated as markup. if > 0 any YAML front matter
1077 # will be processed and any recognized options applied.
1078 # if < 0 any YAML front matter will be parsed but no
1079 # options will be applied at all. When != 0 the parsed
1080 # YAML front matter can be retrieved via the 'yaml' key.
1081 # defaults to 1 if not defined or not a number.
1082 # yamlvis => 0 (invisible), > 0 (visible), < 0 (vis if unknown)
1083 # if yamlmode == 0 then yamlvis has no effect. if > 0
1084 # then any parsed YAML front matter options will be shown
1085 # in the formatted output. if 0 then NO YAML front
1086 # matter options will be shown in the formatted output.
1087 # if < 0 then YAML front matter options will be shown in
1088 # the formatted output only if there are any unrecognized
1089 # options present.
1090 # defaults to -1 if not defined or not a number.
1091 # keepabs => any-false-value (no action), any-true-value (keep)
1092 # if true, any absolute path URLs remaining after applying
1093 # any abs_prefix value will be kept and not be subject
1094 # to modification by any url_prefix or img_prefix value.
1095 # abs_prefix => value to prefix to absolute path URLs (i.e. start with /).
1096 # note that this does NOT get prepended to //host/path URLs.
1097 # url_prefix => value to prefix to non-absolute URLs.
1098 # note that this does NOT get prepended to //host/path URLs.
1099 # img_prefix => value to prefix to non-absolute image URLs.
1100 # note that this does NOT get prepended to //host/path URLs.
1101 # note that if img_prefix is undef or empty ("") then
1102 # url_prefix will be prepended to image URLs.
1103 # base_prefix => value to prefix to fragment-only URLs (i.e. start with #).
1104 # note that fragment-only URLs are always left undisturbed
1105 # if this is not set. Fragment-only URLs are NOT affected by
1106 # any of abs_prefix, url_prefix or img_prefix.
1107 # wikipat => non-empty pattern string to enable wiki links.
1108 # best set with SetWikiOpts (see SetWikiOpts comments).
1109 # wikiopt => HASH ref of options affecting wiki links processing.
1110 # best set with SetWikiOpts (see SetWikiOpts comments).
1111 # wikifunc => if set to a CODE ref, the function will be called with
1112 # six arguments like so:
1113 # $result = &$wikifunc($iresult, \%opts, $link, $wbase, $qf, $io)
1114 # where on input $iresult is the result that would be produced
1115 # if no wikifunc was provided and on output $result will be
1116 # used as the wiki expansion. $link is the original wiki
1117 # destination as specified in the source, $wbase is the result
1118 # of stripping off any query string and/or fragment from $link
1119 # and then transforming that according to the wikiopt HASH ref.
1120 # $qf contains either an empty string or the stripped off
1121 # query string and/or fragment if one was originally present.
1122 # If $io is a HASH ref (otherwise it will be undef), then it's
1123 # a wiki image link and %$io contains the options (if any) where
1124 # the keys that might be present include "width", "height", "align"
1125 # and "alt". If present, width and height are guaranteed to be
1126 # positive integers and align is guaranteed to be "left", "right"
1127 # or "center". The value for "alt" may be the empty string.
1128 # The "imgflag" key has a SCALAR ref value and if the value it
1129 # refers to is changed to any false value the result will become
1130 # an A tag rather than an IMG tag.
1131 # The $iresult value is related to the other arguments like so:
1132 # $iresult = $opts->{wikipat};
1133 # $iresult =~ s/%\{\}.+$/%{}/ if ref($io) eq "HASH";
1134 # $iresult = s/%\{\}/$wbase/;
1135 # $iresult .= $qf;
1136 # Any provided wikifunc should treat the %opts HASH as
1137 # read-only. Modifying the %opts HASH in wikifunc will
1138 # likely result in unpredictable behavior! Don't do it!
1140 # Special handling for abs_prefix, url_prefix, img_prefix and/or base_prefix
1141 # may be activated by setting any subset (or all) of the values for these
1142 # keys to a CODE ref. The single argument is the URL and the result must
1143 # be the adjusted URL. For example, the equivalent CODE ref to setting
1144 # url_prefix to $string is simply sub { $string.$_[0] }. By using a
1145 # CODE ref, behavior other than simply performing a prepend operation
1146 # can be realized when necessary for unusual situations.
1148 # The following are OUTPUT values that can only be retrieved when
1149 # Markdown is called with a HASH ref as the second argument
1151 # anchors => if the 'anchors' key exists in the input HASH ref
1152 # will be set to a HASH ref containing lookup keys
1153 # for valid fragment ids in the document (only those
1154 # created from Markdown markup) with the value the
1155 # actual fragment link to use. Do not use this directly
1156 # but pass it as the first argument to the ResolveFragment
1157 # function to resolve a "fuzzy" fragment name to its
1158 # actual fragment name in the generated output.
1159 # NOTE: to activate return of anchors the 'anchors' key
1160 # simply must exist in the input HASH ref passed to the
1161 # Markdown function, its value will be replaced on output.
1163 # h1 => will be set to the tag-stripped value of the first
1164 # non-empty H1 generated by Markdown-style markup.
1165 # note that literal <h1>...</h1> values are NOT picked up.
1166 # will be left unchanged if no Markdown-style H1 detected.
1167 # note that the value is NOT xml escaped but should be
1168 # before embedding in an XHTML document. If yamlmode > 0
1169 # and a 'title' value has been encountered, then this
1170 # will be set to that 'title' value instead (and the
1171 # 'title' key and value will still be present in %$yaml).
1173 # yaml => if yamlmode is != 0 then this will be set to a HASH
1174 # ref containing any parsed YAML front matter or left
1175 # unchanged if no YAML front matter was found. If the
1176 # parsed YAML front matter contains only whitespace and/or
1177 # comments then this will be set to a HASH ref that has
1178 # no keys or values.
1180 sub _SanitizeOpts {
1181 my $o = shift; # hashref
1182 ref($o) eq "HASH" or return;
1184 $o->{firstline} = 0;
1185 $o->{keep_named_character_entities} = 0 unless
1186 defined($o->{keep_named_character_entities}) && $o->{keep_named_character_entities} eq "1";
1187 $o->{xmlcheck} = looks_like_number($o->{xmlcheck}) && $o->{xmlcheck} == 0 ? 0 : 2;
1188 $o->{sanitize} = 1 if $o->{xmlcheck} && !$o->{sanitize};
1189 $o->{sanitize} = 1 if ref($o->{urlfunc}) eq 'CODE' && !$o->{sanitize};
1190 !looks_like_number($o->{stripcomments}) and
1191 $o->{stripcomments} = $o->{stripcomments} ? 2 :
1192 ($o->{sanitize} && (!defined($o->{stripcomments}) || $o->{stripcomments} eq "") ? 3 : 0);
1193 $o->{stripcomments} && $o->{stripcomments} < 1 and $o->{stripcomments} = 2;
1194 $o->{stripcomments} = int($o->{stripcomments});
1195 $o->{stripcomments} > 3 and $o->{stripcomments} = 3;
1196 $o->{stripcomments} && !$o->{sanitize} and $o->{sanitize} = 1;
1198 # this is gross, but having the globals avoids unnecessary slowdown
1199 if ($o->{sanitize} && $o->{xmlcheck}) {
1200 $g_start_p = "<\20>";
1201 $g_close_p = "</\20>";
1202 } else {
1203 $g_start_p = "<p>";
1204 $g_close_p = "</p>";
1207 defined($o->{empty_element_suffix}) &&
1208 ($o->{empty_element_suffix} eq " />" || $o->{empty_element_suffix} eq ">")
1209 or $o->{empty_element_suffix} = " />";
1211 $o->{tab_width} = 8 unless looks_like_number($o->{tab_width}) &&
1212 1 <= $o->{tab_width} && $o->{tab_width} <= 32;
1213 $o->{tab_width} = int($o->{tab_width});
1215 $o->{indent_width} = 4 unless looks_like_number($o->{indent_width}) &&
1216 1 <= $o->{indent_width} && $o->{indent_width} <= 32;
1217 $o->{indent_width} = int($o->{indent_width});
1219 defined($o->{auto_number}) or $o->{auto_number} = '';
1220 $o->{auto_number} eq '' || looks_like_number($o->{auto_number})
1221 or $o->{auto_number} = 6;
1222 if ($o->{auto_number} ne '') {
1223 $o->{auto_number} = int(0+$o->{auto_number});
1224 $o->{auto_number} >= 0 or $o->{auto_number} = 0;
1225 $o->{auto_number} <= 6 or $o->{auto_number} = 6;
1228 defined($o->{style_prefix}) or $o->{style_prefix} = $g_style_prefix;
1230 $o->{abs_prefix} = _MakePrefixCODERef($o->{abs_prefix}, 1)
1231 unless ref($o->{abs_prefix}) eq 'CODE';
1232 $o->{url_prefix} = _MakePrefixCODERef($o->{url_prefix}, 0)
1233 unless ref($o->{url_prefix}) eq 'CODE';
1234 $o->{img_prefix} = _MakePrefixCODERef($o->{img_prefix}, 0)
1235 unless ref($o->{img_prefix}) eq 'CODE';
1236 $o->{base_prefix} = _MakePrefixCODERef($o->{base_prefix}, -1)
1237 unless ref($o->{base_prefix}) eq 'CODE';
1239 ref($o->{wikiopt}) eq "HASH" or $o->{wikiopt} = {};
1241 # Note that because Markdown makes a copy of the options
1242 # before calling this function, this does not actually remove
1243 # any "h1" key that might have been set by the caller of
1244 # the Markdown function. However, by deleting it here,
1245 # this guarantees that any found value will actually be
1246 # picked up and stored (which will not happen if the key
1247 # already exists).
1248 delete $o->{h1};
1250 # Default is to silently strip any known YAML front matter
1251 # Same comment about "yaml" key as above for "h1" key
1252 $o->{yamlmode} = 1 unless looks_like_number($o->{yamlmode});
1253 $o->{yamlvis} = -1 unless looks_like_number($o->{yamlvis});
1254 delete $o->{yaml};
1256 # The anchors hash will only be returned if the key exists
1257 # (the key's value doesn't matter), set the value to an empty
1258 # HASH ref just in case to make sure it's always a HASH ref.
1259 $o->{anchors} = {} if exists($o->{anchors});
1262 my %_yamlopts;
1263 BEGIN {%_yamlopts = map({$_ => 1} qw(
1264 display_metadata
1265 header_enum
1266 title
1270 sub _HasUnknownYAMLOptions {
1271 do { return 1 unless exists($_yamlopts{$_}) } foreach keys(%{$_[0]});
1272 return 0;
1276 sub _ApplyYAMLOpts {
1277 my ($yaml, $opt) = @_;
1278 if (defined($yaml->{display_metadata}) && $opt->{yamlvis} < 0) {
1279 # ignore display_metadata except in --yaml=enable mode
1280 $opt->{yamlvis} = _YAMLTrueValue($yaml->{display_metadata}) ? 1 : 0;
1282 $opt->{h1} = $yaml->{title} if defined($yaml->{title});
1283 if (defined($yaml->{header_enum}) && $opt->{auto_number} eq '') {
1284 $opt->{auto_number} = _YAMLTrueValue($yaml->{header_enum}) ? 6 : 0;
1289 sub _YAMLTrueValue {
1290 my $v = shift;
1291 defined($v) or $v = "";
1292 $v = lc($v);
1293 return !($v eq "" || $v eq "0" || $v eq "false" || $v eq "disable" || $v eq "off" || $v eq "no");
1297 # Actually returns an empty string rather than a CODE ref
1298 # if an empty prefix is passed in. Trailing "/"s are trimmed
1299 # off if the second argument is positive or the string does NOT
1300 # consist of only "/"s. A trailing "/" is added unless the
1301 # trimmed prefix already has one or the second argument is true.
1302 # If the second argument is negative, the prefix is used as-is.
1303 sub _MakePrefixCODERef {
1304 my ($prefix, $mtok) = @_;
1305 defined($prefix) or $prefix = "";
1306 looks_like_number($mtok) or $mtok = $mtok ? 1 : 0;
1307 if ($mtok > 0) {
1308 $prefix =~ s,/+$,,;
1309 } elsif (!$mtok) {
1310 $prefix =~ s,//+$,/,;
1312 $prefix ne "" or return "";
1313 $prefix .= '/' if !$mtok && substr($prefix, -1, 1) ne '/';
1314 return sub { $prefix . $_[0] };
1318 sub Markdown {
1320 # Primary function. The order in which other subs are called here is
1321 # essential. Link and image substitutions need to happen before
1322 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
1323 # and <img> tags get encoded.
1325 my $text = shift;
1327 # Any remaining arguments after the first are options; either a single
1328 # hashref or a list of name, value pairs. See _SanitizeOpts comments.
1329 %opt = (
1330 # set initial defaults
1331 style_prefix => $g_style_prefix,
1332 empty_element_suffix => $g_empty_element_suffix,
1333 tab_width => _tabDefault,
1334 indent_width => $g_indent_width,
1335 abs_prefix => "", # Prefixed to absolute path URLs
1336 url_prefix => "", # Prefixed to non-absolute URLs
1337 img_prefix => "", # Prefixed to non-absolute image URLs
1338 base_prefix => "", # Prefixed to fragment-only URLs
1340 @autonum = ();
1341 my %args = ();
1342 if (ref($_[0]) eq "HASH") {
1343 %args = %{$_[0]};
1344 } else {
1345 %args = @_;
1347 while (my ($k,$v) = each %args) {
1348 $opt{$k} = $v;
1350 _SanitizeOpts(\%opt);
1352 my $yaml;
1353 ($text, $yaml) = _PrepareInput($text, $opt{yamlmode});
1354 _ApplyYAMLOpts($yaml, \%opt) if ref($yaml) eq "HASH" && $opt{yamlmode} > 0;
1355 my $yamltable = "";
1356 if (ref($yaml) eq "HASH" && %$yaml && $opt{yamlmode} && $opt{yamlvis}) {
1357 if ($opt{yamlvis} > 0 || _HasUnknownYAMLOptions($yaml)) {
1358 my ($hrows, $drows) = ("", "");
1359 foreach (sort(keys(%$yaml))) {
1360 my $v = $yaml->{$_};
1361 my $rspn = '';
1362 if (defined($v)) {
1363 $v =~ s/&/&amp;/g;
1364 $v =~ s/</&lt;/g;
1365 utf8::encode($v);
1366 $drows .= "<td>" . $v . "</td>\n";
1367 } else {
1368 $rspn = " class=\"$opt{style_prefix}yaml-undef-value\" rowspan=\"2\" valign=\"top\"";
1370 $hrows .= "<th$rspn>" . $_ . "</th>\n";
1372 $yamltable = "<table class=\"$opt{style_prefix}yaml-table\" border=\"1\">\n" .
1373 "<tr>\n$hrows</tr>\n<tr>\n$drows</tr>\n</table>\n";
1374 $opt{firstline} = scalar(@{[$yamltable =~ /\n/g]});
1378 # Clear the globals. If we don't clear these, you get conflicts
1379 # from other articles when generating a page which contains more than
1380 # one article (e.g. an index page that shows the N most recent
1381 # articles):
1382 %g_urls = ();
1383 %g_titles = ();
1384 %g_anchors = ();
1385 %g_anchors_id = ();
1386 %g_block_ids = ();
1387 %g_code_block_ids = ();
1388 %g_html_blocks = ();
1389 %g_code_blocks = ();
1390 @g_xml_comments = ();
1391 $g_list_level = 0;
1393 # Make sure $text ends with a couple of newlines:
1394 $text .= "\n\n";
1396 # Handle backticks-delimited code blocks
1397 $text = _HashBTCodeBlocks($text);
1399 # Convert all tabs to spaces.
1400 $text = _DeTab($text);
1402 # Strip any lines consisting only of spaces.
1403 # This makes subsequent regexen easier to write, because we can
1404 # match consecutive blank lines with /\n+/ instead of something
1405 # contorted like / *\n+/ .
1406 $text =~ s/^ +$//mg;
1408 # Turn block-level HTML blocks into hash entries
1409 $text = _HashHTMLBlocks($text, 1);
1411 # Strip link definitions, store in hashes.
1412 $text = _StripLinkDefinitions($text);
1414 $text = _RunBlockGamut($text, 1);
1416 # Remove indentation markers
1417 $text =~ s/\027+//gs;
1419 # Expand auto number flags
1420 $text =~ s/\034([1-6])/_AutoHeaderNum(ord($1)&0x7)/gse
1421 if $opt{auto_number} ne '' && $opt{auto_number} > 0;
1423 # Unhashify code blocks
1424 $text =~ s/(\025\d+\026)/$g_code_blocks{$1}/g;
1426 $text = _UnescapeSpecialChars($text);
1428 $text .= "\n" unless $text eq "";
1430 # Sanitize all '<'...'>' tags if requested
1431 $text = _SanitizeTags($text, $opt{xmlcheck}, 1) if $opt{sanitize};
1433 # Eliminate known named character entities
1434 $opt{keep_named_character_entities} or do {
1435 $yamltable = ConvertNamedCharacterEntities($yamltable);
1436 $text = ConvertNamedCharacterEntities($text);
1439 # Convert to US-ASCII only if requested
1440 $opt{us_ascii} and do {
1441 utf8::decode($yamltable);
1442 $yamltable = ConvertToASCII($yamltable);
1443 utf8::encode($yamltable);
1444 $text = ConvertToASCII($text);
1447 utf8::encode($text);
1448 if (ref($_[0]) eq "HASH") {
1449 ${$_[0]}{anchors} = {%g_anchors_id} if exists(${$_[0]}{anchors});
1450 if (defined($opt{h1}) && $opt{h1}) {
1451 utf8::encode($opt{h1});
1452 ${$_[0]}{h1} = $opt{h1};
1454 ${$_[0]}{yaml} = $yaml if ref($yaml) eq "HASH";
1457 if ($opt{divwrap}) {
1458 my $id = $opt{divname};
1459 defined($id) or $id = "";
1460 $id eq "" or $id = ' id="'.escapeXML($id).'"';
1461 chomp($text);
1462 return "<div$id>\n".$yamltable.$text."\n</div>\n";
1464 return $yamltable.$text;
1468 sub _HashBTCodeBlocks {
1470 # Process Markdown backticks (```) delimited code blocks
1471 # Process some (limited recognition) tilde (~~~) delimited code blocks
1473 my $text = shift;
1474 my $less_than_indent = $opt{indent_width} - 1;
1476 $text =~ s{
1477 (?:(?<=\n)|\A)
1478 ([ ]{0,$less_than_indent})``(`+)[ \t]*(?:([\w.+-]+[#]?)(?:[ \t][ \t\w.+-]*)?)?\n
1479 ( # $4 = the code block -- one or more lines, starting with ```
1481 .*\n
1484 # and ending with ``` or end of document
1485 (?:(?:[ ]{0,$less_than_indent}``\2`*[ \t]*(?:\n|\Z))|\Z)
1487 # $2 contains syntax highlighting to use if defined
1488 my $leadsp = length($1);
1489 my $codeblock = $4;
1490 $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
1491 $codeblock = _DeTab($codeblock, 8, $leadsp); # physical tab stops are always 8
1492 $codeblock =~ s/\A\n+//; # trim leading newlines
1493 $codeblock =~ s/\s+\z//; # trim trailing whitespace
1494 $codeblock = _EncodeCode($codeblock); # or run highlighter here
1495 $codeblock = "<div class=\"$opt{style_prefix}code-bt\"><pre style=\"display:none\"></pre><pre><code>"
1496 . $codeblock . "\n</code></pre></div>";
1498 my $key = block_id($codeblock, 2);
1499 $g_code_blocks{$key} = $codeblock;
1500 "\n\n" . $key . "\n\n";
1501 }egmx;
1503 $text =~ s{
1504 (?:(?<=\n)|\A)
1505 ([ ]{0,$less_than_indent})~~(~)[ \t]*(?:([\w.+-]+[#]?)(?:[ \t][ \t\w.+-]*)?)?\n
1506 ( # $4 = the code block -- one or more lines, starting with ~~~
1508 .*\n
1511 # and ending with ~~~ or end of document
1512 (?:(?:[ ]{0,$less_than_indent}~~\2~*[ \t]*(?:\n|\Z))|\Z)
1514 # $2 contains syntax highlighting to use if defined
1515 my $leadsp = length($1);
1516 my $codeblock = $4;
1517 $codeblock =~ s/[ \t]+$//mg; # trim trailing spaces on lines
1518 $codeblock = _DeTab($codeblock, 8, $leadsp); # physical tab stops are always 8
1519 $codeblock =~ s/\A\n+//; # trim leading newlines
1520 $codeblock =~ s/\s+\z//; # trim trailing whitespace
1521 $codeblock = _EncodeCode($codeblock); # or run highlighter here
1522 $codeblock = "<div class=\"$opt{style_prefix}code-bt\"><pre style=\"display:none\"></pre><pre><code>"
1523 . $codeblock . "\n</code></pre></div>";
1525 my $key = block_id($codeblock);
1526 $g_html_blocks{$key} = $codeblock;
1527 "\n\n" . $key . "\n\n";
1528 }egmx;
1530 return $text;
1534 sub _StripLinkDefinitions {
1536 # Strips link definitions from text, stores the URLs and titles in
1537 # hash references.
1539 my $text = shift;
1540 my $less_than_indent = $opt{indent_width} - 1;
1542 # Link defs are in the form: ^[id]: url "optional title"
1543 while ($text =~ s{
1544 ^[ ]{0,$less_than_indent}\[(.+)\]: # id = $1
1545 [ ]*
1546 \n? # maybe *one* newline
1547 [ ]*
1548 <?((?:\S(?:\\\n\s*[^\s"(])?)+?)>? # url = $2
1549 [ ]*
1550 \n? # maybe one newline
1551 [ ]*
1553 (?<=\s) # lookbehind for whitespace
1554 (?:(['"])|(\()) # title quote char
1555 (.+?) # title = $5
1556 (?(4)\)|\3) # match same quote
1557 [ ]*
1558 )? # title is optional
1559 (?:\n+|\Z)
1561 {}mx) {
1562 my $id = _strip(lc $1); # Link IDs are case-insensitive
1563 my $url = $2;
1564 my $title = _strip($5);
1565 $url =~ s/\\\n\s*//gs;
1566 if ($id ne "") {
1567 # These values always get passed through _MakeATag or _MakeIMGTag later
1568 $g_urls{$id} = $url;
1569 if (defined($title) && $title ne "") {
1570 $g_titles{$id} = $title;
1575 return $text;
1578 my %ok_tag_name; # initialized later
1579 my ($block_tags_a, $block_tags_b, $block_tags_c);
1580 BEGIN {
1581 $block_tags_a = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/io;
1582 $block_tags_b = qr/\020|p|div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1583 $block_tags_c = qr/div|center|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/io;
1586 sub _HashHTMLBlocks {
1587 my ($text, $toplevel) = @_;
1588 my $less_than_indent = $opt{indent_width} - 1;
1589 my $idt = "\027" x $g_list_level;
1590 my $blkprc = $toplevel ?
1591 sub { return $ok_tag_name{$_[1]} ? _EncodeAmpsAndAngles($_[0]) : $_[0] } :
1592 sub { return $_[0] };
1594 # Hashify HTML blocks:
1595 # We only want to do this for block-level HTML tags, such as headers,
1596 # lists, and tables. That's because we still want to wrap <p>s around
1597 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
1598 # phrase emphasis, and spans. The list of tags we're looking for is
1599 # hard-coded:
1601 # First, look for nested blocks, e.g.:
1602 # <div>
1603 # <div>
1604 # tags for inner block must be indented.
1605 # </div>
1606 # </div>
1608 # The outermost tags must start at the left margin for this to match, and
1609 # the inner nested divs must be indented.
1610 # We need to do this before the next, more liberal match, because the next
1611 # match will start at the first `<div>` and stop at the first `</div>`.
1612 $text =~ s{
1613 ( # save in $1
1614 ^ # start of line (with /m)
1615 ((?:\Q$idt\E)?) # optional lead in = $2
1616 <($block_tags_a) # start tag = $3
1617 \b # word break
1618 (?:.*\n)*? # any number of lines, minimally matching
1619 \2</\3\s*> # the matching end tag
1620 [ ]* # trailing spaces
1621 (?=\n+|\Z) # followed by a newline or end of document
1624 my $blk = &$blkprc($1, $3);
1625 my $key = block_id($blk);
1626 $g_html_blocks{$key} = $blk;
1627 "\n\n" . $key . "\n\n";
1628 }eigmx;
1632 # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
1634 $text =~ s{
1635 ( # save in $1
1636 ^ # start of line (with /m)
1637 (?:\Q$idt\E)? # optional lead in
1638 <($block_tags_b) # start tag = $2
1639 \b # word break
1640 (?:.*\n)*? # any number of lines, minimally matching
1641 .*</\2\s*> # the matching end tag
1642 [ ]* # trailing spaces
1643 (?=\n+|\Z) # followed by a newline or end of document
1646 my $blk = &$blkprc($1, $2);
1647 my $key = block_id($blk);
1648 $g_html_blocks{$key} = $blk;
1649 "\n\n" . $key . "\n\n";
1650 }eigmx;
1653 # Now match any empty block tags that should have been paired
1655 $text =~ s{
1656 ( # save in $1
1657 ^ # start of line (with /m)
1658 (?:\Q$idt\E)? # optional lead in
1659 <($block_tags_c) # start tag = $2
1660 \b # word break
1661 (?:[^<>])*? #
1662 /?> # the matching end tag
1663 [ ]* # trailing spaces
1664 (?=\n+|\Z) # followed by a newline or end of document
1667 my $key = block_id($1);
1668 $g_html_blocks{$key} = $1;
1669 "\n\n" . $key . "\n\n";
1670 }eigmx;
1672 # Special case just for <hr />. It was easier to make a special case than
1673 # to make the other regex more complicated.
1674 $text =~ s{
1676 (?<=\n) # Starting after end of line
1677 | # or
1678 \A # the beginning of the doc
1680 ( # save in $1
1681 [ ]{0,$less_than_indent}
1682 <(?:hr) # start tag
1683 \b # word break
1684 (?:[^<>])*? #
1685 /?> # the matching end tag
1686 [ ]*
1687 (?=\n{1,}|\Z) # followed by end of line or end of document
1690 my $key = block_id($1);
1691 $g_html_blocks{$key} = $1;
1692 "\n\n" . $key . "\n\n";
1693 }eigx;
1695 # Special case for standalone XML comments:
1696 $opt{stripcomments} != 2 &&
1697 $text =~ s{
1699 (?<=\n\n) # Starting after a blank line
1700 | # or
1701 \A\n? # the beginning of the doc
1703 ( # save in $1
1704 [ ]{0,$less_than_indent}
1705 (?s:
1706 <!--
1707 (?:[^-]|(?:-(?!-)))*
1710 (?:[ \t]*\n[ \t]*)?
1711 <!--
1712 (?:[^-]|(?:-(?!-)))*
1716 [ ]*
1717 (?=\n{1,}|\Z) # followed by end of line or end of document
1720 my $key = block_id($1);
1721 push(@g_xml_comments, $key)
1722 if $opt{stripcomments} && $opt{stripcomments} < 3 &&
1723 !exists($g_html_blocks{$key});
1724 $g_html_blocks{$key} = $1;
1725 "\n\n" . $key . "\n\n";
1726 }egx;
1728 # Special case for standalone XML-like comments:
1729 $opt{stripcomments} >= 2 &&
1730 $text =~ s{
1732 (?<=\n\n) # Starting after a blank line
1733 | # or
1734 \A\n? # the beginning of the doc
1736 ( # save in $1
1737 [ ]{0,$less_than_indent}
1738 (?s:
1739 <!--
1740 (?:[^-]|(?:-(?!->)))*
1743 (?:[ \t]*\n[ \t]*)?
1744 <!--
1745 (?:[^-]|(?:-(?!->)))*
1749 [ ]*
1750 (?=\n{1,}|\Z) # followed by end of line or end of document
1753 my $key = block_id($1);
1754 push(@g_xml_comments, $key) unless exists($g_html_blocks{$key});
1755 $g_html_blocks{$key} = $1;
1756 "\n\n" . $key . "\n\n";
1757 }egx;
1760 return $text;
1764 sub _RunBlockGamut {
1766 # These are all the transformations that form block-level
1767 # tags like paragraphs, headers, and list items.
1769 my ($text, $anchors) = @_;
1771 $text = _DoHeaders($text, $anchors);
1773 # Do Horizontal Rules:
1774 $text =~ s{^ {0,3}\*(?: {0,2}\*){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1775 $text =~ s{^ {0,3}\_(?: {0,2}\_){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1776 $text =~ s{^ {0,3}\-(?: {0,2}\-){2,}[ ]*$}{\n<hr$opt{empty_element_suffix}\n}gm;
1778 $text = _DoListsAndBlocks($text);
1780 $text = _DoTables($text);
1782 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
1783 # was to escape raw HTML in the original Markdown source. This time,
1784 # we're escaping the markup we've just created, so that we don't wrap
1785 # <p> tags around block-level tags.
1786 $text = _HashHTMLBlocks($text);
1788 $text = _FormParagraphs($text, $anchors);
1790 return $text;
1794 sub _DoBTListBlocks {
1795 return _DoBlockQuotes(_DoCodeBlocks(_HashBTCodeBlocks($_[0]))) if $_[0] ne "";
1799 sub _DoListBlocks {
1800 return _DoBlockQuotes(_DoCodeBlocks($_[0])) if $_[0] ne "";
1804 sub _RunSpanGamut {
1806 # These are all the transformations that occur *within* block-level
1807 # tags like paragraphs, headers, and list items.
1809 my $text = shift;
1811 $text = _DoCodeSpans($text);
1813 $text = _EscapeSpecialChars($text);
1815 # Process anchor and image tags. Images must come first,
1816 # because ![foo][f] looks like an anchor.
1817 $text = _DoImages($text);
1818 $text = _DoAnchors($text);
1820 # Make links out of things like `<http://example.com/>`
1821 # Must come after _DoAnchors(), because you can use < and >
1822 # delimiters in inline links like [this](<url>).
1823 $text = _DoAutoLinks($text);
1825 $text = _EncodeAmpsAndAngles($text);
1827 $text = _DoItalicsAndBoldAndStrike($text);
1829 # Do hard breaks:
1830 $text =~ s/ {3,}(\n|\z)/<br clear=\"all\"$opt{empty_element_suffix}$1/g;
1831 $text =~ s/ {2,}\n/<br$opt{empty_element_suffix}\n/g;
1832 $text =~ s/ ?\\\n/<br$opt{empty_element_suffix}\n/g;
1833 $text =~ s/ {2,}\z//g;
1835 return $text;
1839 sub _EscapeSpecialChars {
1840 my $text = shift;
1841 my $tokens ||= _TokenizeHTML($text);
1843 $text = ''; # rebuild $text from the tokens
1844 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
1845 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
1847 foreach my $cur_token (@$tokens) {
1848 if ($cur_token->[0] eq "tag") {
1849 # Within tags, encode *, _ and ~ so they don't conflict
1850 # with their use in Markdown for italics and strong.
1851 # We're replacing each such character with its
1852 # corresponding block id value; this is likely
1853 # overkill, but it should prevent us from colliding
1854 # with the escape values by accident.
1855 $cur_token->[1] =~ s!([*_~])!$g_escape_table{$1}!go;
1856 $text .= $cur_token->[1];
1857 } else {
1858 my $t = $cur_token->[1];
1859 $t = _EncodeBackslashEscapes($t);
1860 $text .= $t;
1863 return $text;
1867 sub _ProcessWikiLink {
1868 my ($link_text, $link_loc) = @_;
1869 if (defined($link_loc) &&
1870 ($link_loc =~ m{^#\S*$} || $link_loc =~ m{^(?:http|ftp)s?://\S+$}i)) {
1871 # Return the new link
1872 return _MakeATag(_FindFragmentMatch($link_loc), $link_text);
1874 if (!defined($link_loc)) {
1875 $link_loc = _RunSpanGamut($link_text);
1876 $link_loc = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($link_loc))));
1877 $link_loc =~ m{^(?:http|ftp)s?://\S+$}i and
1878 # Return the new link
1879 return _MakeATag($link_loc, $link_text);
1881 return undef if $link_loc eq "" || $link_text eq "";
1882 if ($link_loc =~ /^[A-Za-z][A-Za-z0-9+.-]*:/os) {
1883 # Unrecognized scheme
1884 return undef;
1886 if ($opt{wikipat}) {
1887 my $o = $opt{wikiopt};
1888 my $img_link = _strip($link_text);
1889 my $img = 0;
1890 my $qsfrag = "";
1891 my $base;
1892 my $imgopts = undef;
1893 if ($img_link =~ /^[^#?\s]+\.(?:png|gif|jpe?g|svgz?)$/i) {
1894 $base = _wxform($img_link, 1);
1895 $img = 1;
1896 $imgopts = _ParseWikiImgOpts($link_loc);
1897 $imgopts->{imgflag} = \$img;
1898 } else {
1899 $base = $link_loc;
1900 if ($link_loc =~ /^(.*?)([?#].*)$/os) {
1901 ($base, $qsfrag) = ($1, $2);
1903 $base = _wxform($base);
1905 my $result = $opt{wikipat};
1906 $result =~ s/%\{\}.+$/%{}/os if $img;
1907 $result =~ s/%\{\}/$base/;
1908 if ($qsfrag =~ /^([^#]*)(#.+)$/os) {
1909 my ($q,$f) = ($1,$2);
1910 #$f = _wxform($f) if $f =~ / /;
1911 $qsfrag = $q . $f;
1913 $result .= $qsfrag;
1914 $result = &{$opt{wikifunc}}($result, \%opt, ($img?$img_link:$link_loc), $base, $qsfrag, $imgopts)
1915 if ref($opt{wikifunc}) eq 'CODE';
1917 use bytes;
1918 $result =~ s/%(?![0-9A-Fa-f]{2})/%25/sog;
1919 if ($o->{r}) {
1920 $result =~
1921 s/([\x00-\x1F <>"{}|\\^`x7F])/sprintf("%%%02X",ord($1))/soge;
1922 } else {
1923 $result =~
1924 s/([\x00-\x1F <>"{}|\\^`\x7F-\xFF])/sprintf("%%%02X",ord($1))/soge;
1926 $result =~ s/(%(?![0-9A-F]{2})[0-9A-Fa-f]{2})/uc($1)/soge;
1928 # Return the new link
1929 return $img ? _MakeIMGTag($result, undef, undef, $imgopts) : _MakeATag($result, $link_text);
1931 # leave it alone
1932 return undef;
1936 sub _ParseWikiImgOpts {
1937 my $alts = shift;
1938 my %o = ();
1939 # alt= consumes the rest of the line, do it first
1940 if ($alts =~ /(?:^|,)\s*alt\s*=\s*(.*)$/ios) {
1941 my $atext = $1;
1942 $alts = substr($alts, 0, $-[0]);
1943 $o{alt} = _strip($atext);
1945 foreach my $kv (split(/\s*,\s*/, lc($alts))) {
1946 if ($kv =~ /^\s*([^\s]+)\s*=\s*([^\s]+)\s*$/os) {
1947 my ($k, $v) = ($1, $2);
1948 if (($k eq "width" || $k eq "height") && $v =~ /^\d+$/) {
1949 $o{$k} = 0+$v if $v > 0;
1950 next;
1952 if ($k eq "align" && ($v eq "left" || $v eq "right" || $v eq "center")) {
1953 $o{$k} = $v;
1954 next;
1958 return \%o;
1962 sub _wxform {
1963 my ($w, $img) = @_;
1964 my $o = $opt{wikiopt};
1965 my $opt_s = $o->{s};
1966 if (!$img && $opt_s) {
1967 if (ref($opt_s)) {
1968 if ($w =~ m{^(.*)[.]([^./]*)$}) {
1969 my ($base, $ext) = ($1, $2);
1970 $w = $base if $opt_s->{lc($ext)};
1972 } else {
1973 $w =~ s{[.][^./]*$}{};
1976 $w = uc($w) if $o->{u};
1977 $w = lc($w) if $o->{l};
1978 $w =~ s{/+}{%252F}gos if $o->{"%"};
1979 $w =~ s/ +/%20/gos if $o->{b};
1980 $w =~ tr{/}{ } if $o->{f};
1981 $w =~ s{/+}{/}gos if !$o->{f} && !$o->{v};
1982 if ($o->{d}) {
1983 $w =~ tr{ }{-};
1984 $w =~ s/-+/-/gos unless $o->{v};
1985 } else {
1986 $w =~ tr{ }{_};
1987 $w =~ s/_+/_/gos unless $o->{v};
1989 return $w;
1993 # Return a suitably encoded <a...> tag string
1994 # On input NONE of $url, $text or $title should be xmlencoded
1995 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
1996 sub _MakeATag {
1997 my ($url, $text, $title) = @_;
1998 defined($url) or $url="";
1999 defined($text) or $text="";
2000 defined($title) or $title="";
2002 $url =~ m"^#" && ref($opt{base_prefix}) eq 'CODE' and $url = &{$opt{base_prefix}}($url);
2003 my $result = $g_escape_table{'<'}."a href=\"" . _EncodeAttText($url) . "\"";
2004 $title = _strip($title);
2005 $text =~ s{<(/?a)}{&lt;$1}sogi;
2006 $text = _DoItalicsAndBoldAndStrike($text);
2007 # We've got to encode any of these remaining to avoid
2008 # conflicting with other italics, bold and strike through and links.
2009 $text =~ s!([]*_~[])!$g_escape_table{$1}!go;
2010 $result .= " title=\"" . _EncodeAttText($title) . "\"" if $title ne "";
2011 return $result . $g_escape_table{'>'} .
2012 $text . $g_escape_table{'<'}."/a".$g_escape_table{'>'};
2016 sub _DoAnchors {
2018 # Turn Markdown link shortcuts into XHTML <a> tags.
2020 my $text = shift;
2023 # First, handle wiki-style links: [[wiki style link]]
2025 $text =~ s{
2026 ( # wrap whole match in $1
2027 \[\[
2028 ($g_nested_brackets) # link text and id = $2
2029 \]\]
2032 my $result;
2033 my $whole_match = $1;
2034 my $link_text = $2;
2035 my $link_loc = undef;
2037 if ($link_text =~ /^(.*)\|(.*)$/s) {
2038 $link_text = $1;
2039 $link_loc = _strip($2);
2042 $result = _ProcessWikiLink($link_text, $link_loc);
2043 defined($result) or $result = $whole_match;
2044 $result;
2045 }xsge;
2048 # Next, handle reference-style links: [link text] [id]
2050 $text =~ s{
2051 ( # wrap whole match in $1
2053 ($g_nested_brackets) # link text = $2
2056 [ ]? # one optional space
2057 (?:\n[ ]*)? # one optional newline followed by spaces
2060 ($g_nested_brackets) # id = $3
2064 my $result;
2065 my $whole_match = $1;
2066 my $link_text = $2;
2067 my $link_id = $3;
2069 if ($link_id eq "") {
2070 # for shortcut links like [this][].
2071 $link_id = _RunSpanGamut($link_text);
2072 $link_id = unescapeXML(_StripTags(_UnescapeSpecialChars($link_id)));
2074 $link_id = _strip(lc $link_id);
2076 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
2077 my $url = $g_urls{$link_id};
2078 defined($url) or $url = $g_anchors{$link_id};
2079 $url = _FindFragmentMatch($url);
2080 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
2081 $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
2083 else {
2084 $result = $whole_match;
2086 $result;
2087 }xsge;
2090 # Subsequently, inline-style links: [link text](url "optional title")
2092 $text =~ s{
2093 ( # wrap whole match in $1
2095 ($g_nested_brackets) # link text = $2
2097 \( # literal paren
2098 ($g_nested_parens) # href and optional title = $3
2102 #my $result;
2103 my $whole_match = $1;
2104 my $link_text = $2;
2105 my ($url, $title) = _SplitUrlTitlePart($3);
2107 if (defined($url)) {
2108 $url = _FindFragmentMatch($url);
2109 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
2110 _MakeATag(_PrefixURL($url), $link_text, $title);
2111 } else {
2112 # The href/title part didn't match the pattern
2113 $whole_match;
2115 }xsge;
2118 # Finally, handle reference-style implicit shortcut links: [link text]
2120 $text =~ s{
2121 ( # wrap whole match in $1
2123 ($g_nested_brackets) # link text = $2
2127 my $result;
2128 my $whole_match = $1;
2129 my $link_text = $2;
2130 my $link_id = _RunSpanGamut($2);
2131 $link_id = _strip(lc(unescapeXML(_StripTags(_UnescapeSpecialChars($link_id)))));
2133 if (defined($g_urls{$link_id}) || defined($g_anchors{$link_id})) {
2134 my $url = $g_urls{$link_id};
2135 defined($url) or $url = $g_anchors{$link_id};
2136 $url = _FindFragmentMatch($url);
2137 $link_text = '[' . $link_text . ']' if $link_text =~ /^\d{1,3}$/;
2138 $result = _MakeATag(_PrefixURL($url), $link_text, $g_titles{$link_id});
2140 else {
2141 $result = $whole_match;
2143 $result;
2144 }xsge;
2146 return $text;
2150 sub _PeelWrapped {
2151 defined($_[0]) or return undef;
2152 if (substr($_[0],0,1) eq "(") {
2153 return substr($_[0], 1, length($_[0]) - (substr($_[0], -1, 1) eq ")" ? 2 : 1));
2155 return $_[0];
2159 sub _SplitUrlTitlePart {
2160 return ("", undef) if $_[0] =~ m{^\s*$}; # explicitly allowed
2161 my $u = $_[0];
2162 $u =~ s/^\s*(['\042])/# $1/;
2163 if ($u =~ m{
2164 ^ # match beginning
2165 \s*?
2166 <?([^\s'\042]\S*?)>? # URL = $1
2167 (?: # optional grouping
2168 \s+ # must be distinct from URL
2169 (['\042]?) # quote char = $2
2170 (.*?) # Title = $3
2171 \2? # matching quote
2172 )? # title is optional
2174 \z # match end
2175 }osx) {
2176 return (undef, undef) if $_[1] && ($1 eq "" || $1 eq "#");
2177 return (_PeelWrapped($1), $2 ? $3 : _PeelWrapped($3));
2178 } else {
2179 return (undef, undef);
2184 sub _FindFragmentMatchInternal {
2185 my ($anchors_id, $url, $undefifnomatch) = @_;
2186 if (defined($url) && $url =~ /^#\S/) {
2187 # try very hard to find a match
2188 my $idbase = _strip(lc(substr($url, 1)));
2189 my $idbase0 = $idbase;
2190 my $id = _MakeAnchorId($idbase);
2191 $undefifnomatch and $url = undef;
2192 if (defined($$anchors_id{$id})) {
2193 $url = $$anchors_id{$id};
2194 } else {
2195 $idbase =~ s/-/_/gs;
2196 $id = _MakeAnchorId($idbase);
2197 if (defined($$anchors_id{$id})) {
2198 $url = $$anchors_id{$id};
2199 } else {
2200 $id = _MakeAnchorId($idbase0, 1);
2201 if (defined($$anchors_id{$id})) {
2202 $url = $$anchors_id{$id};
2203 } else {
2204 $id = _MakeAnchorId($idbase, 1);
2205 if (defined($$anchors_id{$id})) {
2206 $url = $$anchors_id{$id};
2212 return $url;
2216 sub _FindFragmentMatch {
2217 return _FindFragmentMatchInternal(\%g_anchors_id, @_);
2221 sub _ToUTF8 {
2222 my $input = shift;
2223 my $output;
2224 if (Encode::is_utf8($input) || utf8::decode($input)) {
2225 $output = $input;
2226 } else {
2227 $output = $encoder->decode($input, Encode::FB_DEFAULT);
2229 return $output;
2233 # $_[0] -> HASH ref of anchors (e.g. the "anchors" OUTPUT from Markdown)
2234 # $_[1] -> fragment to resolve, may optionally start with '#'
2235 # An empty string ("") or hash ("#") is returned as-is.
2236 # returns undef if no match otherwise resolved fragment name
2237 # which will start with a '#' if $_[1] started with '#' otherwise will not.
2238 # This function can be used to connect up links to "implicit" anchors.
2239 # All Markdown-format H1-H6 headers have an implicit anchor added
2240 # based on the header item text. Passing that text to this function
2241 # will cough up the matching implicit anchor if there is one.
2242 sub ResolveFragment
2244 my ($anchors, $frag) = @_;
2245 defined($frag) or return undef;
2246 $frag eq "" || $frag eq "#" and return $frag;
2247 my $hadhash = ($frag =~ s/^#//);
2248 $frag =~ /^\S/ or return undef;
2249 ref($anchors) eq 'HASH' or return undef;
2250 my $ans = _FindFragmentMatchInternal($anchors, '#'._ToUTF8($frag), 1);
2251 $hadhash || !defined($ans) or $ans =~ s/^#//;
2252 defined($ans) and utf8::encode($ans);
2253 return $ans;
2257 # Return a suitably encoded <img...> tag string
2258 # On input NONE of $url, $alt or $title should be xmlencoded
2259 # but $url should already be url-encoded if needed, but NOT g_escape_table'd
2260 sub _MakeIMGTag {
2261 my ($url, $alt, $title, $iopts) = @_;
2262 defined($url) or $url="";
2263 defined($alt) or $alt="";
2264 defined($title) or $title="";
2265 ref($iopts) eq "HASH" or $iopts = {};
2266 return "" unless $url ne "";
2268 my ($w, $h, $lf, $rt) = (0, 0, '', '');
2269 ($alt, $title) = (_strip($alt), _strip($title));
2270 if ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
2271 ($title, $w, $h, $lf, $rt) = (_strip($1), $3, $4, $2, $5);
2272 } elsif ($title =~ /^(.*)\((<?)\?[xX\xd7]([1-9][0-9]*)(>?)\)$/os) {
2273 ($title, $h, $lf, $rt) = (_strip($1), $3, $2, $4);
2274 } elsif ($title =~ /^(.*)\((<?)([1-9][0-9]*)[xX\xd7]\?(>?)\)$/os) {
2275 ($title, $w, $lf, $rt) = (_strip($1), $3, $2, $4);
2276 } elsif ($title =~ /^(.*)\((?!\))(<?)(>?)\)$/os) {
2277 ($title, $lf, $rt) = (_strip($1), $2, $3);
2279 $iopts->{align} = "center" if $lf && $rt;
2280 $iopts->{align} = "left" if $lf && !$rt;
2281 $iopts->{align} = "right" if !$lf && $rt;
2282 $iopts->{width} = $w if $w != 0;
2283 $iopts->{height} = $h if $h != 0;
2284 $iopts->{alt} = $alt if $alt ne "";
2285 $iopts->{title} = $title if $title ne "";
2286 my $iopt = sub { defined($iopts->{$_[0]}) ? $iopts->{$_[0]} : (@_ > 1 ? $_[1] : "") };
2287 my $result = '';
2288 $result .= $g_escape_table{'<'}."center".$g_escape_table{'>'}
2289 if &$iopt("align") eq "center";
2290 $result .= $g_escape_table{'<'}."img src=\"" . _EncodeAttText($url) . "\"";
2291 $result .= " align=\"left\"" if &$iopt("align") eq "left";
2292 $result .= " align=\"right\"" if &$iopt("align") eq "right";
2293 $result .= " alt=\"" . _EncodeAttText($iopts->{alt}) . "\"" if &$iopt("alt") ne "";
2294 $result .= " width=\"" . $iopts->{width} . "\"" if &$iopt("width",0) != 0;
2295 $result .= " height=\"" . $iopts->{height} . "\"" if &$iopt("height",0) != 0;
2296 $result .= " title=\"" . _EncodeAttText($iopts->{title}) . "\"" if &$iopt("title") ne "";
2297 $result .= " /" unless $opt{empty_element_suffix} eq ">";
2298 $result .= $g_escape_table{'>'};
2299 $result .= $g_escape_table{'<'}."/center".$g_escape_table{'>'}
2300 if &$iopt("align") eq "center";
2301 return $result;
2305 sub _DoImages {
2307 # Turn Markdown image shortcuts into <img> tags.
2309 my $text = shift;
2312 # First, handle reference-style labeled images: ![alt text][id]
2314 $text =~ s{
2315 ( # wrap whole match in $1
2317 ($g_nested_brackets) # alt text = $2
2320 [ ]? # one optional space
2321 (?:\n[ ]*)? # one optional newline followed by spaces
2324 ($g_nested_brackets) # id = $3
2329 my $result;
2330 my $whole_match = $1;
2331 my $alt_text = $2;
2332 my $link_id = $3;
2334 $link_id ne "" or $link_id = $alt_text; # for shortcut links like ![this][].
2335 $link_id = _strip(lc $link_id);
2337 if (defined $g_urls{$link_id}) {
2338 $result = _MakeIMGTag(
2339 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
2341 else {
2342 # If there's no such link ID, leave intact:
2343 $result = $whole_match;
2346 $result;
2347 }xsge;
2350 # Next, handle inline images: ![alt text](url "optional title")
2351 # Don't forget: encode * and _
2353 $text =~ s{
2354 ( # wrap whole match in $1
2356 ($g_nested_brackets) # alt text = $2
2358 \( # literal paren
2359 ($g_nested_parens) # src and optional title = $3
2363 my $whole_match = $1;
2364 my $alt_text = $2;
2365 my ($url, $title) = _SplitUrlTitlePart($3, 1);
2366 defined($url) ? _MakeIMGTag(_PrefixURL($url), $alt_text, $title) : $whole_match;
2367 }xsge;
2370 # Finally, handle reference-style implicitly labeled links: ![alt text]
2372 $text =~ s{
2373 ( # wrap whole match in $1
2375 ($g_nested_brackets) # alt text = $2
2379 my $result;
2380 my $whole_match = $1;
2381 my $alt_text = $2;
2382 my $link_id = lc(_strip($alt_text));
2384 if (defined $g_urls{$link_id}) {
2385 $result = _MakeIMGTag(
2386 _PrefixURL($g_urls{$link_id}), $alt_text, $g_titles{$link_id});
2388 else {
2389 # If there's no such link ID, leave intact:
2390 $result = $whole_match;
2393 $result;
2394 }xsge;
2396 return $text;
2399 sub _EncodeAttText {
2400 my $text = shift;
2401 defined($text) or return undef;
2402 $text = escapeXML(_strip($text));
2403 # We've got to encode these to avoid conflicting
2404 # with italics, bold and strike through.
2405 $text =~ s!([*_~:])!$g_escape_table{$1}!go;
2406 return $text;
2410 sub _MakeAnchorId {
2411 use bytes;
2412 my ($link, $strip) = @_;
2413 $link = lc($link);
2414 if ($strip) {
2415 $link =~ s/\s+/_/gs;
2416 $link =~ tr/-a-z0-9_//cd;
2417 } else {
2418 $link =~ tr/-a-z0-9_/_/cs;
2420 return '' unless $link ne '';
2421 $link = "_".$link."_";
2422 $link =~ s/__+/_/gs;
2423 $link = "_".md5_hex($link)."_" if length($link) > 66;
2424 return $link;
2428 sub _GetNewAnchorId {
2429 my $link = _strip(lc(shift));
2430 return '' if $link eq "" || defined($g_anchors{$link});
2431 my $id = _MakeAnchorId($link);
2432 return '' unless $id;
2433 $g_anchors{$link} = '#'.$id;
2434 $g_anchors_id{$id} = $g_anchors{$link};
2435 if ($id =~ /-/) {
2436 my $id2 = $id;
2437 $id2 =~ s/-/_/gs;
2438 $id2 =~ s/__+/_/gs;
2439 defined($g_anchors_id{$id2}) or $g_anchors_id{$id2} = $g_anchors{$link};
2441 my $idd = _MakeAnchorId($link, 1);
2442 if ($idd) {
2443 defined($g_anchors_id{$idd}) or $g_anchors_id{$idd} = $g_anchors{$link};
2444 if ($idd =~ /-/) {
2445 my $idd2 = $idd;
2446 $idd2 =~ s/-/_/gs;
2447 $idd2 =~ s/__+/_/gs;
2448 defined($g_anchors_id{$idd2}) or $g_anchors_id{$idd2} = $g_anchors{$link};
2451 $id;
2455 sub _DoHeaders {
2456 my ($text, $anchors) = @_;
2457 my $h1;
2458 my $geth1 = $anchors && !defined($opt{h1}) ? sub {
2459 return unless !defined($h1);
2460 my $h = shift;
2461 $h1 = $h if $h ne "";
2462 } : sub {};
2464 # atx-style headers:
2465 # # Header 1
2466 # ## Header 2
2467 # ## Header 2 with closing hashes ##
2468 # ...
2469 # ###### Header 6
2471 $text =~ s{
2472 ^(\#{1,6}) # $1 = string of #'s
2473 [ ]*
2474 ((?:(?:(?<![#])[^\s]|[^#\s]).*?)?) # $2 = Header text
2475 [ ]*
2478 my $h_level = length($1);
2479 my $h = $2;
2480 $h =~ s/#+$//;
2481 $h =~ s/\s+$//;
2482 my $rsg = _RunSpanGamut($h);
2483 $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2484 my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2485 $id = " id=\"$id\"" if $id ne "";
2486 &$geth1($h) if $h_level == 1;
2487 "<h$h_level$id>" . _AutoHeaderFlag($h_level) . $rsg . "</h$h_level>\n\n";
2488 }egmx;
2490 # Setext-style headers:
2491 # Header 1
2492 # ========
2494 # Header 2
2495 # --------
2497 # Header 3
2498 # ~~~~~~~~
2500 $text =~ s{ ^(?:=+[ ]*\n)?[ ]*(.+?)[ ]*\n=+[ ]*\n+ }{
2501 my $h = $1;
2502 my $rsg = _RunSpanGamut($h);
2503 $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2504 my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2505 $id = " id=\"$id\"" if $id ne "";
2506 &$geth1($h);
2507 "<h1$id>" . _AutoHeaderFlag(1) . $rsg . "</h1>\n\n";
2508 }egmx;
2510 $text =~ s{ ^(?:-+[ ]*\n)?[ ]*(.+?)[ ]*\n-+[ ]*\n+ }{
2511 my $h = $1;
2512 my $rsg = _RunSpanGamut($h);
2513 $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2514 my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2515 $id = " id=\"$id\"" if $id ne "";
2516 "<h2$id>" . _AutoHeaderFlag(2) . $rsg . "</h2>\n\n";
2517 }egmx;
2519 $text =~ s{ ^(?:~+[ ]*\n)?[ ]*(.+?)[ ]*\n~+[ ]*\n+ }{
2520 my $h = $1;
2521 my $rsg = _RunSpanGamut($h);
2522 $h = _strip(unescapeXML(_StripTags(_UnescapeSpecialChars($rsg))));
2523 my $id = $h eq "" ? "" : _GetNewAnchorId($h);
2524 $id = " id=\"$id\"" if $id ne "";
2525 "<h3$id>" . _AutoHeaderFlag(3) . $rsg . "</h3>\n\n";
2526 }egmx;
2528 $opt{h1} = $h1 if defined($h1) && $h1 ne "";
2529 return $text;
2533 sub _AutoHeaderFlag {
2534 my $level = shift;
2535 my $auto = $opt{auto_number} || 0;
2536 return '' unless 1 <= $level && $level <= $auto;
2537 return "\34".chr(0x30+$level);
2541 sub _AutoHeaderNum {
2542 my $level = shift;
2543 my $auto = $opt{auto_number} || 0;
2544 return '' unless 1 <= $level && $level <= $auto;
2545 pop(@autonum) while @autonum > $level;
2546 push(@autonum, 1) while @autonum < $level - 1;
2547 $autonum[$level - 1] += 1;
2548 return join('.', @autonum).' ';
2552 my ($marker_ul, $marker_ol, $marker_any, $roman_numeral, $greek_lower);
2553 BEGIN {
2554 # Re-usable patterns to match list item bullets and number markers:
2555 $roman_numeral = qr/(?:
2556 [IiVvXx]|[Ii]{2,3}|[Ii][VvXx]|[VvXx][Ii]{1,3}|[Xx][Vv][Ii]{0,3}|
2557 [Xx][Ii][VvXx]|[Xx]{2}[Ii]{0,3}|[Xx]{2}[Ii]?[Vv]|[Xx]{2}[Vv][Ii]{1,2})/ox;
2558 $greek_lower = qr/(?:[\x{03b1}-\x{03c9}])/o;
2559 $marker_ul = qr/[*+-]/o;
2560 $marker_ol = qr/(?:\d+|[A-Za-z]|$roman_numeral|$greek_lower)[.\)]/o;
2561 $marker_any = qr/(?:$marker_ul|$marker_ol)/o;
2565 sub _GetListMarkerType {
2566 my ($list_type, $list_marker, $last_marker) = @_;
2567 return "" unless $list_type && $list_marker && lc($list_type) eq "ol";
2568 my $last_marker_type = '';
2569 $last_marker_type = _GetListMarkerType($list_type, $last_marker)
2570 if defined($last_marker) &&
2571 # these are roman unless $last_marker type case matches and is 'a' or 'A'
2572 $list_marker =~ /^[IiVvXx][.\)]?$/;
2573 return "I" if $list_marker =~ /^[IVX]/ && $last_marker_type ne 'A';
2574 return "i" if $list_marker =~ /^[ivx]/ && $last_marker_type ne 'a';
2575 return "A" if $list_marker =~ /^[A-Z]/;
2576 return "a" if $list_marker =~ /^[a-z]/ || $list_marker =~ /^$greek_lower/o;
2577 return "1";
2581 sub _GetListItemTypeClass {
2582 my ($list_type, $list_marker, $last_marker) = @_;
2583 my $list_marker_type = _GetListMarkerType($list_type, $list_marker, $last_marker);
2584 my $ans = &{sub{
2585 return "" unless length($list_marker) >= 2 && $list_marker_type =~ /^[IiAa1]$/;
2586 return "lower-greek" if $list_marker_type eq "a" && $list_marker =~ /^$greek_lower/o;
2587 return "" unless $list_marker =~ /\)$/;
2588 return "upper-roman" if $list_marker_type eq "I";
2589 return "lower-roman" if $list_marker_type eq "i";
2590 return "upper-alpha" if $list_marker_type eq "A";
2591 return "lower-alpha" if $list_marker_type eq "a";
2592 return "decimal";
2594 return ($list_marker_type, $ans);
2598 my %_roman_number_table;
2599 BEGIN {
2600 %_roman_number_table = (
2601 i => 1,
2602 ii => 2,
2603 iii => 3,
2604 iv => 4,
2605 v => 5,
2606 vi => 6,
2607 vii => 7,
2608 viii => 8,
2609 ix => 9,
2610 x => 10,
2611 xi => 11,
2612 xii => 12,
2613 xiii => 13,
2614 xiv => 14,
2615 xv => 15,
2616 xvi => 16,
2617 xvii => 17,
2618 xviii => 18,
2619 xix => 19,
2620 xx => 20,
2621 xxi => 21,
2622 xxii => 22,
2623 xxiii => 23,
2624 xxiv => 24,
2625 xxv => 25,
2626 xxvi => 26,
2627 xxvii => 27
2632 # Necessary because ς and σ are the same value grrr
2633 my %_greek_number_table;
2634 BEGIN {
2635 %_greek_number_table = (
2636 "\x{03b1}" => 1, # α
2637 "\x{03b2}" => 2, # β
2638 "\x{03b3}" => 3, # γ
2639 "\x{03b4}" => 4, # δ
2640 "\x{03b5}" => 5, # ε
2641 "\x{03b6}" => 6, # ζ
2642 "\x{03b7}" => 7, # η
2643 "\x{03b8}" => 8, # θ
2644 "\x{03b9}" => 9, # ι
2645 "\x{03ba}" => 10, # κ
2646 "\x{03bb}" => 11, # λ
2647 #"\x{00b5}"=> 12, # µ is "micro" not "mu"
2648 "\x{03bc}" => 12, # μ
2649 "\x{03bd}" => 13, # ν
2650 "\x{03be}" => 14, # ξ
2651 "\x{03bf}" => 15, # ο
2652 "\x{03c0}" => 16, # π
2653 "\x{03c1}" => 17, # ρ
2654 "\x{03c2}" => 18, # ς
2655 "\x{03c3}" => 18, # σ
2656 "\x{03c4}" => 19, # τ
2657 "\x{03c5}" => 20, # υ
2658 "\x{03c6}" => 21, # φ
2659 "\x{03c7}" => 22, # χ
2660 "\x{03c8}" => 23, # ψ
2661 "\x{03c9}" => 24 # ω
2666 sub _GetMarkerIntegerNum {
2667 my ($list_marker_type, $marker_val) = @_;
2668 my $ans = &{sub{
2669 return 0 + $marker_val if $list_marker_type eq "1";
2670 $list_marker_type = lc($list_marker_type);
2671 return $_greek_number_table{$marker_val}
2672 if $list_marker_type eq "a" &&
2673 defined($_greek_number_table{$marker_val});
2674 $marker_val = lc($marker_val);
2675 return ord($marker_val) - ord("a") + 1 if $list_marker_type eq "a";
2676 return 1 unless $list_marker_type eq "i";
2677 defined($_roman_number_table{$marker_val}) and
2678 return $_roman_number_table{$marker_val};
2679 return 1;
2681 return $ans if $ans == 0 && $list_marker_type eq "1";
2682 return $ans >= 1 ? $ans : 1;
2686 sub _IncrList {
2687 my ($from, $to, $extra) = @_;
2688 $extra = defined($extra) ? " $extra" : "";
2689 my $result = "";
2690 while ($from + 10 <= $to) {
2691 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-10\"></span>\n";
2692 $from += 10;
2694 while ($from + 5 <= $to) {
2695 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-5\"></span>\n";
2696 $from += 5;
2698 while ($from + 2 <= $to) {
2699 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr-2\"></span>\n";
2700 $from += 2;
2702 while ($from < $to) {
2703 $result .= "<span$extra class=\"$opt{style_prefix}ol-incr\"></span>\n";
2704 ++$from;
2706 return $result;
2710 sub _DoListsAndBlocks {
2712 # Form HTML ordered (numbered) and unordered (bulleted) lists.
2714 my $text = shift;
2715 my $indent = $opt{indent_width};
2716 my $less_than_indent = $indent - 1;
2717 my $less_than_double_indent = 2 * $indent - 1;
2719 # Re-usable pattern to match any entire ul or ol list:
2720 my $whole_list = qr{
2721 ( # $1 (or $_[0]) = whole list
2722 ( # $2 (or $_[1])
2723 (?:(?<=\n)|\A)
2724 [ ]{0,$less_than_indent}
2725 (${marker_any}) # $3 (or $_[2]) = first list item marker
2726 [ ]+
2728 (?s:.+?)
2729 ( # $4 (or $_[3])
2732 \n{2,}
2733 (?=\S)
2734 (?! # Negative lookahead for another list item marker
2735 ${marker_any}[ ]
2739 }mx;
2741 my $list_item_sub = sub {
2742 my $list = $_[0];
2743 my $list_type = ($_[2] =~ m/$marker_ul/) ? "ul" : "ol";
2744 my $list_att = "";
2745 my $list_class = "";
2746 my $list_incr = "";
2747 # Turn double returns into triple returns, so that we can make a
2748 # paragraph for the last item in a list, if necessary:
2749 $list =~ s/\n\n/\n\n\n/g;
2750 my ($result, $first_marker, $fancy) = _ProcessListItems($list_type, $list);
2751 defined($first_marker) or return $list;
2752 my $list_marker_type = _GetListMarkerType($list_type, $first_marker);
2753 if ($list_marker_type) {
2754 $first_marker =~ s/[.\)]$//;
2755 my $first_marker_num = _GetMarkerIntegerNum($list_marker_type, $first_marker);
2756 $list_att = $list_marker_type eq "1" ? "" : " type=\"$list_marker_type\"";
2757 if ($fancy) {
2758 $list_class = " class=\"$opt{style_prefix}ol\"";
2759 my $start = $first_marker_num;
2760 $start = 10 if $start > 10;
2761 $start = 5 if $start > 5 && $start < 10;
2762 $start = 1 if $start > 1 && $start < 5;
2763 $list_att .= " start=\"$start\"" unless $start == 1;
2764 $list_incr = _IncrList($start, $first_marker_num);
2765 } else {
2766 $list_class = " class=\"$opt{style_prefix}lc-greek\""
2767 if $list_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
2768 $list_att .= " start=\"$first_marker_num\"" unless $first_marker_num == 1;
2771 my $idt = "\027" x $g_list_level;
2772 $result = "$idt<$list_type$list_att$list_class>\n$list_incr" . $result . "$idt</$list_type>\n\n";
2773 $result;
2776 # We use a different prefix before nested lists than top-level lists.
2777 # See extended comment in _ProcessListItems().
2779 # Note: (jg) There's a bit of duplication here. My original implementation
2780 # created a scalar regex pattern as the conditional result of the test on
2781 # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
2782 # substitution once, using the scalar as the pattern. This worked,
2783 # everywhere except when running under MT on my hosting account at Pair
2784 # Networks. There, this caused all rebuilds to be killed by the reaper (or
2785 # perhaps they crashed, but that seems incredibly unlikely given that the
2786 # same script on the same server ran fine *except* under MT. I've spent
2787 # more time trying to figure out why this is happening than I'd like to
2788 # admit. My only guess, backed up by the fact that this workaround works,
2789 # is that Perl optimizes the substition when it can figure out that the
2790 # pattern will never change, and when this optimization isn't on, we run
2791 # afoul of the reaper. Thus, the slightly redundant code to that uses two
2792 # static s/// patterns rather than one conditional pattern.
2794 # Note: (kjm) With the addition of the two-of-the-same-kind-in-a-row-
2795 # starts-a-list-at-the-top-level rule the two patterns really are somewhat
2796 # different now, but the duplication has pretty much been eliminated via
2797 # use of a separate sub which has the side-effect of making the below
2798 # two cases much easier to grok all at once.
2800 if ($g_list_level) {
2801 my $parse = $text;
2802 $text = "";
2803 pos($parse) = 0;
2804 while ($parse =~ /\G(?s:.)*?^$whole_list/gmc) {
2805 my @captures = ($1, $2, $3, $4);
2806 if ($-[1] > $-[0]) {
2807 $text .= _DoBTListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
2809 $text .= &$list_item_sub(@captures);
2811 $text .= _DoBTListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
2813 else {
2814 my $parse = $text;
2815 $text = "";
2816 pos($parse) = 0;
2817 while ($parse =~ m{\G(?s:.)*?
2818 (?: (?<=\n\n) |
2819 \A\n? |
2820 (?<=:\n) |
2821 (?:(?<=\n) # a list starts with one unordered marker line
2822 (?=[ ]{0,$less_than_indent}$marker_ul[ ])) |
2823 (?:(?<=\n) # or two ordered marker lines in a row
2824 (?=[ ]{0,$less_than_indent}$marker_ol[ ].*\n\n?
2825 [ ]{0,$less_than_indent}$marker_ol[ ])) |
2826 (?:(?<=\n) # or any marker and a sublist marker
2827 (?=[ ]{0,$less_than_indent}$marker_any[ ].*\n\n?
2828 [ ]{$indent,$less_than_double_indent}$marker_any[ ]))
2830 $whole_list
2831 }gmcx) {
2832 my @captures = ($1, $2, $3, $4);
2833 if ($-[1] > $-[0]) {
2834 $text .= _DoListBlocks(substr($parse, $-[0], $-[1] - $-[0]));
2836 $text .= &$list_item_sub(@captures);
2838 $text .= _DoListBlocks(substr($parse, pos($parse))) if pos($parse) < length($parse);
2841 return $text;
2845 sub _ProcessListItems {
2847 # Process the contents of a single ordered or unordered list, splitting it
2848 # into individual list items.
2851 my $list_type = shift;
2852 my $list_str = shift;
2854 # The $g_list_level global keeps track of when we're inside a list.
2855 # Each time we enter a list, we increment it; when we leave a list,
2856 # we decrement. If it's zero, we're not in a list anymore.
2858 # We do this because when we're not inside a list, we want to treat
2859 # something like this:
2861 # I recommend upgrading to version
2862 # 8. Oops, now this line is treated
2863 # as a sub-list.
2865 # As a single paragraph, despite the fact that the second line starts
2866 # with a digit-period-space sequence.
2868 # Whereas when we're inside a list (or sub-list), that line will be
2869 # treated as the start of a sub-list. What a kludge, huh? This is
2870 # an aspect of Markdown's syntax that's hard to parse perfectly
2871 # without resorting to mind-reading. Perhaps the solution is to
2872 # change the syntax rules such that sub-lists must start with a
2873 # starting cardinal number; e.g. "1." or "a.".
2875 $g_list_level++;
2876 my $idt = "\027" x $g_list_level;
2877 my $marker_kind = $list_type eq "ul" ? $marker_ul : $marker_ol;
2878 my $first_marker;
2879 my $first_marker_type;
2880 my $first_marker_num;
2881 my $last_marker;
2882 my $fancy;
2883 my $skipped;
2884 my $typechanged;
2885 my $next_num = 1;
2887 # trim trailing blank lines:
2888 $list_str =~ s/\n{2,}\z/\n/;
2890 my $result = "";
2891 my $oldpos = 0;
2892 pos($list_str) = 0;
2893 while ($list_str =~ m{\G # start where we left off
2894 (\n+)? # leading line = $1
2895 (^[ ]*) # leading whitespace = $2
2896 ($marker_any) [ ] ([ ]*) # list marker = $3 leading item space = $4
2897 }cgmx) {
2898 my $leading_line = $1;
2899 my $leading_space = $2;
2900 my $list_marker = $3;
2901 my $list_marker_len = length($list_marker);
2902 my $leading_item_space = $4;
2903 if ($-[0] > $oldpos) {
2904 $result .= substr($list_str, $oldpos, $-[0] - $oldpos); # Sort-of $`
2905 $oldpos = $-[0]; # point at start of this entire match
2907 if (!defined($first_marker)) {
2908 $first_marker = $list_marker;
2909 $first_marker_type = _GetListMarkerType($list_type, $first_marker);
2910 if ($first_marker_type) {
2911 (my $marker_val = $first_marker) =~ s/[.\)]$//;
2912 $first_marker_num = _GetMarkerIntegerNum($first_marker_type, $marker_val);
2913 $next_num = $first_marker_num;
2914 $skipped = 1 if $next_num != 1;
2916 } elsif ($list_marker !~ /$marker_kind/) {
2917 # Wrong marker kind, "fix up" the marker to a correct "lazy" marker
2918 # But keep the old length in $list_marker_len
2919 $list_marker = $last_marker;
2922 # Now grab the rest of this item's data upto but excluding the next
2923 # list marker at the SAME indent level, but sublists must be INCLUDED
2925 my $item = "";
2926 while ($list_str =~ m{\G
2927 ((?:.+?)(?:\n{1,2})) # list item text = $1
2928 (?= \n* (?: \z | # end of string OR
2929 (^[ ]*) # leading whitespace = $2
2930 ($marker_any) # next list marker = $3
2931 ([ ]+) )) # one or more spaces after marker = $4
2932 }cgmxs) {
2934 # If $3 has a left edge that is at the left edge of the previous
2935 # marker OR $3 has a right edge that is at the right edge of the
2936 # previous marker then we stop; otherwise we go on
2938 $item .= substr($list_str, $-[0], $+[0] - $-[0]); # $&
2939 last if !defined($4) || length($2) == length($leading_space) ||
2940 length($2) + length($3) == length($leading_space) + $list_marker_len;
2941 # move along, you're not the marker droid we're looking for...
2942 $item .= substr($list_str, $+[0], $+[4] - $+[0]);
2943 pos($list_str) = $+[4]; # ...move along over the marker droid
2945 # Remember where we parked
2946 $oldpos = pos($list_str);
2948 # Process the $list_marker $item
2950 my $liatt = '';
2951 my $checkbox = '';
2952 my $incr = '';
2954 if ($list_type eq "ul" && !$leading_item_space && $item =~ /^\[([ xX\xd7])\] +(.*)$/s) {
2955 my $checkmark = $1;
2956 $item = $2;
2957 my ($checkbox_class, $checkbox_val);
2958 if ($checkmark ne " ") {
2959 ($checkbox_class, $checkbox_val) = ("checkbox-on", "x");
2960 } else {
2961 ($checkbox_class, $checkbox_val) = ("checkbox-off", "&#160;");
2963 $liatt = " class=\"$opt{style_prefix}$checkbox_class\"";
2964 $checkbox = "<span><span></span></span><span></span><span>[<tt>$checkbox_val</tt>]&#160;</span>";
2965 } else {
2966 my $list_marker_type;
2967 ($list_marker_type, $liatt) = _GetListItemTypeClass($list_type, $list_marker, $last_marker);
2968 if ($list_type eq "ol" && defined($first_marker)) {
2969 my $styled = $fancy = 1 if $liatt && $list_marker =~ /\)$/;
2970 my ($sfx, $dash) = ("", "");
2971 ($sfx, $dash) = ("li", "-") if $styled;
2972 if ($liatt =~ /lower/) {
2973 $sfx .= "${dash}lc";
2974 } elsif ($liatt =~ /upper/) {
2975 $sfx .= "${dash}uc";
2977 $sfx .= "-greek" if $liatt =~ /greek/;
2978 $liatt = " class=\"$opt{style_prefix}$sfx\"" if $sfx;
2979 $typechanged = 1 if $list_marker_type ne $first_marker_type;
2980 (my $marker_val = $list_marker) =~ s/[.\)]$//;
2981 my $marker_num = _GetMarkerIntegerNum($list_marker_type, $marker_val);
2982 $marker_num = $next_num if $marker_num < $next_num;
2983 $skipped = 1 if $next_num < $marker_num;
2984 $incr = _IncrList($next_num, $marker_num, "incrlevel=$g_list_level");
2985 $liatt = " value=\"$marker_num\"$liatt" if $fancy || $skipped;
2986 $liatt = " type=\"$list_marker_type\"$liatt" if $styled || $typechanged;
2987 $next_num = $marker_num + 1;
2990 $last_marker = $list_marker;
2992 if ($item =~ /^(.+)/) {
2993 my $ml_text = $1;
2994 my $ml_len = length($1);
2995 my $ml_sub = sub {my $ml_mk = shift; $ml_mk =~ s!([-+*.\)])!$g_escape_table{$1}!go; $ml_mk};
2996 $ml_text =~ s/(?:(?<= )|\A)(${marker_any})(?= )/&$ml_sub($1)/ge;
2997 $item = $ml_text . substr($item, $ml_len);
2999 if ($leading_line or ($item =~ m/\n{2,}/)) {
3000 $item = _RunBlockGamut(_Outdent($item));
3001 $item =~ s{(</[OUou][Ll]>)\s*\z}{$1} and $item .= "\n$idt<span style=\"display:none\">&#160;</span>";
3003 else {
3004 # Recursion for sub-lists:
3005 $item = _DoListsAndBlocks(_Outdent($item));
3006 chomp $item;
3007 $item = _RunSpanGamut($item);
3010 # Append to $result
3011 $result .= "$incr$idt<li$liatt>" . $checkbox . $item . "$idt</li>\n";
3013 if ($fancy) {
3014 # remove "incrlevel=$g_list_level " parts
3015 $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr((?:-\d{1,2})?)">}
3016 {$idt<span class="$opt{style_prefix}ol-incr$1">}g;
3017 } else {
3018 # remove the $g_list_level incr spans entirely
3019 $result =~ s{<span incrlevel=$g_list_level class="$opt{style_prefix}ol-incr(?:-\d{1,2})?"></span>\n}{}g;
3020 # remove the class="$opt{style_prefix}lc-greek" if first_marker is greek
3021 $result =~ s{(<li[^>]*?) class="$opt{style_prefix}lc-greek">}{$1>}g
3022 if defined($first_marker_type) && $first_marker_type eq "a" && $first_marker =~ /^$greek_lower/o;
3025 # Anything left over (similar to $') goes into result, but this should always be empty
3026 $result .= _RunBlockGamut(substr($list_str, pos($list_str))) if pos($list_str) < length($list_str);
3028 $g_list_level--;
3030 # After all that, if we only got an ordered list with a single item
3031 # and its first marker is a four-digit number >= 1492 and <= 2999
3032 # or an UPPERCASE letter, then pretend we didn't see any list at all.
3034 if ($first_marker_type && $first_marker_num + 1 == $next_num) {
3035 if (($first_marker_type eq "1" && $first_marker_num >= 1492 && $first_marker_num <= 2999) ||
3036 ($first_marker_type eq "A" && !$fancy)) {
3037 return (undef, undef, undef);
3041 return ($result, $first_marker, $fancy);
3045 sub _DoCodeBlocks {
3047 # Process Markdown `<pre><code>` blocks.
3050 my $text = shift;
3051 my $less_than_indent = $opt{indent_width} - 1;
3053 $text =~ s{
3054 (\n\n|\A\n?)
3055 ( # $2 = the code block -- one or more lines, starting with indent_width spaces
3057 (?:[ ]{$opt{indent_width}}) # Lines must start with indent_width of spaces
3058 .*\n+
3061 (?:(?=(^[ ]{0,$less_than_indent}\S.*))|\Z) # Lookahead for non-space at line-start, or end of doc
3062 }{&{sub{
3063 my ($prefix, $codeblock, $n) = ($1, $2, $3);
3065 if (defined($n) && length($n) && (()=($codeblock =~ /\n/g)) == 1 && _IsTableStart($codeblock.$n."\n")) {
3066 return $prefix.$codeblock;
3069 $codeblock =~ s/\n\n\n/\n\n/g; # undo "paragraph for last list item" change
3070 $codeblock = _EncodeCode(_Outdent($codeblock));
3071 $codeblock =~ s/\A\n+//; # trim leading newlines
3072 $codeblock =~ s/\s+\z//; # trim trailing whitespace
3074 my $result = "<div class=\"$opt{style_prefix}code\"><pre style=\"display:none\"></pre><pre><code>"
3075 . $codeblock . "\n</code></pre></div>";
3076 my $key = block_id($result, 2);
3077 $g_code_blocks{$key} = $result;
3078 "\n\n" . $key . "\n\n";
3079 }}}egmx;
3081 return $text;
3085 sub _DoCodeSpans {
3087 # * Backtick quotes are used for <code></code> spans.
3089 # * You can use multiple backticks as the delimiters if you want to
3090 # include literal backticks in the code span. So, this input:
3092 # Just type ``foo `bar` baz`` at the prompt.
3094 # Will translate to:
3096 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
3098 # There's no arbitrary limit to the number of backticks you
3099 # can use as delimters. If you need three consecutive backticks
3100 # in your code, use four for delimiters, etc.
3102 # * You can use spaces to get literal backticks at the edges:
3104 # ... type `` `bar` `` ...
3106 # Turns to:
3108 # ... type <code>`bar`</code> ...
3111 my $text = shift;
3113 $text =~ s@
3114 (`+) # $1 = Opening run of `
3115 (.+?) # $2 = The code block
3116 (?<!`)
3117 \1 # Matching closer
3118 (?!`)
3120 my $c = "$2";
3121 $c =~ s/^[ ]+//g; # leading whitespace
3122 $c =~ s/[ ]+$//g; # trailing whitespace
3123 $c = _EncodeCode($c);
3124 "<code>$c</code>";
3125 @egsx;
3127 return $text;
3131 sub _EncodeCode {
3133 # Encode/escape certain characters inside Markdown code runs.
3134 # The point is that in code, these characters are literals,
3135 # and lose their special Markdown meanings.
3137 local $_ = shift;
3139 # Encode all ampersands; HTML entities are not
3140 # entities within a Markdown code span.
3141 s/&/&amp;/g;
3143 # Encode $'s, but only if we're running under Blosxom.
3144 # (Blosxom interpolates Perl variables in article bodies.)
3145 s/\$/&#036;/g if $_haveBX;
3147 # Do the angle bracket song and dance:
3148 s! < !&lt;!gx;
3149 s! > !&gt;!gx;
3151 # Now, escape characters that are magic in Markdown:
3152 s!([*_~{}\[\]\\])!$g_escape_table{$1}!go;
3154 return $_;
3158 sub _DoItalicsAndBoldAndStrike {
3159 my $text = shift;
3161 my $doital1 = sub {
3162 my $text = shift;
3163 $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
3164 {<em>$1</em>}gsx;
3165 # We've got to encode any of these remaining to
3166 # avoid conflicting with other italics and bold.
3167 $text =~ s!([*])!$g_escape_table{$1}!go;
3168 $text;
3170 my $doital2 = sub {
3171 my $text = shift;
3172 $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
3173 {<em>$1</em>}gsx;
3174 # We've got to encode any of these remaining to
3175 # avoid conflicting with other italics and bold.
3176 $text =~ s!([_])!$g_escape_table{$1}!go;
3177 $text;
3180 # <strong> must go first:
3181 $text =~ s{ \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }
3182 {"<strong>".&$doital1($1)."</strong>"}gsex;
3183 $text =~ s{ (?<!\w) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\w) }
3184 {"<strong>".&$doital2($1)."</strong>"}gsex;
3186 $text =~ s{ ~~ (?=\S) (.+?[*_]*) (?<=\S) ~~ }
3187 {<strike>$1</strike>}gsx;
3189 $text =~ s{ \* (?=\S) (.+?) (?<=\S) \* }
3190 {<em>$1</em>}gsx;
3191 $text =~ s{ (?<!\w) _ (?=\S) (.+?) (?<=\S) _ (?!\w) }
3192 {<em>$1</em>}gsx;
3194 return $text;
3198 sub _DoBlockQuotes {
3199 my $text = shift;
3201 $text =~ s{
3202 ( # Wrap whole match in $1
3204 ^[ ]*>[ ]? # '>' at the start of a line
3205 .*\n # rest of the first line
3206 (.+\n)* # subsequent consecutive lines
3207 \n* # blanks
3211 my $bq = $1;
3212 $bq =~ s/^[ ]*>[ ]?//gm; # trim one level of quoting
3213 $bq =~ s/^[ ]+$//mg; # trim whitespace-only lines
3214 $bq = _RunBlockGamut($bq); # recurse
3216 $bq =~ s/^/\027/mg;
3217 "<blockquote>\n$bq\n</blockquote>\n\n";
3218 }egmx;
3221 return $text;
3225 my ($LEAD, $TRAIL, $LEADBAR, $LEADSP, $COLPL, $SEP);
3226 BEGIN {
3227 $LEAD = qr/(?>[ ]*(?:\|[ ]*)?)/o;
3228 $TRAIL = qr/[ ]*(?<!\\)\|[ ]*/o;
3229 $LEADBAR = qr/(?>[ ]*\|[ ]*)/o;
3230 $LEADSP = qr/(?>[ ]*)/o;
3231 $COLPL = qr/(?:[^\n|\\]|\\(?:(?>[^\n])|(?=\n|$)))+/o;
3232 $SEP = qr/[ ]*:?-+:?[ ]*/o;
3235 sub _IsTableStart {
3236 my $text = shift;
3237 my $ans = 0;
3239 if ($text =~ m{
3240 ^( # Header line
3241 $LEADBAR \| [^\n]* |
3242 $LEADBAR $COLPL [^\n]* |
3243 $LEADSP $COLPL \| [^\n]*
3245 ( # Separator line
3246 $LEADBAR $SEP (?: \| $SEP )* (?: \| [ ]*)? |
3247 $SEP (?: \| $SEP )+ (?: \| [ ]*)? |
3248 $SEP \| [ ]*
3250 }mx) {
3251 my ($h, $s) = ($1, $2);
3252 _SplitTableRow($h) == _SplitTableRow($s) and $ans = 1;
3255 return $ans;
3258 sub _DoTables {
3259 my $text = shift;
3261 $text =~ s{
3262 ( # Wrap whole thing to avoid $&
3263 (?: (?<=\n\n) | \A\n? ) # Preceded by blank line or beginning of string
3264 ^( # Header line
3265 $LEADBAR \| [^\n]* |
3266 $LEADBAR $COLPL [^\n]* |
3267 $LEADSP $COLPL \| [^\n]*
3269 ( # Separator line
3270 $LEADBAR $SEP (?: \| $SEP )* (?: \| [ ]*)? |
3271 $SEP (?: \| $SEP )+ (?: \| [ ]*)? |
3272 $SEP \| [ ]*
3274 ((?: # Rows (0+)
3275 $LEADBAR \| [^\n]* \n |
3276 $LEADBAR $COLPL [^\n]* \n |
3277 $LEADSP $COLPL \| [^\n]* \n
3281 my ($w, $h, $s, $rows) = ($1, $2, $3, $4);
3282 my @heads = _SplitTableRow($h);
3283 my @seps = _SplitTableRow($s);
3284 if (@heads == @seps) {
3285 my @align = map {
3286 if (/^:-+:$/) {" align=\"center\""}
3287 elsif (/^:/) {" align=\"left\""}
3288 elsif (/:$/) {" align=\"right\""}
3289 else {""}
3290 } @seps;
3291 my $nohdr = "";
3292 $nohdr = " $opt{style_prefix}table-nohdr" if join("", @heads) eq "";
3293 my $tab ="\n<table border=\"1\" cellspacing=\"0\" cellpadding=\"2\" class=\"$opt{style_prefix}table$nohdr\">\n";
3294 $tab .=
3295 " <tr class=\"$opt{style_prefix}row-hdr\">" . _MakeTableRow("th", \@align, @heads) . "</tr>\n"
3296 unless $nohdr;
3297 my $cnt = 0;
3298 my @classes = ("class=\"$opt{style_prefix}row-even\"", "class=\"$opt{style_prefix}row-odd\"");
3299 $tab .= " <tr " . $classes[++$cnt % 2] . ">" . _MakeTableRow("td", \@align, @$_) . "</tr>\n"
3300 foreach (_SplitMergeRows($rows));
3301 $tab .= "</table>\n\n";
3302 } else {
3305 }egmx;
3307 return $text;
3311 sub _SplitMergeRows {
3312 my @rows = ();
3313 my ($mergeprev, $mergenext) = (0,0);
3314 foreach (split(/\n/, $_[0])) {
3315 $mergeprev = $mergenext;
3316 $mergenext = 0;
3317 my @cols = _SplitTableRow($_);
3318 if (_endswithbareslash($cols[$#cols])) {
3319 my $last = $cols[$#cols];
3320 substr($last, -1, 1) = "";
3321 $last =~ s/[ ]+$//;
3322 $cols[$#cols] = $last;
3323 $mergenext = 1;
3325 if ($mergeprev) {
3326 for (my $i = 0; $i <= $#cols; ++$i) {
3327 my $cell = $rows[$#rows]->[$i];
3328 defined($cell) or $cell = "";
3329 $rows[$#rows]->[$i] = _MergeCells($cell, $cols[$i]);
3331 } else {
3332 push(@rows, [@cols]);
3335 return @rows;
3339 sub _endswithbareslash {
3340 return 0 unless substr($_[0], -1, 1) eq "\\";
3341 my @parts = split(/\\\\/, $_[0], -1);
3342 return substr($parts[$#parts], -1, 1) eq "\\";
3346 sub _MergeCells {
3347 my ($c1, $c2) = @_;
3348 return $c1 if $c2 eq "";
3349 return $c2 if $c1 eq "";
3350 return $c1 . " " . $c2;
3354 sub _SplitTableRow {
3355 my $row = shift;
3356 $row =~ s/^$LEAD//;
3357 $row =~ s/$TRAIL$//;
3358 $row =~ s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
3359 $row =~ s!\\\|!$g_escape_table{'|'}!go; # Then do \|
3360 my @elems = map {
3361 s!$g_escape_table{'|'}!|!go;
3362 s!$g_escape_table{'\\'}!\\\\!go;
3363 s/^[ ]+//;
3364 s/[ ]+$//;
3366 } split(/[ ]*\|[ ]*/, $row, -1);
3367 @elems or push(@elems, "");
3368 return @elems;
3372 sub _MakeTableRow {
3373 my $etype = shift;
3374 my $align = shift;
3375 my $row = "";
3376 for (my $i = 0; $i < @$align; ++$i) {
3377 my $data = $_[$i];
3378 defined($data) or $data = "";
3379 $row .= "<" . $etype . $$align[$i] . ">" .
3380 _RunSpanGamut($data) . "</" . $etype . ">";
3382 return $row;
3386 sub _FormParagraphs {
3388 # Params:
3389 # $text - string to process with html <p> tags
3391 my ($text, $anchors) = @_;
3393 # Strip leading and trailing lines:
3394 $text =~ s/\A\n+//;
3395 $text =~ s/\n+\z//;
3397 my @grafs = split(/\n{2,}/, $text);
3400 # Wrap <p> tags.
3402 foreach (@grafs) {
3403 unless (defined($g_html_blocks{$_}) || defined($g_code_blocks{$_})) {
3404 $_ = _RunSpanGamut($_);
3405 s/^([ ]*)/$g_start_p/;
3406 $_ .= $g_close_p;
3411 # Strip standalone XML comments if requested
3413 if ($anchors && $opt{stripcomments} && @g_xml_comments) {
3414 my %xml_comment = ();
3415 $xml_comment{$_} = 1 foreach @g_xml_comments;
3416 my @grafs2 = ();
3417 do { push(@grafs2, $_) unless $xml_comment{$_} } foreach @grafs;
3418 @grafs = @grafs2;
3422 # Unhashify HTML blocks
3424 foreach (@grafs) {
3425 if (defined( $g_html_blocks{$_} )) {
3426 $_ = $g_html_blocks{$_};
3430 return join "\n\n", @grafs;
3434 # %ok_tag_name declared previously
3435 my $g_possible_tag_name;
3436 BEGIN {
3437 # note: length("blockquote") == 10
3438 $g_possible_tag_name = qr/(?i:[a-z]{1,10}|h[1-6]|\020)/o;
3439 %ok_tag_name = map({$_ => 1} "\20", qw(
3440 a abbr acronym address area
3441 b basefont bdo big blockquote br
3442 caption center cite code col colgroup
3443 dd del dfn div dl dt
3445 font
3446 h1 h2 h3 h4 h5 h6 hr
3447 i img ins
3452 p pre
3454 s samp small span strike strong sub sup
3455 table tbody td tfoot th thead tr tt
3456 u ul
3459 $ok_tag_name{$_} = 0 foreach (qw(
3460 dir menu
3465 sub _SetAllowedTag {
3466 my ($tag, $forbid) = @_;
3467 $ok_tag_name{$tag} = $forbid ? 0 : 1
3468 if defined($tag) && exists($ok_tag_name{$tag});
3472 # Encode leading '<' of any non-tags
3473 # However, "<?", "<!" and "<$" are passed through (legacy on that "<$" thing)
3474 sub _DoTag {
3475 my $tag = shift;
3476 return $tag if $tag =~ /^<[?\$!]/;
3477 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3478 $ok_tag_name{lc($1)}) {
3480 return _ProcessURLTag("href", $tag, 1) if $tag =~ /^<a\s/i;
3481 return _ProcessURLTag("src", $tag) if $tag =~ /^<img\s/i;
3482 return $tag;
3484 $tag =~ s/^</&lt;/;
3485 return $tag;
3488 # Strip out all tags that _DoTag would match
3489 sub _StripTags {
3490 my $text = shift;
3491 my $_StripTag = sub {
3492 my $tag = shift;
3493 return $tag if $tag =~ /^<[?\$!]/;
3494 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} || $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3495 $ok_tag_name{lc($1)}) {
3497 return ""; # strip it out
3499 return $tag;
3501 $text =~ s{(<[^>]*>)}{&$_StripTag($1)}ige;
3502 return $text;
3505 my %univatt; # universally allowed attribute names
3506 my %tagatt; # per-element allowed attribute names
3507 my %tagmt; # empty element tags
3508 my %tagocl; # non-empty elements with optional closing tag
3509 my %tagacl; # which %tagocl an opening %tagocl will close
3510 my %tagblk; # block elements
3511 my %taginl; # inline markup tags which trigger an auto <p> reopen
3512 my %taga1p; # open tags which require at least one attribute
3513 my %lcattval; # names of attribute values to lowercase
3514 my %impatt; # names of "implied" attributes
3515 BEGIN {
3516 %univatt = map({$_ => 1} qw(class dir id lang style title xml:lang));
3517 %tagatt = (
3518 'a' => { map({$_ => 1} qw(href name rel target)) },
3519 'area' => { map({$_ => 1} qw(alt coords href nohref shape)) },
3520 'basefont' => { map({$_ => 1} qw(color face size)) },
3521 'br' => { map({$_ => 1} qw(clear)) },
3522 'caption' => { map({$_ => 1} qw(align)) },
3523 'col' => { map({$_ => 1} qw(align char charoff span width valign)) },
3524 'colgroup' => { map({$_ => 1} qw(align char charoff span width valign)) },
3525 'dir' => { map({$_ => 1} qw(compact)) },
3526 'div' => { map({$_ => 1} qw(align)) },
3527 'dl' => { map({$_ => 1} qw(compact)) },
3528 'font' => { map({$_ => 1} qw(color face size)) },
3529 'h1' => { map({$_ => 1} qw(align)) },
3530 'h2' => { map({$_ => 1} qw(align)) },
3531 'h3' => { map({$_ => 1} qw(align)) },
3532 'h4' => { map({$_ => 1} qw(align)) },
3533 'h5' => { map({$_ => 1} qw(align)) },
3534 'h6' => { map({$_ => 1} qw(align)) },
3535 'hr' => { map({$_ => 1} qw(align noshade size width)) },
3536 # NO server-side image maps, therefore NOT ismap !
3537 'img' => { map({$_ => 1} qw(align alt border height hspace src usemap vspace width)) },
3538 'li' => { map({$_ => 1} qw(compact type value)) },
3539 'map' => { map({$_ => 1} qw(name)) },
3540 'menu' => { map({$_ => 1} qw(compact)) },
3541 'ol' => { map({$_ => 1} qw(compact start type)) },
3542 'p' => { map({$_ => 1} qw(align)) },
3543 'pre' => { map({$_ => 1} qw(width)) },
3544 'table' => { map({$_ => 1} qw(align bgcolor border cellpadding cellspacing frame rules summary width)) },
3545 'tbody' => { map({$_ => 1} qw(align char charoff valign)) },
3546 'tfoot' => { map({$_ => 1} qw(align char charoff valign)) },
3547 'thead' => { map({$_ => 1} qw(align char charoff valign)) },
3548 'td' => { map({$_ => 1} qw(align bgcolor char charoff colspan height nowrap rowspan valign width)) },
3549 'th' => { map({$_ => 1} qw(align bgcolor char charoff colspan height nowrap rowspan valign width)) },
3550 'tr' => { map({$_ => 1} qw(align bgcolor char charoff valign)) },
3551 'ul' => { map({$_ => 1} qw(compact type)) }
3553 %tagmt = map({$_ => 1} qw(area basefont br col hr img));
3554 %tagocl = map({$_ => 1} qw(colgroup dd dt li p tbody td tfoot th thead tr));
3555 %tagacl = (
3556 'colgroup' => \%tagocl,
3557 'dd' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3558 'dt' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3559 'li' => { map({$_ => 1} qw(colgroup dd dt li p)) },
3560 'tbody' => \%tagocl,
3561 'td' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3562 'tfoot' => \%tagocl,
3563 'th' => { map({$_ => 1} qw(colgroup dd dt li p td th)) },
3564 'thead' => \%tagocl,
3565 'tr' => { map({$_ => 1} qw(colgroup dd dt li p td th tr)) },
3567 %tagblk = map({$_ => 1} qw(address blockquote center div dl h1 h2 h3 h4 h5 h6 hr ol p pre table ul));
3568 %taginl = map({$_ => 1} qw(a abbr acronym b basefont bdo big br cite code dfn em font i
3569 img kbd map q s samp small span strike strong sub sup tt u var));
3570 %impatt = map({$_ => 1} qw(checked compact ismap nohref noshade nowrap));
3571 %lcattval = map({$_ => 1} qw(
3572 align border cellpadding cellspacing checked clear color colspan
3573 compact coords height hspace ismap nohref noshade nowrap rowspan size
3574 span shape valign vspace width
3576 %taga1p = map({$_ => 1} qw(a area bdo img map));
3580 # _SanitizeTags
3582 # Inspect all '<'...'>' tags in the input and HTML encode those things
3583 # that cannot possibly be tags and at the same time sanitize them.
3585 # $1 => text to process
3586 # <= sanitized text
3587 sub _SanitizeTags {
3588 my ($text, $validate, $htmlauto) = @_;
3589 $text =~ s/\s+$//;
3590 $text ne "" or return "";
3591 my @stack = ();
3592 my $ans = "";
3593 my $end = length($text);
3594 pos($text) = 0;
3595 my ($autoclose, $autoclopen);
3596 my $lastmt = "";
3597 my $reopenp = 0;
3598 $autoclose = $htmlauto ? sub {
3599 my $s = $_[0] || "";
3600 while (@stack &&
3601 ($stack[$#stack]->[0] ne $s || $_[1] && !$stack[$#stack]->[2]) &&
3602 $tagocl{$stack[$#stack]->[0]}) {
3603 $ans .= "</" . $stack[$#stack]->[0] . ">";
3604 pop(@stack);
3606 } : sub {} if $validate;
3607 $autoclopen = $htmlauto ? sub {
3608 my $s = $_[0] || "";
3609 my $c;
3610 if ($tagblk{$s}) {$c = {p=>1}}
3611 elsif ($tagocl{$s}) {$c = $tagacl{$s}}
3612 else {return}
3613 my $clp = 0;
3614 while (@stack && $c->{$stack[$#stack]->[0]}) {
3615 $clp = 0;
3616 if ($stack[$#stack]->[2] &&
3617 $stack[$#stack]->[1]+3 eq $_[1]) {
3618 $ans .= "</\20>";
3619 } else {
3620 $ans .= "</" . $stack[$#stack]->[0] . ">";
3622 if ($stack[$#stack]->[2]) {
3623 $stack[$#stack]->[0] = "\20";
3624 } else {
3625 $clp = $s ne "p" && $stack[$#stack]->[0] eq "p";
3626 pop(@stack);
3629 $clp;
3630 } : sub {} if $validate;
3631 while (pos($text) < $end) {
3632 if ($text =~ /\G(\s+)/gc) {
3633 $ans .= $1;
3634 next;
3636 if ($text =~ /\G([^<]+)/gc) {
3637 if ($validate && @stack && $stack[$#stack]->[0] eq "\20") {
3638 push(@stack,["p",pos($text)-length($1)]);
3639 $reopenp = 0;
3640 $ans .= "<p>";
3642 $reopenp && do {
3643 push(@stack,["p",pos($text)-length($1)]);
3644 $reopenp = 0;
3645 $ans .= "<p>";
3647 $ans .= _EncodeAmps($1);
3648 $lastmt = "";
3649 next;
3651 my $tstart = pos($text);
3652 if ($opt{stripcomments} != 2 &&
3653 $text =~ /\G(<!--(?:[^-]|(?:-(?!-)))*-->)/gc) {
3654 # pass "comments" through unless stripping them
3655 if ($opt{stripcomments} && $opt{stripcomments} < 3) {
3656 # strip any trailing whitespace + \n after comment if present
3657 $text =~ /\G[ \t]*\n/gc;
3658 } else {
3659 # pass the "comment" on through
3660 $ans .= $1;
3662 next;
3664 if ($opt{stripcomments} >= 2 &&
3665 $text =~ /\G(<!--(?:[^-]|(?:-(?!->)))*-->)/gc) {
3666 # strip any trailing whitespace + \n after lax comment if present
3667 $text =~ /\G[ \t]*\n/gc;
3668 next;
3670 if ($text =~ /\G(<[^>]*>)/gc) {
3671 my $tag = $1;
3672 my $tt;
3673 if (($tag =~ m{^<($g_possible_tag_name)(?:[\s>]|/>$)} ||
3674 $tag =~ m{^</($g_possible_tag_name)\s*>}) &&
3675 $ok_tag_name{$tt=lc($1)})
3677 my ($stag, $styp, $autocloseflag) = _Sanitize($tag);
3678 if ($styp == 2 && $lastmt eq $tt) {
3679 $lastmt = "";
3680 next;
3682 $lastmt = $styp == -3 ? $tt : "";
3683 $tt = "p" if $autocloseflag;
3684 if ($validate && $styp) {
3685 my $clp = &$autoclopen($tt, $tstart) if $styp != 2;
3686 if ($styp == 1) {
3687 $reopenp && $taginl{$tt} and do {
3688 push(@stack,["p",$tstart]);
3689 $ans .= "<p>";
3691 push(@stack,[$tt,$tstart,$autocloseflag,$clp]);
3692 $reopenp = 0;
3693 } elsif ($styp == 2) {
3694 $reopenp && ($tt eq "p" || $tt eq "\20") and do {
3695 $reopenp = 0;
3696 next;
3698 &$autoclose($tt, $autocloseflag);
3699 my $mtstkchk = sub {
3700 !@stack and _xmlfail("closing tag $tt without matching open at " .
3701 _linecol($tstart, $text));
3703 &$mtstkchk;
3704 if ($autocloseflag && $stack[$#stack]->[0] eq "\20") {
3705 pop(@stack);
3706 $stag = "";
3707 } elsif ($stack[$#stack]->[0] eq $tt) {
3708 $stack[$#stack]->[3] and $reopenp = 1;
3709 pop(@stack);
3710 } else {
3711 pop(@stack) while @stack && $stack[$#stack]->[0] eq "\20";
3712 &$mtstkchk;
3713 my @i = @{$stack[$#stack]};
3714 _xmlfail("opening tag $i[0] at " . _linecol($i[1], $text) .
3715 " mismatch with closing tag $tt at " . _linecol($tstart, $text));
3719 $ans .= $stag;
3720 next;
3721 } else {
3722 $tag =~ s/^</&lt;/;
3723 $ans .= _EncodeAmps($tag);
3724 $lastmt = "";
3725 next;
3728 # can only get here if "\G" char is an unmatched "<"
3729 pos($text) += 1;
3730 $ans .= "&lt;";
3731 $lastmt = "";
3733 &$autoclose if $validate;
3734 if ($validate && @stack) {
3735 my @errs;
3736 my $j;
3737 for ($j = 0; $j <= $#stack; ++$j) {
3738 my @i = @{$stack[$j]};
3739 next if $i[0] eq "\20";
3740 unshift(@errs, "opening tag $i[0] without matching close at " .
3741 _linecol($i[1], $text));
3743 _xmlfail(@errs) unless !@errs;
3745 # Remove any unwanted extra leading <p></p> sections
3746 $ans =~ s{<p></\020>}{}gs if $validate;
3748 return $ans."\n";
3752 sub _linecol {
3753 my ($pos, $txt) = @_;
3754 pos($txt) = 0;
3755 my ($l, $p);
3756 $l = 1 + $opt{firstline};
3757 ++$l while ($p = pos($txt)), $txt =~ /\G[^\n]*\n/gc && pos($txt) <= $pos;
3758 return "line $l col " . (1 + ($pos - $p));
3762 sub _xmlfail {
3763 die join("", map("$_\n", @_));
3767 sub _Sanitize {
3768 my $tag = shift;
3769 my $seenatt = {};
3770 if ($tag =~ m{^</}) {
3771 my $autocloseflag = undef;
3772 $autocloseflag = 1, $tag="</p>" if $tag eq "</\20>";
3773 $tag =~ tr/\t\n\f\r //d; # remove whitespace
3774 return (lc($tag),2,$autocloseflag);
3776 if ($tag =~ m{^<([^\s</>]+)\s+}gs) {
3777 my $tt = lc($1);
3778 my $autocloseflag = undef;
3779 $autocloseflag = 1, $tt="p" if $tt eq "\20";
3780 my $out = "<" . $tt . " ";
3781 my $ok = $tagatt{$tt};
3782 ref($ok) eq "HASH" or $ok = {};
3783 my $atc = 0;
3784 while ($tag =~ m{\G\s*([^\s\042\047</>=]+)((?>=)|\s*)}gcs) {
3785 my ($a,$s) = ($1, $2);
3786 if ($s eq "" && substr($tag, pos($tag), 1) =~ /^[\042\047]/) {
3787 # pretend the "=" sign wasn't overlooked
3788 $s = "=";
3790 if (substr($s,0,1) ne "=") {
3791 # it's one of "those" attributes (e.g. compact) or not
3792 # _SanitizeAtt will fix it up if it is
3793 $out .= _SanitizeAtt($a, '""', $ok, $seenatt, $tt);
3794 ++$atc;
3795 next;
3797 if ($tag =~ /\G([\042\047])((?:(?!\1)(?!<).)*)\1\s*/gcs) {
3798 $out .= _SanitizeAtt($a, $1.$2.$1, $ok, $seenatt, $tt);
3799 ++$atc;
3800 next;
3802 if ($tag =~ m{\G([\042\047])((?:(?!\1)(?![<>])(?!/>).)*)}gcs) {
3803 # what to do what to do what to do
3804 # trim trailing \s+ and magically add the missing quote
3805 my ($q, $v) = ($1, $2);
3806 $v =~ s/\s+$//;
3807 $out .= _SanitizeAtt($a, $q.$v.$q, $ok, $seenatt, $tt);
3808 ++$atc;
3809 next;
3811 if ($tag =~ m{\G([^\s<>]+)\s*}gcs) {
3812 # auto quote it
3813 my $v = $1;
3814 $v =~ s/\042/&quot;/go;
3815 $out .= _SanitizeAtt($a, '"'.$v.'"', $ok, $seenatt, $tt);
3816 ++$atc;
3817 next;
3819 # give it an empty value
3820 $out .= _SanitizeAtt($a, '""', $ok, $seenatt, $tt);
3821 ++$atc;
3823 my $sfx = substr($tag, pos($tag));
3824 $out =~ s/\s+$//;
3825 my $typ = 1;
3826 if ($tagmt{$tt}) {
3827 $typ = ($sfx =~ m,/>$,) ? 3 : -3;
3828 $out .= $opt{empty_element_suffix};
3829 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3830 } else {
3831 if ($sfx =~ m,/>$,) {
3832 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3833 $typ = 3;
3834 } else {
3835 return ("&lt;" . substr($tag,1), 0) if !$atc && $taga1p{$tt};
3837 $out .= ">";
3838 $out .= "</$tt>" if $typ == 3;
3840 return ($out,$typ,$autocloseflag);
3841 } elsif ($tag =~ /^<([^\s<\/>]+)/s) {
3842 my $tt = lc($1);
3843 return ("&lt;" . substr($tag,1), 0) if $taga1p{$tt};
3844 if ($tagmt{$tt}) {
3845 my $typ = ($tag =~ m,/>$,) ? 3 : -3;
3846 return ("<" . $tt . $opt{empty_element_suffix}, $typ);
3847 } elsif ($tag =~ m,/>$,) {
3848 return ("<" . $tt . "></" . $tt . ">", 3);
3849 } else {
3850 return ("<" . $tt . ">", 1) unless $tt eq "\20";
3851 return ("<p>", 1, 1);
3854 return (lc($tag),0);
3858 sub _SanitizeAtt {
3859 my $att = lc($_[0]);
3860 return "" unless $att =~ /^[_a-z:][_a-z:0-9.-]*$/; # no weirdo char att names
3861 return "" unless $univatt{$att} || $_[2]->{$att};
3862 return "" if $_[3]->{$att}; # no repeats
3863 $_[3]->{$att} = 1;
3864 $impatt{$att} and return $att."=".'"'.$att.'" ';
3865 (($_[4] eq "a" && $att eq "href") ||
3866 ($_[4] eq "img" && $att eq "src")) &&
3867 $_[1] =~ /^\s*[\047\042]\s*javascript:/io and do {
3868 $_[1] = '"#"';
3869 ref($opt{base_prefix}) eq 'CODE' and
3870 $_[1] = '"' . escapeXML(&{$opt{base_prefix}}("#")) . '"';
3872 if ($_[4] eq "a") {
3873 $att eq "target" and
3874 return $_[1] =~ /^([\042\047])\s*_blank\s*\1$/io ? 'target="_blank" ' : "";
3875 $att eq "rel" and
3876 return $_[1] =~ /^([\042\047])\s*nofollow\s*\1$/io ? 'rel="nofollow" ' : "";
3878 if ($lcattval{$att}) {
3879 return $att."="._SanitizeAttValue(lc($_[1]))." ";
3880 } else {
3881 my $satt = _SanitizeAttValue($_[1]);
3882 if (ref($opt{urlfunc}) eq 'CODE' &&
3883 (($_[4] eq "a" && $att eq "href") ||
3884 ($_[4] eq "img" && $att eq "src")) ) {
3885 my ($lq,$v,$rq);
3886 $lq = substr($satt, 0, 1);
3887 $rq = substr($satt, -1, 1);
3888 $v = unescapeXML(substr($satt, 1, length($satt)-2));
3889 my ($uhost, $upath, $uq, $uf) = SplitURL($v);
3890 $v = &{$opt{urlfunc}}($v, \%opt, $_[4], $uhost, $upath, $uq, $uf);
3891 $satt = $lq . escapeXML($v) . $rq;
3893 return $att."=".$satt." ";
3898 sub _SanitizeAttValue {
3899 my $v = shift;
3900 if ($v =~ /^([\042\047])(.*?)\1$/s) {
3901 return $1.escapeXML($2).$1;
3902 } else {
3903 return '"'.escapeXML($v).'"';
3908 sub _ProcessURLTag {
3909 my ($att, $tag, $dofrag) = @_;
3911 $att = lc($att) . "=";
3912 if ($tag =~ /^(<[^\s>]+\s+)/g) {
3913 my $out = $1;
3914 while ($tag =~ /\G([^\s\042\047<\/>=]+=)([\042\047])((?:(?!\2)(?!<).)*)(\2\s*)/gcs) {
3915 my ($p, $q, $v, $s) = ($1, $2, $3, $4);
3916 if (lc($p) eq $att && $v ne "") {
3917 if ($dofrag && $v =~ m"^#") {
3918 $v = _FindFragmentMatch($v);
3919 my $bpcr;
3920 if (ref($bpcr = $opt{base_prefix}) eq 'CODE') {
3921 $v = "\2\3" . &$bpcr($v);
3923 } else {
3924 $v = _PrefixURL($v);
3926 $v = _EncodeAttText($v);
3928 $out .= $p . $q . $v . $s;
3930 $out .= substr($tag, pos($tag));
3931 substr($out,0,1) = $g_escape_table{'<'};
3932 substr($out,-1,1) = $g_escape_table{'>'};
3933 return $out;
3936 return $tag;
3940 my $oops_entities;
3941 BEGIN { $oops_entities = qr/(?:lt|gt|amp|quot|apos|nbsp)/io; }
3944 # $_[0] => the value to XML escape
3945 # returns the XML escaped value
3946 # Encodes the five required entites (amp,lt,gt,quot,apos)
3947 # while preserving any pre-existing entities which means that
3948 # calling this repeatedly on already-escaped text should return
3949 # it unchanged (i.e. it's idempotent).
3951 sub escapeXML {
3952 my $text = shift;
3954 # Treat these accidents as though they had the needed ';'
3955 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3957 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
3958 # http://bumppo.net/projects/amputator/
3959 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
3961 # Remaining entities now
3962 $text =~ s/\042/&quot;/g;
3963 $text =~ s/\047/&#39;/g; # Some older browsers do not grok &apos;
3964 $text =~ s/</&lt;/g;
3965 $text =~ s/>/&gt;/g;
3967 return $text;
3971 # $_[0] => value to be unescaped
3972 # returns unescaped value
3973 # The five required XML entities (amp,lt,gt,quot,apos) plus nbsp
3974 # are decoded as well as decimal (#d+) and hexadecimal (#xh+).
3976 # While the escapeXML function tries to be idempotent when presented
3977 # with an already-escaped string, this function is NOT necessarily
3978 # idempotent when presented with an already decoded string unless it's
3979 # been decoded to the point there are no more recognizable entities left.
3980 # In other words given a string such as:
3982 # &amp;amp;amp;amp;
3984 # Each call will only decode one layer of escaping and it will take four
3985 # successive calls to finally end up with just "&".
3987 sub unescapeXML {
3988 my $text = shift;
3990 # Treat these accidents as though they had the needed ';'
3991 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
3993 $text =~ s/&[qQ][uU][oO][tT];/\042/gso;
3994 $text =~ s/&[aA][pP][oO][sS];/\047/gso;
3995 $text =~ s/&[gG][tT];/>/gso;
3996 $text =~ s/&[lL][tT];/</gso;
3997 $text =~ s/&[nN][bB][sS][pP];/&#160;/gso;
3998 $text =~ s{&([aA][mM][pP]|\#\d+|\#x[0-9a-fA-F]+);}{
3999 local $_=$1;
4000 lc($_) eq 'amp' ? '&' :
4001 /^#(\d+)$/ ? chr($1) :
4002 /^#[xX](.*)$/ ? chr(hex($1)) :
4004 }gsex;
4006 return $text;
4010 # $_[0] => the input URL to split
4011 # $_[1] => if a true value, call unescapeXML before splitting
4012 # returns array:
4013 # [0] => scheme, name+password, host, port ("" if not present)
4014 # [1] => path in url (starts with "/" if absolute otherwise relative)
4015 # [2] => query string ("" if not present otherwise starts with "?")
4016 # [3] => fragment ("" if not present otherwise starts with "#")
4017 # The returned value recovers the (possibly unescapeXML'd) input
4018 # string by simply concatenating the returned array elements.
4020 sub SplitURL {
4021 my ($url, $unesc) = @_;
4022 $unesc and $url = unescapeXML($url);
4023 my ($sh, $p, $q, $f) = ("", "", "", "");
4024 if ($url =~ m{^([A-Za-z][A-Za-z0-9.+-]*:)(//.*)$}os) {
4025 $sh = $1;
4026 $url = $2;
4028 if ($url =~ m{^(//[^/?#]*)((?:[/?#].*)?)$}os) {
4029 $sh .= $1;
4030 $url = $2;
4032 ($p, $q, $f) = $url =~ m{^([^?#]*)((?:[?][^#]*)?)((?:[#].*)?)$}os;
4033 return ($sh, $p, $q, $f);
4037 my $_replacesub;
4038 BEGIN { $_replacesub = sub {
4039 my $x = $named_character_entity{$_[1]};
4040 $x ? '&#'.$x.';' : $_[0];
4044 # $_[0] => the input text to process
4045 # returns text with all known named character entities replaced
4046 # with their equivalent numerical entity
4047 sub ConvertNamedCharacterEntities {
4048 use bytes;
4049 my $text = shift;
4050 defined($text) or return undef;
4051 $text =~ s/(&([A-Za-z]{3,8}[1-4]{0,2});)/&$_replacesub($1,$2)/goes;
4052 return $text;
4056 my $_usasciisub;
4057 BEGIN { $_usasciisub = sub {
4058 my $c = $_[0];
4059 my $o = ord($c);
4060 return ($o <= 999) ? (($o < 128) ? $c : "&#$o;") : sprintf("&#x%x;", $o);
4064 # $_[0] => the input text to process
4065 # returns text with non-US-ASCII characters replaced
4066 # with their equivalent numerical character entities,
4067 # but only if the input text has already been utf8::decode'd
4068 sub ConvertToASCII {
4069 my $text = shift;
4070 defined($text) or return undef;
4071 $text =~ s/([^\x00-\x7F])/&$_usasciisub($1)/goes;
4072 return $text;
4076 sub _EncodeAmps {
4077 my $text = shift;
4079 # Treat these accidents as though they had the needed ';'
4080 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
4082 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
4083 # http://bumppo.net/projects/amputator/
4084 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
4086 return $text;
4090 sub _EncodeAmpsAndAngles {
4091 # Smart processing for ampersands and angle brackets that need to be encoded.
4093 my $text = shift;
4095 # Treat these accidents as though they had the needed ';'
4096 $text =~ s/&($oops_entities)(?![A-Za-z0-9=;])/&$1;/go;
4098 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
4099 # http://bumppo.net/projects/amputator/
4100 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
4102 # Encode naked <'s
4103 $text =~ s{<(?![\020a-z/?\$!])}{&lt;}gi;
4104 $text =~ s{<(?=[^>]*$)}{&lt;}g;
4106 # Encode <'s that cannot possibly be a start or end tag
4107 $text =~ s{(<[^>]*>)}{_DoTag($1)}ige;
4109 return $text;
4113 sub _EncodeBackslashEscapes {
4115 # Parameter: String.
4116 # Returns: String after processing the following backslash escape sequences.
4118 local $_ = shift;
4120 s!\\\\!$g_escape_table{'\\'}!go; # Must process escaped backslashes first.
4121 s{\\([`*_~{}\[\]()>#+\-.!`])}{$g_escape_table{$1}}go;
4123 return $_;
4127 sub _DoAutoLinks {
4128 local $_ = shift;
4130 s{<((https?|ftps?):[^'\042>\s]+)>(?!\s*</a>)}{_MakeATag($1, "&lt;".$1."&gt;")}gise;
4132 # Email addresses: <address@domain.foo>
4135 (?:mailto:)?
4137 [-.\w]+
4139 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
4143 _EncodeEmailAddress(_UnescapeSpecialChars($1), "&#x3c;", "&#62;");
4144 }egix;
4146 # (kjm) I don't do "x" patterns
4147 s{(?:^|(?<=\s))((?:https?|ftps?)://(?:[-a-zA-Z0-9./?\&\%=_~!*;:\@+\$,\x23](?:(?<![.,:;])|(?=[^\s])))+)}
4148 {_MakeATag($1, $1)}soge;
4149 s{(?<![][])(?<!\] )\[RFC( ?)([0-9]{1,5})\](?![][])(?! \[)}
4150 {"["._MakeATag("https://tools.ietf.org/html/rfc$2", "RFC$1$2", "RFC $2")."]"}soge;
4152 return $_;
4156 sub _EncodeEmailAddress {
4158 # Input: an email address, e.g. "foo@example.com"
4160 # Output: the email address as a mailto link, with each character
4161 # of the address encoded as either a decimal or hex entity, in
4162 # the hopes of foiling most address harvesting spam bots. E.g.:
4164 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
4165 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
4166 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
4168 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
4169 # mailing list: <https://tinyurl.com/yu7ue>
4172 my ($addr, $prefix, $suffix) = @_;
4173 $prefix = "" unless defined($prefix);
4174 $suffix = "" unless defined($suffix);
4176 srand(unpack('N',md5($addr)));
4177 my @encode = (
4178 sub { '&#' . ord(shift) . ';' },
4179 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
4180 sub { shift },
4183 $addr = "mailto:" . $addr;
4185 $addr =~ s{(.)}{
4186 my $char = $1;
4187 if ( $char eq '@' ) {
4188 # this *must* be encoded. I insist.
4189 $char = $encode[int rand 1]->($char);
4190 } elsif ( $char ne ':' ) {
4191 # leave ':' alone (to spot mailto: later)
4192 my $r = rand;
4193 # roughly 10% raw, 45% hex, 45% dec
4194 $char = (
4195 $r > .9 ? $encode[2]->($char) :
4196 $r < .45 ? $encode[1]->($char) :
4197 $encode[0]->($char)
4200 $char;
4201 }gex;
4203 # strip the mailto: from the visible part
4204 (my $bareaddr = $addr) =~ s/^.+?://;
4205 $addr = _MakeATag("$addr", $prefix.$bareaddr.$suffix);
4207 return $addr;
4211 sub _UnescapeSpecialChars {
4213 # Swap back in all the special characters we've hidden.
4215 my $text = shift;
4217 while( my($char, $hash) = each(%g_escape_table) ) {
4218 $text =~ s/$hash/$char/g;
4220 return $text;
4224 sub _TokenizeHTML {
4226 # Parameter: String containing HTML markup.
4227 # Returns: Reference to an array of the tokens comprising the input
4228 # string. Each token is either a tag (possibly with nested,
4229 # tags contained therein, such as <a href="<MTFoo>">, or a
4230 # run of text between tags. Each element of the array is a
4231 # two-element array; the first is either 'tag' or 'text';
4232 # the second is the actual value.
4235 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
4236 # <https://web.archive.org/web/20041215155739/http://bradchoate.com/weblog/2002/07/27/mtregex>
4239 my $str = shift;
4240 my $pos = 0;
4241 my $len = length $str;
4242 my @tokens;
4244 my $depth = 6;
4245 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
4246 my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
4247 (?s: <\? .*? \?> ) | # processing instruction
4248 $nested_tags/iox; # nested tags
4250 while ($str =~ m/($match)/g) {
4251 my $whole_tag = $1;
4252 my $sec_start = pos $str;
4253 my $tag_start = $sec_start - length $whole_tag;
4254 if ($pos < $tag_start) {
4255 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
4257 push @tokens, ['tag', $whole_tag];
4258 $pos = pos $str;
4260 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
4261 \@tokens;
4265 sub _Outdent {
4267 # Remove one level of line-leading indent_width of spaces
4269 my $text = shift;
4271 $text =~ s/^ {1,$opt{indent_width}}//gm;
4272 return $text;
4276 # _DeTab
4278 # $1 => input text
4279 # $2 => optional tab width (default is $opt{tab_width})
4280 # $3 => leading spaces to strip off each line first (default is 0 aka none)
4281 # <= result with tabs expanded
4282 sub _DeTab {
4283 my $text = shift;
4284 my $ts = shift || $opt{tab_width};
4285 my $leadsp = shift || 0;
4286 my $spr = qr/^ {1,$leadsp}/ if $leadsp;
4287 pos($text) = 0;
4288 my $end = length($text);
4289 my $ans = "";
4290 while (pos($text) < $end) {
4291 my $line;
4292 if ($text =~ /\G(.*?\n)/gcs) {
4293 $line = $1;
4294 } else {
4295 $line = substr($text, pos($text));
4296 pos($text) = $end;
4298 $line =~ s/$spr// if $leadsp;
4299 # From the Perl camel book section "Fluent Perl" but modified a bit
4300 $line =~ s/(.*?)(\t+)/$1 . ' ' x (length($2) * $ts - length($1) % $ts)/ges;
4301 $ans .= $line;
4303 return $ans;
4307 sub _PrefixURL {
4309 # Add URL prefix if needed
4311 my $url = shift;
4312 $url =~ s/^\s+//;
4313 $url =~ s/\s+$//;
4314 $url = "#" unless $url ne "";
4316 return $url unless
4317 ref($opt{abs_prefix}) eq 'CODE' ||
4318 ref($opt{url_prefix}) eq 'CODE' ||
4319 ref($opt{img_prefix}) eq 'CODE' ;
4320 return $url if $url =~ m"^\002\003" || $url =~ m"^#" || $url =~ m,^//,;
4321 $url = &{$opt{abs_prefix}}($url) if $url =~ m,^/, && ref($opt{abs_prefix}) eq 'CODE';
4322 return $url if $url =~ /^[A-Za-z][A-Za-z0-9+.-]*:/ || $url =~ m,^//, ||
4323 ($opt{keepabs} && $url =~ m,^/,);
4324 my $cr = $opt{url_prefix};
4325 $cr = $opt{img_prefix}
4326 if ref($opt{img_prefix}) eq 'CODE' && $url =~ m"^[^#?]*\.(?:png|gif|jpe?g|svgz?)(?:[#?]|$)"i;
4327 return $url unless ref($cr) eq 'CODE';
4328 return "\2\3".&$cr(substr($url, 0, 1) eq '/' ? substr($url, 1) : $url);
4332 BEGIN {
4333 $g_style_sheet = <<'STYLESHEET';
4335 <style type="text/css">
4336 /* <![CDATA[ */
4338 /* Markdown.pl fancy style sheet
4339 ** Copyright (C) 2017,2018,2019,2020,2021 Kyle J. McKay.
4340 ** All rights reserved.
4342 ** Redistribution and use in source and binary forms, with or without
4343 ** modification, are permitted provided that the following conditions are met:
4345 ** 1. Redistributions of source code must retain the above copyright notice,
4346 ** this list of conditions and the following disclaimer.
4348 ** 2. Redistributions in binary form must reproduce the above copyright
4349 ** notice, this list of conditions and the following disclaimer in the
4350 ** documentation and/or other materials provided with the distribution.
4352 ** 3. Neither the name of the copyright holder nor the names of its
4353 ** contributors may be used to endorse or promote products derived from
4354 ** this software without specific prior written permission.
4356 ** THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
4357 ** AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4358 ** IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4359 ** ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
4360 ** LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
4361 ** CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
4362 ** SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
4363 ** INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
4364 ** CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
4365 ** ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
4366 ** POSSIBILITY OF SUCH DAMAGE.
4369 div.%(base)code-bt > pre, div.%(base)code > pre {
4370 margin: 0;
4371 padding: 0;
4372 overflow: auto;
4375 div.%(base)code-bt > pre > code, div.%(base)code > pre > code {
4376 display: inline-block;
4377 margin: 0;
4378 padding: 0.5em 0;
4379 border-top: thin dotted;
4380 border-bottom: thin dotted;
4383 table.%(base)table {
4384 margin-bottom: 0.5em;
4386 table.%(base)table, table.%(base)table th, table.%(base)table td {
4387 border-collapse: collapse;
4388 border-spacing: 0;
4389 border: thin solid;
4391 table.%(base)yaml-table {
4392 border-collapse: collapse;
4394 table.%(base)yaml-table * {
4395 border: thin solid;
4397 table.%(base)yaml-table th {
4398 text-align: center;
4400 table.%(base)yaml-table th, table.%(base)yaml-table td {
4401 padding-left: 0.5ex;
4402 padding-right: 0.5ex;
4405 ol.%(base)ol {
4406 counter-reset: %(base)item;
4408 ol.%(base)ol[start="0"] {
4409 counter-reset: %(base)item -1;
4411 ol.%(base)ol[start="5"] {
4412 counter-reset: %(base)item 4;
4414 ol.%(base)ol[start="10"] {
4415 counter-reset: %(base)item 9;
4417 ol.%(base)ol > span.%(base)ol-incr {
4418 counter-increment: %(base)item;
4420 ol.%(base)ol > span.%(base)ol-incr-2 {
4421 counter-increment: %(base)item 2;
4423 ol.%(base)ol > span.%(base)ol-incr-5 {
4424 counter-increment: %(base)item 5;
4426 ol.%(base)ol > span.%(base)ol-incr-10 {
4427 counter-increment: %(base)item 10;
4429 ol.%(base)lc-greek, li.%(base)lc-greek {
4430 list-style-type: lower-greek;
4432 ol.%(base)ol > li {
4433 counter-increment: %(base)item;
4435 ol.%(base)ol > li.%(base)li,
4436 ol.%(base)ol > li.%(base)li-lc,
4437 ol.%(base)ol > li.%(base)li-lc-greek,
4438 ol.%(base)ol > li.%(base)li-uc {
4439 list-style-type: none;
4440 display: block;
4442 ol.%(base)ol > li.%(base)li:before,
4443 ol.%(base)ol > li.%(base)li-lc:before,
4444 ol.%(base)ol > li.%(base)li-lc-greek:before,
4445 ol.%(base)ol > li.%(base)li-uc:before {
4446 position: absolute;
4447 text-align: right;
4448 white-space: nowrap;
4449 margin-left: -9ex;
4450 width: 9ex;
4452 ol.%(base)ol > li.%(base)li[type="1"]:before {
4453 content: counter(%(base)item, decimal) ")\A0 \A0 ";
4455 ol.%(base)ol > li.%(base)li-lc[type="i"]:before,
4456 ol.%(base)ol > li.%(base)li-lc[type="I"]:before {
4457 content: counter(%(base)item, lower-roman) ")\A0 \A0 ";
4459 ol.%(base)ol > li.%(base)li-uc[type="I"]:before,
4460 ol.%(base)ol > li.%(base)li-uc[type="i"]:before {
4461 content: counter(%(base)item, upper-roman) ")\A0 \A0 ";
4463 ol.%(base)ol > li.%(base)li-lc[type="a"]:before,
4464 ol.%(base)ol > li.%(base)li-lc[type="A"]:before {
4465 content: counter(%(base)item, lower-alpha) ")\A0 \A0 ";
4467 ol.%(base)ol > li.%(base)li-lc-greek[type="a"]:before,
4468 ol.%(base)ol > li.%(base)li-lc-greek[type="A"]:before {
4469 content: counter(%(base)item, lower-greek) ")\A0 \A0 ";
4471 ol.%(base)ol > li.%(base)li-uc[type="A"]:before,
4472 ol.%(base)ol > li.%(base)li-uc[type="a"]:before {
4473 content: counter(%(base)item, upper-alpha) ")\A0 \A0 ";
4476 li.%(base)checkbox-on,
4477 li.%(base)checkbox-off {
4478 list-style-type: none;
4479 display: block;
4481 li.%(base)checkbox-on > span:first-child + span + span,
4482 li.%(base)checkbox-off > span:first-child + span + span {
4483 position: absolute;
4484 clip: rect(0,0,0,0);
4486 li.%(base)checkbox-on > span:first-child,
4487 li.%(base)checkbox-off > span:first-child,
4488 li.%(base)checkbox-on > span:first-child + span,
4489 li.%(base)checkbox-off > span:first-child + span {
4490 display: block;
4491 position: absolute;
4492 margin-left: -3ex;
4493 width: 1em;
4494 height: 1em;
4496 li.%(base)checkbox-on > span:first-child > span:first-child,
4497 li.%(base)checkbox-off > span:first-child > span:first-child {
4498 display: block;
4499 position: absolute;
4500 left: 0.75pt; top: 0.75pt; right: 0.75pt; bottom: 0.75pt;
4502 li.%(base)checkbox-on > span:first-child > span:first-child:before,
4503 li.%(base)checkbox-off > span:first-child > span:first-child:before {
4504 display: inline-block;
4505 position: relative;
4506 right: 1pt;
4507 width: 100%;
4508 height: 100%;
4509 border: 1pt solid;
4510 content: "";
4512 li.%(base)checkbox-on > span:first-child + span:before {
4513 position: relative;
4514 left: 2pt;
4515 bottom: 1pt;
4516 font-size: 125%;
4517 line-height: 80%;
4518 vertical-align: text-top;
4519 content: "\2713";
4522 /* ]]> */
4523 </style>
4525 STYLESHEET
4526 $g_style_sheet =~ s/^\s+//g;
4527 $g_style_sheet =~ s/\s+$//g;
4528 $g_style_sheet .= "\n";
4533 __DATA__
4535 =head1 NAME
4537 Markdown.pl - convert Markdown format text files to HTML
4539 =head1 SYNOPSIS
4541 B<Markdown.pl> [B<--help>] [B<--html4tags>] [B<--htmlroot>=I<prefix>]
4542 [B<--imageroot>=I<prefix>] [B<--version>] [B<--shortversion>]
4543 [B<--tabwidth>=I<num>] [B<--stylesheet>] [B<--stub>] [--]
4544 [I<file>...]
4546 Options:
4547 -h show short usage help
4548 --help show long detailed help
4549 --html4tags use <br> instead of <br />
4550 --deprecated allow <dir> and <menu> tags
4551 --sanitize sanitize tag attributes
4552 --no-sanitize do not sanitize tag attributes
4553 --validate-xml check if output is valid XML
4554 --validate-xml-internal fast basic check if output is valid XML
4555 --no-validate-xml do not check output for valid XML
4556 --strip-comments remove XML-like comments from output
4557 --strip-comments-lax remove XML-like comments from output
4558 --strip-comments-strict remove only strictly valid XML comments
4559 --strip-comments-lax-only remove only invalid XML-like comments
4560 --no-strip-comments do not remove any XML/XML-like comments
4561 --tabwidth=num expand tabs to num instead of 8
4562 --auto-number automatically number h1-h6 headers
4563 -k | --keep-abs keep abspath URLs despite -r/-i
4564 -a prefix | --absroot=prefix append abspath URLs to prefix
4565 -b prefix | --base=prefix prepend prefix to fragment-only URLs
4566 -r prefix | --htmlroot=prefix append relative non-img URLs to prefix
4567 -i prefix | --imageroot=prefix append relative img URLs to prefix
4568 -w [wikipat] | --wiki[=wikipat] activate wiki links using wikipat
4569 --yaml[=(enable|disable|strip|...)] select YAML front matter processing
4570 -V | --version show version, authors, license
4571 and copyright
4572 -s | --shortversion show just the version number
4573 --raw | --raw-xml input contains only raw xhtml
4574 --raw-html input contains only raw html
4575 --div[=id] wrap body in div with given id
4576 --stylesheet output the fancy style sheet
4577 --no-stylesheet do not output fancy style sheet
4578 --keep-named-character-entities do not convert named character entities
4579 --us-ascii convert non-ASCII to character entities
4580 --stub wrap output in stub document
4581 implies --stylesheet
4582 -- end options and treat next
4583 argument as file
4585 =head1 DESCRIPTION
4587 Markdown is a text-to-HTML filter; it translates an easy-to-read /
4588 easy-to-write structured text format into HTML. Markdown's text format
4589 is most similar to that of plain text email, and supports features such
4590 as headers, *emphasis*, code blocks, blockquotes, and links.
4592 Markdown's syntax is designed not as a generic markup language, but
4593 specifically to serve as a front-end to (X)HTML. You can use span-level
4594 HTML tags anywhere in a Markdown document, and you can use block level
4595 HTML tags (like <div> and <table> as well).
4597 For more information about Markdown's syntax, see the F<basics.md>
4598 and F<syntax.md> files included with F<Markdown.pl>.
4600 Input (auto-detected) may be either ISO-8859-1 or UTF-8. Output is always
4601 converted to the UTF-8 character set.
4604 =head1 OPTIONS
4606 Use "--" to end switch parsing. For example, to open a file named "-z", use:
4608 Markdown.pl -- -z
4610 =over
4613 =item B<--html4tags>
4615 Use HTML 4 style for empty element tags, e.g.:
4617 <br>
4619 instead of Markdown's default XHTML style tags, e.g.:
4621 <br />
4623 This option is I<NOT compatible> with the B<--validate-xml> option
4624 and will produce an immediate error if both are given.
4627 =item B<--deprecated>
4629 Both "<dir>" and "<menu>" are normally taken as literal text and the leading
4630 "<" will be automatically escaped.
4632 If this option is used, they are recognized as valid tags and passed through
4633 without being escaped.
4635 When dealing with program argument descriptions "<dir>" can be particularly
4636 problematic therefore use of this option is not recommended.
4638 Other deprecated tags (such as "<font>" and "<center>" for example) continue
4639 to be recognized and passed through even without using this option.
4642 =item B<--sanitize>
4644 Removes troublesome tag attributes from embedded tags. Only a very strictly
4645 limited set of tag attributes will be permitted, other attributes will be
4646 silently discarded. The set of allowed attributes varies by tag.
4648 Splits empty minimized elements that are not one of the HTML allowed empty
4649 elements (C<area> C<basefont> C<br> C<col> C<hr> C<img>) into separate begin
4650 and end tags. For example, C<< <p/> >> or C<< <p /> >> will be split into
4651 C<< <p></p> >>.
4653 Combines adjacent (whitespace separated only) opening and closing tags for
4654 the same HTML empty element into a single minimized tag. For example,
4655 C<< <br></br> >> will become C<< <br /> >>.
4657 Tags that require at least one attribute to be present to be meaningful
4658 (e.g. C<a>, C<area>, C<img>, C<map>) but have none will be treated as non-tags
4659 potentially creating unexpected errors. For example, the sequence
4660 C<< <a>text here</a> >> will be sanitized to C<< &lt;a>text here</a> >> since
4661 an C<a> tag without any attributes is meaningless, but then the trailing
4662 close tag C<< </a> >> will become an error because it has no matching open
4663 C<< <a ...> >> tag.
4665 The point of this check is not to cause undue frustration, but to allow
4666 such constructs to be used as text without the need for escaping since they
4667 are meaningless as tags. For example, C<< <a><c><e> >> works just fine
4668 as plain text and so does C<< <A><C><E> >> because the
4669 C<< <a> >>/C<< <A> >> will be treated as a non-tag automatically. In fact,
4670 they can even appear inside links too such as
4671 C<< <a href="#somewhere">Link to <a><c><e> article</a> >>.
4673 Problematic C<&> characters are fixed up such as standalone C<&>s (or those not
4674 part of a valid entity reference) are turned into C<&amp;>. Within attribute
4675 values, single and double quotes are turned into C<&> entity refs.
4677 This is enabled by default.
4680 =item B<--no-sanitize>
4682 Do not sanitize tag attributes. This option does not allow any tags that would
4683 not be allowed without this option, but it does completely suppress the
4684 attribute sanitation process. If this option is specified, no attributes will
4685 be removed from any tag (although C<img> and C<a> tags will still be affected
4686 by B<--imageroot>, B<--htmlroot>, B<--absroot> and/or B<--base> options).
4687 Use of this option is I<NOT RECOMMENDED>.
4690 =item B<--validate-xml>
4692 Perform XML validation on the output before it's output and die if
4693 it fails validation. This requires the C<XML::Simple> or C<XML::Parser>
4694 module be present (one is only required if this option is given).
4696 Any errors are reported to STDERR and the exit status will be
4697 non-zero on XML validation failure. Note that all line and column
4698 numbers in the error output refer to the entire output that would
4699 have been produced. Re-run with B<--no-validate-xml> to see what's
4700 actually present at those line and column positions.
4702 If the B<--stub> option has also been given, then the entire output is
4703 validated as-is. Without the B<--stub> option, the output will be wrapped
4704 in C<< <div>...</div> >> for validation purposes but that extra "div" added
4705 for validation will not be added to the final output.
4707 This option is I<NOT enabled by default>.
4709 This option is I<NOT compatible> with the B<--html4tags> option and will
4710 produce an immediate error if both are given.
4713 =item B<--validate-xml-internal>
4715 Perform XML validation on the output before it's output and die if
4716 it fails validation. This uses a simple internal consistency checker
4717 that finds unmatched and mismatched open/close tags.
4719 Non-empty elements that in HTML have optional closing tags (C<colgroup>
4720 C<dd> C<dt> C<li> C<p> C<tbody> C<td> C<tfoot> C<th> C<thead> C<tr>)
4721 will automatically have any omitted end tags inserted during the
4722 `--validate-xml-internal` process.
4724 Any errors are reported to STDERR and the exit status will be
4725 non-zero on XML validation failure. Note that all line and column
4726 numbers in the error output refer to the entire output that would
4727 have been produced before sanitization without any B<--stub> or
4728 B<--stylesheet> options. Re-run with B<--no-sanitize> and
4729 B<--no-validate-xml> and I<without> any B<--stub> or B<--stylesheet>
4730 options to see what's actually present at those line and column
4731 positions.
4733 This option validates the output I<prior to> adding any requested
4734 B<--stub> or B<--stylesheet>. As the built-in stub and stylesheet
4735 have already been validated that speeds things up. The output is
4736 I<NOT> wrapped (in a C<< <div>...</div> >>) for validation as that's
4737 not required for the internal checker.
4739 This option is I<IS enabled by default> unless B<--no-sanitize> is
4740 active.
4742 This option is I<IS compatible> with the B<--html4tags> option.
4744 This option requires the B<--sanitize> option and will produce an
4745 immediate error if both B<--no-sanitize> and B<--validate-xml-internal>
4746 are given.
4748 Note that B<--validate-xml-internal> is I<MUCH faster> than
4749 B<--validate-xml> and I<does NOT> require any extra XML modules to
4750 be present.
4753 =item B<--no-validate-xml>
4755 Do not perform XML validation on the output. Markdown.pl itself will
4756 normally generate valid XML sequences (unless B<--html4tags> has been
4757 used). However, any raw tags in the input (that are on the "approved"
4758 list), could potentially result in invalid XML output (i.e. mismatched
4759 start and end tags, missing start or end tag etc.).
4761 Markdown.pl can check for these issues itself using its own internal
4762 B<--validate-xml-internal> check or, with the B<--validate-xml>
4763 option, it can use C<XML::Simple> or C<XML::Parser> to do so.
4765 Note that B<--validate-xml-internal> is the default option unless
4766 B<--no-sanitize> is used in which case B<--no-validate-xml> is the
4767 default option.
4770 =item B<--strip-comments>/B<--strip-comments-lax>
4772 (N.B. B<--strip-comments> is just a short form of B<--strip-comments-lax>)
4774 Strip XML and XML-like comments from the output. Any XML or XML-like
4775 comments encountered will be omitted from the output if either of these
4776 options is given.
4778 Unlike the B<--strip-comments-strict> option, these options I<will>
4779 strip any XML-like comments that contain internal double hyphen
4780 (i.e. C<-->) sequences.
4782 This option requires the B<--sanitize> option to be used (which is
4783 the default).
4785 If either of these options is given, it will supersede any previous
4786 B<--strip-comments-strict>, B<--strip-comments-lax-only> or
4787 B<--no-strip-comments> options.
4790 =item B<--strip-comments-strict>
4792 Strip only strictly XML standard compliant comments from the output.
4794 Note that the XML standard section 2.5 specifically prohibits
4795 a C<--> sequence within an XML comment (i.e. C<--> cannot occur after
4796 the comment start tag C<< <!-- >> unless it is immediately followed
4797 by C<< > >> which makes it the comment end tag C<< --> >>).
4799 In other words, S<C<< <!-- --> >>>, S<C<< <!-- - --> >>>, S<C<< <!----> >>>,
4800 and S<C<< <!--- --> >>> are all valid XML comments, but S<C<< <!-----> >>>
4801 and S<C<< <!-- ---> >>> are not!
4803 As part of the "sanitation" process (triggered by the B<--sanitize>
4804 option), any invalid tags have their leading C<< < >> escaped (to
4805 C<< &#lt; >>) thus making them ordinary text and this I<includes>
4806 invalid XML comments.
4808 What this means is that the B<--strip-comments-strict> option I<will not>
4809 remove invalid XML comments (such as S<C<< <!-----> >>>)!
4811 But see the B<--strip-comments-lax> option for a solution.
4813 If this option is given, it will supersede any previous
4814 B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-lax-only>
4815 or B<--no-strip-comments> options.
4818 =item B<--strip-comments-lax-only>
4820 This is the default option if no other strip comments options are given
4821 AND the B<--sanitize> option is active (the default).
4823 This is a compromise option. It works just like the B<--strip-comments-lax>
4824 option, but I<ONLY> on strictly invalid XML-like comments.
4826 In other words, if a strictly valid XML comment is present, it will be retained
4827 in the output. If a strictly invalid XML comment is present which would have
4828 been stripped by B<--strip-comments-lax> but would have had its leading C<< < >>
4829 escaped automatically by the B<--no-strip-comments> or B<--strip-comments-strict>
4830 modes (because it's not a strictly valid XML comment), then it I<will> be stripped
4831 by this mode.
4833 This option prevents ugly invalid XML comments from slipping through into the
4834 output as escaped plain text while still passing through valid XML comments
4835 without stripping them.
4837 If this option is given, it will supersede any previous
4838 B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-lax-only>
4839 or B<--no-strip-comments> options.
4842 =item B<--no-strip-comments>
4844 Do not strip XML or XML-like comments from the output.
4846 This is the default option I<ONLY> when no other strip comments options have
4847 been give I<and> the B<--no-sanitize> option is in effect (which is I<not> the
4848 default).
4850 When B<--no-strip-comments> is active, strictly invalid XML comments such
4851 as those that contain an internal double hyphen (C<-->) sequence will end
4852 up having their leading C<< < >> escaped automatically and end up as plain
4853 text in the output!
4855 If this option is given, it will supersede any previous
4856 B<--strip-comments>, B<--strip-comments-lax>, B<--strip-comments-lax-only>
4857 or B<--no-strip-comments> options.
4860 =item B<--tabwidth>=I<num>
4862 Expand tabs to I<num> character wide tab stop positions instead of the default
4863 8. Don't use this; physical tabs should always be expanded to 8-character
4864 positions. This option does I<not> affect the number of spaces needed to
4865 start a new "indent level". That will always be 4 no matter what value is
4866 used (or implied by default) with this option. Also note that tabs inside
4867 backticks-delimited code blocks will always be expanded to 8-character tab
4868 stop positions no matter what value is used for this option.
4870 The value must be S<2 <= I<num> <= 32>.
4873 =item B<--auto-number>
4875 Automatically number all h1-h6 headings generated from Markdown markup.
4876 Explicit C<< <h1> >> ... C<< <h6> >> tag content remains unmolested.
4878 If this option is given, any YAML C<header_enum> setting will be ignored.
4881 =item B<-k>, B<--keep-abs>
4883 Normally any absolute path URLs (i.e. URLs without a scheme starting
4884 with "/" but not "//") are subject to modification by any
4885 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> option.
4887 If the B<-a>/B<--absroot> option is used and it transforms these
4888 absolute path URLs into a full absolute URL (i.e. starts with a
4889 scheme or "//") then any subsequent B<-r>/B<--htmlroot> or
4890 B<-i>/B<--imageroot> processing will be skipped because the URL is
4891 no longer relative.
4893 If the B<--keep-abs> option is given, then (after applying any
4894 B<-a>/B<--absroot> option if present) absolute path URLs will be
4895 kept as-is and will not be processed further by any B<-r>/B<--htmlroot>
4896 or B<-i>/B<--imageroot> option.
4898 Note that if the B<-a>/B<--absroot> option transforms an absolute
4899 path URL into a relative PATH URL it I<will> be subject to subsequent
4900 B<-r>/B<--htmlroot> or B<-i>/B<--imageroot> processing regardless
4901 of the B<-k>/B<--keep-abs> option.
4904 =item B<-a> I<prefix>, B<--absroot>=I<prefix>
4906 Any absolute path URLs (i.e. URLs without a scheme starting with "/" but not
4907 "//") have I<prefix> prepended which prevents them from being acted upon by the
4908 B<--htmlroot> and/or B<--imageroot> options provided the result is a full
4909 absolute URL. The default is to prepend nothing and leave them as absolute
4910 path URLs which will allow them to be processed by any B<--htmlroot> and/or
4911 B<--imageroot> options.
4913 This option can be helpful when documents are being formatted for display on a
4914 different system and the absolute path URLs need to be "fixed up".
4917 =item B<-b> I<prefix>, B<--base>=I<prefix>
4919 Any fragment-only URLs have I<prefix> prepended. The default is to prepend
4920 nothing and leave them as bare fragment URLs. Use of this option may be
4921 necessary when embedding the output of Markdown.pl into a document that makes
4922 use of the C<< <base> >> tag in order for intra-document fragment URL links to
4923 work properly in such a document.
4926 =item B<-r> I<prefix>, B<--htmlroot>=I<prefix>
4928 Any non-absolute URLs have I<prefix> prepended.
4931 =item B<-i> I<prefix>, B<--imageroot>=I<prefix>
4933 Any non-absolute URLs have I<prefix> prepended (overriding the B<-r> prefix
4934 if any) but only if they end in an image suffix.
4937 =item B<-w> [I<wikipat>], B<--wiki>[=I<wikipat>]
4939 Activate wiki links. Any link enclosed in double brackets (e.g. "[[link]]") is
4940 considered a wiki link. By default only absolute URL and fragment links are
4941 allowed in the "wiki link style" format. Any other double-bracketed strings
4942 are left unmolested.
4944 If this option is given, all other wiki links are enabled as well. Any
4945 non-absolute URL or fragment links will be transformed into a link using
4946 I<wikipat> where the default I<wikipat> if none is given is C<%{s(:md)}.html>.
4948 If the given I<wikipat> does not contain a C<%{...}> placeholder sequence
4949 then it will automatically have C<%{s(:md)}.html> suffixed to it.
4951 The C<...> part of the C<%{...}> sequence specifies zero or more
4952 case-insensitive single-letter options with the following effects:
4954 =over
4956 =item B<b>
4958 Retain blanks (aka spaces) in the output. They will become C<%20>
4959 in the final URL. Because spaces are always trimmed before processing
4960 wiki links, runs of multiple spaces will be collapsed into a single
4961 space and any leading or trailing spaces will be removed.
4963 =item B<d>
4965 Convert spaces to dashes (ASCII 0x2D) instead of underscore (ASCII
4966 0x5F). Note that if this option is given then runs of multiple
4967 dashes will be converted to a single dash I<instead> but runs of
4968 multiple underscores will be left untouched.
4970 =item B<f>
4972 Flatten the resulting name by replacing forward slashes (ASCII 0x2F)
4973 as well. They will be converted to underscores unless the C<d>
4974 option is given (in which case they will be converted to dashes).
4975 This conversion takes place before applying the runs-of-multiple
4976 reduction. This option is incompatible with the B<%> option.
4978 =item B<%>
4980 Flatten the resulting name by replacing runs of one or more forward
4981 slashes (ASCII 0x2F) with C<%2F>. Note that when encoded into a
4982 URL the C<%2F> actually becomes C<%252F>. This option is incompatible
4983 with the B<f> option.
4985 =item B<l>
4987 Convert link target (excluding any query string and/or fragment) to lowercase.
4988 Takes precedence over any C<u> option, but specifically excludes C<%>-escapes
4989 which are always UPPERCASE hexadecimal.
4991 =item B<r>
4993 Leave raw UTF-8 characters in the result. Normally anything not allowed
4994 directly in a URL ends up URL-encoded. With this option, raw valid UTF-8
4995 sequences will be left untouched. Use with care.
4997 =item B<s> or B<s(>I<< <ext> >>[B<,>I<< <ext> >>]...B<)>
4999 After (temporarily) removing any query string and/or fragment, strip any final
5000 "dot" suffix so long as it occurs after the last slash (if any slash was
5001 present before applying the C<f> option). The "dot" (ASCII 0x2E) and all
5002 following characters (if any) are removed. If the optional C<< (<ext>,...) >>
5003 part is present then only strip the extension if it consists of a "dot"
5004 followed by one of the case-insensitive I<< <ext> >> values. As a special
5005 case, using the value C<:md> for one of the I<< <ext> >> values causes that
5006 value to be expanded to all known markdown extensions.
5008 When processing wiki image links, this option is ignored.
5010 =item B<u>
5012 Convert link target (excluding any query string and/or fragment) to UPPERCASE.
5014 =item B<v>
5016 Leave runs-of-multiple characters alone (aka "verbatim"). Does not affect
5017 any of the other options except by eliminating the runs-of-multple reduction
5018 step. Also does I<not> inhibit the initial whitespace trimming.
5020 Does not affect the runs-of-multiple "/" replacement performed by the B<%>
5021 option.
5023 =back
5025 The URL target of the wiki link is created by first trimming whitespace
5026 (starting and ending whitespace is removed and all other runs of consecutive
5027 whitespace are replaced with a single space) from the wiki link target,
5028 removing (temporarily) any query string and/or fragment, if no options are
5029 present, spaces are converted to underscores (C<_>) and runs of multiple
5030 consecutive underscores are replaced with a single underscore (ASCII 0x5F).
5031 Finally, the I<wikipat> string gets its first placeholder (the C<%{...}>
5032 sequence) replaced with this computed value and the original query string
5033 and/or fragment is re-appended (if any were originally present) and
5034 URL-encoding is applied as needed to produce the actual final target URL.
5036 Note that when processing wiki image links, no extension stripping ever takes
5037 place (i.e. the "s" option is ignored) and anything after the placeholder (the
5038 C<%{...}> sequence) in the pattern is omitted from the result.
5040 See above option descriptions for possible available modifications.
5042 One of the commonly used hosting platforms does something substantially similar
5043 to using C<%{dfv}> as the placeholder.
5045 One of the commonly used wiki platforms does something similar to using C<%{%}>
5046 as the placeholder.
5049 =item B<--yaml>[=I<yamlmode>]
5051 Select YAML front matter processing. The optional I<yamlmode> value
5052 must be one of the following:
5054 =over
5056 =item B<enable>
5058 Recognize any YAML front matter and apply any options specified
5059 therein. If any unrecognized options are present, the options will
5060 also be shown in the formatted output.
5062 This is the default I<yamlmode> if omitted.
5064 =item B<disable>
5066 No YAML front matter processing at all takes place. If YAML front
5067 matter is present, it will be treated as regular non-YAML markup
5068 text to be processed.
5070 =item B<strip>
5072 If YAML front matter is present, it will be stripped and completely
5073 ignored before beginning to process the rest of the input.
5075 In this mode, any options in the YAML front matter that would have
5076 otherwise been recognized will I<not have any effect!>
5078 =item B<show>
5080 If YAML front matter is present and contains anything other than
5081 comments, the non-comments parts will be shown in the formatted
5082 output.
5084 In this mode, any options in the YAML front matter that would have
5085 otherwise been recognized will I<not have any effect!>
5087 This is a show-only mode.
5089 =item B<reveal>
5091 This mode works just like the B<enable> mode except that if the
5092 YAML front matter contains anything other than comments, then
5093 I<all> of the non-comments parts will be shown in the formatted
5094 output.
5096 In this mode, any recognized options in the YAML front matter I<are>
5097 processed the same way they would be in the B<enable> mode except
5098 that any option to suppress the B<reveal> mode is ignored.
5100 =item B<conceal>
5102 This mode works just like the B<enable> mode except that no options
5103 are ever shown in the formatted output regardless of whether or not
5104 there are any unrecognized options present.
5106 In this mode, any recognized options in the YAML front matter I<are>
5107 processed the same way they would be in the B<enable> mode except
5108 that any option to suppress the B<conceal> mode is ignored.
5110 =item B<unknown>
5112 This mode works just like the B<show> mode if any unrecognized
5113 YAML front matter options are present. Otherwise it works like
5114 the B<strip> mode.
5116 In this mode, any options in the YAML front matter that would have
5117 otherwise been recognized will I<not have any effect!>
5119 =back
5121 If B<--raw>, B<--raw-xml> or B<--raw-html> has been specified then
5122 the default if no B<--yaml> option has been given is B<--yaml=disable>.
5124 Otherwise the default if no B<--yaml> option has been given is
5125 B<--yaml=enable>.
5127 Note that only a limited subset of YAML is recognized. Specifically
5128 only comments, and top-level single-line S<C<key: value>> items
5129 where key must be plain (i.e. non-quoted), start with a letter or
5130 underscore and contain only letters, underscores, hyphens (C<->),
5131 periods (C<.>) and digits. Keys are case-insensitive (i.e. converted
5132 to lowercase). As with YAML, at least one whitespace is required
5133 between the ":" and the value (unless it's the empty value).
5135 Values may be either plain or double-quoted (single-quoted is not
5136 recognized). The double-quoted style may use C-style character
5137 escape codes but may not extend past the end of the line.
5139 For YAML front matter to be recognized, the very first line of the
5140 document must be exactly three hyphens (C<--->). The YAML terminates
5141 when a line of three hyphens (C<--->) or a line of three periods
5142 (C<...>) or the end of the file is encountered. Of course the YAML
5143 mode must also be something I<other> than B<--yaml=disable>.
5146 =item B<-V>, B<--version>
5148 Display Markdown's version number and copyright information.
5151 =item B<-s>, B<--shortversion>
5153 Display the short-form version number.
5156 =item B<--raw>, B<--raw-xml>
5158 Input contains only raw XHTML. All options other than B<--html4tags>,
5159 B<--deprecated>, B<--sanitize> (on by default), B<--strip-comments>,
5160 B<--div>, B<--keep-named-character-entities>, B<--validate-xml> and
5161 B<--validate-xml-internal> (and their B<--no-...> variants) are
5162 ignored.
5164 With this option, arbitrary XHTML input can be passed through
5165 the sanitizer and/or validator. If sanitation is requested (the
5166 default), input must only contain the contents of the "<body>"
5167 section (i.e. no "<head>" or "<html>"). Output I<will> be converted
5168 to UTF-8 regardless of the input encoding. All line endings will
5169 be normalized to C<\n> and input encodings other than UTF-8 or
5170 ISO-8859-1 or US-ASCII will end up mangled.
5172 Remember that any B<--stub> and/or B<--stylesheet> options are
5173 I<completely ignored> when B<--raw> is given.
5176 =item B<--raw-html>
5178 Input contains only raw HTML. All options other than
5179 B<--html4tags>, B<--deprecated>, B<--sanitize> (on by default),
5180 B<--strip-comments>, and B<--validate-xml-internal>
5181 (and their B<--no-...> variants) are ignored.
5183 Requires the (possibly implicit) B<--validate-xml-internal> option.
5185 Works just like B<--raw-xml> except that HTML auto closing and
5186 optional closing tag semantics are activated during the validation
5187 causing missing closing tags to be inserted where required by the
5188 standard. Non-raw mode always enables these semantics.
5190 This will transform HTML into valid XHTML or fail with an error message.
5192 Unfortunately, it will also fail to accept some documents that
5193 the plain B<--raw-xml> option will.
5195 For example, this document:
5197 <dt><li>a</li></dt>
5199 Will be rejected because upon encountering the C<< <li> >> open
5200 tag a closing C<< </dt> >> will automatically be inserted resulting
5201 in this document:
5203 <dt></dt><li>a</li></dt>
5205 Which, of course, no longer validates. Since C<li> blocks cannot
5206 directly be nested within C<dt> blocks (according to the standard),
5207 the input document is not strictly correct.
5209 Remember that any B<--stub> and/or B<--stylesheet> options are
5210 I<completely ignored> when B<--raw-html> is given.
5213 =item B<--div>[=I<divname>]
5215 Wrap the output contents in a C<div> tag. If I<divname> is given the
5216 tag will have that C<id> attribute value. If the B<--stub> option and/or
5217 the B<--stylesheet> option are active, they are applied I<after> wrapping
5218 the output contents in the C<div>. Note that if a YAML table ends up
5219 being generated, it I<will> be included I<inside> the C<div> wrapper.
5221 In contrast to the B<--stylesheet> and B<--stub> options, this option
5222 I<is> allowed with the B<--raw-xml> and B<--raw-html> options.
5225 =item B<--stylesheet>
5227 Include the fancy style sheet at the beginning of the output (or in the
5228 C<head> section with B<--stub>). This style sheet makes fancy checkboxes
5229 and makes a right parenthesis C<)> show instead of a C<.> for ordered lists
5230 that use them. Without it things will still look fine except that the
5231 fancy stuff won't be there.
5233 Use this option with no other arguments and redirect standard input to
5234 /dev/null to get just the style sheet and nothing else.
5237 =item B<--no-stylesheet>
5239 Overrides a previous B<--stylesheet> and disables implicit inclusion
5240 of the style sheet by the B<--stub> option.
5243 =item B<--keep-named-character-entities>
5245 Do not convert named character entities to their equivalent numerical character
5246 entity. Normally any occurrence of a named character entity such as
5247 C<&hellip;> would be converted to its equivalent character entity such as
5248 C<&#x2026;>. If this option is given, that conversion is suppressed.
5250 The only always-valid named entities as far as XML is concerned are the five
5251 entities C<&amp;>, C<&lt;>, C<&gt;>, C<&quot;> and C<&apos;>. Even that last
5252 one (C<&apos;>) may not be universally supported in XHTML user agents (and it
5253 is converted to C<&#39;> for that reason unless this option is given).
5255 Regardless of this option, C<&amp;>, C<&lt;>, C<&gt;> and C<&quot;> are always
5256 left alone since they are universally supported.
5258 Use of this option is I<NOT RECOMMENDED>.
5261 =item B<--us-ascii>/B<--ascii>
5263 (N.B. B<--ascii> is just a short form of B<--us-ascii>)
5265 Convert any non-US-ASCII characters to their equivalent numerical character
5266 entity. Any characters with a code point value greater than or equal to
5267 128 will be converted. Note that the output is still technically UTF-8 since
5268 the US-ASCII code points coincide with the same code points of UTF-8.
5270 Using this option will make the output strictly 7-bit and therefore it should
5271 survive just about any transport mechanism at the expense of an increase in
5272 size that depends on how many non-US-ASCII characters are present.
5275 =item B<--stub>
5277 Wrap the output in a full document stub (i.e. has C<html>, C<head> and C<body>
5278 tags). The style sheet I<will> be included in the C<head> section unless the
5279 B<--no-stylesheet> option is also used.
5281 The C<< <title> >> value for a document produced with the B<--stub> option
5282 comes from the first markdown markup C<h1> that's generated unless YAML
5283 processing has been enabled (the default) and a C<title> YAML value has
5284 been set in which case that always takes precedence.
5287 =item B<-h>, B<--help>
5289 Display Markdown's help. With B<--help> full help is shown, with B<-h> only
5290 the usage and options are shown.
5293 =back
5296 =head1 HTML CONTENT
5298 Markdown format documents are intended to be human readable without the use
5299 of XML-like markup.
5301 Nevertheless, html content can be included verbatim provided that the tags
5302 used are limited to those of the HTML 4 specification and only those tags
5303 that represent body content -- scripting tags and attributes are not allowed.
5305 The final version of the HTML specification (including a DTD) can be found
5306 here:
5308 =over
5310 =over
5312 =item L<https://www.w3.org/TR/1999/REC-html401-19991224/>
5314 =back
5316 =back
5318 Note that attempts to use any of the new tags from the "HTML Living Standard"
5319 will simply result in them being escaped into literal text.
5321 Stick to markdown-format text or HTML 4 tags to avoid unexpected output.
5324 =head1 PERL MODULE
5326 Markdown can be used as a Perl module and can be "use"d like so:
5328 use Markdown qw(...);
5330 Or like so:
5332 BEGIN {require "Markdown.pl" && Markdown->import(qw(...))}
5334 where the C<...> part is the list of desired imports.
5336 The Markdown module does not export any functions by default.
5338 The C<Markdown.pm> file is a symbolic link to C<Markdown.pl>.
5340 =head2 Markdown module functions
5342 Any of these functions may be imported, but none of them
5343 are imported by default.
5345 =over
5348 =item * $result = Markdown::Markdown($string[, options...])
5350 Converts Markdown-format C<$string> to UTF-8 encoded XHTML and
5351 returns it.
5353 The C<options...> may be either a single HASH ref or one or more
5354 pairs of C<< key => value >>.
5356 See the comments for the C<_SanitizeOpts> function for a list of
5357 possible option keys.
5360 =item * $result = Markdown::ProcessRaw($string[, options...])
5362 Converts raw XHTML in C<$string> to XHTML and returns it.
5364 The C<options...> may be either a single HASH ref or one or more
5365 pairs of C<< key => value >>.
5367 See the comments for the C<_SanitizeOpts> function for a list of
5368 possible option keys.
5370 This function provides the ability to apply the internal XML
5371 validation and sanitation functionality to arbitrary XHTML without
5372 performing any of the Markdown format interpretation.
5375 =item * $stylesheet = Markdown::GenerateStyleSheet([$prefix])
5377 Returns an XHTML style sheet that supports the fancy Markdown styles
5378 such as checkboxes and right parenthesis lists.
5380 All of the style class names have C<$prefix> prepended.
5382 If C<$prefix> is omitted or C<undef> then the default S<"_markdown-">
5383 prefix will be used which is the same default prefix that the
5384 C<Markdown> function uses.
5386 The returned string value consists of a C<< <style type="text/css"> >>
5387 tag, the contents of the style sheet and ends with a C<< </style> >> tag.
5390 =item * Markdown::SetWikiOpts($hashref, $wikioption)
5392 The value of C<$wikioption> should be the value of the C<wikipat> value
5393 from the B<--wiki> option. Use the empty string S<""> to enable wiki
5394 links using the defaults and use C<undef> to disable wiki links.
5396 The C<wikipat> and C<wikiopt> keys in C<$hashref> will both be
5397 affected by this call and they should be passed in to the Markdown
5398 function as options to enable processing of wiki links.
5400 The simplest way to do this is simply to pass a HASH ref as the
5401 second argument to the Markdown function after having used this
5402 function on it to properly set the C<wikipat> and C<wikiopt>
5403 keys and values.
5406 =back
5409 =head2 Example
5411 This rudimentary example approximates running
5412 S<C<Markdown.pl --stub --wiki>>
5413 on the input (files if given, standard input if not).
5415 use Markdown qw(Markdown SetWikiOpts GenerateStyleSheet escapeXML);
5417 my $string;
5418 {local $/; $string = <>;}
5419 my %opts = ( h1 => "default title" );
5420 SetWikiOpts(\%opts, ""); # enable default --wiki processing
5421 my $xhtml = Markdown($string, \%opts);
5422 print "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n",
5423 "<head>\n<title>".escapeXML($opts{h1})."</title>\n",
5424 GenerateStyleSheet(),"</head>\n",
5425 "<body style=\"text-align:center\">\n",
5426 "<div style=\"".
5427 "display:inline-block;text-align:left;max-width:42pc\">\n",
5428 $xhtml, "</div></body></html>\n";
5431 =head1 VERSION HISTORY
5433 Z<> See the F<README> file for detailed release notes for this version.
5435 =over
5437 =item Z<> 1.1.15 - 15 Aug 2021
5439 =item Z<> 1.1.14 - 14 Jun 2021
5441 =item Z<> 1.1.13 - 13 Jun 2021
5443 =item Z<> 1.1.12 - 17 Mar 2021
5445 =item Z<> 1.1.11 - 05 Mar 2021
5447 =item Z<> 1.1.10 - 08 Jul 2020
5449 =item Z<> 1.1.9 - 15 Dec 2019
5451 =item Z<> 1.1.8 - 22 Nov 2019
5453 =item Z<> 1.1.7 - 14 Feb 2018
5455 =item Z<> 1.1.6 - 03 Jan 2018
5457 =item Z<> 1.1.5 - 07 Dec 2017
5459 =item Z<> 1.1.4 - 24 Jun 2017
5461 =item Z<> 1.1.3 - 13 Feb 2017
5463 =item Z<> 1.1.2 - 19 Jan 2017
5465 =item Z<> 1.1.1 - 12 Jan 2017
5467 =item Z<> 1.1.0 - 11 Jan 2017
5469 =item Z<> 1.0.4 - 05 Jun 2016
5471 =item Z<> 1.0.3 - 06 Sep 2015
5473 =item Z<> 1.0.2 - 03 Sep 2015
5475 =item Z<> 1.0.1 - 14 Dec 2004
5477 =item Z<> 1.0.0 - 28 Aug 2004
5479 =back
5481 =head1 AUTHORS
5483 =over
5485 =item John Gruber
5487 =item L<https://daringfireball.net>
5489 =item L<https://daringfireball.net/projects/markdown/>
5491 =item E<160>
5493 =back
5495 =over
5497 =item PHP port and other contributions by Michel Fortin
5499 =item L<https://michelf.ca>
5501 =item E<160>
5503 =back
5505 =over
5507 =item Additional enhancements and tweaks by Kyle J. McKay
5509 =item mackyle<at>gmail.com
5511 =item L<https://repo.or.cz/markdown.git>
5513 =back
5515 =head1 COPYRIGHT AND LICENSE
5517 =over
5519 =item Copyright (C) 2003-2004 John Gruber
5521 =item Copyright (C) 2015-2021 Kyle J. McKay
5523 =item All rights reserved.
5525 =back
5527 Redistribution and use in source and binary forms, with or without
5528 modification, are permitted provided that the following conditions are
5529 met:
5531 =over
5533 =item *
5535 Redistributions of source code must retain the above copyright
5536 notice, this list of conditions and the following disclaimer.
5538 =item *
5540 Redistributions in binary form must reproduce the above copyright
5541 notice, this list of conditions and the following disclaimer in the
5542 documentation and/or other materials provided with the distribution.
5544 =item *
5546 Neither the name "Markdown" nor the names of its contributors may
5547 be used to endorse or promote products derived from this software
5548 without specific prior written permission.
5550 =back
5552 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
5553 IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
5554 TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
5555 PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
5556 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
5557 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
5558 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
5559 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
5560 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
5561 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
5562 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5564 =cut