fix codetest failure - ASSERT_ARGS does not have a ; after and
[parrot.git] / lib / Pod / Escapes.pm
blobfb3b27c2adcfca94b3180e8c68c87cbe6e5cc2e5
2 require 5;
3 # The documentation is at the end.
4 # Time-stamp: "2004-05-07 15:31:25 ADT"
5 package Pod::Escapes;
6 require Exporter;
7 @ISA = ('Exporter');
8 $VERSION = '1.04';
9 @EXPORT_OK = qw(
10 %Code2USASCII
11 %Name2character
12 %Name2character_number
13 %Latin1Code_to_fallback
14 %Latin1Char_to_fallback
15 e2char
16 e2charnum
18 %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
20 #==========================================================================
22 use strict;
23 use vars qw(
24 %Code2USASCII
25 %Name2character
26 %Name2character_number
27 %Latin1Code_to_fallback
28 %Latin1Char_to_fallback
29 $FAR_CHAR
30 $FAR_CHAR_NUMBER
31 $NOT_ASCII
34 $FAR_CHAR = "?" unless defined $FAR_CHAR;
35 $FAR_CHAR_NUMBER = ord($FAR_CHAR) unless defined $FAR_CHAR_NUMBER;
37 $NOT_ASCII = 'A' ne chr(65) unless defined $NOT_ASCII;
39 #--------------------------------------------------------------------------
40 sub e2char {
41 my $in = $_[0];
42 return undef unless defined $in and length $in;
44 # Convert to decimal:
45 if($in =~ m/^(0[0-7]*)$/s ) {
46 $in = oct $in;
47 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
48 $in = hex $1;
49 } # else it's decimal, or named
51 if($NOT_ASCII) {
52 # We're in bizarro world of not-ASCII!
53 # Cope with US-ASCII codes, use fallbacks for Latin-1, or use FAR_CHAR.
54 unless($in =~ m/^\d+$/s) {
55 # It's a named character reference. Get its numeric Unicode value.
56 $in = $Name2character{$in};
57 return undef unless defined $in; # (if there's no such name)
58 $in = ord $in; # (All ents must be one character long.)
59 # ...So $in holds the char's US-ASCII numeric value, which we'll
60 # now go get the local equivalent for.
63 # It's numeric, whether by origin or by mutation from a known name
64 return $Code2USASCII{$in} # so "65" => "A" everywhere
65 || $Latin1Code_to_fallback{$in} # Fallback.
66 || $FAR_CHAR; # Fall further back
69 # Normal handling:
70 if($in =~ m/^\d+$/s) {
71 if($] < 5.007 and $in > 255) { # can't be trusted with Unicode
72 return $FAR_CHAR;
73 } else {
74 return chr($in);
76 } else {
77 return $Name2character{$in}; # returns undef if unknown
81 #--------------------------------------------------------------------------
82 sub e2charnum {
83 my $in = $_[0];
84 return undef unless defined $in and length $in;
86 # Convert to decimal:
87 if($in =~ m/^(0[0-7]*)$/s ) {
88 $in = oct $in;
89 } elsif($in =~ m/^0?x([0-9a-fA-F]+)$/s ) {
90 $in = hex $1;
91 } # else it's decimal, or named
93 if($in =~ m/^\d+$/s) {
94 return 0 + $in;
95 } else {
96 return $Name2character_number{$in}; # returns undef if unknown
100 #--------------------------------------------------------------------------
102 %Name2character_number = (
103 # General XML/XHTML:
104 'lt' => 60,
105 'gt' => 62,
106 'quot' => 34,
107 'amp' => 38,
108 'apos' => 39,
110 # POD-specific:
111 'sol' => 47,
112 'verbar' => 124,
114 'lchevron' => 171, # legacy for laquo
115 'rchevron' => 187, # legacy for raquo
117 # Remember, grave looks like \ (as in virtu\)
118 # acute looks like / (as in re/sume/)
119 # circumflex looks like ^ (as in papier ma^che/)
120 # umlaut/dieresis looks like " (as in nai"ve, Chloe")
122 # From the XHTML 1 .ent files:
123 'nbsp' , 160,
124 'iexcl' , 161,
125 'cent' , 162,
126 'pound' , 163,
127 'curren' , 164,
128 'yen' , 165,
129 'brvbar' , 166,
130 'sect' , 167,
131 'uml' , 168,
132 'copy' , 169,
133 'ordf' , 170,
134 'laquo' , 171,
135 'not' , 172,
136 'shy' , 173,
137 'reg' , 174,
138 'macr' , 175,
139 'deg' , 176,
140 'plusmn' , 177,
141 'sup2' , 178,
142 'sup3' , 179,
143 'acute' , 180,
144 'micro' , 181,
145 'para' , 182,
146 'middot' , 183,
147 'cedil' , 184,
148 'sup1' , 185,
149 'ordm' , 186,
150 'raquo' , 187,
151 'frac14' , 188,
152 'frac12' , 189,
153 'frac34' , 190,
154 'iquest' , 191,
155 'Agrave' , 192,
156 'Aacute' , 193,
157 'Acirc' , 194,
158 'Atilde' , 195,
159 'Auml' , 196,
160 'Aring' , 197,
161 'AElig' , 198,
162 'Ccedil' , 199,
163 'Egrave' , 200,
164 'Eacute' , 201,
165 'Ecirc' , 202,
166 'Euml' , 203,
167 'Igrave' , 204,
168 'Iacute' , 205,
169 'Icirc' , 206,
170 'Iuml' , 207,
171 'ETH' , 208,
172 'Ntilde' , 209,
173 'Ograve' , 210,
174 'Oacute' , 211,
175 'Ocirc' , 212,
176 'Otilde' , 213,
177 'Ouml' , 214,
178 'times' , 215,
179 'Oslash' , 216,
180 'Ugrave' , 217,
181 'Uacute' , 218,
182 'Ucirc' , 219,
183 'Uuml' , 220,
184 'Yacute' , 221,
185 'THORN' , 222,
186 'szlig' , 223,
187 'agrave' , 224,
188 'aacute' , 225,
189 'acirc' , 226,
190 'atilde' , 227,
191 'auml' , 228,
192 'aring' , 229,
193 'aelig' , 230,
194 'ccedil' , 231,
195 'egrave' , 232,
196 'eacute' , 233,
197 'ecirc' , 234,
198 'euml' , 235,
199 'igrave' , 236,
200 'iacute' , 237,
201 'icirc' , 238,
202 'iuml' , 239,
203 'eth' , 240,
204 'ntilde' , 241,
205 'ograve' , 242,
206 'oacute' , 243,
207 'ocirc' , 244,
208 'otilde' , 245,
209 'ouml' , 246,
210 'divide' , 247,
211 'oslash' , 248,
212 'ugrave' , 249,
213 'uacute' , 250,
214 'ucirc' , 251,
215 'uuml' , 252,
216 'yacute' , 253,
217 'thorn' , 254,
218 'yuml' , 255,
220 'fnof' , 402,
221 'Alpha' , 913,
222 'Beta' , 914,
223 'Gamma' , 915,
224 'Delta' , 916,
225 'Epsilon' , 917,
226 'Zeta' , 918,
227 'Eta' , 919,
228 'Theta' , 920,
229 'Iota' , 921,
230 'Kappa' , 922,
231 'Lambda' , 923,
232 'Mu' , 924,
233 'Nu' , 925,
234 'Xi' , 926,
235 'Omicron' , 927,
236 'Pi' , 928,
237 'Rho' , 929,
238 'Sigma' , 931,
239 'Tau' , 932,
240 'Upsilon' , 933,
241 'Phi' , 934,
242 'Chi' , 935,
243 'Psi' , 936,
244 'Omega' , 937,
245 'alpha' , 945,
246 'beta' , 946,
247 'gamma' , 947,
248 'delta' , 948,
249 'epsilon' , 949,
250 'zeta' , 950,
251 'eta' , 951,
252 'theta' , 952,
253 'iota' , 953,
254 'kappa' , 954,
255 'lambda' , 955,
256 'mu' , 956,
257 'nu' , 957,
258 'xi' , 958,
259 'omicron' , 959,
260 'pi' , 960,
261 'rho' , 961,
262 'sigmaf' , 962,
263 'sigma' , 963,
264 'tau' , 964,
265 'upsilon' , 965,
266 'phi' , 966,
267 'chi' , 967,
268 'psi' , 968,
269 'omega' , 969,
270 'thetasym' , 977,
271 'upsih' , 978,
272 'piv' , 982,
273 'bull' , 8226,
274 'hellip' , 8230,
275 'prime' , 8242,
276 'Prime' , 8243,
277 'oline' , 8254,
278 'frasl' , 8260,
279 'weierp' , 8472,
280 'image' , 8465,
281 'real' , 8476,
282 'trade' , 8482,
283 'alefsym' , 8501,
284 'larr' , 8592,
285 'uarr' , 8593,
286 'rarr' , 8594,
287 'darr' , 8595,
288 'harr' , 8596,
289 'crarr' , 8629,
290 'lArr' , 8656,
291 'uArr' , 8657,
292 'rArr' , 8658,
293 'dArr' , 8659,
294 'hArr' , 8660,
295 'forall' , 8704,
296 'part' , 8706,
297 'exist' , 8707,
298 'empty' , 8709,
299 'nabla' , 8711,
300 'isin' , 8712,
301 'notin' , 8713,
302 'ni' , 8715,
303 'prod' , 8719,
304 'sum' , 8721,
305 'minus' , 8722,
306 'lowast' , 8727,
307 'radic' , 8730,
308 'prop' , 8733,
309 'infin' , 8734,
310 'ang' , 8736,
311 'and' , 8743,
312 'or' , 8744,
313 'cap' , 8745,
314 'cup' , 8746,
315 'int' , 8747,
316 'there4' , 8756,
317 'sim' , 8764,
318 'cong' , 8773,
319 'asymp' , 8776,
320 'ne' , 8800,
321 'equiv' , 8801,
322 'le' , 8804,
323 'ge' , 8805,
324 'sub' , 8834,
325 'sup' , 8835,
326 'nsub' , 8836,
327 'sube' , 8838,
328 'supe' , 8839,
329 'oplus' , 8853,
330 'otimes' , 8855,
331 'perp' , 8869,
332 'sdot' , 8901,
333 'lceil' , 8968,
334 'rceil' , 8969,
335 'lfloor' , 8970,
336 'rfloor' , 8971,
337 'lang' , 9001,
338 'rang' , 9002,
339 'loz' , 9674,
340 'spades' , 9824,
341 'clubs' , 9827,
342 'hearts' , 9829,
343 'diams' , 9830,
344 'OElig' , 338,
345 'oelig' , 339,
346 'Scaron' , 352,
347 'scaron' , 353,
348 'Yuml' , 376,
349 'circ' , 710,
350 'tilde' , 732,
351 'ensp' , 8194,
352 'emsp' , 8195,
353 'thinsp' , 8201,
354 'zwnj' , 8204,
355 'zwj' , 8205,
356 'lrm' , 8206,
357 'rlm' , 8207,
358 'ndash' , 8211,
359 'mdash' , 8212,
360 'lsquo' , 8216,
361 'rsquo' , 8217,
362 'sbquo' , 8218,
363 'ldquo' , 8220,
364 'rdquo' , 8221,
365 'bdquo' , 8222,
366 'dagger' , 8224,
367 'Dagger' , 8225,
368 'permil' , 8240,
369 'lsaquo' , 8249,
370 'rsaquo' , 8250,
371 'euro' , 8364,
375 # Fill out %Name2character...
377 %Name2character = ();
378 my($name, $number);
379 while( ($name, $number) = each %Name2character_number) {
380 if($] < 5.007 and $number > 255) {
381 $Name2character{$name} = $FAR_CHAR;
382 # substitute for Unicode characters, for perls
383 # that can't reliable handle them
384 } else {
385 $Name2character{$name} = chr $number;
386 # normal case
389 # So they resolve 'right' even in EBCDIC-land
390 $Name2character{'lt' } = '<';
391 $Name2character{'gt' } = '>';
392 $Name2character{'quot'} = '"';
393 $Name2character{'amp' } = '&';
394 $Name2character{'apos'} = "'";
395 $Name2character{'sol' } = '/';
396 $Name2character{'verbar'} = '|';
399 #--------------------------------------------------------------------------
401 %Code2USASCII = (
402 # mostly generated by
403 # perl -e "printf qq{ \x25 3s, '\x25s',\n}, $_, chr($_) foreach (32 .. 126)"
404 32, ' ',
405 33, '!',
406 34, '"',
407 35, '#',
408 36, '$',
409 37, '%',
410 38, '&',
411 39, "'", #!
412 40, '(',
413 41, ')',
414 42, '*',
415 43, '+',
416 44, ',',
417 45, '-',
418 46, '.',
419 47, '/',
420 48, '0',
421 49, '1',
422 50, '2',
423 51, '3',
424 52, '4',
425 53, '5',
426 54, '6',
427 55, '7',
428 56, '8',
429 57, '9',
430 58, ':',
431 59, ';',
432 60, '<',
433 61, '=',
434 62, '>',
435 63, '?',
436 64, '@',
437 65, 'A',
438 66, 'B',
439 67, 'C',
440 68, 'D',
441 69, 'E',
442 70, 'F',
443 71, 'G',
444 72, 'H',
445 73, 'I',
446 74, 'J',
447 75, 'K',
448 76, 'L',
449 77, 'M',
450 78, 'N',
451 79, 'O',
452 80, 'P',
453 81, 'Q',
454 82, 'R',
455 83, 'S',
456 84, 'T',
457 85, 'U',
458 86, 'V',
459 87, 'W',
460 88, 'X',
461 89, 'Y',
462 90, 'Z',
463 91, '[',
464 92, "\\", #!
465 93, ']',
466 94, '^',
467 95, '_',
468 96, '`',
469 97, 'a',
470 98, 'b',
471 99, 'c',
472 100, 'd',
473 101, 'e',
474 102, 'f',
475 103, 'g',
476 104, 'h',
477 105, 'i',
478 106, 'j',
479 107, 'k',
480 108, 'l',
481 109, 'm',
482 110, 'n',
483 111, 'o',
484 112, 'p',
485 113, 'q',
486 114, 'r',
487 115, 's',
488 116, 't',
489 117, 'u',
490 118, 'v',
491 119, 'w',
492 120, 'x',
493 121, 'y',
494 122, 'z',
495 123, '{',
496 124, '|',
497 125, '}',
498 126, '~',
501 #--------------------------------------------------------------------------
503 %Latin1Code_to_fallback = ();
504 @Latin1Code_to_fallback{0xA0 .. 0xFF} = (
505 # Copied from Text/Unidecode/x00.pm:
507 ' ', qq{!}, qq{C/}, 'PS', qq{\$?}, qq{Y=}, qq{|}, 'SS', qq{"}, qq{(c)}, 'a', qq{<<}, qq{!}, "", qq{(r)}, qq{-},
508 'deg', qq{+-}, '2', '3', qq{'}, 'u', 'P', qq{*}, qq{,}, '1', 'o', qq{>>}, qq{1/4}, qq{1/2}, qq{3/4}, qq{?},
509 'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',
510 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'x', 'O', 'U', 'U', 'U', 'U', 'U', 'Th', 'ss',
511 'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',
512 'd', 'n', 'o', 'o', 'o', 'o', 'o', qq{/}, 'o', 'u', 'u', 'u', 'u', 'y', 'th', 'y',
517 # Now stuff %Latin1Char_to_fallback:
518 %Latin1Char_to_fallback = ();
519 my($k,$v);
520 while( ($k,$v) = each %Latin1Code_to_fallback) {
521 $Latin1Char_to_fallback{chr $k} = $v;
522 #print chr($k), ' => ', $v, "\n";
526 #--------------------------------------------------------------------------
528 __END__
530 =head1 NAME
532 Pod::Escapes -- for resolving Pod EE<lt>...E<gt> sequences
534 =head1 SYNOPSIS
536 use Pod::Escapes qw(e2char);
537 ...la la la, parsing POD, la la la...
538 $text = e2char($e_node->label);
539 unless(defined $text) {
540 print "Unknown E sequence \"", $e_node->label, "\"!";
542 ...else print/interpolate $text...
544 =head1 DESCRIPTION
546 This module provides things that are useful in decoding
547 Pod EE<lt>...E<gt> sequences. Presumably, it should be used
548 only by Pod parsers and/or formatters.
550 By default, Pod::Escapes exports none of its symbols. But
551 you can request any of them to be exported.
552 Either request them individually, as with
553 C<use Pod::Escapes qw(symbolname symbolname2...);>,
554 or you can do C<use Pod::Escapes qw(:ALL);> to get all
555 exportable symbols.
557 =head1 GOODIES
559 =over
561 =item e2char($e_content)
563 Given a name or number that could appear in a
564 C<EE<lt>name_or_numE<gt>> sequence, this returns the string that
565 it stands for. For example, C<e2char('sol')>, C<e2char('47')>,
566 C<e2char('0x2F')>, and C<e2char('057')> all return "/",
567 because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
568 and C<EE<lt>057E<gt>>, all mean "/". If
569 the name has no known value (as with a name of "qacute") or is
570 syntactally invalid (as with a name of "1/4"), this returns undef.
572 =item e2charnum($e_content)
574 Given a name or number that could appear in a
575 C<EE<lt>name_or_numE<gt>> sequence, this returns the number of
576 the Unicode character that this stands for. For example,
577 C<e2char('sol')>, C<e2char('47')>,
578 C<e2char('0x2F')>, and C<e2char('057')> all return 47,
579 because C<EE<lt>solE<gt>>, C<EE<lt>47E<gt>>, C<EE<lt>0x2fE<gt>>,
580 and C<EE<lt>057E<gt>>, all mean "/", whose Unicode number is 47. If
581 the name has no known value (as with a name of "qacute") or is
582 syntactally invalid (as with a name of "1/4"), this returns undef.
584 =item $Name2character{I<name>}
586 Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
587 to the string that each stands for. Note that this does not
588 include numerics (like "64" or "x981c"). Under old Perl versions
589 (before 5.7) you get a "?" in place of characters whose Unicode
590 value is over 255.
592 =item $Name2character_number{I<name>}
594 Maps from names (as in C<EE<lt>I<name>E<gt>>) like "eacute" or "sol"
595 to the Unicode value that each stands for. For example,
596 C<$Name2character_number{'eacute'}> is 201, and
597 C<$Name2character_number{'eacute'}> is 8364. You get the correct
598 Unicode value, regardless of the version of Perl you're using --
599 which differs from C<%Name2character>'s behavior under pre-5.7 Perls.
601 Note that this hash does not
602 include numerics (like "64" or "x981c").
604 =item $Latin1Code_to_fallback{I<integer>}
606 For numbers in the range 160 (0x00A0) to 255 (0x00FF), this maps
607 from the character code for a Latin-1 character (like 233 for
608 lowercase e-acute) to the US-ASCII character that best aproximates
609 it (like "e"). You may find this useful if you are rendering
610 POD in a format that you think deals well only with US-ASCII
611 characters.
613 =item $Latin1Char_to_fallback{I<character>}
615 Just as above, but maps from characters (like "\xE9",
616 lowercase e-acute) to characters (like "e").
618 =item $Code2USASCII{I<integer>}
620 This maps from US-ASCII codes (like 32) to the corresponding
621 character (like space, for 32). Only characters 32 to 126 are
622 defined. This is meant for use by C<e2char($x)> when it senses
623 that it's running on a non-ASCII platform (where chr(32) doesn't
624 get you a space -- but $Code2USASCII{32} will). It's
625 documented here just in case you might find it useful.
627 =back
629 =head1 CAVEATS
631 On Perl versions before 5.7, Unicode characters with a value
632 over 255 (like lambda or emdash) can't be conveyed. This
633 module does work under such early Perl versions, but in the
634 place of each such character, you get a "?". Latin-1
635 characters (characters 160-255) are unaffected.
637 Under EBCDIC platforms, C<e2char($n)> may not always be the
638 same as C<chr(e2charnum($n))>, and ditto for
639 C<$Name2character{$name}> and
640 C<chr($Name2character_number{$name})>.
642 =head1 SEE ALSO
644 L<perlpod|perlpod>
646 L<perlpodspec|perlpodspec>
648 L<Text::Unidecode|Text::Unidecode>
650 =head1 COPYRIGHT AND DISCLAIMERS
652 Copyright (c) 2001-2004 Sean M. Burke. All rights reserved.
654 This library is free software; you can redistribute it and/or modify
655 it under the same terms as Perl itself.
657 This program is distributed in the hope that it will be useful, but
658 without any warranty; without even the implied warranty of
659 merchantability or fitness for a particular purpose.
661 Portions of the data tables in this module are derived from the
662 entity declarations in the W3C XHTML specification.
664 Currently (October 2001), that's these three:
666 http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent
667 http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent
668 http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent
670 =head1 AUTHOR
672 Sean M. Burke C<sburke@cpan.org>
674 =cut
676 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
677 # What I used for reading the XHTML .ent files:
679 use strict;
680 my(@norms, @good, @bad);
681 my $dir = 'c:/sgml/docbook/';
682 my %escapes;
683 foreach my $file (qw(
684 xhtml-symbol.ent
685 xhtml-lat1.ent
686 xhtml-special.ent
687 )) {
688 open(IN, "<", "$dir$file") or die "can't read-open $dir$file: $!";
689 print "Reading $file...\n";
690 while(<IN>) {
691 if(m/<!ENTITY\s+(\S+)\s+"&#([^;]+);">/) {
692 my($name, $value) = ($1,$2);
693 next if $name eq 'quot' or $name eq 'apos' or $name eq 'gt';
695 $value = hex $1 if $value =~ m/^x([a-fA-F0-9]+)$/s;
696 print "ILLEGAL VALUE $value" unless $value =~ m/^\d+$/s;
697 if($value > 255) {
698 push @good , sprintf " %-10s , chr(%s),\n", "'$name'", $value;
699 push @bad , sprintf " %-10s , \$bad,\n", "'$name'", $value;
700 } else {
701 push @norms, sprintf " %-10s , chr(%s),\n", "'$name'", $value;
703 } elsif(m/<!ENT/) {
704 print "# Skipping $_";
708 close(IN);
711 print @norms;
712 print "\n ( \$] .= 5.006001 ? (\n";
713 print @good;
714 print " ) : (\n";
715 print @bad;
716 print " )\n);\n";
718 __END__
719 #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~