Upgrade to Perl 5.8.8
[msysgit/kusma.git] / lib / perl5 / 5.8.8 / msys / Encode.pm
blobac0123c89ec2083090bde1638f91819f743a2da4
2 # $Id: Encode.pm,v 2.12 2005/09/08 14:17:17 dankogai Exp dankogai $
4 package Encode;
5 use strict;
6 our $VERSION = sprintf "%d.%02d", q$Revision: 2.12 $ =~ /(\d+)/g;
7 sub DEBUG () { 0 }
8 use XSLoader ();
9 XSLoader::load(__PACKAGE__, $VERSION);
11 require Exporter;
12 use base qw/Exporter/;
14 # Public, encouraged API is exported by default
our @EXPORT = qw(
    decode decode_utf8 encode encode_utf8
    encodings find_encoding clone_encoding
);

# Names of the individual fallback flag constants (see "The bitmask" in the
# POD below for how they combine).
our @FB_FLAGS = qw(
    DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
    PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
);

# Names of the ready-made FB_* CHECK constants.
our @FB_CONSTS = qw(
    FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
    FB_PERLQQ FB_HTMLCREF FB_XMLCREF
);

# Exported on request only: the helper functions plus every fallback name.
our @EXPORT_OK = (
    qw(
        _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
        is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
    ),
    @FB_FLAGS, @FB_CONSTS,
);

# Import tags: ":all", ":fallbacks" and ":fallback_all".
our %EXPORT_TAGS = (
    all          => [ @EXPORT, @EXPORT_OK ],
    fallbacks    => [ @FB_CONSTS ],
    fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
);
42 # Documentation moved after __END__ for speed - NI-S
44 our $ON_EBCDIC = (ord("A") == 193);
46 use Encode::Alias;
48 # Make a %Encoding package variable to allow a certain amount of cheating
49 our %Encoding;
50 our %ExtModule;
51 require Encode::Config;
52 eval { require Encode::ConfigLocal };
# Class method: returns the case-insensitively sorted names of the known
# encodings.
#
#   Encode->encodings()            names registered in %Encoding
#   Encode->encodings(":all")      the above plus every name in %ExtModule
#   Encode->encodings("JP", ...)   the above plus the %ExtModule names that
#                                  belong to the listed modules; a module
#                                  name without "::" gets "Encode::" prepended
#
# The names "Internal", "Unicode" and "Guess" are never reported.
sub encodings {
    my $class  = shift;
    my %listed = %Encoding;

    if (@_ and $_[0] eq ":all") {
        # Overlay the not-yet-loaded encodings on top of the loaded ones.
        @listed{ keys %ExtModule } = values %ExtModule;
    }
    else {
        for my $wanted (@_) {
            my $mod = $wanted =~ m/::/ ? $wanted : "Encode::$wanted";
            DEBUG and warn $mod;
            for my $enc (grep { $ExtModule{$_} eq $mod } keys %ExtModule) {
                $listed{$enc} = $mod;
            }
        }
    }

    my @visible = grep { !/^(?:Internal|Unicode|Guess)$/ } keys %listed;
    return sort { lc $a cmp lc $b } @visible;
}
# Reports whether an encoding can be used as a PerlIO layer.
#
# Argument: an encoding object, or an encoding name that is resolved through
# find_encoding().
# Returns:  whatever the object's own perlio_ok() method returns, or 0 when
#           the name does not resolve to an encoding or the object has no
#           perlio_ok() method.
sub perlio_ok {
    my $obj = ref($_[0]) ? $_[0] : find_encoding($_[0]);

    # find_encoding() yields undef for an unknown name; calling ->can on that
    # would die, so report "not PerlIO-capable" instead.
    return 0 unless defined $obj;

    $obj->can("perlio_ok") and return $obj->perlio_ok();
    return 0;    # safety net
}
# Registers an encoding object under a canonical name.
#
#   define_encoding($obj, $name, @aliases)
#
# Stores $obj in %Encoding under $name, aliases the lower-cased name to it
# when that differs from $name, then defines every additional alias given.
# Returns $obj.
sub define_encoding {
    my ($obj, $name, @aliases) = @_;

    $Encoding{$name} = $obj;

    my $lower = lc($name);
    define_alias($lower => $obj) if $lower ne $name;

    for my $alias (@aliases) {
        define_alias($alias, $obj);
    }
    return $obj;
}
# Class method that resolves a name (or object) to an encoding object.
#
#   $class->getEncoding($name [, $skip_external])
#
# Lookup order:
#   1. a reference whose class provides renew() is returned as-is;
#   2. %Encoding under $name, then under lc($name);
#   3. $class->find_alias() for $name, then for lc($name);
#   4. unless $skip_external is true, the module listed in %ExtModule for
#      $name or lc($name) is loaded and %Encoding is consulted again.
# Returns the encoding object, or nothing (bare return) when no match exists.
sub getEncoding {
    my ($class, $name, $skip_external) = @_;

    ref($name) && $name->can('renew') and return $name;
    exists $Encoding{$name} and return $Encoding{$name};
    my $lc = lc $name;
    exists $Encoding{$lc} and return $Encoding{$lc};

    my $oc = $class->find_alias($name);
    defined($oc) and return $oc;
    $lc ne $name and $oc = $class->find_alias($lc);
    defined($oc) and return $oc;

    unless ($skip_external) {
        if (my $mod = $ExtModule{$name} || $ExtModule{$lc}) {
            $mod =~ s,::,/,g;
            $mod .= '.pm';

            # Best effort: a module that fails to load simply leaves the
            # encoding undefined.
            eval { require $mod; };

            exists $Encoding{$name} and return $Encoding{$name};

            # The module may have been located through the lower-cased name,
            # so accept a registration under that spelling as well.
            exists $Encoding{$lc} and return $Encoding{$lc};
        }
    }
    return;
}
# Functional front end to getEncoding().
#
#   find_encoding($name [, $skip_external])
#
# Returns whatever __PACKAGE__->getEncoding() returns for the same arguments.
sub find_encoding($;$) {
    my $name          = shift;
    my $skip_external = shift;
    return __PACKAGE__->getEncoding($name, $skip_external);
}
# Returns the name() of the encoding object that find_encoding() finds for
# the given name or alias, or nothing when the lookup fails.
sub resolve_alias($) {
    my $found = find_encoding(shift);
    return unless defined $found;
    return $found->name;
}
# Returns a deep copy (Storable::dclone) of the encoding object found for the
# given name. Returns nothing when the lookup does not produce a reference
# or when Storable cannot be loaded.
sub clone_encoding($) {
    my $template = find_encoding(shift);
    return unless ref $template;

    eval { require Storable };
    return if $@;

    return Storable::dclone($template);
}
# Encodes a string into octets.
#
#   $octets = encode(ENCODING, $string [, CHECK])
#
# ENCODING is resolved with find_encoding(); an unknown encoding croaks with
# "Unknown encoding 'NAME'". Returns undef when $string is undef, otherwise
# the value returned by the encoding object's encode() method.
#
# With a true numeric CHECK that lacks the LEAVE_SRC bit, the caller's $string
# is replaced by the copy handed to the encoding object. A code reference as
# CHECK never modifies the caller's variable.
sub encode($$;$) {
    my ($name, $string, $check) = @_;
    return undef unless defined $string;
    $string .= '' if ref $string;    # stringify;
    $check ||= 0;

    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $octets = $enc->encode($string, $check);

    # A coderef CHECK must not reach the bitwise test: "&" would operate on
    # the reference's numeric address and decide arbitrarily.
    $_[1] = $string
        if $check and !ref $check and !($check & LEAVE_SRC());
    return $octets;
}
# Decodes octets into a string.
#
#   $string = decode(ENCODING, $octets [, CHECK])
#
# ENCODING is resolved with find_encoding(); an unknown encoding croaks with
# "Unknown encoding 'NAME'". Returns undef when $octets is undef, otherwise
# the value returned by the encoding object's decode() method.
#
# With a true numeric CHECK that lacks the LEAVE_SRC bit, the caller's $octets
# is replaced by the copy handed to the encoding object. A code reference as
# CHECK never modifies the caller's variable.
sub decode($$;$) {
    my ($name, $octets, $check) = @_;
    return undef unless defined $octets;
    $octets .= '' if ref $octets;    # stringify
    $check ||= 0;

    my $enc = find_encoding($name);
    unless (defined $enc) {
        require Carp;
        Carp::croak("Unknown encoding '$name'");
    }

    my $string = $enc->decode($octets, $check);

    # A coderef CHECK must not reach the bitwise test: "&" would operate on
    # the reference's numeric address and decide arbitrarily.
    $_[1] = $octets
        if $check and !ref $check and !($check & LEAVE_SRC());
    return $string;
}
# Converts octets from one encoding to another, in place.
#
#   [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
#
# Both encodings are resolved with find_encoding(); an unknown one croaks with
# "Unknown encoding 'NAME'" (FROM_ENC is checked first). On success the
# caller's first argument is overwritten with the converted octets and their
# length is returned.
#
# Returns undef when the input is undef, when CHECK is true and either
# conversion step leaves its input non-empty, or when the final result is
# undef. In the CHECK failure cases the caller's variable is left untouched.
sub from_to($$$;$) {
    my ($data, $from_name, $to_name, $check) = @_;
    return undef unless defined $data;
    $check ||= 0;

    # Resolve source first, then target, croaking on the first unknown name.
    my @codec;
    for my $label ($from_name, $to_name) {
        my $found = find_encoding($label);
        if (!defined $found) {
            require Carp;
            Carp::croak("Unknown encoding '$label'");
        }
        push @codec, $found;
    }
    my ($source, $target) = @codec;

    my $chars = $source->decode($data, $check);
    return undef if $check && length($data);

    $data = $target->encode($chars, $check);
    return undef if $check && length($chars);

    $_[0] = $data;
    return defined($data) ? length($data) : undef;
}
# Returns a copy of the argument after running utf8::encode() on it.
# The caller's variable is not modified.
sub encode_utf8($) {
    my $octets = $_[0];
    utf8::encode($octets);
    return $octets;
}
# Shorthand for decode("utf8", $octets [, CHECK]).
#
# decode() treats a false CHECK as 0, so one call covers both the "with
# CHECK" and "without CHECK" cases. decode() works on this sub's private
# copy, so the caller's variable is never modified here, whatever CHECK is.
sub decode_utf8($;$) {
    my ($str, $check) = @_;
    return decode("utf8", $str, $check);
}
211 predefine_encodings(1);
214 # This is to restore %Encoding if really needed;
# Installs the built-in encoding objects into %Encode::Encoding.
#
#   predefine_encodings($use_xs)
#
# * "Unicode": on EBCDIC platforms an Encode::UTF_EBCDIC object, otherwise an
#   Encode::Internal object. Both classes get Encode::Encoding pushed onto
#   @ISA and their encode/decode methods defined here.
# * "utf8" and "utf-8-strict": Encode::utf8 objects. With a true $use_xs the
#   class's decode/encode are bound to decode_xs/encode_xs; otherwise to
#   wrappers around Encode::decode_utf8 / Encode::encode_utf8.
#
# The methods defined here empty the caller's source argument ($_[1]) whenever
# CHECK is true.
sub predefine_encodings {
    use Encode::Encoding;
    no warnings 'redefine';
    my $want_xs = shift;

    if ($ON_EBCDIC) {
        # was in Encode::UTF_EBCDIC
        package Encode::UTF_EBCDIC;
        push @Encode::UTF_EBCDIC::ISA, 'Encode::Encoding';

        # Maps each character through utf8::unicode_to_native().
        *decode = sub {
            my ($self, $text, $check) = @_;
            my $native = join '',
                map { chr(utf8::unicode_to_native(ord($_))) }
                split //, $text;
            $_[1] = '' if $check;
            return $native;
        };

        # Maps each character through utf8::native_to_unicode().
        *encode = sub {
            my ($self, $text, $check) = @_;
            my $unicode = join '',
                map { chr(utf8::native_to_unicode(ord($_))) }
                split //, $text;
            $_[1] = '' if $check;
            return $unicode;
        };

        $Encode::Encoding{Unicode} =
            bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
    }
    else {
        package Encode::Internal;
        push @Encode::Internal::ISA, 'Encode::Encoding';

        # Returns an upgraded copy of the input; encode is the same sub.
        *decode = sub {
            my ($self, $text, $check) = @_;
            utf8::upgrade($text);
            $_[1] = '' if $check;
            return $text;
        };
        *encode = \&decode;

        $Encode::Encoding{Unicode} =
            bless { Name => "Internal" } => "Encode::Internal";
    }

    {
        # was in Encode::utf8
        package Encode::utf8;
        push @Encode::utf8::ISA, 'Encode::Encoding';

        if ($want_xs) {
            Encode::DEBUG and warn __PACKAGE__, " XS on";
            *decode = \&decode_xs;
            *encode = \&encode_xs;
        }
        else {
            Encode::DEBUG and warn __PACKAGE__, " XS off";

            # NOTE(review): Encode::decode_utf8() itself goes through the
            # "utf8" encoding object's decode method, so this fallback looks
            # self-recursive; the only call in this file passes a true
            # argument, so it is not exercised here - confirm before use.
            *decode = sub {
                my ($self, $octets, $check) = @_;
                my $text = Encode::decode_utf8($octets);
                return undef unless defined $text;
                $_[1] = '' if $check;
                return $text;
            };
            *encode = sub {
                my ($self, $text, $check) = @_;
                my $octets = Encode::encode_utf8($text);
                $_[1] = '' if $check;
                return $octets;
            };
        }

        # Appends the part of $src from $pos up to and including the first
        # occurrence of $trm to $dst, and advances $pos past it. Returns 1
        # when the terminator was found; otherwise appends the rest of $src,
        # sets $pos to its length and returns ''. $dst and $pos are modified
        # in the caller. Offsets are byte offsets (use bytes).
        *cat_decode = sub {    # ($obj, $dst, $src, $pos, $trm, $chk)
            my ($self, undef, undef, $start, $terminator) = @_;    # currently ignores $chk
            my ($dst_ref, $src_ref, $pos_ref) = \@_[1, 2, 3];
            use bytes;

            my $hit = index($$src_ref, $terminator, $start);
            if ($hit < 0) {
                $$dst_ref .= substr($$src_ref, $start);
                $$pos_ref = length($$src_ref);
                return '';
            }

            my $stop = $hit + length($terminator);
            $$dst_ref .= substr($$src_ref, $start, $stop - $start);
            $$pos_ref = $stop;
            return 1;
        };

        $Encode::Encoding{utf8} =
            bless { Name => "utf8" } => "Encode::utf8";
        $Encode::Encoding{"utf-8-strict"} =
            bless { Name => "utf-8-strict", strict_utf8 => 1 } => "Encode::utf8";
    }
}
310 __END__
312 =head1 NAME
314 Encode - character encodings
316 =head1 SYNOPSIS
318 use Encode;
320 =head2 Table of Contents
322 Encode consists of a collection of modules whose details are too big
323 to fit in one document. This POD itself explains the top-level APIs
324 and general topics at a glance. For other topics and more details,
325 see the PODs below:
327 Name Description
328 --------------------------------------------------------
329 Encode::Alias Alias definitions to encodings
330 Encode::Encoding Encode Implementation Base Class
331 Encode::Supported List of Supported Encodings
332 Encode::CN Simplified Chinese Encodings
333 Encode::JP Japanese Encodings
334 Encode::KR Korean Encodings
335 Encode::TW Traditional Chinese Encodings
336 --------------------------------------------------------
338 =head1 DESCRIPTION
340 The C<Encode> module provides the interfaces between Perl's strings
341 and the rest of the system. Perl strings are sequences of
342 B<characters>.
344 The repertoire of characters that Perl can represent is at least that
345 defined by the Unicode Consortium. On most platforms the ordinal
346 value of a character (as returned by C<ord(ch)>) is the "Unicode
347 codepoint" for that character (the exceptions are those platforms where
348 the legacy encoding is some variant of EBCDIC rather than a super-set
349 of ASCII - see L<perlebcdic>).
351 Traditionally, computer data has been moved around in 8-bit chunks
352 often called "bytes". These chunks are also known as "octets" in
353 networking standards. Perl is widely used to manipulate data of many
354 types - not only strings of characters representing human or computer
355 languages but also "binary" data being the machine's representation of
356 numbers, pixels in an image - or just about anything.
358 When Perl is processing "binary data", the programmer wants Perl to
359 process "sequences of bytes". This is not a problem for Perl - as a
360 byte has 256 possible values, it easily fits in Perl's much larger
361 "logical character".
363 =head2 TERMINOLOGY
365 =over 2
367 =item *
369 I<character>: a character in the range 0..(2**32-1) (or more).
370 (What Perl's strings are made of.)
372 =item *
374 I<byte>: a character in the range 0..255
375 (A special case of a Perl character.)
377 =item *
379 I<octet>: 8 bits of data, with ordinal values 0..255
380 (Term for bytes passed to or from a non-Perl context, e.g. a disk file.)
382 =back
384 =head1 PERL ENCODING API
386 =over 2
388 =item $octets = encode(ENCODING, $string [, CHECK])
390 Encodes a string from Perl's internal form into I<ENCODING> and returns
391 a sequence of octets. ENCODING can be either a canonical name or
392 an alias. For encoding names and aliases, see L</"Defining Aliases">.
393 For CHECK, see L</"Handling Malformed Data">.
395 For example, to convert a string from Perl's internal format to
396 iso-8859-1 (also known as Latin1),
398 $octets = encode("iso-8859-1", $string);
400 B<CAVEAT>: When you run C<$octets = encode("utf8", $string)>, then $octets
401 B<may not be equal to> $string. Though they both contain the same data, the utf8 flag
402 for $octets is B<always> off. When you encode anything, utf8 flag of
403 the result is always off, even when it contains completely valid utf8
404 string. See L</"The UTF-8 flag"> below.
406 If the $string is C<undef> then C<undef> is returned.
408 =item $string = decode(ENCODING, $octets [, CHECK])
410 Decodes a sequence of octets assumed to be in I<ENCODING> into Perl's
411 internal form and returns the resulting string. As in encode(),
412 ENCODING can be either a canonical name or an alias. For encoding names
413 and aliases, see L</"Defining Aliases">. For CHECK, see
414 L</"Handling Malformed Data">.
416 For example, to convert ISO-8859-1 data to a string in Perl's internal format:
418 $string = decode("iso-8859-1", $octets);
420 B<CAVEAT>: When you run C<$string = decode("utf8", $octets)>, then $string
421 B<may not be equal to> $octets. Though they both contain the same data,
422 the utf8 flag for $string is on unless $octets entirely consists of
423 ASCII data (or EBCDIC on EBCDIC machines). See L</"The UTF-8 flag">
424 below.
426 If $octets is C<undef> then C<undef> is returned.
428 =item [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
430 Converts B<in-place> data between two encodings. The data in $octets
431 must be encoded as octets and not as characters in Perl's internal
432 format. For example, to convert ISO-8859-1 data to Microsoft's CP1250
433 encoding:
435 from_to($octets, "iso-8859-1", "cp1250");
437 and to convert it back:
439 from_to($octets, "cp1250", "iso-8859-1");
441 Note that because the conversion happens in place, the data to be
442 converted cannot be a string constant; it must be a scalar variable.
444 from_to() returns the length of the converted string in octets on
445 success, I<undef> on error.
447 B<CAVEAT>: The following operations look the same but are not quite so;
449 from_to($data, "iso-8859-1", "utf8"); #1
450 $data = decode("iso-8859-1", $data); #2
452 Both #1 and #2 make $data consist of a completely valid UTF-8 string
453 but only #2 turns utf8 flag on. #1 is equivalent to
455 $data = encode("utf8", decode("iso-8859-1", $data));
457 See L</"The UTF-8 flag"> below.
459 =item $octets = encode_utf8($string);
461 Equivalent to C<$octets = encode("utf8", $string);> The characters
462 that comprise $string are encoded in Perl's internal format and the
463 result is returned as a sequence of octets. All possible
464 characters have a UTF-8 representation so this function cannot fail.
467 =item $string = decode_utf8($octets [, CHECK]);
469 Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
470 The sequence of octets represented by
471 $octets is decoded from UTF-8 into a sequence of logical
472 characters. Not all sequences of octets form valid UTF-8 encodings, so
473 it is possible for this call to fail. For CHECK, see
474 L</"Handling Malformed Data">.
476 =back
478 =head2 Listing available encodings
480 use Encode;
481 @list = Encode->encodings();
483 Returns a list of the canonical names of the available encodings that
484 are loaded. To get a list of all available encodings including the
485 ones that are not loaded yet, say
487 @all_encodings = Encode->encodings(":all");
489 Or you can give the name of a specific module.
491 @with_jp = Encode->encodings("Encode::JP");
493 When "::" is not in the name, "Encode::" is assumed.
495 @ebcdic = Encode->encodings("EBCDIC");
497 To find out in detail which encodings are supported by this package,
498 see L<Encode::Supported>.
500 =head2 Defining Aliases
502 To add a new alias to a given encoding, use:
504 use Encode;
505 use Encode::Alias;
506 define_alias(newName => ENCODING);
508 After that, newName can be used as an alias for ENCODING.
509 ENCODING may be either the name of an encoding or an
510 I<encoding object>.
512 But before you do so, make sure the alias is nonexistent with
513 C<resolve_alias()>, which returns the canonical name thereof.
514 i.e.
516 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
517 Encode::resolve_alias("iso-8859-12") # false; nonexistent
518 Encode::resolve_alias($name) eq $name # true if $name is canonical
520 resolve_alias() does not need C<use Encode::Alias>; it can be
521 exported via C<use Encode qw(resolve_alias)>.
523 See L<Encode::Alias> for details.
525 =head1 Encoding via PerlIO
527 If your perl supports I<PerlIO> (which is the default), you can use a PerlIO layer to decode
528 and encode directly via a filehandle. The following two examples
529 are totally identical in their functionality.
531 # via PerlIO
532 open my $in, "<:encoding(shiftjis)", $infile or die;
533 open my $out, ">:encoding(euc-jp)", $outfile or die;
534 while(<$in>){ print $out $_; }
536 # via from_to
537 open my $in, "<", $infile or die;
538 open my $out, ">", $outfile or die;
539 while(<$in>){
540 from_to($_, "shiftjis", "euc-jp", 1);
541 print $out $_;
542 }
544 Unfortunately, not all encodings are PerlIO-savvy. You can check
545 if your encoding is supported by PerlIO by calling the C<perlio_ok>
546 method.
548 Encode::perlio_ok("hz"); # False
549 find_encoding("euc-cn")->perlio_ok; # True where PerlIO is available
551 use Encode qw(perlio_ok); # exported upon request
552 perlio_ok("euc-jp")
554 Fortunately, all encodings that come with Encode core are PerlIO-savvy
555 except for hz and ISO-2022-kr. For gory details, see
556 L<Encode::Encoding> and L<Encode::PerlIO>.
558 =head1 Handling Malformed Data
560 The optional I<CHECK> argument tells Encode what to do when it
561 encounters malformed data. Without CHECK, Encode::FB_DEFAULT ( == 0 )
562 is assumed.
564 As of version 2.12 Encode supports coderef values for CHECK. See below.
566 =over 2
568 =item B<NOTE:> Not all encodings support this feature
570 Some encodings ignore the I<CHECK> argument. For example,
571 L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
573 =back
575 Now here is the list of I<CHECK> values available
577 =over 2
579 =item I<CHECK> = Encode::FB_DEFAULT ( == 0)
581 If I<CHECK> is 0, (en|de)code will put a I<substitution character> in
582 place of a malformed character. When you encode, E<lt>subcharE<gt>
583 will be used. When you decode the code point C<0xFFFD> is used. If
584 the data is supposed to be UTF-8, an optional lexical warning
585 (category utf8) is given.
587 =item I<CHECK> = Encode::FB_CROAK ( == 1)
589 If I<CHECK> is 1, methods will die on error immediately with an error
590 message. Therefore, when I<CHECK> is set to 1, you should trap the
591 error with eval{} unless you really want to let it die.
593 =item I<CHECK> = Encode::FB_QUIET
595 If I<CHECK> is set to Encode::FB_QUIET, (en|de)code will immediately
596 return the portion of the data that has been processed so far when an
597 error occurs. The data argument will be overwritten with everything
598 after that point (that is, the unprocessed part of data). This is
599 handy when you have to call decode repeatedly in the case where your
600 source data may contain partial multi-byte character sequences,
601 (i.e. you are reading with a fixed-width buffer). Here is a sample
602 code that does exactly this:
604 my $buffer = ''; my $string = '';
605 while(read $fh, $buffer, 256, length($buffer)){
606 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
607 # $buffer now contains the unprocessed partial character
608 }
610 =item I<CHECK> = Encode::FB_WARN
612 This is the same as above, except that it warns on error. Handy when
613 you are debugging the mode above.
615 =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
617 =item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
619 =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
621 For encodings that are implemented by Encode::XS, CHECK ==
622 Encode::FB_PERLQQ turns (en|de)code into C<perlqq> fallback mode.
624 When you decode, C<\xI<HH>> will be inserted for a malformed character,
625 where I<HH> is the hex representation of the octet that could not be
626 decoded to utf8. And when you encode, C<\x{I<HHHH>}> will be inserted,
627 where I<HHHH> is the Unicode ID of the character that cannot be found
628 in the character repertoire of the encoding.
630 HTML/XML character reference modes are about the same, in place of
631 C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number and
632 XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
634 In Encode 2.10 or later, C<LEAVE_SRC> is also implied.
636 =item The bitmask
638 These modes are actually set via a bitmask. Here is how the FB_XX
639 constants are laid out. You can import the FB_XX constants via
640 C<use Encode qw(:fallbacks)>; you can import the generic bitmask
641 constants via C<use Encode qw(:fallback_all)>.
643                       FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
644 DIE_ON_ERR    0x0001             X
645 WARN_ON_ERR   0x0002                               X
646 RETURN_ON_ERR 0x0004                      X        X
647 LEAVE_SRC     0x0008                                       X
648 PERLQQ        0x0100                                       X
649 HTMLCREF      0x0200
650 XMLCREF       0x0400
652 =back
654 =head2 coderef for CHECK
656 As of Encode 2.12 CHECK can also be a code reference which takes the
657 ord value of an unmapped character as an argument and returns a string
658 that represents the fallback character. For instance,
660 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
662 Acts like FB_PERLQQ but E<lt>U+I<XXXX>E<gt> is used instead of
663 \x{I<XXXX>}.
665 =head1 Defining Encodings
667 To define a new encoding, use:
669 use Encode qw(define_encoding);
670 define_encoding($object, 'canonicalName' [, alias...]);
672 I<canonicalName> will be associated with I<$object>. The object
673 should provide the interface described in L<Encode::Encoding>.
674 If more than two arguments are provided then additional
675 arguments are taken as aliases for I<$object>.
677 See L<Encode::Encoding> for more details.
679 =head1 The UTF-8 flag
681 Before the introduction of utf8 support in perl, the C<eq> operator
682 just compared the strings represented by two scalars. Beginning with
683 perl 5.8, C<eq> compares two strings with simultaneous consideration
684 of I<the utf8 flag>. To explain why we made it so, I will quote page
685 402 of C<Programming Perl, 3rd ed.>
687 =over 2
689 =item Goal #1:
691 Old byte-oriented programs should not spontaneously break on the old
692 byte-oriented data they used to work on.
694 =item Goal #2:
696 Old byte-oriented programs should magically start working on the new
697 character-oriented data when appropriate.
699 =item Goal #3:
701 Programs should run just as fast in the new character-oriented mode
702 as in the old byte-oriented mode.
704 =item Goal #4:
706 Perl should remain one language, rather than forking into a
707 byte-oriented Perl and a character-oriented Perl.
709 =back
711 Back when C<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0
712 had been released, and many features documented in the book remained
713 unimplemented for a long time. Perl 5.8 corrected this, and the introduction
714 of the UTF-8 flag is one of those corrections. You can think of the flag as
715 selecting between a byte-oriented mode (utf8 flag off) and a
716 character-oriented mode (utf8 flag on).
718 Here is how Encode takes care of the utf8 flag.
720 =over 2
722 =item *
724 When you encode, the resulting utf8 flag is always off.
726 =item *
728 When you decode, the resulting utf8 flag is on unless you can
729 unambiguously represent data. Here is the definition of
730 dis-ambiguity.
732 After C<$utf8 = decode('foo', $octet);>,
734 When $octet is... The utf8 flag in $utf8 is
735 ---------------------------------------------
736 In ASCII only (or EBCDIC only) OFF
737 In ISO-8859-1 ON
738 In any other Encoding ON
739 ---------------------------------------------
741 As you see, there is one exception: ASCII. That way you can assume
742 Goal #1. And with Encode, Goal #2 is assumed, but you still have to be
743 careful in the cases mentioned in the B<CAVEAT> paragraphs.
745 This utf8 flag is not visible in perl scripts, exactly for the same
746 reason you cannot (or you I<don't have to>) see if a scalar contains a
747 string, integer, or floating point number. But you can still peek
748 and poke these if you will. See the section below.
750 =back
752 =head2 Messing with Perl's Internals
754 The following API uses parts of Perl's internals in the current
755 implementation. As such, they are efficient but may change.
757 =over 2
759 =item is_utf8(STRING [, CHECK])
761 [INTERNAL] Tests whether the UTF-8 flag is turned on in the STRING.
762 If CHECK is true, also checks the data in STRING for being well-formed
763 UTF-8. Returns true if successful, false otherwise.
765 As of perl 5.8.1, L<utf8> also has utf8::is_utf8().
767 =item _utf8_on(STRING)
769 [INTERNAL] Turns on the UTF-8 flag in STRING. The data in STRING is
770 B<not> checked for being well-formed UTF-8. Do not use unless you
771 B<know> that the STRING is well-formed UTF-8. Returns the previous
772 state of the UTF-8 flag (so please don't treat the return value as
773 indicating success or failure), or C<undef> if STRING is not a string.
775 =item _utf8_off(STRING)
777 [INTERNAL] Turns off the UTF-8 flag in STRING. Do not use frivolously.
778 Returns the previous state of the UTF-8 flag (so please don't treat the
779 return value as indicating success or failure), or C<undef> if STRING is
780 not a string.
782 =back
784 =head1 UTF-8 vs. utf8
786 ....We now view strings not as sequences of bytes, but as sequences
787 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
788 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
790 That has been perl's notion of UTF-8, but official UTF-8 is
791 stricter: its range is much narrower (0 .. 10FFFF), and some sequences are
792 not allowed (e.g. those used in surrogate pairs, 0xFFFE, et al).
794 Now that is overruled by Larry Wall himself.
796 From: Larry Wall <larry@wall.org>
797 Date: December 04, 2004 11:51:58 JST
798 To: perl-unicode@perl.org
799 Subject: Re: Make Encode.pm support the real UTF-8
800 Message-Id: <20041204025158.GA28754@wall.org>
802 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
803 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
804 : but "UTF-8" is the name of the standard and should give the
805 : corresponding behaviour.
807 For what it's worth, that's how I've always kept them straight in my
808 head.
810 Also for what it's worth, Perl 6 will mostly default to strict but
811 make it easy to switch back to lax.
813 Larry
815 Do you copy? As of Perl 5.8.7, B<UTF-8> means strict, official UTF-8
816 while B<utf8> means a liberal, lax version thereof. And Encode version
817 2.10 or later thus groks the difference between C<UTF-8> and C<utf8>.
819 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
820 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
822 C<UTF-8> in Encode is actually a canonical name for C<utf-8-strict>.
823 Yes, the hyphen between "UTF" and "8" is important. Without it Encode
824 goes "liberal".
826 find_encoding("UTF-8")->name # is 'utf-8-strict'
827 find_encoding("utf-8")->name # ditto. names are case insensitive
828 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
829 find_encoding("UTF8")->name # is 'utf8'.
832 =head1 SEE ALSO
834 L<Encode::Encoding>,
835 L<Encode::Supported>,
836 L<Encode::PerlIO>,
837 L<encoding>,
838 L<perlebcdic>,
839 L<perlfunc/open>,
840 L<perlunicode>,
841 L<utf8>,
842 the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
844 =head1 MAINTAINER
846 This project was originated by Nick Ing-Simmons and later maintained
847 by Dan Kogai E<lt>dankogai@dan.co.jpE<gt>. See AUTHORS for a full
848 list of people involved. For any questions, use
849 E<lt>perl-unicode@perl.orgE<gt> so we can all share.
851 =cut