3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with Koha; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 use MARC
::Charset qw
/marc8_to_utf8/;
27 use Unicode
::Normalize
;
29 use vars
qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
32 # set the version for version checking
33 $VERSION = 3.07.00.049;
49 C4::Charset - utilities for handling character set conversions.
57 This module contains routines for dealing with character set
58 conversions, particularly for MARC records.
60 A variety of character encodings are in use by various MARC
61 standards, and even more character encodings are used by
62 non-standard MARC records. The various MARC formats generally
63 do not do a good job of advertising a given record's character
64 encoding, and even when a record does advertise its encoding,
65 e.g., via the Leader/09, experience has shown that one cannot
68 Ultimately, all MARC records are stored in Koha in UTF-8 and
69 must be converted from whatever the source character encoding is.
70 The goal of this module is to ensure that these conversions
71 take place accurately. When a character conversion cannot take
72 place, or at least not accurately, the module was provide
73 enough information to allow user-facing code to inform the user
74 on how to deal with the situation.
80 =head2 IsStringUTF8ish
82 my $is_utf8 = IsStringUTF8ish($str);
84 Determines if C<$str> is valid UTF-8. This can mean
91 The Perl UTF-8 flag is set and the string contains valid UTF-8.
95 The Perl UTF-8 flag is B<not> set, but the octets contain
100 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
101 because in one could be presented with a MARC blob that is
102 not actually in UTF-8 but whose sequence of octets appears to be
103 valid UTF-8. The rest of the MARC character conversion functions
104 will assume that this situation occur does not very often.
108 sub IsStringUTF8ish
{
111 return 1 if utf8
::is_utf8
($str);
112 return utf8
::decode
($str);
117 my $marc_record = SetUTF8Flag($marc_record, $nfd);
119 This function sets the PERL UTF8 flag for data.
120 It is required when using new_from_usmarc
121 since MARC::File::USMARC does not handle PERL UTF8 setting.
122 When editing unicode marc records fields and subfields, you
123 would end up in double encoding without using this function.
125 If $nfd is set, string normalization will use NFD instead of NFC
128 In my opinion, this function belongs to MARC::Record and not
130 But since it handles charset, and MARC::Record, it finds its way in that package
135 my ($record, $nfd)=@_;
136 return unless ($record && $record->fields());
137 foreach my $field ($record->fields()){
138 if ($field->tag()>=10){
140 foreach my $subfield ($field->subfields()){
141 push @subfields,($$subfield[0],NormalizeString
($$subfield[1],$nfd));
144 my $newfield=MARC
::Field
->new(
146 $field->indicator(1),
147 $field->indicator(2),
150 $field->replace_with($newfield);
152 warn "ERROR occurred in SetUTF8Flag $@" if $@
;
157 =head2 NormalizeString
159 my $normalized_string=NormalizeString($string,$nfd,$transform);
162 nfd : If you want to set NFD and not NFC
163 transform : If you expect all the signs to be removed
165 Sets the PERL UTF8 Flag on your initial data if need be
166 and applies cleaning if required
168 Returns a utf8 NFC normalized string
171 my $string=NormalizeString ("l'ornithoptère");
172 #results into ornithoptère in NFC form and sets UTF8 Flag
178 my ($string,$nfd,$transform)=@_;
179 utf8
::decode
($string) unless (utf8
::is_utf8
($string));
181 $string= NFD
($string);
184 $string=NFC
($string);
187 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g
;
188 #removing one letter words "d'" "l'" was changed into "d " "l "
189 $string=~s/\b\S\b//g;
195 =head2 MarcToUTF8Record
197 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
198 $marc_flavour, [, $source_encoding]);
200 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
201 optional source encoding, return a C<MARC::Record> that is
204 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
205 is not guaranteed to have been converted correctly. Specifically,
206 if C<$converted_from> is 'failed', the MARC record returned failed
207 character conversion and had each of its non-ASCII octets changed
208 to the Unicode replacement character.
210 If the source encoding was not specified, this routine will
211 try to guess it; the character encoding used for a successful
212 conversion is returned in C<$converted_from>.
216 sub MarcToUTF8Record
{
218 my $marc_flavour = shift;
219 my $source_encoding = shift;
221 my $marc_blob_is_utf8 = 0;
222 if (ref($marc) eq 'MARC::Record') {
223 my $marc_blob = $marc->as_usmarc();
224 $marc_blob_is_utf8 = IsStringUTF8ish
($marc_blob);
225 $marc_record = $marc;
227 # dealing with a MARC blob
229 # remove any ersatz whitespace from the beginning and
230 # end of the MARC blob -- these can creep into MARC
231 # files produced by several sources -- caller really
232 # should be doing this, however
235 $marc_blob_is_utf8 = IsStringUTF8ish
($marc);
237 $marc_record = MARC
::Record
->new_from_usmarc($marc);
240 # if we fail the first time, one likely problem
241 # is that we have a MARC21 record that says that it's
242 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
243 # We'll try parsing it again.
244 substr($marc, 9, 1) = ' ';
246 $marc_record = MARC
::Record
->new_from_usmarc($marc);
249 # it's hopeless; return an empty MARC::Record
250 return MARC
::Record
->new(), 'failed', ['could not parse MARC blob'];
255 # If we do not know the source encoding, try some guesses
257 # 1. Record is UTF-8 already.
258 # 2. If MARC flavor is MARC21 or NORMARC, then
259 # a. record is MARC-8
260 # b. record is ISO-8859-1
261 # 3. If MARC flavor is UNIMARC, then
262 if (not defined $source_encoding) {
263 if ($marc_blob_is_utf8) {
264 # note that for MARC21/NORMARC we are not bothering to check
265 # if the Leader/09 is set to 'a' or not -- because
266 # of problems with various ILSs (including Koha in the
267 # past, alas), this just is not trustworthy.
268 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
269 return $marc_record, 'UTF-8', [];
271 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
272 return _default_marc21_charconv_to_utf8
($marc_record, $marc_flavour);
273 } elsif ($marc_flavour =~/UNIMARC/) {
274 return _default_unimarc_charconv_to_utf8
($marc_record, $marc_flavour);
276 return _default_marc21_charconv_to_utf8
($marc_record, $marc_flavour);
280 # caller knows the character encoding
281 my $original_marc_record = $marc_record->clone();
283 if ($source_encoding =~ /utf-?8/i) {
284 if ($marc_blob_is_utf8) {
285 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
286 return $marc_record, 'UTF-8', [];
288 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
290 } elsif ($source_encoding =~ /marc-?8/i) {
291 @errors = _marc_marc8_to_utf8
($marc_record, $marc_flavour);
292 } elsif ($source_encoding =~ /5426/) {
293 @errors = _marc_iso5426_to_utf8
($marc_record, $marc_flavour);
295 # assume any other character encoding is for Text::Iconv
296 @errors = _marc_to_utf8_via_text_iconv
($marc_record, $marc_flavour, $source_encoding);
300 _marc_to_utf8_replacement_char
($original_marc_record, $marc_flavour);
301 return $original_marc_record, 'failed', \
@errors;
303 return $marc_record, $source_encoding, [];
309 =head2 SetMarcUnicodeFlag
311 SetMarcUnicodeFlag($marc_record, $marc_flavour);
313 Set both the internal MARC::Record encoding flag
314 and the appropriate Leader/09 (MARC21) or
315 100/26-29 (UNIMARC) to indicate that the record
316 is in UTF-8. Note that this does B<not> do
317 any actual character conversion.
321 sub SetMarcUnicodeFlag
{
322 my $marc_record = shift;
323 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
325 $marc_record->encoding('UTF-8');
326 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
327 my $leader = $marc_record->leader();
328 substr($leader, 9, 1) = 'a';
329 $marc_record->leader($leader);
330 } elsif ($marc_flavour =~/UNIMARC/) {
331 my $defaultlanguage = C4
::Context
->preference("UNIMARCField100Language");
332 $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
334 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?
(21,12):(36,25));
335 $string=$marc_record->subfield( 100, "a" );
336 if (defined $string && length($string)==$subflength) {
337 $string = substr $string, 0,$subflength if (length($string)>$subflength);
340 $string = POSIX
::strftime
( "%Y%m%d", localtime );
342 $string = sprintf( "%-*s", $subflength, $string );
343 substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
345 substr( $string, $encodingposition, 3, "y50" );
346 if ( $marc_record->subfield( 100, "a" ) ) {
347 $marc_record->field('100')->update(a
=>$string);
350 $marc_record->insert_grouped_field(
351 MARC
::Field
->new( 100, '', '', "a" => $string ) );
353 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
355 warn "Unrecognized marcflavour: $marc_flavour";
359 =head2 StripNonXmlChars
361 my $new_str = StripNonXmlChars($old_str);
363 Given a string, return a copy with the
364 characters that are illegal in XML
367 This function exists to work around a problem
368 that can occur with badly-encoded MARC records.
369 Specifically, if a UTF-8 MARC record also
370 has excape (\x1b) characters, MARC::File::XML
371 will let the escape characters pass through
372 when as_xml() or as_xml_record() is called. The
373 problem is that the escape character is not
374 legal in well-formed XML documents, so when
375 MARC::File::XML attempts to parse such a record,
376 the XML parser will fail.
378 Stripping such characters will allow a
379 MARC::Record->new_from_xml()
380 to work, at the possible risk of some data loss.
384 sub StripNonXmlChars
{
386 if (!defined($str) || $str eq ""){
389 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
403 Removes Non Sorting Block characters
407 my $NSB = '\x88' ; # NSB : begin Non Sorting Block
408 my $NSE = '\x89' ; # NSE : Non Sorting Block end
409 my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
410 my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
411 my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
413 # handles non sorting blocks
427 =head1 INTERNAL FUNCTIONS
429 =head2 _default_marc21_charconv_to_utf8
431 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
433 Converts a C<MARC::Record> of unknown character set to UTF-8,
434 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
435 to UTF-8, then a default conversion that replaces each non-ASCII
436 character with the replacement character.
438 The C<$guessed_charset> return value contains the character set
439 that resulted in a conversion to valid UTF-8; note that
440 if the MARC-8 and ISO-8859-1 conversions failed, the value of
445 sub _default_marc21_charconv_to_utf8
{
446 my $marc_record = shift;
447 my $marc_flavour = shift;
449 my $trial_marc8 = $marc_record->clone();
451 my @errors = _marc_marc8_to_utf8
($trial_marc8, $marc_flavour);
453 return $trial_marc8, 'MARC-8', [];
455 push @all_errors, @errors;
457 my $trial_8859_1 = $marc_record->clone();
458 @errors = _marc_to_utf8_via_text_iconv
($trial_8859_1, $marc_flavour, 'iso-8859-1');
460 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
461 # instead if we wanted to report details
462 # of the failed attempt at MARC-8 => UTF-8
464 push @all_errors, @errors;
466 my $default_converted = $marc_record->clone();
467 _marc_to_utf8_replacement_char
($default_converted, $marc_flavour);
468 return $default_converted, 'failed', \
@all_errors;
471 =head2 _default_unimarc_charconv_to_utf8
473 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
475 Converts a C<MARC::Record> of unknown character set to UTF-8,
476 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
477 to UTF-8, then a default conversion that replaces each non-ASCII
478 character with the replacement character.
480 The C<$guessed_charset> return value contains the character set
481 that resulted in a conversion to valid UTF-8; note that
482 if the MARC-8 and ISO-8859-1 conversions failed, the value of
487 sub _default_unimarc_charconv_to_utf8
{
488 my $marc_record = shift;
489 my $marc_flavour = shift;
491 my $trial_marc8 = $marc_record->clone();
493 my @errors = _marc_iso5426_to_utf8
($trial_marc8, $marc_flavour);
495 return $trial_marc8, 'iso-5426';
497 push @all_errors, @errors;
499 my $trial_8859_1 = $marc_record->clone();
500 @errors = _marc_to_utf8_via_text_iconv
($trial_8859_1, $marc_flavour, 'iso-8859-1');
502 return $trial_8859_1, 'iso-8859-1';
504 push @all_errors, @errors;
506 my $default_converted = $marc_record->clone();
507 _marc_to_utf8_replacement_char
($default_converted, $marc_flavour);
508 return $default_converted, 'failed', \
@all_errors;
511 =head2 _marc_marc8_to_utf8
513 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
515 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
516 If the conversion fails for some reason, an
517 appropriate messages will be placed in the returned
522 sub _marc_marc8_to_utf8
{
523 my $marc_record = shift;
524 my $marc_flavour = shift;
526 my $prev_ignore = MARC
::Charset
->ignore_errors();
527 MARC
::Charset
->ignore_errors(1);
529 # trap warnings raised by MARC::Charset
531 local $SIG{__WARN__
} = sub {
533 if ($msg =~ /MARC.Charset/) {
534 # FIXME - purpose of this regexp is to strip out the
535 # line reference to MARC/Charset.pm, but as it
536 # exists probably won't work quite on Windows --
537 # some sort of minimal-bunch back-tracking RE
538 # would be helpful here
539 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
542 # if warning doesn't come from MARC::Charset, just
548 foreach my $field ($marc_record->fields()) {
549 if ($field->is_control_field()) {
550 ; # do nothing -- control fields should not contain non-ASCII characters
552 my @converted_subfields;
553 foreach my $subfield ($field->subfields()) {
554 my $utf8sf = MARC
::Charset
::marc8_to_utf8
($subfield->[1]);
555 unless (IsStringUTF8ish
($utf8sf)) {
556 # Because of a bug in MARC::Charset 0.98, if the string
557 # has (a) one or more diacritics that (b) are only in character positions
558 # 128 to 255 inclusive, the resulting converted string is not in
559 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
560 # occurs, upgrade the string in place. Moral of the story seems to be
561 # that pack("U", ...) is better than chr(...) if you need to guarantee
562 # that the resulting string is UTF-8.
563 utf8
::upgrade
($utf8sf);
565 push @converted_subfields, $subfield->[0], $utf8sf;
568 $field->replace_with(MARC
::Field
->new(
569 $field->tag(), $field->indicator(1), $field->indicator(2),
570 @converted_subfields)
575 MARC
::Charset
->ignore_errors($prev_ignore);
577 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
582 =head2 _marc_iso5426_to_utf8
584 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
586 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
587 If the conversion fails for some reason, an
588 appropriate messages will be placed in the returned
591 FIXME - is ISO-5426 equivalent enough to MARC-8
592 that C<MARC::Charset> can be used instead?
596 sub _marc_iso5426_to_utf8
{
597 my $marc_record = shift;
598 my $marc_flavour = shift;
602 foreach my $field ($marc_record->fields()) {
603 if ($field->is_control_field()) {
604 ; # do nothing -- control fields should not contain non-ASCII characters
606 my @converted_subfields;
607 foreach my $subfield ($field->subfields()) {
608 my $utf8sf = char_decode5426
($subfield->[1]);
609 push @converted_subfields, $subfield->[0], $utf8sf;
612 $field->replace_with(MARC
::Field
->new(
613 $field->tag(), $field->indicator(1), $field->indicator(2),
614 @converted_subfields)
619 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
624 =head2 _marc_to_utf8_via_text_iconv
626 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
628 Convert a C<MARC::Record> to UTF-8 in-place using the
629 C<Text::Iconv> CPAN module. Any source encoding accepted
630 by the user's iconv installation should work. If
631 the source encoding is not recognized on the user's
632 server or the conversion fails for some reason,
633 appropriate messages will be placed in the returned
638 sub _marc_to_utf8_via_text_iconv
{
639 my $marc_record = shift;
640 my $marc_flavour = shift;
641 my $source_encoding = shift;
645 eval { $decoder = Text
::Iconv
->new($source_encoding, 'utf8'); };
647 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
651 my $prev_raise_error = Text
::Iconv
->raise_error();
652 Text
::Iconv
->raise_error(1);
654 foreach my $field ($marc_record->fields()) {
655 if ($field->is_control_field()) {
656 ; # do nothing -- control fields should not contain non-ASCII characters
658 my @converted_subfields;
659 foreach my $subfield ($field->subfields()) {
661 my $conversion_ok = 1;
662 eval { $converted_value = $decoder->convert($subfield->[1]); };
666 } elsif (not defined $converted_value) {
668 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
671 if ($conversion_ok) {
672 push @converted_subfields, $subfield->[0], $converted_value;
674 $converted_value = $subfield->[1];
675 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
676 push @converted_subfields, $subfield->[0], $converted_value;
680 $field->replace_with(MARC
::Field
->new(
681 $field->tag(), $field->indicator(1), $field->indicator(2),
682 @converted_subfields)
687 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
688 Text
::Iconv
->raise_error($prev_raise_error);
693 =head2 _marc_to_utf8_replacement_char
695 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
697 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
698 unsatisfactory method of replacing all non-ASCII (e.g.,
699 where the eight bit is set) octet with the Unicode
700 replacement character. This is meant as a last-ditch
701 method, and would be best used as part of a UI that
702 lets a cataloguer pick various character conversions
703 until he or she finds the right one.
707 sub _marc_to_utf8_replacement_char
{
708 my $marc_record = shift;
709 my $marc_flavour = shift;
711 foreach my $field ($marc_record->fields()) {
712 if ($field->is_control_field()) {
713 ; # do nothing -- control fields should not contain non-ASCII characters
715 my @converted_subfields;
716 foreach my $subfield ($field->subfields()) {
717 my $value = $subfield->[1];
718 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
719 push @converted_subfields, $subfield->[0], $value;
722 $field->replace_with(MARC
::Field
->new(
723 $field->tag(), $field->indicator(1), $field->indicator(2),
724 @converted_subfields)
729 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
732 =head2 char_decode5426
734 my $utf8string = char_decode5426($iso_5426_string);
736 Converts a string from ISO-5426 to UTF-8.
742 $chars{0xb0}=0x0101;#3/0ayn[ain]
743 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
744 #$chars{0xb2}=0x00e0;#'à';
745 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
746 #$chars{0xb3}=0x00e7;#'ç';
747 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
754 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
755 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
756 $chars{0xfa}=0x0153; #oe
757 $chars{0xea}=0x0152; #oe
758 $chars{0x81d1}=0x00b0;
761 ## combined characters iso5426
763 $chars{0xc041}=0x1ea2; # capital a with hook above
764 $chars{0xc045}=0x1eba; # capital e with hook above
765 $chars{0xc049}=0x1ec8; # capital i with hook above
766 $chars{0xc04f}=0x1ece; # capital o with hook above
767 $chars{0xc055}=0x1ee6; # capital u with hook above
768 $chars{0xc059}=0x1ef6; # capital y with hook above
769 $chars{0xc061}=0x1ea3; # small a with hook above
770 $chars{0xc065}=0x1ebb; # small e with hook above
771 $chars{0xc069}=0x1ec9; # small i with hook above
772 $chars{0xc06f}=0x1ecf; # small o with hook above
773 $chars{0xc075}=0x1ee7; # small u with hook above
774 $chars{0xc079}=0x1ef7; # small y with hook above
777 $chars{0xc141}=0x00c0; # capital a with grave accent
778 $chars{0xc145}=0x00c8; # capital e with grave accent
779 $chars{0xc149}=0x00cc; # capital i with grave accent
780 $chars{0xc14f}=0x00d2; # capital o with grave accent
781 $chars{0xc155}=0x00d9; # capital u with grave accent
782 $chars{0xc157}=0x1e80; # capital w with grave
783 $chars{0xc159}=0x1ef2; # capital y with grave
784 $chars{0xc161}=0x00e0; # small a with grave accent
785 $chars{0xc165}=0x00e8; # small e with grave accent
786 $chars{0xc169}=0x00ec; # small i with grave accent
787 $chars{0xc16f}=0x00f2; # small o with grave accent
788 $chars{0xc175}=0x00f9; # small u with grave accent
789 $chars{0xc177}=0x1e81; # small w with grave
790 $chars{0xc179}=0x1ef3; # small y with grave
792 $chars{0xc241}=0x00c1; # capital a with acute accent
793 $chars{0xc243}=0x0106; # capital c with acute accent
794 $chars{0xc245}=0x00c9; # capital e with acute accent
795 $chars{0xc247}=0x01f4; # capital g with acute
796 $chars{0xc249}=0x00cd; # capital i with acute accent
797 $chars{0xc24b}=0x1e30; # capital k with acute
798 $chars{0xc24c}=0x0139; # capital l with acute accent
799 $chars{0xc24d}=0x1e3e; # capital m with acute
800 $chars{0xc24e}=0x0143; # capital n with acute accent
801 $chars{0xc24f}=0x00d3; # capital o with acute accent
802 $chars{0xc250}=0x1e54; # capital p with acute
803 $chars{0xc252}=0x0154; # capital r with acute accent
804 $chars{0xc253}=0x015a; # capital s with acute accent
805 $chars{0xc255}=0x00da; # capital u with acute accent
806 $chars{0xc257}=0x1e82; # capital w with acute
807 $chars{0xc259}=0x00dd; # capital y with acute accent
808 $chars{0xc25a}=0x0179; # capital z with acute accent
809 $chars{0xc261}=0x00e1; # small a with acute accent
810 $chars{0xc263}=0x0107; # small c with acute accent
811 $chars{0xc265}=0x00e9; # small e with acute accent
812 $chars{0xc267}=0x01f5; # small g with acute
813 $chars{0xc269}=0x00ed; # small i with acute accent
814 $chars{0xc26b}=0x1e31; # small k with acute
815 $chars{0xc26c}=0x013a; # small l with acute accent
816 $chars{0xc26d}=0x1e3f; # small m with acute
817 $chars{0xc26e}=0x0144; # small n with acute accent
818 $chars{0xc26f}=0x00f3; # small o with acute accent
819 $chars{0xc270}=0x1e55; # small p with acute
820 $chars{0xc272}=0x0155; # small r with acute accent
821 $chars{0xc273}=0x015b; # small s with acute accent
822 $chars{0xc275}=0x00fa; # small u with acute accent
823 $chars{0xc277}=0x1e83; # small w with acute
824 $chars{0xc279}=0x00fd; # small y with acute accent
825 $chars{0xc27a}=0x017a; # small z with acute accent
826 $chars{0xc2e1}=0x01fc; # capital ae with acute
827 $chars{0xc2f1}=0x01fd; # small ae with acute
828 # 4/3 circumflex accent
829 $chars{0xc341}=0x00c2; # capital a with circumflex accent
830 $chars{0xc343}=0x0108; # capital c with circumflex
831 $chars{0xc345}=0x00ca; # capital e with circumflex accent
832 $chars{0xc347}=0x011c; # capital g with circumflex
833 $chars{0xc348}=0x0124; # capital h with circumflex
834 $chars{0xc349}=0x00ce; # capital i with circumflex accent
835 $chars{0xc34a}=0x0134; # capital j with circumflex
836 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
837 $chars{0xc353}=0x015c; # capital s with circumflex
838 $chars{0xc355}=0x00db; # capital u with circumflex
839 $chars{0xc357}=0x0174; # capital w with circumflex
840 $chars{0xc359}=0x0176; # capital y with circumflex
841 $chars{0xc35a}=0x1e90; # capital z with circumflex
842 $chars{0xc361}=0x00e2; # small a with circumflex accent
843 $chars{0xc363}=0x0109; # small c with circumflex
844 $chars{0xc365}=0x00ea; # small e with circumflex accent
845 $chars{0xc367}=0x011d; # small g with circumflex
846 $chars{0xc368}=0x0125; # small h with circumflex
847 $chars{0xc369}=0x00ee; # small i with circumflex accent
848 $chars{0xc36a}=0x0135; # small j with circumflex
849 $chars{0xc36e}=0x00f1; # small n with tilde
850 $chars{0xc36f}=0x00f4; # small o with circumflex accent
851 $chars{0xc373}=0x015d; # small s with circumflex
852 $chars{0xc375}=0x00fb; # small u with circumflex
853 $chars{0xc377}=0x0175; # small w with circumflex
854 $chars{0xc379}=0x0177; # small y with circumflex
855 $chars{0xc37a}=0x1e91; # small z with circumflex
857 $chars{0xc441}=0x00c3; # capital a with tilde
858 $chars{0xc445}=0x1ebc; # capital e with tilde
859 $chars{0xc449}=0x0128; # capital i with tilde
860 $chars{0xc44e}=0x00d1; # capital n with tilde
861 $chars{0xc44f}=0x00d5; # capital o with tilde
862 $chars{0xc455}=0x0168; # capital u with tilde
863 $chars{0xc456}=0x1e7c; # capital v with tilde
864 $chars{0xc459}=0x1ef8; # capital y with tilde
865 $chars{0xc461}=0x00e3; # small a with tilde
866 $chars{0xc465}=0x1ebd; # small e with tilde
867 $chars{0xc469}=0x0129; # small i with tilde
868 $chars{0xc46e}=0x00f1; # small n with tilde
869 $chars{0xc46f}=0x00f5; # small o with tilde
870 $chars{0xc475}=0x0169; # small u with tilde
871 $chars{0xc476}=0x1e7d; # small v with tilde
872 $chars{0xc479}=0x1ef9; # small y with tilde
874 $chars{0xc541}=0x0100; # capital a with macron
875 $chars{0xc545}=0x0112; # capital e with macron
876 $chars{0xc547}=0x1e20; # capital g with macron
877 $chars{0xc549}=0x012a; # capital i with macron
878 $chars{0xc54f}=0x014c; # capital o with macron
879 $chars{0xc555}=0x016a; # capital u with macron
880 $chars{0xc561}=0x0101; # small a with macron
881 $chars{0xc565}=0x0113; # small e with macron
882 $chars{0xc567}=0x1e21; # small g with macron
883 $chars{0xc569}=0x012b; # small i with macron
884 $chars{0xc56f}=0x014d; # small o with macron
885 $chars{0xc575}=0x016b; # small u with macron
886 $chars{0xc572}=0x0159; # small r with macron
887 $chars{0xc5e1}=0x01e2; # capital ae with macron
888 $chars{0xc5f1}=0x01e3; # small ae with macron
890 $chars{0xc641}=0x0102; # capital a with breve
891 $chars{0xc645}=0x0114; # capital e with breve
892 $chars{0xc647}=0x011e; # capital g with breve
893 $chars{0xc649}=0x012c; # capital i with breve
894 $chars{0xc64f}=0x014e; # capital o with breve
895 $chars{0xc655}=0x016c; # capital u with breve
896 $chars{0xc661}=0x0103; # small a with breve
897 $chars{0xc665}=0x0115; # small e with breve
898 $chars{0xc667}=0x011f; # small g with breve
899 $chars{0xc669}=0x012d; # small i with breve
900 $chars{0xc66f}=0x014f; # small o with breve
901 $chars{0xc675}=0x016d; # small u with breve
903 $chars{0xc7b0}=0x01e1; # Ain with dot above
904 $chars{0xc742}=0x1e02; # capital b with dot above
905 $chars{0xc743}=0x010a; # capital c with dot above
906 $chars{0xc744}=0x1e0a; # capital d with dot above
907 $chars{0xc745}=0x0116; # capital e with dot above
908 $chars{0xc746}=0x1e1e; # capital f with dot above
909 $chars{0xc747}=0x0120; # capital g with dot above
910 $chars{0xc748}=0x1e22; # capital h with dot above
911 $chars{0xc749}=0x0130; # capital i with dot above
912 $chars{0xc74d}=0x1e40; # capital m with dot above
913 $chars{0xc74e}=0x1e44; # capital n with dot above
914 $chars{0xc750}=0x1e56; # capital p with dot above
915 $chars{0xc752}=0x1e58; # capital r with dot above
916 $chars{0xc753}=0x1e60; # capital s with dot above
917 $chars{0xc754}=0x1e6a; # capital t with dot above
918 $chars{0xc757}=0x1e86; # capital w with dot above
919 $chars{0xc758}=0x1e8a; # capital x with dot above
920 $chars{0xc759}=0x1e8e; # capital y with dot above
921 $chars{0xc75a}=0x017b; # capital z with dot above
922 $chars{0xc761}=0x0227; # small b with dot above
923 $chars{0xc762}=0x1e03; # small b with dot above
924 $chars{0xc763}=0x010b; # small c with dot above
925 $chars{0xc764}=0x1e0b; # small d with dot above
926 $chars{0xc765}=0x0117; # small e with dot above
927 $chars{0xc766}=0x1e1f; # small f with dot above
928 $chars{0xc767}=0x0121; # small g with dot above
929 $chars{0xc768}=0x1e23; # small h with dot above
930 $chars{0xc76d}=0x1e41; # small m with dot above
931 $chars{0xc76e}=0x1e45; # small n with dot above
932 $chars{0xc770}=0x1e57; # small p with dot above
933 $chars{0xc772}=0x1e59; # small r with dot above
934 $chars{0xc773}=0x1e61; # small s with dot above
935 $chars{0xc774}=0x1e6b; # small t with dot above
936 $chars{0xc777}=0x1e87; # small w with dot above
937 $chars{0xc778}=0x1e8b; # small x with dot above
938 $chars{0xc779}=0x1e8f; # small y with dot above
939 $chars{0xc77a}=0x017c; # small z with dot above
940 # 4/8 trema, diaresis
941 $chars{0xc820}=0x00a8; # diaeresis
942 $chars{0xc841}=0x00c4; # capital a with diaeresis
943 $chars{0xc845}=0x00cb; # capital e with diaeresis
944 $chars{0xc848}=0x1e26; # capital h with diaeresis
945 $chars{0xc849}=0x00cf; # capital i with diaeresis
946 $chars{0xc84f}=0x00d6; # capital o with diaeresis
947 $chars{0xc855}=0x00dc; # capital u with diaeresis
948 $chars{0xc857}=0x1e84; # capital w with diaeresis
949 $chars{0xc858}=0x1e8c; # capital x with diaeresis
950 $chars{0xc859}=0x0178; # capital y with diaeresis
951 $chars{0xc861}=0x00e4; # small a with diaeresis
952 $chars{0xc865}=0x00eb; # small e with diaeresis
953 $chars{0xc868}=0x1e27; # small h with diaeresis
954 $chars{0xc869}=0x00ef; # small i with diaeresis
955 $chars{0xc86f}=0x00f6; # small o with diaeresis
956 $chars{0xc874}=0x1e97; # small t with diaeresis
957 $chars{0xc875}=0x00fc; # small u with diaeresis
958 $chars{0xc877}=0x1e85; # small w with diaeresis
959 $chars{0xc878}=0x1e8d; # small x with diaeresis
960 $chars{0xc879}=0x00ff; # small y with diaeresis
962 $chars{0xc920}=0x00a8; # [diaeresis]
963 $chars{0xc961}=0x00e4; # a with umlaut
964 $chars{0xc965}=0x00eb; # e with umlaut
965 $chars{0xc969}=0x00ef; # i with umlaut
966 $chars{0xc96f}=0x00f6; # o with umlaut
967 $chars{0xc975}=0x00fc; # u with umlaut
969 $chars{0xca41}=0x00c5; # capital a with ring above
970 $chars{0xcaad}=0x016e; # capital u with ring above
971 $chars{0xca61}=0x00e5; # small a with ring above
972 $chars{0xca75}=0x016f; # small u with ring above
973 $chars{0xca77}=0x1e98; # small w with ring above
974 $chars{0xca79}=0x1e99; # small y with ring above
975 # 4/11 high comma off centre
976 # 4/12 inverted high comma centred
977 # 4/13 double acute accent
978 $chars{0xcd4f}=0x0150; # capital o with double acute
979 $chars{0xcd55}=0x0170; # capital u with double acute
980 $chars{0xcd6f}=0x0151; # small o with double acute
981 $chars{0xcd75}=0x0171; # small u with double acute
983 $chars{0xce54}=0x01a0; # latin capital letter o with horn
984 $chars{0xce55}=0x01af; # latin capital letter u with horn
985 $chars{0xce74}=0x01a1; # latin small letter o with horn
986 $chars{0xce75}=0x01b0; # latin small letter u with horn
988 $chars{0xcf41}=0x01cd; # capital a with caron
989 $chars{0xcf43}=0x010c; # capital c with caron
990 $chars{0xcf44}=0x010e; # capital d with caron
991 $chars{0xcf45}=0x011a; # capital e with caron
992 $chars{0xcf47}=0x01e6; # capital g with caron
993 $chars{0xcf49}=0x01cf; # capital i with caron
994 $chars{0xcf4b}=0x01e8; # capital k with caron
995 $chars{0xcf4c}=0x013d; # capital l with caron
996 $chars{0xcf4e}=0x0147; # capital n with caron
997 $chars{0xcf4f}=0x01d1; # capital o with caron
998 $chars{0xcf52}=0x0158; # capital r with caron
999 $chars{0xcf53}=0x0160; # capital s with caron
1000 $chars{0xcf54}=0x0164; # capital t with caron
1001 $chars{0xcf55}=0x01d3; # capital u with caron
1002 $chars{0xcf5a}=0x017d; # capital z with caron
1003 $chars{0xcf61}=0x01ce; # small a with caron
1004 $chars{0xcf63}=0x010d; # small c with caron
1005 $chars{0xcf64}=0x010f; # small d with caron
1006 $chars{0xcf65}=0x011b; # small e with caron
1007 $chars{0xcf67}=0x01e7; # small g with caron
1008 $chars{0xcf69}=0x01d0; # small i with caron
1009 $chars{0xcf6a}=0x01f0; # small j with caron
1010 $chars{0xcf6b}=0x01e9; # small k with caron
1011 $chars{0xcf6c}=0x013e; # small l with caron
1012 $chars{0xcf6e}=0x0148; # small n with caron
1013 $chars{0xcf6f}=0x01d2; # small o with caron
1014 $chars{0xcf72}=0x0159; # small r with caron
1015 $chars{0xcf73}=0x0161; # small s with caron
1016 $chars{0xcf74}=0x0165; # small t with caron
1017 $chars{0xcf75}=0x01d4; # small u with caron
1018 $chars{0xcf7a}=0x017e; # small z with caron
1020 $chars{0xd020}=0x00b8; # cedilla
1021 $chars{0xd043}=0x00c7; # capital c with cedilla
1022 $chars{0xd044}=0x1e10; # capital d with cedilla
1023 $chars{0xd047}=0x0122; # capital g with cedilla
1024 $chars{0xd048}=0x1e28; # capital h with cedilla
1025 $chars{0xd04b}=0x0136; # capital k with cedilla
1026 $chars{0xd04c}=0x013b; # capital l with cedilla
1027 $chars{0xd04e}=0x0145; # capital n with cedilla
1028 $chars{0xd052}=0x0156; # capital r with cedilla
1029 $chars{0xd053}=0x015e; # capital s with cedilla
1030 $chars{0xd054}=0x0162; # capital t with cedilla
1031 $chars{0xd063}=0x00e7; # small c with cedilla
1032 $chars{0xd064}=0x1e11; # small d with cedilla
1033 $chars{0xd065}=0x0119; # small e with cedilla
1034 $chars{0xd067}=0x0123; # small g with cedilla
1035 $chars{0xd068}=0x1e29; # small h with cedilla
1036 $chars{0xd06b}=0x0137; # small k with cedilla
1037 $chars{0xd06c}=0x013c; # small l with cedilla
1038 $chars{0xd06e}=0x0146; # small n with cedilla
1039 $chars{0xd072}=0x0157; # small r with cedilla
1040 $chars{0xd073}=0x015f; # small s with cedilla
1041 $chars{0xd074}=0x0163; # small t with cedilla
1044 # 5/3 ogonek (hook to right
1045 $chars{0xd320}=0x02db; # ogonek
1046 $chars{0xd341}=0x0104; # capital a with ogonek
1047 $chars{0xd345}=0x0118; # capital e with ogonek
1048 $chars{0xd349}=0x012e; # capital i with ogonek
1049 $chars{0xd34f}=0x01ea; # capital o with ogonek
1050 $chars{0xd355}=0x0172; # capital u with ogonek
1051 $chars{0xd361}=0x0105; # small a with ogonek
1052 $chars{0xd365}=0x0119; # small e with ogonek
1053 $chars{0xd369}=0x012f; # small i with ogonek
1054 $chars{0xd36f}=0x01eb; # small o with ogonek
1055 $chars{0xd375}=0x0173; # small u with ogonek
1057 $chars{0xd441}=0x1e00; # capital a with ring below
1058 $chars{0xd461}=0x1e01; # small a with ring below
1059 # 5/5 half circle below
1060 $chars{0xf948}=0x1e2a; # capital h with breve below
1061 $chars{0xf968}=0x1e2b; # small h with breve below
1063 $chars{0xd641}=0x1ea0; # capital a with dot below
1064 $chars{0xd642}=0x1e04; # capital b with dot below
1065 $chars{0xd644}=0x1e0c; # capital d with dot below
1066 $chars{0xd645}=0x1eb8; # capital e with dot below
1067 $chars{0xd648}=0x1e24; # capital h with dot below
1068 $chars{0xd649}=0x1eca; # capital i with dot below
1069 $chars{0xd64b}=0x1e32; # capital k with dot below
1070 $chars{0xd64c}=0x1e36; # capital l with dot below
1071 $chars{0xd64d}=0x1e42; # capital m with dot below
1072 $chars{0xd64e}=0x1e46; # capital n with dot below
1073 $chars{0xd64f}=0x1ecc; # capital o with dot below
1074 $chars{0xd652}=0x1e5a; # capital r with dot below
1075 $chars{0xd653}=0x1e62; # capital s with dot below
1076 $chars{0xd654}=0x1e6c; # capital t with dot below
1077 $chars{0xd655}=0x1ee4; # capital u with dot below
1078 $chars{0xd656}=0x1e7e; # capital v with dot below
1079 $chars{0xd657}=0x1e88; # capital w with dot below
1080 $chars{0xd659}=0x1ef4; # capital y with dot below
1081 $chars{0xd65a}=0x1e92; # capital z with dot below
1082 $chars{0xd661}=0x1ea1; # small a with dot below
1083 $chars{0xd662}=0x1e05; # small b with dot below
1084 $chars{0xd664}=0x1e0d; # small d with dot below
1085 $chars{0xd665}=0x1eb9; # small e with dot below
1086 $chars{0xd668}=0x1e25; # small h with dot below
1087 $chars{0xd669}=0x1ecb; # small i with dot below
1088 $chars{0xd66b}=0x1e33; # small k with dot below
1089 $chars{0xd66c}=0x1e37; # small l with dot below
1090 $chars{0xd66d}=0x1e43; # small m with dot below
1091 $chars{0xd66e}=0x1e47; # small n with dot below
1092 $chars{0xd66f}=0x1ecd; # small o with dot below
1093 $chars{0xd672}=0x1e5b; # small r with dot below
1094 $chars{0xd673}=0x1e63; # small s with dot below
1095 $chars{0xd674}=0x1e6d; # small t with dot below
1096 $chars{0xd675}=0x1ee5; # small u with dot below
1097 $chars{0xd676}=0x1e7f; # small v with dot below
1098 $chars{0xd677}=0x1e89; # small w with dot below
1099 $chars{0xd679}=0x1ef5; # small y with dot below
1100 $chars{0xd67a}=0x1e93; # small z with dot below
1101 # 5/7 double dot below
1102 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1103 $chars{0xd775}=0x1e73; # small u with diaeresis below
1105 $chars{0xd820}=0x005f; # underline
1106 # 5/9 double underline
1107 $chars{0xd920}=0x2017; # double underline
1108 # 5/10 small low vertical bar
1109 $chars{0xda20}=0x02cc; #
1110 # 5/11 circumflex below
1111 # 5/12 (this position shall not be used)
1112 # 5/13 left half of ligature sign and of double tilde
1113 # 5/14 right half of ligature sign
1114 # 5/15 right half of double tilde
1115 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1117 sub char_decode5426
{
1121 my @data = unpack("C*", $string);
1123 my $length=scalar(@data);
1124 for (my $i = 0; $i < scalar(@data); $i++) {
1125 my $char= $data[$i];
1126 if ($char >= 0x00 && $char <= 0x7F){
1129 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1130 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1133 if ($chars{$char*256+$data[$i+1]}) {
1134 $convchar= $chars{$char * 256 + $data[$i+1]};
1136 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1137 } elsif ($chars{$char}) {
1138 $convchar= $chars{$char};
1139 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1143 push @characters,$convchar;
1146 if ($chars{$char}) {
1147 $convchar= $chars{$char};
1148 # printf "char %x, converted %x\n",$char,$chars{$char};
1150 # printf "char %x $char\n",$char;
1153 push @characters,$convchar;
1156 $result=pack "U*",@characters;
1157 # $result=~s/\x01//;
1158 # $result=~s/\x00//;
1162 $result=~s/\x1b\x5b//;
1163 # map{printf "%x",$_} @characters;
1173 Koha Development Team <http://koha-community.org/>
1175 Galen Charlton <galen.charlton@liblime.com>