3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with Koha; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 use MARC
::Charset qw
/marc8_to_utf8/;
26 use Unicode
::Normalize
;
28 use vars
qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
31 # set the version for version checking
32 $VERSION = 3.07.00.049;
48 C4::Charset - utilities for handling character set conversions.
56 This module contains routines for dealing with character set
57 conversions, particularly for MARC records.
59 A variety of character encodings are in use by various MARC
60 standards, and even more character encodings are used by
61 non-standard MARC records. The various MARC formats generally
62 do not do a good job of advertising a given record's character
63 encoding, and even when a record does advertise its encoding,
64 e.g., via the Leader/09, experience has shown that one cannot
67 Ultimately, all MARC records are stored in Koha in UTF-8 and
68 must be converted from whatever the source character encoding is.
69 The goal of this module is to ensure that these conversions
70 take place accurately. When a character conversion cannot take
71 place, or at least not accurately, the module was provide
72 enough information to allow user-facing code to inform the user
73 on how to deal with the situation.
79 =head2 IsStringUTF8ish
81 my $is_utf8 = IsStringUTF8ish($str);
83 Determines if C<$str> is valid UTF-8. This can mean
90 The Perl UTF-8 flag is set and the string contains valid UTF-8.
94 The Perl UTF-8 flag is B<not> set, but the octets contain
99 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
100 because in one could be presented with a MARC blob that is
101 not actually in UTF-8 but whose sequence of octets appears to be
102 valid UTF-8. The rest of the MARC character conversion functions
103 will assume that this situation occur does not very often.
107 sub IsStringUTF8ish
{
110 return 1 if utf8
::is_utf8
($str);
111 return utf8
::decode
($str);
116 my $marc_record = SetUTF8Flag($marc_record, $nfd);
118 This function sets the PERL UTF8 flag for data.
119 It is required when using new_from_usmarc
120 since MARC::File::USMARC does not handle PERL UTF8 setting.
121 When editing unicode marc records fields and subfields, you
122 would end up in double encoding without using this function.
124 If $nfd is set, string normalization will use NFD instead of NFC
127 In my opinion, this function belongs to MARC::Record and not
129 But since it handles charset, and MARC::Record, it finds its way in that package
134 my ($record, $nfd)=@_;
135 return unless ($record && $record->fields());
136 foreach my $field ($record->fields()){
137 if ($field->tag()>=10){
139 foreach my $subfield ($field->subfields()){
140 push @subfields,($$subfield[0],NormalizeString
($$subfield[1],$nfd));
142 my $newfield=MARC
::Field
->new(
144 $field->indicator(1),
145 $field->indicator(2),
148 $field->replace_with($newfield);
153 =head2 NormalizeString
155 my $normalized_string=NormalizeString($string,$nfd,$transform);
158 nfd : If you want to set NFD and not NFC
159 transform : If you expect all the signs to be removed
161 Sets the PERL UTF8 Flag on your initial data if need be
162 and applies cleaning if required
164 Returns a utf8 NFC normalized string
167 my $string=NormalizeString ("l'ornithoptère");
168 #results into ornithoptère in NFC form and sets UTF8 Flag
174 my ($string,$nfd,$transform)=@_;
175 utf8
::decode
($string) unless (utf8
::is_utf8
($string));
177 $string= NFD
($string);
180 $string=NFC
($string);
183 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g
;
184 #removing one letter words "d'" "l'" was changed into "d " "l "
185 $string=~s/\b\S\b//g;
191 =head2 MarcToUTF8Record
193 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
194 $marc_flavour, [, $source_encoding]);
196 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
197 optional source encoding, return a C<MARC::Record> that is
200 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
201 is not guaranteed to have been converted correctly. Specifically,
202 if C<$converted_from> is 'failed', the MARC record returned failed
203 character conversion and had each of its non-ASCII octets changed
204 to the Unicode replacement character.
206 If the source encoding was not specified, this routine will
207 try to guess it; the character encoding used for a successful
208 conversion is returned in C<$converted_from>.
212 sub MarcToUTF8Record
{
214 my $marc_flavour = shift;
215 my $source_encoding = shift;
217 my $marc_blob_is_utf8 = 0;
218 if (ref($marc) eq 'MARC::Record') {
219 my $marc_blob = $marc->as_usmarc();
220 $marc_blob_is_utf8 = IsStringUTF8ish
($marc_blob);
221 $marc_record = $marc;
223 # dealing with a MARC blob
225 # remove any ersatz whitespace from the beginning and
226 # end of the MARC blob -- these can creep into MARC
227 # files produced by several sources -- caller really
228 # should be doing this, however
231 $marc_blob_is_utf8 = IsStringUTF8ish
($marc);
233 $marc_record = MARC
::Record
->new_from_usmarc($marc);
236 # if we fail the first time, one likely problem
237 # is that we have a MARC21 record that says that it's
238 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
239 # We'll try parsing it again.
240 substr($marc, 9, 1) = ' ';
242 $marc_record = MARC
::Record
->new_from_usmarc($marc);
245 # it's hopeless; return an empty MARC::Record
246 return MARC
::Record
->new(), 'failed', ['could not parse MARC blob'];
251 # If we do not know the source encoding, try some guesses
253 # 1. Record is UTF-8 already.
254 # 2. If MARC flavor is MARC21 or NORMARC, then
255 # a. record is MARC-8
256 # b. record is ISO-8859-1
257 # 3. If MARC flavor is UNIMARC, then
258 if (not defined $source_encoding) {
259 if ($marc_blob_is_utf8) {
260 # note that for MARC21/NORMARC we are not bothering to check
261 # if the Leader/09 is set to 'a' or not -- because
262 # of problems with various ILSs (including Koha in the
263 # past, alas), this just is not trustworthy.
264 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
265 return $marc_record, 'UTF-8', [];
267 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
268 return _default_marc21_charconv_to_utf8
($marc_record, $marc_flavour);
269 } elsif ($marc_flavour =~/UNIMARC/) {
270 return _default_unimarc_charconv_to_utf8
($marc_record, $marc_flavour);
272 return _default_marc21_charconv_to_utf8
($marc_record, $marc_flavour);
276 # caller knows the character encoding
277 my $original_marc_record = $marc_record->clone();
279 if ($source_encoding =~ /utf-?8/i) {
280 if ($marc_blob_is_utf8) {
281 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
282 return $marc_record, 'UTF-8', [];
284 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
286 } elsif ($source_encoding =~ /marc-?8/i) {
287 @errors = _marc_marc8_to_utf8
($marc_record, $marc_flavour);
288 } elsif ($source_encoding =~ /5426/) {
289 @errors = _marc_iso5426_to_utf8
($marc_record, $marc_flavour);
291 # assume any other character encoding is for Text::Iconv
292 @errors = _marc_to_utf8_via_text_iconv
($marc_record, $marc_flavour, $source_encoding);
296 _marc_to_utf8_replacement_char
($original_marc_record, $marc_flavour);
297 return $original_marc_record, 'failed', \
@errors;
299 return $marc_record, $source_encoding, [];
305 =head2 SetMarcUnicodeFlag
307 SetMarcUnicodeFlag($marc_record, $marc_flavour);
309 Set both the internal MARC::Record encoding flag
310 and the appropriate Leader/09 (MARC21) or
311 100/26-29 (UNIMARC) to indicate that the record
312 is in UTF-8. Note that this does B<not> do
313 any actual character conversion.
317 sub SetMarcUnicodeFlag
{
318 my $marc_record = shift;
319 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
321 $marc_record->encoding('UTF-8');
322 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
323 my $leader = $marc_record->leader();
324 substr($leader, 9, 1) = 'a';
325 $marc_record->leader($leader);
326 } elsif ($marc_flavour =~/UNIMARC/) {
327 my $defaultlanguage = C4
::Context
->preference("UNIMARCField100Language");
328 $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
330 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?
(21,12):(36,25));
331 $string=$marc_record->subfield( 100, "a" );
332 if (defined $string && length($string)==$subflength) {
333 $string = substr $string, 0,$subflength if (length($string)>$subflength);
336 $string = POSIX
::strftime
( "%Y%m%d", localtime );
338 $string = sprintf( "%-*s", $subflength, $string );
339 substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
341 substr( $string, $encodingposition, 3, "y50" );
342 if ( $marc_record->subfield( 100, "a" ) ) {
343 $marc_record->field('100')->update(a
=>$string);
346 $marc_record->insert_grouped_field(
347 MARC
::Field
->new( 100, '', '', "a" => $string ) );
349 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
351 warn "Unrecognized marcflavour: $marc_flavour";
355 =head2 StripNonXmlChars
357 my $new_str = StripNonXmlChars($old_str);
359 Given a string, return a copy with the
360 characters that are illegal in XML
363 This function exists to work around a problem
364 that can occur with badly-encoded MARC records.
365 Specifically, if a UTF-8 MARC record also
366 has excape (\x1b) characters, MARC::File::XML
367 will let the escape characters pass through
368 when as_xml() or as_xml_record() is called. The
369 problem is that the escape character is not
370 legal in well-formed XML documents, so when
371 MARC::File::XML attempts to parse such a record,
372 the XML parser will fail.
374 Stripping such characters will allow a
375 MARC::Record->new_from_xml()
376 to work, at the possible risk of some data loss.
380 sub StripNonXmlChars
{
382 if (!defined($str) || $str eq ""){
385 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
399 Removes Non Sorting Block characters
403 my $NSB = '\x88' ; # NSB : begin Non Sorting Block
404 my $NSE = '\x89' ; # NSE : Non Sorting Block end
405 my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
406 my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
407 my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
409 # handles non sorting blocks
423 =head1 INTERNAL FUNCTIONS
425 =head2 _default_marc21_charconv_to_utf8
427 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
429 Converts a C<MARC::Record> of unknown character set to UTF-8,
430 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
431 to UTF-8, then a default conversion that replaces each non-ASCII
432 character with the replacement character.
434 The C<$guessed_charset> return value contains the character set
435 that resulted in a conversion to valid UTF-8; note that
436 if the MARC-8 and ISO-8859-1 conversions failed, the value of
441 sub _default_marc21_charconv_to_utf8
{
442 my $marc_record = shift;
443 my $marc_flavour = shift;
445 my $trial_marc8 = $marc_record->clone();
447 my @errors = _marc_marc8_to_utf8
($trial_marc8, $marc_flavour);
449 return $trial_marc8, 'MARC-8', [];
451 push @all_errors, @errors;
453 my $trial_8859_1 = $marc_record->clone();
454 @errors = _marc_to_utf8_via_text_iconv
($trial_8859_1, $marc_flavour, 'iso-8859-1');
456 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
457 # instead if we wanted to report details
458 # of the failed attempt at MARC-8 => UTF-8
460 push @all_errors, @errors;
462 my $default_converted = $marc_record->clone();
463 _marc_to_utf8_replacement_char
($default_converted, $marc_flavour);
464 return $default_converted, 'failed', \
@all_errors;
467 =head2 _default_unimarc_charconv_to_utf8
469 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
471 Converts a C<MARC::Record> of unknown character set to UTF-8,
472 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
473 to UTF-8, then a default conversion that replaces each non-ASCII
474 character with the replacement character.
476 The C<$guessed_charset> return value contains the character set
477 that resulted in a conversion to valid UTF-8; note that
478 if the MARC-8 and ISO-8859-1 conversions failed, the value of
483 sub _default_unimarc_charconv_to_utf8
{
484 my $marc_record = shift;
485 my $marc_flavour = shift;
487 my $trial_marc8 = $marc_record->clone();
489 my @errors = _marc_iso5426_to_utf8
($trial_marc8, $marc_flavour);
491 return $trial_marc8, 'iso-5426';
493 push @all_errors, @errors;
495 my $trial_8859_1 = $marc_record->clone();
496 @errors = _marc_to_utf8_via_text_iconv
($trial_8859_1, $marc_flavour, 'iso-8859-1');
498 return $trial_8859_1, 'iso-8859-1';
500 push @all_errors, @errors;
502 my $default_converted = $marc_record->clone();
503 _marc_to_utf8_replacement_char
($default_converted, $marc_flavour);
504 return $default_converted, 'failed', \
@all_errors;
507 =head2 _marc_marc8_to_utf8
509 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
511 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
512 If the conversion fails for some reason, an
513 appropriate messages will be placed in the returned
518 sub _marc_marc8_to_utf8
{
519 my $marc_record = shift;
520 my $marc_flavour = shift;
522 my $prev_ignore = MARC
::Charset
->ignore_errors();
523 MARC
::Charset
->ignore_errors(1);
525 # trap warnings raised by MARC::Charset
527 local $SIG{__WARN__
} = sub {
529 if ($msg =~ /MARC.Charset/) {
530 # FIXME - purpose of this regexp is to strip out the
531 # line reference to MARC/Charset.pm, but as it
532 # exists probably won't work quite on Windows --
533 # some sort of minimal-bunch back-tracking RE
534 # would be helpful here
535 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
538 # if warning doesn't come from MARC::Charset, just
544 foreach my $field ($marc_record->fields()) {
545 if ($field->is_control_field()) {
546 ; # do nothing -- control fields should not contain non-ASCII characters
548 my @converted_subfields;
549 foreach my $subfield ($field->subfields()) {
550 my $utf8sf = MARC
::Charset
::marc8_to_utf8
($subfield->[1]);
551 unless (IsStringUTF8ish
($utf8sf)) {
552 # Because of a bug in MARC::Charset 0.98, if the string
553 # has (a) one or more diacritics that (b) are only in character positions
554 # 128 to 255 inclusive, the resulting converted string is not in
555 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
556 # occurs, upgrade the string in place. Moral of the story seems to be
557 # that pack("U", ...) is better than chr(...) if you need to guarantee
558 # that the resulting string is UTF-8.
559 utf8
::upgrade
($utf8sf);
561 push @converted_subfields, $subfield->[0], $utf8sf;
564 $field->replace_with(MARC
::Field
->new(
565 $field->tag(), $field->indicator(1), $field->indicator(2),
566 @converted_subfields)
571 MARC
::Charset
->ignore_errors($prev_ignore);
573 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
578 =head2 _marc_iso5426_to_utf8
580 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
582 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
583 If the conversion fails for some reason, an
584 appropriate messages will be placed in the returned
587 FIXME - is ISO-5426 equivalent enough to MARC-8
588 that C<MARC::Charset> can be used instead?
592 sub _marc_iso5426_to_utf8
{
593 my $marc_record = shift;
594 my $marc_flavour = shift;
598 foreach my $field ($marc_record->fields()) {
599 if ($field->is_control_field()) {
600 ; # do nothing -- control fields should not contain non-ASCII characters
602 my @converted_subfields;
603 foreach my $subfield ($field->subfields()) {
604 my $utf8sf = char_decode5426
($subfield->[1]);
605 push @converted_subfields, $subfield->[0], $utf8sf;
608 $field->replace_with(MARC
::Field
->new(
609 $field->tag(), $field->indicator(1), $field->indicator(2),
610 @converted_subfields)
615 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
620 =head2 _marc_to_utf8_via_text_iconv
622 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
624 Convert a C<MARC::Record> to UTF-8 in-place using the
625 C<Text::Iconv> CPAN module. Any source encoding accepted
626 by the user's iconv installation should work. If
627 the source encoding is not recognized on the user's
628 server or the conversion fails for some reason,
629 appropriate messages will be placed in the returned
634 sub _marc_to_utf8_via_text_iconv
{
635 my $marc_record = shift;
636 my $marc_flavour = shift;
637 my $source_encoding = shift;
641 eval { $decoder = Text
::Iconv
->new($source_encoding, 'utf8'); };
643 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
647 my $prev_raise_error = Text
::Iconv
->raise_error();
648 Text
::Iconv
->raise_error(1);
650 foreach my $field ($marc_record->fields()) {
651 if ($field->is_control_field()) {
652 ; # do nothing -- control fields should not contain non-ASCII characters
654 my @converted_subfields;
655 foreach my $subfield ($field->subfields()) {
657 my $conversion_ok = 1;
658 eval { $converted_value = $decoder->convert($subfield->[1]); };
662 } elsif (not defined $converted_value) {
664 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
667 if ($conversion_ok) {
668 push @converted_subfields, $subfield->[0], $converted_value;
670 $converted_value = $subfield->[1];
671 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
672 push @converted_subfields, $subfield->[0], $converted_value;
676 $field->replace_with(MARC
::Field
->new(
677 $field->tag(), $field->indicator(1), $field->indicator(2),
678 @converted_subfields)
683 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
684 Text
::Iconv
->raise_error($prev_raise_error);
689 =head2 _marc_to_utf8_replacement_char
691 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
693 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
694 unsatisfactory method of replacing all non-ASCII (e.g.,
695 where the eight bit is set) octet with the Unicode
696 replacement character. This is meant as a last-ditch
697 method, and would be best used as part of a UI that
698 lets a cataloguer pick various character conversions
699 until he or she finds the right one.
703 sub _marc_to_utf8_replacement_char
{
704 my $marc_record = shift;
705 my $marc_flavour = shift;
707 foreach my $field ($marc_record->fields()) {
708 if ($field->is_control_field()) {
709 ; # do nothing -- control fields should not contain non-ASCII characters
711 my @converted_subfields;
712 foreach my $subfield ($field->subfields()) {
713 my $value = $subfield->[1];
714 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
715 push @converted_subfields, $subfield->[0], $value;
718 $field->replace_with(MARC
::Field
->new(
719 $field->tag(), $field->indicator(1), $field->indicator(2),
720 @converted_subfields)
725 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
728 =head2 char_decode5426
730 my $utf8string = char_decode5426($iso_5426_string);
732 Converts a string from ISO-5426 to UTF-8.
738 $chars{0xb0}=0x0101;#3/0ayn[ain]
739 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
740 #$chars{0xb2}=0x00e0;#'à';
741 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
742 #$chars{0xb3}=0x00e7;#'ç';
743 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
750 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
751 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
752 $chars{0xfa}=0x0153; #oe
753 $chars{0xea}=0x0152; #oe
754 $chars{0x81d1}=0x00b0;
757 ## combined characters iso5426
759 $chars{0xc041}=0x1ea2; # capital a with hook above
760 $chars{0xc045}=0x1eba; # capital e with hook above
761 $chars{0xc049}=0x1ec8; # capital i with hook above
762 $chars{0xc04f}=0x1ece; # capital o with hook above
763 $chars{0xc055}=0x1ee6; # capital u with hook above
764 $chars{0xc059}=0x1ef6; # capital y with hook above
765 $chars{0xc061}=0x1ea3; # small a with hook above
766 $chars{0xc065}=0x1ebb; # small e with hook above
767 $chars{0xc069}=0x1ec9; # small i with hook above
768 $chars{0xc06f}=0x1ecf; # small o with hook above
769 $chars{0xc075}=0x1ee7; # small u with hook above
770 $chars{0xc079}=0x1ef7; # small y with hook above
773 $chars{0xc141}=0x00c0; # capital a with grave accent
774 $chars{0xc145}=0x00c8; # capital e with grave accent
775 $chars{0xc149}=0x00cc; # capital i with grave accent
776 $chars{0xc14f}=0x00d2; # capital o with grave accent
777 $chars{0xc155}=0x00d9; # capital u with grave accent
778 $chars{0xc157}=0x1e80; # capital w with grave
779 $chars{0xc159}=0x1ef2; # capital y with grave
780 $chars{0xc161}=0x00e0; # small a with grave accent
781 $chars{0xc165}=0x00e8; # small e with grave accent
782 $chars{0xc169}=0x00ec; # small i with grave accent
783 $chars{0xc16f}=0x00f2; # small o with grave accent
784 $chars{0xc175}=0x00f9; # small u with grave accent
785 $chars{0xc177}=0x1e81; # small w with grave
786 $chars{0xc179}=0x1ef3; # small y with grave
788 $chars{0xc241}=0x00c1; # capital a with acute accent
789 $chars{0xc243}=0x0106; # capital c with acute accent
790 $chars{0xc245}=0x00c9; # capital e with acute accent
791 $chars{0xc247}=0x01f4; # capital g with acute
792 $chars{0xc249}=0x00cd; # capital i with acute accent
793 $chars{0xc24b}=0x1e30; # capital k with acute
794 $chars{0xc24c}=0x0139; # capital l with acute accent
795 $chars{0xc24d}=0x1e3e; # capital m with acute
796 $chars{0xc24e}=0x0143; # capital n with acute accent
797 $chars{0xc24f}=0x00d3; # capital o with acute accent
798 $chars{0xc250}=0x1e54; # capital p with acute
799 $chars{0xc252}=0x0154; # capital r with acute accent
800 $chars{0xc253}=0x015a; # capital s with acute accent
801 $chars{0xc255}=0x00da; # capital u with acute accent
802 $chars{0xc257}=0x1e82; # capital w with acute
803 $chars{0xc259}=0x00dd; # capital y with acute accent
804 $chars{0xc25a}=0x0179; # capital z with acute accent
805 $chars{0xc261}=0x00e1; # small a with acute accent
806 $chars{0xc263}=0x0107; # small c with acute accent
807 $chars{0xc265}=0x00e9; # small e with acute accent
808 $chars{0xc267}=0x01f5; # small g with acute
809 $chars{0xc269}=0x00ed; # small i with acute accent
810 $chars{0xc26b}=0x1e31; # small k with acute
811 $chars{0xc26c}=0x013a; # small l with acute accent
812 $chars{0xc26d}=0x1e3f; # small m with acute
813 $chars{0xc26e}=0x0144; # small n with acute accent
814 $chars{0xc26f}=0x00f3; # small o with acute accent
815 $chars{0xc270}=0x1e55; # small p with acute
816 $chars{0xc272}=0x0155; # small r with acute accent
817 $chars{0xc273}=0x015b; # small s with acute accent
818 $chars{0xc275}=0x00fa; # small u with acute accent
819 $chars{0xc277}=0x1e83; # small w with acute
820 $chars{0xc279}=0x00fd; # small y with acute accent
821 $chars{0xc27a}=0x017a; # small z with acute accent
822 $chars{0xc2e1}=0x01fc; # capital ae with acute
823 $chars{0xc2f1}=0x01fd; # small ae with acute
824 # 4/3 circumflex accent
825 $chars{0xc341}=0x00c2; # capital a with circumflex accent
826 $chars{0xc343}=0x0108; # capital c with circumflex
827 $chars{0xc345}=0x00ca; # capital e with circumflex accent
828 $chars{0xc347}=0x011c; # capital g with circumflex
829 $chars{0xc348}=0x0124; # capital h with circumflex
830 $chars{0xc349}=0x00ce; # capital i with circumflex accent
831 $chars{0xc34a}=0x0134; # capital j with circumflex
832 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
833 $chars{0xc353}=0x015c; # capital s with circumflex
834 $chars{0xc355}=0x00db; # capital u with circumflex
835 $chars{0xc357}=0x0174; # capital w with circumflex
836 $chars{0xc359}=0x0176; # capital y with circumflex
837 $chars{0xc35a}=0x1e90; # capital z with circumflex
838 $chars{0xc361}=0x00e2; # small a with circumflex accent
839 $chars{0xc363}=0x0109; # small c with circumflex
840 $chars{0xc365}=0x00ea; # small e with circumflex accent
841 $chars{0xc367}=0x011d; # small g with circumflex
842 $chars{0xc368}=0x0125; # small h with circumflex
843 $chars{0xc369}=0x00ee; # small i with circumflex accent
844 $chars{0xc36a}=0x0135; # small j with circumflex
845 $chars{0xc36e}=0x00f1; # small n with tilde
846 $chars{0xc36f}=0x00f4; # small o with circumflex accent
847 $chars{0xc373}=0x015d; # small s with circumflex
848 $chars{0xc375}=0x00fb; # small u with circumflex
849 $chars{0xc377}=0x0175; # small w with circumflex
850 $chars{0xc379}=0x0177; # small y with circumflex
851 $chars{0xc37a}=0x1e91; # small z with circumflex
853 $chars{0xc441}=0x00c3; # capital a with tilde
854 $chars{0xc445}=0x1ebc; # capital e with tilde
855 $chars{0xc449}=0x0128; # capital i with tilde
856 $chars{0xc44e}=0x00d1; # capital n with tilde
857 $chars{0xc44f}=0x00d5; # capital o with tilde
858 $chars{0xc455}=0x0168; # capital u with tilde
859 $chars{0xc456}=0x1e7c; # capital v with tilde
860 $chars{0xc459}=0x1ef8; # capital y with tilde
861 $chars{0xc461}=0x00e3; # small a with tilde
862 $chars{0xc465}=0x1ebd; # small e with tilde
863 $chars{0xc469}=0x0129; # small i with tilde
864 $chars{0xc46e}=0x00f1; # small n with tilde
865 $chars{0xc46f}=0x00f5; # small o with tilde
866 $chars{0xc475}=0x0169; # small u with tilde
867 $chars{0xc476}=0x1e7d; # small v with tilde
868 $chars{0xc479}=0x1ef9; # small y with tilde
870 $chars{0xc541}=0x0100; # capital a with macron
871 $chars{0xc545}=0x0112; # capital e with macron
872 $chars{0xc547}=0x1e20; # capital g with macron
873 $chars{0xc549}=0x012a; # capital i with macron
874 $chars{0xc54f}=0x014c; # capital o with macron
875 $chars{0xc555}=0x016a; # capital u with macron
876 $chars{0xc561}=0x0101; # small a with macron
877 $chars{0xc565}=0x0113; # small e with macron
878 $chars{0xc567}=0x1e21; # small g with macron
879 $chars{0xc569}=0x012b; # small i with macron
880 $chars{0xc56f}=0x014d; # small o with macron
881 $chars{0xc575}=0x016b; # small u with macron
882 $chars{0xc572}=0x0159; # small r with macron
883 $chars{0xc5e1}=0x01e2; # capital ae with macron
884 $chars{0xc5f1}=0x01e3; # small ae with macron
886 $chars{0xc641}=0x0102; # capital a with breve
887 $chars{0xc645}=0x0114; # capital e with breve
888 $chars{0xc647}=0x011e; # capital g with breve
889 $chars{0xc649}=0x012c; # capital i with breve
890 $chars{0xc64f}=0x014e; # capital o with breve
891 $chars{0xc655}=0x016c; # capital u with breve
892 $chars{0xc661}=0x0103; # small a with breve
893 $chars{0xc665}=0x0115; # small e with breve
894 $chars{0xc667}=0x011f; # small g with breve
895 $chars{0xc669}=0x012d; # small i with breve
896 $chars{0xc66f}=0x014f; # small o with breve
897 $chars{0xc675}=0x016d; # small u with breve
899 $chars{0xc7b0}=0x01e1; # Ain with dot above
900 $chars{0xc742}=0x1e02; # capital b with dot above
901 $chars{0xc743}=0x010a; # capital c with dot above
902 $chars{0xc744}=0x1e0a; # capital d with dot above
903 $chars{0xc745}=0x0116; # capital e with dot above
904 $chars{0xc746}=0x1e1e; # capital f with dot above
905 $chars{0xc747}=0x0120; # capital g with dot above
906 $chars{0xc748}=0x1e22; # capital h with dot above
907 $chars{0xc749}=0x0130; # capital i with dot above
908 $chars{0xc74d}=0x1e40; # capital m with dot above
909 $chars{0xc74e}=0x1e44; # capital n with dot above
910 $chars{0xc750}=0x1e56; # capital p with dot above
911 $chars{0xc752}=0x1e58; # capital r with dot above
912 $chars{0xc753}=0x1e60; # capital s with dot above
913 $chars{0xc754}=0x1e6a; # capital t with dot above
914 $chars{0xc757}=0x1e86; # capital w with dot above
915 $chars{0xc758}=0x1e8a; # capital x with dot above
916 $chars{0xc759}=0x1e8e; # capital y with dot above
917 $chars{0xc75a}=0x017b; # capital z with dot above
918 $chars{0xc761}=0x0227; # small b with dot above
919 $chars{0xc762}=0x1e03; # small b with dot above
920 $chars{0xc763}=0x010b; # small c with dot above
921 $chars{0xc764}=0x1e0b; # small d with dot above
922 $chars{0xc765}=0x0117; # small e with dot above
923 $chars{0xc766}=0x1e1f; # small f with dot above
924 $chars{0xc767}=0x0121; # small g with dot above
925 $chars{0xc768}=0x1e23; # small h with dot above
926 $chars{0xc76d}=0x1e41; # small m with dot above
927 $chars{0xc76e}=0x1e45; # small n with dot above
928 $chars{0xc770}=0x1e57; # small p with dot above
929 $chars{0xc772}=0x1e59; # small r with dot above
930 $chars{0xc773}=0x1e61; # small s with dot above
931 $chars{0xc774}=0x1e6b; # small t with dot above
932 $chars{0xc777}=0x1e87; # small w with dot above
933 $chars{0xc778}=0x1e8b; # small x with dot above
934 $chars{0xc779}=0x1e8f; # small y with dot above
935 $chars{0xc77a}=0x017c; # small z with dot above
936 # 4/8 trema, diaresis
937 $chars{0xc820}=0x00a8; # diaeresis
938 $chars{0xc841}=0x00c4; # capital a with diaeresis
939 $chars{0xc845}=0x00cb; # capital e with diaeresis
940 $chars{0xc848}=0x1e26; # capital h with diaeresis
941 $chars{0xc849}=0x00cf; # capital i with diaeresis
942 $chars{0xc84f}=0x00d6; # capital o with diaeresis
943 $chars{0xc855}=0x00dc; # capital u with diaeresis
944 $chars{0xc857}=0x1e84; # capital w with diaeresis
945 $chars{0xc858}=0x1e8c; # capital x with diaeresis
946 $chars{0xc859}=0x0178; # capital y with diaeresis
947 $chars{0xc861}=0x00e4; # small a with diaeresis
948 $chars{0xc865}=0x00eb; # small e with diaeresis
949 $chars{0xc868}=0x1e27; # small h with diaeresis
950 $chars{0xc869}=0x00ef; # small i with diaeresis
951 $chars{0xc86f}=0x00f6; # small o with diaeresis
952 $chars{0xc874}=0x1e97; # small t with diaeresis
953 $chars{0xc875}=0x00fc; # small u with diaeresis
954 $chars{0xc877}=0x1e85; # small w with diaeresis
955 $chars{0xc878}=0x1e8d; # small x with diaeresis
956 $chars{0xc879}=0x00ff; # small y with diaeresis
958 $chars{0xc920}=0x00a8; # [diaeresis]
959 $chars{0xc961}=0x00e4; # a with umlaut
960 $chars{0xc965}=0x00eb; # e with umlaut
961 $chars{0xc969}=0x00ef; # i with umlaut
962 $chars{0xc96f}=0x00f6; # o with umlaut
963 $chars{0xc975}=0x00fc; # u with umlaut
965 $chars{0xca41}=0x00c5; # capital a with ring above
966 $chars{0xcaad}=0x016e; # capital u with ring above
967 $chars{0xca61}=0x00e5; # small a with ring above
968 $chars{0xca75}=0x016f; # small u with ring above
969 $chars{0xca77}=0x1e98; # small w with ring above
970 $chars{0xca79}=0x1e99; # small y with ring above
971 # 4/11 high comma off centre
972 # 4/12 inverted high comma centred
973 # 4/13 double acute accent
974 $chars{0xcd4f}=0x0150; # capital o with double acute
975 $chars{0xcd55}=0x0170; # capital u with double acute
976 $chars{0xcd6f}=0x0151; # small o with double acute
977 $chars{0xcd75}=0x0171; # small u with double acute
979 $chars{0xce54}=0x01a0; # latin capital letter o with horn
980 $chars{0xce55}=0x01af; # latin capital letter u with horn
981 $chars{0xce74}=0x01a1; # latin small letter o with horn
982 $chars{0xce75}=0x01b0; # latin small letter u with horn
984 $chars{0xcf41}=0x01cd; # capital a with caron
985 $chars{0xcf43}=0x010c; # capital c with caron
986 $chars{0xcf44}=0x010e; # capital d with caron
987 $chars{0xcf45}=0x011a; # capital e with caron
988 $chars{0xcf47}=0x01e6; # capital g with caron
989 $chars{0xcf49}=0x01cf; # capital i with caron
990 $chars{0xcf4b}=0x01e8; # capital k with caron
991 $chars{0xcf4c}=0x013d; # capital l with caron
992 $chars{0xcf4e}=0x0147; # capital n with caron
993 $chars{0xcf4f}=0x01d1; # capital o with caron
994 $chars{0xcf52}=0x0158; # capital r with caron
995 $chars{0xcf53}=0x0160; # capital s with caron
996 $chars{0xcf54}=0x0164; # capital t with caron
997 $chars{0xcf55}=0x01d3; # capital u with caron
998 $chars{0xcf5a}=0x017d; # capital z with caron
999 $chars{0xcf61}=0x01ce; # small a with caron
1000 $chars{0xcf63}=0x010d; # small c with caron
1001 $chars{0xcf64}=0x010f; # small d with caron
1002 $chars{0xcf65}=0x011b; # small e with caron
1003 $chars{0xcf67}=0x01e7; # small g with caron
1004 $chars{0xcf69}=0x01d0; # small i with caron
1005 $chars{0xcf6a}=0x01f0; # small j with caron
1006 $chars{0xcf6b}=0x01e9; # small k with caron
1007 $chars{0xcf6c}=0x013e; # small l with caron
1008 $chars{0xcf6e}=0x0148; # small n with caron
1009 $chars{0xcf6f}=0x01d2; # small o with caron
1010 $chars{0xcf72}=0x0159; # small r with caron
1011 $chars{0xcf73}=0x0161; # small s with caron
1012 $chars{0xcf74}=0x0165; # small t with caron
1013 $chars{0xcf75}=0x01d4; # small u with caron
1014 $chars{0xcf7a}=0x017e; # small z with caron
1016 $chars{0xd020}=0x00b8; # cedilla
1017 $chars{0xd043}=0x00c7; # capital c with cedilla
1018 $chars{0xd044}=0x1e10; # capital d with cedilla
1019 $chars{0xd047}=0x0122; # capital g with cedilla
1020 $chars{0xd048}=0x1e28; # capital h with cedilla
1021 $chars{0xd04b}=0x0136; # capital k with cedilla
1022 $chars{0xd04c}=0x013b; # capital l with cedilla
1023 $chars{0xd04e}=0x0145; # capital n with cedilla
1024 $chars{0xd052}=0x0156; # capital r with cedilla
1025 $chars{0xd053}=0x015e; # capital s with cedilla
1026 $chars{0xd054}=0x0162; # capital t with cedilla
1027 $chars{0xd063}=0x00e7; # small c with cedilla
1028 $chars{0xd064}=0x1e11; # small d with cedilla
1029 $chars{0xd065}=0x0119; # small e with cedilla
1030 $chars{0xd067}=0x0123; # small g with cedilla
1031 $chars{0xd068}=0x1e29; # small h with cedilla
1032 $chars{0xd06b}=0x0137; # small k with cedilla
1033 $chars{0xd06c}=0x013c; # small l with cedilla
1034 $chars{0xd06e}=0x0146; # small n with cedilla
1035 $chars{0xd072}=0x0157; # small r with cedilla
1036 $chars{0xd073}=0x015f; # small s with cedilla
1037 $chars{0xd074}=0x0163; # small t with cedilla
1040 # 5/3 ogonek (hook to right
1041 $chars{0xd320}=0x02db; # ogonek
1042 $chars{0xd341}=0x0104; # capital a with ogonek
1043 $chars{0xd345}=0x0118; # capital e with ogonek
1044 $chars{0xd349}=0x012e; # capital i with ogonek
1045 $chars{0xd34f}=0x01ea; # capital o with ogonek
1046 $chars{0xd355}=0x0172; # capital u with ogonek
1047 $chars{0xd361}=0x0105; # small a with ogonek
1048 $chars{0xd365}=0x0119; # small e with ogonek
1049 $chars{0xd369}=0x012f; # small i with ogonek
1050 $chars{0xd36f}=0x01eb; # small o with ogonek
1051 $chars{0xd375}=0x0173; # small u with ogonek
1053 $chars{0xd441}=0x1e00; # capital a with ring below
1054 $chars{0xd461}=0x1e01; # small a with ring below
1055 # 5/5 half circle below
1056 $chars{0xf948}=0x1e2a; # capital h with breve below
1057 $chars{0xf968}=0x1e2b; # small h with breve below
1059 $chars{0xd641}=0x1ea0; # capital a with dot below
1060 $chars{0xd642}=0x1e04; # capital b with dot below
1061 $chars{0xd644}=0x1e0c; # capital d with dot below
1062 $chars{0xd645}=0x1eb8; # capital e with dot below
1063 $chars{0xd648}=0x1e24; # capital h with dot below
1064 $chars{0xd649}=0x1eca; # capital i with dot below
1065 $chars{0xd64b}=0x1e32; # capital k with dot below
1066 $chars{0xd64c}=0x1e36; # capital l with dot below
1067 $chars{0xd64d}=0x1e42; # capital m with dot below
1068 $chars{0xd64e}=0x1e46; # capital n with dot below
1069 $chars{0xd64f}=0x1ecc; # capital o with dot below
1070 $chars{0xd652}=0x1e5a; # capital r with dot below
1071 $chars{0xd653}=0x1e62; # capital s with dot below
1072 $chars{0xd654}=0x1e6c; # capital t with dot below
1073 $chars{0xd655}=0x1ee4; # capital u with dot below
1074 $chars{0xd656}=0x1e7e; # capital v with dot below
1075 $chars{0xd657}=0x1e88; # capital w with dot below
1076 $chars{0xd659}=0x1ef4; # capital y with dot below
1077 $chars{0xd65a}=0x1e92; # capital z with dot below
1078 $chars{0xd661}=0x1ea1; # small a with dot below
1079 $chars{0xd662}=0x1e05; # small b with dot below
1080 $chars{0xd664}=0x1e0d; # small d with dot below
1081 $chars{0xd665}=0x1eb9; # small e with dot below
1082 $chars{0xd668}=0x1e25; # small h with dot below
1083 $chars{0xd669}=0x1ecb; # small i with dot below
1084 $chars{0xd66b}=0x1e33; # small k with dot below
1085 $chars{0xd66c}=0x1e37; # small l with dot below
1086 $chars{0xd66d}=0x1e43; # small m with dot below
1087 $chars{0xd66e}=0x1e47; # small n with dot below
1088 $chars{0xd66f}=0x1ecd; # small o with dot below
1089 $chars{0xd672}=0x1e5b; # small r with dot below
1090 $chars{0xd673}=0x1e63; # small s with dot below
1091 $chars{0xd674}=0x1e6d; # small t with dot below
1092 $chars{0xd675}=0x1ee5; # small u with dot below
1093 $chars{0xd676}=0x1e7f; # small v with dot below
1094 $chars{0xd677}=0x1e89; # small w with dot below
1095 $chars{0xd679}=0x1ef5; # small y with dot below
1096 $chars{0xd67a}=0x1e93; # small z with dot below
1097 # 5/7 double dot below
1098 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1099 $chars{0xd775}=0x1e73; # small u with diaeresis below
1101 $chars{0xd820}=0x005f; # underline
1102 # 5/9 double underline
1103 $chars{0xd920}=0x2017; # double underline
1104 # 5/10 small low vertical bar
1105 $chars{0xda20}=0x02cc; #
1106 # 5/11 circumflex below
1107 # 5/12 (this position shall not be used)
1108 # 5/13 left half of ligature sign and of double tilde
1109 # 5/14 right half of ligature sign
1110 # 5/15 right half of double tilde
1111 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1113 sub char_decode5426
{
1117 my @data = unpack("C*", $string);
1119 my $length=scalar(@data);
1120 for (my $i = 0; $i < scalar(@data); $i++) {
1121 my $char= $data[$i];
1122 if ($char >= 0x00 && $char <= 0x7F){
1125 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1126 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1129 if ($chars{$char*256+$data[$i+1]}) {
1130 $convchar= $chars{$char * 256 + $data[$i+1]};
1132 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1133 } elsif ($chars{$char}) {
1134 $convchar= $chars{$char};
1135 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1139 push @characters,$convchar;
1142 if ($chars{$char}) {
1143 $convchar= $chars{$char};
1144 # printf "char %x, converted %x\n",$char,$chars{$char};
1146 # printf "char %x $char\n",$char;
1149 push @characters,$convchar;
1152 $result=pack "U*",@characters;
1153 # $result=~s/\x01//;
1154 # $result=~s/\x00//;
1158 $result=~s/\x1b\x5b//;
1159 # map{printf "%x",$_} @characters;
1169 Koha Development Team <http://koha-community.org/>
1171 Galen Charlton <galen.charlton@liblime.com>