3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 3 of the License, or
10 # (at your option) any later version.
12 # Koha is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Koha; if not, see <http://www.gnu.org/licenses>.
23 use MARC
::Charset qw
/marc8_to_utf8/;
26 use Unicode
::Normalize
;
27 use Encode
qw( decode encode is_utf8 );
29 use vars
qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
32 # set the version for version checking
33 $VERSION = 3.07.00.049;
52 C4::Charset - utilities for handling character set conversions.
60 This module contains routines for dealing with character set
61 conversions, particularly for MARC records.
63 A variety of character encodings are in use by various MARC
64 standards, and even more character encodings are used by
65 non-standard MARC records. The various MARC formats generally
66 do not do a good job of advertising a given record's character
67 encoding, and even when a record does advertise its encoding,
68 e.g., via the Leader/09, experience has shown that one cannot
71 Ultimately, all MARC records are stored in Koha in UTF-8 and
72 must be converted from whatever the source character encoding is.
73 The goal of this module is to ensure that these conversions
74 take place accurately. When a character conversion cannot take
75 place, or at least not accurately, the module was provide
76 enough information to allow user-facing code to inform the user
77 on how to deal with the situation.
83 =head2 IsStringUTF8ish
85 my $is_utf8 = IsStringUTF8ish($str);
87 Determines if C<$str> is valid UTF-8. This can mean
94 The Perl UTF-8 flag is set and the string contains valid UTF-8.
98 The Perl UTF-8 flag is B<not> set, but the octets contain
103 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
104 because in one could be presented with a MARC blob that is
105 not actually in UTF-8 but whose sequence of octets appears to be
106 valid UTF-8. The rest of the MARC character conversion functions
107 will assume that this situation occur does not very often.
111 sub IsStringUTF8ish
{
114 return 1 if Encode
::is_utf8
($str);
115 return utf8
::decode
( $str );
120 my $marc_record = SetUTF8Flag($marc_record, $nfd);
122 This function sets the PERL UTF8 flag for data.
123 It is required when using new_from_usmarc
124 since MARC::File::USMARC does not handle PERL UTF8 setting.
125 When editing unicode marc records fields and subfields, you
126 would end up in double encoding without using this function.
128 If $nfd is set, string normalization will use NFD instead of NFC
131 In my opinion, this function belongs to MARC::Record and not
133 But since it handles charset, and MARC::Record, it finds its way in that package
138 my ($record, $nfd)=@_;
139 return unless ($record && $record->fields());
140 foreach my $field ($record->fields()){
141 if ($field->tag()>=10){
143 foreach my $subfield ($field->subfields()){
144 push @subfields,($$subfield[0],NormalizeString
($$subfield[1],$nfd));
147 my $newfield=MARC
::Field
->new(
149 $field->indicator(1),
150 $field->indicator(2),
153 $field->replace_with($newfield);
155 warn "ERROR occurred in SetUTF8Flag $@" if $@
;
160 =head2 NormalizeString
162 my $normalized_string=NormalizeString($string,$nfd,$transform);
165 nfd : If you want to set NFD and not NFC
166 transform : If you expect all the signs to be removed
168 Sets the PERL UTF8 Flag on your initial data if need be
169 and applies cleaning if required
171 Returns a utf8 NFC normalized string
174 my $string=NormalizeString ("l'ornithoptère");
175 #results into ornithoptère in NFC form and sets UTF8 Flag
181 my ($string,$nfd,$transform)=@_;
182 return $string unless defined($string); # force scalar context return.
183 $string = Encode
::decode
('UTF-8', $string) unless (Encode
::is_utf8
($string));
185 $string= NFD
($string);
188 $string=NFC
($string);
191 $string=~s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g
;
192 #removing one letter words "d'" "l'" was changed into "d " "l "
193 $string=~s/\b\S\b//g;
199 =head2 MarcToUTF8Record
201 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
202 $marc_flavour, [, $source_encoding]);
204 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
205 optional source encoding, return a C<MARC::Record> that is
208 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
209 is not guaranteed to have been converted correctly. Specifically,
210 if C<$converted_from> is 'failed', the MARC record returned failed
211 character conversion and had each of its non-ASCII octets changed
212 to the Unicode replacement character.
214 If the source encoding was not specified, this routine will
215 try to guess it; the character encoding used for a successful
216 conversion is returned in C<$converted_from>.
220 sub MarcToUTF8Record
{
222 my $marc_flavour = shift;
223 my $source_encoding = shift;
225 my $marc_blob_is_utf8 = 0;
226 if (ref($marc) eq 'MARC::Record') {
227 my $marc_blob = $marc->as_usmarc();
228 $marc_blob_is_utf8 = IsStringUTF8ish
($marc_blob);
229 $marc_record = $marc;
231 # dealing with a MARC blob
233 # remove any ersatz whitespace from the beginning and
234 # end of the MARC blob -- these can creep into MARC
235 # files produced by several sources -- caller really
236 # should be doing this, however
239 $marc_blob_is_utf8 = IsStringUTF8ish
($marc);
241 $marc_record = MARC
::Record
->new_from_usmarc($marc);
244 # if we fail the first time, one likely problem
245 # is that we have a MARC21 record that says that it's
246 # UTF-8 (Leader/09 = 'a') but contains non-UTF-8 characters.
247 # We'll try parsing it again.
248 substr($marc, 9, 1) = ' ';
250 $marc_record = MARC
::Record
->new_from_usmarc($marc);
253 # it's hopeless; return an empty MARC::Record
254 return MARC
::Record
->new(), 'failed', ['could not parse MARC blob'];
259 # If we do not know the source encoding, try some guesses
261 # 1. Record is UTF-8 already.
262 # 2. If MARC flavor is MARC21 or NORMARC, then
263 # a. record is MARC-8
264 # b. record is ISO-8859-1
265 # 3. If MARC flavor is UNIMARC, then
266 if (not defined $source_encoding) {
267 if ($marc_blob_is_utf8) {
268 # note that for MARC21/NORMARC we are not bothering to check
269 # if the Leader/09 is set to 'a' or not -- because
270 # of problems with various ILSs (including Koha in the
271 # past, alas), this just is not trustworthy.
272 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
273 return $marc_record, 'UTF-8', [];
275 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
276 return _default_marc21_charconv_to_utf8
($marc_record, $marc_flavour);
277 } elsif ($marc_flavour =~/UNIMARC/) {
278 return _default_unimarc_charconv_to_utf8
($marc_record, $marc_flavour);
280 return _default_marc21_charconv_to_utf8
($marc_record, $marc_flavour);
284 # caller knows the character encoding
285 my $original_marc_record = $marc_record->clone();
287 if ($source_encoding =~ /utf-?8/i) {
288 if ($marc_blob_is_utf8) {
289 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
290 return $marc_record, 'UTF-8', [];
292 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
294 } elsif ($source_encoding =~ /marc-?8/i) {
295 @errors = _marc_marc8_to_utf8
($marc_record, $marc_flavour);
296 } elsif ($source_encoding =~ /5426/) {
297 @errors = _marc_iso5426_to_utf8
($marc_record, $marc_flavour);
299 # assume any other character encoding is for Text::Iconv
300 @errors = _marc_to_utf8_via_text_iconv
($marc_record, $marc_flavour, $source_encoding);
304 _marc_to_utf8_replacement_char
($original_marc_record, $marc_flavour);
305 return $original_marc_record, 'failed', \
@errors;
307 return $marc_record, $source_encoding, [];
313 =head2 SetMarcUnicodeFlag
315 SetMarcUnicodeFlag($marc_record, $marc_flavour);
317 Set both the internal MARC::Record encoding flag
318 and the appropriate Leader/09 (MARC21) or
319 100/26-29 (UNIMARC) to indicate that the record
320 is in UTF-8. Note that this does B<not> do
321 any actual character conversion.
325 sub SetMarcUnicodeFlag
{
326 my $marc_record = shift;
327 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
329 $marc_record->encoding('UTF-8');
330 if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
331 my $leader = $marc_record->leader();
332 substr($leader, 9, 1) = 'a';
333 $marc_record->leader($leader);
334 } elsif ($marc_flavour =~/UNIMARC/) {
336 my $defaultlanguage = C4
::Context
->preference("UNIMARCField100Language");
337 $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
339 my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?
(21,12):(36,25));
340 $string=$marc_record->subfield( 100, "a" );
341 if (defined $string && length($string)==$subflength) {
342 $string = substr $string, 0,$subflength if (length($string)>$subflength);
345 $string = POSIX
::strftime
( "%Y%m%d", localtime );
347 $string = sprintf( "%-*s", $subflength, $string );
348 substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
350 substr( $string, $encodingposition, 3, "y50" );
351 if ( $marc_record->subfield( 100, "a" ) ) {
352 $marc_record->field('100')->update(a
=>$string);
355 $marc_record->insert_grouped_field(
356 MARC
::Field
->new( 100, '', '', "a" => $string ) );
358 $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
360 warn "Unrecognized marcflavour: $marc_flavour";
364 =head2 StripNonXmlChars
366 my $new_str = StripNonXmlChars($old_str);
368 Given a string, return a copy with the
369 characters that are illegal in XML
372 This function exists to work around a problem
373 that can occur with badly-encoded MARC records.
374 Specifically, if a UTF-8 MARC record also
375 has excape (\x1b) characters, MARC::File::XML
376 will let the escape characters pass through
377 when as_xml() or as_xml_record() is called. The
378 problem is that the escape character is not
379 legal in well-formed XML documents, so when
380 MARC::File::XML attempts to parse such a record,
381 the XML parser will fail.
383 Stripping such characters will allow a
384 MARC::Record->new_from_xml()
385 to work, at the possible risk of some data loss.
389 sub StripNonXmlChars
{
391 if (!defined($str) || $str eq ""){
394 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
408 Removes Non Sorting Block characters
412 my $NSB = '\x88' ; # NSB : begin Non Sorting Block
413 my $NSE = '\x89' ; # NSE : Non Sorting Block end
414 my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block
415 my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end
416 my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE
418 # handles non sorting blocks
421 s/($C2){0,1}($NSB|$NSB2)//g ;
422 s/($C2){0,1}($NSE|$NSE2)//g ;
429 =head2 SanitizeRecord
431 SanitizeRecord($marcrecord);
434 This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
435 It cleans any string with '&amp;...', replacing it by '&'
440 my ( $record, $biblionumber ) = @_;
442 my $record_modified = 0;
443 my $frameworkcode = C4
::Biblio
::GetFrameworkCode
($biblionumber);
444 my ( $url_field, $url_subfield ) =
445 C4
::Biblio
::GetMarcFromKohaField
( 'biblioitems.url', $frameworkcode );
446 foreach my $field ( $record->fields() ) {
447 if ( $field->is_control_field() ) {
448 my $value = $field->data();
449 my $sanitized_value = _clean_ampersand
($value);
450 $record_modified = 1 if $sanitized_value ne $value;
451 $field->update($sanitized_value);
454 my @subfields = $field->subfields();
456 foreach my $subfield (@subfields) {
458 if $url_field eq $field->tag()
459 and $url_subfield eq $subfield->[0];
460 my $value = $subfield->[1];
461 my $sanitized_value = _clean_ampersand
($value);
462 push @new_subfields, $subfield->[0] => $sanitized_value;
463 $record_modified = 1 if $sanitized_value ne $value;
465 if ( scalar(@new_subfields) > 0 ) {
466 my $new_field = eval {
468 $field->tag(), $field->indicator(1),
469 $field->indicator(2), @new_subfields
476 $field->replace_with($new_field);
483 return $record, $record_modified;
486 sub _clean_ampersand
{
488 $string =~ s/(&)(amp;)+/$1/g;
492 =head1 INTERNAL FUNCTIONS
494 =head2 _default_marc21_charconv_to_utf8
496 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
498 Converts a C<MARC::Record> of unknown character set to UTF-8,
499 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
500 to UTF-8, then a default conversion that replaces each non-ASCII
501 character with the replacement character.
503 The C<$guessed_charset> return value contains the character set
504 that resulted in a conversion to valid UTF-8; note that
505 if the MARC-8 and ISO-8859-1 conversions failed, the value of
510 sub _default_marc21_charconv_to_utf8
{
511 my $marc_record = shift;
512 my $marc_flavour = shift;
514 my $trial_marc8 = $marc_record->clone();
516 my @errors = _marc_marc8_to_utf8
($trial_marc8, $marc_flavour);
518 return $trial_marc8, 'MARC-8', [];
520 push @all_errors, @errors;
522 my $trial_8859_1 = $marc_record->clone();
523 @errors = _marc_to_utf8_via_text_iconv
($trial_8859_1, $marc_flavour, 'iso-8859-1');
525 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
526 # instead if we wanted to report details
527 # of the failed attempt at MARC-8 => UTF-8
529 push @all_errors, @errors;
531 my $default_converted = $marc_record->clone();
532 _marc_to_utf8_replacement_char
($default_converted, $marc_flavour);
533 return $default_converted, 'failed', \
@all_errors;
536 =head2 _default_unimarc_charconv_to_utf8
538 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
540 Converts a C<MARC::Record> of unknown character set to UTF-8,
541 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
542 to UTF-8, then a default conversion that replaces each non-ASCII
543 character with the replacement character.
545 The C<$guessed_charset> return value contains the character set
546 that resulted in a conversion to valid UTF-8; note that
547 if the MARC-8 and ISO-8859-1 conversions failed, the value of
552 sub _default_unimarc_charconv_to_utf8
{
553 my $marc_record = shift;
554 my $marc_flavour = shift;
556 my $trial_marc8 = $marc_record->clone();
558 my @errors = _marc_iso5426_to_utf8
($trial_marc8, $marc_flavour);
560 return $trial_marc8, 'iso-5426';
562 push @all_errors, @errors;
564 my $trial_8859_1 = $marc_record->clone();
565 @errors = _marc_to_utf8_via_text_iconv
($trial_8859_1, $marc_flavour, 'iso-8859-1');
567 return $trial_8859_1, 'iso-8859-1';
569 push @all_errors, @errors;
571 my $default_converted = $marc_record->clone();
572 _marc_to_utf8_replacement_char
($default_converted, $marc_flavour);
573 return $default_converted, 'failed', \
@all_errors;
576 =head2 _marc_marc8_to_utf8
578 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
580 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
581 If the conversion fails for some reason, an
582 appropriate messages will be placed in the returned
587 sub _marc_marc8_to_utf8
{
588 my $marc_record = shift;
589 my $marc_flavour = shift;
591 my $prev_ignore = MARC
::Charset
->ignore_errors();
592 MARC
::Charset
->ignore_errors(1);
594 # trap warnings raised by MARC::Charset
596 local $SIG{__WARN__
} = sub {
598 if ($msg =~ /MARC.Charset/) {
599 # FIXME - purpose of this regexp is to strip out the
600 # line reference to MARC/Charset.pm, but as it
601 # exists probably won't work quite on Windows --
602 # some sort of minimal-bunch back-tracking RE
603 # would be helpful here
604 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
607 # if warning doesn't come from MARC::Charset, just
613 foreach my $field ($marc_record->fields()) {
614 if ($field->is_control_field()) {
615 ; # do nothing -- control fields should not contain non-ASCII characters
617 my @converted_subfields;
618 foreach my $subfield ($field->subfields()) {
619 my $utf8sf = MARC
::Charset
::marc8_to_utf8
($subfield->[1]);
620 unless (IsStringUTF8ish
($utf8sf)) {
621 # Because of a bug in MARC::Charset 0.98, if the string
622 # has (a) one or more diacritics that (b) are only in character positions
623 # 128 to 255 inclusive, the resulting converted string is not in
624 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
625 # occurs, upgrade the string in place. Moral of the story seems to be
626 # that pack("U", ...) is better than chr(...) if you need to guarantee
627 # that the resulting string is UTF-8.
628 $utf8sf = Encode
::encode
('UTF-8', $utf8sf);
630 push @converted_subfields, $subfield->[0], $utf8sf;
633 $field->replace_with(MARC
::Field
->new(
634 $field->tag(), $field->indicator(1), $field->indicator(2),
635 @converted_subfields)
640 MARC
::Charset
->ignore_errors($prev_ignore);
642 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
647 =head2 _marc_iso5426_to_utf8
649 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
651 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
652 If the conversion fails for some reason, an
653 appropriate messages will be placed in the returned
656 FIXME - is ISO-5426 equivalent enough to MARC-8
657 that C<MARC::Charset> can be used instead?
661 sub _marc_iso5426_to_utf8
{
662 my $marc_record = shift;
663 my $marc_flavour = shift;
667 foreach my $field ($marc_record->fields()) {
668 if ($field->is_control_field()) {
669 ; # do nothing -- control fields should not contain non-ASCII characters
671 my @converted_subfields;
672 foreach my $subfield ($field->subfields()) {
673 my $utf8sf = char_decode5426
($subfield->[1]);
674 push @converted_subfields, $subfield->[0], $utf8sf;
677 $field->replace_with(MARC
::Field
->new(
678 $field->tag(), $field->indicator(1), $field->indicator(2),
679 @converted_subfields)
684 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
689 =head2 _marc_to_utf8_via_text_iconv
691 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
693 Convert a C<MARC::Record> to UTF-8 in-place using the
694 C<Text::Iconv> CPAN module. Any source encoding accepted
695 by the user's iconv installation should work. If
696 the source encoding is not recognized on the user's
697 server or the conversion fails for some reason,
698 appropriate messages will be placed in the returned
703 sub _marc_to_utf8_via_text_iconv
{
704 my $marc_record = shift;
705 my $marc_flavour = shift;
706 my $source_encoding = shift;
710 eval { $decoder = Text
::Iconv
->new($source_encoding, 'utf8'); };
712 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
716 my $prev_raise_error = Text
::Iconv
->raise_error();
717 Text
::Iconv
->raise_error(1);
719 foreach my $field ($marc_record->fields()) {
720 if ($field->is_control_field()) {
721 ; # do nothing -- control fields should not contain non-ASCII characters
723 my @converted_subfields;
724 foreach my $subfield ($field->subfields()) {
726 my $conversion_ok = 1;
727 eval { $converted_value = $decoder->convert($subfield->[1]); };
731 } elsif (not defined $converted_value) {
733 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
736 if ($conversion_ok) {
737 push @converted_subfields, $subfield->[0], $converted_value;
739 $converted_value = $subfield->[1];
740 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
741 push @converted_subfields, $subfield->[0], $converted_value;
745 $field->replace_with(MARC
::Field
->new(
746 $field->tag(), $field->indicator(1), $field->indicator(2),
747 @converted_subfields)
752 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
753 Text
::Iconv
->raise_error($prev_raise_error);
758 =head2 _marc_to_utf8_replacement_char
760 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
762 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
763 unsatisfactory method of replacing all non-ASCII (e.g.,
764 where the eight bit is set) octet with the Unicode
765 replacement character. This is meant as a last-ditch
766 method, and would be best used as part of a UI that
767 lets a cataloguer pick various character conversions
768 until he or she finds the right one.
772 sub _marc_to_utf8_replacement_char
{
773 my $marc_record = shift;
774 my $marc_flavour = shift;
776 foreach my $field ($marc_record->fields()) {
777 if ($field->is_control_field()) {
778 ; # do nothing -- control fields should not contain non-ASCII characters
780 my @converted_subfields;
781 foreach my $subfield ($field->subfields()) {
782 my $value = $subfield->[1];
783 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
784 push @converted_subfields, $subfield->[0], $value;
787 $field->replace_with(MARC
::Field
->new(
788 $field->tag(), $field->indicator(1), $field->indicator(2),
789 @converted_subfields)
794 SetMarcUnicodeFlag
($marc_record, $marc_flavour);
797 =head2 char_decode5426
799 my $utf8string = char_decode5426($iso_5426_string);
801 Converts a string from ISO-5426 to UTF-8.
810 $chars{0xb0}=0x0101;#3/0ayn[ain]
811 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
812 #$chars{0xb2}=0x00e0;#'à';
813 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
814 #$chars{0xb3}=0x00e7;#'ç';
815 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
820 $chars{0xb6}=0x2021; # double dagger
821 $chars{0xb7}=0x00b7; # middle dot
822 $chars{0xb8}=0x2033; # double prime
823 $chars{0xb9}=0x2019; # right single quotation mark
824 $chars{0xba}=0x201d; # right double quotation mark
825 $chars{0xbb}=0x00bb; # right-pointing double angle quotation mark
826 $chars{0xbc}=0x266f; # music sharp sign
827 $chars{0xbd}=0x02b9; # modifier letter prime
828 $chars{0xbe}=0x02ba; # modifier letter double prime
829 $chars{0xbf}=0x00bf; # inverted question mark
833 $chars{0xe1}=0x00c6; # latin capital letter ae
834 $chars{0xe2}=0x0110; # latin capital letter d with stroke
835 $chars{0xe6}=0x0132; # latin capital ligature ij
836 $chars{0xe8}=0x0141; # latin capital letter l with stroke
837 $chars{0xe9}=0x00d8; # latin capital letter o with stroke
838 $chars{0xea}=0x0152; # latin capital ligature oe
839 $chars{0xec}=0x00de; # latin capital letter thorn
843 $chars{0xf1}=0x00e6; # latin small letter ae
844 $chars{0xf2}=0x0111; # latin small letter d with stroke
845 $chars{0xf3}=0x00f0; # latin small letter eth
846 $chars{0xf5}=0x0131; # latin small letter dotless i
847 $chars{0xf6}=0x0133; # latin small ligature ij
848 $chars{0xf8}=0x0142; # latin small letter l with stroke
849 $chars{0xf9}=0x00f8; # latin small letter o with stroke
850 $chars{0xfa}=0x0153; # latin small ligature oe
851 $chars{0xfb}=0x00df; # latin small letter sharp s
852 $chars{0xfc}=0x00fe; # latin small letter thorn
856 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
857 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
858 #$chars{0x81d1}=0x00b0; # FIXME useless
861 ## combined characters iso5426
863 $chars{0xc041}=0x1ea2; # capital a with hook above
864 $chars{0xc045}=0x1eba; # capital e with hook above
865 $chars{0xc049}=0x1ec8; # capital i with hook above
866 $chars{0xc04f}=0x1ece; # capital o with hook above
867 $chars{0xc055}=0x1ee6; # capital u with hook above
868 $chars{0xc059}=0x1ef6; # capital y with hook above
869 $chars{0xc061}=0x1ea3; # small a with hook above
870 $chars{0xc065}=0x1ebb; # small e with hook above
871 $chars{0xc069}=0x1ec9; # small i with hook above
872 $chars{0xc06f}=0x1ecf; # small o with hook above
873 $chars{0xc075}=0x1ee7; # small u with hook above
874 $chars{0xc079}=0x1ef7; # small y with hook above
877 $chars{0xc141}=0x00c0; # capital a with grave accent
878 $chars{0xc145}=0x00c8; # capital e with grave accent
879 $chars{0xc149}=0x00cc; # capital i with grave accent
880 $chars{0xc14f}=0x00d2; # capital o with grave accent
881 $chars{0xc155}=0x00d9; # capital u with grave accent
882 $chars{0xc157}=0x1e80; # capital w with grave
883 $chars{0xc159}=0x1ef2; # capital y with grave
884 $chars{0xc161}=0x00e0; # small a with grave accent
885 $chars{0xc165}=0x00e8; # small e with grave accent
886 $chars{0xc169}=0x00ec; # small i with grave accent
887 $chars{0xc16f}=0x00f2; # small o with grave accent
888 $chars{0xc175}=0x00f9; # small u with grave accent
889 $chars{0xc177}=0x1e81; # small w with grave
890 $chars{0xc179}=0x1ef3; # small y with grave
892 $chars{0xc241}=0x00c1; # capital a with acute accent
893 $chars{0xc243}=0x0106; # capital c with acute accent
894 $chars{0xc245}=0x00c9; # capital e with acute accent
895 $chars{0xc247}=0x01f4; # capital g with acute
896 $chars{0xc249}=0x00cd; # capital i with acute accent
897 $chars{0xc24b}=0x1e30; # capital k with acute
898 $chars{0xc24c}=0x0139; # capital l with acute accent
899 $chars{0xc24d}=0x1e3e; # capital m with acute
900 $chars{0xc24e}=0x0143; # capital n with acute accent
901 $chars{0xc24f}=0x00d3; # capital o with acute accent
902 $chars{0xc250}=0x1e54; # capital p with acute
903 $chars{0xc252}=0x0154; # capital r with acute accent
904 $chars{0xc253}=0x015a; # capital s with acute accent
905 $chars{0xc255}=0x00da; # capital u with acute accent
906 $chars{0xc257}=0x1e82; # capital w with acute
907 $chars{0xc259}=0x00dd; # capital y with acute accent
908 $chars{0xc25a}=0x0179; # capital z with acute accent
909 $chars{0xc261}=0x00e1; # small a with acute accent
910 $chars{0xc263}=0x0107; # small c with acute accent
911 $chars{0xc265}=0x00e9; # small e with acute accent
912 $chars{0xc267}=0x01f5; # small g with acute
913 $chars{0xc269}=0x00ed; # small i with acute accent
914 $chars{0xc26b}=0x1e31; # small k with acute
915 $chars{0xc26c}=0x013a; # small l with acute accent
916 $chars{0xc26d}=0x1e3f; # small m with acute
917 $chars{0xc26e}=0x0144; # small n with acute accent
918 $chars{0xc26f}=0x00f3; # small o with acute accent
919 $chars{0xc270}=0x1e55; # small p with acute
920 $chars{0xc272}=0x0155; # small r with acute accent
921 $chars{0xc273}=0x015b; # small s with acute accent
922 $chars{0xc275}=0x00fa; # small u with acute accent
923 $chars{0xc277}=0x1e83; # small w with acute
924 $chars{0xc279}=0x00fd; # small y with acute accent
925 $chars{0xc27a}=0x017a; # small z with acute accent
926 $chars{0xc2e1}=0x01fc; # capital ae with acute
927 $chars{0xc2f1}=0x01fd; # small ae with acute
928 # 4/3 circumflex accent
929 $chars{0xc341}=0x00c2; # capital a with circumflex accent
930 $chars{0xc343}=0x0108; # capital c with circumflex
931 $chars{0xc345}=0x00ca; # capital e with circumflex accent
932 $chars{0xc347}=0x011c; # capital g with circumflex
933 $chars{0xc348}=0x0124; # capital h with circumflex
934 $chars{0xc349}=0x00ce; # capital i with circumflex accent
935 $chars{0xc34a}=0x0134; # capital j with circumflex
936 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
937 $chars{0xc353}=0x015c; # capital s with circumflex
938 $chars{0xc355}=0x00db; # capital u with circumflex
939 $chars{0xc357}=0x0174; # capital w with circumflex
940 $chars{0xc359}=0x0176; # capital y with circumflex
941 $chars{0xc35a}=0x1e90; # capital z with circumflex
942 $chars{0xc361}=0x00e2; # small a with circumflex accent
943 $chars{0xc363}=0x0109; # small c with circumflex
944 $chars{0xc365}=0x00ea; # small e with circumflex accent
945 $chars{0xc367}=0x011d; # small g with circumflex
946 $chars{0xc368}=0x0125; # small h with circumflex
947 $chars{0xc369}=0x00ee; # small i with circumflex accent
948 $chars{0xc36a}=0x0135; # small j with circumflex
949 $chars{0xc36e}=0x00f1; # small n with tilde
950 $chars{0xc36f}=0x00f4; # small o with circumflex accent
951 $chars{0xc373}=0x015d; # small s with circumflex
952 $chars{0xc375}=0x00fb; # small u with circumflex
953 $chars{0xc377}=0x0175; # small w with circumflex
954 $chars{0xc379}=0x0177; # small y with circumflex
955 $chars{0xc37a}=0x1e91; # small z with circumflex
957 $chars{0xc441}=0x00c3; # capital a with tilde
958 $chars{0xc445}=0x1ebc; # capital e with tilde
959 $chars{0xc449}=0x0128; # capital i with tilde
960 $chars{0xc44e}=0x00d1; # capital n with tilde
961 $chars{0xc44f}=0x00d5; # capital o with tilde
962 $chars{0xc455}=0x0168; # capital u with tilde
963 $chars{0xc456}=0x1e7c; # capital v with tilde
964 $chars{0xc459}=0x1ef8; # capital y with tilde
965 $chars{0xc461}=0x00e3; # small a with tilde
966 $chars{0xc465}=0x1ebd; # small e with tilde
967 $chars{0xc469}=0x0129; # small i with tilde
968 $chars{0xc46e}=0x00f1; # small n with tilde
969 $chars{0xc46f}=0x00f5; # small o with tilde
970 $chars{0xc475}=0x0169; # small u with tilde
971 $chars{0xc476}=0x1e7d; # small v with tilde
972 $chars{0xc479}=0x1ef9; # small y with tilde
974 $chars{0xc541}=0x0100; # capital a with macron
975 $chars{0xc545}=0x0112; # capital e with macron
976 $chars{0xc547}=0x1e20; # capital g with macron
977 $chars{0xc549}=0x012a; # capital i with macron
978 $chars{0xc54f}=0x014c; # capital o with macron
979 $chars{0xc555}=0x016a; # capital u with macron
980 $chars{0xc561}=0x0101; # small a with macron
981 $chars{0xc565}=0x0113; # small e with macron
982 $chars{0xc567}=0x1e21; # small g with macron
983 $chars{0xc569}=0x012b; # small i with macron
984 $chars{0xc56f}=0x014d; # small o with macron
985 $chars{0xc575}=0x016b; # small u with macron
986 $chars{0xc572}=0x0159; # small r with macron
987 $chars{0xc5e1}=0x01e2; # capital ae with macron
988 $chars{0xc5f1}=0x01e3; # small ae with macron
990 $chars{0xc641}=0x0102; # capital a with breve
991 $chars{0xc645}=0x0114; # capital e with breve
992 $chars{0xc647}=0x011e; # capital g with breve
993 $chars{0xc649}=0x012c; # capital i with breve
994 $chars{0xc64f}=0x014e; # capital o with breve
995 $chars{0xc655}=0x016c; # capital u with breve
996 $chars{0xc661}=0x0103; # small a with breve
997 $chars{0xc665}=0x0115; # small e with breve
998 $chars{0xc667}=0x011f; # small g with breve
999 $chars{0xc669}=0x012d; # small i with breve
1000 $chars{0xc66f}=0x014f; # small o with breve
1001 $chars{0xc675}=0x016d; # small u with breve
1003 $chars{0xc7b0}=0x01e1; # Ain with dot above
1004 $chars{0xc742}=0x1e02; # capital b with dot above
1005 $chars{0xc743}=0x010a; # capital c with dot above
1006 $chars{0xc744}=0x1e0a; # capital d with dot above
1007 $chars{0xc745}=0x0116; # capital e with dot above
1008 $chars{0xc746}=0x1e1e; # capital f with dot above
1009 $chars{0xc747}=0x0120; # capital g with dot above
1010 $chars{0xc748}=0x1e22; # capital h with dot above
1011 $chars{0xc749}=0x0130; # capital i with dot above
1012 $chars{0xc74d}=0x1e40; # capital m with dot above
1013 $chars{0xc74e}=0x1e44; # capital n with dot above
1014 $chars{0xc750}=0x1e56; # capital p with dot above
1015 $chars{0xc752}=0x1e58; # capital r with dot above
1016 $chars{0xc753}=0x1e60; # capital s with dot above
1017 $chars{0xc754}=0x1e6a; # capital t with dot above
1018 $chars{0xc757}=0x1e86; # capital w with dot above
1019 $chars{0xc758}=0x1e8a; # capital x with dot above
1020 $chars{0xc759}=0x1e8e; # capital y with dot above
1021 $chars{0xc75a}=0x017b; # capital z with dot above
1022 $chars{0xc761}=0x0227; # small b with dot above
1023 $chars{0xc762}=0x1e03; # small b with dot above
1024 $chars{0xc763}=0x010b; # small c with dot above
1025 $chars{0xc764}=0x1e0b; # small d with dot above
1026 $chars{0xc765}=0x0117; # small e with dot above
1027 $chars{0xc766}=0x1e1f; # small f with dot above
1028 $chars{0xc767}=0x0121; # small g with dot above
1029 $chars{0xc768}=0x1e23; # small h with dot above
1030 $chars{0xc76d}=0x1e41; # small m with dot above
1031 $chars{0xc76e}=0x1e45; # small n with dot above
1032 $chars{0xc770}=0x1e57; # small p with dot above
1033 $chars{0xc772}=0x1e59; # small r with dot above
1034 $chars{0xc773}=0x1e61; # small s with dot above
1035 $chars{0xc774}=0x1e6b; # small t with dot above
1036 $chars{0xc777}=0x1e87; # small w with dot above
1037 $chars{0xc778}=0x1e8b; # small x with dot above
1038 $chars{0xc779}=0x1e8f; # small y with dot above
1039 $chars{0xc77a}=0x017c; # small z with dot above
1040 # 4/8 trema, diaresis
1041 $chars{0xc820}=0x00a8; # diaeresis
1042 $chars{0xc841}=0x00c4; # capital a with diaeresis
1043 $chars{0xc845}=0x00cb; # capital e with diaeresis
1044 $chars{0xc848}=0x1e26; # capital h with diaeresis
1045 $chars{0xc849}=0x00cf; # capital i with diaeresis
1046 $chars{0xc84f}=0x00d6; # capital o with diaeresis
1047 $chars{0xc855}=0x00dc; # capital u with diaeresis
1048 $chars{0xc857}=0x1e84; # capital w with diaeresis
1049 $chars{0xc858}=0x1e8c; # capital x with diaeresis
1050 $chars{0xc859}=0x0178; # capital y with diaeresis
1051 $chars{0xc861}=0x00e4; # small a with diaeresis
1052 $chars{0xc865}=0x00eb; # small e with diaeresis
1053 $chars{0xc868}=0x1e27; # small h with diaeresis
1054 $chars{0xc869}=0x00ef; # small i with diaeresis
1055 $chars{0xc86f}=0x00f6; # small o with diaeresis
1056 $chars{0xc874}=0x1e97; # small t with diaeresis
1057 $chars{0xc875}=0x00fc; # small u with diaeresis
1058 $chars{0xc877}=0x1e85; # small w with diaeresis
1059 $chars{0xc878}=0x1e8d; # small x with diaeresis
1060 $chars{0xc879}=0x00ff; # small y with diaeresis
1062 $chars{0xc920}=0x00a8; # [diaeresis]
1063 $chars{0xc961}=0x00e4; # a with umlaut
1064 $chars{0xc965}=0x00eb; # e with umlaut
1065 $chars{0xc969}=0x00ef; # i with umlaut
1066 $chars{0xc96f}=0x00f6; # o with umlaut
1067 $chars{0xc975}=0x00fc; # u with umlaut
1069 $chars{0xca41}=0x00c5; # capital a with ring above
1070 $chars{0xcaad}=0x016e; # capital u with ring above
1071 $chars{0xca61}=0x00e5; # small a with ring above
1072 $chars{0xca75}=0x016f; # small u with ring above
1073 $chars{0xca77}=0x1e98; # small w with ring above
1074 $chars{0xca79}=0x1e99; # small y with ring above
1075 # 4/11 high comma off centre
1076 # 4/12 inverted high comma centred
1077 # 4/13 double acute accent
1078 $chars{0xcd4f}=0x0150; # capital o with double acute
1079 $chars{0xcd55}=0x0170; # capital u with double acute
1080 $chars{0xcd6f}=0x0151; # small o with double acute
1081 $chars{0xcd75}=0x0171; # small u with double acute
1083 $chars{0xce54}=0x01a0; # latin capital letter o with horn
1084 $chars{0xce55}=0x01af; # latin capital letter u with horn
1085 $chars{0xce74}=0x01a1; # latin small letter o with horn
1086 $chars{0xce75}=0x01b0; # latin small letter u with horn
1088 $chars{0xcf41}=0x01cd; # capital a with caron
1089 $chars{0xcf43}=0x010c; # capital c with caron
1090 $chars{0xcf44}=0x010e; # capital d with caron
1091 $chars{0xcf45}=0x011a; # capital e with caron
1092 $chars{0xcf47}=0x01e6; # capital g with caron
1093 $chars{0xcf49}=0x01cf; # capital i with caron
1094 $chars{0xcf4b}=0x01e8; # capital k with caron
1095 $chars{0xcf4c}=0x013d; # capital l with caron
1096 $chars{0xcf4e}=0x0147; # capital n with caron
1097 $chars{0xcf4f}=0x01d1; # capital o with caron
1098 $chars{0xcf52}=0x0158; # capital r with caron
1099 $chars{0xcf53}=0x0160; # capital s with caron
1100 $chars{0xcf54}=0x0164; # capital t with caron
1101 $chars{0xcf55}=0x01d3; # capital u with caron
1102 $chars{0xcf5a}=0x017d; # capital z with caron
1103 $chars{0xcf61}=0x01ce; # small a with caron
1104 $chars{0xcf63}=0x010d; # small c with caron
1105 $chars{0xcf64}=0x010f; # small d with caron
1106 $chars{0xcf65}=0x011b; # small e with caron
1107 $chars{0xcf67}=0x01e7; # small g with caron
1108 $chars{0xcf69}=0x01d0; # small i with caron
1109 $chars{0xcf6a}=0x01f0; # small j with caron
1110 $chars{0xcf6b}=0x01e9; # small k with caron
1111 $chars{0xcf6c}=0x013e; # small l with caron
1112 $chars{0xcf6e}=0x0148; # small n with caron
1113 $chars{0xcf6f}=0x01d2; # small o with caron
1114 $chars{0xcf72}=0x0159; # small r with caron
1115 $chars{0xcf73}=0x0161; # small s with caron
1116 $chars{0xcf74}=0x0165; # small t with caron
1117 $chars{0xcf75}=0x01d4; # small u with caron
1118 $chars{0xcf7a}=0x017e; # small z with caron
1120 $chars{0xd020}=0x00b8; # cedilla
1121 $chars{0xd043}=0x00c7; # capital c with cedilla
1122 $chars{0xd044}=0x1e10; # capital d with cedilla
1123 $chars{0xd047}=0x0122; # capital g with cedilla
1124 $chars{0xd048}=0x1e28; # capital h with cedilla
1125 $chars{0xd04b}=0x0136; # capital k with cedilla
1126 $chars{0xd04c}=0x013b; # capital l with cedilla
1127 $chars{0xd04e}=0x0145; # capital n with cedilla
1128 $chars{0xd052}=0x0156; # capital r with cedilla
1129 $chars{0xd053}=0x015e; # capital s with cedilla
1130 $chars{0xd054}=0x0162; # capital t with cedilla
1131 $chars{0xd063}=0x00e7; # small c with cedilla
1132 $chars{0xd064}=0x1e11; # small d with cedilla
1133 $chars{0xd065}=0x0119; # small e with cedilla
1134 $chars{0xd067}=0x0123; # small g with cedilla
1135 $chars{0xd068}=0x1e29; # small h with cedilla
1136 $chars{0xd06b}=0x0137; # small k with cedilla
1137 $chars{0xd06c}=0x013c; # small l with cedilla
1138 $chars{0xd06e}=0x0146; # small n with cedilla
1139 $chars{0xd072}=0x0157; # small r with cedilla
1140 $chars{0xd073}=0x015f; # small s with cedilla
1141 $chars{0xd074}=0x0163; # small t with cedilla
1144 # 5/3 ogonek (hook to right
1145 $chars{0xd320}=0x02db; # ogonek
1146 $chars{0xd341}=0x0104; # capital a with ogonek
1147 $chars{0xd345}=0x0118; # capital e with ogonek
1148 $chars{0xd349}=0x012e; # capital i with ogonek
1149 $chars{0xd34f}=0x01ea; # capital o with ogonek
1150 $chars{0xd355}=0x0172; # capital u with ogonek
1151 $chars{0xd361}=0x0105; # small a with ogonek
1152 $chars{0xd365}=0x0119; # small e with ogonek
1153 $chars{0xd369}=0x012f; # small i with ogonek
1154 $chars{0xd36f}=0x01eb; # small o with ogonek
1155 $chars{0xd375}=0x0173; # small u with ogonek
1157 $chars{0xd441}=0x1e00; # capital a with ring below
1158 $chars{0xd461}=0x1e01; # small a with ring below
1159 # 5/5 half circle below
1160 $chars{0xd548}=0x1e2a; # capital h with breve below
1161 $chars{0xd568}=0x1e2b; # small h with breve below
1163 $chars{0xd641}=0x1ea0; # capital a with dot below
1164 $chars{0xd642}=0x1e04; # capital b with dot below
1165 $chars{0xd644}=0x1e0c; # capital d with dot below
1166 $chars{0xd645}=0x1eb8; # capital e with dot below
1167 $chars{0xd648}=0x1e24; # capital h with dot below
1168 $chars{0xd649}=0x1eca; # capital i with dot below
1169 $chars{0xd64b}=0x1e32; # capital k with dot below
1170 $chars{0xd64c}=0x1e36; # capital l with dot below
1171 $chars{0xd64d}=0x1e42; # capital m with dot below
1172 $chars{0xd64e}=0x1e46; # capital n with dot below
1173 $chars{0xd64f}=0x1ecc; # capital o with dot below
1174 $chars{0xd652}=0x1e5a; # capital r with dot below
1175 $chars{0xd653}=0x1e62; # capital s with dot below
1176 $chars{0xd654}=0x1e6c; # capital t with dot below
1177 $chars{0xd655}=0x1ee4; # capital u with dot below
1178 $chars{0xd656}=0x1e7e; # capital v with dot below
1179 $chars{0xd657}=0x1e88; # capital w with dot below
1180 $chars{0xd659}=0x1ef4; # capital y with dot below
1181 $chars{0xd65a}=0x1e92; # capital z with dot below
1182 $chars{0xd661}=0x1ea1; # small a with dot below
1183 $chars{0xd662}=0x1e05; # small b with dot below
1184 $chars{0xd664}=0x1e0d; # small d with dot below
1185 $chars{0xd665}=0x1eb9; # small e with dot below
1186 $chars{0xd668}=0x1e25; # small h with dot below
1187 $chars{0xd669}=0x1ecb; # small i with dot below
1188 $chars{0xd66b}=0x1e33; # small k with dot below
1189 $chars{0xd66c}=0x1e37; # small l with dot below
1190 $chars{0xd66d}=0x1e43; # small m with dot below
1191 $chars{0xd66e}=0x1e47; # small n with dot below
1192 $chars{0xd66f}=0x1ecd; # small o with dot below
1193 $chars{0xd672}=0x1e5b; # small r with dot below
1194 $chars{0xd673}=0x1e63; # small s with dot below
1195 $chars{0xd674}=0x1e6d; # small t with dot below
1196 $chars{0xd675}=0x1ee5; # small u with dot below
1197 $chars{0xd676}=0x1e7f; # small v with dot below
1198 $chars{0xd677}=0x1e89; # small w with dot below
1199 $chars{0xd679}=0x1ef5; # small y with dot below
1200 $chars{0xd67a}=0x1e93; # small z with dot below
1201 # 5/7 double dot below
1202 $chars{0xd755}=0x1e72; # capital u with diaeresis below
1203 $chars{0xd775}=0x1e73; # small u with diaeresis below
1205 $chars{0xd820}=0x005f; # underline
1206 # 5/9 double underline
1207 $chars{0xd920}=0x2017; # double underline
1208 # 5/10 small low vertical bar
1209 $chars{0xda20}=0x02cc; #
1210 # 5/11 circumflex below
1211 # 5/12 (this position shall not be used)
1212 # 5/13 left half of ligature sign and of double tilde
1213 # 5/14 right half of ligature sign
1214 # 5/15 right half of double tilde
1215 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1217 sub char_decode5426
{
1221 my @data = unpack("C*", $string);
1223 my $length=scalar(@data);
1224 for (my $i = 0; $i < scalar(@data); $i++) {
1225 my $char= $data[$i];
1226 if ($char >= 0x00 && $char <= 0x7F){
1229 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1230 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1233 if ($chars{$char*256+$data[$i+1]}) {
1234 $convchar= $chars{$char * 256 + $data[$i+1]};
1236 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1237 } elsif ($chars{$char}) {
1238 $convchar= $chars{$char};
1239 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1243 push @characters,$convchar;
1246 if ($chars{$char}) {
1247 $convchar= $chars{$char};
1248 # printf "char %x, converted %x\n",$char,$chars{$char};
1250 # printf "char %x $char\n",$char;
1253 push @characters,$convchar;
1256 $result=pack "U*",@characters;
1257 # $result=~s/\x01//;
1258 # $result=~s/\x00//;
1262 $result=~s/\x1b\x5b//;
1263 # map{printf "%x",$_} @characters;
1273 Koha Development Team <http://koha-community.org/>
1275 Galen Charlton <galen.charlton@liblime.com>