Update procedures
[shapes.git] / source / charconverters.cc
blobddfaebc4aa6784c592a1d1d232d615928aac3ca2
1 /* This file is part of Shapes.
3 * Shapes is free software: you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation, either version 3 of the License, or
6 * any later version.
8 * Shapes is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
13 * You should have received a copy of the GNU General Public License
14 * along with Shapes. If not, see <http://www.gnu.org/licenses/>.
16 * Copyright 2008, 2010 Henrik Tidefelt
19 #include "strrefdup.h"
20 #include "charconverters.h"
21 #include "shapesexceptions.h"
22 #include "glyphlist.h"
23 #include "characterencoding.h"
24 #include "texttypes.h"
27 #include <iconv.h>
28 #include <errno.h>// How come iconv is not enough?
29 #include <string>
30 #include <fstream>
31 #include <sstream>
35 using namespace Shapes;
37 void
38 iconv_maybe_open( iconv_t * converter, const char * to_encoding, const char * from_encoding )
40 if( *converter == (iconv_t)( -1 ) )
42 *converter = iconv_open( to_encoding, from_encoding );
43 if( *converter == (iconv_t)( -1 ) )
45 std::ostringstream msg;
46 msg << "iconv_open failed to create converter from " << from_encoding << " to " << to_encoding << "." ;
47 throw Exceptions::ExternalError( strrefdup( msg ) );
52 iconv_t
53 Helpers::requireUTF8ToMacRomanConverter( bool cleanup )
55 static iconv_t converter = (iconv_t)( - 1 );
56 if( cleanup )
58 if( converter != (iconv_t)( -1 ) )
60 iconv_close( converter );
61 converter = (iconv_t)( -1 );
64 else
66 iconv_maybe_open( & converter,
67 MAC_ROMAN, // This is meant to be what is called MacRoman in PDF.
68 "UTF-8" );
70 return converter;
73 iconv_t
74 Helpers::requireMacRomanToUTF8Converter( bool cleanup )
76 static iconv_t converter = (iconv_t)( - 1 );
77 if( cleanup )
79 if( converter != (iconv_t)( -1 ) )
81 iconv_close( converter );
82 converter = (iconv_t)( -1 );
85 else
87 iconv_maybe_open( & converter,
88 "UTF-8",
89 MAC_ROMAN ); // This is meant to be what is called MacRoman in PDF.
91 return converter;
94 iconv_t
95 Helpers::requireUTF8ToASCIIConverter( bool cleanup )
97 static iconv_t converter = (iconv_t)( - 1 );
98 if( cleanup )
100 if( converter != (iconv_t)( -1 ) )
102 iconv_close( converter );
103 converter = (iconv_t)( -1 );
106 else
108 iconv_maybe_open( & converter,
109 "ASCII", // This is used for the names of glyphs in a font
110 "UTF-8" );
112 return converter;
115 iconv_t
116 Helpers::requireUTF8ToUCS4Converter( bool cleanup )
118 static iconv_t converter = (iconv_t)( - 1 );
119 if( cleanup )
121 if( converter != (iconv_t)( -1 ) )
123 iconv_close( converter );
124 converter = (iconv_t)( -1 );
127 else
129 iconv_maybe_open( & converter,
130 UCS_4_INTERNAL,
131 "UTF-8" );
133 return converter;
136 iconv_t
137 Helpers::requireUCS4ToUTF8Converter( bool cleanup )
139 static iconv_t converter = (iconv_t)( - 1 );
140 if( cleanup )
142 if( converter != (iconv_t)( -1 ) )
144 iconv_close( converter );
145 converter = (iconv_t)( -1 );
148 else
150 iconv_maybe_open( & converter,
151 "UTF-8",
152 UCS_4_INTERNAL );
154 return converter;
157 iconv_t
158 Helpers::requireUCS4ToMacRomanConverter( bool cleanup )
160 static iconv_t converter = (iconv_t)( - 1 );
161 if( cleanup )
163 if( converter != (iconv_t)( -1 ) )
165 iconv_close( converter );
166 converter = (iconv_t)( -1 );
169 else
171 iconv_maybe_open( & converter,
172 MAC_ROMAN,
173 UCS_4_INTERNAL );
175 return converter;
178 iconv_t
179 Helpers::requireUTF16BEToUCS4Converter( bool cleanup )
181 static iconv_t converter = (iconv_t)( - 1 );
182 if( cleanup )
184 if( converter != (iconv_t)( -1 ) )
186 iconv_close( converter );
187 converter = (iconv_t)( -1 );
190 else
192 iconv_maybe_open( & converter,
193 UCS_4_INTERNAL,
194 "UTF-16BE" );
196 return converter;
199 iconv_t
200 Helpers::requireUCS4ToUTF16BEConverter( bool cleanup )
202 static iconv_t converter = (iconv_t)( - 1 );
203 if( cleanup )
205 if( converter != (iconv_t)( -1 ) )
207 iconv_close( converter );
208 converter = (iconv_t)( -1 );
211 else
213 iconv_maybe_open( & converter,
214 "UTF-16BE",
215 UCS_4_INTERNAL );
217 return converter;
220 iconv_t
221 Helpers::requireUTF8ToWinANSIConverter( bool cleanup )
223 static iconv_t converter = (iconv_t)( - 1 );
224 if( cleanup )
226 if( converter != (iconv_t)( -1 ) )
228 iconv_close( converter );
229 converter = (iconv_t)( -1 );
232 else
234 iconv_maybe_open( & converter,
235 "LATIN1", // This is meant to be what is called WinANSI in PDF.
236 "UTF-8" );
238 return converter;
241 iconv_t
242 Helpers::requireUTF8ToUTF16BEConverter( bool cleanup )
244 static iconv_t converter = (iconv_t)( - 1 );
245 if( cleanup )
247 if( converter != (iconv_t)( -1 ) )
249 iconv_close( converter );
250 converter = (iconv_t)( -1 );
253 else
255 iconv_maybe_open( & converter,
256 "UTF-16BE",
257 "UTF-8" );
259 return converter;
262 const FontMetrics::GlyphList &
263 Helpers::requireGlyphList( bool cleanup )
265 static const FontMetrics::GlyphList * converter = 0;
266 if( cleanup )
268 if( converter != 0 )
270 delete converter;
271 converter = 0;
274 else
276 if( converter == 0 )
278 std::string filename = Lang::Font::searchGlyphList( );
279 std::ifstream iFile( filename.c_str( ) );
280 if( ! iFile.is_open( ) )
282 std::ostringstream oss;
283 oss << "Could locate, but not open the glyph list " << filename ;
284 throw Exceptions::ExternalError( strrefdup( oss ) );
288 converter = new FontMetrics::GlyphList( iFile );
290 catch( const char * ball )
292 std::ostringstream oss;
293 oss << "Parsing the glyph list " << filename << " resulted in the error: " << ball ;
294 throw Exceptions::ExternalError( strrefdup( oss ) );
296 catch( const std::string ball )
298 std::ostringstream oss;
299 oss << "Parsing the glyph list " << filename << " resulted in the error: " << ball ;
300 throw Exceptions::ExternalError( strrefdup( oss ) );
302 catch( const Shapes::Exceptions::Exception & ball )
304 std::cerr << "Parsing the glyph list " << filename << " resulted in an error. Rethrowing." << std::endl ;
305 throw;
307 catch( ... )
309 throw Exceptions::InternalError( "An unrecognized exception was caught from glyph list parsing." );
313 return *converter;
316 const FontMetrics::CharacterEncoding &
317 Helpers::requireMacRomanEncoding( bool cleanup )
319 static const FontMetrics::CharacterEncoding * converter = 0;
320 if( cleanup )
322 if( converter != 0 )
324 delete converter;
325 converter = 0;
328 else
330 if( converter == 0 )
332 std::string filename = Lang::Font::searchCharacterEncoding( "MacRoman" );
333 std::ifstream iFile( filename.c_str( ) );
334 if( ! iFile.is_open( ) )
336 std::ostringstream oss;
337 oss << "Could locate, but not open the character encoding " << filename ;
338 throw Exceptions::ExternalError( strrefdup( oss ) );
342 converter = new FontMetrics::CharacterEncoding( iFile );
344 catch( const char * ball )
346 std::ostringstream oss;
347 oss << "Parsing the character encoding " << filename << " resulted in the error: " << ball ;
348 throw Exceptions::ExternalError( strrefdup( oss ) );
350 catch( const std::string ball )
352 std::ostringstream oss;
353 oss << "Parsing the character encoding " << filename << " resulted in the error: " << ball ;
354 throw Exceptions::ExternalError( strrefdup( oss ) );
356 catch( const Shapes::Exceptions::Exception & ball )
358 std::cerr << "Parsing the character encoding " << filename << " resulted in an error. Rethrowing." << std::endl ;
359 throw;
361 catch( ... )
363 throw Exceptions::InternalError( "An unrecognized exception was caught from character encoding parsing." );
367 return *converter;
370 unsigned char
371 Kernel::UnicodeCodePoint::get_MacRoman( ) const
373 static iconv_t converter = Helpers::requireUCS4ToMacRomanConverter( );
375 const size_t BUF_SIZE = 1;
376 char buf[ BUF_SIZE ];
377 char * dst = buf;
378 size_t outbytesleft = BUF_SIZE;
380 const char * src = reinterpret_cast< const char * >( & value_ );
381 size_t inbytesleft = sizeof( value_ );
383 size_t count = iconv( converter,
384 ICONV_CAST( & src ), & inbytesleft,
385 & dst, & outbytesleft );
386 if( count == (size_t)(-1) )
388 if( errno == EILSEQ )
390 std::ostringstream msg;
391 msg << "The UCS-4 code point U+" << std::hex << value_ << " cannot be represented in MacRoman encodig." ;
392 throw Exceptions::MiscellaneousRequirement( strrefdup( msg ) );
394 else if( errno == EINVAL )
396 throw Exceptions::InternalError( "Malformed UCS-4 value (in conversion to MacRoman)." );
398 else if( errno == E2BIG )
400 throw Exceptions::InternalError( "The MacRoman destination buffer was too small when encoding a single UCS-4 code point." );
402 else
404 std::ostringstream msg;
405 msg << "iconv failed with an unrecognized error code: " << errno ;
406 throw Exceptions::InternalError( strrefdup( msg ) );
409 else if( inbytesleft != 0 )
411 throw Exceptions::InternalError( "Failed to use the entire UCS-4 code point when converting to MacRoman." );
413 return *reinterpret_cast< unsigned char * >( buf );
416 void
417 Kernel::UnicodeCodePoint::decode_UTF8( const char ** src, size_t * src_avail )
419 static iconv_t converter = Helpers::requireUTF8ToUCS4Converter( );
421 char * dst = reinterpret_cast< char * >( & value_ );
422 size_t outbytesleft = sizeof( value_ );
424 size_t tmp_src_avail;
425 if( **src > 0 )
427 tmp_src_avail = 1;
429 else
431 switch( 0xF0 & **src )
433 case 0xE0: tmp_src_avail = 3; break;
434 case 0xF0: tmp_src_avail = 4; break;
435 default: tmp_src_avail = 2; break;
438 if( tmp_src_avail > *src_avail )
440 throw Exceptions::InternalError( "The UTF-8 source did not contain a complete character when initializing a single UCS-4 code point." );
442 *src_avail -= tmp_src_avail;
444 size_t count = iconv( converter,
445 ICONV_CAST( src ), & tmp_src_avail,
446 & dst, & outbytesleft );
447 if( count == (size_t)(-1) )
449 if( errno == EILSEQ )
451 throw Exceptions::InternalError( "Failed to initialize UCS-4 code point from UTF-8 data." );
453 else if( errno == EINVAL )
455 throw Exceptions::MiscellaneousRequirement( "Malformed UTF-8 value in initialization of UCS-4 code point." );
457 else if( errno == E2BIG )
459 throw Exceptions::InternalError( "The UTF-8 source buffer contained more than one character when initializing a single UCS-4 code point." );
461 else
463 std::ostringstream msg;
464 msg << "iconv failed with an unrecognized error code: " << errno ;
465 throw Exceptions::InternalError( strrefdup( msg ) );
468 else if( outbytesleft != 0 )
470 throw Exceptions::InternalError( "Failed to initialize the entire UCS-4 code point when converting from UTF-8." );
474 void
475 Kernel::UnicodeCodePoint::decode_UTF8( const char * src )
477 static iconv_t converter = Helpers::requireUTF8ToUCS4Converter( );
479 char * dst = reinterpret_cast< char * >( & value_ );
480 size_t outbytesleft = sizeof( value_ );
482 size_t tmp_src_avail;
483 if( *src > 0 )
485 tmp_src_avail = 1;
487 else
489 switch( 0xF0 & *src )
491 case 0xE0: tmp_src_avail = 3; break;
492 case 0xF0: tmp_src_avail = 4; break;
493 default: tmp_src_avail = 2; break;
497 size_t count = iconv( converter,
498 ICONV_CAST( & src ), & tmp_src_avail,
499 & dst, & outbytesleft );
500 if( count == (size_t)(-1) )
502 if( errno == EILSEQ )
504 throw Exceptions::InternalError( "Failed to initialize UCS-4 code point from UTF-8 data." );
506 else if( errno == EINVAL )
508 throw Exceptions::MiscellaneousRequirement( "Malformed UTF-8 value in initialization of UCS-4 code point." );
510 else if( errno == E2BIG )
512 throw Exceptions::InternalError( "The UTF-8 source buffer contained more than one character when initializing a single UCS-4 code point." );
514 else
516 std::ostringstream msg;
517 msg << "iconv failed with an unrecognized error code: " << errno ;
518 throw Exceptions::InternalError( strrefdup( msg ) );
521 else if( outbytesleft != 0 )
523 throw Exceptions::InternalError( "Failed to initialize the entire UCS-4 code point when converting from UTF-8." );
527 void
528 Kernel::UnicodeCodePoint::decode_UCS4( const char ** src, size_t * src_avail )
530 if( *src_avail < 4 )
532 throw Exceptions::InternalError( "Not enough data available when initializing UCS-4 code point (needs four bytes)." );
534 memcpy( reinterpret_cast< char * >( & value_ ), *src, 4 );
535 *src += 4;
536 *src_avail -= 4;
539 void
540 Kernel::UnicodeCodePoint::decode_UCS4( const char * src )
542 memcpy( reinterpret_cast< char * >( & value_ ), src, 4 );
545 void
546 Kernel::UnicodeCodePoint::encode_UTF8( char ** dst, size_t * dst_avail ) const
548 static iconv_t converter = Helpers::requireUCS4ToUTF8Converter( );
550 const char * src = reinterpret_cast< const char * >( & value_ );
551 size_t inbytesleft = sizeof( value_ );
553 size_t count = iconv( converter,
554 ICONV_CAST( & src ), & inbytesleft,
555 dst, dst_avail );
556 if( count == (size_t)(-1) )
558 if( errno == EILSEQ )
560 throw Exceptions::InternalError( "Failed to convert UCS-4 code point to UTF-8." );
562 else if( errno == EINVAL )
564 throw Exceptions::InternalError( "Malformed UCS-4 value (in conversion to UTF-8)." );
566 else if( errno == E2BIG )
568 throw Exceptions::InternalError( "The UTF-8 destination buffer was too small when encoding a single UCS-4 code point." );
570 else
572 std::ostringstream msg;
573 msg << "iconv failed with an unrecognized error code: " << errno ;
574 throw Exceptions::InternalError( strrefdup( msg ) );
577 else if( inbytesleft != 0 )
579 throw Exceptions::InternalError( "Failed to use the entire UCS-4 code point when converting to UTF-8." );
583 void
584 Kernel::UnicodeCodePoint::encode_UTF16BE( char ** dst, size_t * dst_avail ) const
586 static iconv_t converter = Helpers::requireUCS4ToUTF16BEConverter( );
588 const char * src = reinterpret_cast< const char * >( & value_ );
589 size_t inbytesleft = sizeof( value_ );
591 size_t count = iconv( converter,
592 ICONV_CAST( & src ), & inbytesleft,
593 dst, dst_avail );
594 if( count == (size_t)(-1) )
596 if( errno == EILSEQ )
598 throw Exceptions::InternalError( "Failed to convert UCS-4 code point to UTF-16-BE." );
600 else if( errno == EINVAL )
602 throw Exceptions::InternalError( "Malformed UCS-4 value (in conversion to UTF-16-BE)." );
604 else if( errno == E2BIG )
606 throw Exceptions::InternalError( "The UTF-16-BE destination buffer was too small when encoding a single UCS-4 code point." );
608 else
610 std::ostringstream msg;
611 msg << "iconv failed with an unrecognized error code: " << errno ;
612 throw Exceptions::InternalError( strrefdup( msg ) );
615 else if( inbytesleft != 0 )
617 throw Exceptions::InternalError( "Failed to use the entire UCS-4 code point when converting to UTF-16-BE." );
621 void
622 Kernel::UnicodeCodePoint::decode_glyph_name( const char * name )
624 static const FontMetrics::GlyphList & glyphList = Helpers::requireGlyphList( );
625 if( ! glyphList.name_to_UCS4( name, & value_ ) )
627 std::ostringstream msg;
628 msg << "The glyph name \"" << name << "\" is not in the glyph list, and cannot be converted to a UCS-4 code point." ;
629 throw Exceptions::InternalError( strrefdup( msg ) );
633 Kernel::UnicodeCodePoint Kernel::UnicodeCodePoint::SPACE( 32 );
634 Kernel::UnicodeCodePoint Kernel::UnicodeCodePoint::NEWLINE( 10 );