1 /**********************************************************************
2 Freeciv - Copyright (C) 1996 - A Kjeldberg, L Gregersen, P Unold
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2, or (at your option)
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12 ***********************************************************************/
15 #include <fc_config.h>
29 /* The length of a character for external use (at least 1 to avoid infinite
30 * loops). See also fc_ut8_next_char(). */
31 const char fc_utf8_skip
[256] = {
32 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
34 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
35 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
36 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
37 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
38 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
39 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
40 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10000000 to 10001111. */
41 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10010000 to 10011111. */
42 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10100000 to 10101111. */
43 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10110000 to 10111111. */
44 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
45 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
46 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
47 #ifdef USE_6_BYTES_CHAR
48 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 /* 11110000 to 11111111. */
50 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1 /* 11110000 to 11111111. */
51 #endif /* USE_6_BYTES_CHAR */
54 /* The length of a character for internal use (0 means an invalid start of a character). */
56 static const char fc_utf8_char_size
[256] = {
57 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
58 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
59 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
60 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
61 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
62 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
63 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
64 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
65 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10000000 to 10001111. */
66 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10010000 to 10011111. */
67 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10100000 to 10101111. */
68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10110000 to 10111111. */
69 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
70 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
71 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
72 #ifdef USE_6_BYTES_CHAR
73 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 /* 11110000 to 11111111. */
75 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 /* 11110000 to 11111111. */
76 #endif /* USE_6_BYTES_CHAR */
/* Size in bytes of the UTF-8 character starting at 'utf8_char', looked up in
 * fc_utf8_char_size[] by the first byte of the character (0 means that the
 * byte is not a valid start of a character).
 *
 * The argument and the whole expansion are parenthesized: without the
 * parentheses, an argument such as 'str + 1' would dereference 'str' first
 * and add 1 to the byte value, indexing the wrong table entry. The cast
 * keeps the 'const' qualifier because the byte is only read. */
#define FC_UTF8_CHAR_SIZE(utf8_char) \
  (fc_utf8_char_size[*(const unsigned char *) (utf8_char)])

/* UTF-8 encoding of the replacement character. NB: sizeof() of this string
 * literal counts the terminating '\0' too. */
#define FC_UTF8_REP_CHAR "\xef\xbf\xbd" /* U+FFFD. */
85 /****************************************************************************
86 Returns TRUE if the character beginning at the pointer 'utf8_char' of size
87 'size' is a valid UTF-8 character.
88 ****************************************************************************/
89 static inline bool base_fc_utf8_char_validate(const char *utf8_char
,
95 if (0x80 != (0xC0 & *(unsigned char *) utf8_char
)) {
96 /* Not a valid byte of the sequence. */
107 /****************************************************************************
108 UTF-8-safe variant of fc_strlcpy() base function.
109 ****************************************************************************/
110 static inline size_t base_fc_utf8_strlcpy_trunc(char *dest
, const char *src
,
116 (void) fc_utf8_validate_len(src
, n
, &end
);
120 memcpy(dest
, src
, len
);
126 /****************************************************************************
127 UTF-8-safe variant of fc_strlcpy() base function.
128 ****************************************************************************/
129 static inline size_t base_fc_utf8_strlcpy_rep(char *dest
, const char *src
,
135 fc_assert_ret_val(NULL
!= src
, 0);
137 src_len
= strlen(src
);
139 if (fc_utf8_validate_len(src
, n
, &end
)) {
146 memcpy(dest
, src
, len
);
148 dest
[len
] = '\0'; /* Valid UTF-8 string part. */
151 /* '*end' is not a valid UTF-8 character. */
157 memcpy(dest
, src
, len
);
163 /* Try to insert the replacement character. */
164 len
= sizeof(FC_UTF8_REP_CHAR
);
166 memcpy(dest
, FC_UTF8_REP_CHAR
, len
);
173 return src_len
; /* End of 'dest' reached. */
176 /* Jump to next character in src. */
177 src
= fc_utf8_find_next_char(end
);
180 return src_len
; /* End of 'src' reached. */
184 fc_assert(FALSE
); /* Shouldn't occur! */
189 /****************************************************************************
190 Returns TRUE if the character beginning at the pointer 'utf8_char' is
191 a valid UTF-8 character.
192 ****************************************************************************/
193 bool fc_utf8_char_validate(const char *utf8_char
)
195 fc_assert_ret_val(NULL
!= utf8_char
, FALSE
);
197 return base_fc_utf8_char_validate(utf8_char
, FC_UTF8_CHAR_SIZE(utf8_char
));
200 /****************************************************************************
201 Jump to next UTF-8 character start.
203 NB: This function can return an invalid UTF-8 character. Check with
204 fc_utf8_char_validate() to be sure.
205 ****************************************************************************/
206 char *fc_utf8_find_next_char(const char *utf8_char
)
208 fc_assert_ret_val(NULL
!= utf8_char
, NULL
);
212 } while (0 == FC_UTF8_CHAR_SIZE(utf8_char
));
213 return (char *) utf8_char
;
216 /****************************************************************************
217 Jump to previous UTF-8 character start in the limit of the 'utf8_string'
218 pointer. If no character is found, returns 'utf8_string'.
220 NB: This function can return an invalid UTF-8 character. Check with
221 fc_utf8_char_validate() to be sure.
222 ****************************************************************************/
223 char *fc_utf8_find_prev_char(const char *utf8_char
, const char *utf8_string
)
225 fc_assert_ret_val(NULL
!= utf8_char
, NULL
);
227 for (utf8_char
--; utf8_char
> utf8_string
; utf8_char
--) {
228 if (0 != FC_UTF8_CHAR_SIZE(utf8_char
)) {
229 return (char *) utf8_char
;
232 return (char *) utf8_string
;
236 /****************************************************************************
237 Returns TRUE if the string 'utf8_string' contains only valid UTF-8
238 characters. If 'end' is not NULL, the end of the valid string will be
239 stored there, even if it returns TRUE.
241 See also fc_utf8_validate_len().
242 ****************************************************************************/
243 bool fc_utf8_validate(const char *utf8_string
, const char **end
)
247 fc_assert_ret_val(NULL
!= utf8_string
, FALSE
);
249 while ('\0' != *utf8_string
) {
250 size
= FC_UTF8_CHAR_SIZE(utf8_string
);
251 if (!base_fc_utf8_char_validate(utf8_string
, size
)) {
265 /****************************************************************************
266 Returns TRUE if the string 'utf8_string' contains only valid UTF-8
267 characters in the limit of the length (in bytes) 'byte_len'. If 'end' is
268 not NULL, the end of the valid string will be stored there, even if it returns TRUE.
271 See also fc_utf8_validate().
272 ****************************************************************************/
273 bool fc_utf8_validate_len(const char *utf8_string
, size_t byte_len
,
278 fc_assert_ret_val(NULL
!= utf8_string
, FALSE
);
280 while ('\0' != *utf8_string
) {
281 size
= FC_UTF8_CHAR_SIZE(utf8_string
);
283 if (!base_fc_utf8_char_validate(utf8_string
, size
)) {
290 if (size
> byte_len
) {
307 /****************************************************************************
308 Truncate the string 'utf8_string' at the first invalid UTF-8 character.
309 Returns 'utf8_string'.
311 See also fc_utf8_validate(), fc_utf8_validate_trunc_len(),
312 and fc_utf8_validate_trunc_dup().
313 ****************************************************************************/
314 char *fc_utf8_validate_trunc(char *utf8_string
)
318 fc_assert_ret_val(NULL
!= utf8_string
, NULL
);
320 if (!fc_utf8_validate(utf8_string
, (const char **) &end
)) {
326 /****************************************************************************
327 Truncate the string 'utf8_string' at the first invalid UTF-8 character in
328 the limit (in bytes) of 'byte_len'. Returns 'utf8_string'.
330 See also fc_utf8_validate_trunc(), fc_utf8_validate_trunc_dup(),
331 and fc_utf8_validate_rep_len().
332 ****************************************************************************/
333 char *fc_utf8_validate_trunc_len(char *utf8_string
, size_t byte_len
)
337 fc_assert_ret_val(NULL
!= utf8_string
, NULL
);
339 if (!fc_utf8_validate_len(utf8_string
, byte_len
, (const char **) &end
)) {
345 /****************************************************************************
346 Duplicate the truncation of the string 'utf8_string' at the first invalid UTF-8 character.
349 See also fc_utf8_validate_trunc(), fc_utf8_validate_trunc_len(),
350 and fc_utf8_validate_rep_dup().
351 ****************************************************************************/
352 char *fc_utf8_validate_trunc_dup(const char *utf8_string
)
358 fc_assert_ret_val(NULL
!= utf8_string
, NULL
);
360 (void) fc_utf8_validate(utf8_string
, &end
);
361 size
= end
- utf8_string
;
362 ret
= fc_malloc(size
+ 1); /* Keep a spot for '\0'. */
363 memcpy(ret
, utf8_string
, size
);
369 /****************************************************************************
370 Transform 'utf8_string' by replacing all invalid characters with the
371 replacement character within the limit of 'byte_len', truncating the last
372 character. Returns 'utf8_string'.
374 See also fc_utf8_validate_len(), fc_utf8_validate_trunc(),
375 and fc_utf8_validate_rep_dup().
376 ****************************************************************************/
377 char *fc_utf8_validate_rep_len(char *utf8_string
, size_t byte_len
)
379 fc_assert_ret_val(NULL
!= utf8_string
, NULL
);
384 fc_strlcpy(copy
, utf8_string
, byte_len
);
385 base_fc_utf8_strlcpy_rep(utf8_string
, copy
, byte_len
);
390 /****************************************************************************
391 Duplicate 'utf8_string' and replace all invalid characters with the
392 replacement character.
394 See also fc_utf8_validate_rep_len(), and fc_utf8_validate_trunc_dup().
395 ****************************************************************************/
396 char *fc_utf8_validate_rep_dup(const char *utf8_string
)
399 const char *utf8_char
;
400 size_t size
= 1; /* '\0'. */
403 fc_assert_ret_val(NULL
!= utf8_string
, NULL
);
405 /* Check needed size. */
406 utf8_char
= utf8_string
;
407 while ('\0' != *utf8_char
) {
408 char_size
= FC_UTF8_CHAR_SIZE(utf8_char
);
409 if (base_fc_utf8_char_validate(utf8_char
, char_size
)) {
410 /* Normal valid character. */
412 utf8_char
+= char_size
;
414 /* Replacement character. */
415 size
+= sizeof(FC_UTF8_REP_CHAR
);
416 /* Find next character. */
419 } while (0 == FC_UTF8_CHAR_SIZE(utf8_char
));
423 /* Do the allocation. */
424 ret
= fc_malloc(size
);
425 base_fc_utf8_strlcpy_rep(ret
, utf8_string
, size
);
430 /****************************************************************************
431 Returns the number of characters in the string 'utf8_string'. To know the
432 number of used bytes, use strlen() instead.
434 NB: 'utf8_string' must be UTF-8 valid (see fc_utf8_validate()), or the
435 behaviour of this function will be unknown.
436 ****************************************************************************/
437 size_t fc_utf8_strlen(const char *utf8_string
)
441 fc_assert_ret_val(NULL
!= utf8_string
, 0);
443 for (len
= 0; '\0' != *utf8_string
; len
++) {
444 utf8_string
= fc_ut8_next_char(utf8_string
);
450 /****************************************************************************
451 This is a variant of fc_strlcpy() to ensure the result will be a valid
452 UTF-8 string. It truncates the string at the first invalid UTF-8 character.
455 See also fc_strlcpy(), fc_utf8_strlcpy_rep().
456 ****************************************************************************/
457 size_t fc_utf8_strlcpy_trunc(char *dest
, const char *src
, size_t n
)
459 fc_assert_ret_val(NULL
!= dest
, -1);
460 fc_assert_ret_val(NULL
!= src
, -1);
461 fc_assert_ret_val(0 < n
, -1);
463 return base_fc_utf8_strlcpy_trunc(dest
, src
, n
);
466 /****************************************************************************
467 This is a variant of fc_strlcpy() to ensure the result will be a valid
468 UTF-8 string. Unlike fc_utf8_strlcpy_trunc(), it replaces the invalid
469 characters by the replacement character, instead of truncating the string.
471 See also fc_strlcpy(), fc_utf8_strlcpy_trunc().
472 ****************************************************************************/
473 size_t fc_utf8_strlcpy_rep(char *dest
, const char *src
, size_t n
)
475 fc_assert_ret_val(NULL
!= dest
, -1);
476 fc_assert_ret_val(NULL
!= src
, -1);
477 fc_assert_ret_val(0 < n
, -1);
479 return base_fc_utf8_strlcpy_rep(dest
, src
, n
);
482 /****************************************************************************
483 This is a variant of fc_strlcat() to ensure the result will be a valid
484 UTF-8 string. It truncates the string at the first invalid UTF-8 character.
487 NB: This function doesn't perform anything on the already edited part of
488 the string 'dest', which can contain invalid UTF-8 characters.
490 See also fc_strlcat(), fc_utf8_strlcat_rep().
491 ****************************************************************************/
492 size_t fc_utf8_strlcat_trunc(char *dest
, const char *src
, size_t n
)
496 fc_assert_ret_val(NULL
!= dest
, -1);
497 fc_assert_ret_val(NULL
!= src
, -1);
498 fc_assert_ret_val(0 < n
, -1);
501 fc_assert_ret_val(len
< n
, -1);
502 return len
+ base_fc_utf8_strlcpy_trunc(dest
+ len
, src
, n
- len
);
505 /****************************************************************************
506 This is a variant of fc_strlcat() to ensure the result will be a valid
507 UTF-8 string. Unlike fc_utf8_strlcat_trunc(), it replaces the invalid
508 characters by the replacement character, instead of truncating the string.
510 NB: This function doesn't perform anything on the already edited part of
511 the string 'dest', which can contain invalid UTF-8 characters.
513 See also fc_strlcat(), fc_utf8_strlcat_trunc().
514 ****************************************************************************/
515 size_t fc_utf8_strlcat_rep(char *dest
, const char *src
, size_t n
)
519 fc_assert_ret_val(NULL
!= dest
, -1);
520 fc_assert_ret_val(NULL
!= src
, -1);
521 fc_assert_ret_val(0 < n
, -1);
524 fc_assert_ret_val(len
< n
, -1);
525 return len
+ base_fc_utf8_strlcpy_rep(dest
+ len
, src
, n
- len
);
528 /****************************************************************************
529 This is a variant of fc_snprintf() to ensure the result will be a valid
530 UTF-8 string. It truncates the string at the first invalid UTF-8 character.
533 See also fc_snprintf(), fc_utf8_snprintf_rep().
534 ****************************************************************************/
535 int fc_utf8_snprintf_trunc(char *str
, size_t n
, const char *format
, ...)
540 va_start(args
, format
);
541 ret
= fc_utf8_vsnprintf_trunc(str
, n
, format
, args
);
546 /****************************************************************************
547 This is a variant of fc_snprintf() to ensure the result will be a valid
548 UTF-8 string. Unlike fc_utf8_snprintf_trunc(), it replaces the invalid
549 characters by the replacement character, instead of truncating the string.
551 See also fc_snprintf(), fc_utf8_snprintf_trunc().
552 ****************************************************************************/
553 int fc_utf8_snprintf_rep(char *str
, size_t n
, const char *format
, ...)
558 va_start(args
, format
);
559 ret
= fc_utf8_vsnprintf_rep(str
, n
, format
, args
);
564 /****************************************************************************
565 This is a variant of fc_vsnprintf() to ensure the result will be a valid
566 UTF-8 string. It truncates the string at the first invalid UTF-8 character.
569 See also fc_vsnprintf(), fc_utf8_vsnprintf_rep().
570 ****************************************************************************/
571 int fc_utf8_vsnprintf_trunc(char *str
, size_t n
, const char *format
,
577 fc_assert_ret_val(NULL
!= str
, -1);
578 fc_assert_ret_val(0 < n
, -1);
579 fc_assert_ret_val(NULL
!= format
, -1);
581 ret
= fc_vsnprintf(str
, n
, format
, args
);
582 if (fc_utf8_validate(str
, (const char **) &end
)) {
583 /* Already valid UTF-8. */
586 /* Truncate at last valid UTF-8 character. */
588 return (-1 == ret
? -1 : end
- str
);
592 /****************************************************************************
593 This is a variant of fc_vsnprintf() to ensure the result will be a valid
594 UTF-8 string. Unlike fc_utf8_vsnprintf_trunc(), it replaces the invalid
595 characters by the replacement character, instead of truncating the string.
597 See also fc_vsnprintf(), fc_utf8_vsnprintf_trunc().
598 ****************************************************************************/
599 int fc_utf8_vsnprintf_rep(char *str
, size_t n
, const char *format
,
605 fc_assert_ret_val(NULL
!= str
, -1);
606 fc_assert_ret_val(0 < n
, -1);
607 fc_assert_ret_val(NULL
!= format
, -1);
609 ret
= fc_vsnprintf(str
, n
, format
, args
);
610 if (fc_utf8_validate(str
, (const char **) &end
)) {
611 /* Already valid UTF-8. */
614 (void) fc_utf8_validate_rep_len(end
, n
- (end
- str
));
615 return (-1 == ret
? -1 : strlen(str
));
619 /****************************************************************************
620 This is a variant of cat_snprintf() to ensure the result will be a valid
621 UTF-8 string. It truncates the string at the first invalid UTF-8 character.
624 NB: This function doesn't perform anything on the already edited part of
625 the string 'str', which can contain invalid UTF-8 characters.
627 See also cat_snprintf(), cat_utf8_snprintf_rep().
628 ****************************************************************************/
629 int cat_utf8_snprintf_trunc(char *str
, size_t n
, const char *format
, ...)
635 fc_assert_ret_val(NULL
!= format
, -1);
636 fc_assert_ret_val(NULL
!= str
, -1);
637 fc_assert_ret_val(0 < n
, -1);
640 fc_assert_ret_val(len
< n
, -1);
642 va_start(args
, format
);
643 ret
= fc_utf8_vsnprintf_trunc(str
+ len
, n
- len
, format
, args
);
645 return (-1 == ret
? -1 : ret
+ len
);
648 /****************************************************************************
649 This is a variant of cat_snprintf() to ensure the result will be a valid
650 UTF-8 string. Unlike cat_utf8_snprintf_trunc(), it replaces the invalid
651 characters by the replacement character, instead of truncating the string.
653 NB: This function doesn't perform anything on the already edited part of
654 the string 'str', which can contain invalid UTF-8 characters.
656 See also cat_snprintf(), cat_utf8_snprintf_trunc().
657 ****************************************************************************/
658 int cat_utf8_snprintf_rep(char *str
, size_t n
, const char *format
, ...)
664 fc_assert_ret_val(NULL
!= format
, -1);
665 fc_assert_ret_val(NULL
!= str
, -1);
666 fc_assert_ret_val(0 < n
, -1);
669 fc_assert_ret_val(len
< n
, -1);
671 va_start(args
, format
);
672 ret
= fc_utf8_vsnprintf_rep(str
+ len
, n
- len
, format
, args
);
674 return (-1 == ret
? -1 : ret
+ len
);