utility/fc_utf8.c

   1 /**********************************************************************
   2  Freeciv - Copyright (C) 1996 - A Kjeldberg, L Gregersen, P Unold
   3    This program is free software; you can redistribute it and/or modify
   4    it under the terms of the GNU General Public License as published by
   5    the Free Software Foundation; either version 2, or (at your option)
   6    any later version.
   7
   8    This program is distributed in the hope that it will be useful,
   9    but WITHOUT ANY WARRANTY; without even the implied warranty of
  10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11    GNU General Public License for more details.
  12 ***********************************************************************/
  13
  14 #ifdef HAVE_CONFIG_H
  15 #include <fc_config.h>
  16 #endif
  17
  18 #include <stdarg.h>
  19 #include <string.h>
  20
  21 /* utility */
  22 #include "log.h"
  23 #include "mem.h"
  24 #include "support.h"
  25
  26 #include "fc_utf8.h"
  27
  28
  29 /* The length of a character for external use (at least 1 to avoid infinite
  30  * loops). See also fc_ut8_next_char(). */
  31 const char fc_utf8_skip[256] = {
  32   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
  33   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
  34   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
  35   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
  36   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
  37   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
  38   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
  39   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
  40   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10000000 to 10001111. */
  41   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10010000 to 10011111. */
  42   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10100000 to 10101111. */
  43   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10110000 to 10111111. */
  44   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
  45   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
  46   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
  47 #ifdef USE_6_BYTES_CHAR
  48   4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1  /* 11110000 to 11111111. */
  49 #else
  50   4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1  /* 11110000 to 11111111. */
  51 #endif /* USE_6_BYTES_CHAR */
  52 };
  53
  54 /* The length of a character for internal use (0 means an invalid start of
  55  * a character). */
  56 static const char fc_utf8_char_size[256] = {
  57   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
  58   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
  59   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
  60   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
  61   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
  62   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
  63   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
  64   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
  65   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10000000 to 10001111. */
  66   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10010000 to 10011111. */
  67   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10100000 to 10101111. */
  68   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10110000 to 10111111. */
  69   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
  70   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
  71   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
  72 #ifdef USE_6_BYTES_CHAR
  73   4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0  /* 11110000 to 11111111. */
  74 #else
  75   4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0  /* 11110000 to 11111111. */
  76 #endif /* USE_6_BYTES_CHAR */
  77 };
  78
  79 #define FC_UTF8_CHAR_SIZE(utf8_char) \
  80   fc_utf8_char_size[*(unsigned char *) utf8_char]
  81
  82 #define FC_UTF8_REP_CHAR "\xef\xbf\xbd" /* U+FFFD. */
  83
  84
  85 /****************************************************************************
  86   Returns TRUE if the character beginning at the pointer 'utf8_char' of size
  87   'size' is a valid UTF-8 character.
  88 ****************************************************************************/
  89 static inline bool base_fc_utf8_char_validate(const char *utf8_char,
  90                                               char size)
  91 {
  92   if (1 < size) {
  93     do {
  94       utf8_char++;
  95       if (0x80 != (0xC0 & *(unsigned char *) utf8_char)) {
  96         /* Not a valid byte of the sequence. */
  97         return FALSE;
  98       }
  99       size--;
 100     } while (1 < size);
 101     return TRUE;
 102   } else {
 103     return (1 == size);
 104   }
 105 }
 106
 107 /****************************************************************************
 108   UTF-8-safe variant of fc_strlcpy() base function.
 109 ****************************************************************************/
 110 static inline size_t base_fc_utf8_strlcpy_trunc(char *dest, const char *src,
 111                                                 size_t n)
 112 {
 113   const char *end;
 114   size_t len;
 115
 116   (void) fc_utf8_validate_len(src, n, &end);
 117   len = end - src;
 118   fc_assert(len < n);
 119   if (0 < len) {
 120     memcpy(dest, src, len);
 121   }
 122   dest[len] = '\0';
 123   return strlen(src);
 124 }
 125
 126 /****************************************************************************
 127   UTF-8-safe variant of fc_strlcpy() base function.
 128 ****************************************************************************/
 129 static inline size_t base_fc_utf8_strlcpy_rep(char *dest, const char *src,
 130                                               size_t n)
 131 {
 132   const char *end;
 133   size_t src_len, len;
 134
 135   fc_assert_ret_val(NULL != src, 0);
 136
 137   src_len = strlen(src);
 138   while (TRUE) {
 139     if (fc_utf8_validate_len(src, n, &end)) {
 140       /* Valid UTF-8. */
 141       len = end - src;
 142
 143       fc_assert(len < n);
 144
 145       if (0 < len) {
 146         memcpy(dest, src, len);
 147       }
 148       dest[len] = '\0'; /* Valid UTF-8 string part. */
 149       return src_len;
 150     } else {
 151       /* '*end' is not a valid UTF-8 character. */
 152       len = end - src;
 153
 154       fc_assert(len < n);
 155
 156       if (0 < len) {
 157         memcpy(dest, src, len);
 158       }
 159
 160       n -= len;
 161       dest += len;
 162
 163       /* Try to insert the replacement character. */
 164       len = sizeof(FC_UTF8_REP_CHAR);
 165       if (n > len) {
 166         memcpy(dest, FC_UTF8_REP_CHAR, len);
 167         n -= len;
 168         dest += len;
 169       }
 170
 171       if (1 == n) {
 172         *dest = '\0';
 173         return src_len; /* End of 'dest' reached. */
 174       }
 175
 176       /* Jump to next character in src. */
 177       src = fc_utf8_find_next_char(end);
 178       if ('\0' == *src) {
 179         *dest = '\0';
 180         return src_len; /* End of 'src' reached. */
 181       }
 182     }
 183   }
 184   fc_assert(FALSE);     /* Shouldn't occur! */
 185   return src_len;
 186 }
 187
 188
 189 /****************************************************************************
 190   Returns TRUE if the character beginning at the pointer 'utf8_char' is
 191   a valid UTF-8 character.
 192 ****************************************************************************/
 193 bool fc_utf8_char_validate(const char *utf8_char)
 194 {
 195   fc_assert_ret_val(NULL != utf8_char, FALSE);
 196
 197   return base_fc_utf8_char_validate(utf8_char, FC_UTF8_CHAR_SIZE(utf8_char));
 198 }
 199
 200 /****************************************************************************
 201   Jump to next UTF-8 character start.
 202
 203   NB: This function can return a invalid UTF-8 character. Check with
 204   fc_utf8_char_validate() to unsure.
 205 ****************************************************************************/
 206 char *fc_utf8_find_next_char(const char *utf8_char)
 207 {
 208   fc_assert_ret_val(NULL != utf8_char, NULL);
 209
 210   do {
 211     utf8_char++;
 212   } while (0 == FC_UTF8_CHAR_SIZE(utf8_char));
 213   return (char *) utf8_char;
 214 }
 215
 216 /****************************************************************************
 217   Jump to previous UTF-8 character start in the limit of the 'utf8_string'
 218   pointer. If no character is found, returns 'utf8_string'.
 219
 220   NB: This function can return a invalid UTF-8 character. Check with
 221   fc_utf8_char_validate() to unsure.
 222 ****************************************************************************/
 223 char *fc_utf8_find_prev_char(const char *utf8_char, const char *utf8_string)
 224 {
 225   fc_assert_ret_val(NULL != utf8_char, NULL);
 226
 227   for (utf8_char--; utf8_char > utf8_string; utf8_char--) {
 228     if (0 != FC_UTF8_CHAR_SIZE(utf8_char)) {
 229       return (char *) utf8_char;
 230     }
 231   }
 232   return (char *) utf8_string;
 233 }
 234
 235
 236 /****************************************************************************
 237   Returns TRUE if the string 'utf8_string' contains only valid UTF-8
 238   characters. If 'end' is not NULL, the end of the valid string will be
 239   stored there, even if it returns TRUE.
 240
 241   See also fc_utf8_validate_len().
 242 ****************************************************************************/
 243 bool fc_utf8_validate(const char *utf8_string, const char **end)
 244 {
 245   char size;
 246
 247   fc_assert_ret_val(NULL != utf8_string, FALSE);
 248
 249   while ('\0' != *utf8_string) {
 250     size = FC_UTF8_CHAR_SIZE(utf8_string);
 251     if (!base_fc_utf8_char_validate(utf8_string, size)) {
 252       if (NULL != end) {
 253         *end = utf8_string;
 254       }
 255       return FALSE;
 256     }
 257     utf8_string += size;
 258   }
 259   if (NULL != end) {
 260     *end = utf8_string;
 261   }
 262   return TRUE;
 263 }
 264
 265 /****************************************************************************
 266   Returns TRUE if the string 'utf8_string' contains only valid UTF-8
 267   characters in the limit of the length (in bytes) 'byte_len'. If 'end' is
 268   not NULL, the end of the valid string will be stored there, even if it
 269   returns TRUE.
 270
 271   See also fc_utf8_validate().
 272 ****************************************************************************/
 273 bool fc_utf8_validate_len(const char *utf8_string, size_t byte_len,
 274                           const char **end)
 275 {
 276   char size;
 277
 278   fc_assert_ret_val(NULL != utf8_string, FALSE);
 279
 280   while ('\0' != *utf8_string) {
 281     size = FC_UTF8_CHAR_SIZE(utf8_string);
 282
 283     if (!base_fc_utf8_char_validate(utf8_string, size)) {
 284       if (NULL != end) {
 285         *end = utf8_string;
 286       }
 287       return FALSE;
 288     }
 289
 290     if (size > byte_len) {
 291       if (NULL != end) {
 292         *end = utf8_string;
 293       }
 294       return FALSE;
 295     } else {
 296       byte_len -= size;
 297     }
 298
 299     utf8_string += size;
 300   }
 301   if (NULL != end) {
 302     *end = utf8_string;
 303   }
 304   return TRUE;
 305 }
 306
 307 /****************************************************************************
 308   Truncate the string 'utf8_string' at the first invalid UTF-8 character.
 309   Returns 'utf8_string'.
 310
 311   See also fc_utf8_validate(), fc_utf8_validate_trunc_len(),
 312   and fc_utf8_validate_trunc_dup().
 313 ****************************************************************************/
 314 char *fc_utf8_validate_trunc(char *utf8_string)
 315 {
 316   char *end;
 317
 318   fc_assert_ret_val(NULL != utf8_string, NULL);
 319
 320   if (!fc_utf8_validate(utf8_string, (const char **) &end)) {
 321     *end = '\0';
 322   }
 323   return utf8_string;
 324 }
 325
 326 /****************************************************************************
 327   Truncate the string 'utf8_string' at the first invalid UTF-8 character in
 328   the limit (in bytes) of 'byte_len'. Returns 'utf8_string'.
 329
 330   See also fc_utf8_validate_trunc(), fc_utf8_validate_trunc_dup(),
 331   and fc_utf8_validate_rep_len().
 332 ****************************************************************************/
 333 char *fc_utf8_validate_trunc_len(char *utf8_string, size_t byte_len)
 334 {
 335   char *end;
 336
 337   fc_assert_ret_val(NULL != utf8_string, NULL);
 338
 339   if (!fc_utf8_validate_len(utf8_string, byte_len, (const char **) &end)) {
 340     *end = '\0';
 341   }
 342   return utf8_string;
 343 }
 344
 345 /****************************************************************************
 346   Duplicate the truncation of the string 'utf8_string' at the first invalid
 347   UTF-8 character.
 348
 349   See also fc_utf8_validate_trunc(), fc_utf8_validate_trunc_len(),
 350   and fc_utf8_validate_rep_dup().
 351 ****************************************************************************/
 352 char *fc_utf8_validate_trunc_dup(const char *utf8_string)
 353 {
 354   const char *end;
 355   size_t size;
 356   char *ret;
 357
 358   fc_assert_ret_val(NULL != utf8_string, NULL);
 359
 360   (void) fc_utf8_validate(utf8_string, &end);
 361   size = end - utf8_string;
 362   ret = fc_malloc(size + 1);    /* Keep a spot for '\0'. */
 363   memcpy(ret, utf8_string, size);
 364   ret[size] = '\0';
 365
 366   return ret;
 367 }
 368
 369 /****************************************************************************
 370   Transform 'utf8_string' with replacing all invalid characters with the
 371   replacement character in the limit of 'byte_len', truncate the last
 372   character. Returns 'utf8_string'.
 373
 374   See also fc_utf8_validate_len(), fc_utf8_validate_trunc(),
 375   and fc_utf8_validate_rep_dup().
 376 ****************************************************************************/
 377 char *fc_utf8_validate_rep_len(char *utf8_string, size_t byte_len)
 378 {
 379   fc_assert_ret_val(NULL != utf8_string, NULL);
 380
 381   if (0 < byte_len) {
 382     char copy[byte_len];
 383
 384     fc_strlcpy(copy, utf8_string, byte_len);
 385     base_fc_utf8_strlcpy_rep(utf8_string, copy, byte_len);
 386   }
 387   return utf8_string;
 388 }
 389
 390 /****************************************************************************
 391   Duplicate 'utf8_string' and replace all invalid characters with the
 392   replacement character.
 393
 394   See also fc_utf8_validate_rep_len(), and fc_utf8_validate_trunc_dup().
 395 ****************************************************************************/
 396 char *fc_utf8_validate_rep_dup(const char *utf8_string)
 397 {
 398   char *ret;
 399   const char *utf8_char;
 400   size_t size = 1;      /* '\0'. */
 401   char char_size;
 402
 403   fc_assert_ret_val(NULL != utf8_string, NULL);
 404
 405   /* Check needed size. */
 406   utf8_char = utf8_string;
 407   while ('\0' != *utf8_char) {
 408     char_size = FC_UTF8_CHAR_SIZE(utf8_char);
 409     if (base_fc_utf8_char_validate(utf8_char, char_size)) {
 410       /* Normal valid character. */
 411       size += char_size;
 412       utf8_char += char_size;
 413     } else {
 414       /* Replacement character. */
 415       size += sizeof(FC_UTF8_REP_CHAR);
 416       /* Find next character. */
 417       do {
 418         utf8_char++;
 419       } while (0 == FC_UTF8_CHAR_SIZE(utf8_char));
 420     }
 421   }
 422
 423   /* Do the allocation. */
 424   ret = fc_malloc(size);
 425   base_fc_utf8_strlcpy_rep(ret, utf8_string, size);
 426
 427   return ret;
 428 }
 429
 430 /****************************************************************************
 431   Returns the number of characters in the string 'utf8_string'. To know the
 432   number of used bytes, used strlen() instead.
 433
 434   NB: 'utf8_string' must be UTF-8 valid (see fc_utf8_validate()), or the
 435   behaviour of this function will be unknown.
 436 ****************************************************************************/
 437 size_t fc_utf8_strlen(const char *utf8_string)
 438 {
 439   size_t len;
 440
 441   fc_assert_ret_val(NULL != utf8_string, 0);
 442
 443   for (len = 0; '\0' != *utf8_string; len++) {
 444     utf8_string = fc_ut8_next_char(utf8_string);
 445   }
 446   return len;
 447 }
 448
 449
 450 /****************************************************************************
 451   This is a variant of fc_strlcpy() to unsure the result will be a valid
 452   UTF-8 string. It truncates the string at the first UTF-8 invalid
 453   character.
 454
 455   See also fc_strlcpy(), fc_utf8_strlcpy_rep().
 456 ****************************************************************************/
 457 size_t fc_utf8_strlcpy_trunc(char *dest, const char *src, size_t n)
 458 {
 459   fc_assert_ret_val(NULL != dest, -1);
 460   fc_assert_ret_val(NULL != src, -1);
 461   fc_assert_ret_val(0 < n, -1);
 462
 463   return base_fc_utf8_strlcpy_trunc(dest, src, n);
 464 }
 465
 466 /****************************************************************************
 467   This is a variant of fc_strlcpy() to unsure the result will be a valid
 468   UTF-8 string. Unlike fc_utf8_strlcpy_trunc(), it replaces the invalid
 469   characters by the replacement character, instead of truncating the string.
 470
 471   See also fc_strlcpy(), fc_utf8_strlcpy_trunc().
 472 ****************************************************************************/
 473 size_t fc_utf8_strlcpy_rep(char *dest, const char *src, size_t n)
 474 {
 475   fc_assert_ret_val(NULL != dest, -1);
 476   fc_assert_ret_val(NULL != src, -1);
 477   fc_assert_ret_val(0 < n, -1);
 478
 479   return base_fc_utf8_strlcpy_rep(dest, src, n);
 480 }
 481
 482 /****************************************************************************
 483   This is a variant of fc_strlcat() to unsure the result will be a valid
 484   UTF-8 string. It truncates the string at the first UTF-8 invalid
 485   character.
 486
 487   NB: This function doesn't perform anything on the already edited part of
 488   the string 'dest', which can contain invalid UTF-8 characters.
 489
 490   See also fc_strlcat(), fc_utf8_strlcat_rep().
 491 ****************************************************************************/
 492 size_t fc_utf8_strlcat_trunc(char *dest, const char *src, size_t n)
 493 {
 494   size_t len;
 495
 496   fc_assert_ret_val(NULL != dest, -1);
 497   fc_assert_ret_val(NULL != src, -1);
 498   fc_assert_ret_val(0 < n, -1);
 499
 500   len = strlen(dest);
 501   fc_assert_ret_val(len < n, -1);
 502   return len + base_fc_utf8_strlcpy_trunc(dest + len, src, n - len);
 503 }
 504
 505 /****************************************************************************
 506   This is a variant of fc_strlcat() to unsure the result will be a valid
 507   UTF-8 string. Unlike fc_utf8_strlcat_trunc(), it replaces the invalid
 508   characters by the replacement character, instead of truncating the string.
 509
 510   NB: This function doesn't perform anything on the already edited part of
 511   the string 'dest', which can contain invalid UTF-8 characters.
 512
 513   See also fc_strlcat(), fc_utf8_strlcat_trunc().
 514 ****************************************************************************/
 515 size_t fc_utf8_strlcat_rep(char *dest, const char *src, size_t n)
 516 {
 517   size_t len;
 518
 519   fc_assert_ret_val(NULL != dest, -1);
 520   fc_assert_ret_val(NULL != src, -1);
 521   fc_assert_ret_val(0 < n, -1);
 522
 523   len = strlen(dest);
 524   fc_assert_ret_val(len < n, -1);
 525   return len + base_fc_utf8_strlcpy_rep(dest + len, src, n - len);
 526 }
 527
 528 /****************************************************************************
 529   This is a variant of fc_snprintf() to unsure the result will be a valid
 530   UTF-8 string. It truncates the string at the first UTF-8 invalid
 531   character.
 532
 533   See also fc_snprintf(), fc_utf8_snprintf_rep().
 534 ****************************************************************************/
 535 int fc_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
 536 {
 537   int ret;
 538   va_list args;
 539
 540   va_start(args, format);
 541   ret = fc_utf8_vsnprintf_trunc(str, n, format, args);
 542   va_end(args);
 543   return ret;
 544 }
 545
 546 /****************************************************************************
 547   This is a variant of fc_snprintf() to unsure the result will be a valid
 548   UTF-8 string. Unlike fc_utf8_snprintf_trunc(), it replaces the invalid
 549   characters by the replacement character, instead of truncating the string.
 550
 551   See also fc_snprintf(), fc_utf8_snprintf_trunc().
 552 ****************************************************************************/
 553 int fc_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
 554 {
 555   int ret;
 556   va_list args;
 557
 558   va_start(args, format);
 559   ret = fc_utf8_vsnprintf_rep(str, n, format, args);
 560   va_end(args);
 561   return ret;
 562 }
 563
 564 /****************************************************************************
 565   This is a variant of fc_vsnprintf() to unsure the result will be a valid
 566   UTF-8 string. It truncates the string at the first UTF-8 invalid
 567   character.
 568
 569   See also fc_vsnprintf(), fc_utf8_vsnprintf_rep().
 570 ****************************************************************************/
 571 int fc_utf8_vsnprintf_trunc(char *str, size_t n, const char *format,
 572                             va_list args)
 573 {
 574   char *end;
 575   int ret;
 576
 577   fc_assert_ret_val(NULL != str, -1);
 578   fc_assert_ret_val(0 < n, -1);
 579   fc_assert_ret_val(NULL != format, -1);
 580
 581   ret = fc_vsnprintf(str, n, format, args);
 582   if (fc_utf8_validate(str, (const char **) &end)) {
 583     /* Already valid UTF-8. */
 584     return ret;
 585   } else {
 586     /* Truncate at last valid UTF-8 character. */
 587     *end = '\0';
 588     return (-1 == ret ? -1 : end - str);
 589   }
 590 }
 591
 592 /****************************************************************************
 593   This is a variant of fc_vsnprintf() to unsure the result will be a valid
 594   UTF-8 string. Unlike fc_utf8_vsnprintf_trunc(), it replaces the invalid
 595   characters by the replacement character, instead of truncating the string.
 596
 597   See also fc_vsnprintf(), fc_utf8_vsnprintf_trunc().
 598 ****************************************************************************/
 599 int fc_utf8_vsnprintf_rep(char *str, size_t n, const char *format,
 600                           va_list args)
 601 {
 602   char *end;
 603   int ret;
 604
 605   fc_assert_ret_val(NULL != str, -1);
 606   fc_assert_ret_val(0 < n, -1);
 607   fc_assert_ret_val(NULL != format, -1);
 608
 609   ret = fc_vsnprintf(str, n, format, args);
 610   if (fc_utf8_validate(str, (const char **) &end)) {
 611     /* Already valid UTF-8. */
 612     return ret;
 613   } else {
 614     (void) fc_utf8_validate_rep_len(end, n - (end - str));
 615     return (-1 == ret ? -1 : strlen(str));
 616   }
 617 }
 618
 619 /****************************************************************************
 620   This is a variant of cat_snprintf() to unsure the result will be a valid
 621   UTF-8 string. It truncates the string at the first UTF-8 invalid
 622   character.
 623
 624   NB: This function doesn't perform anything on the already edited part of
 625   the string 'str', which can contain invalid UTF-8 characters.
 626
 627   See also cat_snprintf(), cat_utf8_snprintf_rep().
 628 ****************************************************************************/
 629 int cat_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
 630 {
 631   size_t len;
 632   int ret;
 633   va_list args;
 634
 635   fc_assert_ret_val(NULL != format, -1);
 636   fc_assert_ret_val(NULL != str, -1);
 637   fc_assert_ret_val(0 < n, -1);
 638
 639   len = strlen(str);
 640   fc_assert_ret_val(len < n, -1);
 641
 642   va_start(args, format);
 643   ret = fc_utf8_vsnprintf_trunc(str + len, n - len, format, args);
 644   va_end(args);
 645   return (-1 == ret ? -1 : ret + len);
 646 }
 647
 648 /****************************************************************************
 649   This is a variant of cat_snprintf() to unsure the result will be a valid
 650   UTF-8 string. Unlike cat_utf8_snprintf_trunc(), it replaces the invalid
 651   characters by the replacement character, instead of truncating the string.
 652
 653   NB: This function doesn't perform anything on the already edited part of
 654   the string 'str', which can contain invalid UTF-8 characters.
 655
 656   See also cat_snprintf(), cat_utf8_snprintf_trunc().
 657 ****************************************************************************/
 658 int cat_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
 659 {
 660   size_t len;
 661   int ret;
 662   va_list args;
 663
 664   fc_assert_ret_val(NULL != format, -1);
 665   fc_assert_ret_val(NULL != str, -1);
 666   fc_assert_ret_val(0 < n, -1);
 667
 668   len = strlen(str);
 669   fc_assert_ret_val(len < n, -1);
 670
 671   va_start(args, format);
 672   ret = fc_utf8_vsnprintf_rep(str + len, n - len, format, args);
 673   va_end(args);
 674   return (-1 == ret ? -1 : ret + len);
 675 }