Merge branch 'source-get-id-docs' into 'master'
[glib.git] / glib / gconvert.c
blob50286478c4d8998a3409e8d2ccdd4f52f0682195
1 /* GLIB - Library of useful routines for C programming
3 * gconvert.c: Convert between character sets using iconv
4 * Copyright Red Hat Inc., 2000
5 * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21 #include "config.h"
22 #include "glibconfig.h"
24 #ifndef G_OS_WIN32
25 #include <iconv.h>
26 #endif
27 #include <errno.h>
28 #include <stdio.h>
29 #include <string.h>
30 #include <stdlib.h>
32 #ifdef G_OS_WIN32
33 #include "win_iconv.c"
34 #endif
36 #ifdef G_PLATFORM_WIN32
37 #define STRICT
38 #include <windows.h>
39 #undef STRICT
40 #endif
42 #include "gconvert.h"
44 #include "gcharsetprivate.h"
45 #include "gslist.h"
46 #include "gstrfuncs.h"
47 #include "gtestutils.h"
48 #include "gthread.h"
49 #include "gunicode.h"
50 #include "gfileutils.h"
52 #include "glibintl.h"
54 #if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H)
55 #error GNU libiconv in use but included iconv.h not from libiconv
56 #endif
57 #if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H) \
58 && !defined (__APPLE_CC__) && !defined (__LP_64__)
59 #error GNU libiconv not in use but included iconv.h is from libiconv
60 #endif
63 /**
64 * SECTION:conversions
65 * @title: Character Set Conversion
66 * @short_description: convert strings between different character sets
68 * The g_convert() family of functions wraps the functionality of iconv().
69 * In addition to pure character set conversions, GLib has functions to
70 * deal with the extra complications of encodings for file names.
72 * ## File Name Encodings
74 * Historically, UNIX has not had a defined encoding for file names:
75 * a file name is valid as long as it does not have path separators
76 * in it ("/"). However, displaying file names may require conversion:
77 * from the character set in which they were created, to the character
78 * set in which the application operates. Consider the Spanish file name
79 * "Presentación.sxi". If the application which created it uses
80 * ISO-8859-1 for its encoding,
81 * |[
82 * Character: P r e s e n t a c i ó n . s x i
83 * Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69
84 * ]|
85 * However, if the application uses UTF-8, the actual file name on
86 * disk would look like this:
87 * |[
88 * Character: P r e s e n t a c i ó n . s x i
89 * Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69
90 * ]|
91 * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use
92 * GLib do the same thing. If you get a file name from the file system,
93 * for example, from readdir() or from g_dir_read_name(), and you wish
94 * to display the file name to the user, you will need to convert it
95 * into UTF-8. The opposite case is when the user types the name of a
96 * file they wish to save: the toolkit will give you that string in
97 * UTF-8 encoding, and you will need to convert it to the character
98 * set used for file names before you can create the file with open()
99 * or fopen().
101 * By default, GLib assumes that file names on disk are in UTF-8
102 * encoding. This is a valid assumption for file systems which
103 * were created relatively recently: most applications use UTF-8
104 * encoding for their strings, and that is also what they use for
105 * the file names they create. However, older file systems may
106 * still contain file names created in "older" encodings, such as
107 * ISO-8859-1. In this case, for compatibility reasons, you may want
108 * to instruct GLib to use that particular encoding for file names
109 * rather than UTF-8. You can do this by specifying the encoding for
110 * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING]
111 * environment variable. For example, if your installation uses
112 * ISO-8859-1 for file names, you can put this in your `~/.profile`:
113 * |[
114 * export G_FILENAME_ENCODING=ISO-8859-1
115 * ]|
116 * GLib provides the functions g_filename_to_utf8() and
117 * g_filename_from_utf8() to perform the necessary conversions.
118 * These functions convert file names from the encoding specified
119 * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This
120 * [diagram][file-name-encodings-diagram] illustrates how
121 * these functions are used to convert between UTF-8 and the
122 * encoding for file names in the file system.
124 * ## Conversion between file name encodings # {#file-name-encodings-diagram}
126 * ![](file-name-encodings.png)
128 * ## Checklist for Application Writers
130 * This section is a practical summary of the detailed
131 * things to do to make sure your applications process file
132 * name encodings correctly.
134 * 1. If you get a file name from the file system from a function
135 * such as readdir() or gtk_file_chooser_get_filename(), you do
136 * not need to do any conversion to pass that file name to
137 * functions like open(), rename(), or fopen() -- those are "raw"
138 * file names which the file system understands.
140 * 2. If you need to display a file name, convert it to UTF-8 first
141 * by using g_filename_to_utf8(). If conversion fails, display a
142 * string like "Unknown file name". Do not convert this string back
143 * into the encoding used for file names if you wish to pass it to
144 * the file system; use the original file name instead.
146 * For example, the document window of a word processor could display
147 * "Unknown file name" in its title bar but still let the user save
148 * the file, as it would keep the raw file name internally. This
149 * can happen if the user has not set the `G_FILENAME_ENCODING`
150 * environment variable even though they have files whose names are
151 * not encoded in UTF-8.
153 * 3. If your user interface lets the user type a file name for saving
154 * or renaming, convert it to the encoding used for file names in
155 * the file system by using g_filename_from_utf8(). Pass the converted
156 * file name to functions like fopen(). If conversion fails, ask the
157 * user to enter a different file name. This can happen if the user
158 * types Japanese characters when `G_FILENAME_ENCODING` is set to
159 * `ISO-8859-1`, for example.
162 /* We try to terminate strings in unknown charsets with this many zero bytes
163 * to ensure that multibyte strings really are nul-terminated when we return
164 * them from g_convert() and friends.
166 #define NUL_TERMINATOR_LENGTH 4
168 G_DEFINE_QUARK (g_convert_error, g_convert_error)
170 static gboolean
171 try_conversion (const char *to_codeset,
172 const char *from_codeset,
173 iconv_t *cd)
175 *cd = iconv_open (to_codeset, from_codeset);
177 if (*cd == (iconv_t)-1 && errno == EINVAL)
178 return FALSE;
179 else
180 return TRUE;
183 static gboolean
184 try_to_aliases (const char **to_aliases,
185 const char *from_codeset,
186 iconv_t *cd)
188 if (to_aliases)
190 const char **p = to_aliases;
191 while (*p)
193 if (try_conversion (*p, from_codeset, cd))
194 return TRUE;
196 p++;
200 return FALSE;
204 * g_iconv_open: (skip)
205 * @to_codeset: destination codeset
206 * @from_codeset: source codeset
208 * Same as the standard UNIX routine iconv_open(), but
209 * may be implemented via libiconv on UNIX flavors that lack
210 * a native implementation.
212 * GLib provides g_convert() and g_locale_to_utf8() which are likely
213 * more convenient than the raw iconv wrappers.
215 * Returns: a "conversion descriptor", or (GIConv)-1 if
216 * opening the converter failed.
218 GIConv
219 g_iconv_open (const gchar *to_codeset,
220 const gchar *from_codeset)
222 iconv_t cd;
224 if (!try_conversion (to_codeset, from_codeset, &cd))
226 const char **to_aliases = _g_charset_get_aliases (to_codeset);
227 const char **from_aliases = _g_charset_get_aliases (from_codeset);
229 if (from_aliases)
231 const char **p = from_aliases;
232 while (*p)
234 if (try_conversion (to_codeset, *p, &cd))
235 goto out;
237 if (try_to_aliases (to_aliases, *p, &cd))
238 goto out;
240 p++;
244 if (try_to_aliases (to_aliases, from_codeset, &cd))
245 goto out;
248 out:
249 return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;
253 * g_iconv: (skip)
254 * @converter: conversion descriptor from g_iconv_open()
255 * @inbuf: bytes to convert
256 * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
257 * @outbuf: converted output bytes
258 * @outbytes_left: inout parameter, bytes available to fill in @outbuf
260 * Same as the standard UNIX routine iconv(), but
261 * may be implemented via libiconv on UNIX flavors that lack
262 * a native implementation.
264 * GLib provides g_convert() and g_locale_to_utf8() which are likely
265 * more convenient than the raw iconv wrappers.
267 * Note that the behaviour of iconv() for characters which are valid in the
268 * input character set, but which have no representation in the output character
269 * set, is implementation defined. This function may return success (with a
270 * positive number of non-reversible conversions as replacement characters were
271 * used), or it may return -1 and set an error such as %EILSEQ, in such a
272 * situation.
274 * Returns: count of non-reversible conversions, or -1 on error
276 gsize
277 g_iconv (GIConv converter,
278 gchar **inbuf,
279 gsize *inbytes_left,
280 gchar **outbuf,
281 gsize *outbytes_left)
283 iconv_t cd = (iconv_t)converter;
285 return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);
289 * g_iconv_close: (skip)
290 * @converter: a conversion descriptor from g_iconv_open()
292 * Same as the standard UNIX routine iconv_close(), but
293 * may be implemented via libiconv on UNIX flavors that lack
294 * a native implementation. Should be called to clean up
295 * the conversion descriptor from g_iconv_open() when
296 * you are done converting things.
298 * GLib provides g_convert() and g_locale_to_utf8() which are likely
299 * more convenient than the raw iconv wrappers.
301 * Returns: -1 on error, 0 on success
303 gint
304 g_iconv_close (GIConv converter)
306 iconv_t cd = (iconv_t)converter;
308 return iconv_close (cd);
311 static GIConv
312 open_converter (const gchar *to_codeset,
313 const gchar *from_codeset,
314 GError **error)
316 GIConv cd;
318 cd = g_iconv_open (to_codeset, from_codeset);
320 if (cd == (GIConv) -1)
322 /* Something went wrong. */
323 if (error)
325 if (errno == EINVAL)
326 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
327 _("Conversion from character set “%s” to “%s” is not supported"),
328 from_codeset, to_codeset);
329 else
330 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
331 _("Could not open converter from “%s” to “%s”"),
332 from_codeset, to_codeset);
336 return cd;
339 static int
340 close_converter (GIConv cd)
342 if (cd == (GIConv) -1)
343 return 0;
345 return g_iconv_close (cd);
349 * g_convert_with_iconv: (skip)
350 * @str: (array length=len) (element-type guint8):
351 * the string to convert.
352 * @len: the length of the string in bytes, or -1 if the string is
353 * nul-terminated (Note that some encodings may allow nul
354 * bytes to occur inside strings. In that case, using -1
355 * for the @len parameter is unsafe)
356 * @converter: conversion descriptor from g_iconv_open()
357 * @bytes_read: (out) (optional): location to store the number of bytes in
358 * the input string that were successfully converted, or %NULL.
359 * Even if the conversion was successful, this may be
360 * less than @len if there were partial characters
361 * at the end of the input. If the error
362 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
363 * stored will be the byte offset after the last valid
364 * input sequence.
365 * @bytes_written: (out) (optional): the number of bytes stored in
366 * the output buffer (not including the terminating nul).
367 * @error: location to store the error occurring, or %NULL to ignore
368 * errors. Any of the errors in #GConvertError may occur.
370 * Converts a string from one character set to another.
372 * Note that you should use g_iconv() for streaming conversions.
373 * Despite the fact that @bytes_read can return information about partial
374 * characters, the g_convert_... functions are not generally suitable
375 * for streaming. If the underlying converter maintains internal state,
376 * then this won't be preserved across successive calls to g_convert(),
377 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
378 * this is the GNU C converter for CP1255 which does not emit a base
379 * character until it knows that the next character is not a mark that
380 * could combine with the base character.)
382 * Characters which are valid in the input character set, but which have no
383 * representation in the output character set will result in a
384 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv()
385 * specification, which leaves this behaviour implementation defined. Note that
386 * this is the same error code as is returned for an invalid byte sequence in
387 * the input character set. To get defined behaviour for conversion of
388 * unrepresentable characters, use g_convert_with_fallback().
390 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
391 * If the conversion was successful, a newly allocated buffer
392 * containing the converted string, which must be freed with
393 * g_free(). Otherwise %NULL and @error will be set.
395 gchar*
396 g_convert_with_iconv (const gchar *str,
397 gssize len,
398 GIConv converter,
399 gsize *bytes_read,
400 gsize *bytes_written,
401 GError **error)
403 gchar *dest;
404 gchar *outp;
405 const gchar *p;
406 gsize inbytes_remaining;
407 gsize outbytes_remaining;
408 gsize err;
409 gsize outbuf_size;
410 gboolean have_error = FALSE;
411 gboolean done = FALSE;
412 gboolean reset = FALSE;
414 g_return_val_if_fail (converter != (GIConv) -1, NULL);
416 if (len < 0)
417 len = strlen (str);
419 p = str;
420 inbytes_remaining = len;
421 outbuf_size = len + NUL_TERMINATOR_LENGTH;
423 outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
424 outp = dest = g_malloc (outbuf_size);
426 while (!done && !have_error)
428 if (reset)
429 err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);
430 else
431 err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);
433 if (err == (gsize) -1)
435 switch (errno)
437 case EINVAL:
438 /* Incomplete text, do not report an error */
439 done = TRUE;
440 break;
441 case E2BIG:
443 gsize used = outp - dest;
445 outbuf_size *= 2;
446 dest = g_realloc (dest, outbuf_size);
448 outp = dest + used;
449 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
451 break;
452 case EILSEQ:
453 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
454 _("Invalid byte sequence in conversion input"));
455 have_error = TRUE;
456 break;
457 default:
459 int errsv = errno;
461 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
462 _("Error during conversion: %s"),
463 g_strerror (errsv));
465 have_error = TRUE;
466 break;
469 else if (err > 0)
471 /* @err gives the number of replacement characters used. */
472 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
473 _("Unrepresentable character in conversion input"));
474 have_error = TRUE;
476 else
478 if (!reset)
480 /* call g_iconv with NULL inbuf to cleanup shift state */
481 reset = TRUE;
482 inbytes_remaining = 0;
484 else
485 done = TRUE;
489 memset (outp, 0, NUL_TERMINATOR_LENGTH);
491 if (bytes_read)
492 *bytes_read = p - str;
493 else
495 if ((p - str) != len)
497 if (!have_error)
499 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
500 _("Partial character sequence at end of input"));
501 have_error = TRUE;
506 if (bytes_written)
507 *bytes_written = outp - dest; /* Doesn't include '\0' */
509 if (have_error)
511 g_free (dest);
512 return NULL;
514 else
515 return dest;
519 * g_convert:
520 * @str: (array length=len) (element-type guint8):
521 * the string to convert.
522 * @len: the length of the string in bytes, or -1 if the string is
523 * nul-terminated (Note that some encodings may allow nul
524 * bytes to occur inside strings. In that case, using -1
525 * for the @len parameter is unsafe)
526 * @to_codeset: name of character set into which to convert @str
527 * @from_codeset: character set of @str.
528 * @bytes_read: (out) (optional): location to store the number of bytes in
529 * the input string that were successfully converted, or %NULL.
530 * Even if the conversion was successful, this may be
531 * less than @len if there were partial characters
532 * at the end of the input. If the error
533 * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
534 * stored will be the byte offset after the last valid
535 * input sequence.
536 * @bytes_written: (out) (optional): the number of bytes stored in
537 * the output buffer (not including the terminating nul).
538 * @error: location to store the error occurring, or %NULL to ignore
539 * errors. Any of the errors in #GConvertError may occur.
541 * Converts a string from one character set to another.
543 * Note that you should use g_iconv() for streaming conversions.
544 * Despite the fact that @bytes_read can return information about partial
545 * characters, the g_convert_... functions are not generally suitable
546 * for streaming. If the underlying converter maintains internal state,
547 * then this won't be preserved across successive calls to g_convert(),
548 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
549 * this is the GNU C converter for CP1255 which does not emit a base
550 * character until it knows that the next character is not a mark that
551 * could combine with the base character.)
553 * Using extensions such as "//TRANSLIT" may not work (or may not work
554 * well) on many platforms. Consider using g_str_to_ascii() instead.
556 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
557 * If the conversion was successful, a newly allocated buffer
558 * containing the converted string, which must be freed with g_free().
559 * Otherwise %NULL and @error will be set.
561 gchar*
562 g_convert (const gchar *str,
563 gssize len,
564 const gchar *to_codeset,
565 const gchar *from_codeset,
566 gsize *bytes_read,
567 gsize *bytes_written,
568 GError **error)
570 gchar *res;
571 GIConv cd;
573 g_return_val_if_fail (str != NULL, NULL);
574 g_return_val_if_fail (to_codeset != NULL, NULL);
575 g_return_val_if_fail (from_codeset != NULL, NULL);
577 cd = open_converter (to_codeset, from_codeset, error);
579 if (cd == (GIConv) -1)
581 if (bytes_read)
582 *bytes_read = 0;
584 if (bytes_written)
585 *bytes_written = 0;
587 return NULL;
590 res = g_convert_with_iconv (str, len, cd,
591 bytes_read, bytes_written,
592 error);
594 close_converter (cd);
596 return res;
600 * g_convert_with_fallback:
601 * @str: (array length=len) (element-type guint8):
602 * the string to convert.
603 * @len: the length of the string in bytes, or -1 if the string is
604 * nul-terminated (Note that some encodings may allow nul
605 * bytes to occur inside strings. In that case, using -1
606 * for the @len parameter is unsafe)
607 * @to_codeset: name of character set into which to convert @str
608 * @from_codeset: character set of @str.
609 * @fallback: UTF-8 string to use in place of characters not
610 * present in the target encoding. (The string must be
611 * representable in the target encoding).
612 * If %NULL, characters not in the target encoding will
613 * be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
614 * @bytes_read: (out) (optional): location to store the number of bytes in
615 * the input string that were successfully converted, or %NULL.
616 * Even if the conversion was successful, this may be
617 * less than @len if there were partial characters
618 * at the end of the input.
619 * @bytes_written: (out) (optional): the number of bytes stored in
620 * the output buffer (not including the terminating nul).
621 * @error: location to store the error occurring, or %NULL to ignore
622 * errors. Any of the errors in #GConvertError may occur.
624 * Converts a string from one character set to another, possibly
625 * including fallback sequences for characters not representable
626 * in the output. Note that it is not guaranteed that the specification
627 * for the fallback sequences in @fallback will be honored. Some
628 * systems may do an approximate conversion from @from_codeset
629 * to @to_codeset in their iconv() functions,
630 * in which case GLib will simply return that approximate conversion.
632 * Note that you should use g_iconv() for streaming conversions.
633 * Despite the fact that @bytes_read can return information about partial
634 * characters, the g_convert_... functions are not generally suitable
635 * for streaming. If the underlying converter maintains internal state,
636 * then this won't be preserved across successive calls to g_convert(),
637 * g_convert_with_iconv() or g_convert_with_fallback(). (An example of
638 * this is the GNU C converter for CP1255 which does not emit a base
639 * character until it knows that the next character is not a mark that
640 * could combine with the base character.)
642 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
643 * If the conversion was successful, a newly allocated buffer
644 * containing the converted string, which must be freed with g_free().
645 * Otherwise %NULL and @error will be set.
647 gchar*
648 g_convert_with_fallback (const gchar *str,
649 gssize len,
650 const gchar *to_codeset,
651 const gchar *from_codeset,
652 const gchar *fallback,
653 gsize *bytes_read,
654 gsize *bytes_written,
655 GError **error)
657 gchar *utf8;
658 gchar *dest;
659 gchar *outp;
660 const gchar *insert_str = NULL;
661 const gchar *p;
662 gsize inbytes_remaining;
663 const gchar *save_p = NULL;
664 gsize save_inbytes = 0;
665 gsize outbytes_remaining;
666 gsize err;
667 GIConv cd;
668 gsize outbuf_size;
669 gboolean have_error = FALSE;
670 gboolean done = FALSE;
672 GError *local_error = NULL;
674 g_return_val_if_fail (str != NULL, NULL);
675 g_return_val_if_fail (to_codeset != NULL, NULL);
676 g_return_val_if_fail (from_codeset != NULL, NULL);
678 if (len < 0)
679 len = strlen (str);
681 /* Try an exact conversion; we only proceed if this fails
682 * due to an illegal sequence in the input string.
684 dest = g_convert (str, len, to_codeset, from_codeset,
685 bytes_read, bytes_written, &local_error);
686 if (!local_error)
687 return dest;
689 if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
691 g_propagate_error (error, local_error);
692 return NULL;
694 else
695 g_error_free (local_error);
697 local_error = NULL;
699 /* No go; to proceed, we need a converter from "UTF-8" to
700 * to_codeset, and the string as UTF-8.
702 cd = open_converter (to_codeset, "UTF-8", error);
703 if (cd == (GIConv) -1)
705 if (bytes_read)
706 *bytes_read = 0;
708 if (bytes_written)
709 *bytes_written = 0;
711 return NULL;
714 utf8 = g_convert (str, len, "UTF-8", from_codeset,
715 bytes_read, &inbytes_remaining, error);
716 if (!utf8)
718 close_converter (cd);
719 if (bytes_written)
720 *bytes_written = 0;
721 return NULL;
724 /* Now the heart of the code. We loop through the UTF-8 string, and
725 * whenever we hit an offending character, we form fallback, convert
726 * the fallback to the target codeset, and then go back to
727 * converting the original string after finishing with the fallback.
729 * The variables save_p and save_inbytes store the input state
730 * for the original string while we are converting the fallback
732 p = utf8;
734 outbuf_size = len + NUL_TERMINATOR_LENGTH;
735 outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
736 outp = dest = g_malloc (outbuf_size);
738 while (!done && !have_error)
740 gsize inbytes_tmp = inbytes_remaining;
741 err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);
742 inbytes_remaining = inbytes_tmp;
744 if (err == (gsize) -1)
746 switch (errno)
748 case EINVAL:
749 g_assert_not_reached();
750 break;
751 case E2BIG:
753 gsize used = outp - dest;
755 outbuf_size *= 2;
756 dest = g_realloc (dest, outbuf_size);
758 outp = dest + used;
759 outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
761 break;
763 case EILSEQ:
764 if (save_p)
766 /* Error converting fallback string - fatal
768 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
769 _("Cannot convert fallback “%s” to codeset “%s”"),
770 insert_str, to_codeset);
771 have_error = TRUE;
772 break;
774 else if (p)
776 if (!fallback)
778 gunichar ch = g_utf8_get_char (p);
779 insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",
780 ch);
782 else
783 insert_str = fallback;
785 save_p = g_utf8_next_char (p);
786 save_inbytes = inbytes_remaining - (save_p - p);
787 p = insert_str;
788 inbytes_remaining = strlen (p);
789 break;
791 /* fall thru if p is NULL */
792 default:
794 int errsv = errno;
796 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
797 _("Error during conversion: %s"),
798 g_strerror (errsv));
801 have_error = TRUE;
802 break;
805 else
807 if (save_p)
809 if (!fallback)
810 g_free ((gchar *)insert_str);
811 p = save_p;
812 inbytes_remaining = save_inbytes;
813 save_p = NULL;
815 else if (p)
817 /* call g_iconv with NULL inbuf to cleanup shift state */
818 p = NULL;
819 inbytes_remaining = 0;
821 else
822 done = TRUE;
826 /* Cleanup
828 memset (outp, 0, NUL_TERMINATOR_LENGTH);
830 close_converter (cd);
832 if (bytes_written)
833 *bytes_written = outp - dest; /* Doesn't include '\0' */
835 g_free (utf8);
837 if (have_error)
839 if (save_p && !fallback)
840 g_free ((gchar *)insert_str);
841 g_free (dest);
842 return NULL;
844 else
845 return dest;
849 * g_locale_to_utf8
855 * Validate @string as UTF-8. @len can be negative if @string is
856 * nul-terminated, or a non-negative value in bytes. If @string ends in an
857 * incomplete sequence, or contains any illegal sequences or nul codepoints,
858 * %NULL will be returned and the error set to
859 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
860 * On success, @bytes_read and @bytes_written, if provided, will be set to
861 * the number of bytes in @string up to @len or the terminating nul byte.
862 * On error, @bytes_read will be set to the byte offset after the last valid
863 * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
865 static gchar *
866 strdup_len (const gchar *string,
867 gssize len,
868 gsize *bytes_read,
869 gsize *bytes_written,
870 GError **error)
872 gsize real_len;
873 const gchar *end_valid;
875 if (!g_utf8_validate (string, len, &end_valid))
877 if (bytes_read)
878 *bytes_read = end_valid - string;
879 if (bytes_written)
880 *bytes_written = 0;
882 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
883 _("Invalid byte sequence in conversion input"));
884 return NULL;
887 real_len = end_valid - string;
889 if (bytes_read)
890 *bytes_read = real_len;
891 if (bytes_written)
892 *bytes_written = real_len;
894 return g_strndup (string, real_len);
/* Bit flags accepted by convert_checked() selecting which side of the
 * conversion is checked for embedded nul bytes.
 */
typedef enum
{
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 0x1,  /* bit 0 */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 0x2   /* bit 1 */
} ConvertCheckFlags;
904 * Convert from @string in the encoding identified by @from_codeset,
905 * returning a string in the encoding identifed by @to_codeset.
906 * @len can be negative if @string is nul-terminated, or a non-negative
907 * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
908 * to check the input, the output, or both, for embedded nul bytes.
909 * On success, @bytes_read, if provided, will be set to the number of bytes
910 * in @string up to @len or the terminating nul byte, and @bytes_written, if
911 * provided, will be set to the number of output bytes written into the
912 * returned buffer, excluding the terminating nul sequence.
913 * On error, @bytes_read will be set to the byte offset after the last valid
914 * sequence in @string, and @bytes_written will be set to 0.
916 static gchar *
917 convert_checked (const gchar *string,
918 gssize len,
919 const gchar *to_codeset,
920 const gchar *from_codeset,
921 ConvertCheckFlags flags,
922 gsize *bytes_read,
923 gsize *bytes_written,
924 GError **error)
926 gchar *out;
927 gsize outbytes;
929 if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0)
931 const gchar *early_nul = memchr (string, '\0', len);
932 if (early_nul != NULL)
934 if (bytes_read)
935 *bytes_read = early_nul - string;
936 if (bytes_written)
937 *bytes_written = 0;
939 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
940 _("Embedded NUL byte in conversion input"));
941 return NULL;
945 out = g_convert (string, len, to_codeset, from_codeset,
946 bytes_read, &outbytes, error);
947 if (out == NULL)
949 if (bytes_written)
950 *bytes_written = 0;
951 return NULL;
954 if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
955 && memchr (out, '\0', outbytes) != NULL)
957 g_free (out);
958 if (bytes_written)
959 *bytes_written = 0;
960 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_EMBEDDED_NUL,
961 _("Embedded NUL byte in conversion output"));
962 return NULL;
965 if (bytes_written)
966 *bytes_written = outbytes;
967 return out;
971 * g_locale_to_utf8:
972 * @opsysstring: (array length=len) (element-type guint8): a string in the
973 * encoding of the current locale. On Windows
974 * this means the system codepage.
975 * @len: the length of the string, or -1 if the string is
976 * nul-terminated (Note that some encodings may allow nul
977 * bytes to occur inside strings. In that case, using -1
978 * for the @len parameter is unsafe)
979 * @bytes_read: (out) (optional): location to store the number of bytes in the
980 * input string that were successfully converted, or %NULL.
981 * Even if the conversion was successful, this may be
982 * less than @len if there were partial characters
983 * at the end of the input. If the error
984 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
985 * stored will be the byte offset after the last valid
986 * input sequence.
987 * @bytes_written: (out) (optional): the number of bytes stored in the output
988 * buffer (not including the terminating nul).
989 * @error: location to store the error occurring, or %NULL to ignore
990 * errors. Any of the errors in #GConvertError may occur.
992 * Converts a string which is in the encoding used for strings by
993 * the C runtime (usually the same as that used by the operating
994 * system) in the [current locale][setlocale] into a UTF-8 string.
996 * If the source encoding is not UTF-8 and the conversion output contains a
997 * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
998 * function returns %NULL.
999 * If the source encoding is UTF-8, an embedded nul character is treated with
1000 * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
1001 * earlier versions of this library. Use g_convert() to produce output that
1002 * may contain embedded nul characters.
1004 * Returns: (type utf8): The converted string, or %NULL on an error.
1006 gchar *
1007 g_locale_to_utf8 (const gchar *opsysstring,
1008 gssize len,
1009 gsize *bytes_read,
1010 gsize *bytes_written,
1011 GError **error)
1013 const char *charset;
1015 if (g_get_charset (&charset))
1016 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1017 else
1018 return convert_checked (opsysstring, len, "UTF-8", charset,
1019 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1020 bytes_read, bytes_written, error);
1024 * g_locale_from_utf8:
1025 * @utf8string: a UTF-8 encoded string
1026 * @len: the length of the string, or -1 if the string is
1027 * nul-terminated.
1028 * @bytes_read: (out) (optional): location to store the number of bytes in the
1029 * input string that were successfully converted, or %NULL.
1030 * Even if the conversion was successful, this may be
1031 * less than @len if there were partial characters
1032 * at the end of the input. If the error
1033 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1034 * stored will be the byte offset after the last valid
1035 * input sequence.
1036 * @bytes_written: (out) (optional): the number of bytes stored in the output
1037 * buffer (not including the terminating nul).
1038 * @error: location to store the error occurring, or %NULL to ignore
1039 * errors. Any of the errors in #GConvertError may occur.
1041 * Converts a string from UTF-8 to the encoding used for strings by
1042 * the C runtime (usually the same as that used by the operating
1043 * system) in the [current locale][setlocale]. On Windows this means
1044 * the system codepage.
1046 * The input string shall not contain nul characters even if the @len
1047 * argument is positive. A nul character found inside the string will result
1048 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
1049 * input that may contain embedded nul characters.
1051 * Returns: (array length=bytes_written) (element-type guint8) (transfer full):
1052 * A newly-allocated buffer containing the converted string,
1053 * or %NULL on an error, and error will be set.
1055 gchar *
1056 g_locale_from_utf8 (const gchar *utf8string,
1057 gssize len,
1058 gsize *bytes_read,
1059 gsize *bytes_written,
1060 GError **error)
1062 const gchar *charset;
1064 if (g_get_charset (&charset))
1065 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1066 else
1067 return convert_checked (utf8string, len, charset, "UTF-8",
1068 CONVERT_CHECK_NO_NULS_IN_INPUT,
1069 bytes_read, bytes_written, error);
1072 #ifndef G_PLATFORM_WIN32
1074 typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
1076 struct _GFilenameCharsetCache {
1077 gboolean is_utf8;
1078 gchar *charset;
1079 gchar **filename_charsets;
1082 static void
1083 filename_charset_cache_free (gpointer data)
1085 GFilenameCharsetCache *cache = data;
1086 g_free (cache->charset);
1087 g_strfreev (cache->filename_charsets);
1088 g_free (cache);
1092 * g_get_filename_charsets:
1093 * @filename_charsets: (out) (transfer none) (array zero-terminated=1):
1094 * return location for the %NULL-terminated list of encoding names
1096 * Determines the preferred character sets used for filenames.
1097 * The first character set from the @charsets is the filename encoding, the
1098 * subsequent character sets are used when trying to generate a displayable
1099 * representation of a filename, see g_filename_display_name().
1101 * On Unix, the character sets are determined by consulting the
1102 * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.
1103 * On Windows, the character set used in the GLib API is always UTF-8
1104 * and said environment variables have no effect.
1106 * `G_FILENAME_ENCODING` may be set to a comma-separated list of
1107 * character set names. The special token "\@locale" is taken
1108 * to mean the character set for the [current locale][setlocale].
1109 * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,
1110 * the character set of the current locale is taken as the filename
1111 * encoding. If neither environment variable is set, UTF-8 is taken
1112 * as the filename encoding, but the character set of the current locale
1113 * is also put in the list of encodings.
1115 * The returned @charsets belong to GLib and must not be freed.
1117 * Note that on Unix, regardless of the locale character set or
1118 * `G_FILENAME_ENCODING` value, the actual file names present
1119 * on a system might be in any random encoding or just gibberish.
1121 * Returns: %TRUE if the filename encoding is UTF-8.
1123 * Since: 2.6
1125 gboolean
1126 g_get_filename_charsets (const gchar ***filename_charsets)
1128 static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);
1129 GFilenameCharsetCache *cache = g_private_get (&cache_private);
1130 const gchar *charset;
1132 if (!cache)
1134 cache = g_new0 (GFilenameCharsetCache, 1);
1135 g_private_set (&cache_private, cache);
1138 g_get_charset (&charset);
1140 if (!(cache->charset && strcmp (cache->charset, charset) == 0))
1142 const gchar *new_charset;
1143 gchar *p;
1144 gint i;
1146 g_free (cache->charset);
1147 g_strfreev (cache->filename_charsets);
1148 cache->charset = g_strdup (charset);
1150 p = getenv ("G_FILENAME_ENCODING");
1151 if (p != NULL && p[0] != '\0')
1153 cache->filename_charsets = g_strsplit (p, ",", 0);
1154 cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);
1156 for (i = 0; cache->filename_charsets[i]; i++)
1158 if (strcmp ("@locale", cache->filename_charsets[i]) == 0)
1160 g_get_charset (&new_charset);
1161 g_free (cache->filename_charsets[i]);
1162 cache->filename_charsets[i] = g_strdup (new_charset);
1166 else if (getenv ("G_BROKEN_FILENAMES") != NULL)
1168 cache->filename_charsets = g_new0 (gchar *, 2);
1169 cache->is_utf8 = g_get_charset (&new_charset);
1170 cache->filename_charsets[0] = g_strdup (new_charset);
1172 else
1174 cache->filename_charsets = g_new0 (gchar *, 3);
1175 cache->is_utf8 = TRUE;
1176 cache->filename_charsets[0] = g_strdup ("UTF-8");
1177 if (!g_get_charset (&new_charset))
1178 cache->filename_charsets[1] = g_strdup (new_charset);
1182 if (filename_charsets)
1183 *filename_charsets = (const gchar **)cache->filename_charsets;
1185 return cache->is_utf8;
1188 #else /* G_PLATFORM_WIN32 */
1190 gboolean
1191 g_get_filename_charsets (const gchar ***filename_charsets)
1193 static const gchar *charsets[] = {
1194 "UTF-8",
1195 NULL
1198 #ifdef G_OS_WIN32
1199 /* On Windows GLib pretends that the filename charset is UTF-8 */
1200 if (filename_charsets)
1201 *filename_charsets = charsets;
1203 return TRUE;
1204 #else
1205 gboolean result;
1207 /* Cygwin works like before */
1208 result = g_get_charset (&(charsets[0]));
1210 if (filename_charsets)
1211 *filename_charsets = charsets;
1213 return result;
1214 #endif
1217 #endif /* G_PLATFORM_WIN32 */
1219 static gboolean
1220 get_filename_charset (const gchar **filename_charset)
1222 const gchar **charsets;
1223 gboolean is_utf8;
1225 is_utf8 = g_get_filename_charsets (&charsets);
1227 if (filename_charset)
1228 *filename_charset = charsets[0];
1230 return is_utf8;
1234 * g_filename_to_utf8:
1235 * @opsysstring: (type filename): a string in the encoding for filenames
1236 * @len: the length of the string, or -1 if the string is
1237 * nul-terminated (Note that some encodings may allow nul
1238 * bytes to occur inside strings. In that case, using -1
1239 * for the @len parameter is unsafe)
1240 * @bytes_read: (out) (optional): location to store the number of bytes in the
1241 * input string that were successfully converted, or %NULL.
1242 * Even if the conversion was successful, this may be
1243 * less than @len if there were partial characters
1244 * at the end of the input. If the error
1245 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1246 * stored will be the byte offset after the last valid
1247 * input sequence.
1248 * @bytes_written: (out) (optional): the number of bytes stored in the output
1249 * buffer (not including the terminating nul).
1250 * @error: location to store the error occurring, or %NULL to ignore
1251 * errors. Any of the errors in #GConvertError may occur.
1253 * Converts a string which is in the encoding used by GLib for
1254 * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1255 * for filenames; on other platforms, this function indirectly depends on
1256 * the [current locale][setlocale].
1258 * The input string shall not contain nul characters even if the @len
1259 * argument is positive. A nul character found inside the string will result
1260 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
1261 * If the source encoding is not UTF-8 and the conversion output contains a
1262 * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
1263 * function returns %NULL. Use g_convert() to produce output that
1264 * may contain embedded nul characters.
1266 * Returns: (type utf8): The converted string, or %NULL on an error.
1268 gchar*
1269 g_filename_to_utf8 (const gchar *opsysstring,
1270 gssize len,
1271 gsize *bytes_read,
1272 gsize *bytes_written,
1273 GError **error)
1275 const gchar *charset;
1277 g_return_val_if_fail (opsysstring != NULL, NULL);
1279 if (get_filename_charset (&charset))
1280 return strdup_len (opsysstring, len, bytes_read, bytes_written, error);
1281 else
1282 return convert_checked (opsysstring, len, "UTF-8", charset,
1283 CONVERT_CHECK_NO_NULS_IN_INPUT |
1284 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1285 bytes_read, bytes_written, error);
1289 * g_filename_from_utf8:
1290 * @utf8string: (type utf8): a UTF-8 encoded string.
1291 * @len: the length of the string, or -1 if the string is
1292 * nul-terminated.
1293 * @bytes_read: (out) (optional): location to store the number of bytes in
1294 * the input string that were successfully converted, or %NULL.
1295 * Even if the conversion was successful, this may be
1296 * less than @len if there were partial characters
1297 * at the end of the input. If the error
1298 * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1299 * stored will be the byte offset after the last valid
1300 * input sequence.
1301 * @bytes_written: (out) (optional): the number of bytes stored in
1302 * the output buffer (not including the terminating nul).
1303 * @error: location to store the error occurring, or %NULL to ignore
1304 * errors. Any of the errors in #GConvertError may occur.
1306 * Converts a string from UTF-8 to the encoding GLib uses for
1307 * filenames. Note that on Windows GLib uses UTF-8 for filenames;
1308 * on other platforms, this function indirectly depends on the
1309 * [current locale][setlocale].
1311 * The input string shall not contain nul characters even if the @len
1312 * argument is positive. A nul character found inside the string will result
1313 * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is
1314 * not UTF-8 and the conversion output contains a nul character, the error
1315 * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL.
1317 * Returns: (type filename):
1318 * The converted string, or %NULL on an error.
1320 gchar*
1321 g_filename_from_utf8 (const gchar *utf8string,
1322 gssize len,
1323 gsize *bytes_read,
1324 gsize *bytes_written,
1325 GError **error)
1327 const gchar *charset;
1329 if (get_filename_charset (&charset))
1330 return strdup_len (utf8string, len, bytes_read, bytes_written, error);
1331 else
1332 return convert_checked (utf8string, len, charset, "UTF-8",
1333 CONVERT_CHECK_NO_NULS_IN_INPUT |
1334 CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1335 bytes_read, bytes_written, error);
1338 /* Test of haystack has the needle prefix, comparing case
1339 * insensitive. haystack may be UTF-8, but needle must
1340 * contain only ascii. */
1341 static gboolean
1342 has_case_prefix (const gchar *haystack, const gchar *needle)
1344 const gchar *h, *n;
1346 /* Eat one character at a time. */
1347 h = haystack;
1348 n = needle;
1350 while (*n && *h &&
1351 g_ascii_tolower (*n) == g_ascii_tolower (*h))
1353 n++;
1354 h++;
1357 return *n == '\0';
/* Escaping modes for g_escape_uri_string(). Each value is a single bit
 * that is tested against the per-character masks in acceptable[]. */
typedef enum {
  /* Escape all unsafe characters */
  UNSAFE_ALL        = 1 << 0,
  /* Allows '+' */
  UNSAFE_ALLOW_PLUS = 1 << 1,
  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_PATH       = 1 << 3,
  /* Allows '/' and ':' and '@' */
  UNSAFE_HOST       = 1 << 4,
  /* Allows all characters except for '/' and '%' */
  UNSAFE_SLASHES    = 1 << 5
} UnsafeCharacterSet;
1368 static const guchar acceptable[96] = {
1369 /* A table of the ASCII chars from space (32) to DEL (127) */
1370 /* ! " # $ % & ' ( ) * + , - . / */
1371 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
1372 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1373 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
1374 /* @ A B C D E F G H I J K L M N O */
1375 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1376 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1377 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
1378 /* ` a b c d e f g h i j k l m n o */
1379 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
1380 /* p q r s t u v w x y z { | } ~ DEL */
1381 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
1384 static const gchar hex[16] = "0123456789ABCDEF";
1386 /* Note: This escape function works on file: URIs, but if you want to
1387 * escape something else, please read RFC-2396 */
1388 static gchar *
1389 g_escape_uri_string (const gchar *string,
1390 UnsafeCharacterSet mask)
1392 #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1394 const gchar *p;
1395 gchar *q;
1396 gchar *result;
1397 int c;
1398 gint unacceptable;
1399 UnsafeCharacterSet use_mask;
1401 g_return_val_if_fail (mask == UNSAFE_ALL
1402 || mask == UNSAFE_ALLOW_PLUS
1403 || mask == UNSAFE_PATH
1404 || mask == UNSAFE_HOST
1405 || mask == UNSAFE_SLASHES, NULL);
1407 unacceptable = 0;
1408 use_mask = mask;
1409 for (p = string; *p != '\0'; p++)
1411 c = (guchar) *p;
1412 if (!ACCEPTABLE (c))
1413 unacceptable++;
1416 result = g_malloc (p - string + unacceptable * 2 + 1);
1418 use_mask = mask;
1419 for (q = result, p = string; *p != '\0'; p++)
1421 c = (guchar) *p;
1423 if (!ACCEPTABLE (c))
1425 *q++ = '%'; /* means hex coming */
1426 *q++ = hex[c >> 4];
1427 *q++ = hex[c & 15];
1429 else
1430 *q++ = *p;
1433 *q = '\0';
1435 return result;
1439 static gchar *
1440 g_escape_file_uri (const gchar *hostname,
1441 const gchar *pathname)
1443 char *escaped_hostname = NULL;
1444 char *escaped_path;
1445 char *res;
1447 #ifdef G_OS_WIN32
1448 char *p, *backslash;
1450 /* Turn backslashes into forward slashes. That's what Netscape
1451 * does, and they are actually more or less equivalent in Windows.
1454 pathname = g_strdup (pathname);
1455 p = (char *) pathname;
1457 while ((backslash = strchr (p, '\\')) != NULL)
1459 *backslash = '/';
1460 p = backslash + 1;
1462 #endif
1464 if (hostname && *hostname != '\0')
1466 escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);
1469 escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);
1471 res = g_strconcat ("file://",
1472 (escaped_hostname) ? escaped_hostname : "",
1473 (*escaped_path != '/') ? "/" : "",
1474 escaped_path,
1475 NULL);
1477 #ifdef G_OS_WIN32
1478 g_free ((char *) pathname);
1479 #endif
1481 g_free (escaped_hostname);
1482 g_free (escaped_path);
1484 return res;
/* Decodes the two hexadecimal digits at scanner[0] and scanner[1] into a
 * byte value (0..255). Returns -1 if either character is not a hex digit;
 * scanner[1] is only read once scanner[0] has been accepted. */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return high_nibble * 16 + low_nibble;
}
1504 static gchar *
1505 g_unescape_uri_string (const char *escaped,
1506 int len,
1507 const char *illegal_escaped_characters,
1508 gboolean ascii_must_not_be_escaped)
1510 const gchar *in, *in_end;
1511 gchar *out, *result;
1512 int c;
1514 if (escaped == NULL)
1515 return NULL;
1517 if (len < 0)
1518 len = strlen (escaped);
1520 result = g_malloc (len + 1);
1522 out = result;
1523 for (in = escaped, in_end = escaped + len; in < in_end; in++)
1525 c = *in;
1527 if (c == '%')
1529 /* catch partial escape sequences past the end of the substring */
1530 if (in + 3 > in_end)
1531 break;
1533 c = unescape_character (in + 1);
1535 /* catch bad escape sequences and NUL characters */
1536 if (c <= 0)
1537 break;
1539 /* catch escaped ASCII */
1540 if (ascii_must_not_be_escaped && c <= 0x7F)
1541 break;
1543 /* catch other illegal escaped characters */
1544 if (strchr (illegal_escaped_characters, c) != NULL)
1545 break;
1547 in += 2;
1550 *out++ = c;
1553 g_assert (out - result <= len);
1554 *out = '\0';
1556 if (in != in_end)
1558 g_free (result);
1559 return NULL;
1562 return result;
1565 static gboolean
1566 is_asciialphanum (gunichar c)
1568 return c <= 0x7F && g_ascii_isalnum (c);
1571 static gboolean
1572 is_asciialpha (gunichar c)
1574 return c <= 0x7F && g_ascii_isalpha (c);
1577 /* allows an empty string */
1578 static gboolean
1579 hostname_validate (const char *hostname)
1581 const char *p;
1582 gunichar c, first_char, last_char;
1584 p = hostname;
1585 if (*p == '\0')
1586 return TRUE;
1589 /* read in a label */
1590 c = g_utf8_get_char (p);
1591 p = g_utf8_next_char (p);
1592 if (!is_asciialphanum (c))
1593 return FALSE;
1594 first_char = c;
1597 last_char = c;
1598 c = g_utf8_get_char (p);
1599 p = g_utf8_next_char (p);
1601 while (is_asciialphanum (c) || c == '-');
1602 if (last_char == '-')
1603 return FALSE;
1605 /* if that was the last label, check that it was a toplabel */
1606 if (c == '\0' || (c == '.' && *p == '\0'))
1607 return is_asciialpha (first_char);
1609 while (c == '.');
1610 return FALSE;
1614 * g_filename_from_uri:
1615 * @uri: a uri describing a filename (escaped, encoded in ASCII).
1616 * @hostname: (out) (optional) (nullable): Location to store hostname for the URI.
1617 * If there is no hostname in the URI, %NULL will be
1618 * stored in this location.
1619 * @error: location to store the error occurring, or %NULL to ignore
1620 * errors. Any of the errors in #GConvertError may occur.
1622 * Converts an escaped ASCII-encoded URI to a local filename in the
1623 * encoding used for filenames.
1625 * Returns: (type filename): a newly-allocated string holding
1626 * the resulting filename, or %NULL on an error.
1628 gchar *
1629 g_filename_from_uri (const gchar *uri,
1630 gchar **hostname,
1631 GError **error)
1633 const char *path_part;
1634 const char *host_part;
1635 char *unescaped_hostname;
1636 char *result;
1637 char *filename;
1638 int offs;
1639 #ifdef G_OS_WIN32
1640 char *p, *slash;
1641 #endif
1643 if (hostname)
1644 *hostname = NULL;
1646 if (!has_case_prefix (uri, "file:/"))
1648 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1649 _("The URI “%s” is not an absolute URI using the “file” scheme"),
1650 uri);
1651 return NULL;
1654 path_part = uri + strlen ("file:");
1656 if (strchr (path_part, '#') != NULL)
1658 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1659 _("The local file URI “%s” may not include a “#”"),
1660 uri);
1661 return NULL;
1664 if (has_case_prefix (path_part, "///"))
1665 path_part += 2;
1666 else if (has_case_prefix (path_part, "//"))
1668 path_part += 2;
1669 host_part = path_part;
1671 path_part = strchr (path_part, '/');
1673 if (path_part == NULL)
1675 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1676 _("The URI “%s” is invalid"),
1677 uri);
1678 return NULL;
1681 unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);
1683 if (unescaped_hostname == NULL ||
1684 !hostname_validate (unescaped_hostname))
1686 g_free (unescaped_hostname);
1687 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1688 _("The hostname of the URI “%s” is invalid"),
1689 uri);
1690 return NULL;
1693 if (hostname)
1694 *hostname = unescaped_hostname;
1695 else
1696 g_free (unescaped_hostname);
1699 filename = g_unescape_uri_string (path_part, -1, "/", FALSE);
1701 if (filename == NULL)
1703 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,
1704 _("The URI “%s” contains invalidly escaped characters"),
1705 uri);
1706 return NULL;
1709 offs = 0;
1710 #ifdef G_OS_WIN32
1711 /* Drop localhost */
1712 if (hostname && *hostname != NULL &&
1713 g_ascii_strcasecmp (*hostname, "localhost") == 0)
1715 g_free (*hostname);
1716 *hostname = NULL;
1719 /* Turn slashes into backslashes, because that's the canonical spelling */
1720 p = filename;
1721 while ((slash = strchr (p, '/')) != NULL)
1723 *slash = '\\';
1724 p = slash + 1;
1727 /* Windows URIs with a drive letter can be like "file://host/c:/foo"
1728 * or "file://host/c|/foo" (some Netscape versions). In those cases, start
1729 * the filename from the drive letter.
1731 if (g_ascii_isalpha (filename[1]))
1733 if (filename[2] == ':')
1734 offs = 1;
1735 else if (filename[2] == '|')
1737 filename[2] = ':';
1738 offs = 1;
1741 #endif
1743 result = g_strdup (filename + offs);
1744 g_free (filename);
1746 return result;
1750 * g_filename_to_uri:
1751 * @filename: (type filename): an absolute filename specified in the GLib file
1752 * name encoding, which is the on-disk file name bytes on Unix, and UTF-8
1753 * on Windows
1754 * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none.
1755 * @error: location to store the error occurring, or %NULL to ignore
1756 * errors. Any of the errors in #GConvertError may occur.
1758 * Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1759 * component following Section 3.3. of RFC 2396.
1761 * Returns: a newly-allocated string holding the resulting
1762 * URI, or %NULL on an error.
1764 gchar *
1765 g_filename_to_uri (const gchar *filename,
1766 const gchar *hostname,
1767 GError **error)
1769 char *escaped_uri;
1771 g_return_val_if_fail (filename != NULL, NULL);
1773 if (!g_path_is_absolute (filename))
1775 g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1776 _("The pathname “%s” is not an absolute path"),
1777 filename);
1778 return NULL;
1781 if (hostname &&
1782 !(g_utf8_validate (hostname, -1, NULL)
1783 && hostname_validate (hostname)))
1785 g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1786 _("Invalid hostname"));
1787 return NULL;
1790 #ifdef G_OS_WIN32
1791 /* Don't use localhost unnecessarily */
1792 if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)
1793 hostname = NULL;
1794 #endif
1796 escaped_uri = g_escape_file_uri (hostname, filename);
1798 return escaped_uri;
1802 * g_uri_list_extract_uris:
1803 * @uri_list: an URI list
1805 * Splits an URI list conforming to the text/uri-list
1806 * mime type defined in RFC 2483 into individual URIs,
1807 * discarding any comments. The URIs are not validated.
1809 * Returns: (transfer full): a newly allocated %NULL-terminated list
1810 * of strings holding the individual URIs. The array should be freed
1811 * with g_strfreev().
1813 * Since: 2.6
1815 gchar **
1816 g_uri_list_extract_uris (const gchar *uri_list)
1818 GSList *uris, *u;
1819 const gchar *p, *q;
1820 gchar **result;
1821 gint n_uris = 0;
1823 uris = NULL;
1825 p = uri_list;
1827 /* We don't actually try to validate the URI according to RFC
1828 * 2396, or even check for allowed characters - we just ignore
1829 * comments and trim whitespace off the ends. We also
1830 * allow LF delimination as well as the specified CRLF.
1832 * We do allow comments like specified in RFC 2483.
1834 while (p)
1836 if (*p != '#')
1838 while (g_ascii_isspace (*p))
1839 p++;
1841 q = p;
1842 while (*q && (*q != '\n') && (*q != '\r'))
1843 q++;
1845 if (q > p)
1847 q--;
1848 while (q > p && g_ascii_isspace (*q))
1849 q--;
1851 if (q > p)
1853 uris = g_slist_prepend (uris, g_strndup (p, q - p + 1));
1854 n_uris++;
1858 p = strchr (p, '\n');
1859 if (p)
1860 p++;
1863 result = g_new (gchar *, n_uris + 1);
1865 result[n_uris--] = NULL;
1866 for (u = uris; u; u = u->next)
1867 result[n_uris--] = u->data;
1869 g_slist_free (uris);
1871 return result;
1875 * g_filename_display_basename:
1876 * @filename: (type filename): an absolute pathname in the
1877 * GLib file name encoding
1879 * Returns the display basename for the particular filename, guaranteed
1880 * to be valid UTF-8. The display name might not be identical to the filename,
1881 * for instance there might be problems converting it to UTF-8, and some files
1882 * can be translated in the display.
1884 * If GLib cannot make sense of the encoding of @filename, as a last resort it
1885 * replaces unknown characters with U+FFFD, the Unicode replacement character.
1886 * You can search the result for the UTF-8 encoding of this character (which is
1887 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1888 * encoding.
1890 * You must pass the whole absolute pathname to this functions so that
1891 * translation of well known locations can be done.
1893 * This function is preferred over g_filename_display_name() if you know the
1894 * whole path, as it allows translation.
1896 * Returns: a newly allocated string containing
1897 * a rendition of the basename of the filename in valid UTF-8
1899 * Since: 2.6
1901 gchar *
1902 g_filename_display_basename (const gchar *filename)
1904 char *basename;
1905 char *display_name;
1907 g_return_val_if_fail (filename != NULL, NULL);
1909 basename = g_path_get_basename (filename);
1910 display_name = g_filename_display_name (basename);
1911 g_free (basename);
1912 return display_name;
1916 * g_filename_display_name:
1917 * @filename: (type filename): a pathname hopefully in the
1918 * GLib file name encoding
1920 * Converts a filename into a valid UTF-8 string. The conversion is
1921 * not necessarily reversible, so you should keep the original around
1922 * and use the return value of this function only for display purposes.
1923 * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
1924 * even if the filename actually isn't in the GLib file name encoding.
1926 * If GLib cannot make sense of the encoding of @filename, as a last resort it
1927 * replaces unknown characters with U+FFFD, the Unicode replacement character.
1928 * You can search the result for the UTF-8 encoding of this character (which is
1929 * "\357\277\275" in octal notation) to find out if @filename was in an invalid
1930 * encoding.
1932 * If you know the whole pathname of the file you should use
1933 * g_filename_display_basename(), since that allows location-based
1934 * translation of filenames.
1936 * Returns: a newly allocated string containing
1937 * a rendition of the filename in valid UTF-8
1939 * Since: 2.6
1941 gchar *
1942 g_filename_display_name (const gchar *filename)
1944 gint i;
1945 const gchar **charsets;
1946 gchar *display_name = NULL;
1947 gboolean is_utf8;
1949 is_utf8 = g_get_filename_charsets (&charsets);
1951 if (is_utf8)
1953 if (g_utf8_validate (filename, -1, NULL))
1954 display_name = g_strdup (filename);
1957 if (!display_name)
1959 /* Try to convert from the filename charsets to UTF-8.
1960 * Skip the first charset if it is UTF-8.
1962 for (i = is_utf8 ? 1 : 0; charsets[i]; i++)
1964 display_name = g_convert (filename, -1, "UTF-8", charsets[i],
1965 NULL, NULL, NULL);
1967 if (display_name)
1968 break;
1972 /* if all conversions failed, we replace invalid UTF-8
1973 * by a question mark
1975 if (!display_name)
1976 display_name = g_utf8_make_valid (filename, -1);
1978 return display_name;
1981 #ifdef G_OS_WIN32
1983 /* Binary compatibility versions. Not for newly compiled code. */
1985 _GLIB_EXTERN gchar *g_filename_to_utf8_utf8 (const gchar *opsysstring,
1986 gssize len,
1987 gsize *bytes_read,
1988 gsize *bytes_written,
1989 GError **error) G_GNUC_MALLOC;
1990 _GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar *utf8string,
1991 gssize len,
1992 gsize *bytes_read,
1993 gsize *bytes_written,
1994 GError **error) G_GNUC_MALLOC;
1995 _GLIB_EXTERN gchar *g_filename_from_uri_utf8 (const gchar *uri,
1996 gchar **hostname,
1997 GError **error) G_GNUC_MALLOC;
1998 _GLIB_EXTERN gchar *g_filename_to_uri_utf8 (const gchar *filename,
1999 const gchar *hostname,
2000 GError **error) G_GNUC_MALLOC;
2002 gchar *
2003 g_filename_to_utf8_utf8 (const gchar *opsysstring,
2004 gssize len,
2005 gsize *bytes_read,
2006 gsize *bytes_written,
2007 GError **error)
2009 return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);
2012 gchar *
2013 g_filename_from_utf8_utf8 (const gchar *utf8string,
2014 gssize len,
2015 gsize *bytes_read,
2016 gsize *bytes_written,
2017 GError **error)
2019 return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);
2022 gchar *
2023 g_filename_from_uri_utf8 (const gchar *uri,
2024 gchar **hostname,
2025 GError **error)
2027 return g_filename_from_uri (uri, hostname, error);
2030 gchar *
2031 g_filename_to_uri_utf8 (const gchar *filename,
2032 const gchar *hostname,
2033 GError **error)
2035 return g_filename_to_uri (filename, hostname, error);
2038 #endif