TortoiseGitMerge: Updated libsvn stuff
[TortoiseGit.git] / src / TortoiseMerge / libsvn_diff / utf.c
blob0b48bdb66acdc9140c0ca1c37dcf58cfa6aa12f4
1 /*
2 * utf.c: UTF-8 conversion routines
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
20 * under the License.
21 * ====================================================================
26 #include <stdlib.h>
27 #include <string.h>
28 #include <assert.h>
30 #include <apr_strings.h>
31 #include <apr_lib.h>
32 #include <apr_xlate.h>
33 #include <apr_atomic.h>
35 #include "svn_hash.h"
36 #include "svn_string.h"
37 #include "svn_error.h"
38 #include "svn_pools.h"
39 #include "svn_ctype.h"
40 #include "svn_utf.h"
41 #include "svn_private_config.h"
42 #include "win32_xlate.h"
44 #include "private/svn_utf_private.h"
45 #include "private/svn_dep_compat.h"
46 #include "private/svn_string_private.h"
47 #include "private/svn_mutex.h"
51 /* Use these static strings to maximize performance on standard conversions.
52 * Any strings on other locations are still valid, however.
54 static const char *SVN_UTF_NTOU_XLATE_HANDLE = "svn-utf-ntou-xlate-handle";
55 static const char *SVN_UTF_UTON_XLATE_HANDLE = "svn-utf-uton-xlate-handle";
57 static const char *SVN_APR_UTF8_CHARSET = "UTF-8";
59 static svn_mutex__t *xlate_handle_mutex = NULL;
60 static svn_boolean_t assume_native_charset_is_utf8 = FALSE;
62 /* The xlate handle cache is a global hash table with linked lists of xlate
63 * handles. In multi-threaded environments, a thread "borrows" an xlate
64 * handle from the cache during a translation and puts it back afterwards.
65 * This avoids holding a global lock for all translations.
66 * If there is no handle for a particular key when needed, a new is
67 * handle is created and put in the cache after use.
68 * This means that there will be at most N handles open for a key, where N
69 * is the number of simultanous handles in use for that key. */
71 typedef struct xlate_handle_node_t {
72 apr_xlate_t *handle;
73 /* FALSE if the handle is not valid, since its pool is being
74 destroyed. */
75 svn_boolean_t valid;
76 /* The name of a char encoding or APR_LOCALE_CHARSET. */
77 const char *frompage, *topage;
78 struct xlate_handle_node_t *next;
79 } xlate_handle_node_t;
81 /* This maps const char * userdata_key strings to xlate_handle_node_t **
82 handles to the first entry in the linked list of xlate handles. We don't
83 store the pointer to the list head directly in the hash table, since we
84 remove/insert entries at the head in the list in the code below, and
85 we can't use apr_hash_set() in each character translation because that
86 function allocates memory in each call where the value is non-NULL.
87 Since these allocations take place in a global pool, this would be a
88 memory leak. */
89 static apr_hash_t *xlate_handle_hash = NULL;
91 /* "1st level cache" to standard conversion maps. We may access these
92 * using atomic xchange ops, i.e. without further thread synchronization.
93 * If the respective item is NULL, fallback to hash lookup.
95 static void * volatile xlat_ntou_static_handle = NULL;
96 static void * volatile xlat_uton_static_handle = NULL;
98 /* Clean up the xlate handle cache. */
99 static apr_status_t
100 xlate_cleanup(void *arg)
102 /* We set the cache variables to NULL so that translation works in other
103 cleanup functions, even if it isn't cached then. */
104 xlate_handle_hash = NULL;
106 /* ensure no stale objects get accessed */
107 xlat_ntou_static_handle = NULL;
108 xlat_uton_static_handle = NULL;
110 return APR_SUCCESS;
113 /* Set the handle of ARG to NULL. */
114 static apr_status_t
115 xlate_handle_node_cleanup(void *arg)
117 xlate_handle_node_t *node = arg;
119 node->valid = FALSE;
120 return APR_SUCCESS;
123 void
124 svn_utf_initialize2(apr_pool_t *pool,
125 svn_boolean_t assume_native_utf8)
127 if (!xlate_handle_hash)
129 /* We create our own subpool, which we protect with the mutex.
130 We can't use the pool passed to us by the caller, since we will
131 use it for xlate handle allocations, possibly in multiple threads,
132 and pool allocation is not thread-safe. */
133 apr_pool_t *subpool = svn_pool_create(pool);
134 svn_mutex__t *mutex;
135 svn_error_t *err = svn_mutex__init(&mutex, TRUE, subpool);
136 if (err)
138 svn_error_clear(err);
139 return;
142 xlate_handle_mutex = mutex;
143 xlate_handle_hash = apr_hash_make(subpool);
145 apr_pool_cleanup_register(subpool, NULL, xlate_cleanup,
146 apr_pool_cleanup_null);
149 if (!assume_native_charset_is_utf8)
150 assume_native_charset_is_utf8 = assume_native_utf8;
153 /* Return a unique string key based on TOPAGE and FROMPAGE. TOPAGE and
154 * FROMPAGE can be any valid arguments of the same name to
155 * apr_xlate_open(). Allocate the returned string in POOL. */
156 static const char*
157 get_xlate_key(const char *topage,
158 const char *frompage,
159 apr_pool_t *pool)
161 /* In the cases of SVN_APR_LOCALE_CHARSET and SVN_APR_DEFAULT_CHARSET
162 * topage/frompage is really an int, not a valid string. So generate a
163 * unique key accordingly. */
164 if (frompage == SVN_APR_LOCALE_CHARSET)
165 frompage = "APR_LOCALE_CHARSET";
166 else if (frompage == SVN_APR_DEFAULT_CHARSET)
167 frompage = "APR_DEFAULT_CHARSET";
169 if (topage == SVN_APR_LOCALE_CHARSET)
170 topage = "APR_LOCALE_CHARSET";
171 else if (topage == SVN_APR_DEFAULT_CHARSET)
172 topage = "APR_DEFAULT_CHARSET";
174 return apr_pstrcat(pool, "svn-utf-", frompage, "to", topage,
175 "-xlate-handle", (char *)NULL);
178 /* Atomically replace the content in *MEM with NEW_VALUE and return
179 * the previous content of *MEM. If atomicy cannot be guaranteed,
180 * *MEM will not be modified and NEW_VALUE is simply returned to
181 * the caller.
183 static APR_INLINE void*
184 atomic_swap(void * volatile * mem, void *new_value)
186 #if APR_HAS_THREADS
187 #if APR_VERSION_AT_LEAST(1,3,0)
188 /* Cast is necessary because of APR bug:
189 https://issues.apache.org/bugzilla/show_bug.cgi?id=50731 */
190 return apr_atomic_xchgptr((volatile void **)mem, new_value);
191 #else
192 /* old APRs don't support atomic swaps. Simply return the
193 * input to the caller for further proccessing. */
194 return new_value;
195 #endif
196 #else
197 /* no threads - no sync. necessary */
198 void *old_value = (void*)*mem;
199 *mem = new_value;
200 return old_value;
201 #endif
204 /* Set *RET to a newly created handle node for converting from FROMPAGE
205 to TOPAGE, If apr_xlate_open() returns APR_EINVAL or APR_ENOTIMPL, set
206 (*RET)->handle to NULL. If fail for any other reason, return the error.
207 Allocate *RET and its xlate handle in POOL. */
208 static svn_error_t *
209 xlate_alloc_handle(xlate_handle_node_t **ret,
210 const char *topage, const char *frompage,
211 apr_pool_t *pool)
213 apr_status_t apr_err;
214 apr_xlate_t *handle;
216 /* The error handling doesn't support the following cases, since we don't
217 use them currently. Catch this here. */
218 SVN_ERR_ASSERT(frompage != SVN_APR_DEFAULT_CHARSET
219 && topage != SVN_APR_DEFAULT_CHARSET
220 && (frompage != SVN_APR_LOCALE_CHARSET
221 || topage != SVN_APR_LOCALE_CHARSET));
223 /* Try to create a handle. */
224 #if defined(WIN32)
225 apr_err = svn_subr__win32_xlate_open((win32_xlate_t **)&handle, topage,
226 frompage, pool);
227 #else
228 apr_err = apr_xlate_open(&handle, topage, frompage, pool);
229 #endif
231 if (APR_STATUS_IS_EINVAL(apr_err) || APR_STATUS_IS_ENOTIMPL(apr_err))
232 handle = NULL;
233 else if (apr_err != APR_SUCCESS)
235 const char *errstr;
236 /* Can't use svn_error_wrap_apr here because it calls functions in
237 this file, leading to infinite recursion. */
238 if (frompage == SVN_APR_LOCALE_CHARSET)
239 errstr = apr_psprintf(pool,
240 _("Can't create a character converter from "
241 "native encoding to '%s'"), topage);
242 else if (topage == SVN_APR_LOCALE_CHARSET)
243 errstr = apr_psprintf(pool,
244 _("Can't create a character converter from "
245 "'%s' to native encoding"), frompage);
246 else
247 errstr = apr_psprintf(pool,
248 _("Can't create a character converter from "
249 "'%s' to '%s'"), frompage, topage);
251 return svn_error_create(apr_err, NULL, errstr);
254 /* Allocate and initialize the node. */
255 *ret = apr_palloc(pool, sizeof(xlate_handle_node_t));
256 (*ret)->handle = handle;
257 (*ret)->valid = TRUE;
258 (*ret)->frompage = ((frompage != SVN_APR_LOCALE_CHARSET)
259 ? apr_pstrdup(pool, frompage) : frompage);
260 (*ret)->topage = ((topage != SVN_APR_LOCALE_CHARSET)
261 ? apr_pstrdup(pool, topage) : topage);
262 (*ret)->next = NULL;
264 /* If we are called from inside a pool cleanup handler, the just created
265 xlate handle will be closed when that handler returns by a newly
266 registered cleanup handler, however, the handle is still cached by us.
267 To prevent this, we register a cleanup handler that will reset the valid
268 flag of our node, so we don't use an invalid handle. */
269 if (handle)
270 apr_pool_cleanup_register(pool, *ret, xlate_handle_node_cleanup,
271 apr_pool_cleanup_null);
273 return SVN_NO_ERROR;
276 /* Extend xlate_alloc_handle by using USERDATA_KEY as a key in our
277 global hash map, if available.
279 Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
280 hasn't been called or USERDATA_KEY is NULL. Else, allocate them
281 in the pool of xlate_handle_hash.
283 Note: this function is not thread-safe. Call get_xlate_handle_node
284 instead. */
285 static svn_error_t *
286 get_xlate_handle_node_internal(xlate_handle_node_t **ret,
287 const char *topage, const char *frompage,
288 const char *userdata_key, apr_pool_t *pool)
290 /* If we already have a handle, just return it. */
291 if (userdata_key && xlate_handle_hash)
293 xlate_handle_node_t *old_node = NULL;
295 /* 2nd level: hash lookup */
296 xlate_handle_node_t **old_node_p = svn_hash_gets(xlate_handle_hash,
297 userdata_key);
298 if (old_node_p)
299 old_node = *old_node_p;
300 if (old_node)
302 /* Ensure that the handle is still valid. */
303 if (old_node->valid)
305 /* Remove from the list. */
306 *old_node_p = old_node->next;
307 old_node->next = NULL;
308 *ret = old_node;
309 return SVN_NO_ERROR;
314 /* Note that we still have the mutex locked (if it is initialized), so we
315 can use the global pool for creating the new xlate handle. */
317 /* Use the correct pool for creating the handle. */
318 pool = apr_hash_pool_get(xlate_handle_hash);
320 return xlate_alloc_handle(ret, topage, frompage, pool);
323 /* Set *RET to a handle node for converting from FROMPAGE to TOPAGE,
324 creating the handle node if it doesn't exist in USERDATA_KEY.
325 If a node is not cached and apr_xlate_open() returns APR_EINVAL or
326 APR_ENOTIMPL, set (*RET)->handle to NULL. If fail for any other
327 reason, return the error.
329 Allocate *RET and its xlate handle in POOL if svn_utf_initialize()
330 hasn't been called or USERDATA_KEY is NULL. Else, allocate them
331 in the pool of xlate_handle_hash. */
332 static svn_error_t *
333 get_xlate_handle_node(xlate_handle_node_t **ret,
334 const char *topage, const char *frompage,
335 const char *userdata_key, apr_pool_t *pool)
337 xlate_handle_node_t *old_node = NULL;
339 /* If we already have a handle, just return it. */
340 if (userdata_key)
342 if (xlate_handle_hash)
344 /* 1st level: global, static items */
345 if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
346 old_node = atomic_swap(&xlat_ntou_static_handle, NULL);
347 else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
348 old_node = atomic_swap(&xlat_uton_static_handle, NULL);
350 if (old_node && old_node->valid)
352 *ret = old_node;
353 return SVN_NO_ERROR;
356 else
358 void *p;
359 /* We fall back on a per-pool cache instead. */
360 apr_pool_userdata_get(&p, userdata_key, pool);
361 old_node = p;
362 /* Ensure that the handle is still valid. */
363 if (old_node && old_node->valid)
365 *ret = old_node;
366 return SVN_NO_ERROR;
369 return xlate_alloc_handle(ret, topage, frompage, pool);
373 SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
374 get_xlate_handle_node_internal(ret,
375 topage,
376 frompage,
377 userdata_key,
378 pool));
380 return SVN_NO_ERROR;
383 /* Put back NODE into the xlate handle cache for use by other calls.
385 Note: this function is not thread-safe. Call put_xlate_handle_node
386 instead. */
387 static svn_error_t *
388 put_xlate_handle_node_internal(xlate_handle_node_t *node,
389 const char *userdata_key)
391 xlate_handle_node_t **node_p = svn_hash_gets(xlate_handle_hash, userdata_key);
392 if (node_p == NULL)
394 userdata_key = apr_pstrdup(apr_hash_pool_get(xlate_handle_hash),
395 userdata_key);
396 node_p = apr_palloc(apr_hash_pool_get(xlate_handle_hash),
397 sizeof(*node_p));
398 *node_p = NULL;
399 svn_hash_sets(xlate_handle_hash, userdata_key, node_p);
401 node->next = *node_p;
402 *node_p = node;
404 return SVN_NO_ERROR;
407 /* Put back NODE into the xlate handle cache for use by other calls.
408 If there is no global cache, store the handle in POOL.
409 Ignore errors related to locking/unlocking the mutex. */
410 static svn_error_t *
411 put_xlate_handle_node(xlate_handle_node_t *node,
412 const char *userdata_key,
413 apr_pool_t *pool)
415 assert(node->next == NULL);
416 if (!userdata_key)
417 return SVN_NO_ERROR;
419 /* push previous global node to the hash */
420 if (xlate_handle_hash)
422 /* 1st level: global, static items */
423 if (userdata_key == SVN_UTF_NTOU_XLATE_HANDLE)
424 node = atomic_swap(&xlat_ntou_static_handle, node);
425 else if (userdata_key == SVN_UTF_UTON_XLATE_HANDLE)
426 node = atomic_swap(&xlat_uton_static_handle, node);
427 if (node == NULL)
428 return SVN_NO_ERROR;
430 SVN_MUTEX__WITH_LOCK(xlate_handle_mutex,
431 put_xlate_handle_node_internal(node,
432 userdata_key));
434 else
436 /* Store it in the per-pool cache. */
437 apr_pool_userdata_set(node, userdata_key, apr_pool_cleanup_null, pool);
440 return SVN_NO_ERROR;
443 /* Return the apr_xlate handle for converting native characters to UTF-8. */
444 static svn_error_t *
445 get_ntou_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
447 return get_xlate_handle_node(ret, SVN_APR_UTF8_CHARSET,
448 assume_native_charset_is_utf8
449 ? SVN_APR_UTF8_CHARSET
450 : SVN_APR_LOCALE_CHARSET,
451 SVN_UTF_NTOU_XLATE_HANDLE, pool);
455 /* Return the apr_xlate handle for converting UTF-8 to native characters.
456 Create one if it doesn't exist. If unable to find a handle, or
457 unable to create one because apr_xlate_open returned APR_EINVAL, then
458 set *RET to null and return SVN_NO_ERROR; if fail for some other
459 reason, return error. */
460 static svn_error_t *
461 get_uton_xlate_handle_node(xlate_handle_node_t **ret, apr_pool_t *pool)
463 return get_xlate_handle_node(ret,
464 assume_native_charset_is_utf8
465 ? SVN_APR_UTF8_CHARSET
466 : SVN_APR_LOCALE_CHARSET,
467 SVN_APR_UTF8_CHARSET,
468 SVN_UTF_UTON_XLATE_HANDLE, pool);
472 /* Copy LEN bytes of SRC, converting non-ASCII and zero bytes to ?\nnn
473 sequences, allocating the result in POOL. */
474 static const char *
475 fuzzy_escape(const char *src, apr_size_t len, apr_pool_t *pool)
477 const char *src_orig = src, *src_end = src + len;
478 apr_size_t new_len = 0;
479 char *new;
480 const char *new_orig;
482 /* First count how big a dest string we'll need. */
483 while (src < src_end)
485 if (! svn_ctype_isascii(*src) || *src == '\0')
486 new_len += 5; /* 5 slots, for "?\XXX" */
487 else
488 new_len += 1; /* one slot for the 7-bit char */
490 src++;
493 /* Allocate that amount, plus one slot for '\0' character. */
494 new = apr_palloc(pool, new_len + 1);
496 new_orig = new;
498 /* And fill it up. */
499 while (src_orig < src_end)
501 if (! svn_ctype_isascii(*src_orig) || src_orig == '\0')
503 /* This is the same format as svn_xml_fuzzy_escape uses, but that
504 function escapes different characters. Please keep in sync!
505 ### If we add another fuzzy escape somewhere, we should abstract
506 ### this out to a common function. */
507 apr_snprintf(new, 6, "?\\%03u", (unsigned char) *src_orig);
508 new += 5;
510 else
512 *new = *src_orig;
513 new += 1;
516 src_orig++;
519 *new = '\0';
521 return new_orig;
524 /* Convert SRC_LENGTH bytes of SRC_DATA in NODE->handle, store the result
525 in *DEST, which is allocated in POOL. */
526 static svn_error_t *
527 convert_to_stringbuf(xlate_handle_node_t *node,
528 const char *src_data,
529 apr_size_t src_length,
530 svn_stringbuf_t **dest,
531 apr_pool_t *pool)
533 #ifdef WIN32
534 apr_status_t apr_err;
536 apr_err = svn_subr__win32_xlate_to_stringbuf((win32_xlate_t *) node->handle,
537 src_data, src_length,
538 dest, pool);
539 #else
540 apr_size_t buflen = src_length * 2;
541 apr_status_t apr_err;
542 apr_size_t srclen = src_length;
543 apr_size_t destlen = buflen;
545 /* Initialize *DEST to an empty stringbuf.
546 A 1:2 ratio of input bytes to output bytes (as assigned above)
547 should be enough for most translations, and if it turns out not
548 to be enough, we'll grow the buffer again, sizing it based on a
549 1:3 ratio of the remainder of the string. */
550 *dest = svn_stringbuf_create_ensure(buflen + 1, pool);
552 /* Not only does it not make sense to convert an empty string, but
553 apr-iconv is quite unreasonable about not allowing that. */
554 if (src_length == 0)
555 return SVN_NO_ERROR;
559 /* Set up state variables for xlate. */
560 destlen = buflen - (*dest)->len;
562 /* Attempt the conversion. */
563 apr_err = apr_xlate_conv_buffer(node->handle,
564 src_data + (src_length - srclen),
565 &srclen,
566 (*dest)->data + (*dest)->len,
567 &destlen);
569 /* Now, update the *DEST->len to track the amount of output data
570 churned out so far from this loop. */
571 (*dest)->len += ((buflen - (*dest)->len) - destlen);
572 buflen += srclen * 3; /* 3 is middle ground, 2 wasn't enough
573 for all characters in the buffer, 4 is
574 maximum character size (currently) */
577 } while (apr_err == APR_SUCCESS && srclen != 0);
578 #endif
580 /* If we exited the loop with an error, return the error. */
581 if (apr_err)
583 const char *errstr;
584 svn_error_t *err;
586 /* Can't use svn_error_wrap_apr here because it calls functions in
587 this file, leading to infinite recursion. */
588 if (node->frompage == SVN_APR_LOCALE_CHARSET)
589 errstr = apr_psprintf
590 (pool, _("Can't convert string from native encoding to '%s':"),
591 node->topage);
592 else if (node->topage == SVN_APR_LOCALE_CHARSET)
593 errstr = apr_psprintf
594 (pool, _("Can't convert string from '%s' to native encoding:"),
595 node->frompage);
596 else
597 errstr = apr_psprintf
598 (pool, _("Can't convert string from '%s' to '%s':"),
599 node->frompage, node->topage);
601 err = svn_error_create(apr_err, NULL, fuzzy_escape(src_data,
602 src_length, pool));
603 return svn_error_create(apr_err, err, errstr);
605 /* Else, exited due to success. Trim the result buffer down to the
606 right length. */
607 (*dest)->data[(*dest)->len] = '\0';
609 return SVN_NO_ERROR;
613 /* Return APR_EINVAL if the first LEN bytes of DATA contain anything
614 other than seven-bit, non-control (except for whitespace) ASCII
615 characters, finding the error pool from POOL. Otherwise, return
616 SVN_NO_ERROR. */
617 static svn_error_t *
618 check_non_ascii(const char *data, apr_size_t len, apr_pool_t *pool)
620 const char *data_start = data;
622 for (; len > 0; --len, data++)
624 if ((! svn_ctype_isascii(*data))
625 || ((! svn_ctype_isspace(*data))
626 && svn_ctype_iscntrl(*data)))
628 /* Show the printable part of the data, followed by the
629 decimal code of the questionable character. Because if a
630 user ever gets this error, she's going to have to spend
631 time tracking down the non-ASCII data, so we want to help
632 as much as possible. And yes, we just call the unsafe
633 data "non-ASCII", even though the actual constraint is
634 somewhat more complex than that. */
636 if (data - data_start)
638 const char *error_data
639 = apr_pstrndup(pool, data_start, (data - data_start));
641 return svn_error_createf
642 (APR_EINVAL, NULL,
643 _("Safe data '%s' was followed by non-ASCII byte %d: "
644 "unable to convert to/from UTF-8"),
645 error_data, *((const unsigned char *) data));
647 else
649 return svn_error_createf
650 (APR_EINVAL, NULL,
651 _("Non-ASCII character (code %d) detected, "
652 "and unable to convert to/from UTF-8"),
653 *((const unsigned char *) data));
658 return SVN_NO_ERROR;
661 /* Construct an error with code APR_EINVAL and with a suitable message
662 * to describe the invalid UTF-8 sequence DATA of length LEN (which
663 * may have embedded NULLs). We can't simply print the data, almost
664 * by definition we don't really know how it is encoded.
666 static svn_error_t *
667 invalid_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
669 const char *last = svn_utf__last_valid(data, len);
670 const char *valid_txt = "", *invalid_txt = "";
671 apr_size_t i;
672 size_t valid, invalid;
674 /* We will display at most 24 valid octets (this may split a leading
675 multi-byte character) as that should fit on one 80 character line. */
676 valid = last - data;
677 if (valid > 24)
678 valid = 24;
679 for (i = 0; i < valid; ++i)
680 valid_txt = apr_pstrcat(pool, valid_txt,
681 apr_psprintf(pool, " %02x",
682 (unsigned char)last[i-valid]),
683 (char *)NULL);
685 /* 4 invalid octets will guarantee that the faulty octet is displayed */
686 invalid = data + len - last;
687 if (invalid > 4)
688 invalid = 4;
689 for (i = 0; i < invalid; ++i)
690 invalid_txt = apr_pstrcat(pool, invalid_txt,
691 apr_psprintf(pool, " %02x",
692 (unsigned char)last[i]),
693 (char *)NULL);
695 return svn_error_createf(APR_EINVAL, NULL,
696 _("Valid UTF-8 data\n(hex:%s)\n"
697 "followed by invalid UTF-8 sequence\n(hex:%s)"),
698 valid_txt, invalid_txt);
701 /* Verify that the sequence DATA of length LEN is valid UTF-8.
702 If it is not, return an error with code APR_EINVAL. */
703 static svn_error_t *
704 check_utf8(const char *data, apr_size_t len, apr_pool_t *pool)
706 if (! svn_utf__is_valid(data, len))
707 return invalid_utf8(data, len, pool);
708 return SVN_NO_ERROR;
711 /* Verify that the NULL terminated sequence DATA is valid UTF-8.
712 If it is not, return an error with code APR_EINVAL. */
713 static svn_error_t *
714 check_cstring_utf8(const char *data, apr_pool_t *pool)
717 if (! svn_utf__cstring_is_valid(data))
718 return invalid_utf8(data, strlen(data), pool);
719 return SVN_NO_ERROR;
723 svn_error_t *
724 svn_utf_stringbuf_to_utf8(svn_stringbuf_t **dest,
725 const svn_stringbuf_t *src,
726 apr_pool_t *pool)
728 xlate_handle_node_t *node;
729 svn_error_t *err;
731 SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
733 if (node->handle)
735 err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
736 if (! err)
737 err = check_utf8((*dest)->data, (*dest)->len, pool);
739 else
741 err = check_non_ascii(src->data, src->len, pool);
742 if (! err)
743 *dest = svn_stringbuf_dup(src, pool);
746 return svn_error_compose_create(err,
747 put_xlate_handle_node
748 (node,
749 SVN_UTF_NTOU_XLATE_HANDLE,
750 pool));
754 svn_error_t *
755 svn_utf_string_to_utf8(const svn_string_t **dest,
756 const svn_string_t *src,
757 apr_pool_t *pool)
759 svn_stringbuf_t *destbuf;
760 xlate_handle_node_t *node;
761 svn_error_t *err;
763 SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
765 if (node->handle)
767 err = convert_to_stringbuf(node, src->data, src->len, &destbuf, pool);
768 if (! err)
769 err = check_utf8(destbuf->data, destbuf->len, pool);
770 if (! err)
771 *dest = svn_stringbuf__morph_into_string(destbuf);
773 else
775 err = check_non_ascii(src->data, src->len, pool);
776 if (! err)
777 *dest = svn_string_dup(src, pool);
780 return svn_error_compose_create(err,
781 put_xlate_handle_node
782 (node,
783 SVN_UTF_NTOU_XLATE_HANDLE,
784 pool));
788 /* Common implementation for svn_utf_cstring_to_utf8,
789 svn_utf_cstring_to_utf8_ex, svn_utf_cstring_from_utf8 and
790 svn_utf_cstring_from_utf8_ex. Convert SRC to DEST using NODE->handle as
791 the translator and allocating from POOL. */
792 static svn_error_t *
793 convert_cstring(const char **dest,
794 const char *src,
795 xlate_handle_node_t *node,
796 apr_pool_t *pool)
798 if (node->handle)
800 svn_stringbuf_t *destbuf;
801 SVN_ERR(convert_to_stringbuf(node, src, strlen(src),
802 &destbuf, pool));
803 *dest = destbuf->data;
805 else
807 apr_size_t len = strlen(src);
808 SVN_ERR(check_non_ascii(src, len, pool));
809 *dest = apr_pstrmemdup(pool, src, len);
811 return SVN_NO_ERROR;
815 svn_error_t *
816 svn_utf_cstring_to_utf8(const char **dest,
817 const char *src,
818 apr_pool_t *pool)
820 xlate_handle_node_t *node;
821 svn_error_t *err;
823 SVN_ERR(get_ntou_xlate_handle_node(&node, pool));
824 err = convert_cstring(dest, src, node, pool);
825 SVN_ERR(svn_error_compose_create(err,
826 put_xlate_handle_node
827 (node,
828 SVN_UTF_NTOU_XLATE_HANDLE,
829 pool)));
830 return check_cstring_utf8(*dest, pool);
834 svn_error_t *
835 svn_utf_cstring_to_utf8_ex2(const char **dest,
836 const char *src,
837 const char *frompage,
838 apr_pool_t *pool)
840 xlate_handle_node_t *node;
841 svn_error_t *err;
842 const char *convset_key = get_xlate_key(SVN_APR_UTF8_CHARSET, frompage,
843 pool);
845 SVN_ERR(get_xlate_handle_node(&node, SVN_APR_UTF8_CHARSET, frompage,
846 convset_key, pool));
847 err = convert_cstring(dest, src, node, pool);
848 SVN_ERR(svn_error_compose_create(err,
849 put_xlate_handle_node
850 (node,
851 SVN_UTF_NTOU_XLATE_HANDLE,
852 pool)));
854 return check_cstring_utf8(*dest, pool);
858 svn_error_t *
859 svn_utf_cstring_to_utf8_ex(const char **dest,
860 const char *src,
861 const char *frompage,
862 const char *convset_key,
863 apr_pool_t *pool)
865 return svn_utf_cstring_to_utf8_ex2(dest, src, frompage, pool);
869 svn_error_t *
870 svn_utf_stringbuf_from_utf8(svn_stringbuf_t **dest,
871 const svn_stringbuf_t *src,
872 apr_pool_t *pool)
874 xlate_handle_node_t *node;
875 svn_error_t *err;
877 SVN_ERR(get_uton_xlate_handle_node(&node, pool));
879 if (node->handle)
881 err = check_utf8(src->data, src->len, pool);
882 if (! err)
883 err = convert_to_stringbuf(node, src->data, src->len, dest, pool);
885 else
887 err = check_non_ascii(src->data, src->len, pool);
888 if (! err)
889 *dest = svn_stringbuf_dup(src, pool);
892 err = svn_error_compose_create(
893 err,
894 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
896 return err;
900 svn_error_t *
901 svn_utf_string_from_utf8(const svn_string_t **dest,
902 const svn_string_t *src,
903 apr_pool_t *pool)
905 svn_stringbuf_t *dbuf;
906 xlate_handle_node_t *node;
907 svn_error_t *err;
909 SVN_ERR(get_uton_xlate_handle_node(&node, pool));
911 if (node->handle)
913 err = check_utf8(src->data, src->len, pool);
914 if (! err)
915 err = convert_to_stringbuf(node, src->data, src->len,
916 &dbuf, pool);
917 if (! err)
918 *dest = svn_stringbuf__morph_into_string(dbuf);
920 else
922 err = check_non_ascii(src->data, src->len, pool);
923 if (! err)
924 *dest = svn_string_dup(src, pool);
927 err = svn_error_compose_create(
928 err,
929 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
931 return err;
935 svn_error_t *
936 svn_utf_cstring_from_utf8(const char **dest,
937 const char *src,
938 apr_pool_t *pool)
940 xlate_handle_node_t *node;
941 svn_error_t *err;
943 SVN_ERR(check_cstring_utf8(src, pool));
945 SVN_ERR(get_uton_xlate_handle_node(&node, pool));
946 err = convert_cstring(dest, src, node, pool);
947 err = svn_error_compose_create(
948 err,
949 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
951 return err;
955 svn_error_t *
956 svn_utf_cstring_from_utf8_ex2(const char **dest,
957 const char *src,
958 const char *topage,
959 apr_pool_t *pool)
961 xlate_handle_node_t *node;
962 svn_error_t *err;
963 const char *convset_key = get_xlate_key(topage, SVN_APR_UTF8_CHARSET,
964 pool);
966 SVN_ERR(check_cstring_utf8(src, pool));
968 SVN_ERR(get_xlate_handle_node(&node, topage, SVN_APR_UTF8_CHARSET,
969 convset_key, pool));
970 err = convert_cstring(dest, src, node, pool);
971 err = svn_error_compose_create(
972 err,
973 put_xlate_handle_node(node, convset_key, pool));
975 return err;
979 svn_error_t *
980 svn_utf_cstring_from_utf8_ex(const char **dest,
981 const char *src,
982 const char *topage,
983 const char *convset_key,
984 apr_pool_t *pool)
986 return svn_utf_cstring_from_utf8_ex2(dest, src, topage, pool);
990 const char *
991 svn_utf__cstring_from_utf8_fuzzy(const char *src,
992 apr_pool_t *pool,
993 svn_error_t *(*convert_from_utf8)
994 (const char **, const char *, apr_pool_t *))
996 const char *escaped, *converted;
997 svn_error_t *err;
999 escaped = fuzzy_escape(src, strlen(src), pool);
1001 /* Okay, now we have a *new* UTF-8 string, one that's guaranteed to
1002 contain only 7-bit bytes :-). Recode to native... */
1003 err = convert_from_utf8(((const char **) &converted), escaped, pool);
1005 if (err)
1007 svn_error_clear(err);
1008 return escaped;
1010 else
1011 return converted;
1013 /* ### Check the client locale, maybe we can avoid that second
1014 * conversion! See Ulrich Drepper's patch at
1015 * http://subversion.tigris.org/issues/show_bug.cgi?id=807.
1020 const char *
1021 svn_utf_cstring_from_utf8_fuzzy(const char *src,
1022 apr_pool_t *pool)
1024 return svn_utf__cstring_from_utf8_fuzzy(src, pool,
1025 svn_utf_cstring_from_utf8);
1029 svn_error_t *
1030 svn_utf_cstring_from_utf8_stringbuf(const char **dest,
1031 const svn_stringbuf_t *src,
1032 apr_pool_t *pool)
1034 svn_stringbuf_t *destbuf;
1036 SVN_ERR(svn_utf_stringbuf_from_utf8(&destbuf, src, pool));
1037 *dest = destbuf->data;
1039 return SVN_NO_ERROR;
1043 svn_error_t *
1044 svn_utf_cstring_from_utf8_string(const char **dest,
1045 const svn_string_t *src,
1046 apr_pool_t *pool)
1048 svn_stringbuf_t *dbuf;
1049 xlate_handle_node_t *node;
1050 svn_error_t *err;
1052 SVN_ERR(get_uton_xlate_handle_node(&node, pool));
1054 if (node->handle)
1056 err = check_utf8(src->data, src->len, pool);
1057 if (! err)
1058 err = convert_to_stringbuf(node, src->data, src->len,
1059 &dbuf, pool);
1060 if (! err)
1061 *dest = dbuf->data;
1063 else
1065 err = check_non_ascii(src->data, src->len, pool);
1066 if (! err)
1067 *dest = apr_pstrmemdup(pool, src->data, src->len);
1070 err = svn_error_compose_create(
1071 err,
1072 put_xlate_handle_node(node, SVN_UTF_UTON_XLATE_HANDLE, pool));
1074 return err;