From 97d72d15a0dbde3e8c8c51eb139f270874e37fda Mon Sep 17 00:00:00 2001 From: Kalle Olavi Niemitalo Date: Mon, 20 Oct 2008 01:09:45 +0300 Subject: [PATCH] bug 153, 1066: Convert properties of SMJS bookmark to/from UTF-8. SpiderMonkey uses UTF-16 and the strings in struct bookmark are in UTF-8. Previously, the conversions behaved as if the strings had been in ISO-8859-1. SpiderMonkey also supports JS_SetCStringsAreUTF8(), which would make the existing functions convert between UTF-16 and UTF-8, but that effect is global so I dare not enable it yet. Besides, I don't know if that function works in all the SpiderMonkey versions that ELinks claims to work with. --- src/bookmarks/bookmarks.c | 13 +++- src/scripting/smjs/bookmarks.c | 96 ++++++++++++++++-------- src/scripting/smjs/core.c | 161 +++++++++++++++++++++++++++++++++++++++++ src/scripting/smjs/core.h | 5 ++ 4 files changed, 241 insertions(+), 34 deletions(-) diff --git a/src/bookmarks/bookmarks.c b/src/bookmarks/bookmarks.c index 91add9ec..9416a5aa 100644 --- a/src/bookmarks/bookmarks.c +++ b/src/bookmarks/bookmarks.c @@ -525,8 +525,16 @@ update_bookmark(struct bookmark *bm, int codepage, return 1; } -/* Search for a bookmark with the given title. Search in the given folder - * or in the root if folder is NULL. */ +/** Search for a bookmark with the given title. The search does not + * recurse into subfolders. + * + * @param folder + * Search in this folder. NULL means search in the root. + * + * @param title + * Search for this title. Must be in UTF-8 and not NULL. + * + * @return The bookmark, or NULL if not found. */ struct bookmark * get_bookmark_by_name(struct bookmark *folder, unsigned char *title) { @@ -535,7 +543,6 @@ get_bookmark_by_name(struct bookmark *folder, unsigned char *title) lh = folder ? 
&folder->child : &bookmarks; - /** @todo Bug 153: bookmark->title should be UTF-8 */ foreach (bookmark, *lh) if (!strcmp(bookmark->title, title)) return bookmark; diff --git a/src/scripting/smjs/bookmarks.c b/src/scripting/smjs/bookmarks.c index 280a7f27..60e2e70c 100644 --- a/src/scripting/smjs/bookmarks.c +++ b/src/scripting/smjs/bookmarks.c @@ -8,6 +8,7 @@ #include "bookmarks/bookmarks.h" #include "ecmascript/spidermonkey-shared.h" +#include "intl/charsets.h" #include "main/event.h" #include "scripting/smjs/core.h" #include "scripting/smjs/elinks_object.h" @@ -78,6 +79,60 @@ static const JSPropertySpec bookmark_props[] = { static JSObject *smjs_get_bookmark_folder_object(struct bookmark *bookmark); +/** Convert a string retrieved from struct bookmark to a jsval. + * + * @return JS_TRUE if successful. On error, report the error and + * return JS_FALSE. */ +static JSBool +bookmark_string_to_jsval(JSContext *ctx, const unsigned char *str, jsval *vp) +{ + JSString *jsstr = utf8_to_jsstring(ctx, str, -1); + + if (jsstr == NULL) + return JS_FALSE; + *vp = STRING_TO_JSVAL(jsstr); + return JS_TRUE; +} + +/** Convert a jsval to a string and store it in struct bookmark. + * + * @param ctx + * Context for memory allocations and error reports. + * @param val + * The @c jsval that should be converted. + * @param[in,out] result + * A string allocated with mem_alloc(). + * On success, this function frees the original string, if any. + * + * @return JS_TRUE if successful. On error, report the error to + * SpiderMonkey and return JS_FALSE. */ +static JSBool +jsval_to_bookmark_string(JSContext *ctx, jsval val, unsigned char **result) +{ + JSString *jsstr = NULL; + unsigned char *str; + + /* jsstring_to_utf8() might GC; protect the string to come. 
*/ + if (!JS_AddNamedRoot(ctx, &jsstr, "jsval_to_bookmark_string")) + return JS_FALSE; + + jsstr = JS_ValueToString(ctx, val); + if (jsstr == NULL) { + JS_RemoveRoot(ctx, &jsstr); + return JS_FALSE; + } + + str = jsstring_to_utf8(ctx, jsstr, NULL); + if (str == NULL) { + JS_RemoveRoot(ctx, &jsstr); + return JS_FALSE; + } + + JS_RemoveRoot(ctx, &jsstr); + mem_free_set(result, str); + return JS_TRUE; +} + /* @bookmark_class.getProperty */ static JSBool bookmark_get_property(JSContext *ctx, JSObject *obj, jsval id, jsval *vp) @@ -102,17 +157,9 @@ bookmark_get_property(JSContext *ctx, JSObject *obj, jsval id, jsval *vp) switch (JSVAL_TO_INT(id)) { case BOOKMARK_TITLE: - /** @todo Bug 153: bookmark->title should be UTF-8 */ - *vp = STRING_TO_JSVAL(JS_NewStringCopyZ(smjs_ctx, - bookmark->title)); - - return JS_TRUE; + return bookmark_string_to_jsval(ctx, bookmark->title, vp); case BOOKMARK_URL: - /** @todo Bug 1066: bookmark->url should be UTF-8 */ - *vp = STRING_TO_JSVAL(JS_NewStringCopyZ(smjs_ctx, - bookmark->url)); - - return JS_TRUE; + return bookmark_string_to_jsval(ctx, bookmark->url, vp); case BOOKMARK_CHILDREN: *vp = OBJECT_TO_JSVAL(smjs_get_bookmark_folder_object(bookmark)); @@ -149,24 +196,10 @@ bookmark_set_property(JSContext *ctx, JSObject *obj, jsval id, jsval *vp) return JS_FALSE; switch (JSVAL_TO_INT(id)) { - case BOOKMARK_TITLE: { - JSString *jsstr = JS_ValueToString(smjs_ctx, *vp); - unsigned char *str = JS_GetStringBytes(jsstr); - - /** @todo Bug 153: bookmark->title should be UTF-8 */ - mem_free_set(&bookmark->title, stracpy(str)); - - return JS_TRUE; - } - case BOOKMARK_URL: { - JSString *jsstr = JS_ValueToString(smjs_ctx, *vp); - unsigned char *str = JS_GetStringBytes(jsstr); - - /** @todo Bug 1066: bookmark->url should be UTF-8 */ - mem_free_set(&bookmark->url, stracpy(str)); - - return JS_TRUE; - } + case BOOKMARK_TITLE: + return jsval_to_bookmark_string(ctx, *vp, &bookmark->title); + case BOOKMARK_URL: + return jsval_to_bookmark_string(ctx, *vp, 
&bookmark->url); default: /* Unrecognized integer property ID; someone is using * the object as an array. SMJS builtin classes (e.g. @@ -209,7 +242,7 @@ bookmark_folder_get_property(JSContext *ctx, JSObject *obj, jsval id, jsval *vp) { struct bookmark *bookmark; struct bookmark *folder; - unsigned char *title; + unsigned char *title = NULL; /* This can be called if @obj if not itself an instance of the * appropriate class but has one in its prototype chain. Fail @@ -222,14 +255,15 @@ bookmark_folder_get_property(JSContext *ctx, JSObject *obj, jsval id, jsval *vp) *vp = JSVAL_NULL; - title = JS_GetStringBytes(JS_ValueToString(ctx, id)); - if (!title) return JS_TRUE; + if (!jsval_to_bookmark_string(ctx, id, &title)) + return JS_FALSE; bookmark = get_bookmark_by_name(folder, title); if (bookmark) { *vp = OBJECT_TO_JSVAL(smjs_get_bookmark_object(bookmark)); } + mem_free(title); return JS_TRUE; } diff --git a/src/scripting/smjs/core.c b/src/scripting/smjs/core.c index 7fa296a8..57a8eb41 100644 --- a/src/scripting/smjs/core.c +++ b/src/scripting/smjs/core.c @@ -8,6 +8,7 @@ #include "config/home.h" #include "ecmascript/spidermonkey-shared.h" +#include "intl/charsets.h" #include "main/module.h" #include "osdep/osdep.h" #include "scripting/scripting.h" @@ -163,3 +164,163 @@ cleanup_smjs(struct module *module) JS_DestroyContext(smjs_ctx); spidermonkey_runtime_release(); } + +/** Convert a UTF-8 string to a JSString. + * + * @param ctx + * Allocate the string in this JSContext. + * @param[in] str + * The input string that should be converted. + * @param[in] length + * Length of @a str in bytes, or -1 if it is null-terminated. + * + * @return the new string. On error, report the error to SpiderMonkey + * and return NULL. 
*/ +JSString * +utf8_to_jsstring(JSContext *ctx, const unsigned char *str, int length) +{ + size_t in_bytes; + const unsigned char *in_end; + size_t utf16_alloc; + jschar *utf16; + size_t utf16_used; + JSString *jsstr; + + if (length == -1) + in_bytes = strlen(str); + else + in_bytes = length; + + /* Each byte of input can become at most one UTF-16 unit. + * Check whether the multiplication could overflow. */ + assert(!needs_utf16_surrogates(UCS_REPLACEMENT_CHARACTER)); + if (in_bytes > ((size_t) -1) / sizeof(jschar)) { + JS_ReportAllocationOverflow(ctx); + return NULL; + } + utf16_alloc = in_bytes; + /* Don't use fmem_alloc here because long strings could + * exhaust the stack. */ + utf16 = mem_alloc(utf16_alloc * sizeof(jschar)); + if (utf16 == NULL) { + JS_ReportOutOfMemory(ctx); + return NULL; + } + + in_end = str + in_bytes; + + utf16_used = 0; + for (;;) { + unicode_val_T unicode; + + unicode = utf8_to_unicode((unsigned char **) &str, in_end); + if (unicode == UCS_NO_CHAR) + break; + + if (needs_utf16_surrogates(unicode)) { + assert(utf16_alloc - utf16_used >= 2); + if_assert_failed { mem_free(utf16); return NULL; } + utf16[utf16_used++] = get_utf16_high_surrogate(unicode); + utf16[utf16_used++] = get_utf16_low_surrogate(unicode); + } else { + assert(utf16_alloc - utf16_used >= 1); + if_assert_failed { mem_free(utf16); return NULL; } + utf16[utf16_used++] = unicode; + } + } + + jsstr = JS_NewUCString(ctx, utf16, utf16_used); + mem_free(utf16); + return jsstr; +} + +/** Convert a jschar array to UTF-8 and append it to struct string. + * Replace misused surrogate codepoints with UCS_REPLACEMENT_CHARACTER. + * + * @param[in,out] utf8 + * The function appends characters to this UTF-8 string. + * + * @param[in] utf16 + * Pointer to the first element in an array of jschars. + * + * @param[in] len + * Number of jschars in the @a utf16 array. + * + * @return @a utf8 if successful, or NULL if not.
*/ +static struct string * +add_jschars_to_utf8_string(struct string *utf8, + const jschar *utf16, size_t len) +{ + size_t pos; + + for (pos = 0; pos < len; ) { + unicode_val_T unicode = utf16[pos++]; + + if (is_utf16_surrogate(unicode)) { + if (is_utf16_high_surrogate(unicode) + && pos < len + && is_utf16_low_surrogate(utf16[pos])) { + unicode = join_utf16_surrogates(unicode, + utf16[pos++]); + } else { + unicode = UCS_REPLACEMENT_CHARACTER; + } + } + + if (unicode == 0) { + if (!add_char_to_string(utf8, '\0')) + return NULL; + } else { + if (!add_to_string(utf8, encode_utf8(unicode))) + return NULL; + } + } + + return utf8; +} + +/** Convert a JSString to a UTF-8 string. + * + * @param ctx + * For reporting errors. + * @param[in] jsstr + * The input string that should be converted. Must not be NULL. + * @param[out] length + * Optional. The number of bytes in the returned string, + * not counting the terminating null. + * + * @return the new string, which the caller must eventually free + * with mem_free(). On error, report the error to SpiderMonkey + * and return NULL; *@a length is then undefined. */ +unsigned char * +jsstring_to_utf8(JSContext *ctx, JSString *jsstr, int *length) +{ + size_t utf16_len; + const jschar *utf16; + struct string utf8; + + utf16_len = JS_GetStringLength(jsstr); + utf16 = JS_GetStringChars(jsstr); /* stays owned by jsstr */ + if (utf16 == NULL) { + /* JS_GetStringChars doesn't have a JSContext * + * parameter so it can't report the error + * (and can't collect garbage either). 
*/ + JS_ReportOutOfMemory(ctx); + return NULL; + } + + if (!init_string(&utf8)) { + JS_ReportOutOfMemory(ctx); + return NULL; + } + + if (!add_jschars_to_utf8_string(&utf8, utf16, utf16_len)) { + done_string(&utf8); + JS_ReportOutOfMemory(ctx); + return NULL; + } + + if (length) + *length = utf8.length; + return utf8.source; +} diff --git a/src/scripting/smjs/core.h b/src/scripting/smjs/core.h index ffbc82c1..ec5771fc 100644 --- a/src/scripting/smjs/core.h +++ b/src/scripting/smjs/core.h @@ -16,4 +16,9 @@ void alert_smjs_error(unsigned char *msg); void init_smjs(struct module *module); void cleanup_smjs(struct module *module); +JSString *utf8_to_jsstring(JSContext *ctx, const unsigned char *str, + int length); +unsigned char *jsstring_to_utf8(JSContext *ctx, JSString *jsstr, + int *length); + #endif -- 2.11.4.GIT