[gjs/wip/ptomato/mozjs38: 12/20] js: Accommodate both Latin-1 and two-byte strings



commit e4d4432167882b91d1ccb1e830cee02dc0043ffb
Author: Philip Chimento <philip chimento gmail com>
Date:   Tue Jan 17 23:36:07 2017 -0800

    js: Accommodate both Latin-1 and two-byte strings
    
    Starting with SpiderMonkey 38, strings can be stored internally in the
    JS engine as either Latin-1 or two bytes per character (sort-of-UTF-16).
    When operating on the characters directly, we must check
    JS_StringHasLatin1Chars() and use the appropriate accessors depending on
    how they are stored.
    
    Additionally, since the string's characters are still owned by the
    string and may get moved around during garbage collection, we cannot use
    any JSAPI functions that might trigger a GC while maintaining a pointer
    to the string's characters. To enforce this, we must construct a
    JS::AutoCheckCannotGC object, which will cause a crash if a GC starts
    while it is in scope.
    
    https://bugzilla.gnome.org/show_bug.cgi?id=777962

 gjs/byteArray.cpp                   |   48 +++++++++++++++-------
 gjs/jsapi-util-string.cpp           |   71 ++++++++++++++++++++++++++------
 installed-tests/js/testByteArray.js |   18 ++++++++
 test/gjs-tests.cpp                  |   75 +++++++++++++++++++++++++++++++----
 4 files changed, 175 insertions(+), 37 deletions(-)
---
diff --git a/gjs/byteArray.cpp b/gjs/byteArray.cpp
index 2809abd..d385ad6 100644
--- a/gjs/byteArray.cpp
+++ b/gjs/byteArray.cpp
@@ -603,24 +603,42 @@ from_string_func(JSContext *context,
         g_byte_array_append(priv->array, (guint8*) utf8, strlen(utf8));
         g_free(utf8);
     } else {
-        char *encoded;
+        JSString *str = argv[0].toString();  /* Rooted by argv */
+        GError *error = NULL;
+        char *encoded = NULL;
         gsize bytes_written;
-        GError *error;
-        const char16_t *u16_chars;
-        gsize u16_len;
 
-        u16_chars = JS_GetStringCharsAndLength(context, argv[0].toString(), &u16_len);
-        if (u16_chars == NULL)
-            return false;
+        /* Scope for AutoCheckCannotGC, will crash if a GC is triggered
+         * while we are using the string's chars */
+        {
+            JS::AutoCheckCannotGC nogc;
+            size_t len;
+
+            if (JS_StringHasLatin1Chars(str)) {
+                const JS::Latin1Char *chars =
+                    JS_GetLatin1StringCharsAndLength(context, nogc, str, &len);
+                if (chars == NULL)
+                    return false;
+
+                encoded = g_convert((char *) chars, len,
+                                    encoding,  /* to_encoding */
+                                    "LATIN1",  /* from_encoding */
+                                    NULL,  /* bytes read */
+                                    &bytes_written, &error);
+            } else {
+                const char16_t *chars =
+                    JS_GetTwoByteStringCharsAndLength(context, nogc, str, &len);
+                if (chars == NULL)
+                    return false;
+
+                encoded = g_convert((char *) chars, len * 2,
+                                    encoding,  /* to_encoding */
+                                    "UTF-16",  /* from_encoding */
+                                    NULL,  /* bytes read */
+                                    &bytes_written, &error);
+            }
+        }
 
-        error = NULL;
-        encoded = g_convert((char*) u16_chars,
-                            u16_len * 2,
-                            encoding, /* to_encoding */
-                            "UTF-16", /* from_encoding */
-                            NULL, /* bytes read */
-                            &bytes_written,
-                            &error);
         g_free(encoding);
         if (encoded == NULL) {
             /* frees the GError */
diff --git a/gjs/jsapi-util-string.cpp b/gjs/jsapi-util-string.cpp
index 6143b47..3157ab9 100644
--- a/gjs/jsapi-util-string.cpp
+++ b/gjs/jsapi-util-string.cpp
@@ -23,6 +23,7 @@
 
 #include <config.h>
 
+#include <algorithm>
 #include <string.h>
 
 #include "jsapi-util.h"
@@ -161,6 +162,35 @@ gjs_string_from_filename(JSContext             *context,
     return true;
 }
 
+/* Converts a JSString's array of Latin-1 chars to an array of a wider integer
+ * type, by what the compiler believes is the most efficient method possible */
+template<typename T>
+static bool
+from_latin1(JSContext *cx,
+            JSString  *str,
+            T        **data_p,
+            size_t    *len_p)
+{
+    /* No garbage collection should be triggered while we are using the string's
+     * chars. Crash if that happens. */
+    JS::AutoCheckCannotGC nogc;
+
+    const JS::Latin1Char *js_data =
+        JS_GetLatin1StringCharsAndLength(cx, nogc, str, len_p);
+    if (js_data == NULL)
+        return false;
+
+    /* Unicode codepoints 0x00-0xFF are the same as Latin-1
+     * codepoints, so we can preserve the string length and simply
+     * copy the codepoints to an array of different-sized ints */
+
+    *data_p = g_new(T, *len_p);
+
+    /* This will probably use a loop, unfortunately */
+    std::copy(js_data, js_data + *len_p, *data_p);
+    return true;
+}
+
 /**
  * gjs_string_get_char16_data:
  * @context: js context
@@ -180,27 +210,31 @@ gjs_string_get_char16_data(JSContext       *context,
                            char16_t       **data_p,
                            size_t          *len_p)
 {
-    const char16_t *js_data;
-    bool retval = false;
-
-    JS_BeginRequest(context);
+    JSAutoRequest ar(context);
 
     if (!value.isString()) {
         gjs_throw(context,
                   "Value is not a string, can't return binary data from it");
-        goto out;
+        return false;
     }
 
-    js_data = JS_GetStringCharsAndLength(context, value.toString(), len_p);
+    if (JS_StringHasLatin1Chars(value.toString()))
+        return from_latin1(context, value.toString(), data_p, len_p);
+
+    /* From this point on, crash if a GC is triggered while we are using
+     * the string's chars */
+    JS::AutoCheckCannotGC nogc;
+
+    const char16_t *js_data =
+        JS_GetTwoByteStringCharsAndLength(context, nogc,
+                                          value.toString(), len_p);
+
     if (js_data == NULL)
-        goto out;
+        return false;
 
     *data_p = (char16_t *) g_memdup(js_data, sizeof(*js_data) * (*len_p));
 
-    retval = true;
-out:
-    JS_EndRequest(context);
-    return retval;
+    return true;
 }
 
 /**
@@ -228,10 +262,19 @@ gjs_string_to_ucs4(JSContext      *cx,
 
     JSAutoRequest ar(cx);
     JS::RootedString str(cx, value.toString());
-    size_t utf16_len;
+    size_t len;
     GError *error = NULL;
 
-    const char16_t *utf16 = JS_GetStringCharsAndLength(cx, str, &utf16_len);
+    if (JS_StringHasLatin1Chars(str))
+        return from_latin1(cx, value.toString(), ucs4_string_p, len_p);
+
+    /* From this point on, crash if a GC is triggered while we are using
+     * the string's chars */
+    JS::AutoCheckCannotGC nogc;
+
+    const char16_t *utf16 =
+        JS_GetTwoByteStringCharsAndLength(cx, nogc, str, &len);
+
     if (utf16 == NULL) {
         gjs_throw(cx, "Failed to get UTF-16 string data");
         return false;
@@ -240,7 +283,7 @@ gjs_string_to_ucs4(JSContext      *cx,
     if (ucs4_string_p != NULL) {
         long length;
         *ucs4_string_p = g_utf16_to_ucs4(reinterpret_cast<const gunichar2 *>(utf16),
-                                         utf16_len, NULL, &length, &error);
+                                         len, NULL, &length, &error);
         if (*ucs4_string_p == NULL) {
             gjs_throw(cx, "Failed to convert UTF-16 string to UCS-4: %s",
                       error->message);
diff --git a/installed-tests/js/testByteArray.js b/installed-tests/js/testByteArray.js
index 97f93e3..57ec948 100644
--- a/installed-tests/js/testByteArray.js
+++ b/installed-tests/js/testByteArray.js
@@ -94,6 +94,24 @@ describe('Byte array', function () {
         [97, 98, 99, 100].forEach((val, ix) => expect(a[ix]).toEqual(val));
     });
 
+    it('can be encoded from a string', function () {
+        // Pick a string likely to be stored internally as Latin1
+        let a = ByteArray.fromString('äbcd', 'LATIN1');
+        expect(a.length).toEqual(4);
+        [228, 98, 99, 100].forEach((val, ix) => expect(a[ix]).toEqual(val));
+
+        // Try again with a string not likely to be Latin1 internally
+        a = ByteArray.fromString('⅜', 'UTF-8');
+        expect(a.length).toEqual(3);
+        [0xe2, 0x85, 0x9c].forEach((val, ix) => expect(a[ix]).toEqual(val));
+    });
+
+    it('encodes as UTF-8 by default', function () {
+        let a = ByteArray.fromString('⅜');
+        expect(a.length).toEqual(3);
+        [0xe2, 0x85, 0x9c].forEach((val, ix) => expect(a[ix]).toEqual(val));
+    });
+
     it('can be created from an array', function () {
         let a = ByteArray.fromArray([ 1, 2, 3, 4 ]);
         expect(a.length).toEqual(4);
diff --git a/test/gjs-tests.cpp b/test/gjs-tests.cpp
index a73daf7..7131fb7 100644
--- a/test/gjs-tests.cpp
+++ b/test/gjs-tests.cpp
@@ -22,6 +22,9 @@
  */
 
 #include <config.h>
+
+#include <string>
+
 #include <glib.h>
 #include <glib-object.h>
 #include <util/glib.h>
@@ -180,11 +183,63 @@ gjstest_test_func_gjs_jsapi_util_error_throw(GjsUnitTestFixture *fx,
 }
 
 static void
-gjstest_test_func_gjs_jsapi_util_debug_string_valid_utf8(GjsUnitTestFixture *fx,
-                                                         gconstpointer       unused)
+test_jsapi_util_string_char16_data(GjsUnitTestFixture *fx,
+                                   gconstpointer       unused)
+{
+    char16_t *chars;
+    size_t len;
+    JS::RootedValue v_string(fx->cx);
+
+    g_assert_true(gjs_string_from_utf8(fx->cx, VALID_UTF8_STRING, -1,
+                                       &v_string));
+    g_assert_true(gjs_string_get_char16_data(fx->cx, v_string, &chars,
+                                             &len));
+    std::u16string result(chars, len);
+    g_assert_true(result == u"\xc9\xd6 foobar \u30df");
+    g_free(chars);
+
+    /* Try with a string that is likely to be stored as Latin-1 */
+    v_string.setString(JS_NewStringCopyZ(fx->cx, "abcd"));
+    g_assert_true(gjs_string_get_char16_data(fx->cx, v_string, &chars,
+                                             &len));
+
+    result.assign(chars, len);
+    g_assert_true(result == u"abcd");
+    g_free(chars);
+}
+
+static void
+test_jsapi_util_string_to_ucs4(GjsUnitTestFixture *fx,
+                               gconstpointer       unused)
+{
+    gunichar *chars;
+    size_t len;
+    JS::RootedValue v_string(fx->cx);
+
+    g_assert_true(gjs_string_from_utf8(fx->cx, VALID_UTF8_STRING, -1,
+                                       &v_string));
+    g_assert_true(gjs_string_to_ucs4(fx->cx, v_string, &chars, &len));
+
+    std::u32string result(chars, chars + len);
+    g_assert_true(result == U"\xc9\xd6 foobar \u30df");
+    g_free(chars);
+
+    /* Try with a string that is likely to be stored as Latin-1 */
+    v_string.setString(JS_NewStringCopyZ(fx->cx, "abcd"));
+    g_assert_true(gjs_string_to_ucs4(fx->cx, v_string, &chars, &len));
+
+    result.assign(chars, chars + len);
+    g_assert_true(result == U"abcd");
+    g_free(chars);
+}
+
+static void
+test_jsapi_util_debug_string_valid_utf8(GjsUnitTestFixture *fx,
+                                        gconstpointer       unused)
 {
     JS::RootedValue v_string(fx->cx);
-    g_assert_true(gjs_string_from_utf8(fx->cx, VALID_UTF8_STRING, -1, &v_string));
+    g_assert_true(gjs_string_from_utf8(fx->cx, VALID_UTF8_STRING, -1,
+                                       &v_string));
 
     char *debug_output = gjs_value_debug_string(fx->cx, v_string);
 
@@ -195,8 +250,8 @@ gjstest_test_func_gjs_jsapi_util_debug_string_valid_utf8(GjsUnitTestFixture *fx,
 }
 
 static void
-gjstest_test_func_gjs_jsapi_util_debug_string_invalid_utf8(GjsUnitTestFixture *fx,
-                                                           gconstpointer       unused)
+test_jsapi_util_debug_string_invalid_utf8(GjsUnitTestFixture *fx,
+                                          gconstpointer       unused)
 {
     g_test_skip("SpiderMonkey doesn't validate UTF-8 after encoding it");
 
@@ -207,7 +262,7 @@ gjstest_test_func_gjs_jsapi_util_debug_string_invalid_utf8(GjsUnitTestFixture *f
     char *debug_output = gjs_value_debug_string(fx->cx, v_string);
 
     g_assert_nonnull(debug_output);
-    /* g_assert_cmpstr("\"\\xf5\\xf6\"", ==, debug_output); */
+    /* g_assert_cmpstr("\"\\xff\\xff\\xff\\xff\"", ==, debug_output); */
 
     g_free(debug_output);
 }
@@ -331,10 +386,14 @@ main(int    argc,
                         gjstest_test_func_gjs_jsapi_util_error_throw);
     ADD_JSAPI_UTIL_TEST("string/js/string/utf8",
                         gjstest_test_func_gjs_jsapi_util_string_js_string_utf8);
+    ADD_JSAPI_UTIL_TEST("string/char16_data",
+                        test_jsapi_util_string_char16_data);
+    ADD_JSAPI_UTIL_TEST("string/to_ucs4",
+                        test_jsapi_util_string_to_ucs4);
     ADD_JSAPI_UTIL_TEST("debug_string/valid-utf8",
-                        gjstest_test_func_gjs_jsapi_util_debug_string_valid_utf8);
+                        test_jsapi_util_debug_string_valid_utf8);
     ADD_JSAPI_UTIL_TEST("debug_string/invalid-utf8",
-                        gjstest_test_func_gjs_jsapi_util_debug_string_invalid_utf8);
+                        test_jsapi_util_debug_string_invalid_utf8);
 
 #undef ADD_JSAPI_UTIL_TEST
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]