[gjs/ewlsh/text-encoding: 2/3] modules: Implement non-fatal encoding and decoding




commit 8c5a98b1d5fd979d405c0897b91a38b42c42ffbb
Author: Evan Welsh <contact evanwelsh com>
Date:   Sun Jul 4 22:16:36 2021 -0700

    modules: Implement non-fatal encoding and decoding

 gjs/byteArray.cpp                 |   4 +-
 gjs/jsapi-util-string.cpp         |  44 ++++++++
 gjs/jsapi-util.h                  |   5 +
 gjs/text-encoding.cpp             | 204 ++++++++++++++++++++++++++++++++------
 gjs/text-encoding.h               |   3 +-
 modules/esm/_encoding/encoding.js |   2 +-
 6 files changed, 227 insertions(+), 35 deletions(-)
---
diff --git a/gjs/byteArray.cpp b/gjs/byteArray.cpp
index a5979df0..e3afc11c 100644
--- a/gjs/byteArray.cpp
+++ b/gjs/byteArray.cpp
@@ -50,7 +50,7 @@ static bool to_string_func(JSContext* cx, unsigned argc, JS::Value* vp) {
     const char* actual_encoding = encoding ? encoding.get() : "utf-8";
     JS::RootedString str(
         cx, gjs_decode_from_uint8array(cx, byte_array, actual_encoding,
-                                       GjsStringTermination::ZERO_TERMINATED));
+                                       GjsStringTermination::ZERO_TERMINATED, true));
     if (!str)
         return false;
 
@@ -76,7 +76,7 @@ static bool instance_to_string_func(JSContext* cx, unsigned argc,
     const char* actual_encoding = encoding ? encoding.get() : "utf-8";
     JS::RootedString str(
         cx, gjs_decode_from_uint8array(cx, this_obj, actual_encoding,
-                                       GjsStringTermination::ZERO_TERMINATED));
+                                       GjsStringTermination::ZERO_TERMINATED, true));
     if (!str)
         return false;
 
diff --git a/gjs/jsapi-util-string.cpp b/gjs/jsapi-util-string.cpp
index 51ca5f9c..692a719d 100644
--- a/gjs/jsapi-util-string.cpp
+++ b/gjs/jsapi-util-string.cpp
@@ -129,6 +129,50 @@ bool gjs_string_to_utf8_n(JSContext* cx, JS::HandleString str, JS::UniqueChars*
     return true;
 }
 
+/**
+ * gjs_lossy_string_from_utf8:
+ *
+ * @brief Converts a NUL-terminated UTF-8 string to a JS string.
+ * Instead of throwing, any invalid sequences will be converted
+ * to the Unicode replacement character (U+FFFD).
+ *
+ * @param cx the current #JSContext
+ * @param utf8_string a NUL-terminated array of UTF-8 characters
+ * @returns the resulting JS string, or nullptr on failure
+ */
+JSString* gjs_lossy_string_from_utf8(JSContext* cx, const char* utf8_string) {
+    JS::ConstUTF8CharsZ chars(utf8_string, strlen(utf8_string));
+    size_t outlen;
+    JS::UniqueTwoByteChars twobyte_chars(
+        JS::LossyUTF8CharsToNewTwoByteCharsZ(cx, chars, &outlen,
+                                             js::MallocArena)
+            .get());
+    if (!twobyte_chars)
+        return nullptr;
+
+    return JS_NewUCStringCopyN(cx, twobyte_chars.get(), outlen);
+}
+
+/**
+ * gjs_lossy_string_from_utf8_n:
+ *
+ * @brief Provides the same lossy conversion as gjs_lossy_string_from_utf8()
+ * for a buffer whose length is passed explicitly in @len.
+ */
+JSString* gjs_lossy_string_from_utf8_n(JSContext* cx, const char* utf8_string,
+                                       size_t len) {
+    JS::UTF8Chars chars(utf8_string, len);
+    size_t outlen;
+    JS::UniqueTwoByteChars twobyte_chars(
+        JS::LossyUTF8CharsToNewTwoByteCharsZ(cx, chars, &outlen,
+                                             js::MallocArena)
+            .get());
+    if (!twobyte_chars)
+        return nullptr;
+
+    return JS_NewUCStringCopyN(cx, twobyte_chars.get(), outlen);
+}
+
 bool
 gjs_string_from_utf8(JSContext             *context,
                      const char            *utf8_string,
diff --git a/gjs/jsapi-util.h b/gjs/jsapi-util.h
index ec648347..388de031 100644
--- a/gjs/jsapi-util.h
+++ b/gjs/jsapi-util.h
@@ -454,6 +454,11 @@ GJS_JSAPI_RETURN_CONVENTION
 bool gjs_string_to_utf8_n(JSContext* cx, JS::HandleString str, JS::UniqueChars* output,
                           size_t* output_len);
 GJS_JSAPI_RETURN_CONVENTION
+JSString* gjs_lossy_string_from_utf8(JSContext* cx, const char* utf8_string);
+GJS_JSAPI_RETURN_CONVENTION
+JSString* gjs_lossy_string_from_utf8_n(JSContext* cx, const char* utf8_string,
+                                       size_t len);
+GJS_JSAPI_RETURN_CONVENTION
 bool gjs_string_from_utf8(JSContext             *context,
                           const char            *utf8_string,
                           JS::MutableHandleValue value_p);
diff --git a/gjs/text-encoding.cpp b/gjs/text-encoding.cpp
index 3aba1827..2dd384e8 100644
--- a/gjs/text-encoding.cpp
+++ b/gjs/text-encoding.cpp
@@ -62,9 +62,141 @@ static const char* UTF16_CODESET = "UTF-16BE";
 #endif
 
 GJS_JSAPI_RETURN_CONVENTION
-static JSString* gjs_decode_from_uint8array_slow(JSContext* cx, uint8_t* input,
+static JSString* gjs_lossy_decode_from_uint8array_slow(
+    JSContext* cx, const uint8_t* bytes, size_t bytes_len,
+    const char* from_codeset) {
+    GError* error = nullptr;
+    GjsAutoUnref<GCharsetConverter> converter(
+        g_charset_converter_new(UTF16_CODESET, from_codeset, &error));
+
+    // This should only throw if an encoding is not available.
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // This function converts *to* UTF-16, using a std::u16string
+    // as its buffer.
+    //
+    // UTF-16 represents each character with 2 bytes or
+    // 4 bytes. The best case scenario when converting to
+    // UTF-16 is that every input byte encodes to two bytes;
+    // this is typical for ASCII and non-supplementary characters.
+    // Because we are converting from an unknown encoding,
+    // technically a single byte could be supplementary in
+    // Unicode (4 bytes) or even represent multiple Unicode characters.
+    //
+    // std::u16string does not care about these implementation
+    // details; its only concern is that it consists of byte pairs.
+    // Given this, a single UTF-16 character could be represented
+    // by one or two std::u16string characters.
+
+    // Allocate bytes_len * 2 + 12 as our initial buffer.
+    // bytes_len * 2 is the "best case" for LATIN1 strings
+    // and strings which are in the basic multilingual plane.
+    // Add 12 as a slight cushion and set the minimum allocation
+    // at 256 to prefer running a single iteration for
+    // small strings with supplemental plane characters.
+    //
+    // When converting Chinese characters, for example,
+    // some dialectal characters are in the supplemental plane.
+    // Adding a padding of 12 prevents a few dialectal characters
+    // from requiring a reallocation.
+    size_t buffer_size = std::max<size_t>(bytes_len * 2 + 12, 256);
+
+    // Cast data to correct input types
+    const char* input = reinterpret_cast<const char*>(bytes);
+    size_t input_len = bytes_len;
+
+    // The base string that we'll append to.
+    std::u16string output_str = u"";
+
+    do {
+        // Create a buffer to convert into.
+        std::vector<char> buffer(buffer_size);
+        size_t bytes_written = 0, bytes_read = 0;
+
+        g_converter_convert(G_CONVERTER(converter.get()), input, input_len,
+                            buffer.data(), buffer.size(),
+                            G_CONVERTER_INPUT_AT_END, &bytes_read,
+                            &bytes_written, &error);
+
+        // If bytes were read, adjust input.
+        if (bytes_read > 0) {
+            input += bytes_read;
+            input_len -= bytes_read;
+        }
+
+        // If bytes were written append the buffer contents to our string
+        // accumulator
+        if (bytes_written > 0) {
+            char16_t* utf16_buffer = reinterpret_cast<char16_t*>(buffer.data());
+            // std::u16string uses exactly 2 bytes for every character.
+            output_str.append(utf16_buffer, bytes_written / 2);
+        } else if (error) {
+            // A PARTIAL_INPUT error can only occur if the user does not provide
+            // the full sequence for a multi-byte character; we skip over the
+            // next byte and insert a Unicode fallback.
+
+            // An INVALID_DATA error occurs when there is no way to decode a
+            // given byte into UTF-16 or the given byte does not exist in the
+            // source encoding.
+            if (g_error_matches(error, G_IO_ERROR, G_IO_ERROR_INVALID_DATA) ||
+                g_error_matches(error, G_IO_ERROR, G_IO_ERROR_PARTIAL_INPUT)) {
+                // If we're already at the end of the string, don't insert a
+                // fallback.
+                if (input_len > 0) {
+                    // Skip the next byte and reduce length by one.
+                    input += 1;
+                    input_len -= 1;
+
+                    // Append the unicode fallback character to the output
+                    output_str.append(u"\ufffd", 1);
+                }
+
+                // Clear the error.
+                g_clear_error(&error);
+            } else if (g_error_matches(error, G_IO_ERROR,
+                                       G_IO_ERROR_NO_SPACE)) {
+                // If the buffer was full increase the buffer
+                // size and re-try the conversion.
+                //
+                // This logic grows the buffer by bytes_len per retry
+                // until it exceeds bytes_len * 4 (the worst case scenario
+                // is nearly impossible) and then continues appending
+                // 256 bytes of padding because we'll trust Gio and give
+                // it additional space.
+                if (buffer_size > bytes_len * 4) {
+                    buffer_size += 256;
+                } else {
+                    buffer_size += bytes_len;
+                }
+
+                // Clear the error.
+                g_clear_error(&error);
+            }
+        }
+
+        // Stop decoding if an unknown error occurs.
+    } while (input_len > 0 && !error);
+
+    // An unexpected error occurred.
+    if (error)
+        return gjs_throw_type_error_from_gerror(cx, error);
+
+    // Copy the accumulator's data into a JSString of Unicode (UTF-16) chars.
+    return JS_NewUCStringCopyN(cx, output_str.c_str(), output_str.size());
+}
+
+GJS_JSAPI_RETURN_CONVENTION
+static JSString* gjs_decode_from_uint8array_slow(JSContext* cx,
+                                                 const uint8_t* input,
                                                  uint32_t input_len,
-                                                 const char* encoding) {
+                                                 const char* encoding,
+                                                 bool fatal) {
+    // If the decoding is not fatal we use the lossy decoder.
+    if (!fatal)
+        return gjs_lossy_decode_from_uint8array_slow(cx, input, input_len,
+                                                     encoding);
+
     size_t bytes_written, bytes_read;
     GError* error = nullptr;
 
@@ -120,7 +252,8 @@ template <class T, class L>
 // decode() function implementation
 JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
                                      const char* encoding,
-                                     GjsStringTermination string_termination) {
+                                     GjsStringTermination string_termination,
+                                     bool fatal) {
     g_assert(encoding && "encoding must be non-null");
 
     if (!JS_IsUint8Array(byte_array)) {
@@ -148,35 +281,41 @@ JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
     // and encoders.
     bool encoding_is_utf8 = is_utf8_label(encoding);
     if (!encoding_is_utf8)
-        return gjs_decode_from_uint8array_slow(cx, data, len, encoding);
+        return gjs_decode_from_uint8array_slow(cx, data, len, encoding, fatal);
 
     JS::RootedString decoded(cx);
-    JS::UTF8Chars chars(reinterpret_cast<char*>(data), len);
-    JS::RootedString str(cx, JS_NewStringCopyUTF8N(cx, chars));
-
-    // If an exception occurred, we need to check if the
-    // exception was an InternalError. Unfortunately,
-    // SpiderMonkey's decoder can throw InternalError for some
-    // invalid UTF-8 sources, we have to convert this into a
-    // TypeError to match the Encoding specification.
-    if (str) {
-        decoded.set(str);
+    if (!fatal) {
+        decoded.set(gjs_lossy_string_from_utf8_n(
+            cx, reinterpret_cast<char*>(data), len));
     } else {
-        JS::RootedValue exc(cx);
-        if (!JS_GetPendingException(cx, &exc) || !exc.isObject())
-            return nullptr;
+        JS::UTF8Chars chars(reinterpret_cast<char*>(data), len);
+        JS::RootedString str(cx, JS_NewStringCopyUTF8N(cx, chars));
+
+        // If an exception occurred, we need to check if the
+        // exception was an InternalError. Unfortunately,
+        // SpiderMonkey's decoder can throw InternalError for some
+        // invalid UTF-8 sources, we have to convert this into a
+        // TypeError to match the Encoding specification.
+        if (str) {
+            decoded.set(str);
+        } else {
+            JS::RootedValue exc(cx);
+            if (!JS_GetPendingException(cx, &exc) || !exc.isObject())
+                return nullptr;
+
+            JS::RootedObject exc_obj(cx, &exc.toObject());
+            const JSClass* internal_error =
+                js::ProtoKeyToClass(JSProto_InternalError);
+            if (JS_InstanceOf(cx, exc_obj, internal_error, nullptr)) {
+                // Clear the existing exception.
+                JS_ClearPendingException(cx);
+                gjs_throw_custom(
+                    cx, JSProto_TypeError, nullptr,
+                    "The provided encoded data was not valid UTF-8");
+            }
 
-        JS::RootedObject exc_obj(cx, &exc.toObject());
-        const JSClass* internal_error =
-            js::ProtoKeyToClass(JSProto_InternalError);
-        if (JS_InstanceOf(cx, exc_obj, internal_error, nullptr)) {
-            // Clear the existing exception.
-            JS_ClearPendingException(cx);
-            gjs_throw_custom(cx, JSProto_TypeError, nullptr,
-                             "The provided encoded data was not valid UTF-8");
+            return nullptr;
         }
-
-        return nullptr;
     }
 
     uint8_t* current_data;
@@ -203,7 +342,7 @@ JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject byte_array,
 
     // This was the UTF-8 optimized path, so we explicitly pass the encoding
     return gjs_decode_from_uint8array_slow(cx, current_data, current_len,
-                                           "UTF-8");
+                                           "utf-8", fatal);
 }
 
 GJS_JSAPI_RETURN_CONVENTION
@@ -212,13 +351,16 @@ static bool gjs_decode(JSContext* cx, unsigned argc, JS::Value* vp) {
 
     JS::RootedObject byte_array(cx);
     JS::UniqueChars encoding;
-    if (!gjs_parse_call_args(cx, "decode", args, "os", "byteArray", &byte_array,
-                             "encoding", &encoding))
+    bool fatal = false;
+    if (!gjs_parse_call_args(cx, "decode", args, "os|b", "byteArray",
+                             &byte_array, "encoding", &encoding, "fatal",
+                             &fatal))
         return false;
 
     JS::RootedString decoded(
         cx, gjs_decode_from_uint8array(cx, byte_array, encoding.get(),
-                                       GjsStringTermination::EXPLICIT_LENGTH));
+                                       GjsStringTermination::EXPLICIT_LENGTH,
+                                       fatal));
     if (!decoded)
         return false;
 
diff --git a/gjs/text-encoding.h b/gjs/text-encoding.h
index eee174bb..e4daa85e 100644
--- a/gjs/text-encoding.h
+++ b/gjs/text-encoding.h
@@ -22,7 +22,8 @@ enum class GjsStringTermination {
 GJS_JSAPI_RETURN_CONVENTION
 JSString* gjs_decode_from_uint8array(JSContext* cx, JS::HandleObject uint8array,
                                      const char* encoding,
-                                     GjsStringTermination string_termination);
+                                     GjsStringTermination string_termination,
+                                     bool fatal);
 
 GJS_JSAPI_RETURN_CONVENTION
 JSObject* gjs_encode_to_uint8array(JSContext* cx, JS::HandleString str,
diff --git a/modules/esm/_encoding/encoding.js b/modules/esm/_encoding/encoding.js
index 3e2f449b..e84b752b 100644
--- a/modules/esm/_encoding/encoding.js
+++ b/modules/esm/_encoding/encoding.js
@@ -132,7 +132,7 @@ class TextDecoder {
             input = new Uint8Array(buffer, byteOffset + 3, byteLength - 3);
         }
 
-        return Encoding.decode(input, this._internalEncoding);
+        return Encoding.decode(input, this._internalEncoding, this.fatal);
     }
 }
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]