ソースを参照

LibTextCodec+AK: Don't validate UTF-8 strings twice

UTF8Decoder was already converting invalid data into replacement
characters while converting, so we know for sure we have valid UTF-8
by the time conversion is finished.

This patch adds a new StringBuilder::to_string_without_validation()
and uses it to make UTF8Decoder avoid half the work it was doing.
Andreas Kling 1 年前
コミット
3c039903fb
5 ファイル変更、22 行追加、1 行削除
  1. 13 0
      AK/String.cpp
  2. 2 0
      AK/String.h
  3. 5 0
      AK/StringBuilder.cpp
  4. 1 0
      AK/StringBuilder.h
  5. 1 1
      Userland/Libraries/LibTextCodec/Decoder.cpp

+ 13 - 0
AK/String.cpp

@@ -233,6 +233,19 @@ void String::destroy_string()
         m_data->unref();
 }
 
+String String::from_utf8_without_validation(ReadonlyBytes bytes) // Builds a String from `bytes` without the Utf8View validation that from_utf8(StringView) performs.
+{
+    if (bytes.size() <= MAX_SHORT_STRING_BYTE_COUNT) { // Inputs that fit are stored inline in a ShortString instead of going through StringData.
+        ShortString short_string;
+        if (!bytes.is_empty()) // Skip the copy entirely for empty input.
+            memcpy(short_string.storage, bytes.data(), bytes.size());
+        short_string.byte_count_and_short_string_flag = (bytes.size() << 1) | SHORT_STRING_FLAG; // Byte count is shifted left by one and OR-ed with SHORT_STRING_FLAG.
+        return String { short_string };
+    }
+    auto data = MUST(Detail::StringData::from_utf8(reinterpret_cast<char const*>(bytes.data()), bytes.size())); // Longer inputs go through StringData; MUST unwraps the result because this function returns a plain String, not ErrorOr<String>.
+    return String { move(data) };
+}
+
 ErrorOr<String> String::from_utf8(StringView view)
 {
     if (!Utf8View { view }.validate())

+ 2 - 0
AK/String.h

@@ -70,6 +70,8 @@ public:
     requires(IsOneOf<RemoveCVReference<T>, ByteString, DeprecatedFlyString, FlyString, String>)
     static ErrorOr<String> from_utf8(T&&) = delete;
 
+    [[nodiscard]] static String from_utf8_without_validation(ReadonlyBytes); // Creates a new String from bytes without validating them as UTF-8; NOTE(review): callers presumably must guarantee the bytes are already valid UTF-8.
+
     // Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
     static ErrorOr<String> from_stream(Stream&, size_t byte_count);
 

+ 5 - 0
AK/StringBuilder.cpp

@@ -156,6 +156,11 @@ ErrorOr<String> StringBuilder::to_string() const
     return String::from_utf8(string_view());
 }
 
+String StringBuilder::to_string_without_validation() const // Returns the builder's current contents as a String via String::from_utf8_without_validation(), unlike to_string() which goes through String::from_utf8().
+{
+    return String::from_utf8_without_validation(string_view().bytes());
+}
+
 ErrorOr<FlyString> StringBuilder::to_fly_string() const
 {
     return FlyString::from_utf8(string_view());

+ 1 - 0
AK/StringBuilder.h

@@ -73,6 +73,7 @@ public:
     [[nodiscard]] ByteString to_byte_string() const;
 #endif
 
+    [[nodiscard]] String to_string_without_validation() const; // Non-failing counterpart of to_string() that skips UTF-8 validation; NOTE(review): presumably only safe when everything appended is known to be valid UTF-8.
     ErrorOr<String> to_string() const;
     ErrorOr<FlyString> to_fly_string() const;
 

+ 1 - 1
Userland/Libraries/LibTextCodec/Decoder.cpp

@@ -247,7 +247,7 @@ ErrorOr<String> Decoder::to_utf8(StringView input)
 {
     StringBuilder builder(input.length());
     TRY(process(input, [&builder](u32 c) { return builder.try_append_code_point(c); }));
-    return builder.to_string();
+    return builder.to_string_without_validation();
 }
 
 ErrorOr<void> UTF8Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)