From 079c96376c678341c7483ee88e53b4e27662dbf5 Mon Sep 17 00:00:00 2001
From: Idan Horowitz
Date: Fri, 17 Nov 2023 15:08:44 +0200
Subject: [PATCH] LibTextCodec: Support validating encoded inputs

---
NOTE(review): this patch was restored from a flattened copy in which line
breaks and all angle-bracketed text had been stripped. The template arguments
`ErrorOr<String>` and `static_cast<u8>` are reconstructed - confirm against
the upstream commit before applying. The author e-mail was not recoverable.

 Userland/Libraries/LibTextCodec/Decoder.cpp | 51 +++++++++++++++++++++
 Userland/Libraries/LibTextCodec/Decoder.h   |  4 ++
 2 files changed, 55 insertions(+)

diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index 31d0dfc8f5d..cd44aecfe64 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -232,6 +232,12 @@ StringView get_output_encoding(StringView encoding)
     return encoding;
 }
 
+bool Decoder::validate(StringView)
+{
+    // By-default we assume that any input sequence is valid, character encodings that do not accept all inputs may override this
+    return true;
+}
+
 ErrorOr<String> Decoder::to_utf8(StringView input)
 {
     StringBuilder builder(input.length());
@@ -247,6 +253,11 @@ ErrorOr<void> UTF8Decoder::process(StringView input, Function<ErrorOr<void>(u32)
     return {};
 }
 
+bool UTF8Decoder::validate(StringView input)
+{
+    return Utf8View(input).validate();
+}
+
 ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
 {
     // Discard the BOM
@@ -299,6 +310,26 @@ ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u
     return {};
 }
 
+bool UTF16BEDecoder::validate(StringView input)
+{
+    size_t utf16_length = input.length() - (input.length() % 2);
+    for (size_t i = 0; i < utf16_length; i += 2) {
+        u16 w1 = (static_cast<u8>(input[i]) << 8) | static_cast<u8>(input[i + 1]);
+        if (!is_unicode_surrogate(w1))
+            continue;
+
+        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length)
+            return false;
+
+        u16 w2 = (static_cast<u8>(input[i + 2]) << 8) | static_cast<u8>(input[i + 3]);
+        if (!Utf16View::is_low_surrogate(w2))
+            return false;
+
+        i += 2;
+    }
+    return true;
+}
+
 ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
 {
     // Discard the BOM
@@ -352,6 +383,26 @@ ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u
     return {};
 }
 
+bool UTF16LEDecoder::validate(StringView input)
+{
+    size_t utf16_length = input.length() - (input.length() % 2);
+    for (size_t i = 0; i < utf16_length; i += 2) {
+        u16 w1 = static_cast<u8>(input[i]) | (static_cast<u8>(input[i + 1]) << 8);
+        if (!is_unicode_surrogate(w1))
+            continue;
+
+        if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length)
+            return false;
+
+        u16 w2 = static_cast<u8>(input[i + 2]) | (static_cast<u8>(input[i + 3]) << 8);
+        if (!Utf16View::is_low_surrogate(w2))
+            return false;
+
+        i += 2;
+    }
+    return true;
+}
+
 ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
 {
     // Discard the BOM
diff --git a/Userland/Libraries/LibTextCodec/Decoder.h b/Userland/Libraries/LibTextCodec/Decoder.h
index 5586db1fc4f..a4a1c1526b8 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.h
+++ b/Userland/Libraries/LibTextCodec/Decoder.h
@@ -18,6 +18,7 @@ namespace TextCodec {
 class Decoder {
 public:
     virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
+    virtual bool validate(StringView);
     virtual ErrorOr<String> to_utf8(StringView);
 
 protected:
@@ -27,18 +28,21 @@ protected:
 class UTF8Decoder final : public Decoder {
 public:
     virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
+    virtual bool validate(StringView) override;
     virtual ErrorOr<String> to_utf8(StringView) override;
 };
 
 class UTF16BEDecoder final : public Decoder {
 public:
     virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
+    virtual bool validate(StringView) override;
     virtual ErrorOr<String> to_utf8(StringView) override;
 };
 
 class UTF16LEDecoder final : public Decoder {
 public:
     virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
+    virtual bool validate(StringView) override;
     virtual ErrorOr<String> to_utf8(StringView) override;
 };
 