LibTextCodec: Support validating encoded inputs

This commit is contained in:
Idan Horowitz 2023-11-17 15:08:44 +02:00 committed by Andreas Kling
parent ad4470bc39
commit 079c96376c
Notes: sideshowbarker 2024-07-17 06:54:15 +09:00
2 changed files with 55 additions and 0 deletions

View file

@ -232,6 +232,12 @@ StringView get_output_encoding(StringView encoding)
return encoding;
}
// Reports whether the given input is acceptable to this decoder.
// This base implementation accepts everything; the parameter is intentionally unnamed and unused.
bool Decoder::validate(StringView)
{
// By default, we assume that any input sequence is valid. Decoders for character encodings that do not accept all inputs may override this.
return true;
}
ErrorOr<String> Decoder::to_utf8(StringView input)
{
StringBuilder builder(input.length());
@ -247,6 +253,11 @@ ErrorOr<void> UTF8Decoder::process(StringView input, Function<ErrorOr<void>(u32)
return {};
}
// Reports whether the input is well-formed UTF-8, as judged by Utf8View::validate().
bool UTF8Decoder::validate(StringView input)
{
    // Wrap the raw bytes in a view and let it perform the actual check.
    Utf8View view { input };
    return view.validate();
}
ErrorOr<String> UTF8Decoder::to_utf8(StringView input)
{
// Discard the BOM
@ -299,6 +310,26 @@ ErrorOr<void> UTF16BEDecoder::process(StringView input, Function<ErrorOr<void>(u
return {};
}
// Reports whether the input, read as big-endian 16-bit code units, has correctly paired surrogates.
// Returns false for a low surrogate with no preceding high surrogate, or for a high surrogate
// that is not immediately followed by a low surrogate.
bool UTF16BEDecoder::validate(StringView input)
{
    // Only whole code units are examined; a dangling odd trailing byte is ignored.
    size_t const byte_count = (input.length() / 2) * 2;

    // Assembles the big-endian code unit starting at the given byte offset.
    auto read_code_unit = [&](size_t offset) -> u16 {
        auto const high_byte = static_cast<u8>(input[offset]);
        auto const low_byte = static_cast<u8>(input[offset + 1]);
        return static_cast<u16>((high_byte << 8) | low_byte);
    };

    size_t offset = 0;
    while (offset < byte_count) {
        u16 const unit = read_code_unit(offset);
        offset += 2;

        if (!is_unicode_surrogate(unit))
            continue;

        // A surrogate must be a high surrogate, and there must be another code unit after it.
        bool const is_last_unit = (offset == byte_count);
        if (is_last_unit || !Utf16View::is_high_surrogate(unit))
            return false;

        // The code unit that follows must complete the pair.
        if (!Utf16View::is_low_surrogate(read_code_unit(offset)))
            return false;

        // Step over the low surrogate that was just consumed.
        offset += 2;
    }

    return true;
}
ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
{
// Discard the BOM
@ -352,6 +383,26 @@ ErrorOr<void> UTF16LEDecoder::process(StringView input, Function<ErrorOr<void>(u
return {};
}
// Reports whether the input, read as little-endian 16-bit code units, has correctly paired surrogates.
// Returns false for a low surrogate with no preceding high surrogate, or for a high surrogate
// that is not immediately followed by a low surrogate.
bool UTF16LEDecoder::validate(StringView input)
{
    // Only whole code units are examined; a dangling odd trailing byte is ignored.
    size_t const byte_count = (input.length() / 2) * 2;

    // Assembles the little-endian code unit starting at the given byte offset.
    auto read_code_unit = [&](size_t offset) -> u16 {
        auto const low_byte = static_cast<u8>(input[offset]);
        auto const high_byte = static_cast<u8>(input[offset + 1]);
        return static_cast<u16>(low_byte | (high_byte << 8));
    };

    size_t offset = 0;
    while (offset < byte_count) {
        u16 const unit = read_code_unit(offset);
        offset += 2;

        if (!is_unicode_surrogate(unit))
            continue;

        // A surrogate must be a high surrogate, and there must be another code unit after it.
        bool const is_last_unit = (offset == byte_count);
        if (is_last_unit || !Utf16View::is_high_surrogate(unit))
            return false;

        // The code unit that follows must complete the pair.
        if (!Utf16View::is_low_surrogate(read_code_unit(offset)))
            return false;

        // Step over the low surrogate that was just consumed.
        offset += 2;
    }

    return true;
}
ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
{
// Discard the BOM

View file

@ -18,6 +18,7 @@ namespace TextCodec {
class Decoder {
public:
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) = 0;
virtual bool validate(StringView);
virtual ErrorOr<String> to_utf8(StringView);
protected:
@ -27,18 +28,21 @@ protected:
// Decoder implementation for UTF-8 input. Overrides all three Decoder entry points.
class UTF8Decoder final : public Decoder {
public:
// Takes the input and an on_code_point callback that receives u32 values and may return an error.
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
// Returns whether the input passes Utf8View's validation.
virtual bool validate(StringView) override;
// Produces a String from the input, or an error.
virtual ErrorOr<String> to_utf8(StringView) override;
};
// Decoder implementation for big-endian UTF-16 input. Overrides all three Decoder entry points.
class UTF16BEDecoder final : public Decoder {
public:
// Takes the input and an on_code_point callback that receives u32 values and may return an error.
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
// Returns whether every surrogate code unit in the input is part of a high/low pair.
// A trailing odd byte is not examined.
virtual bool validate(StringView) override;
// Produces a String from the input, or an error.
virtual ErrorOr<String> to_utf8(StringView) override;
};
// Decoder implementation for little-endian UTF-16 input. Overrides all three Decoder entry points.
class UTF16LEDecoder final : public Decoder {
public:
// Takes the input and an on_code_point callback that receives u32 values and may return an error.
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
// Returns whether every surrogate code unit in the input is part of a high/low pair.
// A trailing odd byte is not examined.
virtual bool validate(StringView) override;
// Produces a String from the input, or an error.
virtual ErrorOr<String> to_utf8(StringView) override;
};