فهرست منبع

LibWeb: Implement spec-compliant integer parsing

We have code inside LibWeb that uses the
`AK::StringUtils::convert_to_uint`and `AK::StringUtils::convert_to_int`
methods for parsing integers. This works well for the most part, but
according to the spec, trailing characters are allowed and should be
ignored, but this is not how the `StringUtil` methods are implemented.

This patch adds two new methods named `parse_integer` and
`parse_non_negative_integer` inside the `Web::HTML` namespace that uses
`StringUtils` under the hood but adds a bit more logic to make it spec
compliant.
Jonatan Klemets 2 سال پیش
والد
کامیت
9812031a02

+ 1 - 0
Tests/LibWeb/CMakeLists.txt

@@ -2,6 +2,7 @@ set(TEST_SOURCES
     TestCSSIDSpeed.cpp
     TestCSSPixels.cpp
     TestHTMLTokenizer.cpp
+    TestNumbers.cpp
 )
 
 foreach(source IN LISTS TEST_SOURCES)

+ 106 - 0
Tests/LibWeb/TestNumbers.cpp

@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2023, Jonatan Klemets <jonatan.r.klemets@gmail.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibTest/TestCase.h>
+
+#include <LibWeb/HTML/Numbers.h>
+
+TEST_CASE(parse_integer)
+{
+    auto optional_value = Web::HTML::parse_integer(""sv);
+    EXPECT(!optional_value.has_value());
+
+    optional_value = Web::HTML::parse_integer("123"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 123);
+
+    optional_value = Web::HTML::parse_integer(" 456"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 456);
+
+    optional_value = Web::HTML::parse_integer("789 "sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 789);
+
+    optional_value = Web::HTML::parse_integer("   22   "sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 22);
+
+    optional_value = Web::HTML::parse_integer(" \n\t31\t\t\n\n"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 31);
+
+    optional_value = Web::HTML::parse_integer("765foo"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 765);
+
+    optional_value = Web::HTML::parse_integer("3;"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 3);
+
+    optional_value = Web::HTML::parse_integer("foo765"sv);
+    EXPECT(!optional_value.has_value());
+
+    optional_value = Web::HTML::parse_integer("1"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 1);
+
+    optional_value = Web::HTML::parse_integer("+2"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 2);
+
+    optional_value = Web::HTML::parse_integer("-3"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), -3);
+}
+
+TEST_CASE(parse_non_negative_integer)
+{
+    auto optional_value = Web::HTML::parse_non_negative_integer(""sv);
+    EXPECT(!optional_value.has_value());
+
+    optional_value = Web::HTML::parse_non_negative_integer("123"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 123u);
+
+    optional_value = Web::HTML::parse_non_negative_integer(" 456"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 456u);
+
+    optional_value = Web::HTML::parse_non_negative_integer("789 "sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 789u);
+
+    optional_value = Web::HTML::parse_non_negative_integer("   22   "sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 22u);
+
+    optional_value = Web::HTML::parse_non_negative_integer(" \n\t31\t\t\n\n"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 31u);
+
+    optional_value = Web::HTML::parse_non_negative_integer("765foo"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 765u);
+
+    optional_value = Web::HTML::parse_non_negative_integer("3;"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 3u);
+
+    optional_value = Web::HTML::parse_non_negative_integer("foo765"sv);
+    EXPECT(!optional_value.has_value());
+
+    optional_value = Web::HTML::parse_non_negative_integer("1"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 1u);
+
+    optional_value = Web::HTML::parse_non_negative_integer("+2"sv);
+    EXPECT(optional_value.has_value());
+    EXPECT_EQ(optional_value.value(), 2u);
+
+    optional_value = Web::HTML::parse_non_negative_integer("-3"sv);
+    EXPECT(!optional_value.has_value());
+}

+ 1 - 0
Userland/Libraries/LibWeb/CMakeLists.txt

@@ -358,6 +358,7 @@ set(SOURCES
     HTML/NavigationTransition.cpp
     HTML/Navigator.cpp
     HTML/NavigatorID.cpp
+    HTML/Numbers.cpp
     HTML/PageTransitionEvent.cpp
     HTML/Parser/Entities.cpp
     HTML/Parser/HTMLEncodingDetection.cpp

+ 82 - 0
Userland/Libraries/LibWeb/HTML/Numbers.cpp

@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2023, Jonatan Klemets <jonatan.r.klemets@gmail.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/GenericLexer.h>
+#include <LibWeb/HTML/Numbers.h>
+#include <LibWeb/Infra/CharacterTypes.h>
+
+namespace Web::HTML {
+
+// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-integers
+Optional<i32> parse_integer(StringView string)
+{
+    // 1. Let input be the string being parsed.
+    // 2. Let position be a pointer into input, initially pointing at the start of the string.
+    GenericLexer lexer { string };
+
+    // 3. Let sign have the value "positive".
+    // NOTE: Skipped, see comment on step 6.
+
+    // 4. Skip ASCII whitespace within input given position.
+    lexer.ignore_while(Web::Infra::is_ascii_whitespace);
+
+    // 5. If position is past the end of input, return an error.
+    if (lexer.is_eof()) {
+        return {};
+    }
+
+    // 6. If the character indicated by position (the first character) is a U+002D HYPHEN-MINUS character (-):
+    //
+    // If we parse a signed integer, then we include the sign character (if present) in the collect step
+    // (step 8) and lean on `AK::StringUtils::convert_to_int` to handle it for us.
+    size_t start_index = lexer.tell();
+    if (lexer.peek() == '-' || lexer.peek() == '+') {
+        lexer.consume();
+    }
+
+    // 7. If the character indicated by position is not an ASCII digit, then return an error.
+    if (!lexer.next_is(is_ascii_digit)) {
+        return {};
+    }
+
+    // 8. Collect a sequence of code points that are ASCII digits from input given position, and interpret the resulting sequence as a base-ten integer. Let value be that integer.
+    lexer.consume_while(is_ascii_digit);
+    size_t end_index = lexer.tell();
+    auto digits = lexer.input().substring_view(start_index, end_index - start_index);
+    auto optional_value = AK::StringUtils::convert_to_int<i32>(digits);
+
+    // 9. If sign is "positive", return value, otherwise return the result of subtracting value from zero.
+    // NOTE: Skipped, see comment on step 6.
+
+    return optional_value;
+}
+
+// https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#rules-for-parsing-non-negative-integers
+Optional<u32> parse_non_negative_integer(StringView string)
+{
+    // 1. Let input be the string being parsed.
+    // 2. Let value be the result of parsing input using the rules for parsing integers.
+    //
+    // NOTE: Because we call `parse_integer`, we parse all integers as signed. If we need the extra
+    //       size that an unsigned integer offers, then this would need to be improved. That said,
+    //       I don't think we need to support such large integers at the moment.
+    auto optional_value = parse_integer(string);
+
+    // 3. If value is an error, return an error.
+    if (!optional_value.has_value()) {
+        return {};
+    }
+
+    // 4. If value is less than zero, return an error.
+    if (optional_value.value() < 0) {
+        return {};
+    }
+
+    // 5. Return value.
+    return static_cast<u32>(optional_value.value());
+}
+
+}

+ 18 - 0
Userland/Libraries/LibWeb/HTML/Numbers.h

@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2023, Jonatan Klemets <jonatan.r.klemets@gmail.com>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Forward.h>
+#include <AK/String.h>
+
+namespace Web::HTML {
+
+Optional<i32> parse_integer(StringView string);
+
+Optional<u32> parse_non_negative_integer(StringView string);
+
+}