mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-25 00:50:22 +00:00
AK+LibUnicode: Implement a case-insensitive variant of find_byte_offset
The existing String::find_byte_offset is case-sensitive. This variant allows performing searches using Unicode-aware case folding.
This commit is contained in:
parent
cf6aa77816
commit
fe3fde2411
Notes:
sideshowbarker
2024-07-16 21:30:46 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/fe3fde2411 Pull-request: https://github.com/SerenityOS/serenity/pull/24501
5 changed files with 188 additions and 0 deletions
|
@ -132,6 +132,9 @@ public:
|
|||
Optional<size_t> find_byte_offset(u32 code_point, size_t from_byte_offset = 0) const;
|
||||
Optional<size_t> find_byte_offset(StringView substring, size_t from_byte_offset = 0) const;
|
||||
|
||||
// Using this method requires linking LibUnicode into your application.
|
||||
Optional<size_t> find_byte_offset_ignoring_case(StringView, size_t from_byte_offset = 0) const;
|
||||
|
||||
[[nodiscard]] bool operator==(String const&) const = default;
|
||||
[[nodiscard]] bool operator==(FlyString const&) const;
|
||||
[[nodiscard]] bool operator==(StringView) const;
|
||||
|
|
|
@ -920,6 +920,133 @@ TEST_CASE(find_byte_offset)
|
|||
}
|
||||
}
|
||||
|
||||
TEST_CASE(find_byte_offset_ignoring_case)
|
||||
{
|
||||
{
|
||||
auto string = ""_string;
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case(""sv).has_value(), false);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("1"sv).has_value(), false);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("2"sv).has_value(), false);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("23"sv).has_value(), false);
|
||||
}
|
||||
{
|
||||
auto string = "1234567"_string;
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case(""sv), 0u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("1"sv), 0u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("2"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("3"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("4"sv), 3u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("5"sv), 4u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("6"sv), 5u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("7"sv), 6u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("34"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("45"sv), 3u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("56"sv), 4u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("67"sv), 5u);
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("a"sv).has_value(), false);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("8"sv).has_value(), false);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("78"sv).has_value(), false);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("46"sv).has_value(), false);
|
||||
}
|
||||
{
|
||||
auto string = "abCDef"_string;
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("A"sv), 0u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("B"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("c"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("d"sv), 3u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("e"sv), 4u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("f"sv), 5u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("AbC"sv), 0u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("BcdE"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("cd"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("cD"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("E"sv), 4u);
|
||||
}
|
||||
{
|
||||
auto string = "abßcd"_string;
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("SS"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("Ss"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ss"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("S"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("s"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ß"sv), 2u);
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bSS"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bSs"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bss"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bS"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bs"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bß"sv), 1u);
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bSSc"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bSsc"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bssc"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bßc"sv), 1u);
|
||||
EXPECT(!string.find_byte_offset_ignoring_case("bSc"sv).has_value());
|
||||
EXPECT(!string.find_byte_offset_ignoring_case("bsc"sv).has_value());
|
||||
}
|
||||
{
|
||||
auto string = "abSScd"_string;
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("SS"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("Ss"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ss"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("S"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("s"sv), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ß"sv), 2u);
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bSS"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bSs"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bss"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bS"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bs"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bß"sv), 1u);
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bSSc"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bSsc"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bssc"sv), 1u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("bßc"sv), 1u);
|
||||
EXPECT(!string.find_byte_offset_ignoring_case("bSc"sv).has_value());
|
||||
EXPECT(!string.find_byte_offset_ignoring_case("bsc"sv).has_value());
|
||||
}
|
||||
{
|
||||
auto string = "ßSßs"_string;
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("SS"sv), 0u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("Ss"sv), 0u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ss"sv), 0u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("S"sv), 0u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("s"sv), 0u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ß"sv), 0u);
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("SS"sv, 2), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("Ss"sv, 2), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ss"sv, 2), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("S"sv, 2), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("s"sv, 2), 2u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ß"sv, 2), 2u);
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("SS"sv, 3), 3u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("Ss"sv, 3), 3u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ss"sv, 3), 3u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("S"sv, 3), 3u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("s"sv, 3), 3u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("ß"sv, 3), 3u);
|
||||
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("S"sv, 5), 5u);
|
||||
EXPECT_EQ(string.find_byte_offset_ignoring_case("s"sv, 5), 5u);
|
||||
EXPECT(!string.find_byte_offset_ignoring_case("SS"sv, 5).has_value());
|
||||
EXPECT(!string.find_byte_offset_ignoring_case("Ss"sv, 5).has_value());
|
||||
EXPECT(!string.find_byte_offset_ignoring_case("ss"sv, 5).has_value());
|
||||
EXPECT(!string.find_byte_offset_ignoring_case("ß"sv, 5).has_value());
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE(repeated)
|
||||
{
|
||||
{
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Platform.h>
|
||||
#include <AK/ScopeGuard.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Utf16View.h>
|
||||
|
@ -55,6 +56,18 @@ public:
|
|||
return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
|
||||
}
|
||||
|
||||
size_t index() const
|
||||
{
|
||||
if constexpr (IsSame<ViewType, Utf8View>)
|
||||
return m_string.byte_offset_of(m_it);
|
||||
else if constexpr (IsSame<ViewType, Utf16View>)
|
||||
return m_string.code_unit_offset_of(m_it);
|
||||
else if constexpr (IsSame<ViewType, Utf32View>)
|
||||
return m_string.iterator_offset(m_it);
|
||||
else
|
||||
static_assert(DependentFalse<ViewType>);
|
||||
}
|
||||
|
||||
u32 next_code_point()
|
||||
{
|
||||
VERIFY(has_more_data());
|
||||
|
@ -103,6 +116,38 @@ template bool equals_ignoring_case(Utf8View, Utf8View);
|
|||
template bool equals_ignoring_case(Utf16View, Utf16View);
|
||||
template bool equals_ignoring_case(Utf32View, Utf32View);
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> find_ignoring_case(ViewType lhs, ViewType rhs)
|
||||
{
|
||||
CasefoldStringComparator lhs_comparator { lhs };
|
||||
|
||||
while (lhs_comparator.has_more_data()) {
|
||||
CasefoldStringComparator rhs_comparator { rhs };
|
||||
|
||||
auto saved_state = lhs_comparator;
|
||||
auto matches = true;
|
||||
|
||||
while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) {
|
||||
if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point()) {
|
||||
matches = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (matches && !rhs_comparator.has_more_data())
|
||||
return saved_state.index();
|
||||
|
||||
lhs_comparator = move(saved_state);
|
||||
lhs_comparator.next_code_point();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
template Optional<size_t> find_ignoring_case(Utf8View, Utf8View);
|
||||
template Optional<size_t> find_ignoring_case(Utf16View, Utf16View);
|
||||
template Optional<size_t> find_ignoring_case(Utf32View, Utf32View);
|
||||
|
||||
Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
|
||||
bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
|
||||
Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
|
||||
|
|
|
@ -51,6 +51,9 @@ u32 to_unicode_titlecase(u32 code_point);
|
|||
template<typename ViewType>
|
||||
bool equals_ignoring_case(ViewType, ViewType);
|
||||
|
||||
template<typename ViewType>
|
||||
Optional<size_t> find_ignoring_case(ViewType, ViewType);
|
||||
|
||||
Optional<GeneralCategory> general_category_from_string(StringView);
|
||||
bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
|
||||
|
||||
|
|
|
@ -47,4 +47,14 @@ bool String::equals_ignoring_case(String const& other) const
|
|||
return Unicode::equals_ignoring_case(code_points(), other.code_points());
|
||||
}
|
||||
|
||||
Optional<size_t> String::find_byte_offset_ignoring_case(StringView needle, size_t from_byte_offset) const
|
||||
{
|
||||
auto haystack = code_points().substring_view(from_byte_offset);
|
||||
|
||||
if (auto index = Unicode::find_ignoring_case(haystack, Utf8View { needle }); index.has_value())
|
||||
return *index + from_byte_offset;
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue