mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-22 15:40:19 +00:00
LibUnicode: Fix typos causing text segmentation on mid-word punctuation
For example the words "can't" and "32.3" should not have boundaries detected on the "'" and "." code points, respectively. The String test cases fixed here are because "b'ar" is now considered one word.
This commit is contained in:
parent
6e7a6e2d02
commit
5cbf054651
Notes:
sideshowbarker
2024-07-17 00:24:31 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/5cbf054651 Pull-request: https://github.com/SerenityOS/serenity/pull/17477
4 changed files with 45 additions and 5 deletions
|
@ -211,7 +211,7 @@ TEST_CASE(to_titlecase)
|
|||
{
|
||||
auto string = MUST(String::from_utf8("f\"oo\" b'ar'"sv));
|
||||
auto result = MUST(string.to_titlecase());
|
||||
EXPECT_EQ(result, "F\"Oo\" B'Ar'"sv);
|
||||
EXPECT_EQ(result, "F\"Oo\" B'ar'"sv);
|
||||
}
|
||||
{
|
||||
auto string = MUST(String::from_utf8("123dollars"sv));
|
||||
|
|
|
@ -93,7 +93,7 @@ TEST_CASE(to_unicode_titlecase)
|
|||
|
||||
EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo bar baz"sv)), "Foo Bar Baz"sv);
|
||||
EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo \n \r bar \t baz"sv)), "Foo \n \r Bar \t Baz"sv);
|
||||
EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'Ar'"sv);
|
||||
EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'ar'"sv);
|
||||
EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv);
|
||||
}
|
||||
|
||||
|
|
|
@ -82,7 +82,6 @@ describe("correct behavior", () => {
|
|||
];
|
||||
index = 0;
|
||||
for (const segment of wordSegments) {
|
||||
console.log(JSON.stringify(segment));
|
||||
expect(segment.segment).toBe(expectedSegments[index].segment);
|
||||
expect(segment.index).toBe(expectedSegments[index].index);
|
||||
expect(segment.input).toBe(string);
|
||||
|
@ -103,4 +102,45 @@ describe("correct behavior", () => {
|
|||
}
|
||||
expect(index).toBe(1);
|
||||
});
|
||||
|
||||
test("word segmentation of string with mid-word punctuation", () => {
|
||||
const string = "The quick (“brown”) fox can’t jump 32.3 feet, right?";
|
||||
|
||||
const segmenter = new Intl.Segmenter([], { granularity: "word" });
|
||||
const segments = segmenter.segment(string);
|
||||
|
||||
const expectedSegments = [
|
||||
{ segment: "The", index: 0, isWordLike: true },
|
||||
{ segment: " ", index: 3, isWordLike: false },
|
||||
{ segment: "quick", index: 4, isWordLike: true },
|
||||
{ segment: " ", index: 9, isWordLike: false },
|
||||
{ segment: "(", index: 10, isWordLike: false },
|
||||
{ segment: "“", index: 11, isWordLike: false },
|
||||
{ segment: "brown", index: 12, isWordLike: true },
|
||||
{ segment: "”", index: 17, isWordLike: false },
|
||||
{ segment: ")", index: 18, isWordLike: false },
|
||||
{ segment: " ", index: 19, isWordLike: false },
|
||||
{ segment: "fox", index: 20, isWordLike: true },
|
||||
{ segment: " ", index: 23, isWordLike: false },
|
||||
{ segment: "can’t", index: 24, isWordLike: true },
|
||||
{ segment: " ", index: 29, isWordLike: false },
|
||||
{ segment: "jump", index: 30, isWordLike: true },
|
||||
{ segment: " ", index: 34, isWordLike: false },
|
||||
{ segment: "32.3", index: 35, isWordLike: true },
|
||||
{ segment: " ", index: 39, isWordLike: false },
|
||||
{ segment: "feet", index: 40, isWordLike: true },
|
||||
{ segment: ",", index: 44, isWordLike: false },
|
||||
{ segment: " ", index: 45, isWordLike: false },
|
||||
{ segment: "right", index: 46, isWordLike: true },
|
||||
{ segment: "?", index: 51, isWordLike: false },
|
||||
];
|
||||
|
||||
let index = 0;
|
||||
for (const segment of segments) {
|
||||
expect(segment.segment).toBe(expectedSegments[index].segment);
|
||||
expect(segment.index).toBe(expectedSegments[index].index);
|
||||
expect(segment.input).toBe(string);
|
||||
index++;
|
||||
}
|
||||
});
|
||||
});
|
||||
|
|
|
@ -215,7 +215,7 @@ static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType c
|
|||
auto it_copy = it;
|
||||
++it_copy;
|
||||
if (it_copy != view.end())
|
||||
next_next_code_point = *it;
|
||||
next_next_code_point = *it_copy;
|
||||
}
|
||||
bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
|
||||
bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
|
||||
|
@ -256,7 +256,7 @@ static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType c
|
|||
if (code_point_is_numeric && next_code_point_is_ah_letter)
|
||||
continue;
|
||||
|
||||
auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(code_point, WBP::Numeric);
|
||||
auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
|
||||
|
||||
// WB11
|
||||
if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
|
||||
|
|
Loading…
Reference in a new issue