LibWeb: Iterate over text chunks using a grapheme-aware segmenter
Our current text iterator is not aware of multi-code-point graphemes. Instead of simply advancing an iterator one code point at a time, use our Unicode grapheme segmenter to break the text into fragments at grapheme boundaries.
This commit is contained in:
parent
aef85a83bd
commit
f0105b473b
Notes:
github-actions[bot]
2024-09-21 06:58:43 +00:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/f0105b473b9 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1465
5 changed files with 84 additions and 32 deletions
34
Tests/LibWeb/Layout/expected/multi-code-point-graphemes.txt
Normal file
34
Tests/LibWeb/Layout/expected/multi-code-point-graphemes.txt
Normal file
|
@ -0,0 +1,34 @@
|
|||
Viewport <#document> at (0,0) content-size 800x600 children: not-inline
|
||||
BlockContainer <html> at (0,0) content-size 800x600 [BFC] children: not-inline
|
||||
BlockContainer <body> at (8,16) content-size 784x83 children: not-inline
|
||||
BlockContainer <p> at (8,16) content-size 784x17 children: inline
|
||||
frag 0 from TextNode start: 0, length: 11, rect: [8,16 20.3125x17] baseline: 13.296875
|
||||
"🧑‍🚒"
|
||||
TextNode <#text>
|
||||
BlockContainer <(anonymous)> at (8,49) content-size 784x0 children: inline
|
||||
TextNode <#text>
|
||||
BlockContainer <p> at (8,49) content-size 784x17 children: inline
|
||||
frag 0 from TextNode start: 0, length: 13, rect: [8,49 20.3125x17] baseline: 13.296875
|
||||
"🏴‍☠️"
|
||||
TextNode <#text>
|
||||
BlockContainer <(anonymous)> at (8,82) content-size 784x0 children: inline
|
||||
TextNode <#text>
|
||||
BlockContainer <p> at (8,82) content-size 784x17 children: inline
|
||||
frag 0 from TextNode start: 0, length: 25, rect: [8,82 20.3125x17] baseline: 13.296875
|
||||
"🧑‍🧑‍🧒‍🧒"
|
||||
TextNode <#text>
|
||||
BlockContainer <(anonymous)> at (8,115) content-size 784x0 children: inline
|
||||
TextNode <#text>
|
||||
|
||||
ViewportPaintable (Viewport<#document>) [0,0 800x600]
|
||||
PaintableWithLines (BlockContainer<HTML>) [0,0 800x600]
|
||||
PaintableWithLines (BlockContainer<BODY>) [8,16 784x83] overflow: [8,16 784x99]
|
||||
PaintableWithLines (BlockContainer<P>) [8,16 784x17]
|
||||
TextPaintable (TextNode<#text>)
|
||||
PaintableWithLines (BlockContainer(anonymous)) [8,49 784x0]
|
||||
PaintableWithLines (BlockContainer<P>) [8,49 784x17]
|
||||
TextPaintable (TextNode<#text>)
|
||||
PaintableWithLines (BlockContainer(anonymous)) [8,82 784x0]
|
||||
PaintableWithLines (BlockContainer<P>) [8,82 784x17]
|
||||
TextPaintable (TextNode<#text>)
|
||||
PaintableWithLines (BlockContainer(anonymous)) [8,115 784x0]
|
|
@ -0,0 +1,3 @@
|
|||
<p>🧑‍🚒</p>
|
||||
<p>🏴‍☠️</p>
|
||||
<p>🧑‍🧑‍🧒‍🧒</p>
|
Binary file not shown.
Before: Size: 20 KiB | After: Size: 20 KiB (image width and height were not captured)
|
@ -391,13 +391,14 @@ void TextNode::compute_text_for_rendering()
|
|||
m_text_for_rendering = MUST(builder.to_string());
|
||||
}
|
||||
|
||||
TextNode::ChunkIterator::ChunkIterator(StringView text, bool wrap_lines, bool respect_linebreaks, Gfx::FontCascadeList const& font_cascade_list)
|
||||
TextNode::ChunkIterator::ChunkIterator(String const& text, bool wrap_lines, bool respect_linebreaks, Gfx::FontCascadeList const& font_cascade_list)
|
||||
: m_wrap_lines(wrap_lines)
|
||||
, m_respect_linebreaks(respect_linebreaks)
|
||||
, m_utf8_view(text)
|
||||
, m_iterator(m_utf8_view.begin())
|
||||
, m_font_cascade_list(font_cascade_list)
|
||||
, m_segmenter(Unicode::Segmenter::create(Unicode::SegmenterGranularity::Grapheme))
|
||||
{
|
||||
m_segmenter->set_segmented_text(text);
|
||||
}
|
||||
|
||||
static Gfx::GlyphRun::TextType text_type_for_code_point(u32 code_point)
|
||||
|
@ -462,75 +463,85 @@ Optional<TextNode::Chunk> TextNode::ChunkIterator::peek(size_t count)
|
|||
|
||||
Optional<TextNode::Chunk> TextNode::ChunkIterator::next_without_peek()
|
||||
{
|
||||
if (m_iterator == m_utf8_view.end())
|
||||
if (m_current_index >= m_utf8_view.byte_length())
|
||||
return {};
|
||||
|
||||
auto start_of_chunk = m_iterator;
|
||||
auto current_code_point = [this]() {
|
||||
return *m_utf8_view.iterator_at_byte_offset_without_validation(m_current_index);
|
||||
};
|
||||
auto next_grapheme_boundary = [this]() {
|
||||
return m_segmenter->next_boundary(m_current_index).value_or(m_utf8_view.byte_length());
|
||||
};
|
||||
|
||||
Gfx::Font const& font = m_font_cascade_list.font_for_code_point(*m_iterator);
|
||||
auto text_type = text_type_for_code_point(*m_iterator);
|
||||
while (m_iterator != m_utf8_view.end()) {
|
||||
if (&font != &m_font_cascade_list.font_for_code_point(*m_iterator)) {
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
|
||||
auto code_point = current_code_point();
|
||||
auto start_of_chunk = m_current_index;
|
||||
|
||||
Gfx::Font const& font = m_font_cascade_list.font_for_code_point(code_point);
|
||||
auto text_type = text_type_for_code_point(code_point);
|
||||
|
||||
while (m_current_index < m_utf8_view.byte_length()) {
|
||||
code_point = current_code_point();
|
||||
|
||||
if (&font != &m_font_cascade_list.font_for_code_point(code_point)) {
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value())
|
||||
return result.release_value();
|
||||
}
|
||||
|
||||
if (m_respect_linebreaks && *m_iterator == '\n') {
|
||||
if (m_respect_linebreaks && code_point == '\n') {
|
||||
// Newline encountered, and we're supposed to preserve them.
|
||||
// If we have accumulated some code points in the current chunk, commit them now and continue with the newline next time.
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value())
|
||||
return result.release_value();
|
||||
|
||||
// Otherwise, commit the newline!
|
||||
++m_iterator;
|
||||
auto result = try_commit_chunk(start_of_chunk, m_iterator, true, font, text_type);
|
||||
m_current_index = next_grapheme_boundary();
|
||||
auto result = try_commit_chunk(start_of_chunk, m_current_index, true, font, text_type);
|
||||
VERIFY(result.has_value());
|
||||
return result.release_value();
|
||||
}
|
||||
|
||||
if (m_wrap_lines) {
|
||||
if (text_type != text_type_for_code_point(*m_iterator)) {
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
|
||||
if (text_type != text_type_for_code_point(code_point)) {
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value()) {
|
||||
return result.release_value();
|
||||
}
|
||||
}
|
||||
|
||||
if (is_ascii_space(*m_iterator)) {
|
||||
if (is_ascii_space(code_point)) {
|
||||
// Whitespace encountered, and we're allowed to break on whitespace.
|
||||
// If we have accumulated some code points in the current chunk, commit them now and continue with the whitespace next time.
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value()) {
|
||||
return result.release_value();
|
||||
}
|
||||
|
||||
// Otherwise, commit the whitespace!
|
||||
++m_iterator;
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
|
||||
m_current_index = next_grapheme_boundary();
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value())
|
||||
return result.release_value();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
++m_iterator;
|
||||
m_current_index = next_grapheme_boundary();
|
||||
}
|
||||
|
||||
if (start_of_chunk != m_utf8_view.end()) {
|
||||
if (start_of_chunk != m_utf8_view.byte_length()) {
|
||||
// Try to output whatever's left at the end of the text node.
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_utf8_view.end(), false, font, text_type); result.has_value())
|
||||
if (auto result = try_commit_chunk(start_of_chunk, m_utf8_view.byte_length(), false, font, text_type); result.has_value())
|
||||
return result.release_value();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
Optional<TextNode::Chunk> TextNode::ChunkIterator::try_commit_chunk(Utf8View::Iterator const& start, Utf8View::Iterator const& end, bool has_breaking_newline, Gfx::Font const& font, Gfx::GlyphRun::TextType text_type) const
|
||||
Optional<TextNode::Chunk> TextNode::ChunkIterator::try_commit_chunk(size_t start, size_t end, bool has_breaking_newline, Gfx::Font const& font, Gfx::GlyphRun::TextType text_type) const
|
||||
{
|
||||
auto byte_offset = m_utf8_view.byte_offset_of(start);
|
||||
auto byte_length = m_utf8_view.byte_offset_of(end) - byte_offset;
|
||||
|
||||
if (byte_length > 0) {
|
||||
auto chunk_view = m_utf8_view.substring_view(byte_offset, byte_length);
|
||||
if (auto byte_length = end - start; byte_length > 0) {
|
||||
auto chunk_view = m_utf8_view.substring_view(start, byte_length);
|
||||
return Chunk {
|
||||
.view = chunk_view,
|
||||
.font = font,
|
||||
.start = byte_offset,
|
||||
.start = start,
|
||||
.length = byte_length,
|
||||
.has_breaking_newline = has_breaking_newline,
|
||||
.is_all_whitespace = is_all_whitespace(chunk_view.as_string()),
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/Segmenter.h>
|
||||
#include <LibWeb/DOM/Text.h>
|
||||
#include <LibWeb/Layout/Node.h>
|
||||
|
||||
|
@ -38,20 +39,23 @@ public:
|
|||
|
||||
class ChunkIterator {
|
||||
public:
|
||||
ChunkIterator(StringView text, bool wrap_lines, bool respect_linebreaks, Gfx::FontCascadeList const&);
|
||||
ChunkIterator(String const& text, bool wrap_lines, bool respect_linebreaks, Gfx::FontCascadeList const&);
|
||||
|
||||
Optional<Chunk> next();
|
||||
Optional<Chunk> peek(size_t);
|
||||
|
||||
private:
|
||||
Optional<Chunk> next_without_peek();
|
||||
Optional<Chunk> try_commit_chunk(Utf8View::Iterator const& start, Utf8View::Iterator const& end, bool has_breaking_newline, Gfx::Font const&, Gfx::GlyphRun::TextType) const;
|
||||
Optional<Chunk> try_commit_chunk(size_t start, size_t end, bool has_breaking_newline, Gfx::Font const&, Gfx::GlyphRun::TextType) const;
|
||||
|
||||
bool const m_wrap_lines;
|
||||
bool const m_respect_linebreaks;
|
||||
Utf8View m_utf8_view;
|
||||
Utf8View::Iterator m_iterator;
|
||||
Gfx::FontCascadeList const& m_font_cascade_list;
|
||||
|
||||
NonnullOwnPtr<Unicode::Segmenter> m_segmenter;
|
||||
size_t m_current_index { 0 };
|
||||
|
||||
Vector<Chunk> m_peek_queue;
|
||||
};
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue