ladybird/Libraries/LibWeb/Layout/TextNode.cpp
Sam Atkins 331dafea98 LibWeb/Layout: Bring text for rendering closer to spec
This makes a difference to our score on the innerText tests, but the
actual effect is hard to know: innerText wants us to make adjustments to
the normal algorithm, which are not yet implemented.
2024-12-05 16:32:21 +00:00

649 lines
22 KiB
C++

/*
* Copyright (c) 2018-2021, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2022, Tobias Christiansen <tobyase@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/CharacterTypes.h>
#include <AK/StringBuilder.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/Locale.h>
#include <LibWeb/DOM/Document.h>
#include <LibWeb/Layout/BlockContainer.h>
#include <LibWeb/Layout/InlineFormattingContext.h>
#include <LibWeb/Layout/TextNode.h>
#include <LibWeb/Painting/TextPaintable.h>
namespace Web::Layout {
GC_DEFINE_ALLOCATOR(TextNode);
// Constructs the layout node for `text`, forwarding the owning document and a pointer to the
// DOM text node to the Node base class. No other state is initialized here.
TextNode::TextNode(DOM::Document& document, DOM::Text& text)
: Node(document, &text)
{
}
// Defaulted destructor, defined out of line in this translation unit.
TextNode::~TextNode() = default;
// Returns true when every byte of `string` satisfies is_ascii_space().
// An empty view is vacuously "all whitespace".
static bool is_all_whitespace(StringView string)
{
    for (auto const byte : string) {
        if (is_ascii_space(byte))
            continue;
        return false;
    }
    return true;
}
// https://w3c.github.io/mathml-core/#new-text-transform-values
// Returns a copy of `string` in which every code point that has an entry in the italic
// mapping table is replaced by its italic counterpart; all other code points are kept.
static String apply_math_auto_text_transform(String const& string)
{
    // https://w3c.github.io/mathml-core/#italic-mappings
    // The table is made of a few contiguous runs (where source and target advance in lockstep)
    // plus a handful of irregular entries, so it is expressed as range arithmetic rather than
    // one case label per code point.
    auto map_code_point_to_italic = [](u32 code_point) -> u32 {
        // Irregular entries: code points whose target is not at a fixed offset within a run.
        switch (code_point) {
        case 0x0068:
            // 'h' is the single exception inside the lowercase Latin run.
            return 0x0210E;
        case 0x0131:
            return 0x1D6A4;
        case 0x0237:
            return 0x1D6A5;
        case 0x03F4:
            return 0x1D6F3;
        case 0x2207:
            return 0x1D6FB;
        case 0x2202:
            return 0x1D715;
        case 0x03F5:
            return 0x1D716;
        case 0x03D1:
            return 0x1D717;
        case 0x03F0:
            return 0x1D718;
        case 0x03D5:
            return 0x1D719;
        case 0x03F1:
            return 0x1D71A;
        case 0x03D6:
            return 0x1D71B;
        default:
            break;
        }

        // U+0041..U+005A (uppercase Latin) -> U+1D434..U+1D44D
        if (code_point >= 0x0041 && code_point <= 0x005A)
            return 0x1D434 + (code_point - 0x0041);

        // U+0061..U+007A (lowercase Latin, 'h' handled above) -> U+1D44E..U+1D467
        if (code_point >= 0x0061 && code_point <= 0x007A)
            return 0x1D44E + (code_point - 0x0061);

        // U+0391..U+03A1 (uppercase Greek, first part) -> U+1D6E2..U+1D6F2
        if (code_point >= 0x0391 && code_point <= 0x03A1)
            return 0x1D6E2 + (code_point - 0x0391);

        // U+03A3..U+03A9 (uppercase Greek, second part; U+03A2 has no mapping) -> U+1D6F4..U+1D6FA
        if (code_point >= 0x03A3 && code_point <= 0x03A9)
            return 0x1D6F4 + (code_point - 0x03A3);

        // U+03B1..U+03C9 (lowercase Greek) -> U+1D6FC..U+1D714
        if (code_point >= 0x03B1 && code_point <= 0x03C9)
            return 0x1D6FC + (code_point - 0x03B1);

        return code_point;
    };

    // Pre-size the builder with the input's byte length.
    StringBuilder italicized(string.bytes().size());
    for (auto original_code_point : string.code_points()) {
        auto const mapped = map_code_point_to_italic(original_code_point);
        italicized.append_code_point(mapped);
    }
    return MUST(italicized.to_string());
}
// Applies the CSS `text-transform` value `text_transform` to `string` and returns the result.
// `locale` is forwarded to the case-mapping routines that accept one.
static String apply_text_transform(String const& string, CSS::TextTransform text_transform, Optional<StringView> const& locale)
{
    switch (text_transform) {
    case CSS::TextTransform::None:
        return string;
    case CSS::TextTransform::FullSizeKana:
        // FIXME: Implement this!
        return string;
    case CSS::TextTransform::Uppercase:
        return MUST(string.to_uppercase(locale));
    case CSS::TextTransform::Lowercase:
        return MUST(string.to_lowercase(locale));
    case CSS::TextTransform::Capitalize:
        return MUST(string.to_titlecase(locale, TrailingCodePointTransformation::PreserveExisting));
    case CSS::TextTransform::FullWidth:
        return MUST(string.to_fullwidth());
    case CSS::TextTransform::MathAuto:
        return apply_math_auto_text_transform(string);
    }
    VERIFY_NOT_REACHED();
}
// Predicate used to split text into lines during white space processing.
static bool is_segment_break(u32 code_point)
{
    // FIXME: What code points are segment breaks?
    constexpr u32 line_feed = '\n';
    return line_feed == code_point;
}
// https://drafts.csswg.org/css-text/#white-space-phase-1
// Performs phase 1 ("collapsing and transformation") of CSS white space processing on `input`
// for the given `white_space` value and returns the processed text.
// - normal / nowrap / pre-line: spaces and tabs are collapsed, and segment breaks are either
//   turned into a single space (normal, nowrap) or kept as line feeds (pre-line).
// - pre / pre-wrap: the input is returned unchanged.
// The input is consumed; when there is nothing to do it is moved straight into the result.
static String apply_white_space(String&& input, CSS::WhiteSpace white_space)
{
    // AD-HOC: A fast return to avoid unnecessarily allocating a StringBuilder.
    bool contains_space = false;
    for (auto c : input.code_points()) {
        if (is_ascii_space(c)) {
            contains_space = true;
            break;
        }
    }
    if (!contains_space)
        return move(input);

    // For each inline (including anonymous inlines; see CSS 2.1 § 9.2.2.1 Anonymous inline boxes [CSS2])
    // within an inline formatting context, white space characters are processed as follows prior to line breaking
    // and bidi reordering, ignoring bidi formatting characters (characters with the Bidi_Control property [UAX9])
    // as if they were not there:
    switch (white_space) {
    case CSS::WhiteSpace::Normal:
    case CSS::WhiteSpace::Nowrap:
    case CSS::WhiteSpace::PreLine: {
        // If white-space is set to normal, nowrap, or pre-line, white space characters are considered collapsible
        // and are processed by performing the following steps:
        StringBuilder builder(input.byte_count());

        // TODO: We need to know if each line is first or last, so we gather up all of the lines before processing.
        //       There's probably a smarter way to do this?
        Vector<Utf8View> lines;
        input.code_points().for_each_split_view(is_segment_break, SplitBehavior::KeepEmpty, [&](Utf8View line) {
            lines.append(move(line));
        });

        auto collapsible_spaces_and_tabs = Utf8View(" \t"sv);

        for (size_t i = 0; i < lines.size(); ++i) {
            auto line = lines[i];
            bool const is_first = i == 0;
            bool const is_last = i == lines.size() - 1;

            // 1. Any sequence of collapsible spaces and tabs immediately preceding or following a segment break is removed.
            // NOTE: The first line doesn't follow a segment break, and the last line doesn't precede one.
            if (!is_first)
                line = line.trim(collapsible_spaces_and_tabs, TrimMode::Left);
            if (!is_last)
                line = line.trim(collapsible_spaces_and_tabs, TrimMode::Right);

            // 2. Collapsible segment breaks are transformed for rendering according to the segment break transformation rules.
            // NOTE: Every line except the last one is terminated by a segment break.
            Optional<char> segment_break_character;
            if (!is_last) {
                // https://drafts.csswg.org/css-text/#line-break-transform
                // When white-space is pre, pre-wrap, break-spaces, or pre-line, segment breaks are not collapsible
                // and are instead transformed into a preserved line feed (U+000A).
                // NOTE: Only pre-line is possible here due to the earlier switch
                if (white_space == CSS::WhiteSpace::PreLine) {
                    segment_break_character = '\n';
                }
                // For other values of white-space, segment breaks are collapsible, and are collapsed as follows:
                else {
                    // 1. First, any collapsible segment break immediately following another collapsible segment break is removed.
                    // NOTE: For us, that means skipping empty lines. An empty *first* line is not skipped, because its
                    //       segment break does not follow another one. Without this, a whitespace-only node such as "\n"
                    //       would collapse to nothing instead of to a single space.
                    if (line.is_empty() && !is_first)
                        continue;

                    // 2. Then any remaining segment break is either transformed into a space (U+0020) or removed depending on
                    //    the context before and after the break. The rules for this operation are UA-defined in this level.
                    // TODO: What should we actually be doing here?
                    // NOTE: Whether this space would collapse with a preceding one can only be decided once the content of
                    //       this line has been appended, so that check happens at the bottom of the loop.
                    segment_break_character = ' ';
                }
            }

            // 3. Every collapsible tab is converted to a collapsible space (U+0020).
            // 4. Any collapsible space immediately following another collapsible space — even one outside the boundary of the
            //    inline containing that space, provided both spaces are within the same inline formatting context - is collapsed
            //    to have zero advance width. (It is invisible, but retains its soft wrap opportunity, if any.)
            // NOTE: Seed the state from the builder, so a space at the start of this line collapses with a space that was
            //       produced for the previous line (e.g. by its segment break).
            bool just_saw_space = builder.string_view().ends_with(' ');
            for (auto code_point : line) {
                // "Carriage returns (U+000D) are treated identically to spaces (U+0020) in all respects."
                // - https://drafts.csswg.org/css-text/#white-space-processing
                if (code_point == '\t' || code_point == ' ' || code_point == '\r') {
                    if (just_saw_space)
                        continue;
                    just_saw_space = true;
                    builder.append(' ');
                    continue;
                }
                just_saw_space = false;
                builder.append_code_point(code_point);
            }

            // NOTE: Append the segment break from step 2. A collapsible space is only appended if it wouldn't collapse
            //       with the most recent code point. A preserved line feed never collapses, so consecutive line feeds
            //       (blank lines under pre-line) are all kept.
            if (segment_break_character.has_value()) {
                bool const is_preserved_line_feed = *segment_break_character == '\n';
                if (is_preserved_line_feed || !builder.string_view().ends_with(*segment_break_character))
                    builder.append(*segment_break_character);
            }
        }

        return builder.to_string_without_validation();
    }
    case CSS::WhiteSpace::Pre:
    case CSS::WhiteSpace::PreWrap: {
        // If white-space is set to pre, pre-wrap, or break-spaces, any sequence of spaces is treated as a sequence
        // of non-breaking spaces. However, for pre-wrap, a soft wrap opportunity exists at the end of a sequence of
        // spaces and/or tabs, while for break-spaces, a soft wrap opportunity exists after every space and every tab.
        // FIXME: break-spaces
        // NOTE: Soft wrap opportunities are not relevant here so we just return the input.
        return move(input);
    }
    }
    VERIFY_NOT_REACHED();
}
// Discards the cached rendering text together with the grapheme segmenter that was set up
// for it. text_for_rendering() and grapheme_segmenter() rebuild them on their next call.
void TextNode::invalidate_text_for_rendering()
{
    m_grapheme_segmenter.clear();
    m_text_for_rendering = {};
}
// Returns the text to render for this node, computing and caching it on first use.
String const& TextNode::text_for_rendering() const
{
    if (m_text_for_rendering.has_value())
        return *m_text_for_rendering;

    // Cache miss: compute once and keep the result until it is invalidated.
    m_text_for_rendering = compute_text_for_rendering();
    return *m_text_for_rendering;
}
// Builds the string that is actually laid out and painted for this text node.
// NOTE: This collapses whitespace into a single ASCII space if the CSS white-space property tells us to.
String TextNode::compute_text_for_rendering() const
{
    auto const& text = dom_node();

    // Password inputs render one '*' per code point of the underlying data.
    if (text.is_password_input())
        return MUST(String::repeated('*', text.data().code_points().length()));

    // The language of the parent element (if any) is used as the locale for text-transform.
    auto const* parent_element = text.parent_element();
    auto const maybe_lang = parent_element ? parent_element->lang() : Optional<String> {};
    auto const lang = maybe_lang.has_value() ? maybe_lang.value() : Optional<StringView> {};

    auto data = apply_text_transform(text.data(), computed_values().text_transform(), lang);

    // Nothing to collapse in an empty string.
    if (data.is_empty())
        return data;

    // Editable text keeps its white space as-is, unless the node is uninteresting whitespace.
    bool const is_interesting_editable_text = text.is_editable() && !text.is_uninteresting_whitespace_node();
    if (is_interesting_editable_text)
        return data;

    return apply_white_space(move(data), computed_values().white_space());
}
// Returns this node's grapheme segmenter. On first use it is cloned from the document's
// segmenter and pointed at the rendering text.
Unicode::Segmenter& TextNode::grapheme_segmenter() const
{
    if (m_grapheme_segmenter)
        return *m_grapheme_segmenter;

    m_grapheme_segmenter = document().grapheme_segmenter().clone();
    m_grapheme_segmenter->set_segmented_text(text_for_rendering());
    return *m_grapheme_segmenter;
}
// Sets up iteration over the rendering text of `text_node`.
// `wrap_lines` and `respect_linebreaks` are stored as given; they control whether
// next_without_peek() splits chunks at whitespace and at '\n' respectively.
// The text view, font cascade list and grapheme segmenter are all taken from `text_node`.
TextNode::ChunkIterator::ChunkIterator(TextNode const& text_node, bool wrap_lines, bool respect_linebreaks)
: m_wrap_lines(wrap_lines)
, m_respect_linebreaks(respect_linebreaks)
, m_utf8_view(text_node.text_for_rendering())
, m_font_cascade_list(text_node.computed_values().font_list())
, m_grapheme_segmenter(text_node.grapheme_segmenter())
{
}
// Maps the Unicode bidirectional class of `code_point` onto the coarse text type used for glyph runs.
static Gfx::GlyphRun::TextType text_type_for_code_point(u32 code_point)
{
    auto const bidi_class = Unicode::bidirectional_class(code_point);
    switch (bidi_class) {
    // Left-to-right classes.
    case Unicode::BidiClass::LeftToRight:
    case Unicode::BidiClass::LeftToRightEmbedding:
    case Unicode::BidiClass::LeftToRightIsolate:
    case Unicode::BidiClass::LeftToRightOverride:
        return Gfx::GlyphRun::TextType::Ltr;

    // Right-to-left classes.
    case Unicode::BidiClass::RightToLeft:
    case Unicode::BidiClass::RightToLeftArabic:
    case Unicode::BidiClass::RightToLeftEmbedding:
    case Unicode::BidiClass::RightToLeftIsolate:
    case Unicode::BidiClass::RightToLeftOverride:
        return Gfx::GlyphRun::TextType::Rtl;

    // Classes reported as "common".
    case Unicode::BidiClass::BoundaryNeutral:
    case Unicode::BidiClass::OtherNeutral:
    case Unicode::BidiClass::FirstStrongIsolate:
    case Unicode::BidiClass::PopDirectionalFormat:
    case Unicode::BidiClass::PopDirectionalIsolate:
        return Gfx::GlyphRun::TextType::Common;

    // Classes reported as context dependent.
    case Unicode::BidiClass::WhiteSpaceNeutral:
    case Unicode::BidiClass::BlockSeparator:
    case Unicode::BidiClass::SegmentSeparator:
    case Unicode::BidiClass::CommonNumberSeparator:
    case Unicode::BidiClass::DirNonSpacingMark:
    case Unicode::BidiClass::ArabicNumber:
    case Unicode::BidiClass::EuropeanNumber:
    case Unicode::BidiClass::EuropeanNumberSeparator:
    case Unicode::BidiClass::EuropeanNumberTerminator:
        return Gfx::GlyphRun::TextType::ContextDependent;

    default:
        VERIFY_NOT_REACHED();
    }
}
// Returns the next chunk, or an empty Optional when the text is exhausted.
// Chunks that peek() already produced are handed out first, in order.
Optional<TextNode::Chunk> TextNode::ChunkIterator::next()
{
    if (m_peek_queue.is_empty())
        return next_without_peek();
    return m_peek_queue.take_first();
}
// Returns the chunk `count` positions ahead without consuming it (0 is the chunk next() would return).
// Returns an empty Optional if the text ends before that position.
Optional<TextNode::Chunk> TextNode::ChunkIterator::peek(size_t count)
{
    // Keep producing chunks until the look-ahead queue has an entry at index `count`.
    while (count >= m_peek_queue.size()) {
        auto upcoming = next_without_peek();
        if (!upcoming.has_value())
            return {};
        m_peek_queue.append(*upcoming);
    }
    return m_peek_queue[count];
}
// Produces the next chunk directly from the text, bypassing the peek queue.
// Starting at m_current_index, the text is scanned grapheme by grapheme and a chunk is committed
// as soon as one of the split conditions below is met: a tab, a font change, a preserved newline
// (when m_respect_linebreaks is set), or a text type change / ASCII whitespace (when m_wrap_lines
// is set). Returns an empty Optional once the whole text has been consumed.
Optional<TextNode::Chunk> TextNode::ChunkIterator::next_without_peek()
{
if (m_current_index >= m_utf8_view.byte_length())
return {};
// Decodes the code point that starts at the current byte offset.
auto current_code_point = [this]() {
return *m_utf8_view.iterator_at_byte_offset_without_validation(m_current_index);
};
// Byte offset of the next grapheme boundary after the current index, or the end of the text
// if the segmenter reports none.
auto next_grapheme_boundary = [this]() {
return m_grapheme_segmenter.next_boundary(m_current_index).value_or(m_utf8_view.byte_length());
};
auto code_point = current_code_point();
auto start_of_chunk = m_current_index;
// Font and text type are fixed by the first code point of the chunk; a change in either ends the chunk.
Gfx::Font const& font = m_font_cascade_list.font_for_code_point(code_point);
auto text_type = text_type_for_code_point(code_point);
auto broken_on_tab = false;
while (m_current_index < m_utf8_view.byte_length()) {
code_point = current_code_point();
if (code_point == '\t') {
// Text accumulated before the tab is emitted first; the tab is handled on the next call.
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, broken_on_tab, font, text_type); result.has_value())
return result.release_value();
broken_on_tab = true;
// consume any consecutive tabs
while (m_current_index < m_utf8_view.byte_length() && current_code_point() == '\t') {
m_current_index = next_grapheme_boundary();
}
// NOTE(review): `code_point` is not re-read after the tabs were consumed, so the checks below
// still operate on '\t' rather than on the code point now at m_current_index.
// Confirm this is intended (e.g. for a tab directly followed by '\n' when not wrapping lines).
}
// A different font is needed for this code point: end the chunk before it.
if (&font != &m_font_cascade_list.font_for_code_point(code_point)) {
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, broken_on_tab, font, text_type); result.has_value())
return result.release_value();
}
if (m_respect_linebreaks && code_point == '\n') {
// Newline encountered, and we're supposed to preserve them.
// If we have accumulated some code points in the current chunk, commit them now and continue with the newline next time.
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, broken_on_tab, font, text_type); result.has_value())
return result.release_value();
// Otherwise, commit the newline!
m_current_index = next_grapheme_boundary();
auto result = try_commit_chunk(start_of_chunk, m_current_index, true, broken_on_tab, font, text_type);
VERIFY(result.has_value());
return result.release_value();
}
if (m_wrap_lines) {
// A change of text type (see text_type_for_code_point()) also ends the chunk when wrapping.
if (text_type != text_type_for_code_point(code_point)) {
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, broken_on_tab, font, text_type); result.has_value()) {
return result.release_value();
}
}
if (is_ascii_space(code_point)) {
// Whitespace encountered, and we're allowed to break on whitespace.
// If we have accumulated some code points in the current chunk, commit them now and continue with the whitespace next time.
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, broken_on_tab, font, text_type); result.has_value()) {
return result.release_value();
}
// Otherwise, commit the whitespace!
m_current_index = next_grapheme_boundary();
if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, broken_on_tab, font, text_type); result.has_value())
return result.release_value();
// Only reached if the commit above produced nothing, i.e. the index did not move past start_of_chunk.
continue;
}
}
// No split condition applied: extend the current chunk by one grapheme.
m_current_index
= next_grapheme_boundary();
}
if (start_of_chunk != m_utf8_view.byte_length()) {
// Try to output whatever's left at the end of the text node.
if (auto result = try_commit_chunk(start_of_chunk, m_utf8_view.byte_length(), false, broken_on_tab, font, text_type); result.has_value())
return result.release_value();
}
return {};
}
// Builds a Chunk covering the byte range [start, end) of the text, tagged with the given flags,
// font and text type. Returns an empty Optional when the range is empty.
Optional<TextNode::Chunk> TextNode::ChunkIterator::try_commit_chunk(size_t start, size_t end, bool has_breaking_newline, bool has_breaking_tab, Gfx::Font const& font, Gfx::GlyphRun::TextType text_type) const
{
    auto const byte_length = end - start;
    if (byte_length == 0)
        return {};

    auto const chunk_view = m_utf8_view.substring_view(start, byte_length);
    bool const whitespace_only = is_all_whitespace(chunk_view.as_string());

    return Chunk {
        .view = chunk_view,
        .font = font,
        .start = start,
        .length = byte_length,
        .has_breaking_newline = has_breaking_newline,
        .has_breaking_tab = has_breaking_tab,
        .is_all_whitespace = whitespace_only,
        .text_type = text_type,
    };
}
// Creates the paintable for this node, handing it the (cached) rendering text.
GC::Ptr<Painting::Paintable> TextNode::create_paintable() const
{
    auto const& rendered_text = text_for_rendering();
    return Painting::TextPaintable::create(*this, rendered_text);
}
}