LibGUI: Make the TextEditor widget store UTF-32 codepoints

A TextDocumentLine is now backed by a non-null-terminated sequence of
Unicode codepoints encoded as UTF-32 (one u32 per codepoint).

This makes it possible to view and edit arbitrary Unicode text without
strange cursor and selection behavior. You can freely copy and paste
emojis between TextEditor and Terminal now. :^)

Storing UTF-32 is quite space-inefficient, but we should be able to
use the same optimization techniques as LibVT does to reduce the memory
overhead in the typical case where most text is ASCII.

There are a lot of things that can be cleaned up around this code,
but this works well enough that I'm pretty happy with it.
This commit is contained in:
Andreas Kling 2020-05-17 20:33:06 +02:00
parent 0272bbb4fb
commit 1bef057ec3
Notes: sideshowbarker 2024-07-19 06:33:09 +09:00
3 changed files with 105 additions and 84 deletions

View file

@ -26,6 +26,7 @@
#include <AK/Badge.h>
#include <AK/StringBuilder.h>
#include <AK/Utf8View.h>
#include <LibCore/Timer.h>
#include <LibGUI/TextDocument.h>
#include <LibGUI/TextEditor.h>
@ -86,12 +87,20 @@ void TextDocument::set_text(const StringView& text)
size_t TextDocumentLine::first_non_whitespace_column() const
{
for (size_t i = 0; i < length(); ++i) {
if (!isspace(m_text[i]))
auto codepoint = codepoints()[i];
if (!isspace(codepoint))
return i;
}
return length();
}
// Returns this line's contents as a String.
// The line's codepoint view is handed to a StringBuilder, whose Utf32View
// overload is relied upon to perform the actual encoding.
String TextDocumentLine::to_utf8() const
{
    StringBuilder utf8_builder;
    utf8_builder.append(view());
    return utf8_builder.to_string();
}
TextDocumentLine::TextDocumentLine(TextDocument& document)
{
clear(document);
@ -105,50 +114,44 @@ TextDocumentLine::TextDocumentLine(TextDocument& document, const StringView& tex
void TextDocumentLine::clear(TextDocument& document)
{
m_text.clear();
m_text.append(0);
document.update_views({});
}
void TextDocumentLine::set_text(TextDocument& document, const StringView& text)
{
if (text.length() == length() && !memcmp(text.characters_without_null_termination(), characters(), length()))
return;
if (text.is_empty()) {
clear(document);
return;
}
m_text.resize((int)text.length() + 1);
memcpy(m_text.data(), text.characters_without_null_termination(), text.length());
m_text.last() = 0;
m_text.clear();
Utf8View utf8_view(text);
for (auto codepoint : utf8_view)
m_text.append(codepoint);
document.update_views({});
}
void TextDocumentLine::append(TextDocument& document, const char* characters, size_t length)
void TextDocumentLine::append(TextDocument& document, const u32* codepoints, size_t length)
{
int old_length = m_text.size() - 1;
m_text.resize(m_text.size() + length);
memcpy(m_text.data() + old_length, characters, length);
m_text.last() = 0;
m_text.append(codepoints, length);
document.update_views({});
}
void TextDocumentLine::append(TextDocument& document, char ch)
void TextDocumentLine::append(TextDocument& document, u32 codepoint)
{
insert(document, length(), ch);
insert(document, length(), codepoint);
}
void TextDocumentLine::prepend(TextDocument& document, char ch)
void TextDocumentLine::prepend(TextDocument& document, u32 codepoint)
{
insert(document, 0, ch);
insert(document, 0, codepoint);
}
void TextDocumentLine::insert(TextDocument& document, size_t index, char ch)
void TextDocumentLine::insert(TextDocument& document, size_t index, u32 codepoint)
{
if (index == length()) {
m_text.last() = ch;
m_text.append(0);
m_text.append(codepoint);
} else {
m_text.insert((int)index, move(ch));
m_text.insert(index, codepoint);
}
document.update_views({});
}
@ -157,17 +160,29 @@ void TextDocumentLine::remove(TextDocument& document, size_t index)
{
if (index == length()) {
m_text.take_last();
m_text.last() = 0;
} else {
m_text.remove((int)index);
m_text.remove(index);
}
document.update_views({});
}
// Removes `length` codepoints starting at column `start` from this line,
// then notifies the document's views.
//
// Preconditions: the half-open range [start, start + length) must lie
// entirely within the line. Both bounds are asserted; the second check is
// written as a subtraction so it cannot overflow for large `length`.
void TextDocumentLine::remove_range(TextDocument& document, size_t start, size_t length)
{
    ASSERT(start <= m_text.size());
    ASSERT(length <= m_text.size() - start);

    // Rebuild the line from the codepoints before and after the removed range.
    Vector<u32> new_data;
    new_data.ensure_capacity(m_text.size() - length);
    for (size_t i = 0; i < start; ++i)
        new_data.append(m_text[i]);
    for (size_t i = start + length; i < m_text.size(); ++i)
        new_data.append(m_text[i]);

    m_text = move(new_data);
    document.update_views({});
}
void TextDocumentLine::truncate(TextDocument& document, size_t length)
{
m_text.resize((int)length + 1);
m_text.last() = 0;
m_text.resize(length);
document.update_views({});
}
@ -251,7 +266,7 @@ String TextDocument::text_in_range(const TextRange& a_range) const
auto& line = this->line(i);
size_t selection_start_column_on_line = range.start().line() == i ? range.start().column() : 0;
size_t selection_end_column_on_line = range.end().line() == i ? range.end().column() : line.length();
builder.append(line.characters() + selection_start_column_on_line, selection_end_column_on_line - selection_start_column_on_line);
builder.append(Utf32View(line.codepoints() + selection_start_column_on_line, selection_end_column_on_line - selection_start_column_on_line));
if (i != range.end().line())
builder.append('\n');
}
@ -259,13 +274,13 @@ String TextDocument::text_in_range(const TextRange& a_range) const
return builder.to_string();
}
char TextDocument::character_at(const TextPosition& position) const
u32 TextDocument::codepoint_at(const TextPosition& position) const
{
ASSERT(position.line() < line_count());
auto& line = this->line(position.line());
if (position.column() == line.length())
return '\n';
return line.characters()[position.column()];
return line.codepoints()[position.column()];
}
TextPosition TextDocument::next_position_after(const TextPosition& position, SearchShouldWrap should_wrap) const
@ -310,8 +325,9 @@ TextRange TextDocument::find_next(const StringView& needle, const TextPosition&
size_t needle_index = 0;
do {
auto ch = character_at(position);
if (ch == needle[needle_index]) {
auto ch = codepoint_at(position);
// FIXME: This is not the right way to use a Unicode needle!
if (ch == (u32)needle[needle_index]) {
if (needle_index == 0)
start_of_potential_match = position;
++needle_index;
@ -341,8 +357,9 @@ TextRange TextDocument::find_previous(const StringView& needle, const TextPositi
size_t needle_index = needle.length() - 1;
do {
auto ch = character_at(position);
if (ch == needle[needle_index]) {
auto ch = codepoint_at(position);
// FIXME: This is not the right way to use a Unicode needle!
if (ch == (u32)needle[needle_index]) {
if (needle_index == needle.length() - 1)
end_of_potential_match = position;
if (needle_index == 0)
@ -481,26 +498,27 @@ void TextDocument::update_undo_timer()
TextPosition TextDocument::insert_at(const TextPosition& position, const StringView& text, const Client* client)
{
TextPosition cursor = position;
for (size_t i = 0; i < text.length(); ++i)
cursor = insert_at(cursor, text[i], client);
Utf8View utf8_view(text);
for (auto codepoint : utf8_view)
cursor = insert_at(cursor, codepoint, client);
return cursor;
}
TextPosition TextDocument::insert_at(const TextPosition& position, char ch, const Client* client)
TextPosition TextDocument::insert_at(const TextPosition& position, u32 codepoint, const Client* client)
{
bool automatic_indentation_enabled = client ? client->is_automatic_indentation_enabled() : false;
size_t m_soft_tab_width = client ? client->soft_tab_width() : 4;
bool at_head = position.column() == 0;
bool at_tail = position.column() == line(position.line()).length();
if (ch == '\n') {
if (codepoint == '\n') {
if (at_tail || at_head) {
String new_line_contents;
if (automatic_indentation_enabled && at_tail) {
size_t leading_spaces = 0;
auto& old_line = lines()[position.line()];
for (size_t i = 0; i < old_line.length(); ++i) {
if (old_line.characters()[i] == ' ')
if (old_line.codepoints()[i] == ' ')
++leading_spaces;
else
break;
@ -510,25 +528,25 @@ TextPosition TextDocument::insert_at(const TextPosition& position, char ch, cons
}
size_t row = position.line();
Vector<char> line_content;
Vector<u32> line_content;
for (size_t i = position.column(); i < line(row).length(); i++)
line_content.append(line(row).characters()[i]);
line_content.append(line(row).codepoints()[i]);
insert_line(position.line() + (at_tail ? 1 : 0), make<TextDocumentLine>(*this, new_line_contents));
notify_did_change();
return { position.line() + 1, line(position.line() + 1).length() };
}
auto new_line = make<TextDocumentLine>(*this);
new_line->append(*this, line(position.line()).characters() + position.column(), line(position.line()).length() - position.column());
new_line->append(*this, line(position.line()).codepoints() + position.column(), line(position.line()).length() - position.column());
Vector<char> line_content;
Vector<u32> line_content;
for (size_t i = 0; i < new_line->length(); i++)
line_content.append(new_line->characters()[i]);
line_content.append(new_line->codepoints()[i]);
line(position.line()).truncate(*this, position.column());
insert_line(position.line() + 1, move(new_line));
notify_did_change();
return { position.line() + 1, 0 };
}
if (ch == '\t') {
if (codepoint == '\t') {
size_t next_soft_tab_stop = ((position.column() + m_soft_tab_width) / m_soft_tab_width) * m_soft_tab_width;
size_t spaces_to_insert = next_soft_tab_stop - position.column();
for (size_t i = 0; i < spaces_to_insert; ++i) {
@ -537,7 +555,7 @@ TextPosition TextDocument::insert_at(const TextPosition& position, char ch, cons
notify_did_change();
return { position.line(), next_soft_tab_stop };
}
line(position.line()).insert(*this, position.column(), ch);
line(position.line()).insert(*this, position.column(), codepoint);
notify_did_change();
return { position.line(), position.column() + 1 };
}
@ -563,25 +581,16 @@ void TextDocument::remove(const TextRange& unnormalized_range)
if (whole_line_is_selected) {
line.clear(*this);
} else {
auto before_selection = String(line.characters(), line.length()).substring(0, range.start().column());
auto after_selection = String(line.characters(), line.length()).substring(range.end().column(), line.length() - range.end().column());
StringBuilder builder(before_selection.length() + after_selection.length());
builder.append(before_selection);
builder.append(after_selection);
line.set_text(*this, builder.to_string());
line.remove_range(*this, range.start().column(), range.end().column() - range.start().column());
}
} else {
// Delete across a newline, merging lines.
ASSERT(range.start().line() == range.end().line() - 1);
auto& first_line = line(range.start().line());
auto& second_line = line(range.end().line());
auto before_selection = String(first_line.characters(), first_line.length()).substring(0, range.start().column());
auto after_selection = String(second_line.characters(), second_line.length()).substring(range.end().column(), second_line.length() - range.end().column());
StringBuilder builder(before_selection.length() + after_selection.length());
builder.append(before_selection);
builder.append(after_selection);
first_line.set_text(*this, builder.to_string());
first_line.clear(*this);
first_line.append(*this, first_line.codepoints(), range.start().column());
first_line.append(*this, second_line.codepoints() + range.end().column(), second_line.length() - range.end().column());
remove_line(range.end().line());
}

View file

@ -31,6 +31,7 @@
#include <AK/NonnullRefPtr.h>
#include <AK/Optional.h>
#include <AK/RefCounted.h>
#include <AK/Utf32View.h>
#include <LibCore/Forward.h>
#include <LibGUI/Command.h>
#include <LibGUI/Forward.h>
@ -112,7 +113,7 @@ public:
TextPosition next_position_after(const TextPosition&, SearchShouldWrap = SearchShouldWrap::Yes) const;
TextPosition previous_position_before(const TextPosition&, SearchShouldWrap = SearchShouldWrap::Yes) const;
char character_at(const TextPosition&) const;
u32 codepoint_at(const TextPosition&) const;
TextRange range_for_entire_line(size_t line_index) const;
@ -129,7 +130,7 @@ public:
void notify_did_change();
void set_all_cursors(const TextPosition&);
TextPosition insert_at(const TextPosition&, char, const Client* = nullptr);
TextPosition insert_at(const TextPosition&, u32, const Client* = nullptr);
TextPosition insert_at(const TextPosition&, const StringView&, const Client* = nullptr);
void remove(const TextRange&);
@ -153,22 +154,26 @@ public:
explicit TextDocumentLine(TextDocument&);
explicit TextDocumentLine(TextDocument&, const StringView&);
StringView view() const { return { characters(), (size_t)length() }; }
const char* characters() const { return m_text.data(); }
size_t length() const { return m_text.size() - 1; }
String to_utf8() const;
Utf32View view() const { return { codepoints(), length() }; }
const u32* codepoints() const { return m_text.data(); }
size_t length() const { return m_text.size(); }
void set_text(TextDocument&, const StringView&);
void append(TextDocument&, char);
void prepend(TextDocument&, char);
void insert(TextDocument&, size_t index, char);
void append(TextDocument&, u32);
void prepend(TextDocument&, u32);
void insert(TextDocument&, size_t index, u32);
void remove(TextDocument&, size_t index);
void append(TextDocument&, const char*, size_t);
void append(TextDocument&, const u32*, size_t);
void truncate(TextDocument&, size_t length);
void clear(TextDocument&);
void remove_range(TextDocument&, size_t start, size_t length);
size_t first_non_whitespace_column() const;
private:
// NOTE: This vector is null terminated.
Vector<char> m_text;
Vector<u32> m_text;
};
class TextDocumentUndoCommand : public Command {

View file

@ -104,9 +104,6 @@ void TextEditor::create_actions()
void TextEditor::set_text(const StringView& text)
{
if (is_single_line() && text.length() == line(0).length() && !memcmp(text.characters_without_null_termination(), line(0).characters(), text.length()))
return;
m_selection.clear();
document().set_text(text);
@ -170,7 +167,7 @@ TextPosition TextEditor::text_position_at(const Gfx::Point& a_position) const
else
column_index = (position.x() + glyph_width() / 2) / glyph_width();
if (is_line_wrapping_enabled()) {
for_each_visual_line(line_index, [&](const Gfx::Rect& rect, const StringView&, size_t start_of_line) {
for_each_visual_line(line_index, [&](const Gfx::Rect& rect, auto&, size_t start_of_line) {
if (rect.contains_vertically(position.y())) {
column_index += start_of_line;
return IterationDecision::Break;
@ -209,13 +206,13 @@ void TextEditor::doubleclick_event(MouseEvent& event)
if (!document().has_spans()) {
while (start.column() > 0) {
if (isspace(line.characters()[start.column() - 1]))
if (isspace(line.codepoints()[start.column() - 1]))
break;
start.set_column(start.column() - 1);
}
while (end.column() < line.length()) {
if (isspace(line.characters()[end.column()]))
if (isspace(line.codepoints()[end.column()]))
break;
end.set_column(end.column() + 1);
}
@ -432,7 +429,7 @@ void TextEditor::paint_event(PaintEvent& event)
size_t selection_end_column_within_line = selection.end().line() == line_index ? selection.end().column() : line.length();
size_t visual_line_index = 0;
for_each_visual_line(line_index, [&](const Gfx::Rect& visual_line_rect, const StringView& visual_line_text, size_t start_of_visual_line) {
for_each_visual_line(line_index, [&](const Gfx::Rect& visual_line_rect, auto& visual_line_text, size_t start_of_visual_line) {
if (is_multi_line() && line_index == m_cursor.line())
painter.fill_rect(visual_line_rect, widget_background_color.darkened(0.9f));
#ifdef DEBUG_TEXTEDITOR
@ -503,8 +500,8 @@ void TextEditor::paint_event(PaintEvent& event)
painter.fill_rect(selection_rect, background_color);
StringView visual_selected_text {
visual_line_text.characters_without_null_termination() + start_of_selection_within_visual_line,
Utf32View visual_selected_text {
visual_line_text.codepoints() + start_of_selection_within_visual_line,
end_of_selection_within_visual_line - start_of_selection_within_visual_line
};
@ -605,6 +602,15 @@ void TextEditor::move_selected_lines_down()
update();
}
// Compares exactly the first `n` codepoints of `s1` and `s2`.
//
// Returns -1 if the first differing codepoint is smaller in `s1`, 1 if it is
// larger, and 0 if all `n` codepoints are equal. Unlike strcmp(), a zero
// codepoint does not end the comparison; only `n` bounds it.
//
// NOTE: A result of 0 only means the first `n` codepoints match. A caller that
// passes the shorter of two lengths must break the tie on length itself if it
// wants a prefix to order before the longer sequence.
int strcmp_utf32(const u32* s1, const u32* s2, size_t n)
{
    for (size_t i = 0; i < n; ++i) {
        const u32 lhs = s1[i];
        const u32 rhs = s2[i];
        if (lhs == rhs)
            continue;
        if (lhs < rhs)
            return -1;
        return 1;
    }
    return 0;
}
void TextEditor::sort_selected_lines()
{
if (is_readonly())
@ -623,7 +629,7 @@ void TextEditor::sort_selected_lines()
auto end = lines.begin() + (int)last_line + 1;
quick_sort(start, end, [](auto& a, auto& b) {
return strcmp(a.characters(), b.characters()) < 0;
return strcmp_utf32(a.codepoints(), b.codepoints(), min(a.length(), b.length())) < 0;
});
did_change();
@ -925,7 +931,7 @@ int TextEditor::content_x_for_position(const TextPosition& position) const
int x_offset = -1;
switch (m_text_alignment) {
case Gfx::TextAlignment::CenterLeft:
for_each_visual_line(position.line(), [&](const Gfx::Rect&, const StringView& view, size_t start_of_visual_line) {
for_each_visual_line(position.line(), [&](const Gfx::Rect&, auto& view, size_t start_of_visual_line) {
if (position.column() >= start_of_visual_line && ((position.column() - start_of_visual_line) <= view.length())) {
x_offset = (position.column() - start_of_visual_line) * glyph_width();
return IterationDecision::Break;
@ -958,7 +964,7 @@ Gfx::Rect TextEditor::content_rect_for_position(const TextPosition& position) co
}
Gfx::Rect rect;
for_each_visual_line(position.line(), [&](const Gfx::Rect& visual_line_rect, const StringView& view, size_t start_of_visual_line) {
for_each_visual_line(position.line(), [&](const Gfx::Rect& visual_line_rect, auto& view, size_t start_of_visual_line) {
if (position.column() >= start_of_visual_line && ((position.column() - start_of_visual_line) <= view.length())) {
// NOTE: We have to subtract the horizontal padding here since it's part of the visual line rect
// *and* included in what we get from content_x_for_position().
@ -1106,7 +1112,8 @@ bool TextEditor::write_to_file(const StringView& path)
for (size_t i = 0; i < line_count(); ++i) {
auto& line = this->line(i);
if (line.length()) {
ssize_t nwritten = write(fd, line.characters(), line.length());
auto line_as_utf8 = line.to_utf8();
ssize_t nwritten = write(fd, line_as_utf8.characters(), line_as_utf8.length());
if (nwritten < 0) {
perror("write");
close(fd);
@ -1133,7 +1140,7 @@ String TextEditor::text() const
StringBuilder builder;
for (size_t i = 0; i < line_count(); ++i) {
auto& line = this->line(i);
builder.append(line.characters(), line.length());
builder.append(line.view());
if (i != line_count() - 1)
builder.append('\n');
}
@ -1357,7 +1364,7 @@ void TextEditor::ensure_cursor_is_valid()
size_t TextEditor::visual_line_containing(size_t line_index, size_t column) const
{
size_t visual_line_index = 0;
for_each_visual_line(line_index, [&](const Gfx::Rect&, const StringView& view, size_t start_of_visual_line) {
for_each_visual_line(line_index, [&](const Gfx::Rect&, auto& view, size_t start_of_visual_line) {
if (column >= start_of_visual_line && ((column - start_of_visual_line) < view.length()))
return IterationDecision::Break;
++visual_line_index;
@ -1379,8 +1386,8 @@ void TextEditor::recompute_visual_lines(size_t line_index)
int line_width_so_far = 0;
for (size_t i = 0; i < line.length(); ++i) {
auto ch = line.characters()[i];
auto glyph_width = font().glyph_width(ch);
auto codepoint = line.codepoints()[i];
auto glyph_width = font().glyph_or_emoji_width(codepoint);
if ((line_width_so_far + glyph_width) > available_width) {
visual_data.visual_line_breaks.append(i);
line_width_so_far = glyph_width;
@ -1409,7 +1416,7 @@ void TextEditor::for_each_visual_line(size_t line_index, Callback callback) cons
auto& visual_data = m_line_visual_data[line_index];
for (auto visual_line_break : visual_data.visual_line_breaks) {
auto visual_line_view = StringView(line.characters() + start_of_line, visual_line_break - start_of_line);
auto visual_line_view = Utf32View(line.codepoints() + start_of_line, visual_line_break - start_of_line);
Gfx::Rect visual_line_rect {
visual_data.visual_rect.x(),
visual_data.visual_rect.y() + ((int)visual_line_index * line_height()),