Ver Fonte

LibMarkdown: Rewrite Inline text parser to be more forgiving

The previous Text::parse was not able to give up on parsing a textual
element, and just leave it as plain text. Because this is a very
important part of markdown, I fully rewrote the parser to support this
without having to backtrack. Also, the parser now supports some other
little features, such as delimiter runs and flanking.
Peter Elliott há 3 anos atrás
pai
commit
ec9f892899

+ 4 - 66
Userland/Libraries/LibMarkdown/CodeBlock.cpp

@@ -10,51 +10,24 @@
 
 namespace Markdown {
 
-Text::Style CodeBlock::style() const
-{
-    if (m_style_spec.spans().is_empty())
-        return {};
-    return m_style_spec.spans()[0].style;
-}
-
-String CodeBlock::style_language() const
-{
-    if (m_style_spec.spans().is_empty())
-        return {};
-    return m_style_spec.spans()[0].text;
-}
-
 String CodeBlock::render_to_html() const
 {
     StringBuilder builder;
 
-    String style_language = this->style_language();
-    Text::Style style = this->style();
-
     builder.append("<pre>");
 
-    if (style.strong)
-        builder.append("<b>");
-    if (style.emph)
-        builder.append("<em>");
-
-    if (style_language.is_empty())
+    if (m_language.is_empty())
         builder.append("<code>");
     else
-        builder.appendff("<code class=\"{}\">", escape_html_entities(style_language));
+        builder.appendff("<code class=\"{}\">", escape_html_entities(m_language));
 
-    if (style_language == "js")
+    if (m_language == "js")
         builder.append(JS::MarkupGenerator::html_from_source(m_code));
     else
         builder.append(escape_html_entities(m_code));
 
     builder.append("\n</code>");
 
-    if (style.emph)
-        builder.append("</em>");
-    if (style.strong)
-        builder.append("</b>");
-
     builder.append("</pre>\n");
 
     return builder.build();
@@ -64,28 +37,7 @@ String CodeBlock::render_for_terminal(size_t) const
 {
     StringBuilder builder;
 
-    Text::Style style = this->style();
-    bool needs_styling = style.strong || style.emph;
-    if (needs_styling) {
-        builder.append("\033[");
-        bool first = true;
-        if (style.strong) {
-            builder.append('1');
-            first = false;
-        }
-        if (style.emph) {
-            if (!first)
-                builder.append(';');
-            builder.append('4');
-        }
-        builder.append('m');
-    }
-
     builder.append(m_code);
-
-    if (needs_styling)
-        builder.append("\033[0m");
-
     builder.append("\n\n");
 
     return builder.build();
@@ -102,21 +54,7 @@ OwnPtr<CodeBlock> CodeBlock::parse(Vector<StringView>::ConstIterator& lines)
     if (!line.starts_with(tick_tick_tick))
         return {};
 
-    // Our Markdown extension: we allow
-    // specifying a style and a language
-    // for a code block, like so:
-    //
-    // ```**sh**
-    // $ echo hello friends!
-    // ````
-    //
-    // The code block will be made bold,
-    // and if possible syntax-highlighted
-    // as appropriate for a shell script.
     StringView style_spec = line.substring_view(3, line.length() - 3);
-    auto spec = Text::parse(style_spec);
-    if (!spec.has_value())
-        return {};
 
     ++lines;
 
@@ -136,7 +74,7 @@ OwnPtr<CodeBlock> CodeBlock::parse(Vector<StringView>::ConstIterator& lines)
         first = false;
     }
 
-    return make<CodeBlock>(move(spec.value()), builder.build());
+    return make<CodeBlock>(style_spec, builder.build());
 }
 
 }

+ 3 - 6
Userland/Libraries/LibMarkdown/CodeBlock.h

@@ -14,9 +14,9 @@ namespace Markdown {
 
 class CodeBlock final : public Block {
 public:
-    CodeBlock(Text&& style_spec, const String& code)
+    CodeBlock(const String& language, const String& code)
         : m_code(move(code))
-        , m_style_spec(move(style_spec))
+        , m_language(language)
     {
     }
     virtual ~CodeBlock() override { }
@@ -26,11 +26,8 @@ public:
     static OwnPtr<CodeBlock> parse(Vector<StringView>::ConstIterator& lines);
 
 private:
-    String style_language() const;
-    Text::Style style() const;
-
     String m_code;
-    Text m_style_spec;
+    String m_language;
 };
 
 }

+ 10 - 12
Userland/Libraries/LibMarkdown/Document.cpp

@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
+ * Copyright (c) 2021, Peter Elliott <pelliott@serenityos.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
@@ -75,15 +76,16 @@ OwnPtr<Document> Document::parse(const StringView& str)
     auto lines = lines_vec.begin();
     auto document = make<Document>();
     auto& blocks = document->m_blocks;
-    NonnullOwnPtrVector<Paragraph::Line> paragraph_lines;
+    StringBuilder paragraph_text;
 
     auto flush_paragraph = [&] {
-        if (paragraph_lines.is_empty())
+        if (paragraph_text.is_empty())
             return;
-        auto paragraph = make<Paragraph>(move(paragraph_lines));
+        auto paragraph = make<Paragraph>(Text::parse(paragraph_text.build()));
         document->m_blocks.append(move(paragraph));
-        paragraph_lines.clear();
+        paragraph_text.clear();
     };
+
     while (true) {
         if (lines.is_end())
             break;
@@ -98,7 +100,7 @@ OwnPtr<Document> Document::parse(const StringView& str)
             || helper<Heading>(lines, blocks) || helper<HorizontalRule>(lines, blocks);
 
         if (any) {
-            if (!paragraph_lines.is_empty()) {
+            if (!paragraph_text.is_empty()) {
                 auto last_block = document->m_blocks.take_last();
                 flush_paragraph();
                 document->m_blocks.append(move(last_block));
@@ -106,15 +108,11 @@ OwnPtr<Document> Document::parse(const StringView& str)
             continue;
         }
 
-        auto line = Paragraph::Line::parse(lines);
-        if (!line)
-            return {};
-
-        paragraph_lines.append(line.release_nonnull());
+        paragraph_text.append(*lines++);
+        paragraph_text.append("\n");
     }
 
-    if (!paragraph_lines.is_empty())
-        flush_paragraph();
+    flush_paragraph();
 
     return document;
 }

+ 1 - 4
Userland/Libraries/LibMarkdown/Heading.cpp

@@ -53,10 +53,7 @@ OwnPtr<Heading> Heading::parse(Vector<StringView>::ConstIterator& lines)
 
     StringView title_view = line.substring_view(level + 1, line.length() - level - 1);
     auto text = Text::parse(title_view);
-    if (!text.has_value())
-        return {};
-
-    auto heading = make<Heading>(move(text.value()), level);
+    auto heading = make<Heading>(move(text), level);
 
     ++lines;
     return heading;

+ 1 - 4
Userland/Libraries/LibMarkdown/List.cpp

@@ -59,10 +59,7 @@ OwnPtr<List> List::parse(Vector<StringView>::ConstIterator& lines)
             return true;
 
         auto text = Text::parse(item_builder.string_view());
-        if (!text.has_value())
-            return false;
-
-        items.append(move(text.value()));
+        items.append(move(text));
 
         item_builder.clear();
         return true;

+ 2 - 25
Userland/Libraries/LibMarkdown/Paragraph.cpp

@@ -13,13 +13,7 @@ String Paragraph::render_to_html() const
 {
     StringBuilder builder;
     builder.append("<p>");
-    bool first = true;
-    for (auto& line : m_lines) {
-        if (!first)
-            builder.append('\n');
-        first = false;
-        builder.append(line.text().render_to_html().trim(" \t"));
-    }
+    builder.append(m_text.render_to_html());
     builder.append("</p>\n");
     return builder.build();
 }
@@ -27,26 +21,9 @@ String Paragraph::render_to_html() const
 String Paragraph::render_for_terminal(size_t) const
 {
     StringBuilder builder;
-    bool first = true;
-    for (auto& line : m_lines) {
-        if (!first)
-            builder.append(' ');
-        first = false;
-        builder.append(line.text().render_for_terminal());
-    }
+    builder.append(m_text.render_for_terminal());
     builder.append("\n\n");
     return builder.build();
 }
 
-OwnPtr<Paragraph::Line> Paragraph::Line::parse(Vector<StringView>::ConstIterator& lines)
-{
-    if (lines.is_end())
-        return {};
-
-    auto text = Text::parse(*lines++);
-    if (!text.has_value())
-        return {};
-
-    return make<Paragraph::Line>(text.release_value());
-}
 }

+ 3 - 17
Userland/Libraries/LibMarkdown/Paragraph.h

@@ -15,22 +15,8 @@ namespace Markdown {
 
 class Paragraph final : public Block {
 public:
-    class Line {
-    public:
-        explicit Line(Text&& text)
-            : m_text(move(text))
-        {
-        }
-
-        static OwnPtr<Line> parse(Vector<StringView>::ConstIterator& lines);
-        const Text& text() const { return m_text; }
-
-    private:
-        Text m_text;
-    };
-
-    Paragraph(NonnullOwnPtrVector<Line>&& lines)
-        : m_lines(move(lines))
+    Paragraph(Text text)
+        : m_text(move(text))
     {
     }
 
@@ -40,7 +26,7 @@ public:
     virtual String render_for_terminal(size_t view_width = 0) const override;
 
 private:
-    NonnullOwnPtrVector<Line> m_lines;
+    Text m_text;
 };
 
 }

+ 5 - 16
Userland/Libraries/LibMarkdown/Table.cpp

@@ -16,9 +16,7 @@ String Table::render_for_terminal(size_t view_width) const
     StringBuilder builder;
 
     auto write_aligned = [&](const auto& text, auto width, auto alignment) {
-        size_t original_length = 0;
-        for (auto& span : text.spans())
-            original_length += span.text.length();
+        size_t original_length = text.terminal_length();
         auto string = text.render_for_terminal();
         if (alignment == Alignment::Center) {
             auto padding_length = (width - original_length) / 2;
@@ -137,11 +135,8 @@ OwnPtr<Table> Table::parse(Vector<StringView>::ConstIterator& lines)
     table->m_columns.resize(header_delimiters.size());
 
     for (size_t i = 0; i < header_segments.size(); ++i) {
-        auto text_option = Text::parse(header_segments[i]);
-        if (!text_option.has_value())
-            return {}; // An invalid 'text' in the header should just fail the table parse.
+        auto text = Text::parse(header_segments[i]);
 
-        auto text = text_option.release_value();
         auto& column = table->m_columns[i];
 
         column.header = move(text);
@@ -199,16 +194,10 @@ OwnPtr<Table> Table::parse(Vector<StringView>::ConstIterator& lines)
             if (i >= segments.size()) {
                 // Ran out of segments, but still have headers.
                 // Just make an empty cell.
-                table->m_columns[i].rows.append(Text { "" });
+                table->m_columns[i].rows.append(Text::parse(""));
             } else {
-                auto text_option = Text::parse(segments[i]);
-                // We treat an invalid 'text' as a literal.
-                if (text_option.has_value()) {
-                    auto text = text_option.release_value();
-                    table->m_columns[i].rows.append(move(text));
-                } else {
-                    table->m_columns[i].rows.append(Text { segments[i] });
-                }
+                auto text = Text::parse(segments[i]);
+                table->m_columns[i].rows.append(move(text));
             }
         }
     }

+ 305 - 210
Userland/Libraries/LibMarkdown/Text.cpp

@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
+ * Copyright (c) 2021, Peter Elliott <pelliott@serenityos.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
@@ -8,268 +9,362 @@
 #include <AK/ScopeGuard.h>
 #include <AK/StringBuilder.h>
 #include <LibMarkdown/Text.h>
+#include <ctype.h>
 #include <string.h>
 
 namespace Markdown {
 
-static String unescape(const StringView& text)
+void Text::EmphasisNode::render_to_html(StringBuilder& builder) const
 {
-    StringBuilder builder;
-    for (size_t i = 0; i < text.length(); ++i) {
-        if (text[i] == '\\' && i != text.length() - 1) {
-            builder.append(text[i + 1]);
-            i++;
-            continue;
-        }
-        builder.append(text[i]);
+    builder.append((strong) ? "<strong>" : "<em>");
+    child->render_to_html(builder);
+    builder.append((strong) ? "</strong>" : "</em>");
+}
+
+// Renders the child wrapped in SGR escape sequences: bold (1) for strong
+// emphasis and underline (4) for regular emphasis, the same attributes the
+// span-based renderer removed by this commit used.
+void Text::EmphasisNode::render_for_terminal(StringBuilder& builder) const
+{
+    builder.append((strong) ? "\033[1m" : "\033[4m");
+    child->render_for_terminal(builder);
+    // Switch off only the attribute enabled above (22 = normal intensity,
+    // 24 = not underlined) instead of resetting everything with 0, so that
+    // styling applied by an enclosing node is not cancelled.
+    builder.append((strong) ? "\033[22m" : "\033[24m");
+}
+
+size_t Text::EmphasisNode::terminal_length() const
+{
+    return child->terminal_length();
+}
+
+void Text::CodeNode::render_to_html(StringBuilder& builder) const
+{
+    builder.append("<code>");
+    code->render_to_html(builder);
+    builder.append("</code>");
+}
+
+// Renders the code span in bold, which is how the span-based renderer
+// removed by this commit styled inline code.
+void Text::CodeNode::render_for_terminal(StringBuilder& builder) const
+{
+    builder.append("\033[1m");
+    code->render_for_terminal(builder);
+    // 22 = normal intensity; leaves any other active attribute untouched.
+    builder.append("\033[22m");
+}
+
+size_t Text::CodeNode::terminal_length() const
+{
+    return code->terminal_length();
+}
+
+void Text::TextNode::render_to_html(StringBuilder& builder) const
+{
+    builder.append(escape_html_entities(text));
+}
+
+// Appends the literal text. Newlines are emitted as single spaces, matching
+// how the removed Paragraph renderer joined source lines for the terminal.
+// The replacement is one character for one character, so the output stays
+// consistent with terminal_length().
+void Text::TextNode::render_for_terminal(StringBuilder& builder) const
+{
+    for (size_t i = 0; i < text.length(); ++i) {
+        char ch = text[i];
+        builder.append((ch == '\n') ? ' ' : ch);
+    }
+}
+
+size_t Text::TextNode::terminal_length() const
+{
+    return text.length();
+}
+
+// Emits an <img> element when is_image is set (href becomes the src
+// attribute and the link text becomes the alt attribute); otherwise emits
+// an <a> element with the link text as its content.
+// NOTE(review): the link text is rendered with render_to_html(), so any
+// emphasis or code markup inside an image's text ends up as tags inside
+// the alt attribute value -- confirm whether that is intended.
+void Text::LinkNode::render_to_html(StringBuilder& builder) const
+{
+    if (is_image) {
+        builder.append("<img src=\"");
+        href->render_to_html(builder);
+        builder.append("\" alt=\"");
+        text->render_to_html(builder);
+        builder.append("\" >");
+    } else {
+        builder.append("<a href=\"");
+        href->render_to_html(builder);
+        builder.append("\">");
+        text->render_to_html(builder);
+        builder.append("</a>");
+    }
+}
+
+// Renders the link text. For absolute targets (those containing "://") the
+// target is also appended in angle brackets, and regular links are wrapped
+// in an OSC 8 terminal hyperlink, as the span-based renderer removed by this
+// commit did. Relative targets are not shown because the user has no way to
+// follow them from a terminal.
+// NOTE(review): terminal_length() only counts the link text, so the
+// appended " <target>" is not included in that length.
+void Text::LinkNode::render_for_terminal(StringBuilder& builder) const
+{
+    StringBuilder href_builder;
+    href->render_for_terminal(href_builder);
+    String href_string = href_builder.build();
+
+    bool is_absolute = !href_string.is_null() && strstr(href_string.characters(), "://") != nullptr;
+    bool is_hyperlink = is_absolute && !is_image;
+
+    if (is_hyperlink) {
+        builder.append("\033]8;;");
+        builder.append(href_string);
+        builder.append("\033\\");
+    }
+
+    text->render_for_terminal(builder);
+
+    if (is_absolute)
+        builder.appendff(" <{}>", href_string);
+
+    // An OSC 8 sequence with an empty target closes the hyperlink.
+    if (is_hyperlink)
+        builder.append("\033]8;;\033\\");
+}
+
+size_t Text::LinkNode::terminal_length() const
+{
+    return text->terminal_length();
+}
+
+void Text::MultiNode::render_to_html(StringBuilder& builder) const
+{
+    for (auto& child : children) {
+        child.render_to_html(builder);
     }
-    return builder.build();
 }
 
-Text::Text(String&& text)
+// Renders every child, in order, into the same builder. parse_sequence()
+// always produces a MultiNode, so this is the entry point for all terminal
+// output of a Text.
+void Text::MultiNode::render_for_terminal(StringBuilder& builder) const
 {
-    m_spans.append({ move(text), Style {} });
+    for (auto& child : children) {
+        child.render_for_terminal(builder);
+    }
+}
+
+size_t Text::MultiNode::terminal_length() const
+{
+    size_t length = 0;
+    for (auto& child : children) {
+        length += child.terminal_length();
+    }
+    return length;
+}
+
+size_t Text::terminal_length() const
+{
+    return m_node->terminal_length();
 }
 
 String Text::render_to_html() const
 {
     StringBuilder builder;
+    m_node->render_to_html(builder);
+    return builder.build().trim(" \n\t");
+}
 
-    Vector<String> open_tags;
-    Style current_style;
+String Text::render_for_terminal() const
+{
+    StringBuilder builder;
+    m_node->render_for_terminal(builder);
+    return builder.build().trim(" \n\t");
+}
 
-    for (auto& span : m_spans) {
-        struct TagAndFlag {
-            String tag;
-            bool Style::*flag;
+// Parses inline markdown into a node tree: the input is tokenized, then the
+// token stream is parsed as a top-level (non-link) sequence. A Text is
+// always returned; the emphasis, code and link parsers fall back to plain
+// text nodes for constructs they cannot complete.
+Text Text::parse(StringView const& str)
+{
+    Text text;
+    auto const tokens = tokenize(str);
+    auto iterator = tokens.begin();
+    text.m_node = parse_sequence(iterator, false);
+    return text;
+}
+
+Vector<Text::Token> Text::tokenize(StringView const& str)
+{
+    Vector<Token> tokens;
+    StringBuilder current_token;
+
+    auto flush_token = [&](bool left_flanking, bool right_flanking, bool is_run) {
+        if (current_token.is_empty())
+            return;
+
+        tokens.append({
+            current_token.build(),
+            left_flanking,
+            right_flanking,
+            is_run,
+        });
+        current_token.clear();
+    };
+
+    for (size_t offset = 0; offset < str.length(); ++offset) {
+        auto has = [&](StringView const& seq) {
+            if (offset + seq.length() > str.length())
+                return false;
+
+            return str.substring_view(offset, seq.length()) == seq;
         };
-        TagAndFlag tags_and_flags[] = {
-            { "em", &Style::emph },
-            { "b", &Style::strong },
-            { "code", &Style::code }
+
+        auto expect = [&](StringView const& seq) {
+            VERIFY(has(seq));
+            flush_token(false, false, false);
+            current_token.append(seq);
+            flush_token(false, false, false);
+            offset += seq.length() - 1;
         };
-        auto it = open_tags.find_if([&](const String& open_tag) {
-            if (open_tag == "a" && current_style.href != span.style.href)
-                return true;
-            if (open_tag == "img" && current_style.img != span.style.img)
-                return true;
-            for (auto& tag_and_flag : tags_and_flags) {
-                if (open_tag == tag_and_flag.tag && !(span.style.*tag_and_flag.flag))
-                    return true;
-            }
-            return false;
-        });
 
-        if (!it.is_end()) {
-            // We found an open tag that should
-            // not be open for the new span. Close
-            // it and all the open tags that follow
-            // it.
-            for (ssize_t j = open_tags.size() - 1; j >= static_cast<ssize_t>(it.index()); --j) {
-                auto& tag = open_tags[j];
-                if (tag == "img") {
-                    builder.append("\" />");
-                    current_style.img = {};
-                    continue;
-                }
-                builder.appendff("</{}>", tag);
-                if (tag == "a") {
-                    current_style.href = {};
-                    continue;
-                }
-                for (auto& tag_and_flag : tags_and_flags)
-                    if (tag == tag_and_flag.tag)
-                        current_style.*tag_and_flag.flag = false;
+        char ch = str[offset];
+
+        if (ch == '\\' && offset + 1 < str.length()) {
+            current_token.append(str[offset + 1]);
+            ++offset;
+        } else if (ch == '*' || ch == '_' || ch == '`') {
+            flush_token(false, false, false);
+
+            char delim = ch;
+            size_t run_offset;
+            for (run_offset = offset; run_offset < str.length() && str[run_offset] == delim; ++run_offset) {
+                current_token.append(str[run_offset]);
             }
-            open_tags.shrink(it.index());
-        }
-        if (current_style.href.is_null() && !span.style.href.is_null()) {
-            open_tags.append("a");
-            builder.appendff("<a href=\"{}\">", span.style.href);
-        }
-        if (current_style.img.is_null() && !span.style.img.is_null()) {
-            open_tags.append("img");
-            builder.appendff("<img src=\"{}\" alt=\"", span.style.img);
+
+            bool left_flanking = run_offset < str.length() && !isspace(str[run_offset]);
+            bool right_flanking = offset > 0 && !isspace(str[offset - 1]);
+            flush_token(left_flanking, right_flanking, true);
+            offset = run_offset - 1;
+
+        } else if (ch == '\n') {
+            flush_token(false, false, false);
+            current_token.append(ch);
+            flush_token(false, false, false);
+        } else if (has("[")) {
+            expect("[");
+        } else if (has("![")) {
+            expect("![");
+        } else if (has("](")) {
+            expect("](");
+        } else if (has(")")) {
+            expect(")");
+        } else {
+            current_token.append(ch);
         }
-        for (auto& tag_and_flag : tags_and_flags) {
-            if (current_style.*tag_and_flag.flag != span.style.*tag_and_flag.flag) {
-                open_tags.append(tag_and_flag.tag);
-                builder.appendff("<{}>", tag_and_flag.tag);
+    }
+    flush_token(false, false, false);
+    return tokens;
+}
+
+NonnullOwnPtr<Text::MultiNode> Text::parse_sequence(Vector<Token>::ConstIterator& tokens, bool in_link)
+{
+    auto node = make<MultiNode>();
+
+    for (; !tokens.is_end(); ++tokens) {
+        if (tokens->is_run) {
+            switch (tokens->run_char()) {
+            case '*':
+            case '_':
+                node->children.append(parse_emph(tokens, in_link));
+                break;
+            case '`':
+                node->children.append(parse_code(tokens));
+                break;
             }
+        } else if (!in_link && (*tokens == "[" || *tokens == "![")) {
+            node->children.append(parse_link(tokens));
+        } else if (in_link && *tokens == "](") {
+            return node;
+        } else {
+            node->children.append(make<TextNode>(tokens->data));
         }
 
-        current_style = span.style;
-        builder.append(escape_html_entities(span.text));
-    }
+        if (in_link && !tokens.is_end() && *tokens == "](")
+            return node;
 
-    for (ssize_t i = open_tags.size() - 1; i >= 0; --i) {
-        auto& tag = open_tags[i];
-        if (tag == "img") {
-            builder.append("\" />");
-            continue;
-        }
-        builder.appendff("</{}>", tag);
+        if (tokens.is_end())
+            break;
     }
+    return node;
+}
 
-    return builder.build();
+bool Text::can_open(Token const& opening)
+{
+    return (opening.run_char() == '*' && opening.left_flanking) || (opening.run_char() == '_' && opening.left_flanking && !opening.right_flanking);
 }
 
-String Text::render_for_terminal() const
+bool Text::can_close_for(Token const& opening, Text::Token const& closing)
 {
-    StringBuilder builder;
+    if (opening.run_char() != closing.run_char())
+        return false;
 
-    for (auto& span : m_spans) {
-        bool needs_styling = span.style.strong || span.style.emph || span.style.code;
-        if (needs_styling) {
-            builder.append("\033[");
-            bool first = true;
-            if (span.style.strong || span.style.code) {
-                builder.append('1');
-                first = false;
-            }
-            if (span.style.emph) {
-                if (!first)
-                    builder.append(';');
-                builder.append('4');
-            }
-            builder.append('m');
-        }
+    if (opening.run_length() != closing.run_length())
+        return false;
 
-        if (!span.style.href.is_null()) {
-            if (strstr(span.style.href.characters(), "://") != nullptr) {
-                builder.append("\033]8;;");
-                builder.append(span.style.href);
-                builder.append("\033\\");
-            }
-        }
+    return (opening.run_char() == '*' && closing.right_flanking) || (opening.run_char() == '_' && !closing.left_flanking && closing.right_flanking);
+}
 
-        builder.append(span.text.characters());
+NonnullOwnPtr<Text::Node> Text::parse_emph(Vector<Token>::ConstIterator& tokens, bool in_link)
+{
+    auto opening = *tokens;
 
-        if (needs_styling)
-            builder.append("\033[0m");
+    // Check that the opening delimiter run is properly flanking.
+    if (!can_open(opening))
+        return make<TextNode>(opening.data);
 
-        if (!span.style.href.is_null()) {
-            // When rendering for the terminal, ignore any
-            // non-absolute links, because the user has no
-            // chance to follow them anyway.
-            if (strstr(span.style.href.characters(), "://") != nullptr) {
-                builder.appendff(" <{}>", span.style.href);
-                builder.append("\033]8;;\033\\");
+    auto child = make<MultiNode>();
+    for (++tokens; !tokens.is_end(); ++tokens) {
+        if (tokens->is_run) {
+            if (can_close_for(opening, *tokens)) {
+                return make<EmphasisNode>(opening.run_length() >= 2, move(child));
             }
-        }
-        if (!span.style.img.is_null()) {
-            if (strstr(span.style.img.characters(), "://") != nullptr) {
-                builder.appendff(" <{}>", span.style.img);
+
+            switch (tokens->run_char()) {
+            case '*':
+            case '_':
+                child->children.append(parse_emph(tokens, in_link));
+                break;
+            case '`':
+                child->children.append(parse_code(tokens));
+                break;
             }
+        } else if (*tokens == "[" || *tokens == "![") {
+            child->children.append(parse_link(tokens));
+        } else if (in_link && *tokens == "](") {
+            child->children.prepend(make<TextNode>(opening.data));
+            return child;
+        } else {
+            child->children.append(make<TextNode>(tokens->data));
+        }
+
+        if (in_link && !tokens.is_end() && *tokens == "](") {
+            child->children.prepend(make<TextNode>(opening.data));
+            return child;
         }
-    }
 
-    return builder.build();
+        if (tokens.is_end())
+            break;
+    }
+    child->children.prepend(make<TextNode>(opening.data));
+    return child;
 }
 
-Optional<Text> Text::parse(const StringView& str)
+NonnullOwnPtr<Text::Node> Text::parse_code(Vector<Token>::ConstIterator& tokens)
 {
-    Style current_style;
-    size_t current_span_start = 0;
-    int first_span_in_the_current_link = -1;
-    bool current_link_is_actually_img = false;
-    Vector<Span> spans;
-
-    auto append_span_if_needed = [&](size_t offset) {
-        VERIFY(current_span_start <= offset);
-        if (current_span_start != offset) {
-            Span span {
-                unescape(str.substring_view(current_span_start, offset - current_span_start)),
-                current_style
-            };
-            spans.append(move(span));
-            current_span_start = offset;
-        }
+    auto opening = *tokens;
+
+    auto is_closing = [&](Token const& token) {
+        return token.is_run && token.run_char() == '`' && token.run_length() == opening.run_length();
     };
 
-    for (size_t offset = 0; offset < str.length(); offset++) {
-        char ch = str[offset];
+    bool is_all_whitespace = true;
+    auto code = make<MultiNode>();
+    for (auto iterator = tokens + 1; !iterator.is_end(); ++iterator) {
+        if (is_closing(*iterator)) {
+            tokens = iterator;
+
+            // Strip first and last space, when appropriate.
+            if (!is_all_whitespace) {
+                auto& first = dynamic_cast<TextNode&>(code->children.first());
+                auto& last = dynamic_cast<TextNode&>(code->children.last());
+                if (first.text.starts_with(" ") && last.text.ends_with(" ")) {
+                    first.text = first.text.substring(1);
+                    last.text = last.text.substring(0, last.text.length() - 1);
+                }
+            }
 
-        bool is_escape = ch == '\\';
-        if (is_escape && offset != str.length() - 1) {
-            offset++;
-            continue;
+            return make<CodeNode>(move(code));
         }
 
-        bool is_special_character = false;
-        is_special_character |= ch == '`';
-        if (!current_style.code)
-            is_special_character |= ch == '*' || ch == '_' || ch == '[' || ch == ']' || (ch == '!' && offset + 1 < str.length() && str[offset + 1] == '[');
-        if (!is_special_character)
-            continue;
+        is_all_whitespace = is_all_whitespace && iterator->data.is_whitespace();
+        code->children.append(make<TextNode>((*iterator == "\n") ? " " : iterator->data));
+    }
 
-        append_span_if_needed(offset);
+    return make<TextNode>(opening.data);
+}
 
-        switch (ch) {
-        case '`':
-            current_style.code = !current_style.code;
-            break;
-        case '*':
-        case '_':
-            if (offset + 1 < str.length() && str[offset + 1] == ch) {
-                offset++;
-                current_style.strong = !current_style.strong;
-            } else {
-                current_style.emph = !current_style.emph;
-            }
-            break;
-        case '!':
-            current_link_is_actually_img = true;
-            break;
-        case '[':
-            if constexpr (MARKDOWN_DEBUG) {
-                if (first_span_in_the_current_link != -1)
-                    dbgln("Dropping the outer link");
-            }
-            first_span_in_the_current_link = spans.size();
-            break;
-        case ']': {
-            if (first_span_in_the_current_link == -1) {
-                dbgln_if(MARKDOWN_DEBUG, "Unmatched ]");
-                continue;
-            }
-            ScopeGuard guard = [&] {
-                first_span_in_the_current_link = -1;
-                current_link_is_actually_img = false;
-            };
-            if (offset + 2 >= str.length() || str[offset + 1] != '(')
-                continue;
-            offset += 2;
-            size_t start_of_href = offset;
-
-            do
-                offset++;
-            while (offset < str.length() && str[offset] != ')');
-            if (offset == str.length())
-                offset--;
-
-            const StringView href = str.substring_view(start_of_href, offset - start_of_href);
-            for (size_t i = first_span_in_the_current_link; i < spans.size(); i++) {
-                if (current_link_is_actually_img)
-                    spans[i].style.img = href;
-                else
-                    spans[i].style.href = href;
-            }
-            break;
-        }
-        default:
-            VERIFY_NOT_REACHED();
-        }
+NonnullOwnPtr<Text::Node> Text::parse_link(Vector<Token>::ConstIterator& tokens)
+{
+    auto opening = *tokens++;
+    bool is_image = opening == "![";
 
-        // We've processed the character as a special, so the next offset will
-        // start after it. Note that explicit continue statements skip over this
-        // line, effectively treating the character as not special.
-        current_span_start = offset + 1;
+    auto link_text = parse_sequence(tokens, true);
+
+    if (tokens.is_end() || *tokens != "](") {
+        link_text->children.prepend(make<TextNode>(opening.data));
+        return link_text;
     }
+    auto seperator = *tokens;
+    VERIFY(seperator == "](");
 
-    append_span_if_needed(str.length());
+    auto address = make<MultiNode>();
+    for (auto iterator = tokens + 1; !iterator.is_end(); ++iterator) {
+        if (*iterator == ")") {
+            tokens = iterator;
+            return make<LinkNode>(is_image, move(link_text), move(address));
+        }
 
-    return Text(move(spans));
-}
+        address->children.append(make<TextNode>(iterator->data));
+    }
 
+    link_text->children.prepend(make<TextNode>(opening.data));
+    link_text->children.append(make<TextNode>(seperator.data));
+    return link_text;
+}
 }

+ 113 - 22
Userland/Libraries/LibMarkdown/Text.h

@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
+ * Copyright (c) 2021, Peter Elliott <pelliott@serenityos.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
@@ -7,48 +8,138 @@
 #pragma once
 
 #include <AK/Noncopyable.h>
+#include <AK/NonnullOwnPtrVector.h>
+#include <AK/OwnPtr.h>
 #include <AK/String.h>
-#include <AK/Vector.h>
 
 namespace Markdown {
 
 class Text final {
-    AK_MAKE_NONCOPYABLE(Text);
-
 public:
-    struct Style {
-        bool emph { false };
-        bool strong { false };
-        bool code { false };
-        String href;
-        String img;
+    class Node {
+    public:
+        virtual void render_to_html(StringBuilder& builder) const = 0;
+        virtual void render_for_terminal(StringBuilder& builder) const = 0;
+        virtual size_t terminal_length() const = 0;
+
+        virtual ~Node() { }
+    };
+
+    class EmphasisNode : public Node {
+    public:
+        bool strong;
+        NonnullOwnPtr<Node> child;
+
+        EmphasisNode(bool strong, NonnullOwnPtr<Node> child)
+            : strong(strong)
+            , child(move(child))
+        {
+        }
+
+        virtual void render_to_html(StringBuilder& builder) const override;
+        virtual void render_for_terminal(StringBuilder& builder) const override;
+        virtual size_t terminal_length() const override;
     };
 
-    struct Span {
+    class CodeNode : public Node {
+    public:
+        NonnullOwnPtr<Node> code;
+
+        CodeNode(NonnullOwnPtr<Node> code)
+            : code(move(code))
+        {
+        }
+
+        virtual void render_to_html(StringBuilder& builder) const override;
+        virtual void render_for_terminal(StringBuilder& builder) const override;
+        virtual size_t terminal_length() const override;
+    };
+
+    class TextNode : public Node {
+    public:
         String text;
-        Style style;
+
+        TextNode(StringView const& text)
+            : text(text)
+        {
+        }
+
+        virtual void render_to_html(StringBuilder& builder) const override;
+        virtual void render_for_terminal(StringBuilder& builder) const override;
+        virtual size_t terminal_length() const override;
+    };
+
+    class LinkNode : public Node {
+    public:
+        bool is_image;
+        NonnullOwnPtr<Node> text;
+        NonnullOwnPtr<Node> href;
+
+        LinkNode(bool is_image, NonnullOwnPtr<Node> text, NonnullOwnPtr<Node> href)
+            : is_image(is_image)
+            , text(move(text))
+            , href(move(href))
+        {
+        }
+
+        virtual void render_to_html(StringBuilder& builder) const override;
+        virtual void render_for_terminal(StringBuilder& builder) const override;
+        virtual size_t terminal_length() const override;
     };
 
-    explicit Text(String&& text);
-    Text(Text&& text) = default;
-    Text() = default;
+    class MultiNode : public Node {
+    public:
+        NonnullOwnPtrVector<Node> children;
 
-    Text& operator=(Text&&) = default;
+        virtual void render_to_html(StringBuilder& builder) const override;
+        virtual void render_for_terminal(StringBuilder& builder) const override;
+        virtual size_t terminal_length() const override;
+    };
 
-    const Vector<Span>& spans() const { return m_spans; }
+    size_t terminal_length() const;
 
     String render_to_html() const;
     String render_for_terminal() const;
 
-    static Optional<Text> parse(const StringView&);
+    static Text parse(StringView const&);
 
 private:
-    Text(Vector<Span>&& spans)
-        : m_spans(move(spans))
-    {
-    }
+    // One lexical unit produced by tokenize(): a stretch of plain text, a
+    // single newline, a link delimiter ("[", "![", "](" or ")"), or a
+    // delimiter run.
+    struct Token {
+        String data;
+        // Flanking basically means that a delimiter run has a non-whitespace,
+        // non-punctuation character on the corresponding side. For a more exact
+        // definition, see the CommonMark spec.
+        bool left_flanking;
+        bool right_flanking;
+        // is_run indicates that this token is a 'delimiter run'. A delimiter
+        // run occurs when one or more of the same syntactical character ('`',
+        // '_', or '*') occur in a row.
+        bool is_run;
+
+        // The character the run consists of. Only valid for delimiter runs.
+        char run_char() const
+        {
+            VERIFY(is_run);
+            return data[0];
+        }
+        // The number of characters in the run. Only valid for delimiter runs.
+        // Returned as size_t: a char return type would truncate
+        // String::length() for long runs, making runs of different lengths
+        // compare as equal.
+        size_t run_length() const
+        {
+            VERIFY(is_run);
+            return data.length();
+        }
+        bool operator==(StringView const& str) const { return str == data; }
+    };
+
+    static Vector<Token> tokenize(StringView const&);
+
+    static bool can_open(Token const& opening);
+    static bool can_close_for(Token const& opening, Token const& closing);
+
+    static NonnullOwnPtr<MultiNode> parse_sequence(Vector<Token>::ConstIterator& tokens, bool in_link);
+    static NonnullOwnPtr<Node> parse_emph(Vector<Token>::ConstIterator& tokens, bool in_link);
+    static NonnullOwnPtr<Node> parse_code(Vector<Token>::ConstIterator& tokens);
+    static NonnullOwnPtr<Node> parse_link(Vector<Token>::ConstIterator& tokens);
 
-    Vector<Span> m_spans;
+    OwnPtr<Node> m_node;
 };
 
 }