Browse Source

markdown-checker: New tool that checks document links

Ben Wiederhake 3 years ago
parent
commit
3f88d65b78
3 changed files with 248 additions and 0 deletions
  1. 4 0
      Meta/Lagom/CMakeLists.txt
  2. 1 0
      Userland/Utilities/CMakeLists.txt
  3. 243 0
      Userland/Utilities/markdown-check.cpp

+ 4 - 0
Meta/Lagom/CMakeLists.txt

@@ -436,6 +436,10 @@ if (BUILD_LAGOM)
         set_target_properties(js_lagom PROPERTIES OUTPUT_NAME js)
         target_link_libraries(js_lagom LagomJS LagomLine Threads::Threads)
 
+        add_executable(markdown-check_lagom ../../Userland/Utilities/markdown-check.cpp)
+        set_target_properties(markdown-check_lagom PROPERTIES OUTPUT_NAME markdown-check)
+        target_link_libraries(markdown-check_lagom LagomMarkdown)
+
         add_executable(ntpquery_lagom ../../Userland/Utilities/ntpquery.cpp)
         set_target_properties(ntpquery_lagom PROPERTIES OUTPUT_NAME ntpquery)
         target_link_libraries(ntpquery_lagom LagomCore)

+ 1 - 0
Userland/Utilities/CMakeLists.txt

@@ -73,6 +73,7 @@ target_link_libraries(keymap LibKeyboard)
 target_link_libraries(lspci LibPCIDB)
 target_link_libraries(lsusb LibUSBDB)
 target_link_libraries(man LibMarkdown)
+target_link_libraries(markdown-check LibMarkdown)
 target_link_libraries(matroska LibVideo)
 target_link_libraries(md LibMarkdown)
 target_link_libraries(misbehaving-application LibCore)

+ 243 - 0
Userland/Utilities/markdown-check.cpp

@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2021, Ben Wiederhake <BenWiederhake.GitHub@gmx.de>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * You may want to invoke the checker like this:
+ * $ cd Build/lagom
+ * $ ninja
+ * $ find ../../AK ../../Base ../../Documentation/ ../../Kernel/ ../../Meta/ ../../Ports/ ../../Tests/ ../../Userland/ -type f -name '*.md' | xargs ./markdown-check ../../README.md
+ */
+
+#include <AK/Format.h>
+#include <AK/HashMap.h>
+#include <AK/HashTable.h>
+#include <AK/LexicalPath.h>
+#include <AK/OwnPtr.h>
+#include <AK/StdLibExtras.h>
+#include <AK/Vector.h>
+#include <LibCore/File.h>
+#include <LibMarkdown/Document.h>
+#include <LibMarkdown/Visitor.h>
+
+struct FileLink {
+    String file_path; // May be empty, but not null
+    String anchor;    // May be null ("foo.md", "bar.png"), may be empty ("baz.md#")
+    String label;     // May be empty, but not null
+};
+
+class MarkdownLinkage final : Markdown::Visitor {
+public:
+    ~MarkdownLinkage() = default;
+
+    static MarkdownLinkage analyze(Markdown::Document const&);
+
+    bool has_anchor(String const& anchor) const { return m_anchors.contains(anchor); }
+    HashTable<String> const& anchors() const { return m_anchors; }
+    Vector<FileLink> const& file_links() const { return m_file_links; }
+
+private:
+    MarkdownLinkage() = default;
+
+    virtual RecursionDecision visit(Markdown::Heading const&) override;
+    virtual RecursionDecision visit(Markdown::Text::LinkNode const&) override;
+
+    HashTable<String> m_anchors;
+    Vector<FileLink> m_file_links;
+};
+
+MarkdownLinkage MarkdownLinkage::analyze(Markdown::Document const& document)
+{
+    MarkdownLinkage linkage;
+
+    document.walk(linkage);
+
+    return linkage;
+}
+
+class StringCollector final : Markdown::Visitor {
+public:
+    StringCollector() = default;
+    virtual ~StringCollector() = default;
+
+    String build() { return m_builder.build(); }
+
+    static String from(Markdown::Heading const& heading)
+    {
+        StringCollector collector;
+        heading.walk(collector);
+        return collector.build();
+    }
+
+    static String from(Markdown::Text::Node const& node)
+    {
+        StringCollector collector;
+        node.walk(collector);
+        return collector.build();
+    }
+
+private:
+    virtual RecursionDecision visit(String const& text) override
+    {
+        m_builder.append(text);
+        return RecursionDecision::Recurse;
+    }
+
+    StringBuilder m_builder;
+};
+
+static String slugify(String const& text)
+{
+    // TODO: This feels like it belongs into LibWeb.
+    String slug = text.to_lowercase();
+    // Reverse-engineered through github, using:
+    // find AK/ Base/ Documentation/ Kernel/ Meta/ Ports/ Tests/ Userland/ -name '*.md' | xargs grep --color=always -Pin '^##+ .*[^a-z0-9 ?()`_:/!&|.$'"'"',<>"+-]' README.md
+    slug = slug.replace(" ", "-", true)
+               .replace("!", "", true)
+               .replace("?", "", true)
+               .replace("(", "", true)
+               .replace(")", "", true)
+               .replace(":", "", true)
+               .replace("/", "-", true)
+               .replace("&", "", true)
+               .replace("|", "", true)
+               .replace(".", "", true)
+               .replace("$", "", true)
+               .replace("'", "", true)
+               .replace(",", "", true)
+               .replace("\"", "", true)
+               .replace("+", "", true)
+               .replace("\\", "", true)
+               .replace("<", "", true)
+               .replace(">", "", true);
+    // What about "="?
+    return slug;
+}
+
+RecursionDecision MarkdownLinkage::visit(Markdown::Heading const& heading)
+{
+    m_anchors.set(slugify(StringCollector::from(heading)));
+    return RecursionDecision::Recurse;
+}
+
+RecursionDecision MarkdownLinkage::visit(Markdown::Text::LinkNode const& link_node)
+{
+    String const& href = link_node.href;
+    if (href.is_null()) {
+        // Nothing to do here.
+        return RecursionDecision::Recurse;
+    }
+    if (href.starts_with("https://") || href.starts_with("http://")) {
+        outln("Not checking external link {}", href);
+        return RecursionDecision::Recurse;
+    }
+    if (href.starts_with("file://")) {
+        // TODO: Resolve relative to $SERENITY_SOURCE_DIR/Base/
+        // Currently, this affects only one link, so it's not worth the effort.
+        outln("Not checking local link {}", href);
+        return RecursionDecision::Recurse;
+    }
+
+    String label = StringCollector::from(*link_node.text);
+    Optional<size_t> last_hash = href.find_last('#');
+    if (last_hash.has_value()) {
+        m_file_links.append({ href.substring(0, last_hash.value()), href.substring(last_hash.value() + 1), label });
+    } else {
+        m_file_links.append({ href, String(), label });
+    }
+
+    return RecursionDecision::Recurse;
+}
+
+int main(int argc, char** argv)
+{
+    if (argc < 2) {
+        // Technically it is valid to call this program with zero markdown files: When there are
+        // no files, there are no dead links. However, any such usage is probably erroneous.
+        warnln("Usage: {} Foo.md Bar.md ...", argv[0]);
+        // E.g.: find AK/ Base/ Documentation/ Kernel/ Meta/ Ports/ Tests/ Userland/ -name '*.md' -print0 | xargs -0 ./MarkdownCheck
+        return 1;
+    }
+
+    outln("Reading and parsing Markdown files ...");
+    HashMap<String, MarkdownLinkage> files;
+    for (int i = 1; i < argc; ++i) {
+        auto path = argv[i];
+        auto file_or_error = Core::File::open(path, Core::OpenMode::ReadOnly);
+        if (file_or_error.is_error()) {
+            warnln("Failed to read {}: {}", path, file_or_error.error());
+            // Since this should never happen anyway, fail early.
+            return 1;
+        }
+        auto file = file_or_error.release_value();
+        auto content_buffer = file->read_all();
+        auto content = StringView(content_buffer);
+        auto document = Markdown::Document::parse(content);
+        if (!document) {
+            warnln("Failed to parse {} due to an unspecified error.", path);
+            // Since this should never happen anyway, fail early.
+            return 1;
+        }
+        files.set(Core::File::real_path_for(path), MarkdownLinkage::analyze(*document));
+    }
+
+    outln("Checking links ...");
+    bool any_problems = false;
+    for (auto const& file_item : files) {
+        auto file_lexical_path = LexicalPath(file_item.key);
+        auto file_dir = file_lexical_path.dirname();
+        for (auto const& file_link : file_item.value.file_links()) {
+            String pointee_file;
+            if (file_link.file_path.is_empty()) {
+                pointee_file = file_item.key;
+            } else {
+                pointee_file = LexicalPath::absolute_path(file_dir, file_link.file_path);
+            }
+            if (!Core::File::exists(pointee_file)) {
+                outln("File '{}' points to '{}' (label '{}'), but '{}' does not exist!",
+                    file_item.key, file_link.file_path, file_link.label, pointee_file);
+                any_problems = true;
+                continue;
+            }
+            if (file_link.anchor.is_empty()) {
+                // No anchor to test for.
+                continue;
+            }
+
+            auto pointee_linkage = files.find(pointee_file);
+            if (pointee_linkage == files.end()) {
+                outln("File '{}' points to file '{}', which exists, but was not scanned. Add it to the command-line arguments and re-run.",
+                    file_item.key, pointee_file);
+                any_problems = true;
+                continue;
+            }
+
+            if (!pointee_linkage->value.has_anchor(file_link.anchor)) {
+                outln("File '{}' points to '{}#{}' (label '{}'), but file '{}' does not have any heading that results in the anchor '{}'.",
+                    file_item.key, file_link.file_path, file_link.anchor, file_link.label, pointee_file, file_link.anchor);
+                out("    The following anchors seem to be available:\n    ");
+                bool any_anchors = false;
+                for (auto const& anchor : pointee_linkage->value.anchors()) {
+                    if (any_anchors)
+                        out(", ");
+                    out("'{}'", anchor);
+                    any_anchors = true;
+                }
+                if (!any_anchors)
+                    out("(none)");
+                outln();
+                any_problems = true;
+            }
+        }
+    }
+
+    if (any_problems) {
+        outln("Done. Some errors were encountered, please check above log.");
+        return 1;
+    } else {
+        outln("Done. No problems detected.");
+        return 0;
+    }
+}