markdown-check.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. /*
  2. * Copyright (c) 2021, Ben Wiederhake <BenWiederhake.GitHub@gmx.de>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. /*
  7. * You may want to invoke the checker like this:
  8. * $ cd Build/lagom
  9. * $ ninja
  10. * $ find ../../AK ../../Base ../../Documentation/ ../../Kernel/ ../../Meta/ ../../Ports/ ../../Tests/ ../../Userland/ -type f -name '*.md' | xargs ./markdown-check ../../README.md
  11. */
  12. #include <AK/Format.h>
  13. #include <AK/HashMap.h>
  14. #include <AK/HashTable.h>
  15. #include <AK/LexicalPath.h>
  16. #include <AK/RecursionDecision.h>
  17. #include <AK/URL.h>
  18. #include <AK/Vector.h>
  19. #include <LibCore/ArgsParser.h>
  20. #include <LibCore/File.h>
  21. #include <LibCore/Stream.h>
  22. #include <LibMain/Main.h>
  23. #include <LibMarkdown/Document.h>
  24. #include <LibMarkdown/Visitor.h>
  25. #include <stdlib.h>
  26. static bool is_missing_file_acceptable(DeprecatedString const& filename)
  27. {
  28. const StringView acceptable_missing_files[] = {
  29. // FIXME: Please write these manpages!
  30. "/usr/share/man/man2/accept.md"sv,
  31. "/usr/share/man/man2/exec.md"sv,
  32. "/usr/share/man/man2/fcntl.md"sv,
  33. "/usr/share/man/man2/fork.md"sv,
  34. "/usr/share/man/man2/ioctl.md"sv,
  35. "/usr/share/man/man2/listen.md"sv,
  36. "/usr/share/man/man2/mmap.md"sv,
  37. "/usr/share/man/man2/mprotect.md"sv,
  38. "/usr/share/man/man2/open.md"sv,
  39. "/usr/share/man/man2/ptrace.md"sv,
  40. "/usr/share/man/man5/perfcore.md"sv,
  41. // These ones are okay:
  42. "/home/anon/Tests/js-tests/test-common.js"sv,
  43. "/man1/index.html"sv,
  44. "/man2/index.html"sv,
  45. "/man3/index.html"sv,
  46. "/man4/index.html"sv,
  47. "/man5/index.html"sv,
  48. "/man6/index.html"sv,
  49. "/man7/index.html"sv,
  50. "/man8/index.html"sv,
  51. "index.html"sv,
  52. };
  53. for (auto const& suffix : acceptable_missing_files) {
  54. if (filename.ends_with(suffix))
  55. return true;
  56. }
  57. return false;
  58. }
  59. struct FileLink {
  60. DeprecatedString file_path; // May be empty, but not null
  61. DeprecatedString anchor; // May be null ("foo.md", "bar.png"), may be empty ("baz.md#")
  62. DeprecatedString label; // May be empty, but not null
  63. };
  64. class MarkdownLinkage final : Markdown::Visitor {
  65. public:
  66. ~MarkdownLinkage() = default;
  67. static MarkdownLinkage analyze(Markdown::Document const&);
  68. bool has_anchor(DeprecatedString const& anchor) const { return m_anchors.contains(anchor); }
  69. HashTable<DeprecatedString> const& anchors() const { return m_anchors; }
  70. bool has_invalid_link() const { return m_has_invalid_link; }
  71. Vector<FileLink> const& file_links() const { return m_file_links; }
  72. private:
  73. MarkdownLinkage()
  74. {
  75. auto const* source_directory = getenv("SERENITY_SOURCE_DIR");
  76. if (source_directory != nullptr) {
  77. m_serenity_source_directory = source_directory;
  78. } else {
  79. warnln("The environment variable SERENITY_SOURCE_DIR was not found. Link checking inside Serenity's filesystem will fail.");
  80. }
  81. }
  82. virtual RecursionDecision visit(Markdown::Heading const&) override;
  83. virtual RecursionDecision visit(Markdown::Text::LinkNode const&) override;
  84. HashTable<DeprecatedString> m_anchors;
  85. Vector<FileLink> m_file_links;
  86. bool m_has_invalid_link { false };
  87. DeprecatedString m_serenity_source_directory;
  88. };
  89. MarkdownLinkage MarkdownLinkage::analyze(Markdown::Document const& document)
  90. {
  91. MarkdownLinkage linkage;
  92. document.walk(linkage);
  93. return linkage;
  94. }
  95. class StringCollector final : Markdown::Visitor {
  96. public:
  97. StringCollector() = default;
  98. virtual ~StringCollector() = default;
  99. DeprecatedString build() { return m_builder.build(); }
  100. static DeprecatedString from(Markdown::Heading const& heading)
  101. {
  102. StringCollector collector;
  103. heading.walk(collector);
  104. return collector.build();
  105. }
  106. static DeprecatedString from(Markdown::Text::Node const& node)
  107. {
  108. StringCollector collector;
  109. node.walk(collector);
  110. return collector.build();
  111. }
  112. private:
  113. virtual RecursionDecision visit(DeprecatedString const& text) override
  114. {
  115. m_builder.append(text);
  116. return RecursionDecision::Recurse;
  117. }
  118. StringBuilder m_builder;
  119. };
  120. static DeprecatedString slugify(DeprecatedString const& text)
  121. {
  122. // TODO: This feels like it belongs into LibWeb.
  123. DeprecatedString slug = text.to_lowercase();
  124. // Reverse-engineered through github, using:
  125. // find AK/ Base/ Documentation/ Kernel/ Meta/ Ports/ Tests/ Userland/ -name '*.md' | xargs grep --color=always -Pin '^##+ .*[^a-z0-9 ?()`_:/!&|.$'"'"',<>"+-]' README.md
  126. slug = slug.replace(" "sv, "-"sv, ReplaceMode::All)
  127. .replace("!"sv, ""sv, ReplaceMode::All)
  128. .replace("?"sv, ""sv, ReplaceMode::All)
  129. .replace("("sv, ""sv, ReplaceMode::All)
  130. .replace(")"sv, ""sv, ReplaceMode::All)
  131. .replace(":"sv, ""sv, ReplaceMode::All)
  132. .replace("/"sv, "-"sv, ReplaceMode::All)
  133. .replace("&"sv, ""sv, ReplaceMode::All)
  134. .replace("|"sv, ""sv, ReplaceMode::All)
  135. .replace("."sv, ""sv, ReplaceMode::All)
  136. .replace("$"sv, ""sv, ReplaceMode::All)
  137. .replace("'"sv, ""sv, ReplaceMode::All)
  138. .replace(","sv, ""sv, ReplaceMode::All)
  139. .replace("\""sv, ""sv, ReplaceMode::All)
  140. .replace("+"sv, ""sv, ReplaceMode::All)
  141. .replace("\\"sv, ""sv, ReplaceMode::All)
  142. .replace("<"sv, ""sv, ReplaceMode::All)
  143. .replace(">"sv, ""sv, ReplaceMode::All);
  144. // What about "="?
  145. return slug;
  146. }
  147. RecursionDecision MarkdownLinkage::visit(Markdown::Heading const& heading)
  148. {
  149. m_anchors.set(slugify(StringCollector::from(heading)));
  150. return RecursionDecision::Recurse;
  151. }
  152. RecursionDecision MarkdownLinkage::visit(Markdown::Text::LinkNode const& link_node)
  153. {
  154. DeprecatedString const& href = link_node.href;
  155. if (href.is_null()) {
  156. // Nothing to do here.
  157. return RecursionDecision::Recurse;
  158. }
  159. auto url = URL::create_with_url_or_path(href);
  160. if (url.is_valid()) {
  161. if (url.scheme() == "https" || url.scheme() == "http") {
  162. outln("Not checking external link {}", href);
  163. return RecursionDecision::Recurse;
  164. }
  165. if (url.scheme() == "help") {
  166. if (url.host() != "man") {
  167. warnln("help:// URL without 'man': {}", href);
  168. m_has_invalid_link = true;
  169. return RecursionDecision::Recurse;
  170. }
  171. if (url.paths().size() < 2) {
  172. warnln("help://man URL is missing section or page: {}", href);
  173. m_has_invalid_link = true;
  174. return RecursionDecision::Recurse;
  175. }
  176. // Remove leading '/' from the path.
  177. auto file = DeprecatedString::formatted("{}/Base/usr/share/man/man{}.md", m_serenity_source_directory, url.path().substring(1));
  178. m_file_links.append({ file, DeprecatedString(), StringCollector::from(*link_node.text) });
  179. return RecursionDecision::Recurse;
  180. }
  181. if (url.scheme() == "file") {
  182. if (url.path().contains("man"sv) && url.path().ends_with(".md"sv)) {
  183. warnln("Inter-manpage link without the help:// scheme: {}\nPlease use help URLs of the form 'help://man/<section>/<subsection...>/<page>'", href);
  184. m_has_invalid_link = true;
  185. return RecursionDecision::Recurse;
  186. }
  187. // TODO: Check more possible links other than icons.
  188. if (url.path().starts_with("/res/icons/"sv)) {
  189. auto file = DeprecatedString::formatted("{}/Base{}", m_serenity_source_directory, url.path());
  190. m_file_links.append({ file, DeprecatedString(), StringCollector::from(*link_node.text) });
  191. return RecursionDecision::Recurse;
  192. }
  193. outln("Not checking local link {}", href);
  194. return RecursionDecision::Recurse;
  195. }
  196. }
  197. DeprecatedString label = StringCollector::from(*link_node.text);
  198. Optional<size_t> last_hash = href.find_last('#');
  199. if (last_hash.has_value()) {
  200. m_file_links.append({ href.substring(0, last_hash.value()), href.substring(last_hash.value() + 1), label });
  201. } else {
  202. m_file_links.append({ href, DeprecatedString(), label });
  203. }
  204. return RecursionDecision::Recurse;
  205. }
  206. ErrorOr<int> serenity_main(Main::Arguments arguments)
  207. {
  208. Core::ArgsParser args_parser;
  209. Vector<StringView> file_paths;
  210. args_parser.add_positional_argument(file_paths, "Path to markdown files to read and parse", "paths", Core::ArgsParser::Required::Yes);
  211. args_parser.parse(arguments);
  212. outln("Reading and parsing Markdown files ...");
  213. HashMap<DeprecatedString, MarkdownLinkage> files;
  214. for (auto path : file_paths) {
  215. auto file_or_error = Core::Stream::File::open(path, Core::Stream::OpenMode::Read);
  216. if (file_or_error.is_error()) {
  217. warnln("Failed to open {}: {}", path, file_or_error.error());
  218. // Since this should never happen anyway, fail early.
  219. return file_or_error.release_error();
  220. }
  221. auto file = file_or_error.release_value();
  222. auto content_buffer_or_error = file->read_until_eof();
  223. if (content_buffer_or_error.is_error()) {
  224. warnln("Failed to read {}: {}", path, file_or_error.error());
  225. // Since this should never happen anyway, fail early.
  226. return file_or_error.release_error();
  227. }
  228. auto content_buffer = content_buffer_or_error.release_value();
  229. auto content = StringView(content_buffer);
  230. auto document = Markdown::Document::parse(content);
  231. if (!document) {
  232. warnln("Failed to parse {} due to an unspecified error.", path);
  233. // Since this should never happen anyway, fail early.
  234. return 1;
  235. }
  236. files.set(Core::File::real_path_for(path), MarkdownLinkage::analyze(*document));
  237. }
  238. outln("Checking links ...");
  239. bool any_problems = false;
  240. for (auto const& file_item : files) {
  241. if (file_item.value.has_invalid_link()) {
  242. outln("File '{}' has invalid links.", file_item.key);
  243. any_problems = true;
  244. continue;
  245. }
  246. auto file_lexical_path = LexicalPath(file_item.key);
  247. auto file_dir = file_lexical_path.dirname();
  248. for (auto const& file_link : file_item.value.file_links()) {
  249. DeprecatedString pointee_file;
  250. if (file_link.file_path.is_empty()) {
  251. pointee_file = file_item.key;
  252. } else {
  253. pointee_file = LexicalPath::absolute_path(file_dir, file_link.file_path);
  254. }
  255. if (!Core::File::exists(pointee_file) && !is_missing_file_acceptable(pointee_file)) {
  256. outln("File '{}' points to '{}' (label '{}'), but '{}' does not exist!",
  257. file_item.key, file_link.file_path, file_link.label, pointee_file);
  258. any_problems = true;
  259. continue;
  260. }
  261. if (file_link.anchor.is_empty()) {
  262. // No anchor to test for.
  263. continue;
  264. }
  265. auto pointee_linkage = files.find(pointee_file);
  266. if (pointee_linkage == files.end()) {
  267. outln("File '{}' points to file '{}', which exists, but was not scanned. Add it to the command-line arguments and re-run.",
  268. file_item.key, pointee_file);
  269. any_problems = true;
  270. continue;
  271. }
  272. if (!pointee_linkage->value.has_anchor(file_link.anchor)) {
  273. outln("File '{}' points to '{}#{}' (label '{}'), but file '{}' does not have any heading that results in the anchor '{}'.",
  274. file_item.key, file_link.file_path, file_link.anchor, file_link.label, pointee_file, file_link.anchor);
  275. out(" The following anchors seem to be available:\n ");
  276. bool any_anchors = false;
  277. for (auto const& anchor : pointee_linkage->value.anchors()) {
  278. if (any_anchors)
  279. out(", ");
  280. out("'{}'", anchor);
  281. any_anchors = true;
  282. }
  283. if (!any_anchors)
  284. out("(none)");
  285. outln();
  286. any_problems = true;
  287. }
  288. }
  289. }
  290. if (any_problems) {
  291. outln("Done. Some errors were encountered, please check above log.");
  292. return 1;
  293. } else {
  294. outln("Done. No problems detected.");
  295. return 0;
  296. }
  297. }