Parser.h 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. /*
  2. * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include <AK/ByteString.h>
  8. #include <AK/Debug.h>
  9. #include <AK/Function.h>
  10. #include <AK/GenericLexer.h>
  11. #include <AK/HashMap.h>
  12. #include <AK/OwnPtr.h>
  13. #include <AK/SourceLocation.h>
  14. #include <AK/TemporaryChange.h>
  15. #include <LibXML/DOM/Document.h>
  16. #include <LibXML/DOM/DocumentTypeDeclaration.h>
  17. #include <LibXML/DOM/Node.h>
  18. #include <LibXML/Forward.h>
  19. namespace XML {
  20. struct ParseError {
  21. size_t offset;
  22. ByteString error;
  23. };
  24. struct Listener {
  25. virtual ~Listener() { }
  26. virtual void set_source(ByteString) { }
  27. virtual void document_start() { }
  28. virtual void document_end() { }
  29. virtual void element_start(Name const&, HashMap<Name, ByteString> const&) { }
  30. virtual void element_end(Name const&) { }
  31. virtual void text(StringView) { }
  32. virtual void comment(StringView) { }
  33. virtual void error(ParseError const&) { }
  34. };
  35. // FIXME: This is also used in JSSpecCompiler, so should probably live in AK or even merged with
  36. // AK::GenericLexer.
  37. class LineTrackingLexer : public GenericLexer {
  38. public:
  39. using GenericLexer::GenericLexer;
  40. LineTrackingLexer(StringView input, XML::Offset start_offset)
  41. : GenericLexer(input)
  42. , m_cached_offset {
  43. .line = start_offset.line,
  44. .column = start_offset.column,
  45. }
  46. {
  47. }
  48. Offset cached_offset() const { return m_cached_offset; }
  49. void restore_cached_offset(Offset cached_offset) { m_cached_offset = cached_offset; }
  50. Offset offset_for(size_t) const;
  51. protected:
  52. mutable Offset m_cached_offset;
  53. };
  54. class Parser {
  55. public:
  56. struct Options {
  57. bool preserve_cdata { true };
  58. bool preserve_comments { false };
  59. bool treat_errors_as_fatal { true };
  60. Function<ErrorOr<ByteString>(SystemID const&, Optional<PublicID> const&)> resolve_external_resource {};
  61. };
  62. Parser(StringView source, Options options)
  63. : m_source(source)
  64. , m_lexer(source)
  65. , m_options(move(options))
  66. {
  67. }
  68. explicit Parser(StringView source)
  69. : m_source(source)
  70. , m_lexer(source)
  71. {
  72. }
  73. ErrorOr<Document, ParseError> parse();
  74. ErrorOr<void, ParseError> parse_with_listener(Listener&);
  75. Vector<ParseError> const& parse_error_causes() const { return m_parse_errors; }
  76. private:
  77. struct EntityReference {
  78. Name name;
  79. };
  80. ErrorOr<void, ParseError> parse_internal();
  81. void append_node(NonnullOwnPtr<Node>);
  82. void append_text(StringView, Offset);
  83. void append_comment(StringView, Offset);
  84. void enter_node(Node&);
  85. void leave_node();
  86. enum class ReferencePlacement {
  87. AttributeValue,
  88. Content,
  89. };
  90. ErrorOr<ByteString, ParseError> resolve_reference(EntityReference const&, ReferencePlacement);
  91. enum class Required {
  92. No,
  93. Yes,
  94. };
  95. ErrorOr<void, ParseError> skip_whitespace(Required = Required::No);
  96. ErrorOr<void, ParseError> parse_prolog();
  97. ErrorOr<void, ParseError> parse_element();
  98. ErrorOr<void, ParseError> parse_misc();
  99. ErrorOr<void, ParseError> parse_xml_decl();
  100. ErrorOr<void, ParseError> parse_doctype_decl();
  101. ErrorOr<void, ParseError> parse_version_info();
  102. ErrorOr<void, ParseError> parse_encoding_decl();
  103. ErrorOr<void, ParseError> parse_standalone_document_decl();
  104. ErrorOr<void, ParseError> parse_eq();
  105. ErrorOr<void, ParseError> parse_comment();
  106. ErrorOr<void, ParseError> parse_processing_instruction();
  107. ErrorOr<Name, ParseError> parse_processing_instruction_target();
  108. ErrorOr<Name, ParseError> parse_name();
  109. ErrorOr<NonnullOwnPtr<Node>, ParseError> parse_empty_element_tag();
  110. ErrorOr<NonnullOwnPtr<Node>, ParseError> parse_start_tag();
  111. ErrorOr<Name, ParseError> parse_end_tag();
  112. ErrorOr<void, ParseError> parse_content();
  113. ErrorOr<Attribute, ParseError> parse_attribute();
  114. ErrorOr<ByteString, ParseError> parse_attribute_value();
  115. ErrorOr<Variant<EntityReference, ByteString>, ParseError> parse_reference();
  116. ErrorOr<StringView, ParseError> parse_char_data();
  117. ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_internal_subset();
  118. ErrorOr<Optional<MarkupDeclaration>, ParseError> parse_markup_declaration();
  119. ErrorOr<Optional<ByteString>, ParseError> parse_declaration_separator();
  120. ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_external_subset_declaration();
  121. ErrorOr<ElementDeclaration, ParseError> parse_element_declaration();
  122. ErrorOr<AttributeListDeclaration, ParseError> parse_attribute_list_declaration();
  123. ErrorOr<EntityDeclaration, ParseError> parse_entity_declaration();
  124. ErrorOr<NotationDeclaration, ParseError> parse_notation_declaration();
  125. ErrorOr<Name, ParseError> parse_parameter_entity_reference();
  126. ErrorOr<ElementDeclaration::ContentSpec, ParseError> parse_content_spec();
  127. ErrorOr<AttributeListDeclaration::Definition, ParseError> parse_attribute_definition();
  128. ErrorOr<StringView, ParseError> parse_nm_token();
  129. ErrorOr<EntityDeclaration, ParseError> parse_general_entity_declaration();
  130. ErrorOr<EntityDeclaration, ParseError> parse_parameter_entity_declaration();
  131. ErrorOr<PublicID, ParseError> parse_public_id();
  132. ErrorOr<SystemID, ParseError> parse_system_id();
  133. ErrorOr<ExternalID, ParseError> parse_external_id();
  134. ErrorOr<ByteString, ParseError> parse_entity_value();
  135. ErrorOr<Name, ParseError> parse_notation_data_declaration();
  136. ErrorOr<StringView, ParseError> parse_public_id_literal();
  137. ErrorOr<StringView, ParseError> parse_system_id_literal();
  138. ErrorOr<StringView, ParseError> parse_cdata_section();
  139. ErrorOr<ByteString, ParseError> parse_attribute_value_inner(StringView disallow);
  140. ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_external_subset();
  141. ErrorOr<void, ParseError> parse_text_declaration();
  142. ErrorOr<void, ParseError> expect(StringView);
  143. template<typename Pred>
  144. requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> expect(Pred, StringView description);
  145. template<typename Pred>
  146. requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> expect_many(Pred, StringView description);
  147. static size_t s_debug_indent_level;
  148. [[nodiscard]] auto rollback_point(SourceLocation location = SourceLocation::current())
  149. {
  150. return ArmedScopeGuard {
  151. [this, position = m_lexer.tell(), cached_offset = m_lexer.cached_offset(), location] {
  152. m_lexer.retreat(m_lexer.tell() - position);
  153. m_lexer.restore_cached_offset(cached_offset);
  154. (void)location;
  155. dbgln_if(XML_PARSER_DEBUG, "{:->{}}FAIL @ {} -- \x1b[31m{}\x1b[0m", " ", s_debug_indent_level * 2, location, m_lexer.remaining().substring_view(0, min(16, m_lexer.tell_remaining())).replace("\n"sv, "\\n"sv, ReplaceMode::All));
  156. }
  157. };
  158. }
  159. [[nodiscard]] auto accept_rule()
  160. {
  161. return TemporaryChange { m_current_rule.accept, true };
  162. }
  163. [[nodiscard]] auto enter_rule(SourceLocation location = SourceLocation::current())
  164. {
  165. dbgln_if(XML_PARSER_DEBUG, "{:->{}}Enter {}", " ", s_debug_indent_level * 2, location);
  166. ++s_debug_indent_level;
  167. auto rule = m_current_rule;
  168. m_current_rule = { location.function_name(), false };
  169. return ScopeGuard {
  170. [location, rule, this] {
  171. m_current_rule = rule;
  172. --s_debug_indent_level;
  173. (void)location;
  174. dbgln_if(XML_PARSER_DEBUG, "{:->{}}Leave {}", " ", s_debug_indent_level * 2, location);
  175. }
  176. };
  177. }
  178. template<typename... Ts>
  179. ParseError parse_error(Ts&&... args)
  180. {
  181. auto error = ParseError { forward<Ts>(args)... };
  182. if (m_current_rule.accept) {
  183. auto rule_name = m_current_rule.rule.value_or("<?>");
  184. if (rule_name.starts_with("parse_"sv))
  185. rule_name = rule_name.substring_view(6);
  186. m_parse_errors.append({
  187. error.offset,
  188. ByteString::formatted("{}: {}", rule_name, error.error),
  189. });
  190. }
  191. return error;
  192. }
  193. StringView m_source;
  194. LineTrackingLexer m_lexer;
  195. Options m_options;
  196. Listener* m_listener { nullptr };
  197. OwnPtr<Node> m_root_node;
  198. Node* m_entered_node { nullptr };
  199. Version m_version { Version::Version11 };
  200. bool m_in_compatibility_mode { false };
  201. ByteString m_encoding;
  202. bool m_standalone { false };
  203. HashMap<Name, ByteString> m_processing_instructions;
  204. struct AcceptedRule {
  205. Optional<ByteString> rule {};
  206. bool accept { false };
  207. } m_current_rule {};
  208. Vector<ParseError> m_parse_errors;
  209. Optional<Doctype> m_doctype;
  210. };
  211. }
  212. template<>
  213. struct AK::Formatter<XML::ParseError> : public AK::Formatter<FormatString> {
  214. ErrorOr<void> format(FormatBuilder& builder, XML::ParseError const& error)
  215. {
  216. return Formatter<FormatString>::format(builder, "{} at offset {}"sv, error.error, error.offset);
  217. }
  218. };