Parser.h 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. /*
  2. * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include <AK/ByteString.h>
  8. #include <AK/Debug.h>
  9. #include <AK/Function.h>
  10. #include <AK/GenericLexer.h>
  11. #include <AK/HashMap.h>
  12. #include <AK/OwnPtr.h>
  13. #include <AK/SourceLocation.h>
  14. #include <AK/TemporaryChange.h>
  15. #include <LibXML/DOM/Document.h>
  16. #include <LibXML/DOM/DocumentTypeDeclaration.h>
  17. #include <LibXML/DOM/Node.h>
  18. #include <LibXML/Forward.h>
  19. namespace XML {
  20. struct ParseError {
  21. size_t offset;
  22. ByteString error;
  23. };
  24. struct Listener {
  25. virtual ~Listener() { }
  26. virtual void set_source(ByteString) { }
  27. virtual void document_start() { }
  28. virtual void document_end() { }
  29. virtual void element_start(Name const&, HashMap<Name, ByteString> const&) { }
  30. virtual void element_end(Name const&) { }
  31. virtual void text(StringView) { }
  32. virtual void comment(StringView) { }
  33. virtual void error(ParseError const&) { }
  34. };
  35. class Parser {
  36. public:
  37. struct Options {
  38. bool preserve_cdata { true };
  39. bool preserve_comments { false };
  40. bool treat_errors_as_fatal { true };
  41. Function<ErrorOr<ByteString>(SystemID const&, Optional<PublicID> const&)> resolve_external_resource {};
  42. };
  43. Parser(StringView source, Options options)
  44. : m_source(source)
  45. , m_lexer(source)
  46. , m_options(move(options))
  47. {
  48. }
  49. explicit Parser(StringView source)
  50. : m_source(source)
  51. , m_lexer(source)
  52. {
  53. }
  54. ErrorOr<Document, ParseError> parse();
  55. ErrorOr<void, ParseError> parse_with_listener(Listener&);
  56. Vector<ParseError> const& parse_error_causes() const { return m_parse_errors; }
  57. private:
  58. struct EntityReference {
  59. Name name;
  60. };
  61. ErrorOr<void, ParseError> parse_internal();
  62. void append_node(NonnullOwnPtr<Node>);
  63. void append_text(StringView, LineTrackingLexer::Position);
  64. void append_comment(StringView, LineTrackingLexer::Position);
  65. void enter_node(Node&);
  66. void leave_node();
  67. enum class ReferencePlacement {
  68. AttributeValue,
  69. Content,
  70. };
  71. ErrorOr<ByteString, ParseError> resolve_reference(EntityReference const&, ReferencePlacement);
  72. enum class Required {
  73. No,
  74. Yes,
  75. };
  76. ErrorOr<void, ParseError> skip_whitespace(Required = Required::No);
  77. ErrorOr<void, ParseError> parse_prolog();
  78. ErrorOr<void, ParseError> parse_element();
  79. ErrorOr<void, ParseError> parse_misc();
  80. ErrorOr<void, ParseError> parse_xml_decl();
  81. ErrorOr<void, ParseError> parse_doctype_decl();
  82. ErrorOr<void, ParseError> parse_version_info();
  83. ErrorOr<void, ParseError> parse_encoding_decl();
  84. ErrorOr<void, ParseError> parse_standalone_document_decl();
  85. ErrorOr<void, ParseError> parse_eq();
  86. ErrorOr<void, ParseError> parse_comment();
  87. ErrorOr<void, ParseError> parse_processing_instruction();
  88. ErrorOr<Name, ParseError> parse_processing_instruction_target();
  89. ErrorOr<Name, ParseError> parse_name();
  90. ErrorOr<NonnullOwnPtr<Node>, ParseError> parse_empty_element_tag();
  91. ErrorOr<NonnullOwnPtr<Node>, ParseError> parse_start_tag();
  92. ErrorOr<Name, ParseError> parse_end_tag();
  93. ErrorOr<void, ParseError> parse_content();
  94. ErrorOr<Attribute, ParseError> parse_attribute();
  95. ErrorOr<ByteString, ParseError> parse_attribute_value();
  96. ErrorOr<Variant<EntityReference, ByteString>, ParseError> parse_reference();
  97. ErrorOr<StringView, ParseError> parse_char_data();
  98. ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_internal_subset();
  99. ErrorOr<Optional<MarkupDeclaration>, ParseError> parse_markup_declaration();
  100. ErrorOr<Optional<ByteString>, ParseError> parse_declaration_separator();
  101. ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_external_subset_declaration();
  102. ErrorOr<ElementDeclaration, ParseError> parse_element_declaration();
  103. ErrorOr<AttributeListDeclaration, ParseError> parse_attribute_list_declaration();
  104. ErrorOr<EntityDeclaration, ParseError> parse_entity_declaration();
  105. ErrorOr<NotationDeclaration, ParseError> parse_notation_declaration();
  106. ErrorOr<Name, ParseError> parse_parameter_entity_reference();
  107. ErrorOr<ElementDeclaration::ContentSpec, ParseError> parse_content_spec();
  108. ErrorOr<AttributeListDeclaration::Definition, ParseError> parse_attribute_definition();
  109. ErrorOr<StringView, ParseError> parse_nm_token();
  110. ErrorOr<EntityDeclaration, ParseError> parse_general_entity_declaration();
  111. ErrorOr<EntityDeclaration, ParseError> parse_parameter_entity_declaration();
  112. ErrorOr<PublicID, ParseError> parse_public_id();
  113. ErrorOr<SystemID, ParseError> parse_system_id();
  114. ErrorOr<ExternalID, ParseError> parse_external_id();
  115. ErrorOr<ByteString, ParseError> parse_entity_value();
  116. ErrorOr<Name, ParseError> parse_notation_data_declaration();
  117. ErrorOr<StringView, ParseError> parse_public_id_literal();
  118. ErrorOr<StringView, ParseError> parse_system_id_literal();
  119. ErrorOr<StringView, ParseError> parse_cdata_section();
  120. ErrorOr<ByteString, ParseError> parse_attribute_value_inner(StringView disallow);
  121. ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_external_subset();
  122. ErrorOr<void, ParseError> parse_text_declaration();
  123. ErrorOr<void, ParseError> expect(StringView);
  124. template<typename Pred>
  125. requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> expect(Pred, StringView description);
  126. template<typename Pred>
  127. requires(IsCallableWithArguments<Pred, bool, char>) ErrorOr<StringView, ParseError> expect_many(Pred, StringView description);
  128. static size_t s_debug_indent_level;
  129. [[nodiscard]] auto rollback_point(SourceLocation location = SourceLocation::current())
  130. {
  131. return ArmedScopeGuard {
  132. [this, position = m_lexer.tell(), cached_position = m_lexer.cached_position(), location] {
  133. m_lexer.retreat(m_lexer.tell() - position);
  134. m_lexer.restore_cached_offset(cached_position);
  135. (void)location;
  136. dbgln_if(XML_PARSER_DEBUG, "{:->{}}FAIL @ {} -- \x1b[31m{}\x1b[0m", " ", s_debug_indent_level * 2, location, m_lexer.remaining().substring_view(0, min(16, m_lexer.tell_remaining())).replace("\n"sv, "\\n"sv, ReplaceMode::All));
  137. }
  138. };
  139. }
  140. [[nodiscard]] auto accept_rule()
  141. {
  142. return TemporaryChange { m_current_rule.accept, true };
  143. }
  144. [[nodiscard]] auto enter_rule(SourceLocation location = SourceLocation::current())
  145. {
  146. dbgln_if(XML_PARSER_DEBUG, "{:->{}}Enter {}", " ", s_debug_indent_level * 2, location);
  147. ++s_debug_indent_level;
  148. auto rule = m_current_rule;
  149. m_current_rule = { location.function_name(), false };
  150. return ScopeGuard {
  151. [location, rule, this] {
  152. m_current_rule = rule;
  153. --s_debug_indent_level;
  154. (void)location;
  155. dbgln_if(XML_PARSER_DEBUG, "{:->{}}Leave {}", " ", s_debug_indent_level * 2, location);
  156. }
  157. };
  158. }
  159. template<typename... Ts>
  160. ParseError parse_error(Ts&&... args)
  161. {
  162. auto error = ParseError { forward<Ts>(args)... };
  163. if (m_current_rule.accept) {
  164. auto rule_name = m_current_rule.rule.value_or("<?>");
  165. if (rule_name.starts_with("parse_"sv))
  166. rule_name = rule_name.substring_view(6);
  167. m_parse_errors.append({
  168. error.offset,
  169. ByteString::formatted("{}: {}", rule_name, error.error),
  170. });
  171. }
  172. return error;
  173. }
  174. StringView m_source;
  175. LineTrackingLexer m_lexer;
  176. Options m_options;
  177. Listener* m_listener { nullptr };
  178. OwnPtr<Node> m_root_node;
  179. Node* m_entered_node { nullptr };
  180. Version m_version { Version::Version11 };
  181. bool m_in_compatibility_mode { false };
  182. ByteString m_encoding;
  183. bool m_standalone { false };
  184. HashMap<Name, ByteString> m_processing_instructions;
  185. struct AcceptedRule {
  186. Optional<ByteString> rule {};
  187. bool accept { false };
  188. } m_current_rule {};
  189. Vector<ParseError> m_parse_errors;
  190. Optional<Doctype> m_doctype;
  191. };
  192. }
  193. template<>
  194. struct AK::Formatter<XML::ParseError> : public AK::Formatter<FormatString> {
  195. ErrorOr<void> format(FormatBuilder& builder, XML::ParseError const& error)
  196. {
  197. return Formatter<FormatString>::format(builder, "{} at offset {}"sv, error.error, error.offset);
  198. }
  199. };