move markup parser to markup.cpp/hpp

2024-10-09 15:30:05 +05:30 · 2024-10-09 15:30:05 +05:30 · 1ca69e1eca
commit 1ca69e1eca
parent c2757a1d35
5 changed files with 29 additions and 393 deletions
--- a/src/gui/widgets/rich_label.cpp
+++ b/src/gui/widgets/rich_label.cpp
@ -30,6 +30,7 @@
 #include "help/help_impl.hpp"
 #include "gettext.hpp"
 #include "log.hpp"
+#include "serialization/markup.hpp"
 #include "serialization/unicode.hpp"
 #include "serialization/string_utils.hpp"
 #include "sound.hpp"
@ -249,7 +250,7 @@ std::vector<std::string> rich_label::split_in_width(const std::string &s, const
 			res.push_back(s.substr(first_line.size()));
 		}
 	} catch (utf8::invalid_utf8_exception&) {
-		throw help::parse_error (_("corrupted original file"));
+		throw markup::parse_error (_("corrupted original file"));
 	}

 	return res;
--- a/src/help/help_impl.cpp
+++ b/src/help/help_impl.cpp
@ -33,7 +33,6 @@
 #include "serialization/parser.hpp"
 #include "serialization/string_utils.hpp"  // for split, quoted_split, etc
 #include "serialization/unicode.hpp"    // for iterator
-#include "serialization/unicode_cast.hpp"  // for unicode_cast
 #include "serialization/utf8_exception.hpp"  // for char_t, etc
 #include "terrain/terrain.hpp"          // for terrain_type
 #include "terrain/translation.hpp"      // for operator==, ter_list, etc
@ -380,7 +379,7 @@ topic_text& topic_text::operator=(std::shared_ptr<topic_generator> g)
 const config& topic_text::parsed_text() const
 {
 	if (generator_) {
-		parsed_text_ = parse_text((*generator_)());
+		parsed_text_ = markup::parse_text((*generator_)());
 		// This caches the result, so doesn't need the generator any more
 		generator_.reset();
 	}
@ -1326,381 +1325,6 @@ section *find_section(section &sec, const std::string &id)
 	return const_cast<section *>(find_section(const_cast<const section &>(sec), id));
 }

-/*
-
-Here's a little mini-grammar of the markup language:
-
-DOCUMENT ::= (TEXT | TAG)*
-TEXT ::= ([^<&\] | ENTITY | ESCAPE)*
-ESCAPE ::= '\' [:unicode-char:]
-ENTITY ::= '&' '#' [0-9]+ ';'
-ENTITY ::= '&' 'x' [0-9a-fA-F]+ ';'
-ENTITY ::= '&' NAME ';'
-TAG ::= '<' NAME ATTRIBUTE* '/' '>'
-TAG ::= '<' NAME ATTRIBUTE* '>' DOCUMENT '<' '/' NAME '>' ## NB: the names must match!
-TAG ::= '<' NAME '>' ATTRIBUTE* TEXT? '<' '/' NAME '>' ## NB: the names must match!
-ATTRIBUTE ::= NAME
-ATTRIBUTE ::= NAME '=' [^'" ]*
-ATTRIBUTE ::= NAME '=' "'" TEXT "'"
-ATTRIBUTE ::= NAME '=' '"' TEXT '"'
-NAME ::= [_0-9a-zA-Z]+
-
-Notes:
-* Entities and the first two tag formats are Pango-style. The tags can be nested inside each other.
-* Escapes and the third tag format are for compatibility with the old help markup. Tags cannot be nested.
-* This mostly doesn't attempt to define the meaning of specific tags or entity names. It does however substitute numeric entities, as well as some very basic named entities: lt, gt, amp, quot, apos.
-* The definition of TEXT is left a bit nebulous, but just think of it as "non-greedy"
-* Attributes without a value are only supported in Pango-style tags
-* Some restrictions may apply beyond what the grammar specifies. For example, arbitrary named entities are not supported in attribute values (numeric ones and the 5 special ones work though).
-
------
-
-The result of the parsing is represented in the format of a WML config.
-Text spans are represented as a [text] tag, and character entities as a [character_entity] tag.
-All other tags are represented by a tag of the same name.
-Any attributes on a tag become key-value pairs within the tag.
-Old-style help markup tags with text at the end put the text in a "text" key in the tag.
-The same approach is used for new-style Pango tags, but only if there are no nested tags or entities.
-If there ARE nested tags or entities, the contents of the tag is broken down into spans as subtags of the parent tag.
-Thus, a tag with content has EITHER a text attribute OR some subtags.
-
-Note: Only unrecognized named entities count for the above purposes!
-Numerical entities and the special five lt, gt, amp, apos, quot are directly substituted in-place.
-
-Also, text spans will be broken up on paragraph breaks (double newlines).
-This means that adjacent [text] tags should be rendered with a paragraph break between them.
-However, no paragraph break should be used when [text] is followed by something else.
-It is possible to have empty text spans in some cases, for example given a run of more than 2 newlines,
-or a character entity directly followed by a paragraph break.
-
-*/
-static config parse_entity(std::string::const_iterator& beg, std::string::const_iterator end)
-{
-	config entity;
-	std::stringstream s;
-	enum { UNKNOWN, NAMED, HEX, DECIMAL } type = UNKNOWN;
-	assert(*beg == '&');
-	++beg;
-	for(; beg != end && *beg != ';'; ++beg) {
-		switch(type) {
-		case UNKNOWN:
-			if(*beg == '#') {
-				type = DECIMAL;
-			} else if(isalnum(*beg) || *beg == '_') {
-				type = NAMED;
-				s << *beg;
-			} else {
-				throw parse_error("TODO");
-			}
-			break;
-		case NAMED:
-			if(!isalnum(*beg)) {
-				throw parse_error("TODO");
-			}
-			s << *beg;
-			break;
-		case DECIMAL:
-			if(*beg == 'x') {
-				type = HEX;
-			} else if(isdigit(*beg)) {
-				s << *beg;
-			} else {
-				throw parse_error("TODO");
-			}
-			break;
-		case HEX:
-			if(isxdigit(*beg)) {
-				s << *beg;
-			} else {
-				throw parse_error("TODO");
-			}
-			break;
-		}
-	}
-	if(type == NAMED) {
-		std::string name = s.str();
-		entity["name"] = name;
-		if(name == "lt") {
-			entity["code_point"] = '<';
-		} else if(name == "gt") {
-			entity["code_point"] = '>';
-		} else if(name == "apos") {
-			entity["code_point"] = '\'';
-		} else if(name == "quot") {
-			entity["code_point"] = '"';
-		} else if(name == "amp") {
-			entity["code_point"] = '&';
-		}
-	} else {
-		s.seekg(0);
-		if(type == HEX) {
-			s >> std::hex;
-		}
-		int n;
-		s >> n;
-		entity["code_point"] = n;
-	}
-	return entity;
-}
-
-static char parse_escape(std::string::const_iterator& beg, std::string::const_iterator end)
-{
-	assert(*beg == '\\');
-	// An escape at the end of stream is just treated as a literal.
-	// Otherwise, take the next character as a literal and be done with it.
-	if((beg + 1) != end) {
-		++beg;
-	}
-	return *beg;
-}
-
-static config parse_text_until(std::string::const_iterator& beg, std::string::const_iterator end, char close)
-{
-	// In practice, close will be one of < ' "
-	// Parsing will go until either close or eos, and will emit one or more text and character_entity tags.
-	// However, recognized character entities will be collapsed into the text tags.
-	std::ostringstream s;
-	bool saw_newline = false;
-	config res;
-	for(; beg != end && *beg != close; ++beg) {
-		if(*beg == '&') {
-			auto entity = parse_entity(beg, end);
-			if(beg == end) {
-				throw parse_error("unexpected eos after entity");
-			}
-			if(entity.has_attribute("code_point")) {
-				s << unicode_cast<std::string>(entity["code_point"].to_int());
-			} else {
-				// TODO: Adding the text here seems wrong in the case that the stream BEGINS with an entity...
-				res.add_child("text", config("text", s.str()));
-				res.add_child("character_entity", entity);
-				s.str("");
-			}
-		} else if(*beg == '\\') {
-			s << parse_escape(beg, end);
-		} else if(*beg == '\n') {
-			if(saw_newline) {
-				res.add_child("text", config("text", s.str()));
-				s.str("");
-			} else {
-				saw_newline = true;
-				continue;
-			}
-		} else {
-			if(saw_newline) {
-				s << '\n';
-			}
-			s << *beg;
-		}
-		saw_newline = false;
-	}
-	// If the span ended in a newline, preserve it
-	if(saw_newline) {
-		s << '\n';
-	}
-	res.add_child("text", config("text", s.str()));
-	assert(beg == end || *beg == close);
-	return res;
-}
-
-static std::string parse_name(std::string::const_iterator& beg, std::string::const_iterator end)
-{
-	std::ostringstream s;
-	for(; beg != end && (isalnum(*beg) || *beg == '_'); ++beg) {
-		s << *beg;
-	}
-	return s.str();
-}
-
-static std::pair<std::string, std::string> parse_attribute(std::string::const_iterator& beg, std::string::const_iterator end, bool allow_empty)
-{
-	std::string attr = parse_name(beg, end), value;
-	if(attr.empty()) {
-		throw parse_error("missing attribute name");
-	}
-	while(isspace(*beg)) ++beg;
-	if(*beg != '=') {
-		if(allow_empty) {
-			// The caller expects beg to point to the last character of the attribute upon return.
-			// But in this path, we're now pointing to the character AFTER that.
-			--beg;
-			return {attr, value};
-		} else throw parse_error("attribute missing value in old-style tag");
-	}
-	++beg;
-	while(isspace(*beg)) ++beg;
-	if(*beg == '\'' || *beg == '"') {
-		config res = parse_text_until(beg, end, *beg++);
-		if(res.has_child("character_entity")) {
-			throw parse_error("unsupported entity in attribute value");
-		} else if(res.all_children_count() > 1) {
-			throw parse_error("paragraph break in attribute value");
-		}
-		if(auto t = res.optional_child("text")) {
-			value = t["text"].str();
-		}
-	} else {
-		std::ostringstream s;
-		bool found_slash = false;
-		for(; beg != end && *beg != '>' && *beg != '<' && !isspace(*beg); ++beg) {
-			if(*beg == '&') {
-				auto entity = parse_entity(beg, end);
-				if(beg == end) {
-					throw parse_error("unexpected eos after entity");
-				}
-				if(entity.has_attribute("code_point")) {
-					s << unicode_cast<std::string>(entity["code_point"].to_int());
-				} else {
-					throw parse_error("unsupported entity in attribute value");
-				}
-			} else if(*beg == '\\') {
-				s << parse_escape(beg, end);
-			} else if(*beg == '/') {
-				found_slash = true;
-			} else {
-				if(found_slash) {
-					s << '/';
-					found_slash = false;
-				}
-				s << *beg;
-			}
-		}
-		value = s.str();
-		// The caller expects beg to point to the last character of the attribute upon return.
-		// But in this path, we're now pointing to the character AFTER that.
-		--beg;
-		if(found_slash) --beg;
-	}
-	return {attr, value};
-}
-
-static void check_closing_tag(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match)
-{
-	size_t remaining = end - beg;
-	assert(remaining >= 2 && *beg == '<' && *(beg + 1) == '/');
-	if(remaining < match.size() + 3) {
-		throw parse_error("Unexpected eos in closing tag");
-	}
-	beg += 2;
-	if(!std::equal(match.begin(), match.end(), beg)) {
-		throw parse_error("Mismatched closing tag");
-	}
-	beg += match.size();
-	if(*beg != '>') {
-		throw parse_error("Unterminated closing tag");
-	}
-	++beg;
-}
-
-static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end);
-static config parse_tag_contents(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
-{
-	assert(*beg == '>');
-	++beg;
-	// This also parses the matching closing tag!
-	config res;
-	for(; check_for_attributes && beg != end && *beg != '<'; ++beg) {
-		if(isspace(*beg)) continue;
-		auto save_beg = beg;
-		try {
-			auto [key, val] = parse_attribute(beg, end, false);
-			res[key] = val;
-		} catch(parse_error&) {
-			beg = save_beg;
-			while(beg != end && isspace(*beg)) ++beg;
-			break;
-		}
-	}
-	if(res.has_attribute("text")) {
-		if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
-			throw parse_error("Extra text at the end of old-style tag with explicit 'text' attribute");
-		}
-		check_closing_tag(beg, end, match);
-		return res;
-	} else if(res.attribute_count() > 0) {
-		config text = parse_text_until(beg, end, '<');
-		if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
-			throw parse_error("Extra text at the end of old-style tag with explicit 'text' attribute");
-		}
-		if(text.all_children_count() == 1 && text.has_child("text")) {
-			res["text"] = text.mandatory_child("text")["text"];
-		} else {
-			res.append_children(text);
-		}
-		check_closing_tag(beg, end, match);
-		return res;
-	}
-	while(true) {
-		config text = parse_text_until(beg, end, '<');
-		if(beg == end || beg + 1 == end) {
-			throw parse_error("Missing closing tag");
-		}
-		res.append_children(text);
-		if(*(beg + 1) == '/') {
-			check_closing_tag(beg, end, match);
-			break;
-		}
-		auto [tag, contents] = parse_tag(beg, end);
-		res.add_child(tag, contents);
-	}
-	if(res.all_children_count() == 1 && res.has_child("text")) {
-		return res.mandatory_child("text");
-	}
-	return res;
-}
-
-static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end)
-{
-	assert(*beg == '<');
-	++beg;
-	std::string tag_name = parse_name(beg, end);
-	if(tag_name.empty()) {
-		throw parse_error("missing tag name");
-	}
-	bool auto_closed = false;
-	config elem;
-	for(; beg != end && *beg != '>'; ++beg) {
-		if(isspace(*beg)) continue;
-		if(*beg == '/' && (beg + 1) != end && *(beg + 1) == '>') {
-			auto_closed = true;
-		} else if(isalnum(*beg) || *beg == '_') {
-			const auto& [key, value] = parse_attribute(beg, end, true);
-			if(beg == end) {
-				throw parse_error("unexpected eos following attribute");
-			}
-			elem[key] = value;
-		}
-	}
-	if(auto_closed) {
-		assert(*beg == '>');
-		++beg;
-	} else {
-		config contents = parse_tag_contents(beg, end, tag_name, elem.attribute_count() == 0);
-		if(contents.all_children_count() == 0 && contents.attribute_count() == 1 && contents.has_attribute("text")) {
-			elem["text"] = contents["text"];
-		} else {
-			elem.append(contents);
-		}
-	}
-	return {tag_name, elem};
-}
-
-config parse_text(const std::string &text)
-{
-	config res;
-	auto beg = text.begin(), end = text.end();
-	while(beg != end) {
-		if(*beg == '<') {
-			auto [tag, contents] = parse_tag(beg, end);
-			res.add_child(tag, contents);
-		} else {
-			config text = parse_text_until(beg, end, '<');
-			res.append_children(text);
-		}
-	}
-	return res;
-}
-
 std::string remove_first_space(const std::string& text)
 {
 	if (text.length() > 0 && text[0] == ' ') {
--- a/src/help/help_impl.hpp
+++ b/src/help/help_impl.hpp
@ -211,12 +211,6 @@ public:
 	}
 };

-/** Thrown when the help system fails to parse something. */
-struct parse_error : public game::error
-{
-	parse_error(const std::string& msg) : game::error(msg) {}
-};
-
 // Generator stuff below. Maybe move to a separate file? This one is
 // getting crowded. Dunno if much more is needed though so I'll wait and
 // see.
@ -228,6 +222,12 @@ std::string generate_topic_text(const std::string &generator, const config *help
 std::string generate_contents_links(const std::string& section_name, config const *help_cfg);
 std::string generate_contents_links(const section &sec);

+/** Thrown when the help system fails to parse something. */
+struct parse_error : public game::error
+{
+	parse_error(const std::string& msg) : game::error(msg) {}
+};
+
 /**
 * return a hyperlink with the unit's name and pointing to the unit page
 * return empty string if this unit is hidden. If not yet discovered add the (?) suffix
@ -306,13 +306,6 @@ const topic *find_topic(const section &sec, const std::string &id);
 const section *find_section(const section &sec, const std::string &id);
 section *find_section(section &sec, const std::string &id);

-/**
- * Parse a xml style marked up text string. Return a config with the different parts of the
- * text. Each markup item is a separate part while the text between
- * markups are separate parts.
- */
-config parse_text(const std::string &text);
-
 std::string remove_first_space(const std::string& text);

 /** Return the first word in s, not removing any spaces in the start of it. */
--- a/src/serialization/markup.cpp
+++ b/src/serialization/markup.cpp
@ -12,10 +12,11 @@
 	See the COPYING file for more details.
 */

-#include "serialization/markup.hpp"

 #include "game_config.hpp"
 #include "gettext.hpp"
+#include "serialization/markup.hpp"
+#include "serialization/unicode_cast.hpp"  // for unicode_cast

 namespace markup {

--- a/src/serialization/markup.hpp
+++ b/src/serialization/markup.hpp
@ -15,7 +15,7 @@
 #pragma once

 #include "color.hpp"
-
+#include "config.hpp"
 // This file isn't needed by any of these functions, but this allows any
 // standard color to be passed to span_color without an extra include.
 #include "font/standard_colors.hpp"
@ -126,4 +126,21 @@ std::string img(const std::string& src, const std::string& align = "left", const

 std::string make_link(const std::string& text, const std::string& dst);

+//
+// Markup Parser
+//
+
+/** Exception type of the markup parser (derives from game::error). NOTE(review): distinct type from help::parse_error; confirm catch sites expect this one. */
+struct parse_error : public game::error
+{
+	parse_error(const std::string& msg) : game::error(msg) {}
+};
+
+/**
+ * Parses an XML-style marked-up text string and returns a config holding the
+ * different parts of the text. Each markup item is a separate part, and each
+ * run of text between markup items is a separate part as well.
+ */
+config parse_text(const std::string &text);
+
 } //end namespace markup