move markup parser to markup.cpp/hpp
This commit is contained in:
parent
c2757a1d35
commit
1ca69e1eca
5 changed files with 29 additions and 393 deletions
|
@ -30,6 +30,7 @@
|
|||
#include "help/help_impl.hpp"
|
||||
#include "gettext.hpp"
|
||||
#include "log.hpp"
|
||||
#include "serialization/markup.hpp"
|
||||
#include "serialization/unicode.hpp"
|
||||
#include "serialization/string_utils.hpp"
|
||||
#include "sound.hpp"
|
||||
|
@ -249,7 +250,7 @@ std::vector<std::string> rich_label::split_in_width(const std::string &s, const
|
|||
res.push_back(s.substr(first_line.size()));
|
||||
}
|
||||
} catch (utf8::invalid_utf8_exception&) {
|
||||
throw help::parse_error (_("corrupted original file"));
|
||||
throw markup::parse_error (_("corrupted original file"));
|
||||
}
|
||||
|
||||
return res;
|
||||
|
|
|
@ -33,7 +33,6 @@
|
|||
#include "serialization/parser.hpp"
|
||||
#include "serialization/string_utils.hpp" // for split, quoted_split, etc
|
||||
#include "serialization/unicode.hpp" // for iterator
|
||||
#include "serialization/unicode_cast.hpp" // for unicode_cast
|
||||
#include "serialization/utf8_exception.hpp" // for char_t, etc
|
||||
#include "terrain/terrain.hpp" // for terrain_type
|
||||
#include "terrain/translation.hpp" // for operator==, ter_list, etc
|
||||
|
@ -380,7 +379,7 @@ topic_text& topic_text::operator=(std::shared_ptr<topic_generator> g)
|
|||
const config& topic_text::parsed_text() const
|
||||
{
|
||||
if (generator_) {
|
||||
parsed_text_ = parse_text((*generator_)());
|
||||
parsed_text_ = markup::parse_text((*generator_)());
|
||||
// This caches the result, so doesn't need the generator any more
|
||||
generator_.reset();
|
||||
}
|
||||
|
@ -1326,381 +1325,6 @@ section *find_section(section &sec, const std::string &id)
|
|||
return const_cast<section *>(find_section(const_cast<const section &>(sec), id));
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
Here's a little mini-grammar of the markup language:
|
||||
|
||||
DOCUMENT ::= (TEXT | TAG)*
|
||||
TEXT ::= ([^<&\] | ENTITY | ESCAPE)*
|
||||
ESCAPE ::= '\' [:unicode-char:]
|
||||
ENTITY ::= '&' '#' [0-9]+ ';'
|
||||
ENTITY ::= '&' '#' 'x' [0-9a-fA-F]+ ';'
|
||||
ENTITY ::= '&' NAME ';'
|
||||
TAG ::= '<' NAME ATTRIBUTE* '/' '>'
|
||||
TAG ::= '<' NAME ATTRIBUTE* '>' DOCUMENT '<' '/' NAME '>' ## NB: the names must match!
|
||||
TAG ::= '<' NAME '>' ATTRIBUTE* TEXT? '<' '/' NAME '>' ## NB: the names must match!
|
||||
ATTRIBUTE ::= NAME
|
||||
ATTRIBUTE ::= NAME '=' [^'" ]*
|
||||
ATTRIBUTE ::= NAME '=' "'" TEXT "'"
|
||||
ATTRIBUTE ::= NAME '=' '"' TEXT '"'
|
||||
NAME ::= [_0-9a-zA-Z]+
|
||||
|
||||
Notes:
|
||||
* Entities and the first two tag formats are Pango-style. The tags can be nested inside each other.
|
||||
* Escapes and the third tag format are for compatibility with the old help markup. Tags cannot be nested.
|
||||
* This mostly doesn't attempt to define the meaning of specific tags or entity names. It does however substitute numeric entities, as well as some very basic named entities: lt, gt, amp, quot, apos.
|
||||
* The definition of TEXT is left a bit nebulous, but just think of it as "non-greedy"
|
||||
* Attributes without a value are only supported in Pango-style tags
|
||||
* Some restrictions may apply beyond what the grammar specifies. For example, arbitrary named entities are not supported in attribute values (numeric ones and the 5 special ones work though).
|
||||
|
||||
------
|
||||
|
||||
The result of the parsing is represented in the format of a WML config.
|
||||
Text spans are represented as a [text] tag, and character entities as a [character_entity] tag.
|
||||
All other tags are represented by a tag of the same name.
|
||||
Any attributes on a tag become key-value pairs within the tag.
|
||||
Old-style help markup tags with text at the end put the text in a "text" key in the tag.
|
||||
The same approach is used for new-style Pango tags, but only if there are no nested tags or entities.
|
||||
If there ARE nested tags or entities, the contents of the tag is broken down into spans as subtags of the parent tag.
|
||||
Thus, a tag with content has EITHER a text attribute OR some subtags.
|
||||
|
||||
Note: Only unrecognized named entities count for the above purposes!
|
||||
Numerical entities and the special five lt, gt, amp, apos, quot are directly substituted in-place.
|
||||
|
||||
Also, text spans will be broken up on paragraph breaks (double newlines).
|
||||
This means that adjacent [text] tags should be rendered with a paragraph break between them.
|
||||
However, no paragraph break should be used when [text] is followed by something else.
|
||||
It is possible to have empty text spans in some cases, for example given a run of more than 2 newlines,
|
||||
or a character entity directly followed by a paragraph break.
|
||||
|
||||
*/
|
||||
/**
 * Parses one character entity, starting at the '&' that @a beg points to.
 *
 * Accepted forms are `&name;`, `&#digits;` (decimal) and `&#xhexdigits;` (hexadecimal).
 * On return @a beg points at the terminating ';', or equals @a end when the entity
 * was not terminated; callers are expected to check for that case themselves.
 *
 * @param beg  Iterator at the '&'; advanced while parsing.
 * @param end  End of the input.
 * @returns    A config with a "name" key for named entities and a "code_point" key for
 *             numeric entities and for the five special names lt, gt, apos, quot, amp.
 * @throws parse_error  On an unexpected character, a numeric entity without digits,
 *                      or a code point that does not fit in an int.
 */
static config parse_entity(std::string::const_iterator& beg, std::string::const_iterator end)
{
	// The <cctype> classifiers are undefined for negative arguments, and plain char
	// may be negative for the bytes of a multi-byte UTF-8 sequence.
	const auto to_uchar = [](char c) { return static_cast<unsigned char>(c); };
	const auto is_name_char = [&](char c) { return isalnum(to_uchar(c)) || c == '_'; };

	config entity;
	std::stringstream s;
	enum { UNKNOWN, NAMED, HEX, DECIMAL } type = UNKNOWN;
	bool has_digits = false;
	assert(beg != end && *beg == '&');
	++beg;
	for(; beg != end && *beg != ';'; ++beg) {
		const char c = *beg;
		switch(type) {
		case UNKNOWN:
			if(c == '#') {
				type = DECIMAL;
			} else if(is_name_char(c)) {
				type = NAMED;
				s << c;
			} else {
				throw parse_error("invalid entity: expected '#' or a name after '&'");
			}
			break;
		case NAMED:
			// Names may contain underscores anywhere, not just in first position.
			if(!is_name_char(c)) {
				throw parse_error("invalid entity: unexpected character in entity name");
			}
			s << c;
			break;
		case DECIMAL:
			// 'x' only selects hexadecimal when it directly follows the '#'.
			if(c == 'x' && !has_digits) {
				type = HEX;
			} else if(isdigit(to_uchar(c))) {
				s << c;
				has_digits = true;
			} else {
				throw parse_error("invalid entity: unexpected character in decimal entity");
			}
			break;
		case HEX:
			if(isxdigit(to_uchar(c))) {
				s << c;
				has_digits = true;
			} else {
				throw parse_error("invalid entity: unexpected character in hexadecimal entity");
			}
			break;
		}
	}
	if(type == NAMED) {
		std::string name = s.str();
		entity["name"] = name;
		if(name == "lt") {
			entity["code_point"] = '<';
		} else if(name == "gt") {
			entity["code_point"] = '>';
		} else if(name == "apos") {
			entity["code_point"] = '\'';
		} else if(name == "quot") {
			entity["code_point"] = '"';
		} else if(name == "amp") {
			entity["code_point"] = '&';
		}
	} else {
		// Covers "&;", "&#;" and "&#x;", which would otherwise silently become code point 0.
		if(!has_digits) {
			throw parse_error("invalid entity: missing code point");
		}
		s.seekg(0);
		if(type == HEX) {
			s >> std::hex;
		}
		int n = 0;
		if(!(s >> n)) {
			throw parse_error("invalid entity: code point out of range");
		}
		entity["code_point"] = n;
	}
	return entity;
}
|
||||
|
||||
/**
 * Resolves a backslash escape.
 *
 * @a beg must point at the backslash. When another character follows, @a beg is
 * advanced onto it and that character is returned as a literal. A backslash that
 * is the very last character of the input is returned as-is and @a beg is left
 * untouched.
 */
static char parse_escape(std::string::const_iterator& beg, std::string::const_iterator end)
{
	assert(*beg == '\\');
	const auto following = beg + 1;
	if(following == end) {
		// Trailing backslash: nothing to escape, so it stands for itself.
		return *beg;
	}
	beg = following;
	return *beg;
}
|
||||
|
||||
/**
 * Parses a run of text up to (but not including) @a close or the end of input.
 *
 * In practice @a close is one of `<`, `'` or `"`. The result holds one or more [text]
 * children and, for entity names that are not recognized, [character_entity] children.
 * Entities that resolve to a code point are folded straight into the surrounding text.
 * A double newline ends the current [text] child and starts a new one; a single
 * newline is kept as part of the text.
 *
 * @param beg    Current position; on return points at @a close or equals @a end.
 * @param end    End of the input.
 * @param close  Character that terminates the text run.
 * @throws parse_error  If the input ends inside an entity, or the entity is malformed.
 */
static config parse_text_until(std::string::const_iterator& beg, std::string::const_iterator end, char close)
{
	std::ostringstream s;
	bool saw_newline = false;
	config res;
	for(; beg != end && *beg != close; ++beg) {
		if(*beg == '\n') {
			if(!saw_newline) {
				// Hold the newline back until we know whether it starts a paragraph break.
				saw_newline = true;
				continue;
			}
			// Second consecutive newline: paragraph break, so close the current span.
			res.add_child("text", config("text", s.str()));
			s.str("");
			saw_newline = false;
			continue;
		}
		// A single pending newline belongs to the text no matter what follows it.
		// Previously it was lost when the next character was '&' or '\'.
		if(saw_newline) {
			s << '\n';
			saw_newline = false;
		}
		if(*beg == '&') {
			auto entity = parse_entity(beg, end);
			if(beg == end) {
				throw parse_error("unexpected eos after entity");
			}
			if(entity.has_attribute("code_point")) {
				s << unicode_cast<std::string>(entity["code_point"].to_int());
			} else {
				// TODO: Adding the text here seems wrong in the case that the stream BEGINS with an entity...
				res.add_child("text", config("text", s.str()));
				res.add_child("character_entity", entity);
				s.str("");
			}
		} else if(*beg == '\\') {
			s << parse_escape(beg, end);
		} else {
			s << *beg;
		}
	}
	// If the span ended in a newline, preserve it
	if(saw_newline) {
		s << '\n';
	}
	res.add_child("text", config("text", s.str()));
	assert(beg == end || *beg == close);
	return res;
}
|
||||
|
||||
/**
 * Consumes the longest run of name characters ([_0-9a-zA-Z]) starting at @a beg.
 *
 * @param beg  Current position; on return points at the first non-name character,
 *             or equals @a end.
 * @param end  End of the input.
 * @returns    The consumed name, which is empty if @a beg was not at a name character.
 */
static std::string parse_name(std::string::const_iterator& beg, std::string::const_iterator end)
{
	const auto name_start = beg;
	// The cast matters: isalnum is undefined for negative values, and plain char
	// may be negative for the bytes of a multi-byte UTF-8 sequence.
	while(beg != end && (isalnum(static_cast<unsigned char>(*beg)) || *beg == '_')) {
		++beg;
	}
	return std::string(name_start, beg);
}
|
||||
|
||||
static std::pair<std::string, std::string> parse_attribute(std::string::const_iterator& beg, std::string::const_iterator end, bool allow_empty)
|
||||
{
|
||||
std::string attr = parse_name(beg, end), value;
|
||||
if(attr.empty()) {
|
||||
throw parse_error("missing attribute name");
|
||||
}
|
||||
while(isspace(*beg)) ++beg;
|
||||
if(*beg != '=') {
|
||||
if(allow_empty) {
|
||||
// The caller expects beg to point to the last character of the attribute upon return.
|
||||
// But in this path, we're now pointing to the character AFTER that.
|
||||
--beg;
|
||||
return {attr, value};
|
||||
} else throw parse_error("attribute missing value in old-style tag");
|
||||
}
|
||||
++beg;
|
||||
while(isspace(*beg)) ++beg;
|
||||
if(*beg == '\'' || *beg == '"') {
|
||||
config res = parse_text_until(beg, end, *beg++);
|
||||
if(res.has_child("character_entity")) {
|
||||
throw parse_error("unsupported entity in attribute value");
|
||||
} else if(res.all_children_count() > 1) {
|
||||
throw parse_error("paragraph break in attribute value");
|
||||
}
|
||||
if(auto t = res.optional_child("text")) {
|
||||
value = t["text"].str();
|
||||
}
|
||||
} else {
|
||||
std::ostringstream s;
|
||||
bool found_slash = false;
|
||||
for(; beg != end && *beg != '>' && *beg != '<' && !isspace(*beg); ++beg) {
|
||||
if(*beg == '&') {
|
||||
auto entity = parse_entity(beg, end);
|
||||
if(beg == end) {
|
||||
throw parse_error("unexpected eos after entity");
|
||||
}
|
||||
if(entity.has_attribute("code_point")) {
|
||||
s << unicode_cast<std::string>(entity["code_point"].to_int());
|
||||
} else {
|
||||
throw parse_error("unsupported entity in attribute value");
|
||||
}
|
||||
} else if(*beg == '\\') {
|
||||
s << parse_escape(beg, end);
|
||||
} else if(*beg == '/') {
|
||||
found_slash = true;
|
||||
} else {
|
||||
if(found_slash) {
|
||||
s << '/';
|
||||
found_slash = false;
|
||||
}
|
||||
s << *beg;
|
||||
}
|
||||
}
|
||||
value = s.str();
|
||||
// The caller expects beg to point to the last character of the attribute upon return.
|
||||
// But in this path, we're now pointing to the character AFTER that.
|
||||
--beg;
|
||||
if(found_slash) --beg;
|
||||
}
|
||||
return {attr, value};
|
||||
}
|
||||
|
||||
static void check_closing_tag(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match)
|
||||
{
|
||||
size_t remaining = end - beg;
|
||||
assert(remaining >= 2 && *beg == '<' && *(beg + 1) == '/');
|
||||
if(remaining < match.size() + 3) {
|
||||
throw parse_error("Unexpected eos in closing tag");
|
||||
}
|
||||
beg += 2;
|
||||
if(!std::equal(match.begin(), match.end(), beg)) {
|
||||
throw parse_error("Mismatched closing tag");
|
||||
}
|
||||
beg += match.size();
|
||||
if(*beg != '>') {
|
||||
throw parse_error("Unterminated closing tag");
|
||||
}
|
||||
++beg;
|
||||
}
|
||||
|
||||
static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end);
|
||||
static config parse_tag_contents(std::string::const_iterator& beg, std::string::const_iterator end, std::string_view match, bool check_for_attributes)
|
||||
{
|
||||
assert(*beg == '>');
|
||||
++beg;
|
||||
// This also parses the matching closing tag!
|
||||
config res;
|
||||
for(; check_for_attributes && beg != end && *beg != '<'; ++beg) {
|
||||
if(isspace(*beg)) continue;
|
||||
auto save_beg = beg;
|
||||
try {
|
||||
auto [key, val] = parse_attribute(beg, end, false);
|
||||
res[key] = val;
|
||||
} catch(parse_error&) {
|
||||
beg = save_beg;
|
||||
while(beg != end && isspace(*beg)) ++beg;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(res.has_attribute("text")) {
|
||||
if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
|
||||
throw parse_error("Extra text at the end of old-style tag with explicit 'text' attribute");
|
||||
}
|
||||
check_closing_tag(beg, end, match);
|
||||
return res;
|
||||
} else if(res.attribute_count() > 0) {
|
||||
config text = parse_text_until(beg, end, '<');
|
||||
if(beg == end || *beg != '<' || (beg + 1) == end || *(beg + 1) != '/') {
|
||||
throw parse_error("Extra text at the end of old-style tag with explicit 'text' attribute");
|
||||
}
|
||||
if(text.all_children_count() == 1 && text.has_child("text")) {
|
||||
res["text"] = text.mandatory_child("text")["text"];
|
||||
} else {
|
||||
res.append_children(text);
|
||||
}
|
||||
check_closing_tag(beg, end, match);
|
||||
return res;
|
||||
}
|
||||
while(true) {
|
||||
config text = parse_text_until(beg, end, '<');
|
||||
if(beg == end || beg + 1 == end) {
|
||||
throw parse_error("Missing closing tag");
|
||||
}
|
||||
res.append_children(text);
|
||||
if(*(beg + 1) == '/') {
|
||||
check_closing_tag(beg, end, match);
|
||||
break;
|
||||
}
|
||||
auto [tag, contents] = parse_tag(beg, end);
|
||||
res.add_child(tag, contents);
|
||||
}
|
||||
if(res.all_children_count() == 1 && res.has_child("text")) {
|
||||
return res.mandatory_child("text");
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
static std::pair<std::string, config> parse_tag(std::string::const_iterator& beg, std::string::const_iterator end)
|
||||
{
|
||||
assert(*beg == '<');
|
||||
++beg;
|
||||
std::string tag_name = parse_name(beg, end);
|
||||
if(tag_name.empty()) {
|
||||
throw parse_error("missing tag name");
|
||||
}
|
||||
bool auto_closed = false;
|
||||
config elem;
|
||||
for(; beg != end && *beg != '>'; ++beg) {
|
||||
if(isspace(*beg)) continue;
|
||||
if(*beg == '/' && (beg + 1) != end && *(beg + 1) == '>') {
|
||||
auto_closed = true;
|
||||
} else if(isalnum(*beg) || *beg == '_') {
|
||||
const auto& [key, value] = parse_attribute(beg, end, true);
|
||||
if(beg == end) {
|
||||
throw parse_error("unexpected eos following attribute");
|
||||
}
|
||||
elem[key] = value;
|
||||
}
|
||||
}
|
||||
if(auto_closed) {
|
||||
assert(*beg == '>');
|
||||
++beg;
|
||||
} else {
|
||||
config contents = parse_tag_contents(beg, end, tag_name, elem.attribute_count() == 0);
|
||||
if(contents.all_children_count() == 0 && contents.attribute_count() == 1 && contents.has_attribute("text")) {
|
||||
elem["text"] = contents["text"];
|
||||
} else {
|
||||
elem.append(contents);
|
||||
}
|
||||
}
|
||||
return {tag_name, elem};
|
||||
}
|
||||
|
||||
/**
 * Parses a whole markup string into a config.
 *
 * The input is walked front to back: each tag becomes a child named after the tag,
 * and each run of text between tags contributes the children produced for that run.
 */
config parse_text(const std::string &text)
{
	config parsed;
	auto cursor = text.begin();
	const auto stop = text.end();
	while(cursor != stop) {
		if(*cursor != '<') {
			// Plain text up to the next tag (or the end of the input).
			config span = parse_text_until(cursor, stop, '<');
			parsed.append_children(span);
			continue;
		}
		auto [tag, contents] = parse_tag(cursor, stop);
		parsed.add_child(tag, contents);
	}
	return parsed;
}
|
||||
|
||||
std::string remove_first_space(const std::string& text)
|
||||
{
|
||||
if (text.length() > 0 && text[0] == ' ') {
|
||||
|
|
|
@ -211,12 +211,6 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
/** Thrown when the help system fails to parse something. */
|
||||
struct parse_error : public game::error
|
||||
{
|
||||
parse_error(const std::string& msg) : game::error(msg) {}
|
||||
};
|
||||
|
||||
// Generator stuff below. Maybe move to a separate file? This one is
|
||||
// getting crowded. Dunno if much more is needed though so I'll wait and
|
||||
// see.
|
||||
|
@ -228,6 +222,12 @@ std::string generate_topic_text(const std::string &generator, const config *help
|
|||
std::string generate_contents_links(const std::string& section_name, config const *help_cfg);
|
||||
std::string generate_contents_links(const section &sec);
|
||||
|
||||
/** Thrown when the help system fails to parse something. */
|
||||
struct parse_error : public game::error
|
||||
{
|
||||
parse_error(const std::string& msg) : game::error(msg) {}
|
||||
};
|
||||
|
||||
/**
|
||||
* return a hyperlink with the unit's name and pointing to the unit page
|
||||
* return empty string if this unit is hidden. If not yet discovered add the (?) suffix
|
||||
|
@ -306,13 +306,6 @@ const topic *find_topic(const section &sec, const std::string &id);
|
|||
const section *find_section(const section &sec, const std::string &id);
|
||||
section *find_section(section &sec, const std::string &id);
|
||||
|
||||
/**
|
||||
* Parse a xml style marked up text string. Return a config with the different parts of the
|
||||
* text. Each markup item is a separate part while the text between
|
||||
* markups are separate parts.
|
||||
*/
|
||||
config parse_text(const std::string &text);
|
||||
|
||||
std::string remove_first_space(const std::string& text);
|
||||
|
||||
/** Return the first word in s, not removing any spaces in the start of it. */
|
||||
|
|
|
@ -12,10 +12,11 @@
|
|||
See the COPYING file for more details.
|
||||
*/
|
||||
|
||||
#include "serialization/markup.hpp"
|
||||
|
||||
#include "game_config.hpp"
|
||||
#include "gettext.hpp"
|
||||
#include "serialization/markup.hpp"
|
||||
#include "serialization/unicode_cast.hpp" // for unicode_cast
|
||||
|
||||
namespace markup {
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "color.hpp"
|
||||
|
||||
#include "config.hpp"
|
||||
// This file isn't needed by any of these functions, but this allows any
|
||||
// standard color to be passed to span_color without an extra include.
|
||||
#include "font/standard_colors.hpp"
|
||||
|
@ -126,4 +126,21 @@ std::string img(const std::string& src, const std::string& align = "left", const
|
|||
|
||||
std::string make_link(const std::string& text, const std::string& dst);
|
||||
|
||||
//
|
||||
// Markup Parser
|
||||
//
|
||||
|
||||
/** Thrown when the help system fails to parse something. */
|
||||
struct parse_error : public game::error
|
||||
{
|
||||
parse_error(const std::string& msg) : game::error(msg) {}
|
||||
};
|
||||
|
||||
/**
|
||||
* Parse a xml style marked up text string. Return a config with the different parts of the
|
||||
* text. Each markup item is a separate part while the text between
|
||||
* markups are separate parts.
|
||||
*/
|
||||
config parse_text(const std::string &text);
|
||||
|
||||
} //end namespace markup
|
||||
|
|
Loading…
Add table
Reference in a new issue