Clean up wmlparser3.py a bit, fix json and xml output.

2015-09-20 14:52:45 -04:00 · 2015-09-20 14:52:45 -04:00 · 4c034d7705
commit 4c034d7705
parent 00d96b3016
3 changed files with 178 additions and 94 deletions
--- a/data/tools/unit_tree/helpers.py
+++ b/data/tools/unit_tree/helpers.py
@ -161,7 +161,7 @@ class WesnothList:
        self.era_lookup = {}
        self.campaign_lookup = {}
        self.parser = wmlparser3.Parser(wesnoth_exe, config_dir,
-            data_dir, no_preprocess = False)
+            data_dir)


    def add_terrains(self):
@ -184,7 +184,7 @@ class WesnothList:
        self.languages_found = {}

        parser = wmlparser3.Parser(options.wesnoth, options.config_dir,
-            options.data_dir, no_preprocess = False)
+            options.data_dir)
        parser.parse_text("{languages}")

        for locale in parser.get_all(tag="locale"):
--- a/data/tools/wesnoth/wmlparser3.py
+++ b/data/tools/wesnoth/wmlparser3.py
@ -3,7 +3,44 @@

 """
 This parser uses the --preprocess option of wesnoth so a working
-wesnoth executable must be available at runtime.
+wesnoth executable must be available at runtime if the WML to parse
+contains preprocessing directives.
+
+Pure WML can be parsed as is.
+
+For example:
+
+    wml = ""
+    [unit]
+        id=elve
+        name=Elve
+        [abilities]
+            [damage]
+                id=Ensnare
+            [/damage]
+        [/abilities]
+    [/unit]
+    ""
+
+    p = Parser()
+    cfg = p.parse_text(wml)
+
+    for unit in cfg.get_all(tag = "unit"):
+        print(unit.get_text_val("id"))
+        print(unit.get_text_val("name"))
+        for abilities in unit.get_all(tag = "abilities"):
+            for ability in abilities.get_all(tag = ""):
+                print(ability.get_name())
+                print(ability.get_text_val("id"))
+
+Because no preprocessing is required, we did not have to pass the
+location of the wesnoth executable to Parser.
+
+The get_all method always returns a list of matching tags or
+attributes.
+
+The get_name method returns the name of a tag or attribute, and the
+get_text_val method queries the value of an attribute of a tag.
 """

 import os, glob, sys, re, subprocess, argparse, tempfile, shutil
@ -38,13 +75,18 @@ class WMLError(Exception):
 class StringNode:
    """
    One part of an attribute's value. Because a single WML string
-    can be made from multiple translatable strings we need to model
-    it this way (as a list of several StringNode).
+    can be made from multiple translatable strings we model
+    it as a list of several StringNode each with its own text domain.
    """
-    def __init__(self, data):
+    def __init__(self, data : bytes):
        self.textdomain = None # non-translatable by default
        self.data = data

+    def wml(self) -> bytes:
+        if not self.data:
+            return b""
+        return self.data
+
    def debug(self):
        if self.textdomain:
            return "_<%s>'%s'" % (self.textdomain,
@ -64,11 +106,24 @@ class AttributeNode:
        self.location = location
        self.value = [] # List of StringNode

+    def wml(self) -> bytes:
+        s = self.name + b"=\""
+        for v in self.value:
+            s += v.wml().replace(b"\"", b"\"\"")
+        s += b"\""
+        return s
+    
    def debug(self):
        return self.name.decode("utf8") + "=" + " .. ".join(
            [v.debug() for v in self.value])

-    def get_text(self, translation=None):
+    def get_text(self, translation = None) -> str:
+        """
+        Returns a text representation of the node's value. The
+        translation callback, if provided, will be called on each
+        partial string with the string and its corresponding textdomain
+        and the returned translation will be used.
+        """
        r = ""
        for s in self.value:
            ustr = s.data.decode("utf8", "ignore")
@ -78,6 +133,18 @@ class AttributeNode:
                r += ustr
        return r

+    def get_binary(self):
+        """
+        Returns the unmodified binary representation of the value.
+        """
+        r = b""
+        for s in self.value:
+            r += s.data
+        return r
+
+    def get_name(self):
+        return self.name.decode("utf8")
+
 class TagNode:
    """
    A WML tag. For example the "unit" in this example:
@ -94,6 +161,20 @@ class TagNode:

        self.speedy_tags = {}

+    def wml(self) -> bytes:
+        """
+        Returns a (binary) WML representation of the entire node.
+        All attribute values are enclosed in quotes and quotes are
+        escaped (as double quotes). Note that no other escaping is
+        performed (see the BinaryWML specification for additional
+        escaping you may require).
+        """
+        s = b"[" + self.name + b"]\n"
+        for sub in self.data:
+            s += sub.wml() + b"\n"
+        s += b"[/" + self.name + b"]\n"
+        return s
+
    def debug(self):
        s = "[%s]\n" % self.name.decode("utf8")
        for sub in self.data:
@ -164,11 +245,25 @@ class TagNode:
        it to gettext.translation if you have the binary message
        catalogues loaded.
        """
-        x = self.get_all(att=name)
+        x = self.get_all(att = name)
        if not x: return default
        return x[val].get_text(translation)

+    def get_binary(self, name, default = None):
+        """
+        Returns the unmodified binary data for the first attribute
+        of the given name or the passed default value if it is not
+        found.
+        """
+        x = self.get_all(att = name)
+        if not x: return default
+        return x[0].get_binary()
+
    def append(self, node):
+        """
+        Appends a child node (must be either a TagNode or
+        AttributeNode).
+        """
        self.data.append(node)

        if isinstance(node, TagNode):
@ -176,6 +271,9 @@ class TagNode:
                self.speedy_tags[node.name] = []
            self.speedy_tags[node.name].append(node)

+    def get_name(self):
+        return self.name.decode("utf8")
+
 class RootNode(TagNode):
    """
    The root node. There is exactly one such node.
@ -191,12 +289,18 @@ class RootNode(TagNode):
        return s

 class Parser:
-    def __init__(self, wesnoth_exe, config_dir, data_dir,
-        no_preprocess):
+    def __init__(self, wesnoth_exe = None, config_dir = None,
+            data_dir = None):
        """
-        path - Path to the file to parse.
        wesnoth_exe - Wesnoth executable to use. This should have been
            configured to use the desired data and config directories.
+        config_dir - The Wesnoth configuration directory, can be
+            None to use the wesnoth default.
+        data_dir - The Wesnoth data directory, can be None to use
+            the wesnoth default.
+
+        After parsing is done the root node of the result will be
+        in the root attribute.
        """
        self.wesnoth_exe = wesnoth_exe
        self.config_dir = None
@ -205,7 +309,7 @@ class Parser:
        if data_dir: self.data_dir = os.path.abspath(data_dir)
        self.keep_temp_dir = None
        self.temp_dir = None
-        self.no_preprocess = no_preprocess
+        self.no_preprocess = (wesnoth_exe == None)
        self.preprocessed = None
        self.verbose = False

@ -214,29 +318,39 @@ class Parser:
        self.line_in_file = 42424242
        self.chunk_start = "?"

-    def parse_file(self, path, defines=""):
+    def parse_file(self, path, defines = "") -> RootNode:
+        """
+        Parse the given file found under path.
+        """
        self.path = path
        if not self.no_preprocess:
            self.preprocess(defines)
-        self.parse()
+        return self.parse()

-    def parse_text(self, text, defines=""):
-        temp = tempfile.NamedTemporaryFile(prefix="wmlparser_",
+    def parse_binary(self, binary : bytes, defines = "") -> RootNode:
+        """
+        Parse a chunk of binary WML.
+        """
+        temp = tempfile.NamedTemporaryFile(prefix = "wmlparser_",
            suffix=".cfg")
-        temp.write(text.encode("utf8"))
+        temp.write(binary)
        temp.flush()
        self.path = temp.name
        if not self.no_preprocess:
            self.preprocess(defines)
-        self.parse()
+        return self.parse()
+
+    def parse_text(self, text, defines = "") -> RootNode:
+        """
+        Parse a text string.
+        """
+        return self.parse_binary(text.encode("utf8"), defines)

    def preprocess(self, defines):
        """
-        Call wesnoth --preprocess to get preprocessed WML which we
-        can subsequently parse.
-
-        If this is not called then the .parse method will assume the
-        WML is already preprocessed.
+        This is called by the parse functions to preprocess the
+        input from a normal WML .cfg file into a preprocessed
+        .plain file.
        """
        if self.keep_temp_dir:
            output = self.keep_temp_dir
@ -436,7 +550,7 @@ class Parser:
        if segment.endswith(b"\n") and not self.skip_newlines_after_plus:
            self.temp_key_nodes = []

-    def parse(self):
+    def parse(self) -> RootNode:
        """
        Parse preprocessed WML into a tree of tags and attributes.
        """
@ -477,6 +591,8 @@ class Parser:
                print(("removing " + self.temp_dir))
            shutil.rmtree(self.temp_dir, ignore_errors=True)

+        return self.root
+
    def handle_command(self, com):
        if com.startswith(b"line "):
            self.last_wml_line = com[5:]
@ -494,73 +610,40 @@ class Parser:
    def get_text_val(self, name, default=None, translation=None):
        return self.root.get_text_val(name, default, translation)

-
-import json
-def jsonify(tree, verbose=False, depth=0):
+def jsonify(tree, verbose=False, depth=1):
    """
-Convert a DataSub into JSON
+Convert a Parser tree into JSON

-If verbose, insert a linebreak after every brace and comma (put every item on its own line), otherwise, condense everything into a single line.
+If verbose, insert a linebreak after every brace and comma (put every
+item on its own line), otherwise, condense everything into a single line.
 """
-    print("{", end=' ')
-    first = True
-    sdepth1 = "\n" + " " * depth
-    sdepth2 = sdepth1 + " "
-    for pair in tree.speedy_tags.items():
-        if first:
-            first = False
-        else:
-            sys.stdout.write(",")
-        if verbose:
-            sys.stdout.write(sdepth2)
-        print('"%s":' % pair[0], end=' ')
-        if verbose:
-            sys.stdout.write(sdepth1)
-        print('[', end=' ')
-        first_tag = True
-        for tag in pair[1]:
-            if first_tag:
-                first_tag = False
-            else:
-                sys.stdout.write(",")
-            if verbose:
-                sys.stdout.write(sdepth2)
-            jsonify(tag, verbose, depth + 2)
-        if verbose:
-            sys.stdout.write(sdepth2)
-        sys.stdout.write("]")
-    for child in tree.data:
-        if isinstance(child, TagNode):
-            continue
-        if first:
-            first = False
-        else:
-            sys.stdout.write(",")
-        if verbose:
-            sys.stdout.write(sdepth2)
-        print('"%s":' % child.name, end=' ')
-        print(json.dumps(child.get_text()), end=' ')
-    if verbose:
-        sys.stdout.write(sdepth1)
-    sys.stdout.write("}")
+    import json
+    def node_to_dict(n):
+        d = {}
+        tags = set(x.get_name() for x in n.get_all(tag = ""))
+        for tag in tags:
+            d[tag] = [node_to_dict(x) for x in n.get_all(tag = tag)]
+        for att in n.get_all(att = ""):
+            d[att.get_name()] = att.get_text()
+        return d
+
+    print(json.dumps(node_to_dict(tree), indent = depth if verbose else None))

-from xml.sax.saxutils import escape
 def xmlify(tree, verbose=False, depth=0):
-    sdepth = ""
-    if verbose:
-        sdepth = "  " * depth
-    for child in tree.data:
-        if isinstance(child, TagNode):
-            print('%s<%s>' % (sdepth, child.name))
-            xmlify(child, verbose, depth + 1)
-            print('%s</%s>' % (sdepth, child.name))
-        else:
-            if "\n" in child.get_text() or "\r" in child.get_text():
-                print(sdepth + '<' + child.name + '>' + \
-            '<![CDATA[' + child.get_text() + ']]>' + '</' + child.name + '>')
-            else:
-                print(sdepth + '<' + child.name + '>' + \
-            escape(child.get_text()) + '</' + child.name + '>')
+    import xml.etree.ElementTree as ET
+
+    def node_to_et(n):
+        et = ET.Element(n.get_name())
+        for att in n.get_all(att = ""):
+            attel = ET.Element(att.get_name())
+            attel.text = att.get_text()
+            et.append(attel)
+        for tag in n.get_all(tag = ""):
+            et.append(node_to_et(tag))
+        return et
+    
+    ET.ElementTree(node_to_et(tree.get_all()[0])).write(
+        sys.stdout, encoding = "unicode")

 if __name__ == "__main__":
    arg = argparse.ArgumentParser()
@ -573,7 +656,6 @@ if __name__ == "__main__":
    arg.add_argument("-d", "--defines", help="comma separated list of WML defines")
    arg.add_argument("-T", "--test", action="store_true")
    arg.add_argument("-j", "--to-json", action="store_true")
-    arg.add_argument("-n", "--no-preprocess", action="store_true")
    arg.add_argument("-v", "--verbose", action="store_true")
    arg.add_argument("-x", "--to-xml", action="store_true")
    args = arg.parse_args()
@ -582,15 +664,18 @@ if __name__ == "__main__":
        sys.stderr.write("No input given. Use -h for help.\n")
        sys.exit(1)

-    if not args.no_preprocess and (not args.wesnoth or not
-        os.path.exists(args.wesnoth)):
+    if (args.wesnoth and not os.path.exists(args.wesnoth)):
        sys.stderr.write("Wesnoth executable not found.\n")
        sys.exit(1)

+    if not args.wesnoth:
+        print("Warning: Without the -w option WML is not preprocessed!",
+            file = sys.stderr)
+
    if args.test:
        print("Running tests")
        p = Parser(args.wesnoth, args.config_dir,
-            args.data_dir, args.no_preprocess)
+            args.data_dir)
        if args.keep_temp:
            p.keep_temp_dir = args.keep_temp
        if args.verbose: p.verbose = True
@ -784,8 +869,7 @@ foo='bar' .. 'baz'

        sys.exit(0)

-    p = Parser(args.wesnoth, args.config_dir, args.data_dir,
-        args.no_preprocess)
+    p = Parser(args.wesnoth, args.config_dir, args.data_dir)
    if args.keep_temp:
        p.keep_temp_dir = args.keep_temp
    if args.verbose: p.verbose = True
--- a/data/tools/wmlunits
+++ b/data/tools/wmlunits
@ -78,7 +78,7 @@ def get_info(addon):
        path = options.addons + "/" + addon + "/_info.cfg"
        if os.path.exists(path):
            parser = wmlparser3.Parser(options.wesnoth, options.config_dir,
-                options.data_dir, no_preprocess = False)
+                options.data_dir)
            parser.parse_file(path)
            _info[addon] = parser
        else:
@ -627,7 +627,7 @@ if __name__ == '__main__':
    if options.language == "all":
        languages = []
        parser = wmlparser3.Parser(options.wesnoth, options.config_dir,
-            options.data_dir, no_preprocess = False)
+            options.data_dir)
        parser.parse_text("{languages}")

        for locale in parser.get_all(tag="locale"):