wmlunits: Made the Python wml parser more robust...

...when dealing with invalid WML, it should now abort with a
meaningful error message for just about any bad WML you throw at it.
Elias Pschernig 2008-05-16 16:05:51 +00:00
parent fc65c10b60
commit 5cb6bf80eb
3 changed files with 175 additions and 97 deletions
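
In practice this means front ends such as wmlunits can treat a parse failure as an ordinary, catchable error. A minimal sketch of that pattern, assuming the wmlparser.Error class used elsewhere in this diff and a parse(text) wrapper like the ParserWithCoreMacros one in the first file below; the parse_or_report helper itself is illustrative and not part of the commit:

import sys
import wmlparser

def parse_or_report(parser, wml_text):
    # parser is assumed to expose a parse(text) method, as the
    # ParserWithCoreMacros wrapper in this commit does.
    try:
        return parser.parse(wml_text)
    except wmlparser.Error, e:
        # Invalid WML now raises wmlparser.Error with a useful message
        # instead of taking the whole tool down.
        sys.stderr.write("Parse failed: %s\n" % e)
        return None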

View file

@@ -22,7 +22,6 @@ class ParserWithCoreMacros:
# Create a new parser for the macros.
parser = wmlparser.Parser(datadir)
parser.do_preprocessor_logic = True
parser.gettext = self.gettext
# Parse core macros.
@@ -33,10 +32,11 @@ class ParserWithCoreMacros:
def parse(self, text_to_parse, ignore_macros = None):
# Create the real parser.
parser = wmlparser.Parser(self.datadir, self.userdir)
parser.do_preprocessor_logic = True
parser.gettext = self.gettext
parser.macros = copy.copy(self.core_macros)
#parser.verbose = True
# Suppress complaints about undefined terrain macros
parser.set_macro_not_found_callback(lambda wmlparser, name, params:
name.startswith("TERRAIN") or name == "DISABLE_TRANSITIONS")
@@ -136,6 +136,7 @@ class ImageCollector:
sys.stderr.write("Warning: Looked at the following locations:\n")
sys.stderr.write("\n".join(bases) + "\n")
blah = 1
class WesnothList:
"""
Lists various Wesnoth stuff like units, campaigns, terrains, factions...
@@ -192,6 +193,10 @@ class WesnothList:
def add_campaign(self, campaign):
name = campaign.get_text_val("id")
if not name:
global blah
name = "noid%d" % blah
blah += 1
self.campaign_lookup[name] = campaign
return name
@@ -229,18 +234,15 @@ class WesnothList:
#define RANDOM_SIDE\n#enddef
{~campaigns}
""")
except Exception, e:
except wmlparser.Error, e:
print e
return n
for campaign in WML.find_all("campaign"):
cid = self.add_campaign(campaign)
n += 1
image_collector.add_binary_pathes_from_WML(cid, WML)
for era in WML.find_all("era"):
eid = self.add_era(era)
image_collector.add_binary_pathes_from_WML(eid, WML)
n += 1
n = self.add_units(WML, "addons")

View file

@@ -75,11 +75,9 @@ class Parser:
# is expected to return a translation.
self.gettext = None
# If set, we actually do preprocessor logic with #ifdef and so on.
# Otherwise, they get included in the parse tree as nodes.
self.do_preprocessor_logic = False
# Internal flag while inside a non-parsed block
self.just_parse = False
# A list containing the stacked up #ifdefs.
self.preprocessor_nesting = []
self.stay_in_file = False
# If set, included files are only parsed when under the given directory.
self.only_expand_pathes = []
@@ -102,7 +100,7 @@ class Parser:
except UnicodeDecodeError:
u = text.decode("latin1")
text = u
text = text.replace("\r\n", "\n").replace("\t", " ")
text = text.replace("\r\n", "\n").replace("\t", " ").replace("\r", "\n")
if text[-1] != "\n": text += "\n"
return text
@@ -127,7 +125,7 @@ class Parser:
Set the parser to parse from a file object.
"""
text = stream.read()
text = text.replace("\r\n", "\n").replace("\t", " ")
text = text.replace("\r\n", "\n").replace("\t", " ").replace("\r", "\n")
self.push_text("inline", text)
def parse_text(self, text):
@@ -188,7 +186,7 @@ class Parser:
if c == "\n":
self.line += 1
if self.textpos == len(self.text):
if len(self.texts): self.pop_text()
if len(self.texts) and not self.stay_in_file: self.pop_text()
return c
def at_end(self):
@@ -196,12 +194,17 @@ class Parser:
Return True if the parser is at the very end of the input, that is the
last character of the topmost input text has been read.
"""
return len(self.texts) == 0 and self.textpos == len(self.text)
return (len(self.texts) == 0
or self.stay_in_file) and self.textpos == len(self.text)
def check_end(self):
if self.textpos == len(self.text):
if len(self.texts): self.pop_text()
def peek_next(self):
"""Like read_next, but does not consume."""
if self.textpos >= len(self.text):
if len(self.texts):
if len(self.texts) and not self.stay_in_file:
ts = self.texts[-1]
if ts.textpos >= len(ts.text): return ""
return ts.text[ts.textpos]
@@ -214,14 +217,14 @@ class Parser:
found = mob.group(0)
self.line += found.count("\n")
self.textpos = mob.end(0)
if self.textpos == len(self.text):
if self.textpos == len(self.text) and not self.stay_in_file:
if len(self.texts): self.pop_text()
return found
else:
found = self.text[self.textpos:]
self.line += found.count("\n")
self.textpos = len(self.text)
if len(self.texts):
if len(self.texts) and not self.stay_in_file:
self.pop_text()
found += self.read_until(sep)
return found
@@ -240,18 +243,63 @@ class Parser:
def skip_whitespace_and_newlines(self):
self.read_while(" \t\r\n")
preprocessor_commands = ["#define", "#undef", "#textdomain", "#ifdef",
"#ifndef", "#else", "#enddef", "#endif"]
def read_lines_until(self, string):
"""
Read lines until one contains the given string, but throw away any
comments.
"""
text = ""
in_string = False
while 1:
if self.at_end():
return None
line = self.read_until("\n")
line_start = 0
if in_string:
string_end = line.find('"')
if string_end < 0:
text += line
continue
in_string = False
line_start = string_end + 1
elif line.lstrip().startswith("#"):
possible_comment = line.lstrip()
for com in self.preprocessor_commands:
if possible_comment.startswith(com):
break
else:
continue
quotation = line.find('"', line_start)
while quotation >= 0:
in_string = True
string_end = line.find('"', quotation + 1)
if string_end < 0: break
line_start = string_end + 1
in_string = False
quotation = line.find('"', line_start)
if not in_string:
end = line.find(string, line_start)
if end >= 0:
text += line[:end]
break
text += line
return text
def skip_whitespace_inside_statement(self):
self.read_while(" \t\r\n")
if not self.at_end():
c = self.peek_next()
if c == "#":
if self.check_for("#define"): return
if self.check_for("#undef"): return
if self.check_for("#textdomain"): return
if self.check_for("#ifdef"): return
if self.check_for("#ifndef"): return
if self.check_for("#else"): return
if self.check_for("#end"): return
for command in self.preprocessor_commands:
if self.check_for(command): return
self.read_until("\n")
self.skip_whitespace_inside_statement()
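
The new read_lines_until helper above collects a #define body up to #enddef, throwing away comment lines while tracking double quotes, so a line that merely starts with "#" inside a multi-line quoted string is not mistaken for a comment. A rough standalone sketch of the quote-tracking idea, simplified to cutting a single line at the first unquoted "#" and without the real helper's special-casing of preprocessor commands (the function name is mine):

def strip_comment(line, in_string):
    # Walk the line character by character; toggle in_string on each
    # double quote and cut the line at the first '#' seen outside a string.
    kept = []
    for c in line:
        if c == '"':
            in_string = not in_string
        elif c == "#" and not in_string:
            break
        kept.append(c)
    return "".join(kept), in_string

Carrying the returned in_string flag into the next call is what lets a quoted string span several lines, which is the same bookkeeping read_lines_until does with its own in_string variable.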
@@ -262,18 +310,6 @@ class Parser:
"""Compare the following text with str."""
return self.text[self.textpos:self.textpos + len(str)] == str
def read_upto_string(self, str):
"""Read input up to and including the given string."""
pos = self.text.find(str, self.textpos)
if pos == -1:
return None
found = self.text[self.textpos:pos]
self.textpos = pos + len(str)
self.line += found.count("\n")
if self.textpos == len(self.text):
self.pop_text()
return found
def parse_macro(self):
"""No recursive macro processing is done here. If a macro is passed as
parameter to a macro, then whoever parses the macro replacement will do
@@ -295,10 +331,6 @@ class Parser:
raise Error(self, "Unclosed macro")
return
if self.just_parse:
# We do not execute any macros or file inclusions.
return None
preserve = macro
macro = macro[:-1] # Get rid of final }
@@ -448,8 +480,8 @@ class Parser:
rep = self.gettext(self.textdomain, rep[q + 1:qe])
rep = '"' + rep + '"'
if self.verbose:
s = "Replacing {%s} with %s" % (macro.params[i], rep)
print s.encode("utf8")
s = "Replacing {%s} with %s\n" % (macro.params[i], rep)
sys.stderr.write(s.encode("utf8"))
text = text.replace("{%s}" % macro.params[i], rep)
if text:
@@ -507,6 +539,7 @@ class Parser:
elif not got_assign:
if c == "=":
variables += [variable.rstrip()]
if variables[-1] == "mode": fuck = True
got_assign = True
translatable = False
self.skip_whitespace()
@@ -567,11 +600,15 @@ class Parser:
data = []
j = 0
for i in range(len(variables)):
data += [wmldata.DataText(variables[i], values[j])]
try:
data += [wmldata.DataText(variables[i], values[j])]
except IndexError:
raise Error(self, "Assignment does not match: %s = %s" % (
str(variables), str(values)))
j += 1
return data
def parse_top(self, data, state = None, dont_parse_else = False):
def parse_top(self, data, state = None):
while 1:
self.skip_whitespace_and_newlines()
if self.at_end():
@@ -588,20 +625,21 @@ class Parser:
if name: params += [name]
if sep == "\n": break
self.read_while(" ")
text = self.read_upto_string("#enddef")
text = self.read_lines_until("#enddef")
if text == None:
raise Error(self, "#define without #enddef")
return
if not self.just_parse:
self.macros[params[0]] = self.Macro(
params[0], params[1:], text, self.textdomain)
if self.verbose:
sys.stderr.write("New macro: %s.\n" % params[0])
self.macros[params[0]] = self.Macro(
params[0], params[1:], text, self.textdomain)
if self.verbose:
sys.stderr.write("New macro: %s.\n" % params[0])
elif self.check_for("undef "):
self.read_until(" ")
name = self.read_until(" \n")
self.macros[name] = None
name = name.rstrip()
if name in self.macros: del self.macros[name]
elif self.check_for("ifdef ") or self.check_for("ifndef"):
what = "#" + self.read_until(" ").rstrip()
@@ -610,58 +648,75 @@ class Parser:
if name[-1] == " ": self.read_while(" \n")
name = name[:-1]
subdata = wmldata.DataIfDef(name, [], "then")
dont_parse_else = False
if self.do_preprocessor_logic:
dont_parse_else = True
prev = self.just_parse
if what == "#ifdef":
if not name in self.macros:
self.just_parse = True
dont_parse_else = prev
elif what == "#ifndef":
if name in self.macros:
self.just_parse = True
dont_parse_else = prev
condition_failed = False
if what == "#ifdef":
if name in self.macros:
pass
else:
condition_failed = True
else: # what == "#ifndef"
if not name in self.macros:
pass
else:
condition_failed = True
self.parse_top(subdata, what, dont_parse_else)
self.preprocessor_nesting.append((what, condition_failed))
if self.do_preprocessor_logic:
self.just_parse = prev
if self.just_parse == False:
elses = subdata.get_ifdefs("else")
if dont_parse_else:
if elses:
subdata.remove(elses[0])
data.append(subdata)
else:
if elses:
data.append(elses[0])
else:
data.insert(subdata)
# If the condition is true, we simply continue parsing. At
# some point we will either hit an #else or #endif, and
# things continue there. If the condition failed, we skip
# over everything until we find the matching #else or
# endif.
if condition_failed:
self.stay_in_file = True
balance = 1
while balance > 0 and not self.at_end():
line = self.read_until("\n")
line = line.lstrip()
if line.startswith("#ifdef"): balance += 1
if line.startswith("#ifndef"): balance += 1
if line.startswith("#endif"): balance -= 1
if line.startswith("#else"):
if balance == 1:
balance = -1
break
self.stay_in_file = False
if balance == 0:
self.preprocessor_nesting.pop()
if balance > 0:
raise Error(self, "Missing #endif for %s" % what)
self.check_end()
elif self.check_for("else"):
if not self.preprocessor_nesting:
raise Error(self, "#else without #ifdef")
self.read_until("\n")
if state != "#ifdef" and state != "#ifndef":
raise Error(self, "#else without #ifdef")
subdata = wmldata.DataIfDef("else", [], "else")
if self.do_preprocessor_logic:
self.just_parse = dont_parse_else
self.parse_top(subdata, "#else")
# We have seen an #else - that means we are at the end of a
# conditional preprocessor block which has executed. So
# we should now ignore everything up to the #endif.
balance = 1
self.stay_in_file = True
while balance > 0 and not self.at_end():
line = self.read_until("\n")
line = line.lstrip()
if line.startswith("#ifdef"): balance += 1
if line.startswith("#ifndef"): balance += 1
if line.startswith("#endif"): balance -= 1
self.stay_in_file = False
if balance != 0:
raise Error(self, "Missing #endif for #else")
data.insert(subdata)
return
self.check_end()
elif self.check_for("endif"):
if not self.preprocessor_nesting:
raise Error(self, "#endif without #ifdef")
self.preprocessor_nesting.pop()
self.read_until("\n")
if state != "#ifdef" and state != "#else" and state !=\
"#ifndef":
self.read_until("\n")
raise Error(self, "#endif without #ifdef or #else")
return
elif self.check_for("textdomain"):
self.read_until(" ")
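
The #ifdef/#ifndef handling above now skips a failed branch by counting nested conditionals instead of re-parsing it in a "just parse" mode. A standalone sketch of that balance-counting idea over a list of lines; the function name and the line-based simplification are mine, not the parser's actual interface:

def skip_failed_branch(lines, start):
    # Returns the index of the first line to resume parsing at: either just
    # past the matching #endif, or just past a top-level #else whose body
    # should now be parsed instead.
    balance = 1
    i = start
    while balance > 0 and i < len(lines):
        line = lines[i].lstrip()
        i += 1
        if line.startswith("#ifdef") or line.startswith("#ifndef"):
            balance += 1
        elif line.startswith("#endif"):
            balance -= 1
        elif line.startswith("#else") and balance == 1:
            break
    return i

The stay_in_file flag seen throughout this file supports the same skip: it keeps read_until from popping the current input text off the stack while the loop is still scanning inside it.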

View file

@@ -112,7 +112,7 @@ class HTMLOutput:
au = self.wesnoth.unit_lookup[auid]
except KeyError:
sys.stderr.write(
"Warning:Unit %s not found as advancement of %s" %
"Warning: Unit %s not found as advancement of %s\n" %
(auid, uid))
continue
forest.add_node(helpers.UnitNode(au))
@@ -321,7 +321,9 @@ class HTMLOutput:
anames = []
already = {}
for abilities in u.get_all("abilities"):
for ability in abilities.children():
try: c = abilities.children()
except AttributeError: c = []
for ability in c:
id = ability.get_text_val("id")
if id in already: continue
already[id] = True
@@ -879,7 +881,7 @@ def output(isocode):
n = stuff.add_units(WML, "mainline")
print n, "mainline units found."
# Now we read each campaign in turn to get its units.
# Now we read each mainline campaign in turn to get its units.
cnames = stuff.campaign_lookup.keys()
for cname in cnames:
print "Parsing %s units ..." % cname,
@@ -894,9 +896,28 @@ def output(isocode):
image_collector.add_binary_pathes_from_WML(cname, WML)
print n, "units found."
sys.stderr.flush()
print "Parsing addons ...",
sys.stdout.flush()
n = stuff.add_addons(image_collector)
print "%d units found." % n
# Now we read each addon campaign in turn to get its units.
cnames = stuff.campaign_lookup.keys()
for cname in cnames:
if cname in stuff.is_mainline_campaign: continue
campaign = stuff.campaign_lookup[cname]
print "Parsing %s units ..." % cname,
sys.stdout.flush()
define = campaign.get_text_val("define")
WML = stuff.parser.parse("""
#define %s\n#enddef
{~campaigns}""" % define,
ignore_macros = lambda x: x.find("/scenarios") == -1)
n = stuff.add_units(WML, cname)
image_collector.add_binary_pathes_from_WML(cname, WML)
print n, "units found."
stuff.find_unit_factions()