Optimized tokenizer a bit (skip_comment)

This commit is contained in:
Pauli Nieminen 2008-02-08 12:03:57 +00:00
parent b0a17b80f4
commit 376408d8a2
5 changed files with 155 additions and 47 deletions

View file

@ -84,6 +84,11 @@ AC_ARG_ENABLE([debug],
[debug=$enableval],
[debug=no])
AC_ARG_ENABLE([profile],
AS_HELP_STRING([--enable-profile], [enable profiling in wesnoth]),
[profile=$enableval],
[profile=no])
if test "x$debug" = "xyes"
then
CXXFLAGS="$CXXFLAGS -O0 -DDEBUG -ggdb3 -W -Wall -ansi"
@ -91,6 +96,10 @@ else
CXXFLAGS="-O2 -W -Wall -ansi $CXXFLAGS"
fi
if test "x$profile" = "xyes"
then
CXXFLAGS="$CXXFLAGS -pg"
fi
# Make tests default in svn version
svn_in_version=`expr match "$WESNOTH_VERSION" '.*svn'`
@ -363,6 +372,11 @@ AC_PROG_CC
AC_PROG_INSTALL
AM_PROG_CC_C_O
# Check for __builtin_expect
AC_TRY_LINK([int foo (int a) { a = __builtin_expect (a, 10); return a == 10 ? 0 : 1; }],
[],
[CPPFLAGS="$CPPFLAGS -DHAVE_BUILTIN_EXPECT"])
have_libx11='no'
if test "$with_x" != 'no'; then

View file

@ -478,6 +478,10 @@ if PREFSDIR
CXXFLAGS += -DPREFERENCES_DIR=\"$(prefsdir)\"
endif
##if HAVE_BUILTIN_EXPECT
## CXXFLAGS += -DHAVE_BUILTIN_EXPECT
##endif
if BOOST_TEST_DYN_LINK
CXXFLAGS += -DBOOST_TEST_DYN_LINK
endif

View file

@ -17,11 +17,13 @@
#include "global.hpp"
#include "util.hpp"
#include "serialization/tokenizer.hpp"
#include "serialization/string_utils.hpp"
#include <iostream>
#include <sstream>
#include <list>
tokenizer::tokenizer() :
current_(EOF),
@ -32,33 +34,98 @@ tokenizer::tokenizer() :
token_()
{
}
// Comment directives recognised by skip_comment().  The comment body is
// matched character-by-character against all keywords in parallel, so no
// temporary string is built for ordinary (non-directive) comments.
const size_t matching_comments = 2;
const std::string comment[] = {"textdomain","line"};

// Consume a '#' comment up to (but not including) the terminating '\n'.
//
// Two comment forms are treated as tokenizer directives:
//   #textdomain <domain>    -> stored in textdomain_
//   #line <number> <file>   -> updates lineno_ and file_
// Any other comment is skipped without further processing.
void tokenizer::skip_comment()
{
	// Indices into comment[] that still match the input read so far.
	std::list<int> matching;
	for (size_t i = 0; i < matching_comments; ++i)
		matching.push_back(i);

	size_t n = 0;  // current position within each candidate keyword
	this->next_char_fast();
	while (current_ != EOF && current_ != '\n') {
		for (std::list<int>::iterator index = matching.begin();
		     index != matching.end();)
		{
			if (comment[*index][n] != static_cast<unsigned char>(current_))
			{
				// Mismatch: this keyword can no longer match.
				index = matching.erase(index);
				continue;
			}
			if (n + 1 == comment[*index].size())
			{
				// The whole keyword matched; dispatch on which one.
				switch (*index)
				{
				case 0: // textdomain
				{
					do {
						this->next_char_fast();
					} while (current_ == ' ' || current_ == '\t');
					textdomain_ = "";
					while (current_ != EOF && current_ != '\n')
					{
						textdomain_ += current_;
						this->next_char_fast();
					}
					return;
				}
				case 1: // line
				{
					do {
						this->next_char_fast();
					} while (current_ == ' ' || current_ == '\t');
					std::string lineno;
					while (current_ != EOF && current_ != '\n')
					{
						if (current_ == ' ' || current_ == '\t')
							break;
						lineno += current_;
						this->next_char_fast();
					}
					// Malformed directive: no file name follows the number.
					if (current_ == EOF || current_ == '\n')
						return;
					do {
						this->next_char_fast();
					} while (current_ == ' ' || current_ == '\t');
					file_ = "";
					while (current_ != EOF && current_ != '\n')
					{
						file_ += current_;
						this->next_char_fast();
					}
					lineno_ = lexical_cast<size_t>(lineno);
					return;
				}
				}
			}
			++index;
		}
		++n;
		// Every keyword has been eliminated: this is a plain comment,
		// stop matching and just dump the rest of the line below.
		if (matching.empty())
			break;
		this->next_char_fast();
	}
	// Skip the remainder of the comment up to the newline.
	while (current_ != '\n' && current_ != EOF)
		this->next_char_fast();
}
const token& tokenizer::next_token()
@ -70,7 +137,7 @@ const token& tokenizer::next_token()
for(;;) {
while (is_space(current_)) {
token_.leading_spaces += current_;
next_char();
this->next_char_fast();
}
if (current_ != 254)
break;
@ -100,7 +167,7 @@ const token& tokenizer::next_token()
if(current_ == '"' && peek_char() != '"')
break;
if(current_ == '"' && peek_char() == '"')
next_char();
this->next_char_fast();
if (current_ == 254) {
skip_comment();
--lineno_;
@ -119,7 +186,7 @@ const token& tokenizer::next_token()
token_.type = token::STRING;
token_.value += current_;
while(is_alnum(peek_char())) {
next_char();
this->next_char_fast();
token_.value += current_;
}
} else {
@ -170,30 +237,32 @@ tokenizer_string::tokenizer_string(std::string& in) :
in_(in),
offset_(0)
{
next_char();
this->next_char_fast();
}
// Construct a stream-backed tokenizer and prime current_ with the first
// character.  next_char_fast() already handles both stream exhaustion
// (maps to EOF) and '\r' stripping, so no separate in_.good()/get()
// priming is needed — doing both would silently discard the first
// character of the stream.
tokenizer_stream::tokenizer_stream(std::istream& in) :
	in_(in)
{
	this->next_char_fast();
}
void tokenizer_stream::next_char()
void tokenizer_stream::next_char_fast()
{
if (current_ == '\n')
lineno_++;
do {
if(in_.good()) {
current_ = in_.get();
} else {
current_ = EOF;
if(LIKELY(in_.good())) {
current_ = in_.get();
if (UNLIKELY(current_ == '\r'))
{
// we assume that there is only one '\r'
if(LIKELY(in_.good())) {
current_ = in_.get();
} else {
current_ = EOF;
}
}
} while(current_ == '\r');
} else {
current_ = EOF;
}
}
int tokenizer_stream::peek_char() const
@ -202,19 +271,22 @@ int tokenizer_stream::peek_char() const
}
void tokenizer_string::next_char()
void tokenizer_string::next_char_fast()
{
if (current_ == '\n')
lineno_++;
do {
if(offset_ < in_.size()) {
current_ = in_[offset_++];
} else {
current_ = EOF;
if(LIKELY(offset_ < in_.size())) {
current_ = in_[offset_++];
if (UNLIKELY(current_ == '\r'))
{
if(LIKELY(offset_ < in_.size())) {
current_ = in_[offset_++];
} else {
current_ = EOF;
}
}
} while(current_ == '\r');
} else {
current_ = EOF;
}
}

View file

@ -18,6 +18,8 @@
#ifndef TOKENIZER_H_INCLUDED
#define TOKENIZER_H_INCLUDED
#include "util.hpp"
#include <istream>
#include <string>
@ -68,7 +70,14 @@ protected:
int current_;
size_t lineno_;
virtual void next_char() = 0;
// Advance to the next character while keeping the line counter up to
// date.  Newlines are rare relative to other characters, hence the
// UNLIKELY hint; the actual character fetch is delegated to the
// subclass-specific next_char_fast().
void next_char()
{
	if (UNLIKELY(current_ == '\n'))
		lineno_++;
	this->next_char_fast();
}
virtual void next_char_fast() = 0;
virtual int peek_char() const = 0;
private:
bool is_space(const int c) const;
@ -88,7 +97,7 @@ public:
tokenizer_stream(std::istream& in);
protected:
void next_char();
void next_char_fast();
int peek_char() const;
private:
@ -102,7 +111,7 @@ public:
tokenizer_string(std::string& in);
protected:
void next_char();
void next_char_fast();
int peek_char() const;
private:

View file

@ -145,6 +145,15 @@ void push_back(T& str, C c)
str[str.size()-1] = c;
}
// Branch-prediction hints: LIKELY(x) marks x as almost always true,
// UNLIKELY(x) as almost always false.  GCC's __builtin_expect takes the
// *expected* value as its second argument, so UNLIKELY must pass 0 —
// passing 1 (as the original did) hints the branch as likely and
// defeats the whole point of the macro.  !!(a) normalises pointers and
// other non-bool conditions to 0/1 as __builtin_expect expects a long.
#ifdef HAVE_BUILTIN_EXPECT
#define LIKELY(a)   __builtin_expect(!!(a), 1)
#define UNLIKELY(a) __builtin_expect(!!(a), 0)
#else
#define LIKELY(a)   (a)
#define UNLIKELY(a) (a)
#endif
#if 1
# include <SDL_types.h>
typedef Sint32 fixed_t;