Merge pull request #293 from gfgtdf/utf16_to_utf32

Add Utf16 conversion functions
2014-10-09 03:08:06 +02:00 · 2014-10-09 03:08:06 +02:00 · d246baa567
commit d246baa567
parent 4576dd0d73 4a46ef1f68
4 changed files with 243 additions and 116 deletions
--- a/src/serialization/ucs4_iterator_base.hpp
+++ b/src/serialization/ucs4_iterator_base.hpp
@ -0,0 +1,98 @@
+#ifndef UCS4_ITERATOR_BASE_HPP_INCLUDED
+#define UCS4_ITERATOR_BASE_HPP_INCLUDED
+
+#include <stdint.h>  //uint32_t
+#include <iterator>  //input_iterator_tag
+#include <utility>   //pair
+#include <cstddef>   //ptrdiff_t
+#include <cassert>   //assert
+
+namespace ucs4
+{
+	typedef uint32_t char_t;
+
+	/**
+	 * Read-only iterator that walks an encoded string one code point at a time.
+	 *
+	 * Decoding is delegated to @a update_implementation, which must provide a
+	 * static member callable as
+	 *   ucs4::char_t get_next_char(const_iterator& pos, const const_iterator& end)
+	 * It receives the position of the current code point by reference, is
+	 * expected to advance it past the units it consumed, and returns the
+	 * decoded value.
+	 *
+	 * @tparam string_type            Container type; must expose const_iterator, begin() and end().
+	 * @tparam update_implementation  Policy type supplying get_next_char().
+	 */
+	template<typename string_type, typename update_implementation>
+	class iterator_base
+	{
+	public:
+		typedef std::input_iterator_tag iterator_category;
+		typedef ucs4::char_t value_type;
+		// <cstddef> only guarantees the std:: qualified name.
+		typedef std::ptrdiff_t difference_type;
+		typedef ucs4::char_t* pointer;
+		typedef ucs4::char_t& reference;
+
+		/** Starts at the beginning of @a str and decodes its first code point, if any. */
+		iterator_base(const string_type& str)
+			: current_char(0)
+			, string_end(str.end())
+			, current_substr(std::make_pair(str.begin(), str.begin()))
+		{
+			update();
+		}
+
+		/** Starts at @a begin; @a end marks where the underlying string stops. */
+		iterator_base(typename string_type::const_iterator const &begin, typename string_type::const_iterator const &end)
+			: current_char(0)
+			, string_end(end)
+			, current_substr(std::make_pair(begin, begin))
+		{
+			update();
+		}
+
+		/** Returns an iterator positioned on the first code point of @a str. */
+		static iterator_base begin(const string_type& str)
+		{
+			return iterator_base(str.begin(), str.end());
+		}
+
+		/** Returns the past-the-end iterator for @a str. */
+		static iterator_base end(const string_type& str)
+		{
+			return iterator_base(str.end(), str.end());
+		}
+
+		/** Two iterators are equal when their current code points start at the same position. */
+		bool operator==(const iterator_base& a) const
+		{
+			return current_substr.first == a.current_substr.first;
+		}
+
+		bool operator!=(const iterator_base& a) const
+		{
+			return !(*this == a);
+		}
+
+		/** Moves to the next code point and decodes it. */
+		iterator_base& operator++()
+		{
+			current_substr.first = current_substr.second;
+			update();
+			return *this;
+		}
+
+		/**
+		 * Postfix increment, needed to honour the advertised input iterator
+		 * category. Returns a copy taken before advancing.
+		 */
+		iterator_base operator++(int)
+		{
+			iterator_base previous(*this);
+			++*this;
+			return previous;
+		}
+
+		/** Returns the code point decoded at the current position. */
+		ucs4::char_t operator*() const
+		{
+			return current_char;
+		}
+
+		/** True when the current code point is the last one before the end of the string. */
+		bool next_is_end() const
+		{
+			return current_substr.second == string_end;
+		}
+
+		/** Returns the iterator pair delimiting the encoded units of the current code point. */
+		const std::pair<typename string_type::const_iterator, typename string_type::const_iterator>& substr() const
+		{
+			return current_substr;
+		}
+	private:
+		/**
+		 * Decodes the code point starting at current_substr.first.
+		 * get_next_char() moves current_substr.second past the consumed units.
+		 */
+		void update()
+		{
+			// Both ends must coincide before decoding starts.
+			assert(current_substr.first == current_substr.second);
+			// Nothing to decode once the end of the string has been reached.
+			if(current_substr.first == string_end)
+				return;
+			current_char = update_implementation::get_next_char(current_substr.second, string_end);
+		}
+
+		ucs4::char_t current_char;
+		typename string_type::const_iterator string_end;
+		std::pair<typename string_type::const_iterator, typename string_type::const_iterator> current_substr;
+	};
+
+}
+
+#endif
--- a/src/serialization/unicode.cpp
+++ b/src/serialization/unicode.cpp
@ -54,31 +54,32 @@ size_t byte_size_from_ucs4_codepoint(ucs4::char_t ch)
 } // anonymous namespace

 namespace implementation {
+
+/**
+ * Appends the UTF-8 encoding of @a ch to @a out.
+ *
+ * @param out  String that receives the encoded bytes.
+ * @param ch   Code point to encode.
+ */
+inline void push_ucs4char_to_string(std::string& out, ucs4::char_t ch)
+{
+	const size_t count = byte_size_from_ucs4_codepoint(ch);
+
+	// Single-byte code points are stored verbatim.
+	if(count == 1) {
+		out.push_back(static_cast<char>(ch));
+		return;
+	}
+
+	// Emit six payload bits per byte, most significant group first.
+	const int lead = static_cast<int>(count) - 1;
+	for(int j = lead; j >= 0; --j) {
+		unsigned char c = ((ch >> (6 * j)) & 0x3f) | 0x80;
+		if(j == lead) {
+			// The leading byte additionally carries the length marker bits.
+			c |= 0xff << (8 - count);
+		}
+		out.push_back(c);
+	}
+}
+
 std::string ucs4string_to_string(const ucs4::string &src)
 {
 	std::string ret;

 	try {
 		for(ucs4::string::const_iterator i = src.begin(); i != src.end(); ++i) {
-			unsigned int count;
-			ucs4::char_t ch = *i;
-
-			// Determine the bytes required
-			count = byte_size_from_ucs4_codepoint(ch);
-
-			if(count == 1) {
-				ret.push_back(static_cast<char>(ch));
-			} else {
-				for(int j = static_cast<int>(count) - 1; j >= 0; --j) {
-					unsigned char c = (ch >> (6 * j)) & 0x3f;
-					c |= 0x80;
-					if(j == static_cast<int>(count) - 1) {
-						c |= 0xff << (8 - count);
-					}
-					ret.push_back(c);
-				}
-			}
-
+			push_ucs4char_to_string(ret, *i);
 		}

 		return ret;
@ -118,6 +119,49 @@ ucs4::string string_to_ucs4string(const std::string &src)
 	return res;
 }

+/**
+ * Decodes a UTF-16 string into a sequence of UCS-4 code points.
+ *
+ * If the input is malformed an error is logged and the code points decoded
+ * up to that point are returned.
+ */
+ucs4::string utf16string_to_ucs4string(const utf16::string & src)
+{
+	ucs4::string decoded;
+
+	try {
+		const utf16::iterator stop(utf16::iterator::end(src));
+
+		// Manual copy loop; the original notes that the equivalent
+		// range insert does not work on VC++6.
+		for(utf16::iterator pos(src); pos != stop; ++pos) {
+			decoded.push_back(*pos);
+		}
+	}
+	catch(utf8::invalid_utf8_exception&) {
+		ERR_GENERAL << "Invalid UTF-16 string" << std::endl;
+	}
+
+	return decoded;
+}
+
+/**
+ * Converts a UTF-16 string to a byte string, encoding each decoded code
+ * point with push_ucs4char_to_string().
+ *
+ * If the input is malformed an error is logged and the bytes produced up to
+ * that point are returned.
+ */
+std::string utf16string_to_string(const utf16::string & src)
+{
+	std::string encoded;
+
+	try {
+		const utf16::iterator stop(utf16::iterator::end(src));
+
+		for(utf16::iterator pos(src); pos != stop; ++pos) {
+			push_ucs4char_to_string(encoded, *pos);
+		}
+	}
+	catch(utf8::invalid_utf8_exception&) {
+		ERR_GENERAL << "Invalid UTF-16 string" << std::endl;
+	}
+
+	return encoded;
+}
+
 utf16::string ucs4string_to_utf16string(const ucs4::string &src)
 {
 	utf16::string res;
@ -142,6 +186,45 @@ utf16::string ucs4string_to_utf16string(const ucs4::string &src)

 } // implementation namespace

+namespace utf16 {
+/**
+ * Decodes one code point from a UTF-16 sequence.
+ *
+ * @param start  Position of the first unit to read; advanced past every unit consumed.
+ * @param end    End of the underlying string.
+ * @return The decoded code point.
+ * @throws utf8::invalid_utf8_exception on a trail surrogate without a lead,
+ *         or a lead surrogate that is not followed by a trail surrogate.
+ */
+ucs4::char_t iterator_implementation::get_next_char(utf16::string::const_iterator& start, const utf16::string::const_iterator& end)
+{
+	const int32_t surrogate_mask = 0xFC00;
+	const int32_t lead_tag = 0xD800;
+	const int32_t trail_tag = 0xDC00;
+	const int32_t payload_mask = 0x3FF;
+
+	assert(start != end);
+	uint32_t code_point = static_cast<uint16_t>(*start);
+	++start;
+
+	const uint32_t unit_kind = code_point & surrogate_mask;
+	if(unit_kind == trail_tag) {
+		// A trail surrogate must not appear without a preceding lead.
+		throw utf8::invalid_utf8_exception();
+	}
+	if(unit_kind != lead_tag) {
+		// Not a surrogate: the unit is the code point itself.
+		return code_point;
+	}
+
+	if(start == end) {
+		// The string ends in the middle of a UTF-16 sequence.
+		throw utf8::invalid_utf8_exception();
+	}
+	if((*start & surrogate_mask) != trail_tag) {
+		throw utf8::invalid_utf8_exception();
+	}
+
+	// Join the ten payload bits of each surrogate and add the 0x10000 offset.
+	code_point = ((code_point & payload_mask) << 10) + (*start & payload_mask) + 0x10000;
+	++start;
+	return code_point;
+}
+}
+
 namespace utf8 {

 static int byte_size_from_utf8_first(const unsigned char ch)
@ -159,72 +242,12 @@ static int byte_size_from_utf8_first(const unsigned char ch)
 	return count;
 }

-iterator::iterator(const std::string& str) :
-	current_char(0),
-	string_end(str.end()),
-	current_substr(std::make_pair(str.begin(), str.begin()))
+ucs4::char_t iterator_implementation::get_next_char(std::string::const_iterator& start, const std::string::const_iterator& string_end)
 {
-	update();
-}
+	assert(start != string_end);
+	size_t size = byte_size_from_utf8_first(*start);

-iterator::iterator(std::string::const_iterator const &beg,
-		std::string::const_iterator const &end) :
-	current_char(0),
-	string_end(end),
-	current_substr(std::make_pair(beg, beg))
-{
-	update();
-}
-
-iterator iterator::begin(std::string const &str)
-{
-	return iterator(str.begin(), str.end());
-}
-
-iterator iterator::end(const std::string& str)
-{
-	return iterator(str.end(), str.end());
-}
-
-bool iterator::operator==(const utf8::iterator& a) const
-{
-	return current_substr.first == a.current_substr.first;
-}
-
-iterator& iterator::operator++()
-{
-	current_substr.first = current_substr.second;
-	update();
-	return *this;
-}
-
-ucs4::char_t iterator::operator*() const
-{
-	return current_char;
-}
-
-bool iterator::next_is_end()
-{
-	if(current_substr.second == string_end)
-		return true;
-	return false;
-}
-
-const std::pair<std::string::const_iterator, std::string::const_iterator>& iterator::substr() const
-{
-	return current_substr;
-}
-
-void iterator::update()
-{
-	// Do not try to update the current unicode char at end-of-string.
-	if(current_substr.first == string_end)
-		return;
-
-	size_t size = byte_size_from_utf8_first(*current_substr.first);
-	current_substr.second = current_substr.first + size;
-
-	current_char = static_cast<unsigned char>(*current_substr.first);
+	uint32_t current_char = static_cast<unsigned char>(*start);

 	// Convert the first character
 	if(size != 1) {
@ -232,25 +255,27 @@ void iterator::update()
 	}

 	// Convert the continuation bytes
-	for(std::string::const_iterator c = current_substr.first+1;
-			c != current_substr.second; ++c) {
+	// i == number of '++start'
+	++start;
+	for(size_t i = 1; i < size; ++i, ++start) {
 		// If the string ends occurs within an UTF8-sequence, this is bad.
-		if (c == string_end)
+		if (start == string_end)
 			throw invalid_utf8_exception();

-		if ((*c & 0xC0) != 0x80)
+		if ((*start & 0xC0) != 0x80)
 			throw invalid_utf8_exception();

-		current_char = (current_char << 6) | (static_cast<unsigned char>(*c) & 0x3F);
+		current_char = (current_char << 6) | (static_cast<unsigned char>(*start) & 0x3F);
 	}
+	//i == size => start was increased size times.

 	// Check for non-shortest-form encoding
 	// This has been forbidden in Unicode 3.1 for security reasons
 	if (size > ::byte_size_from_ucs4_codepoint(current_char))
 		throw invalid_utf8_exception();
+	return current_char;
 }

-
 utf8::string lowercase(const utf8::string& s)
 {
 	if(!s.empty()) {
--- a/src/serialization/unicode.hpp
+++ b/src/serialization/unicode.hpp
@ -16,6 +16,7 @@
 #ifndef SERIALIZATION_UNICODE_HPP_INCLUDED
 #define SERIALIZATION_UNICODE_HPP_INCLUDED

+#include "ucs4_iterator_base.hpp"
 #include <boost/static_assert.hpp>
 #include <string>
 #include <vector>
@ -35,6 +36,13 @@ namespace utf8 {
 */
 namespace utf16 {
 	typedef std::vector<wchar_t> string;
+	
+	/** Decoding policy used to instantiate ucs4::iterator_base for UTF-16 strings. */
+	struct iterator_implementation
+	{
+		/**
+		 * Decodes the code point beginning at @a start and advances @a start
+		 * past the units it consumed.
+		 * Throws utf8::invalid_utf8_exception on malformed surrogate sequences.
+		 */
+		static ucs4::char_t get_next_char(utf16::string::const_iterator& start, const utf16::string::const_iterator& end);
+	};
+
+	/** Iterator yielding the UCS-4 code points of a utf16::string. */
+	typedef ucs4::iterator_base<utf16::string, iterator_implementation> iterator;
 }

 namespace utf8 {
@ -43,37 +51,15 @@ namespace utf8 {
 	* Functions for converting Unicode wide-char strings to UTF-8 encoded strings,
 	* back and forth.
 	*/
+	/** also used for invalid utf16 or ucs4 strings */
 	class invalid_utf8_exception : public std::exception {};
-
-	class iterator
+	struct iterator_implementation
 	{
-	public:
-		typedef std::input_iterator_tag iterator_category;
-		typedef ucs4::char_t value_type;
-		typedef ptrdiff_t difference_type;
-		typedef ucs4::char_t* pointer;
-		typedef ucs4::char_t& reference;
-
-		iterator(const std::string& str);
-		iterator(std::string::const_iterator const &begin, std::string::const_iterator const &end);
-
-		static iterator begin(const std::string& str);
-		static iterator end(const std::string& str);
-
-		bool operator==(const utf8::iterator& a) const;
-		bool operator!=(const utf8::iterator& a) const { return ! (*this == a); }
-		iterator& operator++();
-		ucs4::char_t operator*() const;
-		bool next_is_end();
-		const std::pair<std::string::const_iterator, std::string::const_iterator>& substr() const;
-	private:
-		void update();
-
-		ucs4::char_t current_char;
-		std::string::const_iterator string_end;
-		std::pair<std::string::const_iterator, std::string::const_iterator> current_substr;
+		static ucs4::char_t get_next_char(std::string::const_iterator& start, const std::string::const_iterator& end);
 	};

+	typedef ucs4::iterator_base<std::string, iterator_implementation> iterator;
+
 	/** Returns a lowercased version of the string. */
 	utf8::string lowercase(const utf8::string&);

@ -121,6 +107,8 @@ namespace implementation {
 	std::string ucs4string_to_string(const ucs4::string &);
 	ucs4::string string_to_ucs4string(const std::string &);
 	std::string ucs4char_to_string(const ucs4::char_t);
+	ucs4::string utf16string_to_ucs4string(const utf16::string &);
+	std::string utf16string_to_string(const utf16::string &);
 	utf16::string ucs4string_to_utf16string(const ucs4::string &);
 } // end namespace implementation

@ -156,4 +144,14 @@ utf16::string unicode_cast<utf16::string, utf8::string>(const utf8::string &in)
 	return unicode_cast<utf16::string>(u4str);
 }

+/** UTF-16 to UCS-4 conversion; forwards to implementation::utf16string_to_ucs4string(). */
+template <> inline
+ucs4::string unicode_cast<ucs4::string, utf16::string>(const utf16::string &in) {
+	return implementation::utf16string_to_ucs4string(in);
+}
+
+/** UTF-16 to std::string conversion; forwards to implementation::utf16string_to_string(). */
+template <> inline
+std::string unicode_cast<std::string, utf16::string>(const utf16::string &in) {
+	return implementation::utf16string_to_string(in);
+}
+
 #endif
--- a/src/tests/test_serialization.cpp
+++ b/src/tests/test_serialization.cpp
@ -54,6 +54,9 @@ BOOST_AUTO_TEST_CASE( utils_unicode_test )

 	BOOST_CHECK( apple_u4.size() == 5 );
 	BOOST_CHECK_EQUAL( apple_u8, unicode_cast<utf8::string>(apple_u4) );
+	BOOST_CHECK_EQUAL( apple_u8, unicode_cast<utf8::string>(apple_u16) );
+	BOOST_CHECK( apple_u4 == unicode_cast<ucs4::string>(apple_u16) );
+	BOOST_CHECK( apple_u16 == unicode_cast<utf16::string>(apple_u4) );
 	BOOST_CHECK_EQUAL( apple_u8.size(), apple_u16.size() );

 	ucs4::string water_u4;
@ -73,6 +76,9 @@ BOOST_AUTO_TEST_CASE( utils_unicode_test )
 	BOOST_CHECK_EQUAL(nonbmp_u16[0], 0xD800);
 	BOOST_CHECK_EQUAL(nonbmp_u16[1], 0xDC00);
 	BOOST_CHECK_EQUAL(nonbmp_u8, unicode_cast<utf8::string>(nonbmp_u4));
+	BOOST_CHECK_EQUAL(nonbmp_u8, unicode_cast<utf8::string>(nonbmp_u16));
+	BOOST_CHECK(nonbmp_u16 == unicode_cast<utf16::string>(nonbmp_u4));
+	BOOST_CHECK(nonbmp_u4 == unicode_cast<ucs4::string>(nonbmp_u16));
 }

 BOOST_AUTO_TEST_CASE( test_lowercase )