Merge pull request #293 from gfgtdf/utf16_to_utf32
Add Utf16 conversion functions
This commit is contained in:
commit
d246baa567
4 changed files with 243 additions and 116 deletions
98
src/serialization/ucs4_iterator_base.hpp
Normal file
98
src/serialization/ucs4_iterator_base.hpp
Normal file
|
@ -0,0 +1,98 @@
|
|||
#ifndef UCS4_ITERATOR_BASE_HPP_INCLUDED
|
||||
#define UCS4_ITERATOR_BASE_HPP_INCLUDED
|
||||
|
||||
#include <stdint.h> //uint32_t
|
||||
#include <iterator> //input_iterator_tag
|
||||
#include <utility> //pair
|
||||
#include <cstddef> //ptrdiff_t
|
||||
#include <cassert> //assert
|
||||
|
||||
namespace ucs4
{
	/** A single Unicode code point, as produced by the iterators below. */
	typedef uint32_t char_t;

	/**
	 * Input iterator that yields one ucs4::char_t per step from an encoded string.
	 *
	 * Decoding is delegated to @a update_implementation, which must provide
	 *   static ucs4::char_t get_next_char(const_iterator& cursor, const const_iterator& end);
	 * The function decodes the character that starts at @a cursor and advances
	 * @a cursor past it. Anything it throws propagates out of the constructors
	 * and the increment operators.
	 *
	 * Only iterators into the source string are stored, never a copy of it.
	 *
	 * @tparam string_type            Container of code units; must expose
	 *                                const_iterator, begin() and end().
	 * @tparam update_implementation  Decoding policy described above.
	 */
	template<typename string_type, typename update_implementation>
	class iterator_base
	{
	public:
		typedef std::input_iterator_tag iterator_category;
		typedef ucs4::char_t value_type;
		// <cstddef> only guarantees the std:: qualified name.
		typedef std::ptrdiff_t difference_type;
		typedef ucs4::char_t* pointer;
		typedef ucs4::char_t& reference;

		/** Positions the iterator on the first character of @a str and decodes it. */
		iterator_base(const string_type& str)
			: current_char(0)
			, string_end(str.end())
			, current_substr(std::make_pair(str.begin(), str.begin()))
		{
			update();
		}

		/**
		 * Positions the iterator on the character starting at @a begin and decodes it.
		 *
		 * @param begin  Start of the first character to decode.
		 * @param end    End of the underlying string; decoding never reads past it.
		 */
		iterator_base(typename string_type::const_iterator const &begin, typename string_type::const_iterator const &end)
			: current_char(0)
			, string_end(end)
			, current_substr(std::make_pair(begin, begin))
		{
			update();
		}

		/** Returns an iterator on the first character of @a str. */
		static iterator_base begin(const string_type& str)
		{
			return iterator_base(str.begin(), str.end());
		}

		/** Returns the past-the-end iterator of @a str. */
		static iterator_base end(const string_type& str)
		{
			return iterator_base(str.end(), str.end());
		}

		/** Two iterators are equal when their current characters start at the same position. */
		bool operator==(const iterator_base& a) const
		{
			return current_substr.first == a.current_substr.first;
		}

		bool operator!=(const iterator_base& a) const
		{
			return ! (*this == a);
		}

		/** Moves to the next character and decodes it. */
		iterator_base& operator++()
		{
			current_substr.first = current_substr.second;
			update();
			return *this;
		}

		/**
		 * Post-increment, required of an input iterator.
		 *
		 * @return A copy of the iterator as it was before advancing.
		 */
		iterator_base operator++(int)
		{
			iterator_base previous(*this);
			++*this;
			return previous;
		}

		/** Returns the code point decoded at the current position. */
		ucs4::char_t operator*() const
		{
			return current_char;
		}

		/** True when the code units of the current character reach the end of the string. */
		bool next_is_end() const
		{
			return current_substr.second == string_end;
		}

		/** Returns the range [first, second) of code units forming the current character. */
		const std::pair<typename string_type::const_iterator, typename string_type::const_iterator>& substr() const
		{
			return current_substr;
		}

	private:
		/** Decodes the character at current_substr.first, unless the end has been reached. */
		void update()
		{
			// Decoding always starts from an empty range; get_next_char extends its end.
			assert(current_substr.first == current_substr.second);
			if(current_substr.first == string_end)
				return;
			current_char = update_implementation::get_next_char(current_substr.second, string_end);
		}

		ucs4::char_t current_char;
		typename string_type::const_iterator string_end;
		std::pair<typename string_type::const_iterator, typename string_type::const_iterator> current_substr;
	};

}
|
||||
|
||||
#endif
|
|
@ -54,31 +54,32 @@ size_t byte_size_from_ucs4_codepoint(ucs4::char_t ch)
|
|||
} // anonymous namespace
|
||||
|
||||
namespace implementation {
|
||||
|
||||
inline void push_ucs4char_to_string(std::string& out, ucs4::char_t ch)
|
||||
{
|
||||
size_t count = byte_size_from_ucs4_codepoint(ch);
|
||||
|
||||
if(count == 1) {
|
||||
out.push_back(static_cast<char>(ch));
|
||||
} else {
|
||||
for(int j = static_cast<int>(count) - 1; j >= 0; --j) {
|
||||
unsigned char c = (ch >> (6 * j)) & 0x3f;
|
||||
c |= 0x80;
|
||||
if(j == static_cast<int>(count) - 1) {
|
||||
c |= 0xff << (8 - count);
|
||||
}
|
||||
out.push_back(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string ucs4string_to_string(const ucs4::string &src)
|
||||
{
|
||||
std::string ret;
|
||||
|
||||
try {
|
||||
for(ucs4::string::const_iterator i = src.begin(); i != src.end(); ++i) {
|
||||
unsigned int count;
|
||||
ucs4::char_t ch = *i;
|
||||
|
||||
// Determine the bytes required
|
||||
count = byte_size_from_ucs4_codepoint(ch);
|
||||
|
||||
if(count == 1) {
|
||||
ret.push_back(static_cast<char>(ch));
|
||||
} else {
|
||||
for(int j = static_cast<int>(count) - 1; j >= 0; --j) {
|
||||
unsigned char c = (ch >> (6 * j)) & 0x3f;
|
||||
c |= 0x80;
|
||||
if(j == static_cast<int>(count) - 1) {
|
||||
c |= 0xff << (8 - count);
|
||||
}
|
||||
ret.push_back(c);
|
||||
}
|
||||
}
|
||||
|
||||
push_ucs4char_to_string(ret, *i);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@ -118,6 +119,49 @@ ucs4::string string_to_ucs4string(const std::string &src)
|
|||
return res;
|
||||
}
|
||||
|
||||
/**
 * Decodes a UTF-16 string into a sequence of UCS-4 code points.
 *
 * If the input is malformed, an error is logged and the code points decoded
 * up to that point are returned.
 */
ucs4::string utf16string_to_ucs4string(const utf16::string & src)
{
	ucs4::string decoded;

	try {
		utf16::iterator cursor(src);
		const utf16::iterator finish(utf16::iterator::end(src));

		// Copy element by element: res.insert(res.end(), i1, i2) doesn't work on VC++6.
		for(; cursor != finish; ++cursor) {
			decoded.push_back(*cursor);
		}
	}
	catch(utf8::invalid_utf8_exception&) {
		ERR_GENERAL << "Invalid UTF-16 string" << std::endl;
	}

	return decoded;
}
|
||||
|
||||
/**
 * Converts a UTF-16 string directly into a UTF-8 encoded std::string.
 *
 * If the input is malformed, an error is logged and the text converted up to
 * that point is returned.
 */
std::string utf16string_to_string(const utf16::string & src)
{
	std::string encoded;

	try {
		utf16::iterator cursor(src);
		const utf16::iterator finish(utf16::iterator::end(src));

		for(; cursor != finish; ++cursor) {
			push_ucs4char_to_string(encoded, *cursor);
		}
	}
	catch(utf8::invalid_utf8_exception&) {
		ERR_GENERAL << "Invalid UTF-16 string" << std::endl;
	}

	return encoded;
}
|
||||
|
||||
utf16::string ucs4string_to_utf16string(const ucs4::string &src)
|
||||
{
|
||||
utf16::string res;
|
||||
|
@ -142,6 +186,45 @@ utf16::string ucs4string_to_utf16string(const ucs4::string &src)
|
|||
|
||||
} // implementation namespace
|
||||
|
||||
namespace utf16 {
|
||||
/**
 * Decodes the UTF-16 character beginning at @a start.
 *
 * @param start  Position of the first code unit; advanced as code units are consumed.
 * @param end    End of the string; must differ from @a start on entry.
 * @return The decoded code point.
 * @throws utf8::invalid_utf8_exception  on a trail surrogate without a lead,
 *         or a lead surrogate that is not followed by a trail surrogate.
 */
ucs4::char_t iterator_implementation::get_next_char(utf16::string::const_iterator& start, const utf16::string::const_iterator& end)
{
	const int32_t payload_mask = 0x3FF;    // low ten bits of a surrogate
	const int32_t kind_mask = 0xFC00;      // high six bits identify the surrogate kind
	const int32_t lead_kind = 0xD800;
	const int32_t trail_kind = 0xDC00;

	assert(start != end);
	const uint32_t first_unit = static_cast<uint16_t>(*start);
	++start;
	const uint32_t kind = first_unit & kind_mask;

	if(kind == trail_kind) {
		//found trail without head
		throw utf8::invalid_utf8_exception();
	}

	if(kind != lead_kind) {
		// Not a surrogate: the code unit is the code point.
		return first_unit;
	}

	if(start == end) {
		// The string ends in the middle of a surrogate pair.
		throw utf8::invalid_utf8_exception();
	}

	if((*start & kind_mask) != trail_kind) {
		throw utf8::invalid_utf8_exception();
	}

	// Combine the ten payload bits of each half and add the supplementary-plane offset.
	uint32_t code_point = first_unit & payload_mask;
	code_point <<= 10;
	code_point += (*start & payload_mask);
	code_point += 0x10000;
	++start;

	return code_point;
}
|
||||
}
|
||||
|
||||
namespace utf8 {
|
||||
|
||||
static int byte_size_from_utf8_first(const unsigned char ch)
|
||||
|
@ -159,72 +242,12 @@ static int byte_size_from_utf8_first(const unsigned char ch)
|
|||
return count;
|
||||
}
|
||||
|
||||
iterator::iterator(const std::string& str) :
|
||||
current_char(0),
|
||||
string_end(str.end()),
|
||||
current_substr(std::make_pair(str.begin(), str.begin()))
|
||||
ucs4::char_t iterator_implementation::get_next_char(std::string::const_iterator& start, const std::string::const_iterator& string_end)
|
||||
{
|
||||
update();
|
||||
}
|
||||
assert(start != string_end);
|
||||
size_t size = byte_size_from_utf8_first(*start);
|
||||
|
||||
iterator::iterator(std::string::const_iterator const &beg,
|
||||
std::string::const_iterator const &end) :
|
||||
current_char(0),
|
||||
string_end(end),
|
||||
current_substr(std::make_pair(beg, beg))
|
||||
{
|
||||
update();
|
||||
}
|
||||
|
||||
iterator iterator::begin(std::string const &str)
|
||||
{
|
||||
return iterator(str.begin(), str.end());
|
||||
}
|
||||
|
||||
iterator iterator::end(const std::string& str)
|
||||
{
|
||||
return iterator(str.end(), str.end());
|
||||
}
|
||||
|
||||
bool iterator::operator==(const utf8::iterator& a) const
|
||||
{
|
||||
return current_substr.first == a.current_substr.first;
|
||||
}
|
||||
|
||||
iterator& iterator::operator++()
|
||||
{
|
||||
current_substr.first = current_substr.second;
|
||||
update();
|
||||
return *this;
|
||||
}
|
||||
|
||||
ucs4::char_t iterator::operator*() const
|
||||
{
|
||||
return current_char;
|
||||
}
|
||||
|
||||
bool iterator::next_is_end()
|
||||
{
|
||||
if(current_substr.second == string_end)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
const std::pair<std::string::const_iterator, std::string::const_iterator>& iterator::substr() const
|
||||
{
|
||||
return current_substr;
|
||||
}
|
||||
|
||||
void iterator::update()
|
||||
{
|
||||
// Do not try to update the current unicode char at end-of-string.
|
||||
if(current_substr.first == string_end)
|
||||
return;
|
||||
|
||||
size_t size = byte_size_from_utf8_first(*current_substr.first);
|
||||
current_substr.second = current_substr.first + size;
|
||||
|
||||
current_char = static_cast<unsigned char>(*current_substr.first);
|
||||
uint32_t current_char = static_cast<unsigned char>(*start);
|
||||
|
||||
// Convert the first character
|
||||
if(size != 1) {
|
||||
|
@ -232,25 +255,27 @@ void iterator::update()
|
|||
}
|
||||
|
||||
// Convert the continuation bytes
|
||||
for(std::string::const_iterator c = current_substr.first+1;
|
||||
c != current_substr.second; ++c) {
|
||||
// i == number of '++start'
|
||||
++start;
|
||||
for(size_t i = 1; i < size; ++i, ++start) {
|
||||
// If the string ends within a UTF-8 sequence, this is bad.
|
||||
if (c == string_end)
|
||||
if (start == string_end)
|
||||
throw invalid_utf8_exception();
|
||||
|
||||
if ((*c & 0xC0) != 0x80)
|
||||
if ((*start & 0xC0) != 0x80)
|
||||
throw invalid_utf8_exception();
|
||||
|
||||
current_char = (current_char << 6) | (static_cast<unsigned char>(*c) & 0x3F);
|
||||
current_char = (current_char << 6) | (static_cast<unsigned char>(*start) & 0x3F);
|
||||
}
|
||||
//i == size => start was increased size times.
|
||||
|
||||
// Check for non-shortest-form encoding
|
||||
// This has been forbidden in Unicode 3.1 for security reasons
|
||||
if (size > ::byte_size_from_ucs4_codepoint(current_char))
|
||||
throw invalid_utf8_exception();
|
||||
return current_char;
|
||||
}
|
||||
|
||||
|
||||
utf8::string lowercase(const utf8::string& s)
|
||||
{
|
||||
if(!s.empty()) {
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#ifndef SERIALIZATION_UNICODE_HPP_INCLUDED
|
||||
#define SERIALIZATION_UNICODE_HPP_INCLUDED
|
||||
|
||||
#include "ucs4_iterator_base.hpp"
|
||||
#include <boost/static_assert.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
@ -35,6 +36,13 @@ namespace utf8 {
|
|||
*/
|
||||
/** UTF-16 string type and the iterator that decodes it into code points. */
namespace utf16 {
|
||||
/** Sequence of UTF-16 code units, stored as wchar_t elements. */
typedef std::vector<wchar_t> string;
|
||||
|
||||
/** Decoding policy plugged into ucs4::iterator_base to form utf16::iterator. */
struct iterator_implementation
|
||||
{
|
||||
/**
 * Decodes the character starting at @a start and advances @a start past the
 * code units it consumed; @a end marks the end of the string.
 * Throws utf8::invalid_utf8_exception on an unpaired surrogate.
 */
static ucs4::char_t get_next_char(utf16::string::const_iterator& start, const utf16::string::const_iterator& end);
|
||||
};
|
||||
|
||||
/** Input iterator yielding ucs4::char_t values from a utf16::string. */
typedef ucs4::iterator_base<utf16::string, iterator_implementation> iterator;
|
||||
}
|
||||
|
||||
namespace utf8 {
|
||||
|
@ -43,37 +51,15 @@ namespace utf8 {
|
|||
* Functions for converting Unicode wide-char strings to UTF-8 encoded strings,
|
||||
* back and forth.
|
||||
*/
|
||||
/**
 * Thrown by the decoding functions when the input is malformed.
 * Despite the name, it is also used for invalid utf16 or ucs4 strings.
 */
|
||||
class invalid_utf8_exception : public std::exception {};
|
||||
|
||||
class iterator
|
||||
struct iterator_implementation
|
||||
{
|
||||
public:
|
||||
typedef std::input_iterator_tag iterator_category;
|
||||
typedef ucs4::char_t value_type;
|
||||
typedef ptrdiff_t difference_type;
|
||||
typedef ucs4::char_t* pointer;
|
||||
typedef ucs4::char_t& reference;
|
||||
|
||||
iterator(const std::string& str);
|
||||
iterator(std::string::const_iterator const &begin, std::string::const_iterator const &end);
|
||||
|
||||
static iterator begin(const std::string& str);
|
||||
static iterator end(const std::string& str);
|
||||
|
||||
bool operator==(const utf8::iterator& a) const;
|
||||
bool operator!=(const utf8::iterator& a) const { return ! (*this == a); }
|
||||
iterator& operator++();
|
||||
ucs4::char_t operator*() const;
|
||||
bool next_is_end();
|
||||
const std::pair<std::string::const_iterator, std::string::const_iterator>& substr() const;
|
||||
private:
|
||||
void update();
|
||||
|
||||
ucs4::char_t current_char;
|
||||
std::string::const_iterator string_end;
|
||||
std::pair<std::string::const_iterator, std::string::const_iterator> current_substr;
|
||||
static ucs4::char_t get_next_char(std::string::const_iterator& start, const std::string::const_iterator& end);
|
||||
};
|
||||
|
||||
typedef ucs4::iterator_base<std::string, iterator_implementation> iterator;
|
||||
|
||||
/** Returns a lowercased version of the string. */
|
||||
utf8::string lowercase(const utf8::string&);
|
||||
|
||||
|
@ -121,6 +107,8 @@ namespace implementation {
|
|||
std::string ucs4string_to_string(const ucs4::string &);
|
||||
ucs4::string string_to_ucs4string(const std::string &);
|
||||
std::string ucs4char_to_string(const ucs4::char_t);
|
||||
ucs4::string utf16string_to_ucs4string(const utf16::string &);
|
||||
std::string utf16string_to_string(const utf16::string &);
|
||||
utf16::string ucs4string_to_utf16string(const ucs4::string &);
|
||||
} // end namespace implementation
|
||||
|
||||
|
@ -156,4 +144,14 @@ utf16::string unicode_cast<utf16::string, utf8::string>(const utf8::string &in)
|
|||
return unicode_cast<utf16::string>(u4str);
|
||||
}
|
||||
|
||||
/**
 * Converts a UTF-16 string to UCS-4 by delegating to
 * implementation::utf16string_to_ucs4string().
 */
template <> inline
|
||||
ucs4::string unicode_cast<ucs4::string, utf16::string>(const utf16::string &in) {
|
||||
return implementation::utf16string_to_ucs4string(in);
|
||||
}
|
||||
|
||||
/**
 * Converts a UTF-16 string to a std::string by delegating to
 * implementation::utf16string_to_string().
 */
template <> inline
|
||||
std::string unicode_cast<std::string, utf16::string>(const utf16::string &in) {
|
||||
return implementation::utf16string_to_string(in);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -54,6 +54,9 @@ BOOST_AUTO_TEST_CASE( utils_unicode_test )
|
|||
|
||||
BOOST_CHECK( apple_u4.size() == 5 );
|
||||
BOOST_CHECK_EQUAL( apple_u8, unicode_cast<utf8::string>(apple_u4) );
|
||||
BOOST_CHECK_EQUAL( apple_u8, unicode_cast<utf8::string>(apple_u16) );
|
||||
BOOST_CHECK( apple_u4 == unicode_cast<ucs4::string>(apple_u16) );
|
||||
BOOST_CHECK( apple_u16 == unicode_cast<utf16::string>(apple_u4) );
|
||||
BOOST_CHECK_EQUAL( apple_u8.size(), apple_u16.size() );
|
||||
|
||||
ucs4::string water_u4;
|
||||
|
@ -73,6 +76,9 @@ BOOST_AUTO_TEST_CASE( utils_unicode_test )
|
|||
BOOST_CHECK_EQUAL(nonbmp_u16[0], 0xD800);
|
||||
BOOST_CHECK_EQUAL(nonbmp_u16[1], 0xDC00);
|
||||
BOOST_CHECK_EQUAL(nonbmp_u8, unicode_cast<utf8::string>(nonbmp_u4));
|
||||
BOOST_CHECK_EQUAL(nonbmp_u8, unicode_cast<utf8::string>(nonbmp_u16));
|
||||
BOOST_CHECK(nonbmp_u16 == unicode_cast<utf16::string>(nonbmp_u4));
|
||||
BOOST_CHECK(nonbmp_u4 == unicode_cast<ucs4::string>(nonbmp_u16));
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_CASE( test_lowercase )
|
||||
|
|
Loading…
Add table
Reference in a new issue