add utf16::iterator

This commit is contained in:
gfgtdf 2014-10-08 18:24:12 +02:00
parent fd9cda9cd2
commit 41fa779b08
3 changed files with 162 additions and 97 deletions

View file

@ -0,0 +1,98 @@
#ifndef UCS4_ITERATOR_BASE_HPP_INCLUDED
#define UCS4_ITERATOR_BASE_HPP_INCLUDED
#include <stdint.h> //uint32_t
#include <iterator> //input_iterator_tag
#include <utility> //pair
#include <cstddef> //ptrdiff_t
#include <cassert> //assert
namespace ucs4
{
	/** A single Unicode code point. */
	typedef uint32_t char_t;

	/**
	 * Input iterator that walks an encoded string one code point at a time.
	 *
	 * The decoding itself is delegated to a policy type so the same iterator
	 * logic can be shared between encodings.
	 *
	 * @tparam string_type            Container holding the encoded data. It must
	 *                                provide @c const_iterator, @c begin() and
	 *                                @c end().
	 * @tparam update_implementation  Policy providing
	 *   <tt>static char_t get_next_char(const_iterator& start, const const_iterator& end)</tt>.
	 *   It is called with @a start at the first unit of a code point and is
	 *   expected to advance @a start past that code point and return its value.
	 *
	 * Anything thrown by @c get_next_char propagates out of the constructors,
	 * @c begin() and both increment operators.
	 */
	template<typename string_type, typename update_implementation>
	class iterator_base
	{
	public:
		typedef std::input_iterator_tag iterator_category;
		typedef ucs4::char_t value_type;
		typedef std::ptrdiff_t difference_type;
		typedef ucs4::char_t* pointer;
		// NOTE(review): operator*() returns by value, not by this reference type.
		typedef ucs4::char_t& reference;

		/** Creates an iterator positioned on the first code point of @a str. */
		iterator_base(const string_type& str)
			: current_char(0)
			, string_end(str.end())
			, current_substr(std::make_pair(str.begin(), str.begin()))
		{
			update();
		}

		/** Creates an iterator positioned on the first code point of [@a begin, @a end). */
		iterator_base(typename string_type::const_iterator const &begin, typename string_type::const_iterator const &end)
			: current_char(0)
			, string_end(end)
			, current_substr(std::make_pair(begin, begin))
		{
			update();
		}

		/** Returns an iterator to the first code point of @a str. */
		static iterator_base begin(const string_type& str)
		{
			return iterator_base(str.begin(), str.end());
		}

		/** Returns the past-the-end iterator for @a str. */
		static iterator_base end(const string_type& str)
		{
			return iterator_base(str.end(), str.end());
		}

		/** Two iterators are equal when their current code points start at the same position. */
		bool operator==(const iterator_base& a) const
		{
			return current_substr.first == a.current_substr.first;
		}

		bool operator!=(const iterator_base& a) const
		{
			return ! (*this == a);
		}

		/** Pre-increment: moves to the code point that follows the current one. */
		iterator_base& operator++()
		{
			current_substr.first = current_substr.second;
			update();
			return *this;
		}

		/**
		 * Post-increment: moves to the next code point and returns a copy of
		 * the iterator as it was before the move. Required for input iterators
		 * so that @c it++ and @c *it++ are usable.
		 */
		iterator_base operator++(int)
		{
			iterator_base previous(*this);
			++*this;
			return previous;
		}

		/**
		 * Returns the code point decoded at the current position.
		 * At end-of-string nothing is decoded, so the value is whatever was
		 * stored last (0 for a freshly constructed end iterator).
		 */
		ucs4::char_t operator*() const
		{
			return current_char;
		}

		/** Returns true when the current code point is the last one in the string. */
		bool next_is_end() const
		{
			return current_substr.second == string_end;
		}

		/** Returns the [first, second) range of encoded units forming the current code point. */
		const std::pair<typename string_type::const_iterator, typename string_type::const_iterator>& substr() const
		{
			return current_substr;
		}

	private:
		/**
		 * Decodes the code point starting at @c current_substr.first.
		 * On entry both ends of @c current_substr must coincide; the policy
		 * then advances @c current_substr.second to the end of the code point.
		 */
		void update()
		{
			assert(current_substr.first == current_substr.second);
			// Do not try to decode anything at end-of-string.
			if(current_substr.first == string_end)
				return;
			current_char = update_implementation::get_next_char(current_substr.second, string_end);
		}

		/** Value of the code point at the current position. */
		ucs4::char_t current_char;
		/** End of the underlying encoded string. */
		typename string_type::const_iterator string_end;
		/** Range of encoded units occupied by the current code point. */
		std::pair<typename string_type::const_iterator, typename string_type::const_iterator> current_substr;
	};
}
#endif

View file

@ -142,6 +142,45 @@ utf16::string ucs4string_to_utf16string(const ucs4::string &src)
} // implementation namespace } // implementation namespace
namespace utf16 {
/**
 * Decodes the code point that begins at @a start in a UTF-16 sequence.
 *
 * @param start  Position of the first unit to decode; must not equal @a end.
 *               It is advanced past every unit that was consumed. When an
 *               exception is thrown it has already been moved past the first
 *               unit.
 * @param end    End of the UTF-16 sequence.
 * @return       The first unit itself when it is not a surrogate, otherwise
 *               the code point assembled from the lead/trail surrogate pair.
 * @throws utf8::invalid_utf8_exception  if the first unit is a trail
 *               surrogate, if the sequence ends right after a lead surrogate,
 *               or if a lead surrogate is not followed by a trail surrogate.
 */
ucs4::char_t iterator_implementation::get_next_char(utf16::string::const_iterator& start, const utf16::string::const_iterator& end)
{
	// The upper six bits of a 16-bit unit tag it as lead/trail surrogate,
	// the lower ten bits carry the payload.
	const int32_t payload_mask = 0x3FF;
	const int32_t tag_mask = 0xFC00;
	const int32_t lead_tag = 0xD800;
	const int32_t trail_tag = 0xDC00;

	assert(start != end);

	// Only the low 16 bits of the stored unit are considered.
	const uint32_t first_unit = static_cast<uint16_t>(*start);
	++start;
	const uint32_t first_tag = first_unit & tag_mask;

	if(first_tag == trail_tag)
	{
		// A trail surrogate may only appear after a lead surrogate.
		throw utf8::invalid_utf8_exception();
	}

	if(first_tag != lead_tag)
	{
		// Not a surrogate: the unit is the code point.
		return first_unit;
	}

	if(start == end)
	{
		// The string ends in the middle of a surrogate pair.
		throw utf8::invalid_utf8_exception();
	}

	if((*start & tag_mask) != trail_tag)
	{
		// A lead surrogate must be followed by a trail surrogate.
		throw utf8::invalid_utf8_exception();
	}

	// Combine the two 10-bit payloads and add the supplementary-plane offset.
	const uint32_t code_point = ((first_unit & payload_mask) << 10) + (*start & payload_mask) + 0x10000;
	++start;
	return code_point;
}
}
namespace utf8 { namespace utf8 {
static int byte_size_from_utf8_first(const unsigned char ch) static int byte_size_from_utf8_first(const unsigned char ch)
@ -159,72 +198,12 @@ static int byte_size_from_utf8_first(const unsigned char ch)
return count; return count;
} }
iterator::iterator(const std::string& str) : ucs4::char_t iterator_implementation::get_next_char(std::string::const_iterator& start, const std::string::const_iterator& string_end)
current_char(0),
string_end(str.end()),
current_substr(std::make_pair(str.begin(), str.begin()))
{ {
update(); assert(start != string_end);
} size_t size = byte_size_from_utf8_first(*start);
iterator::iterator(std::string::const_iterator const &beg, uint32_t current_char = static_cast<unsigned char>(*start);
std::string::const_iterator const &end) :
current_char(0),
string_end(end),
current_substr(std::make_pair(beg, beg))
{
update();
}
iterator iterator::begin(std::string const &str)
{
return iterator(str.begin(), str.end());
}
iterator iterator::end(const std::string& str)
{
return iterator(str.end(), str.end());
}
bool iterator::operator==(const utf8::iterator& a) const
{
return current_substr.first == a.current_substr.first;
}
iterator& iterator::operator++()
{
current_substr.first = current_substr.second;
update();
return *this;
}
ucs4::char_t iterator::operator*() const
{
return current_char;
}
bool iterator::next_is_end()
{
if(current_substr.second == string_end)
return true;
return false;
}
const std::pair<std::string::const_iterator, std::string::const_iterator>& iterator::substr() const
{
return current_substr;
}
void iterator::update()
{
// Do not try to update the current unicode char at end-of-string.
if(current_substr.first == string_end)
return;
size_t size = byte_size_from_utf8_first(*current_substr.first);
current_substr.second = current_substr.first + size;
current_char = static_cast<unsigned char>(*current_substr.first);
// Convert the first character // Convert the first character
if(size != 1) { if(size != 1) {
@ -232,25 +211,27 @@ void iterator::update()
} }
// Convert the continuation bytes // Convert the continuation bytes
for(std::string::const_iterator c = current_substr.first+1; // i == number of '++start'
c != current_substr.second; ++c) { ++start;
for(size_t i = 1; i < size; ++i, ++start) {
// If the string ends occurs within an UTF8-sequence, this is bad. // If the string ends occurs within an UTF8-sequence, this is bad.
if (c == string_end) if (start == string_end)
throw invalid_utf8_exception(); throw invalid_utf8_exception();
if ((*c & 0xC0) != 0x80) if ((*start & 0xC0) != 0x80)
throw invalid_utf8_exception(); throw invalid_utf8_exception();
current_char = (current_char << 6) | (static_cast<unsigned char>(*c) & 0x3F); current_char = (current_char << 6) | (static_cast<unsigned char>(*start) & 0x3F);
} }
//i == size => start was increased size times.
// Check for non-shortest-form encoding // Check for non-shortest-form encoding
// This has been forbidden in Unicode 3.1 for security reasons // This has been forbidden in Unicode 3.1 for security reasons
if (size > ::byte_size_from_ucs4_codepoint(current_char)) if (size > ::byte_size_from_ucs4_codepoint(current_char))
throw invalid_utf8_exception(); throw invalid_utf8_exception();
return current_char;
} }
utf8::string lowercase(const utf8::string& s) utf8::string lowercase(const utf8::string& s)
{ {
if(!s.empty()) { if(!s.empty()) {

View file

@ -16,6 +16,7 @@
#ifndef SERIALIZATION_UNICODE_HPP_INCLUDED #ifndef SERIALIZATION_UNICODE_HPP_INCLUDED
#define SERIALIZATION_UNICODE_HPP_INCLUDED #define SERIALIZATION_UNICODE_HPP_INCLUDED
#include "ucs4_iterator_base.hpp"
#include <boost/static_assert.hpp> #include <boost/static_assert.hpp>
#include <string> #include <string>
#include <vector> #include <vector>
@ -35,6 +36,13 @@ namespace utf8 {
*/ */
namespace utf16 { namespace utf16 {
typedef std::vector<wchar_t> string; typedef std::vector<wchar_t> string;
struct iterator_implementation
{
static ucs4::char_t get_next_char(utf16::string::const_iterator& start, const utf16::string::const_iterator& end);
};
typedef ucs4::iterator_base<utf16::string, iterator_implementation> iterator;
} }
namespace utf8 { namespace utf8 {
@ -43,37 +51,15 @@ namespace utf8 {
* Functions for converting Unicode wide-char strings to UTF-8 encoded strings, * Functions for converting Unicode wide-char strings to UTF-8 encoded strings,
* back and forth. * back and forth.
*/ */
/** also used for invalid utf16 or ucs4 strings */
class invalid_utf8_exception : public std::exception {}; class invalid_utf8_exception : public std::exception {};
struct iterator_implementation
class iterator
{ {
public: static ucs4::char_t get_next_char(std::string::const_iterator& start, const std::string::const_iterator& end);
typedef std::input_iterator_tag iterator_category;
typedef ucs4::char_t value_type;
typedef ptrdiff_t difference_type;
typedef ucs4::char_t* pointer;
typedef ucs4::char_t& reference;
iterator(const std::string& str);
iterator(std::string::const_iterator const &begin, std::string::const_iterator const &end);
static iterator begin(const std::string& str);
static iterator end(const std::string& str);
bool operator==(const utf8::iterator& a) const;
bool operator!=(const utf8::iterator& a) const { return ! (*this == a); }
iterator& operator++();
ucs4::char_t operator*() const;
bool next_is_end();
const std::pair<std::string::const_iterator, std::string::const_iterator>& substr() const;
private:
void update();
ucs4::char_t current_char;
std::string::const_iterator string_end;
std::pair<std::string::const_iterator, std::string::const_iterator> current_substr;
}; };
typedef ucs4::iterator_base<std::string, iterator_implementation> iterator;
/** Returns a lowercased version of the string. */ /** Returns a lowercased version of the string. */
utf8::string lowercase(const utf8::string&); utf8::string lowercase(const utf8::string&);