ICU 78.2
Loading...
Searching...
No Matches
Data Structures | Typedefs | Enumerations | Variables
utfiterator.h File Reference

C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed). More...

#include "unicode/utypes.h"
#include <iterator>
#include <string>
#include <string_view>
#include <type_traits>
#include "unicode/utf16.h"
#include "unicode/utf8.h"
#include "unicode/uversion.h"

Go to the source code of this file.

Data Structures

struct  U_HEADER_ONLY_NAMESPACE::prv::range_type< Range, typename >
 
struct  U_HEADER_ONLY_NAMESPACE::prv::range_type< Range, std::void_t< decltype(std::declval< Range >().begin()), decltype(std::declval< Range >().end())> >
 
struct  U_HEADER_ONLY_NAMESPACE::prv::is_basic_string_view< T >
 
struct  U_HEADER_ONLY_NAMESPACE::prv::is_basic_string_view< std::basic_string_view< Args... > >
 
class  U_HEADER_ONLY_NAMESPACE::prv::CodePointsIterator< CP32, skipSurrogates >
 
class  U_HEADER_ONLY_NAMESPACE::AllCodePoints< CP32 >
 A C++ "range" over all Unicode code points U+0000..U+10FFFF. More...
 
class  U_HEADER_ONLY_NAMESPACE::AllScalarValues< CP32 >
 A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF. More...
 
class  U_HEADER_ONLY_NAMESPACE::UnsafeCodeUnits< CP32, UnitIter, typename >
 Result of decoding a code unit sequence for one code point. More...
 
class  U_HEADER_ONLY_NAMESPACE::CodeUnits< CP32, UnitIter, typename >
 Result of validating and decoding a code unit sequence for one code point. More...
 
class  U_HEADER_ONLY_NAMESPACE::UTFIterator< CP32, behavior, UnitIter, LimitIter, typename >
 Validating iterator over the code points in a Unicode string. More...
 

Typedefs

typedef enum UTFIllFormedBehavior UTFIllFormedBehavior
 Some defined behaviors for handling ill-formed Unicode strings.
 
template<typename Iter >
using U_HEADER_ONLY_NAMESPACE::prv::iter_value_t = typename std::iterator_traits< Iter >::value_type
 
template<typename Iter >
using U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t = typename std::iterator_traits< Iter >::difference_type
 

Enumerations

enum  UTFIllFormedBehavior { UTF_BEHAVIOR_NEGATIVE , UTF_BEHAVIOR_FFFD , UTF_BEHAVIOR_SURROGATE }
 Some defined behaviors for handling ill-formed Unicode strings. More...
 

Variables

template<typename Iter >
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::forward_iterator
 
template<typename Iter >
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::bidirectional_iterator
 
template<typename Range >
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::range = range_type<Range>::value
 
template<typename T >
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::is_basic_string_view_v = is_basic_string_view<T>::value
 

Detailed Description

C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed).

Sample code:

#include <iostream>
#include <iterator>
#include <string>
#include <string_view>
#include "unicode/utypes.h"
#include "unicode/utfiterator.h"
using icu::header::utfIterator;
using icu::header::utfStringCodePoints;
using icu::header::unsafeUTFIterator;
using icu::header::unsafeUTFStringCodePoints;
// Walks a UTF-16 string with a range-based for loop and adds up its code points.
// With UTF_BEHAVIOR_NEGATIVE an ill-formed sequence contributes a negative value.
// The sum has no meaning of its own; it only keeps the demonstration minimal.
int32_t rangeLoop16(std::u16string_view s) {
    int32_t total = 0;
    for (auto sequence : utfStringCodePoints<UChar32, UTF_BEHAVIOR_NEGATIVE>(s)) {
        total += sequence.codePoint();  // < 0 if ill-formed
    }
    return total;
}
// Walks a UTF-16 string with explicit iterators, dereferencing and
// post-incrementing in a single expression, and adds up the code points.
// With UTF_BEHAVIOR_FFFD an ill-formed sequence contributes U+FFFD.
int32_t loopIterPlusPlus16(std::u16string_view s) {
    auto codePoints = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
    auto pos = codePoints.begin();
    auto stop = codePoints.end();
    int32_t total = 0;
    while (pos != stop) {
        total += (*pos++).codePoint();  // U+FFFD if ill-formed
    }
    return total;
}
// Walks a UTF-16 string from its end towards its beginning by pre-decrementing
// a forward iterator, adding up the code points.
// With UTF_BEHAVIOR_SURROGATE an unpaired surrogate is returned as its own
// surrogate code point.
int32_t backwardLoop16(std::u16string_view s) {
    auto codePoints = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
    auto first = codePoints.begin();
    auto pos = codePoints.end();
    int32_t total = 0;
    while (first != pos) {
        total += (*--pos).codePoint();  // surrogate code point if unpaired / ill-formed
    }
    return total;
}
// Walks a UTF-8 string back to front through the range's reverse iterators
// (rbegin()/rend()) and adds up the code points.
// With UTF_BEHAVIOR_FFFD an ill-formed sequence contributes U+FFFD.
int32_t reverseLoop8(std::string_view s) {
    auto codePoints = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
    auto pos = codePoints.rbegin();
    auto stop = codePoints.rend();
    int32_t total = 0;
    while (pos != stop) {
        total += pos->codePoint();  // U+FFFD if ill-formed
        ++pos;
    }
    return total;
}
// Counts the code points in a UTF-16 string by measuring the distance between
// the begin and end iterators of the code point range.
int32_t countCodePoints16(std::u16string_view s) {
    auto codePoints = utfStringCodePoints<UChar32, UTF_BEHAVIOR_SURROGATE>(s);
    const auto count = std::distance(codePoints.begin(), codePoints.end());
    // std::distance yields the iterator's difference_type; the narrowing to the
    // declared int32_t return type is spelled out here instead of left implicit.
    return static_cast<int32_t>(count);
}
// Same range-based loop as the validating variant, but over the "unsafe" range.
// NOTE(review): presumably the unsafe API skips validation and expects
// well-formed input -- confirm against the UnsafeCodeUnits documentation.
int32_t unsafeRangeLoop16(std::u16string_view s) {
    int32_t total = 0;
    for (auto sequence : unsafeUTFStringCodePoints<UChar32>(s)) {
        total += sequence.codePoint();
    }
    return total;
}
// Walks a UTF-8 string back to front through the reverse iterators of the
// "unsafe" code point range and adds up the code points.
// NOTE(review): presumably the unsafe API skips validation and expects
// well-formed input -- confirm against the UnsafeCodeUnits documentation.
int32_t unsafeReverseLoop8(std::string_view s) {
    auto codePoints = unsafeUTFStringCodePoints<UChar32>(s);
    auto pos = codePoints.rbegin();
    auto stop = codePoints.rend();
    int32_t total = 0;
    while (pos != stop) {
        total += pos->codePoint();
        ++pos;
    }
    return total;
}
// Returns the first code point of a UTF-16 string.
// An empty string yields U+FFFD; with UTF_BEHAVIOR_FFFD an ill-formed first
// sequence yields U+FFFD as well.
char32_t firstCodePointOrFFFD16(std::u16string_view s) {
    if (s.empty()) {
        // Nothing to decode: do not dereference begin() of an empty range.
        return 0xfffd;
    }
    auto codePoints = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
    auto first = codePoints.begin();
    return first->codePoint();
}
// Returns the code units of the first code point of a UTF-8 string as a view.
// An empty string view is returned when the input is empty or when its first
// sequence is not well-formed.
std::string_view firstSequence8(std::string_view s) {
    if (s.empty()) {
        return {};
    }
    auto codePoints = utfStringCodePoints<char32_t, UTF_BEHAVIOR_FFFD>(s);
    auto firstUnits = *(codePoints.begin());
    if (!firstUnits.wellFormed()) {
        return {};
    }
    return firstUnits.stringView();
}
// Decodes everything readable from `in` into a UTF-32 string.
// InputStream: some istream or streambuf; it must expose a char_type member.
// With UTF_BEHAVIOR_FFFD an ill-formed sequence is appended as U+FFFD.
template<typename InputStream>
std::u32string cpFromInput(InputStream &in) {
    // This is a single-pass input_iterator.
    std::istreambuf_iterator unitsBegin(in);
    // A default-constructed istreambuf_iterator is the end-of-stream marker.
    std::istreambuf_iterator<typename InputStream::char_type> unitsEnd;
    auto pos = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(unitsBegin);
    auto stop = utfIterator<char32_t, UTF_BEHAVIOR_FFFD>(unitsEnd);
    std::u32string decoded;
    while (pos != stop) {
        decoded.push_back(pos->codePoint());
        ++pos;
    }
    return decoded;
}
// Decodes the code units read from std::cin into a UTF-32 string.
std::u32string cpFromStdin() {
    return cpFromInput(std::cin);
}
// Decodes the code units read from std::wcin into a UTF-32 string.
std::u32string cpFromWideStdin() {
    return cpFromInput(std::wcin);
}
Symbols and files referenced by the sample code:
- UChar32 (typedef int32_t UChar32): Define UChar32 as a type for single Unicode code points. Definition: umachine.h:449
- utfiterator.h: C++ header-only API: C++ iterators over Unicode strings (=UTF-8/16/32 if well-formed).
- UTF_BEHAVIOR_NEGATIVE: Returns a negative value (-1=U_SENTINEL) instead of a code point.
- utypes.h: Basic definitions for ICU, for both C and C++ APIs.

Definition in file utfiterator.h.

Typedef Documentation

◆ iter_difference_t

template<typename Iter >
using U_HEADER_ONLY_NAMESPACE::prv::iter_difference_t = typename std::iterator_traits<Iter>::difference_type
Internal:
Do not use. This API is for internal use only.

Definition at line 203 of file utfiterator.h.

◆ iter_value_t

template<typename Iter >
using U_HEADER_ONLY_NAMESPACE::prv::iter_value_t = typename std::iterator_traits<Iter>::value_type
Internal:
Do not use. This API is for internal use only.

Definition at line 199 of file utfiterator.h.

◆ UTFIllFormedBehavior

Some defined behaviors for handling ill-formed Unicode strings.

This is a template parameter for UTFIterator and related classes.

When a validating UTFIterator encounters an ill-formed code unit sequence, then CodeUnits.codePoint() is a value according to this parameter.

Draft:
This API may change in future versions. It was introduced in ICU 78.
See also
CodeUnits
UTFIterator
UTFStringCodePoints

Enumeration Type Documentation

◆ UTFIllFormedBehavior

Some defined behaviors for handling ill-formed Unicode strings.

This is a template parameter for UTFIterator and related classes.

When a validating UTFIterator encounters an ill-formed code unit sequence, then CodeUnits.codePoint() is a value according to this parameter.

Draft:
This API may change in future versions. It was introduced in ICU 78.
See also
CodeUnits
UTFIterator
UTFStringCodePoints
Enumerator
UTF_BEHAVIOR_NEGATIVE 

Returns a negative value (-1=U_SENTINEL) instead of a code point.

If the CP32 template parameter for the relevant classes is an unsigned type, then the negative value becomes 0xffffffff=UINT32_MAX.

Draft:
This API may change in future versions. It was introduced in ICU 78.
UTF_BEHAVIOR_FFFD 

Returns U+FFFD Replacement Character.

Draft:
This API may change in future versions. It was introduced in ICU 78.
UTF_BEHAVIOR_SURROGATE 

UTF-8: Not allowed; UTF-16: returns the unpaired surrogate; UTF-32: returns the surrogate code point, or U+FFFD if out of range.

Draft:
This API may change in future versions. It was introduced in ICU 78.

Definition at line 149 of file utfiterator.h.

Variable Documentation

◆ bidirectional_iterator

template<typename Iter >
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::bidirectional_iterator
constexpr
Initial value:
=
std::is_base_of_v<
std::bidirectional_iterator_tag,
typename std::iterator_traits<Iter>::iterator_category>
Internal:
Do not use. This API is for internal use only.

Definition at line 214 of file utfiterator.h.

◆ forward_iterator

template<typename Iter >
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::forward_iterator
constexpr
Initial value:
=
std::is_base_of_v<
std::forward_iterator_tag,
typename std::iterator_traits<Iter>::iterator_category>
Internal:
Do not use. This API is for internal use only.

Definition at line 207 of file utfiterator.h.

◆ is_basic_string_view_v

template<typename T >
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::is_basic_string_view_v = is_basic_string_view<T>::value
constexpr
Internal:
Do not use. This API is for internal use only.

Definition at line 244 of file utfiterator.h.

◆ range

template<typename Range >
constexpr bool U_HEADER_ONLY_NAMESPACE::prv::range = range_type<Range>::value
constexpr
Internal:
Do not use. This API is for internal use only.

Definition at line 232 of file utfiterator.h.