Skip to content

Commit ce3f11a

Browse files
committed
ICU-23152 Unicode helper APIs
See #3539
1 parent f0fd39a commit ce3f11a

File tree

21 files changed

+1099
-17
lines changed

21 files changed

+1099
-17
lines changed

icu4c/source/common/common.vcxproj.filters

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1270,6 +1270,9 @@
12701270
<CustomBuild Include="unicode\utfiterator.h">
12711271
<Filter>strings</Filter>
12721272
</CustomBuild>
1273+
<CustomBuild Include="unicode\utfstring.h">
1274+
<Filter>strings</Filter>
1275+
</CustomBuild>
12731276
<CustomBuild Include="unicode\bytestrie.h">
12741277
<Filter>collections</Filter>
12751278
</CustomBuild>

icu4c/source/common/unicode/bytestream.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141

4242
#if U_SHOW_CPLUSPLUS_API
4343

44+
#include <type_traits>
45+
4446
#include "unicode/uobject.h"
4547
#include "unicode/std_string.h"
4648

@@ -260,11 +262,12 @@ class U_COMMON_API CheckedArrayByteSink : public ByteSink {
260262

261263
/**
262264
* Implementation of ByteSink that writes to a "string".
263-
* The StringClass is usually instantiated with a std::string.
265+
* The StringClass is usually instantiated with a std::string or a std::u8string.
264266
* @stable ICU 4.2
265267
*/
266268
template<typename StringClass>
267269
class StringByteSink : public ByteSink {
270+
using Unit = typename StringClass::value_type;
268271
public:
269272
/**
270273
* Constructs a ByteSink that will append bytes to the dest string.
@@ -291,7 +294,13 @@ class StringByteSink : public ByteSink {
291294
* @param n the number of bytes; must be non-negative
292295
* @stable ICU 4.2
293296
*/
294-
virtual void Append(const char* data, int32_t n) override { dest_->append(data, n); }
297+
virtual void Append(const char* data, int32_t n) override {
298+
if constexpr (std::is_same_v<Unit, char>) {
299+
dest_->append(data, n);
300+
} else {
301+
dest_->append(reinterpret_cast<const Unit*>(data), n);
302+
}
303+
}
295304
private:
296305
StringClass* dest_;
297306

icu4c/source/common/unicode/uchar.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2943,7 +2943,7 @@ u_stringHasBinaryProperty(const UChar *s, int32_t length, UProperty which);
29432943
* @return the property as a set
29442944
* @see UProperty
29452945
* @see u_hasBinaryProperty
2946-
* @see Unicode::fromUSet
2946+
* @see UnicodeSet::fromUSet
29472947
* @stable ICU 63
29482948
*/
29492949
U_CAPI const USet * U_EXPORT2

icu4c/source/common/unicode/unistr.h

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,10 @@ class UnicodeStringAppendable; // unicode/appendable.h
215215
*
216216
* The UnicodeString equivalent of std::string’s clear() is remove().
217217
*
218+
* Starting with ICU 78, a UnicodeString is a C++ "range" of char16_t code units.
219+
* utfStringCodePoints() and unsafeUTFStringCodePoints() can be used to iterate over
220+
* the code points.
221+
*
218222
* A UnicodeString may "alias" an external array of characters
219223
* (that is, point to it, rather than own the array)
220224
* whose lifetime must then at least match the lifetime of the aliasing object.
@@ -289,12 +293,17 @@ class UnicodeStringAppendable; // unicode/appendable.h
289293
* [User Guide Strings chapter](https://unicode-org.github.io/icu/userguide/strings#maximizing-performance-with-the-unicodestring-storage-model).
290294
*
291295
* @see utf.h
296+
* @see utfiterator.h
297+
* @see utfStringCodePoints
298+
* @see unsafeUTFStringCodePoints
292299
* @see CharacterIterator
293300
* @stable ICU 2.0
294301
*/
295302
class U_COMMON_API UnicodeString : public Replaceable
296303
{
297304
public:
305+
/** C++ boilerplate @internal */
306+
using value_type = char16_t;
298307

299308
/**
300309
* Constant to be used in the UnicodeString(char *, int32_t, EInvariant) constructor
@@ -1767,7 +1776,8 @@ class U_COMMON_API UnicodeString : public Replaceable
17671776
* Unpaired surrogates are replaced with U+FFFD.
17681777
* Calls toUTF8().
17691778
*
1770-
* @param result A standard string (or a compatible object)
1779+
* @tparam StringClass A std::string or a std::u8string (or a compatible type)
1780+
* @param result A std::string or a std::u8string (or a compatible object)
17711781
* to which the UTF-8 version of the string is appended.
17721782
* @return The string object.
17731783
* @stable ICU 4.2
@@ -1780,6 +1790,27 @@ class U_COMMON_API UnicodeString : public Replaceable
17801790
return result;
17811791
}
17821792

1793+
#ifndef U_HIDE_DRAFT_API
1794+
/**
1795+
* Convert the UnicodeString to a UTF-8 string.
1796+
* Unpaired surrogates are replaced with U+FFFD.
1797+
* Calls toUTF8().
1798+
*
1799+
* @tparam StringClass A std::string or a std::u8string (or a compatible type)
1800+
* @return A std::string or a std::u8string (or a compatible object)
1801+
* with the UTF-8 version of the string.
1802+
* @draft ICU 78
1803+
* @see toUTF8
1804+
*/
1805+
template<typename StringClass>
1806+
StringClass toUTF8String() const {
1807+
StringClass result;
1808+
StringByteSink<StringClass> sbs(&result, length());
1809+
toUTF8(sbs);
1810+
return result;
1811+
}
1812+
#endif // U_HIDE_DRAFT_API
1813+
17831814
/**
17841815
* Convert the UnicodeString to UTF-32.
17851816
* Unpaired surrogates are replaced with U+FFFD.
@@ -1892,6 +1923,33 @@ class U_COMMON_API UnicodeString : public Replaceable
18921923
*/
18931924
inline UBool isBogus() const;
18941925

1926+
#ifndef U_HIDE_DRAFT_API
1927+
/**
1928+
* @return an iterator to the first code unit in this string.
1929+
* The iterator may be a pointer or a contiguous-iterator object.
1930+
* @draft ICU 78
1931+
*/
1932+
auto begin() const {
    // View this string's code units, then hand out the view's begin iterator.
    const std::u16string_view units(*this);
    return units.begin();
}
1933+
/**
1934+
* @return an iterator to just past the last code unit in this string.
1935+
* The iterator may be a pointer or a contiguous-iterator object.
1936+
* @draft ICU 78
1937+
*/
1938+
auto end() const {
    // View this string's code units, then hand out the view's end iterator.
    const std::u16string_view units(*this);
    return units.end();
}
1939+
/**
1940+
* @return a reverse iterator to the last code unit in this string.
1941+
* The iterator is a std::reverse_iterator adapter, not a pointer or contiguous iterator.
1942+
* @draft ICU 78
1943+
*/
1944+
auto rbegin() const {
    // View this string's code units, then hand out the view's reverse-begin iterator.
    const std::u16string_view units(*this);
    return units.rbegin();
}
1945+
/**
1946+
* @return a reverse iterator to just before the first code unit in this string.
1947+
* The iterator is a std::reverse_iterator adapter, not a pointer or contiguous iterator.
1948+
* @draft ICU 78
1949+
*/
1950+
auto rend() const {
    // View this string's code units, then hand out the view's reverse-end iterator.
    const std::u16string_view units(*this);
    return units.rend();
}
1951+
#endif // U_HIDE_DRAFT_API
1952+
18951953
//========================================
18961954
// Write operations
18971955
//========================================
@@ -2318,6 +2376,16 @@ class U_COMMON_API UnicodeString : public Replaceable
23182376
*/
23192377
UnicodeString& append(UChar32 srcChar);
23202378

2379+
#ifndef U_HIDE_DRAFT_API
2380+
/**
2381+
* Appends the code unit `c` to the UnicodeString object.
2382+
* Same as append(c) except does not return *this.
2383+
*
2384+
* @param c the code unit to append
2385+
* @draft ICU 78
2386+
*/
2387+
inline void push_back(char16_t c) {
    // Delegates to append(c); whatever append() returns is intentionally dropped.
    append(c);
}
2388+
#endif // U_HIDE_DRAFT_API
23212389

23222390
/* Insert operations */
23232391

icu4c/source/common/unicode/utf.h

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,39 @@
121121

122122
/* single-code point definitions -------------------------------------------- */
123123

124+
#ifndef U_HIDE_DRAFT_API
125+
126+
/**
127+
* Is c a Unicode code point U+0000..U+10FFFF?
128+
* https://www.unicode.org/glossary/#code_point
129+
*
130+
* @param c 32-bit code point
131+
* @return true or false
132+
* @draft ICU 78
133+
* @see AllCodePoints
134+
* @see U_IS_SCALAR_VALUE
135+
*/
136+
#define U_IS_CODE_POINT(c) ((uint32_t)(c)<0x110000)
137+
138+
/**
139+
* Is c a Unicode scalar value, that is, a non-surrogate code point?
140+
* Only scalar values can be represented in well-formed UTF-8/16/32.
141+
* https://www.unicode.org/glossary/#unicode_scalar_value
142+
*
143+
* @param c 32-bit code point
144+
* @return true or false
145+
* @draft ICU 78
146+
* @see AllScalarValues
147+
* @see U_IS_CODE_POINT
148+
*/
149+
#define U_IS_SCALAR_VALUE(c) ((uint32_t)(c)<0xd800 || ((c)>=0xe000 && (c)<=0x10ffff))
150+
151+
#endif // U_HIDE_DRAFT_API
152+
124153
/**
125154
* Is this code point a Unicode noncharacter?
155+
* https://www.unicode.org/glossary/#noncharacter
156+
*
126157
* @param c 32-bit code point
127158
* @return true or false
128159
* @stable ICU 2.4
@@ -150,7 +181,7 @@
150181
*/
151182
#define U_IS_UNICODE_CHAR(c) \
152183
((uint32_t)(c)<0xd800 || \
153-
(0xdfff<(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
184+
(0xe000<=(c) && (c)<=0x10ffff && !U_IS_UNICODE_NONCHAR(c)))
154185

155186
/**
156187
* Is this code point a BMP code point (U+0000..U+ffff)?

icu4c/source/common/unicode/utf8.h

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
170170
* @return true or false
171171
* @stable ICU 2.4
172172
*/
173-
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
173+
#define U8_IS_SINGLE(c) (0<=(int8_t)(c))
174174

175175
/**
176176
* Is this code unit (byte) a UTF-8 lead byte? (0xC2..0xF4)
@@ -214,6 +214,32 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
214214
*/
215215
#define U8_MAX_LENGTH 4
216216

217+
#ifndef U_HIDE_DRAFT_API
218+
219+
/**
220+
* Returns the length of a well-formed UTF-8 byte sequence according to its lead byte.
221+
* Returns 1 for 0..0xc1 as well as for 0xf5..0xff.
222+
* leadByte might be evaluated multiple times.
223+
*
224+
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
225+
* @return 1..4
226+
* @draft ICU 78
227+
*/
228+
#define U8_LENGTH_FROM_LEAD_BYTE(leadByte) (1 + U8_COUNT_TRAIL_BYTES(leadByte))
229+
230+
/**
231+
* Returns the length of a well-formed UTF-8 byte sequence according to its lead byte.
232+
* Returns 1 for 0..0xc1. Undefined for 0xf5..0xff.
233+
* leadByte might be evaluated multiple times.
234+
*
235+
* @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff.
236+
* @return 1..4
237+
* @draft ICU 78
238+
*/
239+
#define U8_LENGTH_FROM_LEAD_BYTE_UNSAFE(leadByte) (1 + U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte))
240+
241+
#endif // U_HIDE_DRAFT_API
242+
217243
/**
218244
* Get a code point from a string at a random-access offset,
219245
* without changing the offset.

icu4c/source/common/unicode/utfiterator.h

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,8 +232,112 @@ template<typename Range>
232232
constexpr bool range = range_type<Range>::value;
233233

234234
#endif
235+
236+
/**
 * Forward iterator that counts upward through code point values of type CP32.
 * operator*() returns the current value; operator++() advances by one.
 * When skipSurrogates is true, incrementing from U+D7FF continues directly at U+E000.
 * @internal
 */
template<typename CP32, bool skipSurrogates>
class CodePointsIterator {
    static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
public:
    /** C++ iterator boilerplate @internal */
    using value_type = CP32;
    /** C++ iterator boilerplate @internal */
    using reference = value_type;
    /** C++ iterator boilerplate @internal */
    using pointer = CP32 *;
    /** C++ iterator boilerplate @internal */
    using difference_type = int32_t;
    /** C++ iterator boilerplate @internal */
    using iterator_category = std::forward_iterator_tag;

    /**
     * Default constructor; the iterator starts at code point 0.
     * Present because forward iterators must be default-constructible.
     * @internal
     */
    CodePointsIterator() = default;
    /** Constructs an iterator positioned at c. @internal */
    inline CodePointsIterator(CP32 c) : c_(c) {}
    /** @return true if both iterators are at the same code point. @internal */
    inline bool operator==(const CodePointsIterator &other) const { return c_ == other.c_; }
    /** @return true if the iterators are at different code points. @internal */
    inline bool operator!=(const CodePointsIterator &other) const { return !operator==(other); }
    /** @return the current code point, by value. @internal */
    inline CP32 operator*() const { return c_; }
    /** Pre-increment: advances to the next value. @internal */
    inline CodePointsIterator &operator++() {
        ++c_;
        // skipSurrogates is a template parameter, so the skip is chosen at compile time
        // and the all-code-points instantiation carries no extra test.
        if constexpr (skipSurrogates) {
            if (c_ == 0xd800) {
                c_ = 0xe000;
            }
        }
        return *this;
    }
    /** Post-increment: advances and returns the previous position. @internal */
    inline CodePointsIterator operator++(int) {
        CodePointsIterator result(*this);
        ++(*this);
        return result;
    }

private:
    CP32 c_ = 0;
};
278+
235279
} // namespace prv
236280

281+
/**
282+
* A C++ "range" over all Unicode code points U+0000..U+10FFFF.
283+
* https://www.unicode.org/glossary/#code_point
284+
*
285+
* Intended for test and builder code.
286+
*
287+
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
288+
* @draft ICU 78
289+
* @see U_IS_CODE_POINT
290+
*/
291+
template<typename CP32>
292+
class AllCodePoints {
293+
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
294+
public:
295+
/** Constructor. @draft ICU 78 */
296+
AllCodePoints() {}
297+
/**
298+
* @return an iterator over all Unicode code points.
299+
* The iterator returns CP32 integers.
300+
* @draft ICU 78
301+
*/
302+
auto begin() const { return prv::CodePointsIterator<CP32, false>(0); }
303+
/**
304+
* @return an exclusive-end iterator over all Unicode code points.
305+
* @draft ICU 78
306+
*/
307+
auto end() const { return prv::CodePointsIterator<CP32, false>(0x110000); }
308+
};
309+
310+
/**
311+
* A C++ "range" over all Unicode scalar values U+0000..U+D7FF & U+E000..U+10FFFF.
312+
* That is, all code points except surrogates.
313+
* Only scalar values can be represented in well-formed UTF-8/16/32.
314+
* https://www.unicode.org/glossary/#unicode_scalar_value
315+
*
316+
* Intended for test and builder code.
317+
*
318+
* @tparam CP32 Code point type: UChar32 (=int32_t) or char32_t or uint32_t
319+
* @draft ICU 78
320+
* @see U_IS_SCALAR_VALUE
321+
*/
322+
template<typename CP32>
323+
class AllScalarValues {
324+
static_assert(sizeof(CP32) == 4, "CP32 must be a 32-bit type to hold a code point");
325+
public:
326+
/** Constructor. @draft ICU 78 */
327+
AllScalarValues() {}
328+
/**
329+
* @return an iterator over all Unicode scalar values.
330+
* The iterator returns CP32 integers.
331+
* @draft ICU 78
332+
*/
333+
auto begin() const { return prv::CodePointsIterator<CP32, true>(0); }
334+
/**
335+
* @return an exclusive-end iterator over all Unicode scalar values.
336+
* @draft ICU 78
337+
*/
338+
auto end() const { return prv::CodePointsIterator<CP32, true>(0x110000); }
339+
};
340+
237341
/**
238342
* Result of decoding a code unit sequence for one code point.
239343
* Returned from non-validating Unicode string code point iterators.

0 commit comments

Comments
 (0)