-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutf8.hpp
78 lines (70 loc) · 2.08 KB
/
utf8.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#pragma once
#if WS_CLIENT_USE_SIMD_UTF8 == 1
#include <simdutf.h>
#endif
namespace ws_client
{
#if WS_CLIENT_USE_SIMD_UTF8 == 1
/**
 * Checks whether the first `len` bytes of `str` form valid UTF-8.
 *
 * Validation is delegated to `simdutf::validate_utf8`.
 *
 * @param str Pointer to the bytes to validate; not read when `len <= 0`.
 * @param len Number of bytes to validate.
 * @return `true` if the bytes are valid UTF-8 or `len <= 0`, otherwise `false`.
 */
inline bool is_valid_utf8(const char* str, int len) noexcept
{
    // A negative `int` would be converted to a huge unsigned length and make
    // the validator read out of bounds. Treat it like empty input, which is
    // what the non-SIMD implementation does as well.
    if (len <= 0)
        return true;

    // `len` is strictly positive here, so the conversion is lossless.
    return simdutf::validate_utf8(str, static_cast<unsigned int>(len));
}
#else
/**
 * Checks whether the first `len` bytes of `str` form valid UTF-8.
 *
 * Portable byte-by-byte implementation used when simdutf is not enabled.
 * It rejects truncated sequences, stray continuation bytes, overlong
 * encodings, UTF-16 surrogate halves (U+D800..U+DFFF) and code points
 * above U+10FFFF.
 *
 * @param str Pointer to the bytes to validate; not read when `len <= 0`.
 * @param len Number of bytes to validate.
 * @return `true` if the bytes are valid UTF-8 or `len <= 0`, otherwise `false`.
 */
inline bool is_valid_utf8(const char* str, int len) noexcept
{
    const unsigned char* s = reinterpret_cast<const unsigned char*>(str);

    // True when `b` has the continuation-byte form 10xxxxxx.
    const auto is_continuation = [](unsigned char b) noexcept { return (b & 0xc0) == 0x80; };

    for (int i = 0; i < len;)
    {
        const unsigned char lead = s[i];

        // Bytes available starting at `lead` (always >= 1 inside the loop).
        // Comparing against this instead of computing `i + k` avoids signed
        // overflow when `len` is close to INT_MAX.
        const int remaining = len - i;

        if (lead < 0x80)
        {
            // 0xxxxxxx
            i += 1;
        }
        else if ((lead & 0xe0) == 0xc0)
        {
            // 110XXXXx 10xxxxxx
            if (remaining < 2 || !is_continuation(s[i + 1]))
                return false;
            if ((lead & 0xfe) == 0xc0) // Overlong: 0xC0/0xC1 encode values below U+0080
                return false;
            i += 2;
        }
        else if ((lead & 0xf0) == 0xe0)
        {
            // 1110XXXX 10Xxxxxx 10xxxxxx
            if (remaining < 3 || !is_continuation(s[i + 1]) || !is_continuation(s[i + 2]))
                return false;
            if (lead == 0xe0 && (s[i + 1] & 0xe0) == 0x80) // Overlong: below U+0800
                return false;
            if (lead == 0xed && (s[i + 1] & 0xa0) == 0xa0) // Surrogate half: U+D800..U+DFFF
                return false;
            i += 3;
        }
        else if ((lead & 0xf8) == 0xf0)
        {
            // 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
            if (remaining < 4 || !is_continuation(s[i + 1]) || !is_continuation(s[i + 2]) ||
                !is_continuation(s[i + 3]))
                return false;
            if (lead == 0xf0 && (s[i + 1] & 0xf0) == 0x80) // Overlong: below U+10000
                return false;
            if (lead > 0xf4 || (lead == 0xf4 && s[i + 1] > 0x8f)) // Greater than U+10FFFF
                return false;
            i += 4;
        }
        else
        {
            return false; // Invalid UTF-8 start byte
        }
    }
    return true;
}
#endif
} // namespace ws_client