blob: 3181eca907b9e362c2de6704b308e6fe4a15b7d4 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
/*
* Copyright (c) 2020 Proofpoint, Inc. and its suppliers.
* All rights reserved.
*
* By using this file, you agree to the terms and conditions set
* forth in the LICENSE file which can be found at the top level of
* the sendmail distribution.
*
*/
#include <sm/gen.h>
#include <sm/sendmail.h>
#include <sm/ixlen.h>
#if USE_EAI
/*
**  legal utf-8 byte sequence
**  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
**
**  Code Points          1st byte  2nd byte  3rd byte  4th byte
**  U+0000..U+007F       00..7F
**  U+0080..U+07FF       C2..DF    80..BF
**  U+0800..U+0FFF       E0        A0..BF    80..BF
**  U+1000..U+CFFF       E1..EC    80..BF    80..BF
**  U+D000..U+D7FF       ED        80..9F    80..BF
**  U+E000..U+FFFF       EE..EF    80..BF    80..BF
**  U+10000..U+3FFFF     F0        90..BF    80..BF    80..BF
**  U+40000..U+FFFFF     F1..F3    80..BF    80..BF    80..BF
**  U+100000..U+10FFFF   F4        80..8F    80..BF    80..BF
*/
/*
** based on
** https://github.com/lemire/fastvalidate-utf-8.git
** which is distributed under an MIT license (besides others).
*/
/*
**  UTF8_VALID -- check whether a byte buffer is well-formed UTF-8
**
**	Every multi-byte sequence must be complete, use the shortest
**	possible encoding, consist of a legal lead byte followed by
**	continuation bytes in the range 0x80..0xBF, and must not encode
**	a surrogate (U+D800..U+DFFF) or a value above U+10FFFF.
**
**	Parameters:
**		b -- buffer to check; it is not treated as a C string,
**			i.e., NUL bytes are handled like any other
**			ASCII byte.
**		length -- number of bytes in b.
**
**	Returns:
**		true iff b[0..length-1] is valid UTF-8
**		(an empty buffer is valid).
*/

/* continuation (trailing) byte: 10xxxxxx, i.e., 0x80..0xBF */
#define UTF8_IS_CONT(c)	(((c) & 0xC0) == 0x80)

bool
utf8_valid(b, length)
	const char *b;
	size_t length;
{
	const unsigned char *bytes;
	size_t index;

	bytes = (const unsigned char *) b;
	index = 0;

	/* invariant: index <= length, hence length - index cannot wrap */
	while (index < length)
	{
		unsigned char byte1, byte2;

		byte1 = bytes[index++];
		if (byte1 < 0x80)
			continue;	/* ASCII */

		if (byte1 < 0xE0)
		{
			/*
			**  Two-byte form.
			**  0x80..0xBF are stray continuation bytes,
			**  0xC0 and 0xC1 would be overlong encodings.
			*/

			if (byte1 < 0xC2 || length - index < 1)
				return false;
			if (!UTF8_IS_CONT(bytes[index++]))
				return false;
		}
		else if (byte1 < 0xF0)
		{
			/* Three-byte form. */
			if (length - index < 2)
				return false;
			byte2 = bytes[index++];
			if (!UTF8_IS_CONT(byte2)
			    /* Overlong? Must encode at least U+0800. */
			    || (byte1 == 0xE0 && byte2 < 0xA0)
			    /* Illegal surrogate code points U+D800..U+DFFF. */
			    || (byte1 == 0xED && byte2 >= 0xA0)
			    /* Third byte trailing-byte test. */
			    || !UTF8_IS_CONT(bytes[index++]))
				return false;
		}
		else
		{
			/* Four-byte form. */
			if (length - index < 3)
				return false;
			byte2 = bytes[index++];

			/*
			**  Check that 1 <= plane <= 16 with plain
			**  comparisons: shifting the promoted lead byte
			**  left by 28 bits would overflow a signed int.
			*/

			if (byte1 > 0xF4
			    || !UTF8_IS_CONT(byte2)
			    /* Overlong? Must encode at least U+10000. */
			    || (byte1 == 0xF0 && byte2 < 0x90)
			    /* Beyond U+10FFFF? */
			    || (byte1 == 0xF4 && byte2 > 0x8F)
			    /* Third byte trailing-byte test. */
			    || !UTF8_IS_CONT(bytes[index++])
			    /* Fourth byte trailing-byte test. */
			    || !UTF8_IS_CONT(bytes[index++]))
				return false;
		}
	}
	return true;
}
#endif /* USE_EAI */
|