summaryrefslogtreecommitdiff
path: root/contrib/sendmail/libsm/utf8_valid.c
blob: 3181eca907b9e362c2de6704b308e6fe4a15b7d4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/*
 * Copyright (c) 2020 Proofpoint, Inc. and its suppliers.
 *	All rights reserved.
 *
 * By using this file, you agree to the terms and conditions set
 * forth in the LICENSE file which can be found at the top level of
 * the sendmail distribution.
 *
 */

#include <sm/gen.h>
#include <sm/sendmail.h>
#include <sm/ixlen.h>

#if USE_EAI

/*
**  legal utf-8 byte sequence
**  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
**
**   Code Points        1st       2nd      3rd      4th
**  U+0000..U+007F     00..7F
**  U+0080..U+07FF     C2..DF   80..BF
**  U+0800..U+0FFF     E0       A0..BF   80..BF
**  U+1000..U+CFFF     E1..EC   80..BF   80..BF
**  U+D000..U+D7FF     ED       80..9F   80..BF
**  U+E000..U+FFFF     EE..EF   80..BF   80..BF
**  U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
**  U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
**  U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
*/

/*
**  based on
**  https://github.com/lemire/fastvalidate-utf-8.git
**  which is distributed under an MIT license (besides others).
*/

/* true iff c is a UTF-8 trailing (continuation) byte, i.e., in 80..BF */
#define UTF8_IS_CONT(c)	(((c) & 0xC0) == 0x80)

/*
**  UTF8_VALID -- check whether a byte buffer is well-formed UTF-8
**
**	Every multi-byte sequence must have the shortest possible form
**	(no overlong encodings), must not encode a surrogate
**	(U+D800..U+DFFF), must not exceed U+10FFFF, and every trailing
**	byte must be in the range 80..BF.
**
**	Parameters:
**		b -- buffer to check (does not need to be NUL terminated;
**			embedded NUL bytes are treated as plain ASCII).
**		length -- number of bytes in b to examine.
**
**	Returns:
**		true iff the first length bytes of b are well-formed UTF-8
**		(an empty buffer is well-formed).
*/

bool
utf8_valid(b, length)
	const char *b;
	size_t length;
{
	const unsigned char *bytes;
	size_t index;

	/* use unsigned bytes so that range comparisons are portable */
	bytes = (const unsigned char *)b;
	index = 0;
	while (true)
	{
		unsigned char byte1, byte2;

		do { /* fast ASCII Path */
			if (index >= length)
				return true;
			byte1 = bytes[index++];
		} while (byte1 < 0x80);

		/* here: index <= length, so length - index cannot wrap */
		if (byte1 < 0xE0)
		{
			/* Two-byte form. */
			if (length - index < 1)
				return false;

			/*
			**  80..BF as lead byte is a stray trailing byte,
			**  C0..C1 would start an overlong encoding.
			*/

			if (byte1 < 0xC2 || !UTF8_IS_CONT(bytes[index]))
				return false;
			index++;
		}
		else if (byte1 < 0xF0)
		{
			/* Three-byte form. */
			if (length - index < 2)
				return false;
			byte2 = bytes[index];
			if (!UTF8_IS_CONT(byte2)
			    /* Overlong? 5 most significant bits must not all be zero. */
			    || (byte1 == 0xE0 && byte2 < 0xA0)
			    /* Check for illegal surrogate codepoints. */
			    || (byte1 == 0xED && byte2 > 0x9F)
			    /* Third byte trailing-byte test. */
			    || !UTF8_IS_CONT(bytes[index + 1]))
				return false;
			index += 2;
		}
		else
		{
			/* Four-byte form. */
			if (length - index < 3)
				return false;
			byte2 = bytes[index];

			/*
			**  Check that 1 <= plane <= 16 using plain
			**  comparisons: shifting the promoted lead byte
			**  left by 28 bits would overflow a signed int.
			*/

			if (byte1 > 0xF4
			    || !UTF8_IS_CONT(byte2)
			    /* Overlong: would encode less than U+10000 */
			    || (byte1 == 0xF0 && byte2 < 0x90)
			    /* Too large: would encode more than U+10FFFF */
			    || (byte1 == 0xF4 && byte2 > 0x8F)
			    /* Third byte trailing-byte test */
			    || !UTF8_IS_CONT(bytes[index + 1])
			    /* Fourth byte trailing-byte test */
			    || !UTF8_IS_CONT(bytes[index + 2]))
				return false;
			index += 3;
		}
	}
	/* NOTREACHED */
	return false;
}
#endif /* USE_EAI */