1 #include <libnu/utf8.h>
2 
3 #ifdef NU_WITH_UTF8_READER
4 #ifdef NU_WITH_VALIDATION
5 
/*
 * Validate the UTF-8 sequence found at the start of `encoded`.
 *
 * The length and the generic lead/trail byte checks are delegated to
 * utf8_validread_basic(); on top of its result this function rejects
 * the lead/second byte combinations which are not listed in
 * Unicode core spec, D92, Table 3-7 (Well-Formed UTF-8 Byte Sequences):
 *
 *   C0..C1 xx        overlong 2-byte forms
 *   E0 80..9F xx     overlong 3-byte forms
 *   ED A0..BF xx     UTF-16 surrogates (U+D800..U+DFFF)
 *   F0 80..8F xx xx  overlong 4-byte forms
 *   F4 90..BF xx xx  codepoints above U+10FFFF
 *   F5 and above     codepoints above U+10FFFF
 *
 * encoded: pointer to the sequence to validate
 * max_len: forwarded unchanged to utf8_validread_basic()
 *          (presumably the number of readable bytes -- confirm there)
 *
 * Returns the length reported by utf8_validread_basic() when the
 * sequence passes all checks, 0 otherwise.
 */
int nu_utf8_validread(const char *encoded, size_t max_len) {
	int len = utf8_validread_basic(encoded, max_len);

	if (len <= 0) {
		return 0;
	}

	switch (len) {
	/* case 1: single byte sequence can't be > 0x7F and produce len == 1
	 */

	case 2: {
		uint8_t p1 = *(const unsigned char *)(encoded);

		if (p1 < 0xC2) { /* 2-byte sequences with p1 > 0xDF are 3-byte sequences */
			return 0;
		}

		/* the rest will be handled by utf8_validread_basic() */

		break;
	}

	case 3: {
		uint8_t p1 = *(const unsigned char *)(encoded);

		/* 3-byte sequences with p1 < 0xE0 are 2-byte sequences,
		 * 3-byte sequences with p1 > 0xEF are 4-byte sequences */

		uint8_t p2 = *(const unsigned char *)(encoded + 1);

		if (p1 == 0xE0 && p2 < 0xA0) { /* overlong */
			return 0;
		}
		else if (p1 == 0xED && p2 > 0x9F) { /* surrogates */
			return 0;
		}

		/* (p2 < 0x80 || p2 > 0xBF) and p3 will be covered
		 * by utf8_validread_basic() */

		break;
	}

	case 4: {
		uint8_t p1 = *(const unsigned char *)(encoded);

		if (p1 > 0xF4) { /* 4-byte sequence with p1 < 0xF0 are 3-byte sequences */
			return 0;
		}

		uint8_t p2 = *(const unsigned char *)(encoded + 1);

		if (p1 == 0xF0 && p2 < 0x90) { /* overlong */
			return 0;
		}
		else if (p1 == 0xF4 && p2 > 0x8F) { /* above U+10FFFF */
			return 0;
		}

		/* (p2 < 0x80 || p2 > 0xBF) and the rest (p3, p4)
		 * will be covered by utf8_validread_basic() */

		break;
	}

	default:
		/* no additional restrictions for other lengths */
		break;

	} /* switch */

	return len;
}
76 
77 #endif /* NU_WITH_VALIDATION */
78 #endif /* NU_WITH_UTF8_READER */
79 
80 #ifdef NU_WITH_UTF8_WRITER
81 
/*
 * Write `unicode` at `utf8` and return the position following it.
 *
 * The number of bytes is taken from utf8_codepoint_length(); one byte
 * is stored directly, longer forms are emitted by the b2_utf8(),
 * b3_utf8() and b4_utf8() helpers.
 *
 * unicode: codepoint to write
 * utf8:    destination; when it is 0 nothing is written
 *
 * Returns `utf8` advanced by the byte count, whether or not anything
 * was written.
 */
char* nu_utf8_write(uint32_t unicode, char *utf8) {
	const unsigned nbytes = utf8_codepoint_length(unicode);

	if (utf8) {
		if (nbytes == 1) {
			*utf8 = (char)(unicode);
		}
		else if (nbytes == 2) {
			b2_utf8(unicode, utf8);
		}
		else if (nbytes == 3) {
			b3_utf8(unicode, utf8);
		}
		else { /* every other length is written as 4 bytes */
			b4_utf8(unicode, utf8);
		}
	}

	return utf8 + nbytes;
}
96 
97 #endif /* NU_WITH_UTF8_WRITER */
98