1 #ifndef NU_UTF8_INTERNAL_H
2 #define NU_UTF8_INTERNAL_H
3 
4 #include <sys/types.h>
5 
/* Length in bytes of the UTF-8 sequence introduced by lead byte c.
 *
 * Returns 1 for 0xxxxxxx, 2 for 110xxxxx, 3 for 1110xxxx and 4 for
 * 11110xxx. Every other byte (a 10xxxxxx continuation byte, or 0xF8
 * and above) yields 0. */
static inline
unsigned utf8_char_length(const char c) {
	/* compare as unsigned so that bytes >= 0x80 are not seen as
	 * negative values where plain char is signed */
	const unsigned char lead = (unsigned char)c;

	if (lead < 0x80) {
		return 1; /* 0xxxxxxx */
	}
	if (lead < 0xC0) {
		return 0; /* 10xxxxxx: not a lead byte */
	}
	if (lead < 0xE0) {
		return 2; /* 110xxxxx */
	}
	if (lead < 0xF0) {
		return 3; /* 1110xxxx */
	}
	if (lead < 0xF8) {
		return 4; /* 11110xxx */
	}

	return 0; /* 11111xxx: undefined */
}
17 
/* Decode the two-byte UTF-8 sequence at p into *codepoint.
 *
 * UTF-8:   110aaaaa 10bbbbbb
 * result:  00000aaa aabbbbbb
 *
 * No validation is performed: the prefix bits of both bytes are simply
 * masked away. Exactly two bytes are read from p. */
static inline
void utf8_2b(const char *p, uint32_t *codepoint) {
	const unsigned char *bytes = (const unsigned char *)p;

	const uint32_t lead_bits = bytes[0] & 0x1F;  /* 5 payload bits */
	const uint32_t trail_bits = bytes[1] & 0x3F; /* 6 payload bits */

	*codepoint = (lead_bits << 6) | trail_bits;
}
32 
/* Decode the three-byte UTF-8 sequence at p into *codepoint.
 *
 * UTF-8:   1110aaaa 10bbbbbb 10cccccc
 * result:  aaaabbbb bbcccccc
 *
 * No validation is performed: the prefix bits of every byte are simply
 * masked away. Exactly three bytes are read from p. */
static inline
void utf8_3b(const char *p, uint32_t *codepoint) {
	const unsigned char *bytes = (const unsigned char *)p;

	*codepoint = ((uint32_t)(bytes[0] & 0x0F) << 12)
		| ((uint32_t)(bytes[1] & 0x3F) << 6)
		| (uint32_t)(bytes[2] & 0x3F);
}
49 
/* Decode the four-byte UTF-8 sequence at p into *codepoint.
 *
 * UTF-8:   11110aaa 10bbbbbb 10cccccc 10dddddd
 * result:  000aaabb bbbbcccc ccdddddd
 *
 * No validation is performed: the prefix bits of every byte are simply
 * masked away. Exactly four bytes are read from p. */
static inline
void utf8_4b(const char *p, uint32_t *codepoint) {
	const unsigned char *bytes = (const unsigned char *)p;

	/* start with the 3 payload bits of the lead byte, then append the
	 * 6 payload bits of each continuation byte in turn */
	uint32_t acc = bytes[0] & 0x07;

	acc = (acc << 6) | (bytes[1] & 0x3F);
	acc = (acc << 6) | (bytes[2] & 0x3F);
	acc = (acc << 6) | (bytes[3] & 0x3F);

	*codepoint = acc;
}
70 
/* Number of bytes used to encode codepoint in UTF-8.
 *
 * Returns 1 below 0x80, 2 below 0x800, 3 below 0x10000 and 4 for
 * everything else; no upper bound is enforced on codepoint. */
static inline
unsigned utf8_codepoint_length(uint32_t codepoint) {
	unsigned length = 4; /* de facto max length in UTF-8 */

	if (codepoint <= 0xFFFF) {
		length = 3;
	}
	if (codepoint <= 0x07FF) {
		length = 2;
	}
	if (codepoint <= 0x007F) {
		length = 1;
	}

	return length;
}
79 
/* Encode codepoint as a two-byte UTF-8 sequence written to p[0..1].
 *
 * UNICODE: 00000aaa aabbbbbb
 * UTF-8:   110aaaaa 10bbbbbb
 *
 * The lead byte is built from codepoint >> 6 without masking it down
 * to five bits, so the result is only a well-formed two-byte sequence
 * when codepoint is below 0x800; larger values spill into the 110
 * prefix. No range check is done here. */
static inline
void b2_utf8(uint32_t codepoint, char *p) {
	unsigned char *out = (unsigned char *)p;

	/* only the low 8 bits survive the store into an unsigned char */
	out[0] = (unsigned char)(0xC0 | ((codepoint >> 6) & 0xFF));
	out[1] = (unsigned char)(0x80 | (codepoint & 0x3F));
}
95 
/* Encode codepoint as a three-byte UTF-8 sequence written to p[0..2].
 *
 * UNICODE: aaaabbbb bbcccccc
 * UTF-8:   1110aaaa 10bbbbbb 10cccccc
 *
 * Only the low 16 bits of codepoint are used; higher bits are ignored.
 * No range check is done here. */
static inline
void b3_utf8(uint32_t codepoint, char *p) {
	unsigned char *out = (unsigned char *)p;

	const uint32_t high = (codepoint >> 12) & 0x0F; /* bits 12..15 */
	const uint32_t mid = (codepoint >> 6) & 0x3F;   /* bits 6..11 */
	const uint32_t low = codepoint & 0x3F;          /* bits 0..5 */

	out[0] = (unsigned char)(0xE0 | high);
	out[1] = (unsigned char)(0x80 | mid);
	out[2] = (unsigned char)(0x80 | low);
}
114 
/* Encode codepoint as a four-byte UTF-8 sequence written to p[0..3].
 *
 * UNICODE: 000aaabb bbbbcccc ccdddddd
 * UTF-8:   11110aaa 10bbbbbb 10cccccc 10dddddd
 *
 *   1st octet: bits 18..20 of the codepoint
 *   2nd octet: bits 12..17
 *   3rd octet: bits  6..11
 *   4th octet: bits  0..5
 *
 * Only the low 21 bits of codepoint are used; higher bits are ignored.
 * No range check is done here. */
static inline
void b4_utf8(uint32_t codepoint, char *p) {
	unsigned char *up = (unsigned char *)(p);

	/* Every continuation octet carries exactly six payload bits. The
	 * masks must not overlap or leave gaps: bit 12 belongs to the 2nd
	 * octet only, and must never reach the 3rd octet where it would
	 * overwrite the 10xxxxxx marker. */
	*(up) = (unsigned char)(0xF0 | ((codepoint >> 18) & 0x07));
	*(up + 1) = (unsigned char)(0x80 | ((codepoint >> 12) & 0x3F));
	*(up + 2) = (unsigned char)(0x80 | ((codepoint >> 6) & 0x3F));
	*(up + 3) = (unsigned char)(0x80 | (codepoint & 0x3F));
}
136 
/* Structural check of the UTF-8 sequence starting at p.
 *
 * p       - start of the sequence; not dereferenced when max_len is 0
 * max_len - number of bytes readable at p
 *
 * Returns the length of the sequence (1..4) when the lead byte is one
 * of 0xxxxxxx, 110xxxxx, 1110xxxx or 11110xxx, the whole sequence fits
 * into max_len bytes and every following byte is a 10xxxxxx
 * continuation byte. Returns 0 otherwise.
 *
 * Only the bit patterns are inspected; the decoded value itself is not
 * examined. */
static inline
int utf8_validread_basic(const char *p, size_t max_len) {
	const unsigned char *up = (const unsigned char *)(p);
	unsigned len;
	unsigned i;

	/* nothing is readable: do not touch *p at all */
	if (max_len == 0) {
		return 0;
	}

	/* sequences longer than 4 bytes are not supported:
	 * utf8_char_length() reports 0 for such lead bytes */
	len = utf8_char_length(*p);

	if (len == 0 || max_len < len) {
		return 0;
	}

	/* every byte after the lead byte must look like 10xxxxxx */
	for (i = 1; i < len; ++i) {
		if ((up[i] & 0xC0) != 0x80) {
			return 0;
		}
	}

	return (int)len;
}
167 
168 #endif /* NU_UTF8_INTERNAL_H */
169