1 #ifndef NU_UTF8_H
2 #define NU_UTF8_H
3 
4 #include <stdint.h>
5 #include <sys/types.h>
6 
7 #include <libnu/config.h>
8 #include <libnu/defines.h>
9 #include <libnu/utf8_internal.h>
10 
11 /** @defgroup utf8 UTF-8 support
12  *
 * Note: there is no equivalent of utf8_string[i] because access by
 * codepoint index would be slow; iterate with nu_utf8_read() and
 * nu_utf8_revread() instead.
15  *
16  * @example utf8.c
17  * @example revread.c
18  */
19 
20 #if defined (__cplusplus) || defined (c_plusplus)
21 extern "C" {
22 #endif
23 
24 #ifdef NU_WITH_UTF8_READER
25 
/** Read one codepoint from a UTF-8 string
 *
 * The length of the sequence is chosen from the lead byte alone:
 * below 0x80 is one byte, below 0xE0 two bytes, below 0xF0 three
 * bytes, anything else four bytes. Nothing is validated here, so a
 * stray continuation byte (0x80..0xBF) takes the two-byte path and
 * the caller must make sure the whole sequence is readable.
 *
 * @ingroup utf8
 * @param utf8 pointer to UTF-8 encoded string
 * @param unicode output unicode codepoint, or 0 to only advance
 * without decoding
 * @return pointer to next codepoint in UTF-8 string
 */
static inline
const char* nu_utf8_read(const char *utf8, uint32_t *unicode) {
	/* read the lead byte as unsigned so that bytes >= 0x80 do not
	 * turn negative where plain char is signed */
	const uint32_t lead = *(const unsigned char *)(utf8);

	if (lead < 0x80) {
		/* single byte: the codepoint is the byte itself */
		if (unicode != 0) {
			*unicode = lead;
		}
		return utf8 + 1;
	}

	/* multibyte: decoding is delegated to the utf8_Nb() helpers */
	if (lead < 0xE0) {
		if (unicode != 0) {
			utf8_2b(utf8, unicode);
		}
		return utf8 + 2;
	}

	if (lead < 0xF0) {
		if (unicode != 0) {
			utf8_3b(utf8, unicode);
		}
		return utf8 + 3;
	}

	if (unicode != 0) {
		utf8_4b(utf8, unicode);
	}
	return utf8 + 4;
}
63 
64 #ifdef NU_WITH_REVERSE_READ
65 
/** Read codepoint from UTF-8 string in backward direction
 *
 * The scan always steps back at least one byte from utf8 and keeps
 * going while it sees continuation bytes. It has no notion of where
 * the buffer starts, so it is the caller's responsibility to make
 * sure there is a complete codepoint before utf8. A call such as
 * nu_utf8_revread(&u, "hello") reads before the first byte of the
 * string and results in undefined behavior.
 *
 * @ingroup utf8
 * @param unicode output unicode codepoint, or 0 to only step back
 * without decoding
 * @param utf8 pointer to UTF-8 encoded string
 * @return pointer to previous codepoint in UTF-8 string
 */
static inline
const char* nu_utf8_revread(uint32_t *unicode, const char *utf8) {
	const char *p = utf8;

	/* in valid UTF-8 every byte is either a continuation byte
	 * (10xxxxxx) or the beginning of a sequence: step back once,
	 * then over every continuation byte */
	do {
		--p;
	} while (((unsigned char)(*p) & 0xC0) == 0x80);

	if (unicode != 0) {
		nu_utf8_read(p, unicode);
	}

	return p;
}
93 
94 #endif /* NU_WITH_REVERSE_READ */
95 
96 #ifdef NU_WITH_VALIDATION
97 
98 /** Validate codepoint in string
99  *
100  * @ingroup utf8
101  * @param encoded buffer with encoded string
102  * @param max_len buffer length
103  * @return codepoint length or 0 on error
104  */
105 NU_EXPORT
106 int nu_utf8_validread(const char *encoded, size_t max_len);
107 
108 #endif /* NU_WITH_VALIDATION */
109 #endif /* NU_WITH_UTF8_READER */
110 
111 #ifdef NU_WITH_UTF8_WRITER
112 
113 /** Write unicode codepoints into UTF-8 encoded string
114  *
115  * @ingroup utf8
116  * @param unicode unicode codepoint
117  * @param utf8 pointer to buffer to write UTF-8 encoded text to,
118  * should be large enough to hold encoded value
119  * @return pointer to byte after last written
120  */
121 NU_EXPORT
122 char* nu_utf8_write(uint32_t unicode, char *utf8);
123 
124 #endif /* NU_WITH_UTF8_WRITER */
125 
126 #if defined (__cplusplus) || defined (c_plusplus)
127 }
128 #endif
129 
130 #endif /* NU_UTF8_H */
131