1 #ifndef NU_UTF8_H
2 #define NU_UTF8_H
3
4 #include <stdint.h>
5 #include <sys/types.h>
6
7 #include <libnu/config.h>
8 #include <libnu/defines.h>
9 #include <libnu/utf8_internal.h>
10
11 /** @defgroup utf8 UTF-8 support
12 *
13 * Note: there is no utf8_string[i] equivalent because it would be slow;
14 * use nu_utf8_read() and nu_utf8_revread() to iterate instead
15 *
16 * @example utf8.c
17 * @example revread.c
18 */
19
20 #if defined (__cplusplus) || defined (c_plusplus)
21 extern "C" {
22 #endif
23
24 #ifdef NU_WITH_UTF8_READER
25
/** Decode one codepoint from a UTF-8 string and step past it
 *
 * The sequence length is chosen from the first byte alone: values below
 * 0x80 are one byte long, below 0xE0 two bytes, below 0xF0 three bytes and
 * everything else four bytes. Nothing is validated here - continuation
 * bytes are not inspected by this function and no end-of-string check is
 * made, so the input is expected to be well-formed UTF-8.
 *
 * @ingroup utf8
 * @param utf8 pointer to UTF-8 encoded string
 * @param unicode where to store the decoded codepoint, or 0 to only
 * advance over the sequence without decoding it
 * @return pointer to next codepoint in UTF-8 string
 */
static inline
const char* nu_utf8_read(const char *utf8, uint32_t *unicode) {
	const uint32_t lead = *(const unsigned char *)utf8;

	/* single-byte sequence: the byte is the codepoint */
	if (lead < 0x80) {
		if (unicode) {
			*unicode = lead;
		}
		return utf8 + 1;
	}

	/* two-byte sequence */
	if (lead < 0xE0) {
		if (unicode) {
			utf8_2b(utf8, unicode);
		}
		return utf8 + 2;
	}

	/* three-byte sequence */
	if (lead < 0xF0) {
		if (unicode) {
			utf8_3b(utf8, unicode);
		}
		return utf8 + 3;
	}

	/* anything else is handled as a four-byte sequence */
	if (unicode) {
		utf8_4b(utf8, unicode);
	}
	return utf8 + 4;
}
63
64 #ifdef NU_WITH_REVERSE_READ
65
/** Decode the codepoint that ends right before the given position
 *
 * Steps back one byte, then keeps stepping back while the byte under the
 * cursor is a continuation byte (10xxxxxx), and finally decodes from the
 * position it stopped at.
 *
 * The caller is responsible for making sure there is a codepoint before
 * utf8: no lower bound is checked. Calling it on the very beginning of
 * a string, e.g. nu_utf8_revread(&u, "hello"), reads memory in front of
 * the string and results in undefined behavior.
 *
 * @ingroup utf8
 * @param unicode where to store the decoded codepoint, or 0 to only
 * move backwards without decoding
 * @param utf8 pointer to UTF-8 encoded string
 * @return pointer to previous codepoint in UTF-8 string
 */
static inline
const char* nu_utf8_revread(uint32_t *unicode, const char *utf8) {
	const char *cursor = utf8;

	/* always move back at least one byte; continue for as long as
	 * the cursor rests on a continuation byte (10xxxxxx) */
	do {
		--cursor;
	} while (((unsigned char)*cursor & 0xC0) == 0x80);

	if (unicode) {
		nu_utf8_read(cursor, unicode);
	}

	return cursor;
}
93
94 #endif /* NU_WITH_REVERSE_READ */
95
96 #ifdef NU_WITH_VALIDATION
97
98 /** Validate a single encoded codepoint in a string
99 *
100 * @ingroup utf8
101 * @param encoded buffer with encoded string
102 * @param max_len buffer length (presumably in bytes - TODO confirm against implementation)
103 * @return codepoint length (presumably its encoded size in bytes - TODO confirm) or 0 on error
104 */
105 NU_EXPORT
106 int nu_utf8_validread(const char *encoded, size_t max_len);
107
108 #endif /* NU_WITH_VALIDATION */
109 #endif /* NU_WITH_UTF8_READER */
110
111 #ifdef NU_WITH_UTF8_WRITER
112
113 /** Write a unicode codepoint into a UTF-8 encoded string
114 *
115 * @ingroup utf8
116 * @param unicode unicode codepoint to encode
117 * @param utf8 pointer to buffer to write UTF-8 encoded text to,
118 * the buffer should be large enough to hold the encoded value
119 * @return pointer to byte after last written
120 */
121 NU_EXPORT
122 char* nu_utf8_write(uint32_t unicode, char *utf8);
123
124 #endif /* NU_WITH_UTF8_WRITER */
125
126 #if defined (__cplusplus) || defined (c_plusplus)
127 }
128 #endif
129
130 #endif /* NU_UTF8_H */
131