1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-only
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun * Copyright (c) 2014 SGI.
4*4882a593Smuzhiyun * All rights reserved.
5*4882a593Smuzhiyun */
6*4882a593Smuzhiyun
7*4882a593Smuzhiyun #include "utf8n.h"
8*4882a593Smuzhiyun
/*
 * Selects one normalization table and the newest unicode age it honours.
 */
struct utf8data {
	/* Newest age accepted; compared against utf8agetab[] entries. */
	unsigned int maxage;
	/* Added to the utf8data[] base to find the root of the trie. */
	unsigned int offset;
};
13*4882a593Smuzhiyun
14*4882a593Smuzhiyun #define __INCLUDED_FROM_UTF8NORM_C__
15*4882a593Smuzhiyun #include "utf8data.h"
16*4882a593Smuzhiyun #undef __INCLUDED_FROM_UTF8NORM_C__
17*4882a593Smuzhiyun
/*
 * Report whether the unicode version maj.min.rev has an entry in
 * utf8agetab[].
 *
 * The table is walked from its last entry downwards; a zero entry ends
 * the walk.
 *
 * Returns 1 if the version is found, 0 otherwise.
 */
int utf8version_is_supported(u8 maj, u8 min, u8 rev)
{
	const unsigned int wanted = UNICODE_AGE(maj, min, rev);
	int idx;

	for (idx = ARRAY_SIZE(utf8agetab) - 1; idx >= 0; idx--) {
		/* A zero entry terminates the search. */
		if (utf8agetab[idx] == 0)
			break;
		if (utf8agetab[idx] == wanted)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL(utf8version_is_supported);
31*4882a593Smuzhiyun
utf8version_latest(void)32*4882a593Smuzhiyun int utf8version_latest(void)
33*4882a593Smuzhiyun {
34*4882a593Smuzhiyun return utf8vers;
35*4882a593Smuzhiyun }
36*4882a593Smuzhiyun EXPORT_SYMBOL(utf8version_latest);
37*4882a593Smuzhiyun
38*4882a593Smuzhiyun /*
39*4882a593Smuzhiyun * UTF-8 valid ranges.
40*4882a593Smuzhiyun *
41*4882a593Smuzhiyun * The UTF-8 encoding spreads the bits of a 32bit word over several
42*4882a593Smuzhiyun * bytes. This table gives the ranges that can be held and how they'd
43*4882a593Smuzhiyun * be represented.
44*4882a593Smuzhiyun *
45*4882a593Smuzhiyun * 0x00000000 0x0000007F: 0xxxxxxx
46*4882a593Smuzhiyun * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
47*4882a593Smuzhiyun * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
48*4882a593Smuzhiyun * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
49*4882a593Smuzhiyun * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
50*4882a593Smuzhiyun * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
51*4882a593Smuzhiyun *
52*4882a593Smuzhiyun * There is an additional requirement on UTF-8, in that only the
53*4882a593Smuzhiyun * shortest representation of a 32bit value is to be used. A decoder
54*4882a593Smuzhiyun * must not decode sequences that do not satisfy this requirement.
55*4882a593Smuzhiyun * Thus the allowed ranges have a lower bound.
56*4882a593Smuzhiyun *
57*4882a593Smuzhiyun * 0x00000000 0x0000007F: 0xxxxxxx
58*4882a593Smuzhiyun * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
59*4882a593Smuzhiyun * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
60*4882a593Smuzhiyun * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
61*4882a593Smuzhiyun * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
62*4882a593Smuzhiyun * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
63*4882a593Smuzhiyun *
64*4882a593Smuzhiyun * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
65*4882a593Smuzhiyun * 17 planes of 65536 values. This limits the sequences actually seen
66*4882a593Smuzhiyun * even more, to just the following.
67*4882a593Smuzhiyun *
68*4882a593Smuzhiyun * 0 - 0x7F: 0 - 0x7F
69*4882a593Smuzhiyun * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF
70*4882a593Smuzhiyun * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF
71*4882a593Smuzhiyun * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
72*4882a593Smuzhiyun *
73*4882a593Smuzhiyun * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
74*4882a593Smuzhiyun *
75*4882a593Smuzhiyun * Note that the longest sequence seen with valid usage is 4 bytes,
76*4882a593Smuzhiyun * the same as a single UTF-32 character. This makes the UTF-8
77*4882a593Smuzhiyun * representation of Unicode strictly smaller than UTF-32.
78*4882a593Smuzhiyun *
79*4882a593Smuzhiyun * The shortest sequence requirement was introduced by:
80*4882a593Smuzhiyun * Corrigendum #1: UTF-8 Shortest Form
81*4882a593Smuzhiyun * It can be found here:
82*4882a593Smuzhiyun * http://www.unicode.org/versions/corrigendum1.html
83*4882a593Smuzhiyun *
84*4882a593Smuzhiyun */
85*4882a593Smuzhiyun
/*
 * Number of bytes in the UTF-8 sequence that starts at s.
 *
 * Only the lead byte is inspected, so s must point at the first byte
 * of a valid sequence. Lead bytes below 0xC0 give 1, and each of the
 * thresholds 0xC0, 0xE0 and 0xF0 adds one more byte.
 */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = *s;
	int nbytes = 1;

	if (lead >= 0xC0)
		nbytes++;
	if (lead >= 0xE0)
		nbytes++;
	if (lead >= 0xF0)
		nbytes++;

	return nbytes;
}
97*4882a593Smuzhiyun
/*
 * Decode the 3-byte UTF-8 sequence at str into its code point.
 *
 * The lead byte contributes its low 4 bits and each of the two
 * following bytes contributes its low 6 bits. No validation is done.
 */
static unsigned int
utf8decode3(const char *str)
{
	return ((unsigned int)(str[0] & 0x0F) << 12) |
	       ((unsigned int)(str[1] & 0x3F) << 6) |
	       (unsigned int)(str[2] & 0x3F);
}
114*4882a593Smuzhiyun
/*
 * Write val to str as a 3-byte UTF-8 sequence.
 *
 * str must have room for three bytes; no terminator is written.
 * Returns the number of bytes stored, which is always 3.
 */
static int
utf8encode3(char *str, unsigned int val)
{
	str[0] = (val >> 12) | 0xE0;
	str[1] = ((val >> 6) & 0x3F) | 0x80;
	str[2] = (val & 0x3F) | 0x80;

	return 3;
}
129*4882a593Smuzhiyun
130*4882a593Smuzhiyun /*
131*4882a593Smuzhiyun * utf8trie_t
132*4882a593Smuzhiyun *
133*4882a593Smuzhiyun * A compact binary tree, used to decode UTF-8 characters.
134*4882a593Smuzhiyun *
135*4882a593Smuzhiyun * Internal nodes are one byte for the node itself, and up to three
136*4882a593Smuzhiyun * bytes for an offset into the tree. The first byte contains the
137*4882a593Smuzhiyun * following information:
138*4882a593Smuzhiyun * NEXTBYTE - flag - advance to next byte if set
139*4882a593Smuzhiyun * BITNUM - 3 bit field - the bit number to be tested
140*4882a593Smuzhiyun * OFFLEN - 2 bit field - number of bytes in the offset
141*4882a593Smuzhiyun * if offlen == 0 (non-branching node)
142*4882a593Smuzhiyun * RIGHTPATH - 1 bit field - set if the following node is for the
143*4882a593Smuzhiyun * right-hand path (tested bit is set)
144*4882a593Smuzhiyun * TRIENODE - 1 bit field - set if the following node is an internal
145*4882a593Smuzhiyun * node, otherwise it is a leaf node
146*4882a593Smuzhiyun * if offlen != 0 (branching node)
147*4882a593Smuzhiyun * LEFTNODE - 1 bit field - set if the left-hand node is internal
148*4882a593Smuzhiyun * RIGHTNODE - 1 bit field - set if the right-hand node is internal
149*4882a593Smuzhiyun *
150*4882a593Smuzhiyun * Due to the way utf8 works, there cannot be branching nodes with
151*4882a593Smuzhiyun * NEXTBYTE set, and moreover those nodes always have a righthand
152*4882a593Smuzhiyun * descendant.
153*4882a593Smuzhiyun */
typedef const unsigned char utf8trie_t;

/* Fields present in the first byte of every internal node. */
#define BITNUM		0x07	/* bit of the input byte to test */
#define NEXTBYTE	0x08	/* advance to the next input byte first */
#define OFFLEN		0x30	/* number of offset bytes that follow */
#define OFFLEN_SHIFT	4

/* Meaning of the top two bits when OFFLEN is zero (non-branching). */
#define RIGHTPATH	0x40	/* the single child is on the right */
#define TRIENODE	0x80	/* the child is an internal node */

/* Meaning of the top two bits when OFFLEN is non-zero (branching). */
#define RIGHTNODE	0x40	/* right child is an internal node */
#define LEFTNODE	0x80	/* left child is an internal node */
163*4882a593Smuzhiyun
164*4882a593Smuzhiyun /*
165*4882a593Smuzhiyun * utf8leaf_t
166*4882a593Smuzhiyun *
167*4882a593Smuzhiyun * The leaves of the trie are embedded in the trie, and so the same
168*4882a593Smuzhiyun * underlying datatype: unsigned char.
169*4882a593Smuzhiyun *
170*4882a593Smuzhiyun * leaf[0]: The unicode version, stored as a generation number that is
171*4882a593Smuzhiyun * an index into utf8agetab[]. With this we can filter code
172*4882a593Smuzhiyun * points based on the unicode version in which they were
173*4882a593Smuzhiyun * defined. The CCC of a non-defined code point is 0.
174*4882a593Smuzhiyun * leaf[1]: Canonical Combining Class. During normalization, we need
175*4882a593Smuzhiyun * to do a stable sort into ascending order of all characters
176*4882a593Smuzhiyun * with a non-zero CCC that occur between two characters with
177*4882a593Smuzhiyun * a CCC of 0, or at the begin or end of a string.
178*4882a593Smuzhiyun * The unicode standard guarantees that all CCC values are
179*4882a593Smuzhiyun * between 0 and 254 inclusive, which leaves 255 available as
180*4882a593Smuzhiyun * a special value.
181*4882a593Smuzhiyun * Code points with CCC 0 are known as stoppers.
182*4882a593Smuzhiyun * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
183*4882a593Smuzhiyun * start of a NUL-terminated string that is the decomposition
184*4882a593Smuzhiyun * of the character.
185*4882a593Smuzhiyun * The CCC of a decomposable character is the same as the CCC
186*4882a593Smuzhiyun * of the first character of its decomposition.
187*4882a593Smuzhiyun * Some characters decompose as the empty string: these are
188*4882a593Smuzhiyun * characters with the Default_Ignorable_Code_Point property.
189*4882a593Smuzhiyun * These do affect normalization, as they all have CCC 0.
190*4882a593Smuzhiyun *
191*4882a593Smuzhiyun * The decompositions in the trie have been fully expanded, with the
192*4882a593Smuzhiyun * exception of Hangul syllables, which are decomposed algorithmically.
193*4882a593Smuzhiyun *
194*4882a593Smuzhiyun * Casefolding, if applicable, is also done using decompositions.
195*4882a593Smuzhiyun *
196*4882a593Smuzhiyun * The trie is constructed in such a way that leaves exist for all
197*4882a593Smuzhiyun * UTF-8 sequences that match the criteria from the "UTF-8 valid
198*4882a593Smuzhiyun * ranges" comment above, and only for those sequences. Therefore a
199*4882a593Smuzhiyun * lookup in the trie can be used to validate the UTF-8 input.
200*4882a593Smuzhiyun */
typedef const unsigned char utf8leaf_t;

/* Accessors for the three parts of a leaf. */
#define LEAF_GEN(LEAF)	((LEAF)[0])	/* index into utf8agetab[] */
#define LEAF_CCC(LEAF)	((LEAF)[1])	/* canonical combining class */
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))	/* decomposition */

/* Range of real CCC values, plus the two special values. */
#define MINCCC		(0)
#define MAXCCC		(254)
#define STOPPER		(0)	/* CCC 0 */
#define DECOMPOSE	(255)	/* leaf carries a decomposition string */

/* Marker for hangul syllable decomposition. */
#define HANGUL		((char)(255))
/*
 * Size of the synthesized leaf used for Hangul syllable decomposition:
 * two header bytes, up to three 3-byte sequences and a terminating NUL.
 */
#define UTF8HANGULLEAF	(12)
216*4882a593Smuzhiyun
217*4882a593Smuzhiyun /*
218*4882a593Smuzhiyun * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
219*4882a593Smuzhiyun *
220*4882a593Smuzhiyun * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
221*4882a593Smuzhiyun * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
222*4882a593Smuzhiyun *
223*4882a593Smuzhiyun * SBase = 0xAC00
224*4882a593Smuzhiyun * LBase = 0x1100
225*4882a593Smuzhiyun * VBase = 0x1161
226*4882a593Smuzhiyun * TBase = 0x11A7
227*4882a593Smuzhiyun * LCount = 19
228*4882a593Smuzhiyun * VCount = 21
229*4882a593Smuzhiyun * TCount = 28
230*4882a593Smuzhiyun * NCount = 588 (VCount * TCount)
231*4882a593Smuzhiyun * SCount = 11172 (LCount * NCount)
232*4882a593Smuzhiyun *
233*4882a593Smuzhiyun * Decomposition:
234*4882a593Smuzhiyun * SIndex = s - SBase
235*4882a593Smuzhiyun *
236*4882a593Smuzhiyun * LV (Canonical/Full)
237*4882a593Smuzhiyun * LIndex = SIndex / NCount
238*4882a593Smuzhiyun * VIndex = (Sindex % NCount) / TCount
239*4882a593Smuzhiyun * LPart = LBase + LIndex
240*4882a593Smuzhiyun * VPart = VBase + VIndex
241*4882a593Smuzhiyun *
242*4882a593Smuzhiyun * LVT (Canonical)
243*4882a593Smuzhiyun * LVIndex = (SIndex / TCount) * TCount
244*4882a593Smuzhiyun * TIndex = (Sindex % TCount)
245*4882a593Smuzhiyun * LVPart = SBase + LVIndex
246*4882a593Smuzhiyun * TPart = TBase + TIndex
247*4882a593Smuzhiyun *
248*4882a593Smuzhiyun * LVT (Full)
249*4882a593Smuzhiyun * LIndex = SIndex / NCount
250*4882a593Smuzhiyun * VIndex = (Sindex % NCount) / TCount
251*4882a593Smuzhiyun * TIndex = (Sindex % TCount)
252*4882a593Smuzhiyun * LPart = LBase + LIndex
253*4882a593Smuzhiyun * VPart = VBase + VIndex
254*4882a593Smuzhiyun * if (TIndex == 0) {
255*4882a593Smuzhiyun * d = <LPart, VPart>
256*4882a593Smuzhiyun * } else {
257*4882a593Smuzhiyun * TPart = TBase + TIndex
258*4882a593Smuzhiyun * d = <LPart, VPart, TPart>
259*4882a593Smuzhiyun * }
260*4882a593Smuzhiyun */
261*4882a593Smuzhiyun
/* Constants of the Hangul decomposition algorithm. */
#define SB	(0xAC00)	/* SBase: first precomposed syllable */
#define LB	(0x1100)	/* LBase */
#define VB	(0x1161)	/* VBase */
#define TB	(0x11A7)	/* TBase */
#define LC	(19)		/* LCount */
#define VC	(21)		/* VCount */
#define TC	(28)		/* TCount */
#define NC	(VC * TC)	/* NCount = 588 */
#define SC	(LC * NC)	/* SCount = 11172 */
272*4882a593Smuzhiyun
273*4882a593Smuzhiyun /* Algorithmic decomposition of hangul syllable. */
274*4882a593Smuzhiyun static utf8leaf_t *
utf8hangul(const char * str,unsigned char * hangul)275*4882a593Smuzhiyun utf8hangul(const char *str, unsigned char *hangul)
276*4882a593Smuzhiyun {
277*4882a593Smuzhiyun unsigned int si;
278*4882a593Smuzhiyun unsigned int li;
279*4882a593Smuzhiyun unsigned int vi;
280*4882a593Smuzhiyun unsigned int ti;
281*4882a593Smuzhiyun unsigned char *h;
282*4882a593Smuzhiyun
283*4882a593Smuzhiyun /* Calculate the SI, LI, VI, and TI values. */
284*4882a593Smuzhiyun si = utf8decode3(str) - SB;
285*4882a593Smuzhiyun li = si / NC;
286*4882a593Smuzhiyun vi = (si % NC) / TC;
287*4882a593Smuzhiyun ti = si % TC;
288*4882a593Smuzhiyun
289*4882a593Smuzhiyun /* Fill in base of leaf. */
290*4882a593Smuzhiyun h = hangul;
291*4882a593Smuzhiyun LEAF_GEN(h) = 2;
292*4882a593Smuzhiyun LEAF_CCC(h) = DECOMPOSE;
293*4882a593Smuzhiyun h += 2;
294*4882a593Smuzhiyun
295*4882a593Smuzhiyun /* Add LPart, a 3-byte UTF-8 sequence. */
296*4882a593Smuzhiyun h += utf8encode3((char *)h, li + LB);
297*4882a593Smuzhiyun
298*4882a593Smuzhiyun /* Add VPart, a 3-byte UTF-8 sequence. */
299*4882a593Smuzhiyun h += utf8encode3((char *)h, vi + VB);
300*4882a593Smuzhiyun
301*4882a593Smuzhiyun /* Add TPart if required, also a 3-byte UTF-8 sequence. */
302*4882a593Smuzhiyun if (ti)
303*4882a593Smuzhiyun h += utf8encode3((char *)h, ti + TB);
304*4882a593Smuzhiyun
305*4882a593Smuzhiyun /* Terminate string. */
306*4882a593Smuzhiyun h[0] = '\0';
307*4882a593Smuzhiyun
308*4882a593Smuzhiyun return hangul;
309*4882a593Smuzhiyun }
310*4882a593Smuzhiyun
311*4882a593Smuzhiyun /*
312*4882a593Smuzhiyun * Use trie to scan s, touching at most len bytes.
313*4882a593Smuzhiyun * Returns the leaf if one exists, NULL otherwise.
314*4882a593Smuzhiyun *
315*4882a593Smuzhiyun * A non-NULL return guarantees that the UTF-8 sequence starting at s
316*4882a593Smuzhiyun * is well-formed and corresponds to a known unicode code point. The
317*4882a593Smuzhiyun * shorthand for this will be "is valid UTF-8 unicode".
318*4882a593Smuzhiyun */
utf8nlookup(const struct utf8data * data,unsigned char * hangul,const char * s,size_t len)319*4882a593Smuzhiyun static utf8leaf_t *utf8nlookup(const struct utf8data *data,
320*4882a593Smuzhiyun unsigned char *hangul, const char *s, size_t len)
321*4882a593Smuzhiyun {
322*4882a593Smuzhiyun utf8trie_t *trie = NULL;
323*4882a593Smuzhiyun int offlen;
324*4882a593Smuzhiyun int offset;
325*4882a593Smuzhiyun int mask;
326*4882a593Smuzhiyun int node;
327*4882a593Smuzhiyun
328*4882a593Smuzhiyun if (!data)
329*4882a593Smuzhiyun return NULL;
330*4882a593Smuzhiyun if (len == 0)
331*4882a593Smuzhiyun return NULL;
332*4882a593Smuzhiyun
333*4882a593Smuzhiyun trie = utf8data + data->offset;
334*4882a593Smuzhiyun node = 1;
335*4882a593Smuzhiyun while (node) {
336*4882a593Smuzhiyun offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
337*4882a593Smuzhiyun if (*trie & NEXTBYTE) {
338*4882a593Smuzhiyun if (--len == 0)
339*4882a593Smuzhiyun return NULL;
340*4882a593Smuzhiyun s++;
341*4882a593Smuzhiyun }
342*4882a593Smuzhiyun mask = 1 << (*trie & BITNUM);
343*4882a593Smuzhiyun if (*s & mask) {
344*4882a593Smuzhiyun /* Right leg */
345*4882a593Smuzhiyun if (offlen) {
346*4882a593Smuzhiyun /* Right node at offset of trie */
347*4882a593Smuzhiyun node = (*trie & RIGHTNODE);
348*4882a593Smuzhiyun offset = trie[offlen];
349*4882a593Smuzhiyun while (--offlen) {
350*4882a593Smuzhiyun offset <<= 8;
351*4882a593Smuzhiyun offset |= trie[offlen];
352*4882a593Smuzhiyun }
353*4882a593Smuzhiyun trie += offset;
354*4882a593Smuzhiyun } else if (*trie & RIGHTPATH) {
355*4882a593Smuzhiyun /* Right node after this node */
356*4882a593Smuzhiyun node = (*trie & TRIENODE);
357*4882a593Smuzhiyun trie++;
358*4882a593Smuzhiyun } else {
359*4882a593Smuzhiyun /* No right node. */
360*4882a593Smuzhiyun return NULL;
361*4882a593Smuzhiyun }
362*4882a593Smuzhiyun } else {
363*4882a593Smuzhiyun /* Left leg */
364*4882a593Smuzhiyun if (offlen) {
365*4882a593Smuzhiyun /* Left node after this node. */
366*4882a593Smuzhiyun node = (*trie & LEFTNODE);
367*4882a593Smuzhiyun trie += offlen + 1;
368*4882a593Smuzhiyun } else if (*trie & RIGHTPATH) {
369*4882a593Smuzhiyun /* No left node. */
370*4882a593Smuzhiyun return NULL;
371*4882a593Smuzhiyun } else {
372*4882a593Smuzhiyun /* Left node after this node */
373*4882a593Smuzhiyun node = (*trie & TRIENODE);
374*4882a593Smuzhiyun trie++;
375*4882a593Smuzhiyun }
376*4882a593Smuzhiyun }
377*4882a593Smuzhiyun }
378*4882a593Smuzhiyun /*
379*4882a593Smuzhiyun * Hangul decomposition is done algorithmically. These are the
380*4882a593Smuzhiyun * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
381*4882a593Smuzhiyun * always 3 bytes long, so s has been advanced twice, and the
382*4882a593Smuzhiyun * start of the sequence is at s-2.
383*4882a593Smuzhiyun */
384*4882a593Smuzhiyun if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
385*4882a593Smuzhiyun trie = utf8hangul(s - 2, hangul);
386*4882a593Smuzhiyun return trie;
387*4882a593Smuzhiyun }
388*4882a593Smuzhiyun
389*4882a593Smuzhiyun /*
390*4882a593Smuzhiyun * Use trie to scan s.
391*4882a593Smuzhiyun * Returns the leaf if one exists, NULL otherwise.
392*4882a593Smuzhiyun *
393*4882a593Smuzhiyun * Forwards to utf8nlookup().
394*4882a593Smuzhiyun */
utf8lookup(const struct utf8data * data,unsigned char * hangul,const char * s)395*4882a593Smuzhiyun static utf8leaf_t *utf8lookup(const struct utf8data *data,
396*4882a593Smuzhiyun unsigned char *hangul, const char *s)
397*4882a593Smuzhiyun {
398*4882a593Smuzhiyun return utf8nlookup(data, hangul, s, (size_t)-1);
399*4882a593Smuzhiyun }
400*4882a593Smuzhiyun
401*4882a593Smuzhiyun /*
402*4882a593Smuzhiyun * Maximum age of any character in s.
403*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
404*4882a593Smuzhiyun * Return 0 if only non-assigned code points are used.
405*4882a593Smuzhiyun */
utf8agemax(const struct utf8data * data,const char * s)406*4882a593Smuzhiyun int utf8agemax(const struct utf8data *data, const char *s)
407*4882a593Smuzhiyun {
408*4882a593Smuzhiyun utf8leaf_t *leaf;
409*4882a593Smuzhiyun int age = 0;
410*4882a593Smuzhiyun int leaf_age;
411*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
412*4882a593Smuzhiyun
413*4882a593Smuzhiyun if (!data)
414*4882a593Smuzhiyun return -1;
415*4882a593Smuzhiyun
416*4882a593Smuzhiyun while (*s) {
417*4882a593Smuzhiyun leaf = utf8lookup(data, hangul, s);
418*4882a593Smuzhiyun if (!leaf)
419*4882a593Smuzhiyun return -1;
420*4882a593Smuzhiyun
421*4882a593Smuzhiyun leaf_age = utf8agetab[LEAF_GEN(leaf)];
422*4882a593Smuzhiyun if (leaf_age <= data->maxage && leaf_age > age)
423*4882a593Smuzhiyun age = leaf_age;
424*4882a593Smuzhiyun s += utf8clen(s);
425*4882a593Smuzhiyun }
426*4882a593Smuzhiyun return age;
427*4882a593Smuzhiyun }
428*4882a593Smuzhiyun EXPORT_SYMBOL(utf8agemax);
429*4882a593Smuzhiyun
430*4882a593Smuzhiyun /*
431*4882a593Smuzhiyun * Minimum age of any character in s.
432*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
433*4882a593Smuzhiyun * Return 0 if non-assigned code points are used.
434*4882a593Smuzhiyun */
utf8agemin(const struct utf8data * data,const char * s)435*4882a593Smuzhiyun int utf8agemin(const struct utf8data *data, const char *s)
436*4882a593Smuzhiyun {
437*4882a593Smuzhiyun utf8leaf_t *leaf;
438*4882a593Smuzhiyun int age;
439*4882a593Smuzhiyun int leaf_age;
440*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
441*4882a593Smuzhiyun
442*4882a593Smuzhiyun if (!data)
443*4882a593Smuzhiyun return -1;
444*4882a593Smuzhiyun age = data->maxage;
445*4882a593Smuzhiyun while (*s) {
446*4882a593Smuzhiyun leaf = utf8lookup(data, hangul, s);
447*4882a593Smuzhiyun if (!leaf)
448*4882a593Smuzhiyun return -1;
449*4882a593Smuzhiyun leaf_age = utf8agetab[LEAF_GEN(leaf)];
450*4882a593Smuzhiyun if (leaf_age <= data->maxage && leaf_age < age)
451*4882a593Smuzhiyun age = leaf_age;
452*4882a593Smuzhiyun s += utf8clen(s);
453*4882a593Smuzhiyun }
454*4882a593Smuzhiyun return age;
455*4882a593Smuzhiyun }
456*4882a593Smuzhiyun EXPORT_SYMBOL(utf8agemin);
457*4882a593Smuzhiyun
458*4882a593Smuzhiyun /*
459*4882a593Smuzhiyun * Maximum age of any character in s, touch at most len bytes.
460*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
461*4882a593Smuzhiyun */
utf8nagemax(const struct utf8data * data,const char * s,size_t len)462*4882a593Smuzhiyun int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
463*4882a593Smuzhiyun {
464*4882a593Smuzhiyun utf8leaf_t *leaf;
465*4882a593Smuzhiyun int age = 0;
466*4882a593Smuzhiyun int leaf_age;
467*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
468*4882a593Smuzhiyun
469*4882a593Smuzhiyun if (!data)
470*4882a593Smuzhiyun return -1;
471*4882a593Smuzhiyun
472*4882a593Smuzhiyun while (len && *s) {
473*4882a593Smuzhiyun leaf = utf8nlookup(data, hangul, s, len);
474*4882a593Smuzhiyun if (!leaf)
475*4882a593Smuzhiyun return -1;
476*4882a593Smuzhiyun leaf_age = utf8agetab[LEAF_GEN(leaf)];
477*4882a593Smuzhiyun if (leaf_age <= data->maxage && leaf_age > age)
478*4882a593Smuzhiyun age = leaf_age;
479*4882a593Smuzhiyun len -= utf8clen(s);
480*4882a593Smuzhiyun s += utf8clen(s);
481*4882a593Smuzhiyun }
482*4882a593Smuzhiyun return age;
483*4882a593Smuzhiyun }
484*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nagemax);
485*4882a593Smuzhiyun
486*4882a593Smuzhiyun /*
487*4882a593Smuzhiyun * Maximum age of any character in s, touch at most len bytes.
488*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
489*4882a593Smuzhiyun */
utf8nagemin(const struct utf8data * data,const char * s,size_t len)490*4882a593Smuzhiyun int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
491*4882a593Smuzhiyun {
492*4882a593Smuzhiyun utf8leaf_t *leaf;
493*4882a593Smuzhiyun int leaf_age;
494*4882a593Smuzhiyun int age;
495*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
496*4882a593Smuzhiyun
497*4882a593Smuzhiyun if (!data)
498*4882a593Smuzhiyun return -1;
499*4882a593Smuzhiyun age = data->maxage;
500*4882a593Smuzhiyun while (len && *s) {
501*4882a593Smuzhiyun leaf = utf8nlookup(data, hangul, s, len);
502*4882a593Smuzhiyun if (!leaf)
503*4882a593Smuzhiyun return -1;
504*4882a593Smuzhiyun leaf_age = utf8agetab[LEAF_GEN(leaf)];
505*4882a593Smuzhiyun if (leaf_age <= data->maxage && leaf_age < age)
506*4882a593Smuzhiyun age = leaf_age;
507*4882a593Smuzhiyun len -= utf8clen(s);
508*4882a593Smuzhiyun s += utf8clen(s);
509*4882a593Smuzhiyun }
510*4882a593Smuzhiyun return age;
511*4882a593Smuzhiyun }
512*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nagemin);
513*4882a593Smuzhiyun
514*4882a593Smuzhiyun /*
515*4882a593Smuzhiyun * Length of the normalization of s.
516*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
517*4882a593Smuzhiyun *
518*4882a593Smuzhiyun * A string of Default_Ignorable_Code_Point has length 0.
519*4882a593Smuzhiyun */
utf8len(const struct utf8data * data,const char * s)520*4882a593Smuzhiyun ssize_t utf8len(const struct utf8data *data, const char *s)
521*4882a593Smuzhiyun {
522*4882a593Smuzhiyun utf8leaf_t *leaf;
523*4882a593Smuzhiyun size_t ret = 0;
524*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
525*4882a593Smuzhiyun
526*4882a593Smuzhiyun if (!data)
527*4882a593Smuzhiyun return -1;
528*4882a593Smuzhiyun while (*s) {
529*4882a593Smuzhiyun leaf = utf8lookup(data, hangul, s);
530*4882a593Smuzhiyun if (!leaf)
531*4882a593Smuzhiyun return -1;
532*4882a593Smuzhiyun if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
533*4882a593Smuzhiyun ret += utf8clen(s);
534*4882a593Smuzhiyun else if (LEAF_CCC(leaf) == DECOMPOSE)
535*4882a593Smuzhiyun ret += strlen(LEAF_STR(leaf));
536*4882a593Smuzhiyun else
537*4882a593Smuzhiyun ret += utf8clen(s);
538*4882a593Smuzhiyun s += utf8clen(s);
539*4882a593Smuzhiyun }
540*4882a593Smuzhiyun return ret;
541*4882a593Smuzhiyun }
542*4882a593Smuzhiyun EXPORT_SYMBOL(utf8len);
543*4882a593Smuzhiyun
544*4882a593Smuzhiyun /*
545*4882a593Smuzhiyun * Length of the normalization of s, touch at most len bytes.
546*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
547*4882a593Smuzhiyun */
utf8nlen(const struct utf8data * data,const char * s,size_t len)548*4882a593Smuzhiyun ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
549*4882a593Smuzhiyun {
550*4882a593Smuzhiyun utf8leaf_t *leaf;
551*4882a593Smuzhiyun size_t ret = 0;
552*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
553*4882a593Smuzhiyun
554*4882a593Smuzhiyun if (!data)
555*4882a593Smuzhiyun return -1;
556*4882a593Smuzhiyun while (len && *s) {
557*4882a593Smuzhiyun leaf = utf8nlookup(data, hangul, s, len);
558*4882a593Smuzhiyun if (!leaf)
559*4882a593Smuzhiyun return -1;
560*4882a593Smuzhiyun if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
561*4882a593Smuzhiyun ret += utf8clen(s);
562*4882a593Smuzhiyun else if (LEAF_CCC(leaf) == DECOMPOSE)
563*4882a593Smuzhiyun ret += strlen(LEAF_STR(leaf));
564*4882a593Smuzhiyun else
565*4882a593Smuzhiyun ret += utf8clen(s);
566*4882a593Smuzhiyun len -= utf8clen(s);
567*4882a593Smuzhiyun s += utf8clen(s);
568*4882a593Smuzhiyun }
569*4882a593Smuzhiyun return ret;
570*4882a593Smuzhiyun }
571*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nlen);
572*4882a593Smuzhiyun
573*4882a593Smuzhiyun /*
574*4882a593Smuzhiyun * Set up an utf8cursor for use by utf8byte().
575*4882a593Smuzhiyun *
576*4882a593Smuzhiyun * u8c : pointer to cursor.
577*4882a593Smuzhiyun * data : const struct utf8data to use for normalization.
578*4882a593Smuzhiyun * s : string.
579*4882a593Smuzhiyun * len : length of s.
580*4882a593Smuzhiyun *
581*4882a593Smuzhiyun * Returns -1 on error, 0 on success.
582*4882a593Smuzhiyun */
utf8ncursor(struct utf8cursor * u8c,const struct utf8data * data,const char * s,size_t len)583*4882a593Smuzhiyun int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
584*4882a593Smuzhiyun const char *s, size_t len)
585*4882a593Smuzhiyun {
586*4882a593Smuzhiyun if (!data)
587*4882a593Smuzhiyun return -1;
588*4882a593Smuzhiyun if (!s)
589*4882a593Smuzhiyun return -1;
590*4882a593Smuzhiyun u8c->data = data;
591*4882a593Smuzhiyun u8c->s = s;
592*4882a593Smuzhiyun u8c->p = NULL;
593*4882a593Smuzhiyun u8c->ss = NULL;
594*4882a593Smuzhiyun u8c->sp = NULL;
595*4882a593Smuzhiyun u8c->len = len;
596*4882a593Smuzhiyun u8c->slen = 0;
597*4882a593Smuzhiyun u8c->ccc = STOPPER;
598*4882a593Smuzhiyun u8c->nccc = STOPPER;
599*4882a593Smuzhiyun /* Check we didn't clobber the maximum length. */
600*4882a593Smuzhiyun if (u8c->len != len)
601*4882a593Smuzhiyun return -1;
602*4882a593Smuzhiyun /* The first byte of s may not be an utf8 continuation. */
603*4882a593Smuzhiyun if (len > 0 && (*s & 0xC0) == 0x80)
604*4882a593Smuzhiyun return -1;
605*4882a593Smuzhiyun return 0;
606*4882a593Smuzhiyun }
607*4882a593Smuzhiyun EXPORT_SYMBOL(utf8ncursor);
608*4882a593Smuzhiyun
/*
 * Set up an utf8cursor for use by utf8byte().
 *
 *   u8c  : pointer to cursor.
 *   data : const struct utf8data to use for normalization.
 *   s    : NUL-terminated string.
 *
 * Calls utf8ncursor() with the largest unsigned int as the length, so
 * that only the terminating NUL ends the string.
 *
 * Returns -1 on error, 0 on success.
 */
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s)
{
	const unsigned int unbounded = ~0U;

	return utf8ncursor(u8c, data, s, unbounded);
}
EXPORT_SYMBOL(utf8cursor);
624*4882a593Smuzhiyun
625*4882a593Smuzhiyun /*
626*4882a593Smuzhiyun * Get one byte from the normalized form of the string described by u8c.
627*4882a593Smuzhiyun *
628*4882a593Smuzhiyun * Returns the byte cast to an unsigned char on success, and -1 on failure.
629*4882a593Smuzhiyun *
630*4882a593Smuzhiyun * The cursor keeps track of the location in the string in u8c->s.
631*4882a593Smuzhiyun * When a character is decomposed, the current location is stored in
632*4882a593Smuzhiyun * u8c->p, and u8c->s is set to the start of the decomposition. Note
633*4882a593Smuzhiyun * that bytes from a decomposition do not count against u8c->len.
634*4882a593Smuzhiyun *
635*4882a593Smuzhiyun * Characters are emitted if they match the current CCC in u8c->ccc.
636*4882a593Smuzhiyun * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
637*4882a593Smuzhiyun * and the function returns 0 in that case.
638*4882a593Smuzhiyun *
639*4882a593Smuzhiyun * Sorting by CCC is done by repeatedly scanning the string. The
640*4882a593Smuzhiyun * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
641*4882a593Smuzhiyun * the start of the scan. The first pass finds the lowest CCC to be
642*4882a593Smuzhiyun * emitted and stores it in u8c->nccc, the second pass emits the
643*4882a593Smuzhiyun * characters with this CCC and finds the next lowest CCC. This limits
644*4882a593Smuzhiyun * the number of passes to 1 + the number of different CCCs in the
645*4882a593Smuzhiyun * sequence being scanned.
646*4882a593Smuzhiyun *
647*4882a593Smuzhiyun * Therefore:
648*4882a593Smuzhiyun * u8c->p != NULL -> a decomposition is being scanned.
649*4882a593Smuzhiyun * u8c->ss != NULL -> this is a repeating scan.
650*4882a593Smuzhiyun * u8c->ccc == -1 -> this is the first scan of a repeating scan.
651*4882a593Smuzhiyun */
utf8byte(struct utf8cursor * u8c)652*4882a593Smuzhiyun int utf8byte(struct utf8cursor *u8c)
653*4882a593Smuzhiyun {
654*4882a593Smuzhiyun utf8leaf_t *leaf;
655*4882a593Smuzhiyun int ccc;
656*4882a593Smuzhiyun
657*4882a593Smuzhiyun for (;;) {
658*4882a593Smuzhiyun /* Check for the end of a decomposed character. */
659*4882a593Smuzhiyun if (u8c->p && *u8c->s == '\0') {
660*4882a593Smuzhiyun u8c->s = u8c->p;
661*4882a593Smuzhiyun u8c->p = NULL;
662*4882a593Smuzhiyun }
663*4882a593Smuzhiyun
664*4882a593Smuzhiyun /* Check for end-of-string. */
665*4882a593Smuzhiyun if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
666*4882a593Smuzhiyun /* There is no next byte. */
667*4882a593Smuzhiyun if (u8c->ccc == STOPPER)
668*4882a593Smuzhiyun return 0;
669*4882a593Smuzhiyun /* End-of-string during a scan counts as a stopper. */
670*4882a593Smuzhiyun ccc = STOPPER;
671*4882a593Smuzhiyun goto ccc_mismatch;
672*4882a593Smuzhiyun } else if ((*u8c->s & 0xC0) == 0x80) {
673*4882a593Smuzhiyun /* This is a continuation of the current character. */
674*4882a593Smuzhiyun if (!u8c->p)
675*4882a593Smuzhiyun u8c->len--;
676*4882a593Smuzhiyun return (unsigned char)*u8c->s++;
677*4882a593Smuzhiyun }
678*4882a593Smuzhiyun
679*4882a593Smuzhiyun /* Look up the data for the current character. */
680*4882a593Smuzhiyun if (u8c->p) {
681*4882a593Smuzhiyun leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
682*4882a593Smuzhiyun } else {
683*4882a593Smuzhiyun leaf = utf8nlookup(u8c->data, u8c->hangul,
684*4882a593Smuzhiyun u8c->s, u8c->len);
685*4882a593Smuzhiyun }
686*4882a593Smuzhiyun
687*4882a593Smuzhiyun /* No leaf found implies that the input is a binary blob. */
688*4882a593Smuzhiyun if (!leaf)
689*4882a593Smuzhiyun return -1;
690*4882a593Smuzhiyun
691*4882a593Smuzhiyun ccc = LEAF_CCC(leaf);
692*4882a593Smuzhiyun /* Characters that are too new have CCC 0. */
693*4882a593Smuzhiyun if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
694*4882a593Smuzhiyun ccc = STOPPER;
695*4882a593Smuzhiyun } else if (ccc == DECOMPOSE) {
696*4882a593Smuzhiyun u8c->len -= utf8clen(u8c->s);
697*4882a593Smuzhiyun u8c->p = u8c->s + utf8clen(u8c->s);
698*4882a593Smuzhiyun u8c->s = LEAF_STR(leaf);
699*4882a593Smuzhiyun /* Empty decomposition implies CCC 0. */
700*4882a593Smuzhiyun if (*u8c->s == '\0') {
701*4882a593Smuzhiyun if (u8c->ccc == STOPPER)
702*4882a593Smuzhiyun continue;
703*4882a593Smuzhiyun ccc = STOPPER;
704*4882a593Smuzhiyun goto ccc_mismatch;
705*4882a593Smuzhiyun }
706*4882a593Smuzhiyun
707*4882a593Smuzhiyun leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
708*4882a593Smuzhiyun if (!leaf)
709*4882a593Smuzhiyun return -1;
710*4882a593Smuzhiyun ccc = LEAF_CCC(leaf);
711*4882a593Smuzhiyun }
712*4882a593Smuzhiyun
713*4882a593Smuzhiyun /*
714*4882a593Smuzhiyun * If this is not a stopper, then see if it updates
715*4882a593Smuzhiyun * the next canonical class to be emitted.
716*4882a593Smuzhiyun */
717*4882a593Smuzhiyun if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
718*4882a593Smuzhiyun u8c->nccc = ccc;
719*4882a593Smuzhiyun
720*4882a593Smuzhiyun /*
721*4882a593Smuzhiyun * Return the current byte if this is the current
722*4882a593Smuzhiyun * combining class.
723*4882a593Smuzhiyun */
724*4882a593Smuzhiyun if (ccc == u8c->ccc) {
725*4882a593Smuzhiyun if (!u8c->p)
726*4882a593Smuzhiyun u8c->len--;
727*4882a593Smuzhiyun return (unsigned char)*u8c->s++;
728*4882a593Smuzhiyun }
729*4882a593Smuzhiyun
730*4882a593Smuzhiyun /* Current combining class mismatch. */
731*4882a593Smuzhiyun ccc_mismatch:
732*4882a593Smuzhiyun if (u8c->nccc == STOPPER) {
733*4882a593Smuzhiyun /*
734*4882a593Smuzhiyun * Scan forward for the first canonical class
735*4882a593Smuzhiyun * to be emitted. Save the position from
736*4882a593Smuzhiyun * which to restart.
737*4882a593Smuzhiyun */
738*4882a593Smuzhiyun u8c->ccc = MINCCC - 1;
739*4882a593Smuzhiyun u8c->nccc = ccc;
740*4882a593Smuzhiyun u8c->sp = u8c->p;
741*4882a593Smuzhiyun u8c->ss = u8c->s;
742*4882a593Smuzhiyun u8c->slen = u8c->len;
743*4882a593Smuzhiyun if (!u8c->p)
744*4882a593Smuzhiyun u8c->len -= utf8clen(u8c->s);
745*4882a593Smuzhiyun u8c->s += utf8clen(u8c->s);
746*4882a593Smuzhiyun } else if (ccc != STOPPER) {
747*4882a593Smuzhiyun /* Not a stopper, and not the ccc we're emitting. */
748*4882a593Smuzhiyun if (!u8c->p)
749*4882a593Smuzhiyun u8c->len -= utf8clen(u8c->s);
750*4882a593Smuzhiyun u8c->s += utf8clen(u8c->s);
751*4882a593Smuzhiyun } else if (u8c->nccc != MAXCCC + 1) {
752*4882a593Smuzhiyun /* At a stopper, restart for next ccc. */
753*4882a593Smuzhiyun u8c->ccc = u8c->nccc;
754*4882a593Smuzhiyun u8c->nccc = MAXCCC + 1;
755*4882a593Smuzhiyun u8c->s = u8c->ss;
756*4882a593Smuzhiyun u8c->p = u8c->sp;
757*4882a593Smuzhiyun u8c->len = u8c->slen;
758*4882a593Smuzhiyun } else {
759*4882a593Smuzhiyun /* All done, proceed from here. */
760*4882a593Smuzhiyun u8c->ccc = STOPPER;
761*4882a593Smuzhiyun u8c->nccc = STOPPER;
762*4882a593Smuzhiyun u8c->sp = NULL;
763*4882a593Smuzhiyun u8c->ss = NULL;
764*4882a593Smuzhiyun u8c->slen = 0;
765*4882a593Smuzhiyun }
766*4882a593Smuzhiyun }
767*4882a593Smuzhiyun }
768*4882a593Smuzhiyun EXPORT_SYMBOL(utf8byte);
769*4882a593Smuzhiyun
utf8nfdi(unsigned int maxage)770*4882a593Smuzhiyun const struct utf8data *utf8nfdi(unsigned int maxage)
771*4882a593Smuzhiyun {
772*4882a593Smuzhiyun int i = ARRAY_SIZE(utf8nfdidata) - 1;
773*4882a593Smuzhiyun
774*4882a593Smuzhiyun while (maxage < utf8nfdidata[i].maxage)
775*4882a593Smuzhiyun i--;
776*4882a593Smuzhiyun if (maxage > utf8nfdidata[i].maxage)
777*4882a593Smuzhiyun return NULL;
778*4882a593Smuzhiyun return &utf8nfdidata[i];
779*4882a593Smuzhiyun }
780*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nfdi);
781*4882a593Smuzhiyun
utf8nfdicf(unsigned int maxage)782*4882a593Smuzhiyun const struct utf8data *utf8nfdicf(unsigned int maxage)
783*4882a593Smuzhiyun {
784*4882a593Smuzhiyun int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
785*4882a593Smuzhiyun
786*4882a593Smuzhiyun while (maxage < utf8nfdicfdata[i].maxage)
787*4882a593Smuzhiyun i--;
788*4882a593Smuzhiyun if (maxage > utf8nfdicfdata[i].maxage)
789*4882a593Smuzhiyun return NULL;
790*4882a593Smuzhiyun return &utf8nfdicfdata[i];
791*4882a593Smuzhiyun }
792*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nfdicf);
793