xref: /OK3568_Linux_fs/kernel/fs/unicode/utf8-norm.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-only
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Copyright (c) 2014 SGI.
4*4882a593Smuzhiyun  * All rights reserved.
5*4882a593Smuzhiyun  */
6*4882a593Smuzhiyun 
7*4882a593Smuzhiyun #include "utf8n.h"
8*4882a593Smuzhiyun 
/*
 * Parameters consulted by the lookup and age/length helpers in this file.
 */
struct utf8data {
	/* Highest unicode age honoured; compared against utf8agetab[] entries. */
	unsigned int maxage;
	/* Added to utf8data[] to locate the first node of the trie to walk. */
	unsigned int offset;
};
13*4882a593Smuzhiyun 
14*4882a593Smuzhiyun #define __INCLUDED_FROM_UTF8NORM_C__
15*4882a593Smuzhiyun #include "utf8data.h"
16*4882a593Smuzhiyun #undef __INCLUDED_FROM_UTF8NORM_C__
17*4882a593Smuzhiyun 
int utf8version_is_supported(u8 maj, u8 min, u8 rev)
{
	const unsigned int wanted = UNICODE_AGE(maj, min, rev);
	int idx;

	/*
	 * Scan utf8agetab[] from its last entry towards the front; a zero
	 * entry (or running off the front) ends the search.
	 */
	for (idx = ARRAY_SIZE(utf8agetab) - 1; idx >= 0; idx--) {
		if (utf8agetab[idx] == 0)
			break;
		if (utf8agetab[idx] == wanted)
			return 1;
	}

	return 0;
}
EXPORT_SYMBOL(utf8version_is_supported);
31*4882a593Smuzhiyun 
int utf8version_latest(void)
{
	/* Hand back utf8vers unchanged (presumably provided by utf8data.h). */
	return utf8vers;
}
EXPORT_SYMBOL(utf8version_latest);
37*4882a593Smuzhiyun 
38*4882a593Smuzhiyun /*
39*4882a593Smuzhiyun  * UTF-8 valid ranges.
40*4882a593Smuzhiyun  *
41*4882a593Smuzhiyun  * The UTF-8 encoding spreads the bits of a 32bit word over several
42*4882a593Smuzhiyun  * bytes. This table gives the ranges that can be held and how they'd
43*4882a593Smuzhiyun  * be represented.
44*4882a593Smuzhiyun  *
45*4882a593Smuzhiyun  * 0x00000000 0x0000007F: 0xxxxxxx
46*4882a593Smuzhiyun  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
47*4882a593Smuzhiyun  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
48*4882a593Smuzhiyun  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
49*4882a593Smuzhiyun  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
50*4882a593Smuzhiyun  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
51*4882a593Smuzhiyun  *
52*4882a593Smuzhiyun  * There is an additional requirement on UTF-8, in that only the
53*4882a593Smuzhiyun  * shortest representation of a 32bit value is to be used.  A decoder
54*4882a593Smuzhiyun  * must not decode sequences that do not satisfy this requirement.
55*4882a593Smuzhiyun  * Thus the allowed ranges have a lower bound.
56*4882a593Smuzhiyun  *
57*4882a593Smuzhiyun  * 0x00000000 0x0000007F: 0xxxxxxx
58*4882a593Smuzhiyun  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
59*4882a593Smuzhiyun  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
60*4882a593Smuzhiyun  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
61*4882a593Smuzhiyun  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
62*4882a593Smuzhiyun  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
63*4882a593Smuzhiyun  *
64*4882a593Smuzhiyun  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
65*4882a593Smuzhiyun  * 17 planes of 65536 values.  This limits the sequences actually seen
66*4882a593Smuzhiyun  * even more, to just the following.
67*4882a593Smuzhiyun  *
68*4882a593Smuzhiyun  *          0 -     0x7F: 0                   - 0x7F
69*4882a593Smuzhiyun  *       0x80 -    0x7FF: 0xC2 0x80           - 0xDF 0xBF
70*4882a593Smuzhiyun  *      0x800 -   0xFFFF: 0xE0 0xA0 0x80      - 0xEF 0xBF 0xBF
71*4882a593Smuzhiyun  *    0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF
72*4882a593Smuzhiyun  *
73*4882a593Smuzhiyun  * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed.
74*4882a593Smuzhiyun  *
75*4882a593Smuzhiyun  * Note that the longest sequence seen with valid usage is 4 bytes,
76*4882a593Smuzhiyun  * the same as a single UTF-32 character.  This makes the UTF-8
77*4882a593Smuzhiyun  * representation of Unicode strictly smaller than UTF-32.
78*4882a593Smuzhiyun  *
79*4882a593Smuzhiyun  * The shortest sequence requirement was introduced by:
80*4882a593Smuzhiyun  *    Corrigendum #1: UTF-8 Shortest Form
81*4882a593Smuzhiyun  * It can be found here:
82*4882a593Smuzhiyun  *    http://www.unicode.org/versions/corrigendum1.html
83*4882a593Smuzhiyun  *
84*4882a593Smuzhiyun  */
85*4882a593Smuzhiyun 
86*4882a593Smuzhiyun /*
87*4882a593Smuzhiyun  * Return the number of bytes used by the current UTF-8 sequence.
88*4882a593Smuzhiyun  * Assumes the input points to the first byte of a valid UTF-8
89*4882a593Smuzhiyun  * sequence.
90*4882a593Smuzhiyun  */
static inline int utf8clen(const char *s)
{
	const unsigned char lead = (unsigned char)s[0];
	int nbytes = 1;

	/* Every lead-byte threshold that is reached adds one byte. */
	if (lead >= 0xC0)
		nbytes++;
	if (lead >= 0xE0)
		nbytes++;
	if (lead >= 0xF0)
		nbytes++;

	return nbytes;
}
97*4882a593Smuzhiyun 
98*4882a593Smuzhiyun /*
99*4882a593Smuzhiyun  * Decode a 3-byte UTF-8 sequence.
100*4882a593Smuzhiyun  */
static unsigned int
utf8decode3(const char *str)
{
	/* 4 payload bits in the lead byte, 6 in each of the two that follow. */
	const unsigned int hi = str[0] & 0x0F;
	const unsigned int mid = str[1] & 0x3F;
	const unsigned int lo = str[2] & 0x3F;

	return (hi << 12) | (mid << 6) | lo;
}
114*4882a593Smuzhiyun 
115*4882a593Smuzhiyun /*
116*4882a593Smuzhiyun  * Encode a 3-byte UTF-8 sequence.
117*4882a593Smuzhiyun  */
static int
utf8encode3(char *str, unsigned int val)
{
	/* Lead byte: everything above the low 12 bits, tagged with 0xE0. */
	str[0] = (val >> 12) | 0xE0;
	/* Two trailing bytes, 6 payload bits each, tagged with 0x80. */
	str[1] = ((val >> 6) & 0x3F) | 0x80;
	str[2] = (val & 0x3F) | 0x80;

	/* Number of bytes written. */
	return 3;
}
129*4882a593Smuzhiyun 
130*4882a593Smuzhiyun /*
131*4882a593Smuzhiyun  * utf8trie_t
132*4882a593Smuzhiyun  *
133*4882a593Smuzhiyun  * A compact binary tree, used to decode UTF-8 characters.
134*4882a593Smuzhiyun  *
135*4882a593Smuzhiyun  * Internal nodes are one byte for the node itself, and up to three
136*4882a593Smuzhiyun  * bytes for an offset into the tree.  The first byte contains the
137*4882a593Smuzhiyun  * following information:
138*4882a593Smuzhiyun  *  NEXTBYTE  - flag        - advance to next byte if set
139*4882a593Smuzhiyun  *  BITNUM    - 3 bit field - the bit number to be tested
140*4882a593Smuzhiyun  *  OFFLEN    - 2 bit field - number of bytes in the offset
141*4882a593Smuzhiyun  * if offlen == 0 (non-branching node)
142*4882a593Smuzhiyun  *  RIGHTPATH - 1 bit field - set if the following node is for the
143*4882a593Smuzhiyun  *                            right-hand path (tested bit is set)
144*4882a593Smuzhiyun  *  TRIENODE  - 1 bit field - set if the following node is an internal
145*4882a593Smuzhiyun  *                            node, otherwise it is a leaf node
146*4882a593Smuzhiyun  * if offlen != 0 (branching node)
147*4882a593Smuzhiyun  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
148*4882a593Smuzhiyun  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
149*4882a593Smuzhiyun  *
150*4882a593Smuzhiyun  * Due to the way utf8 works, there cannot be branching nodes with
151*4882a593Smuzhiyun  * NEXTBYTE set, and moreover those nodes always have a righthand
152*4882a593Smuzhiyun  * descendant.
153*4882a593Smuzhiyun  */
/* Trie nodes are read-only bytes. */
typedef const unsigned char utf8trie_t;
/* Masks applied to the first byte of an internal node. */
#define BITNUM		0x07	/* number of the bit to test */
#define NEXTBYTE	0x08	/* advance to the next input byte first */
#define OFFLEN		0x30	/* number of offset bytes after the node */
#define OFFLEN_SHIFT	4	/* shift that moves OFFLEN down to bit 0 */
/* Meaning of the top two bits when offlen == 0 (non-branching node). */
#define RIGHTPATH	0x40	/* following node is for the right-hand path */
#define TRIENODE	0x80	/* following node is internal, not a leaf */
/* Meaning of the same two bits when offlen != 0 (branching node). */
#define RIGHTNODE	0x40	/* right-hand node is internal */
#define LEFTNODE	0x80	/* left-hand node is internal */
163*4882a593Smuzhiyun 
164*4882a593Smuzhiyun /*
165*4882a593Smuzhiyun  * utf8leaf_t
166*4882a593Smuzhiyun  *
167*4882a593Smuzhiyun  * The leaves of the trie are embedded in the trie, and so the same
168*4882a593Smuzhiyun  * underlying datatype: unsigned char.
169*4882a593Smuzhiyun  *
170*4882a593Smuzhiyun  * leaf[0]: The unicode version, stored as a generation number that is
171*4882a593Smuzhiyun  *          an index into utf8agetab[].  With this we can filter code
172*4882a593Smuzhiyun  *          points based on the unicode version in which they were
173*4882a593Smuzhiyun  *          defined.  The CCC of a non-defined code point is 0.
174*4882a593Smuzhiyun  * leaf[1]: Canonical Combining Class. During normalization, we need
175*4882a593Smuzhiyun  *          to do a stable sort into ascending order of all characters
176*4882a593Smuzhiyun  *          with a non-zero CCC that occur between two characters with
177*4882a593Smuzhiyun  *          a CCC of 0, or at the begin or end of a string.
178*4882a593Smuzhiyun  *          The unicode standard guarantees that all CCC values are
179*4882a593Smuzhiyun  *          between 0 and 254 inclusive, which leaves 255 available as
180*4882a593Smuzhiyun  *          a special value.
181*4882a593Smuzhiyun  *          Code points with CCC 0 are known as stoppers.
182*4882a593Smuzhiyun  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
183*4882a593Smuzhiyun  *          start of a NUL-terminated string that is the decomposition
184*4882a593Smuzhiyun  *          of the character.
185*4882a593Smuzhiyun  *          The CCC of a decomposable character is the same as the CCC
186*4882a593Smuzhiyun  *          of the first character of its decomposition.
187*4882a593Smuzhiyun  *          Some characters decompose as the empty string: these are
188*4882a593Smuzhiyun  *          characters with the Default_Ignorable_Code_Point property.
189*4882a593Smuzhiyun  *          These do affect normalization, as they all have CCC 0.
190*4882a593Smuzhiyun  *
191*4882a593Smuzhiyun  * The decompositions in the trie have been fully expanded, with the
192*4882a593Smuzhiyun  * exception of Hangul syllables, which are decomposed algorithmically.
193*4882a593Smuzhiyun  *
194*4882a593Smuzhiyun  * Casefolding, if applicable, is also done using decompositions.
195*4882a593Smuzhiyun  *
196*4882a593Smuzhiyun  * The trie is constructed in such a way that leaves exist for all
197*4882a593Smuzhiyun  * UTF-8 sequences that match the criteria from the "UTF-8 valid
198*4882a593Smuzhiyun  * ranges" comment above, and only for those sequences.  Therefore a
199*4882a593Smuzhiyun  * lookup in the trie can be used to validate the UTF-8 input.
200*4882a593Smuzhiyun  */
/* Leaves are read-only bytes embedded in the trie. */
typedef const unsigned char utf8leaf_t;

/* Accessors for the leaf layout: generation, CCC, decomposition string. */
#define LEAF_GEN(LEAF)	((LEAF)[0])
#define LEAF_CCC(LEAF)	((LEAF)[1])
#define LEAF_STR(LEAF)	((const char *)((LEAF) + 2))

/* CCC range, plus the two distinguished CCC values used by this file. */
#define MINCCC		(0)
#define MAXCCC		(254)
/* CCC 0: the code point is a stopper. */
#define STOPPER		(0)
/* Out-of-range CCC marking a leaf that carries a decomposition string. */
#define	DECOMPOSE	(255)

/* Marker for hangul syllable decomposition. */
#define HANGUL		((char)(255))
/* Size of the synthesized leaf used for Hangul syllable decomposition. */
#define UTF8HANGULLEAF	(12)
216*4882a593Smuzhiyun 
217*4882a593Smuzhiyun /*
218*4882a593Smuzhiyun  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
219*4882a593Smuzhiyun  *
220*4882a593Smuzhiyun  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
221*4882a593Smuzhiyun  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
222*4882a593Smuzhiyun  *
223*4882a593Smuzhiyun  * SBase = 0xAC00
224*4882a593Smuzhiyun  * LBase = 0x1100
225*4882a593Smuzhiyun  * VBase = 0x1161
226*4882a593Smuzhiyun  * TBase = 0x11A7
227*4882a593Smuzhiyun  * LCount = 19
228*4882a593Smuzhiyun  * VCount = 21
229*4882a593Smuzhiyun  * TCount = 28
230*4882a593Smuzhiyun  * NCount = 588 (VCount * TCount)
231*4882a593Smuzhiyun  * SCount = 11172 (LCount * NCount)
232*4882a593Smuzhiyun  *
233*4882a593Smuzhiyun  * Decomposition:
234*4882a593Smuzhiyun  *   SIndex = s - SBase
235*4882a593Smuzhiyun  *
236*4882a593Smuzhiyun  * LV (Canonical/Full)
237*4882a593Smuzhiyun  *   LIndex = SIndex / NCount
238*4882a593Smuzhiyun  *   VIndex = (SIndex % NCount) / TCount
239*4882a593Smuzhiyun  *   LPart = LBase + LIndex
240*4882a593Smuzhiyun  *   VPart = VBase + VIndex
241*4882a593Smuzhiyun  *
242*4882a593Smuzhiyun  * LVT (Canonical)
243*4882a593Smuzhiyun  *   LVIndex = (SIndex / TCount) * TCount
244*4882a593Smuzhiyun  *   TIndex = (SIndex % TCount)
245*4882a593Smuzhiyun  *   LVPart = SBase + LVIndex
246*4882a593Smuzhiyun  *   TPart = TBase + TIndex
247*4882a593Smuzhiyun  *
248*4882a593Smuzhiyun  * LVT (Full)
249*4882a593Smuzhiyun  *   LIndex = SIndex / NCount
250*4882a593Smuzhiyun  *   VIndex = (SIndex % NCount) / TCount
251*4882a593Smuzhiyun  *   TIndex = (SIndex % TCount)
252*4882a593Smuzhiyun  *   LPart = LBase + LIndex
253*4882a593Smuzhiyun  *   VPart = VBase + VIndex
254*4882a593Smuzhiyun  *   if (TIndex == 0) {
255*4882a593Smuzhiyun  *          d = <LPart, VPart>
256*4882a593Smuzhiyun  *   } else {
257*4882a593Smuzhiyun  *          TPart = TBase + TIndex
258*4882a593Smuzhiyun  *          d = <LPart, VPart, TPart>
259*4882a593Smuzhiyun  *   }
260*4882a593Smuzhiyun  */
261*4882a593Smuzhiyun 
262*4882a593Smuzhiyun /* Constants */
#define SB	(0xAC00)	/* SBase: first Hangul syllable */
#define LB	(0x1100)	/* LBase */
#define VB	(0x1161)	/* VBase */
#define TB	(0x11A7)	/* TBase */
#define LC	(19)		/* LCount */
#define VC	(21)		/* VCount */
#define TC	(28)		/* TCount */
#define NC	(VC * TC)	/* NCount = 588 */
#define SC	(LC * NC)	/* SCount = 11172 */
272*4882a593Smuzhiyun 
273*4882a593Smuzhiyun /* Algorithmic decomposition of hangul syllable. */
274*4882a593Smuzhiyun static utf8leaf_t *
utf8hangul(const char * str,unsigned char * hangul)275*4882a593Smuzhiyun utf8hangul(const char *str, unsigned char *hangul)
276*4882a593Smuzhiyun {
277*4882a593Smuzhiyun 	unsigned int	si;
278*4882a593Smuzhiyun 	unsigned int	li;
279*4882a593Smuzhiyun 	unsigned int	vi;
280*4882a593Smuzhiyun 	unsigned int	ti;
281*4882a593Smuzhiyun 	unsigned char	*h;
282*4882a593Smuzhiyun 
283*4882a593Smuzhiyun 	/* Calculate the SI, LI, VI, and TI values. */
284*4882a593Smuzhiyun 	si = utf8decode3(str) - SB;
285*4882a593Smuzhiyun 	li = si / NC;
286*4882a593Smuzhiyun 	vi = (si % NC) / TC;
287*4882a593Smuzhiyun 	ti = si % TC;
288*4882a593Smuzhiyun 
289*4882a593Smuzhiyun 	/* Fill in base of leaf. */
290*4882a593Smuzhiyun 	h = hangul;
291*4882a593Smuzhiyun 	LEAF_GEN(h) = 2;
292*4882a593Smuzhiyun 	LEAF_CCC(h) = DECOMPOSE;
293*4882a593Smuzhiyun 	h += 2;
294*4882a593Smuzhiyun 
295*4882a593Smuzhiyun 	/* Add LPart, a 3-byte UTF-8 sequence. */
296*4882a593Smuzhiyun 	h += utf8encode3((char *)h, li + LB);
297*4882a593Smuzhiyun 
298*4882a593Smuzhiyun 	/* Add VPart, a 3-byte UTF-8 sequence. */
299*4882a593Smuzhiyun 	h += utf8encode3((char *)h, vi + VB);
300*4882a593Smuzhiyun 
301*4882a593Smuzhiyun 	/* Add TPart if required, also a 3-byte UTF-8 sequence. */
302*4882a593Smuzhiyun 	if (ti)
303*4882a593Smuzhiyun 		h += utf8encode3((char *)h, ti + TB);
304*4882a593Smuzhiyun 
305*4882a593Smuzhiyun 	/* Terminate string. */
306*4882a593Smuzhiyun 	h[0] = '\0';
307*4882a593Smuzhiyun 
308*4882a593Smuzhiyun 	return hangul;
309*4882a593Smuzhiyun }
310*4882a593Smuzhiyun 
311*4882a593Smuzhiyun /*
312*4882a593Smuzhiyun  * Use trie to scan s, touching at most len bytes.
313*4882a593Smuzhiyun  * Returns the leaf if one exists, NULL otherwise.
314*4882a593Smuzhiyun  *
315*4882a593Smuzhiyun  * A non-NULL return guarantees that the UTF-8 sequence starting at s
316*4882a593Smuzhiyun  * is well-formed and corresponds to a known unicode code point.  The
317*4882a593Smuzhiyun  * shorthand for this will be "is valid UTF-8 unicode".
318*4882a593Smuzhiyun  */
static utf8leaf_t *utf8nlookup(const struct utf8data *data,
			       unsigned char *hangul, const char *s, size_t len)
{
	utf8trie_t	*trie = NULL;
	int		offlen;
	int		offset;
	int		mask;
	int		node;

	/* Nothing to look up without a table or without input bytes. */
	if (!data)
		return NULL;
	if (len == 0)
		return NULL;

	/* Start at the trie selected by data and walk until a leaf is hit. */
	trie = utf8data + data->offset;
	node = 1;
	while (node) {
		/* Number of offset bytes stored after this node byte. */
		offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
		if (*trie & NEXTBYTE) {
			/* Input ran out in the middle of a sequence. */
			if (--len == 0)
				return NULL;
			s++;
		}
		/* Branch on the bit of the current input byte this node names. */
		mask = 1 << (*trie & BITNUM);
		if (*s & mask) {
			/* Right leg */
			if (offlen) {
				/* Right node at offset of trie */
				node = (*trie & RIGHTNODE);
				/*
				 * Assemble the offset from trie[offlen] down
				 * to trie[1]; trie[offlen] ends up as the most
				 * significant byte.
				 */
				offset = trie[offlen];
				while (--offlen) {
					offset <<= 8;
					offset |= trie[offlen];
				}
				trie += offset;
			} else if (*trie & RIGHTPATH) {
				/* Right node after this node */
				node = (*trie & TRIENODE);
				trie++;
			} else {
				/* No right node. */
				return NULL;
			}
		} else {
			/* Left leg */
			if (offlen) {
				/* Left node after this node. */
				node = (*trie & LEFTNODE);
				/* Skip the node byte and its offset bytes. */
				trie += offlen + 1;
			} else if (*trie & RIGHTPATH) {
				/* No left node. */
				return NULL;
			} else {
				/* Left node after this node */
				node = (*trie & TRIENODE);
				trie++;
			}
		}
	}
	/*
	 * Hangul decomposition is done algorithmically. These are the
	 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
	 * always 3 bytes long, so s has been advanced twice, and the
	 * start of the sequence is at s-2.
	 */
	if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
		trie = utf8hangul(s - 2, hangul);
	/* trie now points at the leaf (possibly the one built in hangul). */
	return trie;
}
388*4882a593Smuzhiyun 
389*4882a593Smuzhiyun /*
390*4882a593Smuzhiyun  * Use trie to scan s.
391*4882a593Smuzhiyun  * Returns the leaf if one exists, NULL otherwise.
392*4882a593Smuzhiyun  *
393*4882a593Smuzhiyun  * Forwards to utf8nlookup().
394*4882a593Smuzhiyun  */
static utf8leaf_t *utf8lookup(const struct utf8data *data,
			      unsigned char *hangul, const char *s)
{
	/* (size_t)-1 is the largest size_t, so the length check never triggers in practice. */
	return utf8nlookup(data, hangul, s, (size_t)-1);
}
400*4882a593Smuzhiyun 
401*4882a593Smuzhiyun /*
402*4882a593Smuzhiyun  * Maximum age of any character in s.
403*4882a593Smuzhiyun  * Return -1 if s is not valid UTF-8 unicode.
404*4882a593Smuzhiyun  * Return 0 if only non-assigned code points are used.
405*4882a593Smuzhiyun  */
utf8agemax(const struct utf8data * data,const char * s)406*4882a593Smuzhiyun int utf8agemax(const struct utf8data *data, const char *s)
407*4882a593Smuzhiyun {
408*4882a593Smuzhiyun 	utf8leaf_t	*leaf;
409*4882a593Smuzhiyun 	int		age = 0;
410*4882a593Smuzhiyun 	int		leaf_age;
411*4882a593Smuzhiyun 	unsigned char	hangul[UTF8HANGULLEAF];
412*4882a593Smuzhiyun 
413*4882a593Smuzhiyun 	if (!data)
414*4882a593Smuzhiyun 		return -1;
415*4882a593Smuzhiyun 
416*4882a593Smuzhiyun 	while (*s) {
417*4882a593Smuzhiyun 		leaf = utf8lookup(data, hangul, s);
418*4882a593Smuzhiyun 		if (!leaf)
419*4882a593Smuzhiyun 			return -1;
420*4882a593Smuzhiyun 
421*4882a593Smuzhiyun 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
422*4882a593Smuzhiyun 		if (leaf_age <= data->maxage && leaf_age > age)
423*4882a593Smuzhiyun 			age = leaf_age;
424*4882a593Smuzhiyun 		s += utf8clen(s);
425*4882a593Smuzhiyun 	}
426*4882a593Smuzhiyun 	return age;
427*4882a593Smuzhiyun }
428*4882a593Smuzhiyun EXPORT_SYMBOL(utf8agemax);
429*4882a593Smuzhiyun 
430*4882a593Smuzhiyun /*
431*4882a593Smuzhiyun  * Minimum age of any character in s.
432*4882a593Smuzhiyun  * Return -1 if s is not valid UTF-8 unicode.
433*4882a593Smuzhiyun  * Return 0 if non-assigned code points are used.
434*4882a593Smuzhiyun  */
utf8agemin(const struct utf8data * data,const char * s)435*4882a593Smuzhiyun int utf8agemin(const struct utf8data *data, const char *s)
436*4882a593Smuzhiyun {
437*4882a593Smuzhiyun 	utf8leaf_t	*leaf;
438*4882a593Smuzhiyun 	int		age;
439*4882a593Smuzhiyun 	int		leaf_age;
440*4882a593Smuzhiyun 	unsigned char	hangul[UTF8HANGULLEAF];
441*4882a593Smuzhiyun 
442*4882a593Smuzhiyun 	if (!data)
443*4882a593Smuzhiyun 		return -1;
444*4882a593Smuzhiyun 	age = data->maxage;
445*4882a593Smuzhiyun 	while (*s) {
446*4882a593Smuzhiyun 		leaf = utf8lookup(data, hangul, s);
447*4882a593Smuzhiyun 		if (!leaf)
448*4882a593Smuzhiyun 			return -1;
449*4882a593Smuzhiyun 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
450*4882a593Smuzhiyun 		if (leaf_age <= data->maxage && leaf_age < age)
451*4882a593Smuzhiyun 			age = leaf_age;
452*4882a593Smuzhiyun 		s += utf8clen(s);
453*4882a593Smuzhiyun 	}
454*4882a593Smuzhiyun 	return age;
455*4882a593Smuzhiyun }
456*4882a593Smuzhiyun EXPORT_SYMBOL(utf8agemin);
457*4882a593Smuzhiyun 
458*4882a593Smuzhiyun /*
459*4882a593Smuzhiyun  * Maximum age of any character in s, touch at most len bytes.
460*4882a593Smuzhiyun  * Return -1 if s is not valid UTF-8 unicode.
461*4882a593Smuzhiyun  */
utf8nagemax(const struct utf8data * data,const char * s,size_t len)462*4882a593Smuzhiyun int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
463*4882a593Smuzhiyun {
464*4882a593Smuzhiyun 	utf8leaf_t	*leaf;
465*4882a593Smuzhiyun 	int		age = 0;
466*4882a593Smuzhiyun 	int		leaf_age;
467*4882a593Smuzhiyun 	unsigned char	hangul[UTF8HANGULLEAF];
468*4882a593Smuzhiyun 
469*4882a593Smuzhiyun 	if (!data)
470*4882a593Smuzhiyun 		return -1;
471*4882a593Smuzhiyun 
472*4882a593Smuzhiyun 	while (len && *s) {
473*4882a593Smuzhiyun 		leaf = utf8nlookup(data, hangul, s, len);
474*4882a593Smuzhiyun 		if (!leaf)
475*4882a593Smuzhiyun 			return -1;
476*4882a593Smuzhiyun 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
477*4882a593Smuzhiyun 		if (leaf_age <= data->maxage && leaf_age > age)
478*4882a593Smuzhiyun 			age = leaf_age;
479*4882a593Smuzhiyun 		len -= utf8clen(s);
480*4882a593Smuzhiyun 		s += utf8clen(s);
481*4882a593Smuzhiyun 	}
482*4882a593Smuzhiyun 	return age;
483*4882a593Smuzhiyun }
484*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nagemax);
485*4882a593Smuzhiyun 
486*4882a593Smuzhiyun /*
487*4882a593Smuzhiyun  * Minimum age of any character in s, touch at most len bytes.
488*4882a593Smuzhiyun  * Return -1 if s is not valid UTF-8 unicode.
489*4882a593Smuzhiyun  */
utf8nagemin(const struct utf8data * data,const char * s,size_t len)490*4882a593Smuzhiyun int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
491*4882a593Smuzhiyun {
492*4882a593Smuzhiyun 	utf8leaf_t	*leaf;
493*4882a593Smuzhiyun 	int		leaf_age;
494*4882a593Smuzhiyun 	int		age;
495*4882a593Smuzhiyun 	unsigned char	hangul[UTF8HANGULLEAF];
496*4882a593Smuzhiyun 
497*4882a593Smuzhiyun 	if (!data)
498*4882a593Smuzhiyun 		return -1;
499*4882a593Smuzhiyun 	age = data->maxage;
500*4882a593Smuzhiyun 	while (len && *s) {
501*4882a593Smuzhiyun 		leaf = utf8nlookup(data, hangul, s, len);
502*4882a593Smuzhiyun 		if (!leaf)
503*4882a593Smuzhiyun 			return -1;
504*4882a593Smuzhiyun 		leaf_age = utf8agetab[LEAF_GEN(leaf)];
505*4882a593Smuzhiyun 		if (leaf_age <= data->maxage && leaf_age < age)
506*4882a593Smuzhiyun 			age = leaf_age;
507*4882a593Smuzhiyun 		len -= utf8clen(s);
508*4882a593Smuzhiyun 		s += utf8clen(s);
509*4882a593Smuzhiyun 	}
510*4882a593Smuzhiyun 	return age;
511*4882a593Smuzhiyun }
512*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nagemin);
513*4882a593Smuzhiyun 
514*4882a593Smuzhiyun /*
515*4882a593Smuzhiyun  * Length of the normalization of s.
516*4882a593Smuzhiyun  * Return -1 if s is not valid UTF-8 unicode.
517*4882a593Smuzhiyun  *
518*4882a593Smuzhiyun  * A string of Default_Ignorable_Code_Point has length 0.
519*4882a593Smuzhiyun  */
utf8len(const struct utf8data * data,const char * s)520*4882a593Smuzhiyun ssize_t utf8len(const struct utf8data *data, const char *s)
521*4882a593Smuzhiyun {
522*4882a593Smuzhiyun 	utf8leaf_t	*leaf;
523*4882a593Smuzhiyun 	size_t		ret = 0;
524*4882a593Smuzhiyun 	unsigned char	hangul[UTF8HANGULLEAF];
525*4882a593Smuzhiyun 
526*4882a593Smuzhiyun 	if (!data)
527*4882a593Smuzhiyun 		return -1;
528*4882a593Smuzhiyun 	while (*s) {
529*4882a593Smuzhiyun 		leaf = utf8lookup(data, hangul, s);
530*4882a593Smuzhiyun 		if (!leaf)
531*4882a593Smuzhiyun 			return -1;
532*4882a593Smuzhiyun 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
533*4882a593Smuzhiyun 			ret += utf8clen(s);
534*4882a593Smuzhiyun 		else if (LEAF_CCC(leaf) == DECOMPOSE)
535*4882a593Smuzhiyun 			ret += strlen(LEAF_STR(leaf));
536*4882a593Smuzhiyun 		else
537*4882a593Smuzhiyun 			ret += utf8clen(s);
538*4882a593Smuzhiyun 		s += utf8clen(s);
539*4882a593Smuzhiyun 	}
540*4882a593Smuzhiyun 	return ret;
541*4882a593Smuzhiyun }
542*4882a593Smuzhiyun EXPORT_SYMBOL(utf8len);
543*4882a593Smuzhiyun 
544*4882a593Smuzhiyun /*
545*4882a593Smuzhiyun  * Length of the normalization of s, touch at most len bytes.
546*4882a593Smuzhiyun  * Return -1 if s is not valid UTF-8 unicode.
547*4882a593Smuzhiyun  */
utf8nlen(const struct utf8data * data,const char * s,size_t len)548*4882a593Smuzhiyun ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
549*4882a593Smuzhiyun {
550*4882a593Smuzhiyun 	utf8leaf_t	*leaf;
551*4882a593Smuzhiyun 	size_t		ret = 0;
552*4882a593Smuzhiyun 	unsigned char	hangul[UTF8HANGULLEAF];
553*4882a593Smuzhiyun 
554*4882a593Smuzhiyun 	if (!data)
555*4882a593Smuzhiyun 		return -1;
556*4882a593Smuzhiyun 	while (len && *s) {
557*4882a593Smuzhiyun 		leaf = utf8nlookup(data, hangul, s, len);
558*4882a593Smuzhiyun 		if (!leaf)
559*4882a593Smuzhiyun 			return -1;
560*4882a593Smuzhiyun 		if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
561*4882a593Smuzhiyun 			ret += utf8clen(s);
562*4882a593Smuzhiyun 		else if (LEAF_CCC(leaf) == DECOMPOSE)
563*4882a593Smuzhiyun 			ret += strlen(LEAF_STR(leaf));
564*4882a593Smuzhiyun 		else
565*4882a593Smuzhiyun 			ret += utf8clen(s);
566*4882a593Smuzhiyun 		len -= utf8clen(s);
567*4882a593Smuzhiyun 		s += utf8clen(s);
568*4882a593Smuzhiyun 	}
569*4882a593Smuzhiyun 	return ret;
570*4882a593Smuzhiyun }
571*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nlen);
572*4882a593Smuzhiyun 
573*4882a593Smuzhiyun /*
574*4882a593Smuzhiyun  * Set up an utf8cursor for use by utf8byte().
575*4882a593Smuzhiyun  *
576*4882a593Smuzhiyun  *   u8c    : pointer to cursor.
577*4882a593Smuzhiyun  *   data   : const struct utf8data to use for normalization.
578*4882a593Smuzhiyun  *   s      : string.
579*4882a593Smuzhiyun  *   len    : length of s.
580*4882a593Smuzhiyun  *
581*4882a593Smuzhiyun  * Returns -1 on error, 0 on success.
582*4882a593Smuzhiyun  */
utf8ncursor(struct utf8cursor * u8c,const struct utf8data * data,const char * s,size_t len)583*4882a593Smuzhiyun int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
584*4882a593Smuzhiyun 		const char *s, size_t len)
585*4882a593Smuzhiyun {
586*4882a593Smuzhiyun 	if (!data)
587*4882a593Smuzhiyun 		return -1;
588*4882a593Smuzhiyun 	if (!s)
589*4882a593Smuzhiyun 		return -1;
590*4882a593Smuzhiyun 	u8c->data = data;
591*4882a593Smuzhiyun 	u8c->s = s;
592*4882a593Smuzhiyun 	u8c->p = NULL;
593*4882a593Smuzhiyun 	u8c->ss = NULL;
594*4882a593Smuzhiyun 	u8c->sp = NULL;
595*4882a593Smuzhiyun 	u8c->len = len;
596*4882a593Smuzhiyun 	u8c->slen = 0;
597*4882a593Smuzhiyun 	u8c->ccc = STOPPER;
598*4882a593Smuzhiyun 	u8c->nccc = STOPPER;
599*4882a593Smuzhiyun 	/* Check we didn't clobber the maximum length. */
600*4882a593Smuzhiyun 	if (u8c->len != len)
601*4882a593Smuzhiyun 		return -1;
602*4882a593Smuzhiyun 	/* The first byte of s may not be an utf8 continuation. */
603*4882a593Smuzhiyun 	if (len > 0 && (*s & 0xC0) == 0x80)
604*4882a593Smuzhiyun 		return -1;
605*4882a593Smuzhiyun 	return 0;
606*4882a593Smuzhiyun }
607*4882a593Smuzhiyun EXPORT_SYMBOL(utf8ncursor);
608*4882a593Smuzhiyun 
609*4882a593Smuzhiyun /*
610*4882a593Smuzhiyun  * Set up an utf8cursor for use by utf8byte().
611*4882a593Smuzhiyun  *
612*4882a593Smuzhiyun  *   u8c    : pointer to cursor.
613*4882a593Smuzhiyun  *   data   : const struct utf8data to use for normalization.
614*4882a593Smuzhiyun  *   s      : NUL-terminated string.
615*4882a593Smuzhiyun  *
616*4882a593Smuzhiyun  * Returns -1 on error, 0 on success.
617*4882a593Smuzhiyun  */
int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
	       const char *s)
{
	/*
	 * Pass the largest unsigned int as the length; s is expected to be
	 * NUL-terminated, so the terminator ends the scan instead.
	 */
	return utf8ncursor(u8c, data, s, (unsigned int)-1);
}
EXPORT_SYMBOL(utf8cursor);
624*4882a593Smuzhiyun 
625*4882a593Smuzhiyun /*
626*4882a593Smuzhiyun  * Get one byte from the normalized form of the string described by u8c.
627*4882a593Smuzhiyun  *
628*4882a593Smuzhiyun  * Returns the byte cast to an unsigned char on success, and -1 on failure.
629*4882a593Smuzhiyun  *
630*4882a593Smuzhiyun  * The cursor keeps track of the location in the string in u8c->s.
631*4882a593Smuzhiyun  * When a character is decomposed, the current location is stored in
632*4882a593Smuzhiyun  * u8c->p, and u8c->s is set to the start of the decomposition. Note
633*4882a593Smuzhiyun  * that bytes from a decomposition do not count against u8c->len.
634*4882a593Smuzhiyun  *
635*4882a593Smuzhiyun  * Characters are emitted if they match the current CCC in u8c->ccc.
636*4882a593Smuzhiyun  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
637*4882a593Smuzhiyun  * and the function returns 0 in that case.
638*4882a593Smuzhiyun  *
639*4882a593Smuzhiyun  * Sorting by CCC is done by repeatedly scanning the string.  The
640*4882a593Smuzhiyun  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
641*4882a593Smuzhiyun  * the start of the scan.  The first pass finds the lowest CCC to be
642*4882a593Smuzhiyun  * emitted and stores it in u8c->nccc, the second pass emits the
643*4882a593Smuzhiyun  * characters with this CCC and finds the next lowest CCC. This limits
644*4882a593Smuzhiyun  * the number of passes to 1 + the number of different CCCs in the
645*4882a593Smuzhiyun  * sequence being scanned.
646*4882a593Smuzhiyun  *
647*4882a593Smuzhiyun  * Therefore:
648*4882a593Smuzhiyun  *  u8c->p  != NULL -> a decomposition is being scanned.
649*4882a593Smuzhiyun  *  u8c->ss != NULL -> this is a repeating scan.
650*4882a593Smuzhiyun  *  u8c->ccc == -1   -> this is the first scan of a repeating scan.
651*4882a593Smuzhiyun  */
utf8byte(struct utf8cursor * u8c)652*4882a593Smuzhiyun int utf8byte(struct utf8cursor *u8c)
653*4882a593Smuzhiyun {
654*4882a593Smuzhiyun 	utf8leaf_t *leaf;
655*4882a593Smuzhiyun 	int ccc;
656*4882a593Smuzhiyun 
657*4882a593Smuzhiyun 	for (;;) {
658*4882a593Smuzhiyun 		/* Check for the end of a decomposed character. */
659*4882a593Smuzhiyun 		if (u8c->p && *u8c->s == '\0') {
660*4882a593Smuzhiyun 			u8c->s = u8c->p;
661*4882a593Smuzhiyun 			u8c->p = NULL;
662*4882a593Smuzhiyun 		}
663*4882a593Smuzhiyun 
664*4882a593Smuzhiyun 		/* Check for end-of-string. */
665*4882a593Smuzhiyun 		if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
666*4882a593Smuzhiyun 			/* There is no next byte. */
667*4882a593Smuzhiyun 			if (u8c->ccc == STOPPER)
668*4882a593Smuzhiyun 				return 0;
669*4882a593Smuzhiyun 			/* End-of-string during a scan counts as a stopper. */
670*4882a593Smuzhiyun 			ccc = STOPPER;
671*4882a593Smuzhiyun 			goto ccc_mismatch;
672*4882a593Smuzhiyun 		} else if ((*u8c->s & 0xC0) == 0x80) {
673*4882a593Smuzhiyun 			/* This is a continuation of the current character. */
674*4882a593Smuzhiyun 			if (!u8c->p)
675*4882a593Smuzhiyun 				u8c->len--;
676*4882a593Smuzhiyun 			return (unsigned char)*u8c->s++;
677*4882a593Smuzhiyun 		}
678*4882a593Smuzhiyun 
679*4882a593Smuzhiyun 		/* Look up the data for the current character. */
680*4882a593Smuzhiyun 		if (u8c->p) {
681*4882a593Smuzhiyun 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
682*4882a593Smuzhiyun 		} else {
683*4882a593Smuzhiyun 			leaf = utf8nlookup(u8c->data, u8c->hangul,
684*4882a593Smuzhiyun 					   u8c->s, u8c->len);
685*4882a593Smuzhiyun 		}
686*4882a593Smuzhiyun 
687*4882a593Smuzhiyun 		/* No leaf found implies that the input is a binary blob. */
688*4882a593Smuzhiyun 		if (!leaf)
689*4882a593Smuzhiyun 			return -1;
690*4882a593Smuzhiyun 
691*4882a593Smuzhiyun 		ccc = LEAF_CCC(leaf);
692*4882a593Smuzhiyun 		/* Characters that are too new have CCC 0. */
693*4882a593Smuzhiyun 		if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) {
694*4882a593Smuzhiyun 			ccc = STOPPER;
695*4882a593Smuzhiyun 		} else if (ccc == DECOMPOSE) {
696*4882a593Smuzhiyun 			u8c->len -= utf8clen(u8c->s);
697*4882a593Smuzhiyun 			u8c->p = u8c->s + utf8clen(u8c->s);
698*4882a593Smuzhiyun 			u8c->s = LEAF_STR(leaf);
699*4882a593Smuzhiyun 			/* Empty decomposition implies CCC 0. */
700*4882a593Smuzhiyun 			if (*u8c->s == '\0') {
701*4882a593Smuzhiyun 				if (u8c->ccc == STOPPER)
702*4882a593Smuzhiyun 					continue;
703*4882a593Smuzhiyun 				ccc = STOPPER;
704*4882a593Smuzhiyun 				goto ccc_mismatch;
705*4882a593Smuzhiyun 			}
706*4882a593Smuzhiyun 
707*4882a593Smuzhiyun 			leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
708*4882a593Smuzhiyun 			if (!leaf)
709*4882a593Smuzhiyun 				return -1;
710*4882a593Smuzhiyun 			ccc = LEAF_CCC(leaf);
711*4882a593Smuzhiyun 		}
712*4882a593Smuzhiyun 
713*4882a593Smuzhiyun 		/*
714*4882a593Smuzhiyun 		 * If this is not a stopper, then see if it updates
715*4882a593Smuzhiyun 		 * the next canonical class to be emitted.
716*4882a593Smuzhiyun 		 */
717*4882a593Smuzhiyun 		if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
718*4882a593Smuzhiyun 			u8c->nccc = ccc;
719*4882a593Smuzhiyun 
720*4882a593Smuzhiyun 		/*
721*4882a593Smuzhiyun 		 * Return the current byte if this is the current
722*4882a593Smuzhiyun 		 * combining class.
723*4882a593Smuzhiyun 		 */
724*4882a593Smuzhiyun 		if (ccc == u8c->ccc) {
725*4882a593Smuzhiyun 			if (!u8c->p)
726*4882a593Smuzhiyun 				u8c->len--;
727*4882a593Smuzhiyun 			return (unsigned char)*u8c->s++;
728*4882a593Smuzhiyun 		}
729*4882a593Smuzhiyun 
730*4882a593Smuzhiyun 		/* Current combining class mismatch. */
731*4882a593Smuzhiyun ccc_mismatch:
732*4882a593Smuzhiyun 		if (u8c->nccc == STOPPER) {
733*4882a593Smuzhiyun 			/*
734*4882a593Smuzhiyun 			 * Scan forward for the first canonical class
735*4882a593Smuzhiyun 			 * to be emitted.  Save the position from
736*4882a593Smuzhiyun 			 * which to restart.
737*4882a593Smuzhiyun 			 */
738*4882a593Smuzhiyun 			u8c->ccc = MINCCC - 1;
739*4882a593Smuzhiyun 			u8c->nccc = ccc;
740*4882a593Smuzhiyun 			u8c->sp = u8c->p;
741*4882a593Smuzhiyun 			u8c->ss = u8c->s;
742*4882a593Smuzhiyun 			u8c->slen = u8c->len;
743*4882a593Smuzhiyun 			if (!u8c->p)
744*4882a593Smuzhiyun 				u8c->len -= utf8clen(u8c->s);
745*4882a593Smuzhiyun 			u8c->s += utf8clen(u8c->s);
746*4882a593Smuzhiyun 		} else if (ccc != STOPPER) {
747*4882a593Smuzhiyun 			/* Not a stopper, and not the ccc we're emitting. */
748*4882a593Smuzhiyun 			if (!u8c->p)
749*4882a593Smuzhiyun 				u8c->len -= utf8clen(u8c->s);
750*4882a593Smuzhiyun 			u8c->s += utf8clen(u8c->s);
751*4882a593Smuzhiyun 		} else if (u8c->nccc != MAXCCC + 1) {
752*4882a593Smuzhiyun 			/* At a stopper, restart for next ccc. */
753*4882a593Smuzhiyun 			u8c->ccc = u8c->nccc;
754*4882a593Smuzhiyun 			u8c->nccc = MAXCCC + 1;
755*4882a593Smuzhiyun 			u8c->s = u8c->ss;
756*4882a593Smuzhiyun 			u8c->p = u8c->sp;
757*4882a593Smuzhiyun 			u8c->len = u8c->slen;
758*4882a593Smuzhiyun 		} else {
759*4882a593Smuzhiyun 			/* All done, proceed from here. */
760*4882a593Smuzhiyun 			u8c->ccc = STOPPER;
761*4882a593Smuzhiyun 			u8c->nccc = STOPPER;
762*4882a593Smuzhiyun 			u8c->sp = NULL;
763*4882a593Smuzhiyun 			u8c->ss = NULL;
764*4882a593Smuzhiyun 			u8c->slen = 0;
765*4882a593Smuzhiyun 		}
766*4882a593Smuzhiyun 	}
767*4882a593Smuzhiyun }
768*4882a593Smuzhiyun EXPORT_SYMBOL(utf8byte);
769*4882a593Smuzhiyun 
utf8nfdi(unsigned int maxage)770*4882a593Smuzhiyun const struct utf8data *utf8nfdi(unsigned int maxage)
771*4882a593Smuzhiyun {
772*4882a593Smuzhiyun 	int i = ARRAY_SIZE(utf8nfdidata) - 1;
773*4882a593Smuzhiyun 
774*4882a593Smuzhiyun 	while (maxage < utf8nfdidata[i].maxage)
775*4882a593Smuzhiyun 		i--;
776*4882a593Smuzhiyun 	if (maxage > utf8nfdidata[i].maxage)
777*4882a593Smuzhiyun 		return NULL;
778*4882a593Smuzhiyun 	return &utf8nfdidata[i];
779*4882a593Smuzhiyun }
780*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nfdi);
781*4882a593Smuzhiyun 
utf8nfdicf(unsigned int maxage)782*4882a593Smuzhiyun const struct utf8data *utf8nfdicf(unsigned int maxage)
783*4882a593Smuzhiyun {
784*4882a593Smuzhiyun 	int i = ARRAY_SIZE(utf8nfdicfdata) - 1;
785*4882a593Smuzhiyun 
786*4882a593Smuzhiyun 	while (maxage < utf8nfdicfdata[i].maxage)
787*4882a593Smuzhiyun 		i--;
788*4882a593Smuzhiyun 	if (maxage > utf8nfdicfdata[i].maxage)
789*4882a593Smuzhiyun 		return NULL;
790*4882a593Smuzhiyun 	return &utf8nfdicfdata[i];
791*4882a593Smuzhiyun }
792*4882a593Smuzhiyun EXPORT_SYMBOL(utf8nfdicf);
793