xref: /OK3568_Linux_fs/kernel/fs/unicode/utf8n.h (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun /* SPDX-License-Identifier: GPL-2.0-only */
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Copyright (c) 2014 SGI.
4*4882a593Smuzhiyun  * All rights reserved.
5*4882a593Smuzhiyun  */
6*4882a593Smuzhiyun 
7*4882a593Smuzhiyun #ifndef UTF8NORM_H
8*4882a593Smuzhiyun #define UTF8NORM_H
9*4882a593Smuzhiyun 
10*4882a593Smuzhiyun #include <linux/types.h>
11*4882a593Smuzhiyun #include <linux/export.h>
12*4882a593Smuzhiyun #include <linux/string.h>
13*4882a593Smuzhiyun #include <linux/module.h>
14*4882a593Smuzhiyun 
15*4882a593Smuzhiyun /*
 * Encoding a unicode version number as a single unsigned int.
 *
 * UNICODE_AGE(MAJ, MIN, REV) casts each argument to unsigned int, shifts
 * MAJ left by UNICODE_MAJ_SHIFT (16) and MIN left by UNICODE_MIN_SHIFT (8),
 * leaves REV unshifted, and ORs the three together.
 *
 * The macro does not mask its arguments, so the fields only stay disjoint
 * when MIN and REV each fit in 8 bits.
 */
16*4882a593Smuzhiyun #define UNICODE_MAJ_SHIFT		(16)
17*4882a593Smuzhiyun #define UNICODE_MIN_SHIFT		(8)
18*4882a593Smuzhiyun 
19*4882a593Smuzhiyun #define UNICODE_AGE(MAJ, MIN, REV)			\
20*4882a593Smuzhiyun 	(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |	\
21*4882a593Smuzhiyun 	 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |	\
22*4882a593Smuzhiyun 	 ((unsigned int)(REV)))
23*4882a593Smuzhiyun 
24*4882a593Smuzhiyun /* Unicode versions supported by the data tables: test one, or get the highest. */
25*4882a593Smuzhiyun extern int utf8version_is_supported(u8 maj, u8 min, u8 rev);
26*4882a593Smuzhiyun extern int utf8version_latest(void);
27*4882a593Smuzhiyun 
28*4882a593Smuzhiyun /*
29*4882a593Smuzhiyun  * Look for the correct const struct utf8data for a unicode version.
30*4882a593Smuzhiyun  * Returns NULL if the version requested is too new.
31*4882a593Smuzhiyun  *
32*4882a593Smuzhiyun  * Two normalization forms are supported: nfdi and nfdicf.
33*4882a593Smuzhiyun  *
34*4882a593Smuzhiyun  * nfdi:
35*4882a593Smuzhiyun  *  - Apply unicode normalization form NFD.
36*4882a593Smuzhiyun  *  - Remove any Default_Ignorable_Code_Point.
37*4882a593Smuzhiyun  *
38*4882a593Smuzhiyun  * nfdicf:
39*4882a593Smuzhiyun  *  - Apply unicode normalization form NFD.
40*4882a593Smuzhiyun  *  - Remove any Default_Ignorable_Code_Point.
41*4882a593Smuzhiyun  *  - Apply a full casefold (C + F).
42*4882a593Smuzhiyun  */
43*4882a593Smuzhiyun extern const struct utf8data *utf8nfdi(unsigned int maxage);
44*4882a593Smuzhiyun extern const struct utf8data *utf8nfdicf(unsigned int maxage);
45*4882a593Smuzhiyun 
46*4882a593Smuzhiyun /*
47*4882a593Smuzhiyun  * Determine the maximum age of any unicode character in the string.
48*4882a593Smuzhiyun  * Returns 0 if only unassigned code points are present.
49*4882a593Smuzhiyun  * Returns -1 if the input is not valid UTF-8.
50*4882a593Smuzhiyun  */
51*4882a593Smuzhiyun extern int utf8agemax(const struct utf8data *data, const char *s);
52*4882a593Smuzhiyun extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len);
53*4882a593Smuzhiyun 
54*4882a593Smuzhiyun /*
55*4882a593Smuzhiyun  * Determine the minimum age of any unicode character in the string.
56*4882a593Smuzhiyun  * Returns 0 if any unassigned code points are present.
57*4882a593Smuzhiyun  * Returns -1 if the input is not valid UTF-8.
58*4882a593Smuzhiyun  */
59*4882a593Smuzhiyun extern int utf8agemin(const struct utf8data *data, const char *s);
60*4882a593Smuzhiyun extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
61*4882a593Smuzhiyun 
62*4882a593Smuzhiyun /*
63*4882a593Smuzhiyun  * Determine the length of the normalized form of the string,
64*4882a593Smuzhiyun  * excluding any terminating NUL byte.
65*4882a593Smuzhiyun  * Returns 0 if only ignorable code points are present.
66*4882a593Smuzhiyun  * Returns -1 if the input is not valid UTF-8.
67*4882a593Smuzhiyun  */
68*4882a593Smuzhiyun extern ssize_t utf8len(const struct utf8data *data, const char *s);
69*4882a593Smuzhiyun extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
70*4882a593Smuzhiyun 
71*4882a593Smuzhiyun /* Needed in struct utf8cursor below. */
72*4882a593Smuzhiyun #define UTF8HANGULLEAF	(12)
73*4882a593Smuzhiyun 
74*4882a593Smuzhiyun /*
75*4882a593Smuzhiyun  * Cursor structure used by the normalizer.
 *
 * A cursor is set up by utf8cursor()/utf8ncursor() and then handed to
 * utf8byte() repeatedly to pull out the normalization one byte at a time.
 *
 * The field roles below are not established by this header; they are
 * reviewer notes to be confirmed against the implementation:
 *   data       - presumably the table passed to utf8cursor()/utf8ncursor().
 *   s, p       - presumably positions in the input and in the data being
 *                emitted; TODO confirm which is which.
 *   ss, sp     - look like saved copies of s and p; TODO confirm.
 *   len, slen  - presumably a remaining length and its saved copy. Note
 *                these are unsigned int while utf8ncursor() takes a
 *                size_t len.
 *   ccc, nccc  - presumably combining-class values used for ordering;
 *                TODO confirm.
 *   hangul     - byte buffer of UTF8HANGULLEAF (12) entries; presumably
 *                scratch space for Hangul handling, TODO confirm.
76*4882a593Smuzhiyun  */
77*4882a593Smuzhiyun struct utf8cursor {
78*4882a593Smuzhiyun 	const struct utf8data	*data;
79*4882a593Smuzhiyun 	const char	*s;
80*4882a593Smuzhiyun 	const char	*p;
81*4882a593Smuzhiyun 	const char	*ss;
82*4882a593Smuzhiyun 	const char	*sp;
83*4882a593Smuzhiyun 	unsigned int	len;
84*4882a593Smuzhiyun 	unsigned int	slen;
85*4882a593Smuzhiyun 	short int	ccc;
86*4882a593Smuzhiyun 	short int	nccc;
87*4882a593Smuzhiyun 	unsigned char	hangul[UTF8HANGULLEAF];
88*4882a593Smuzhiyun };
89*4882a593Smuzhiyun 
90*4882a593Smuzhiyun /*
91*4882a593Smuzhiyun  * Initialize a utf8cursor to normalize a string.
92*4882a593Smuzhiyun  * Returns 0 on success.
93*4882a593Smuzhiyun  * Returns -1 on failure.
94*4882a593Smuzhiyun  */
95*4882a593Smuzhiyun extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data,
96*4882a593Smuzhiyun 		      const char *s);
97*4882a593Smuzhiyun extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data,
98*4882a593Smuzhiyun 		       const char *s, size_t len);
99*4882a593Smuzhiyun 
100*4882a593Smuzhiyun /*
101*4882a593Smuzhiyun  * Get the next byte in the normalization.
102*4882a593Smuzhiyun  * Returns a value > 0 && < 256 on success.
103*4882a593Smuzhiyun  * Returns 0 when the end of the normalization is reached.
104*4882a593Smuzhiyun  * Returns -1 if the string being normalized is not valid UTF-8.
105*4882a593Smuzhiyun  */
106*4882a593Smuzhiyun extern int utf8byte(struct utf8cursor *u8c);
107*4882a593Smuzhiyun 
108*4882a593Smuzhiyun #endif /* UTF8NORM_H */
109