xref: /OK3568_Linux_fs/kernel/fs/unicode/utf8-selftest.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-only
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Kernel module for testing utf-8 support.
4*4882a593Smuzhiyun  *
5*4882a593Smuzhiyun  * Copyright 2017 Collabora Ltd.
6*4882a593Smuzhiyun  */
7*4882a593Smuzhiyun 
8*4882a593Smuzhiyun #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9*4882a593Smuzhiyun 
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/stringify.h>
#include <linux/unicode.h>
#include <linux/dcache.h>
14*4882a593Smuzhiyun 
15*4882a593Smuzhiyun #include "utf8n.h"
16*4882a593Smuzhiyun 
/*
 * Running counters updated by the _test() macro and reported by
 * init_test_ucd().  They are only used inside this file, so keep them
 * static instead of exporting two generically named global symbols.
 */
static unsigned int failed_tests;
static unsigned int total_tests;
19*4882a593Smuzhiyun 
/*
 * Tests will be based on this version: the major, minor and revision
 * numbers below are what the checks in this file pass to UNICODE_AGE()
 * and utf8version_is_supported(), and what they print when a table
 * cannot be loaded.
 */
#define latest_maj 12
#define latest_min 1
#define latest_rev 0
24*4882a593Smuzhiyun 
/*
 * _test() - record the outcome of a single check.
 * @cond: expression expected to be true; its source text is logged on failure
 * @func: name of the calling function, for the failure message
 * @line: source line of the check, for the failure message
 * @fmt:  string literal holding an optional printf-style detail message
 *        (it should end in a newline); pass "" when there is no detail
 * @...:  arguments for @fmt
 *
 * Every invocation increments total_tests.  When @cond is false,
 * failed_tests is incremented as well and one log line is emitted.
 *
 * @fmt is always a string literal, hence never NULL, so "is there a detail
 * message?" has to be decided by looking at its first character.  The
 * detail is appended with pr_cont() so that it lands on the same line as
 * the failure header instead of starting a second, separately prefixed
 * record.
 */
#define _test(cond, func, line, fmt, ...) do {				\
		total_tests++;						\
		if (!(cond)) {						\
			failed_tests++;					\
			pr_err("test %s:%d Failed: %s%s",		\
			       func, line, #cond,			\
			       ((fmt)[0] ? ": " : ".\n"));		\
			if ((fmt)[0])					\
				pr_cont(fmt, ##__VA_ARGS__);		\
		}							\
	} while (0)
/* test_f() - check @cond and, on failure, log the formatted detail message. */
#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
/* test() - check @cond with no detail message. */
#define test(cond) _test(cond, __func__, __LINE__, "")
37*4882a593Smuzhiyun 
/*
 * Decomposition test vectors.  For each entry .str is the UTF-8 input and
 * .dec is the byte sequence the nfdi checks in this file expect to get
 * back for it.  Both fields are fixed 10-byte buffers.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[10];
	unsigned char dec[10];
} nfdi_test_data[] = {
	/* Trivial sequence */
	{
		/* "aBba" decomposes to itself */
		.str = "aBba",
		.dec = "aBba",
	},
	/* Simple equivalent sequences */
	{
		/* 'VULGAR FRACTION ONE QUARTER' cannot decompose to
		   'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
		   canonical decomposition */
               .str = {0xc2, 0xbc, 0x00},
	       .dec = {0xc2, 0xbc, 0x00},
	},
	{
		/* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
		   'LETTER A' + 'COMBINING DIAERESIS' */
		.str = {0xc3, 0xa4, 0x00},
		.dec = {0x61, 0xcc, 0x88, 0x00},
	},
	{
		/* 'LATIN SMALL LETTER LJ' can't decompose to
		   'LETTER L' + 'LETTER J' on canonical decomposition */
		.str = {0xC7, 0x89, 0x00},
		.dec = {0xC7, 0x89, 0x00},
	},
	{
		/* GREEK ANO TELEIA decomposes to MIDDLE DOT */
		.str = {0xCE, 0x87, 0x00},
		.dec = {0xC2, 0xB7, 0x00}
	},
	/* Canonical ordering */
	{
		/* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes
		   to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */
		.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0},
		.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0},
	},
	{
		/* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
		   decomposes to
		   'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */
		.str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00},

		.dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00},
	},

};
91*4882a593Smuzhiyun 
/*
 * Case-folding test vectors.  For each entry .str is the UTF-8 input and
 * .ncf is the byte sequence the nfdicf checks in this file expect to get
 * back for it.  Both fields are fixed 30-byte buffers.
 */
static const struct {
	/* UTF-8 strings in this vector _must_ be NUL-terminated. */
	unsigned char str[30];
	unsigned char ncf[30];
} nfdicf_test_data[] = {
	/* Trivial sequences */
	{
		/* "ABba" folds to lowercase */
		.str = {0x41, 0x42, 0x62, 0x61, 0x00},
		.ncf = {0x61, 0x62, 0x62, 0x61, 0x00},
	},
	{
		/* All ASCII folds to lower-case.  These strings are 29
		   characters long, so with the terminating NUL they fill
		   the 30-byte buffers completely. */
		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
	},
	{
		/* LATIN SMALL LETTER SHARP S folds to
		   LATIN SMALL LETTER S + LATIN SMALL LETTER S */
		.str = {0xc3, 0x9f, 0x00},
		.ncf = {0x73, 0x73, 0x00},
	},
	{
		/* LATIN CAPITAL LETTER A WITH RING ABOVE folds to
		   LATIN SMALL LETTER A + COMBINING RING ABOVE */
		.str = {0xC3, 0x85, 0x00},
		.ncf = {0x61, 0xcc, 0x8a, 0x00},
	},
	/* Introduced by Unicode 8.0.0. */
	/* Cherokee letters are interesting test-cases because they fold
	   to upper-case.  Before 8.0.0, Cherokee lowercase were
	   undefined, thus, the folding from LC is not stable between
	   7.0.0 -> 8.0.0, but it is from UC. */
	{
		/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
		.str = {0xea, 0xad, 0xb0, 0x00},
		.ncf = {0xe1, 0x8e, 0xa0, 0x00},
	},
	{
		/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
		.str = {0xe1, 0x8f, 0xb8, 0x00},
		.ncf = {0xe1, 0x8f, 0xb0, 0x00},
	},
	{
		/* OLD HUNGARIAN CAPITAL LETTER AMB folds to
		   OLD HUNGARIAN SMALL LETTER AMB */
		.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
		.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
	},
	/* Introduced by Unicode 9.0.0. */
	{
		/* OSAGE CAPITAL LETTER CHA folds to
		   OSAGE SMALL LETTER CHA */
		.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
		.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
	},
	{
		/* LATIN CAPITAL LETTER SMALL CAPITAL I folds to
		   LATIN LETTER SMALL CAPITAL I */
		.str = {0xea, 0x9e, 0xae, 0x00},
		.ncf = {0xc9, 0xaa, 0x00},
	},
	/* Introduced by Unicode 11.0.0. */
	{
		/* GEORGIAN MTAVRULI CAPITAL LETTER AN (U+1C90) folds to
		   GEORGIAN LETTER AN (U+10D0) */
		.str = {0xe1, 0xb2, 0x90, 0x00},
		.ncf = {0xe1, 0x83, 0x90, 0x00},
	}
};
162*4882a593Smuzhiyun 
check_utf8_nfdi(void)163*4882a593Smuzhiyun static void check_utf8_nfdi(void)
164*4882a593Smuzhiyun {
165*4882a593Smuzhiyun 	int i;
166*4882a593Smuzhiyun 	struct utf8cursor u8c;
167*4882a593Smuzhiyun 	const struct utf8data *data;
168*4882a593Smuzhiyun 
169*4882a593Smuzhiyun 	data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev));
170*4882a593Smuzhiyun 	if (!data) {
171*4882a593Smuzhiyun 		pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
172*4882a593Smuzhiyun 		       __func__, latest_maj, latest_min, latest_rev);
173*4882a593Smuzhiyun 		return;
174*4882a593Smuzhiyun 	}
175*4882a593Smuzhiyun 
176*4882a593Smuzhiyun 	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
177*4882a593Smuzhiyun 		int len = strlen(nfdi_test_data[i].str);
178*4882a593Smuzhiyun 		int nlen = strlen(nfdi_test_data[i].dec);
179*4882a593Smuzhiyun 		int j = 0;
180*4882a593Smuzhiyun 		unsigned char c;
181*4882a593Smuzhiyun 
182*4882a593Smuzhiyun 		test((utf8len(data, nfdi_test_data[i].str) == nlen));
183*4882a593Smuzhiyun 		test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen));
184*4882a593Smuzhiyun 
185*4882a593Smuzhiyun 		if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0)
186*4882a593Smuzhiyun 			pr_err("can't create cursor\n");
187*4882a593Smuzhiyun 
188*4882a593Smuzhiyun 		while ((c = utf8byte(&u8c)) > 0) {
189*4882a593Smuzhiyun 			test_f((c == nfdi_test_data[i].dec[j]),
190*4882a593Smuzhiyun 			       "Unexpected byte 0x%x should be 0x%x\n",
191*4882a593Smuzhiyun 			       c, nfdi_test_data[i].dec[j]);
192*4882a593Smuzhiyun 			j++;
193*4882a593Smuzhiyun 		}
194*4882a593Smuzhiyun 
195*4882a593Smuzhiyun 		test((j == nlen));
196*4882a593Smuzhiyun 	}
197*4882a593Smuzhiyun }
198*4882a593Smuzhiyun 
check_utf8_nfdicf(void)199*4882a593Smuzhiyun static void check_utf8_nfdicf(void)
200*4882a593Smuzhiyun {
201*4882a593Smuzhiyun 	int i;
202*4882a593Smuzhiyun 	struct utf8cursor u8c;
203*4882a593Smuzhiyun 	const struct utf8data *data;
204*4882a593Smuzhiyun 
205*4882a593Smuzhiyun 	data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev));
206*4882a593Smuzhiyun 	if (!data) {
207*4882a593Smuzhiyun 		pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n",
208*4882a593Smuzhiyun 		       __func__, latest_maj, latest_min, latest_rev);
209*4882a593Smuzhiyun 		return;
210*4882a593Smuzhiyun 	}
211*4882a593Smuzhiyun 
212*4882a593Smuzhiyun 	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
213*4882a593Smuzhiyun 		int len = strlen(nfdicf_test_data[i].str);
214*4882a593Smuzhiyun 		int nlen = strlen(nfdicf_test_data[i].ncf);
215*4882a593Smuzhiyun 		int j = 0;
216*4882a593Smuzhiyun 		unsigned char c;
217*4882a593Smuzhiyun 
218*4882a593Smuzhiyun 		test((utf8len(data, nfdicf_test_data[i].str) == nlen));
219*4882a593Smuzhiyun 		test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen));
220*4882a593Smuzhiyun 
221*4882a593Smuzhiyun 		if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0)
222*4882a593Smuzhiyun 			pr_err("can't create cursor\n");
223*4882a593Smuzhiyun 
224*4882a593Smuzhiyun 		while ((c = utf8byte(&u8c)) > 0) {
225*4882a593Smuzhiyun 			test_f((c == nfdicf_test_data[i].ncf[j]),
226*4882a593Smuzhiyun 			       "Unexpected byte 0x%x should be 0x%x\n",
227*4882a593Smuzhiyun 			       c, nfdicf_test_data[i].ncf[j]);
228*4882a593Smuzhiyun 			j++;
229*4882a593Smuzhiyun 		}
230*4882a593Smuzhiyun 
231*4882a593Smuzhiyun 		test((j == nlen));
232*4882a593Smuzhiyun 	}
233*4882a593Smuzhiyun }
234*4882a593Smuzhiyun 
check_utf8_comparisons(void)235*4882a593Smuzhiyun static void check_utf8_comparisons(void)
236*4882a593Smuzhiyun {
237*4882a593Smuzhiyun 	int i;
238*4882a593Smuzhiyun 	struct unicode_map *table = utf8_load("12.1.0");
239*4882a593Smuzhiyun 
240*4882a593Smuzhiyun 	if (IS_ERR(table)) {
241*4882a593Smuzhiyun 		pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n",
242*4882a593Smuzhiyun 		       __func__, latest_maj, latest_min, latest_rev);
243*4882a593Smuzhiyun 		return;
244*4882a593Smuzhiyun 	}
245*4882a593Smuzhiyun 
246*4882a593Smuzhiyun 	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
247*4882a593Smuzhiyun 		const struct qstr s1 = {.name = nfdi_test_data[i].str,
248*4882a593Smuzhiyun 					.len = sizeof(nfdi_test_data[i].str)};
249*4882a593Smuzhiyun 		const struct qstr s2 = {.name = nfdi_test_data[i].dec,
250*4882a593Smuzhiyun 					.len = sizeof(nfdi_test_data[i].dec)};
251*4882a593Smuzhiyun 
252*4882a593Smuzhiyun 		test_f(!utf8_strncmp(table, &s1, &s2),
253*4882a593Smuzhiyun 		       "%s %s comparison mismatch\n", s1.name, s2.name);
254*4882a593Smuzhiyun 	}
255*4882a593Smuzhiyun 
256*4882a593Smuzhiyun 	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
257*4882a593Smuzhiyun 		const struct qstr s1 = {.name = nfdicf_test_data[i].str,
258*4882a593Smuzhiyun 					.len = sizeof(nfdicf_test_data[i].str)};
259*4882a593Smuzhiyun 		const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
260*4882a593Smuzhiyun 					.len = sizeof(nfdicf_test_data[i].ncf)};
261*4882a593Smuzhiyun 
262*4882a593Smuzhiyun 		test_f(!utf8_strncasecmp(table, &s1, &s2),
263*4882a593Smuzhiyun 		       "%s %s comparison mismatch\n", s1.name, s2.name);
264*4882a593Smuzhiyun 	}
265*4882a593Smuzhiyun 
266*4882a593Smuzhiyun 	utf8_unload(table);
267*4882a593Smuzhiyun }
268*4882a593Smuzhiyun 
/*
 * check_supported_versions() - sanity-check utf8version_is_supported().
 *
 * Three versions are expected to be reported as supported and three as
 * unsupported; each expectation is counted as one test.
 */
static void check_supported_versions(void)
{
	/* Unicode 7.0.0 should be supported. */
	test(utf8version_is_supported(7, 0, 0));

	/* Unicode 9.0.0 should be supported. */
	test(utf8version_is_supported(9, 0, 0));

	/* The latest version (latest_maj.latest_min.latest_rev) should be
	   supported. */
	test(utf8version_is_supported(latest_maj, latest_min, latest_rev));

	/* Next versions don't exist. */
	test(!utf8version_is_supported(13, 0, 0));
	/* Neither does version 0.0.0 ... */
	test(!utf8version_is_supported(0, 0, 0));
	/* ... nor an out-of-range one.  NOTE(review): how -1 is converted
	   depends on the parameter types of utf8version_is_supported(),
	   which are not visible in this file. */
	test(!utf8version_is_supported(-1, -1, -1));
}
285*4882a593Smuzhiyun 
init_test_ucd(void)286*4882a593Smuzhiyun static int __init init_test_ucd(void)
287*4882a593Smuzhiyun {
288*4882a593Smuzhiyun 	failed_tests = 0;
289*4882a593Smuzhiyun 	total_tests = 0;
290*4882a593Smuzhiyun 
291*4882a593Smuzhiyun 	check_supported_versions();
292*4882a593Smuzhiyun 	check_utf8_nfdi();
293*4882a593Smuzhiyun 	check_utf8_nfdicf();
294*4882a593Smuzhiyun 	check_utf8_comparisons();
295*4882a593Smuzhiyun 
296*4882a593Smuzhiyun 	if (!failed_tests)
297*4882a593Smuzhiyun 		pr_info("All %u tests passed\n", total_tests);
298*4882a593Smuzhiyun 	else
299*4882a593Smuzhiyun 		pr_err("%u out of %u tests failed\n", failed_tests,
300*4882a593Smuzhiyun 		       total_tests);
301*4882a593Smuzhiyun 	return 0;
302*4882a593Smuzhiyun }
303*4882a593Smuzhiyun 
/* exit_test_ucd() - module exit hook; intentionally empty. */
static void __exit exit_test_ucd(void)
{
}
307*4882a593Smuzhiyun 
/* Register the entry and exit hooks: the tests run from init_test_ucd(). */
module_init(init_test_ucd);
module_exit(exit_test_ucd);

/* Module metadata. */
MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
MODULE_LICENSE("GPL");
313