1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun * Copyright (c) 2014 SGI.
3*4882a593Smuzhiyun * All rights reserved.
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * This program is free software; you can redistribute it and/or
6*4882a593Smuzhiyun * modify it under the terms of the GNU General Public License as
7*4882a593Smuzhiyun * published by the Free Software Foundation.
8*4882a593Smuzhiyun *
9*4882a593Smuzhiyun * This program is distributed in the hope that it would be useful,
10*4882a593Smuzhiyun * but WITHOUT ANY WARRANTY; without even the implied warranty of
11*4882a593Smuzhiyun * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12*4882a593Smuzhiyun * GNU General Public License for more details.
13*4882a593Smuzhiyun *
14*4882a593Smuzhiyun * You should have received a copy of the GNU General Public License
15*4882a593Smuzhiyun * along with this program; if not, write the Free Software Foundation,
16*4882a593Smuzhiyun * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*4882a593Smuzhiyun */
18*4882a593Smuzhiyun
19*4882a593Smuzhiyun /* Generator for a compact trie for unicode normalization */
20*4882a593Smuzhiyun
21*4882a593Smuzhiyun #include <sys/types.h>
22*4882a593Smuzhiyun #include <stddef.h>
23*4882a593Smuzhiyun #include <stdlib.h>
24*4882a593Smuzhiyun #include <stdio.h>
25*4882a593Smuzhiyun #include <assert.h>
26*4882a593Smuzhiyun #include <string.h>
27*4882a593Smuzhiyun #include <unistd.h>
28*4882a593Smuzhiyun #include <errno.h>
29*4882a593Smuzhiyun
30*4882a593Smuzhiyun /* Default names of the in- and output files. */
31*4882a593Smuzhiyun
32*4882a593Smuzhiyun #define AGE_NAME "DerivedAge.txt"
33*4882a593Smuzhiyun #define CCC_NAME "DerivedCombiningClass.txt"
34*4882a593Smuzhiyun #define PROP_NAME "DerivedCoreProperties.txt"
35*4882a593Smuzhiyun #define DATA_NAME "UnicodeData.txt"
36*4882a593Smuzhiyun #define FOLD_NAME "CaseFolding.txt"
37*4882a593Smuzhiyun #define NORM_NAME "NormalizationCorrections.txt"
38*4882a593Smuzhiyun #define TEST_NAME "NormalizationTest.txt"
39*4882a593Smuzhiyun #define UTF8_NAME "utf8data.h"
40*4882a593Smuzhiyun
41*4882a593Smuzhiyun const char *age_name = AGE_NAME;
42*4882a593Smuzhiyun const char *ccc_name = CCC_NAME;
43*4882a593Smuzhiyun const char *prop_name = PROP_NAME;
44*4882a593Smuzhiyun const char *data_name = DATA_NAME;
45*4882a593Smuzhiyun const char *fold_name = FOLD_NAME;
46*4882a593Smuzhiyun const char *norm_name = NORM_NAME;
47*4882a593Smuzhiyun const char *test_name = TEST_NAME;
48*4882a593Smuzhiyun const char *utf8_name = UTF8_NAME;
49*4882a593Smuzhiyun
50*4882a593Smuzhiyun int verbose = 0;
51*4882a593Smuzhiyun
52*4882a593Smuzhiyun /* An arbitrary line size limit on input lines. */
53*4882a593Smuzhiyun
54*4882a593Smuzhiyun #define LINESIZE 1024
55*4882a593Smuzhiyun char line[LINESIZE];
56*4882a593Smuzhiyun char buf0[LINESIZE];
57*4882a593Smuzhiyun char buf1[LINESIZE];
58*4882a593Smuzhiyun char buf2[LINESIZE];
59*4882a593Smuzhiyun char buf3[LINESIZE];
60*4882a593Smuzhiyun
61*4882a593Smuzhiyun const char *argv0;
62*4882a593Smuzhiyun
63*4882a593Smuzhiyun #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
64*4882a593Smuzhiyun
65*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
66*4882a593Smuzhiyun
67*4882a593Smuzhiyun /*
68*4882a593Smuzhiyun * Unicode version numbers consist of three parts: major, minor, and a
69*4882a593Smuzhiyun * revision. These numbers are packed into an unsigned int to obtain
70*4882a593Smuzhiyun * a single version number.
71*4882a593Smuzhiyun *
72*4882a593Smuzhiyun * To save space in the generated trie, the unicode version is not
73*4882a593Smuzhiyun * stored directly, instead we calculate a generation number from the
74*4882a593Smuzhiyun * unicode versions seen in the DerivedAge file, and use that as an
75*4882a593Smuzhiyun * index into a table of unicode versions.
76*4882a593Smuzhiyun */
77*4882a593Smuzhiyun #define UNICODE_MAJ_SHIFT (16)
78*4882a593Smuzhiyun #define UNICODE_MIN_SHIFT (8)
79*4882a593Smuzhiyun
80*4882a593Smuzhiyun #define UNICODE_MAJ_MAX ((unsigned short)-1)
81*4882a593Smuzhiyun #define UNICODE_MIN_MAX ((unsigned char)-1)
82*4882a593Smuzhiyun #define UNICODE_REV_MAX ((unsigned char)-1)
83*4882a593Smuzhiyun
84*4882a593Smuzhiyun #define UNICODE_AGE(MAJ,MIN,REV) \
85*4882a593Smuzhiyun (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
86*4882a593Smuzhiyun ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
87*4882a593Smuzhiyun ((unsigned int)(REV)))
88*4882a593Smuzhiyun
89*4882a593Smuzhiyun unsigned int *ages;
90*4882a593Smuzhiyun int ages_count;
91*4882a593Smuzhiyun
92*4882a593Smuzhiyun unsigned int unicode_maxage;
93*4882a593Smuzhiyun
age_valid(unsigned int major,unsigned int minor,unsigned int revision)94*4882a593Smuzhiyun static int age_valid(unsigned int major, unsigned int minor,
95*4882a593Smuzhiyun unsigned int revision)
96*4882a593Smuzhiyun {
97*4882a593Smuzhiyun if (major > UNICODE_MAJ_MAX)
98*4882a593Smuzhiyun return 0;
99*4882a593Smuzhiyun if (minor > UNICODE_MIN_MAX)
100*4882a593Smuzhiyun return 0;
101*4882a593Smuzhiyun if (revision > UNICODE_REV_MAX)
102*4882a593Smuzhiyun return 0;
103*4882a593Smuzhiyun return 1;
104*4882a593Smuzhiyun }
105*4882a593Smuzhiyun
106*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
107*4882a593Smuzhiyun
108*4882a593Smuzhiyun /*
109*4882a593Smuzhiyun * utf8trie_t
110*4882a593Smuzhiyun *
111*4882a593Smuzhiyun * A compact binary tree, used to decode UTF-8 characters.
112*4882a593Smuzhiyun *
113*4882a593Smuzhiyun * Internal nodes are one byte for the node itself, and up to three
114*4882a593Smuzhiyun * bytes for an offset into the tree. The first byte contains the
115*4882a593Smuzhiyun * following information:
116*4882a593Smuzhiyun * NEXTBYTE - flag - advance to next byte if set
117*4882a593Smuzhiyun * BITNUM - 3 bit field - the bit number to tested
118*4882a593Smuzhiyun * OFFLEN - 2 bit field - number of bytes in the offset
119*4882a593Smuzhiyun * if offlen == 0 (non-branching node)
120*4882a593Smuzhiyun * RIGHTPATH - 1 bit field - set if the following node is for the
121*4882a593Smuzhiyun * right-hand path (tested bit is set)
122*4882a593Smuzhiyun * TRIENODE - 1 bit field - set if the following node is an internal
123*4882a593Smuzhiyun * node, otherwise it is a leaf node
124*4882a593Smuzhiyun * if offlen != 0 (branching node)
125*4882a593Smuzhiyun * LEFTNODE - 1 bit field - set if the left-hand node is internal
126*4882a593Smuzhiyun * RIGHTNODE - 1 bit field - set if the right-hand node is internal
127*4882a593Smuzhiyun *
128*4882a593Smuzhiyun * Due to the way utf8 works, there cannot be branching nodes with
129*4882a593Smuzhiyun * NEXTBYTE set, and moreover those nodes always have a righthand
130*4882a593Smuzhiyun * descendant.
131*4882a593Smuzhiyun */
132*4882a593Smuzhiyun typedef unsigned char utf8trie_t;
133*4882a593Smuzhiyun #define BITNUM 0x07
134*4882a593Smuzhiyun #define NEXTBYTE 0x08
135*4882a593Smuzhiyun #define OFFLEN 0x30
136*4882a593Smuzhiyun #define OFFLEN_SHIFT 4
137*4882a593Smuzhiyun #define RIGHTPATH 0x40
138*4882a593Smuzhiyun #define TRIENODE 0x80
139*4882a593Smuzhiyun #define RIGHTNODE 0x40
140*4882a593Smuzhiyun #define LEFTNODE 0x80
141*4882a593Smuzhiyun
142*4882a593Smuzhiyun /*
143*4882a593Smuzhiyun * utf8leaf_t
144*4882a593Smuzhiyun *
145*4882a593Smuzhiyun * The leaves of the trie are embedded in the trie, and so the same
146*4882a593Smuzhiyun * underlying datatype, unsigned char.
147*4882a593Smuzhiyun *
148*4882a593Smuzhiyun * leaf[0]: The unicode version, stored as a generation number that is
149*4882a593Smuzhiyun * an index into utf8agetab[]. With this we can filter code
150*4882a593Smuzhiyun * points based on the unicode version in which they were
151*4882a593Smuzhiyun * defined. The CCC of a non-defined code point is 0.
152*4882a593Smuzhiyun * leaf[1]: Canonical Combining Class. During normalization, we need
153*4882a593Smuzhiyun * to do a stable sort into ascending order of all characters
154*4882a593Smuzhiyun * with a non-zero CCC that occur between two characters with
155*4882a593Smuzhiyun * a CCC of 0, or at the begin or end of a string.
156*4882a593Smuzhiyun * The unicode standard guarantees that all CCC values are
157*4882a593Smuzhiyun * between 0 and 254 inclusive, which leaves 255 available as
158*4882a593Smuzhiyun * a special value.
159*4882a593Smuzhiyun * Code points with CCC 0 are known as stoppers.
160*4882a593Smuzhiyun * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
161*4882a593Smuzhiyun * start of a NUL-terminated string that is the decomposition
162*4882a593Smuzhiyun * of the character.
163*4882a593Smuzhiyun * The CCC of a decomposable character is the same as the CCC
164*4882a593Smuzhiyun * of the first character of its decomposition.
165*4882a593Smuzhiyun * Some characters decompose as the empty string: these are
166*4882a593Smuzhiyun * characters with the Default_Ignorable_Code_Point property.
167*4882a593Smuzhiyun * These do affect normalization, as they all have CCC 0.
168*4882a593Smuzhiyun *
169*4882a593Smuzhiyun * The decompositions in the trie have been fully expanded.
170*4882a593Smuzhiyun *
171*4882a593Smuzhiyun * Casefolding, if applicable, is also done using decompositions.
172*4882a593Smuzhiyun */
173*4882a593Smuzhiyun typedef unsigned char utf8leaf_t;
174*4882a593Smuzhiyun
175*4882a593Smuzhiyun #define LEAF_GEN(LEAF) ((LEAF)[0])
176*4882a593Smuzhiyun #define LEAF_CCC(LEAF) ((LEAF)[1])
177*4882a593Smuzhiyun #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
178*4882a593Smuzhiyun
179*4882a593Smuzhiyun #define MAXGEN (255)
180*4882a593Smuzhiyun
181*4882a593Smuzhiyun #define MINCCC (0)
182*4882a593Smuzhiyun #define MAXCCC (254)
183*4882a593Smuzhiyun #define STOPPER (0)
184*4882a593Smuzhiyun #define DECOMPOSE (255)
185*4882a593Smuzhiyun #define HANGUL ((char)(255))
186*4882a593Smuzhiyun
187*4882a593Smuzhiyun #define UTF8HANGULLEAF (12)
188*4882a593Smuzhiyun
189*4882a593Smuzhiyun struct tree;
190*4882a593Smuzhiyun static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
191*4882a593Smuzhiyun const char *, size_t);
192*4882a593Smuzhiyun static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
193*4882a593Smuzhiyun
194*4882a593Smuzhiyun unsigned char *utf8data;
195*4882a593Smuzhiyun size_t utf8data_size;
196*4882a593Smuzhiyun
197*4882a593Smuzhiyun utf8trie_t *nfdi;
198*4882a593Smuzhiyun utf8trie_t *nfdicf;
199*4882a593Smuzhiyun
200*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
201*4882a593Smuzhiyun
202*4882a593Smuzhiyun /*
203*4882a593Smuzhiyun * UTF8 valid ranges.
204*4882a593Smuzhiyun *
205*4882a593Smuzhiyun * The UTF-8 encoding spreads the bits of a 32bit word over several
206*4882a593Smuzhiyun * bytes. This table gives the ranges that can be held and how they'd
207*4882a593Smuzhiyun * be represented.
208*4882a593Smuzhiyun *
209*4882a593Smuzhiyun * 0x00000000 0x0000007F: 0xxxxxxx
210*4882a593Smuzhiyun * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
211*4882a593Smuzhiyun * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
212*4882a593Smuzhiyun * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
213*4882a593Smuzhiyun * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
214*4882a593Smuzhiyun * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
215*4882a593Smuzhiyun *
216*4882a593Smuzhiyun * There is an additional requirement on UTF-8, in that only the
217*4882a593Smuzhiyun * shortest representation of a 32bit value is to be used. A decoder
218*4882a593Smuzhiyun * must not decode sequences that do not satisfy this requirement.
219*4882a593Smuzhiyun * Thus the allowed ranges have a lower bound.
220*4882a593Smuzhiyun *
221*4882a593Smuzhiyun * 0x00000000 0x0000007F: 0xxxxxxx
222*4882a593Smuzhiyun * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
223*4882a593Smuzhiyun * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
224*4882a593Smuzhiyun * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
225*4882a593Smuzhiyun * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
226*4882a593Smuzhiyun * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
227*4882a593Smuzhiyun *
228*4882a593Smuzhiyun * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
229*4882a593Smuzhiyun * 17 planes of 65536 values. This limits the sequences actually seen
230*4882a593Smuzhiyun * even more, to just the following.
231*4882a593Smuzhiyun *
232*4882a593Smuzhiyun * 0 - 0x7f: 0 0x7f
233*4882a593Smuzhiyun * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
234*4882a593Smuzhiyun * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
235*4882a593Smuzhiyun * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
236*4882a593Smuzhiyun *
237*4882a593Smuzhiyun * Even within those ranges not all values are allowed: the surrogates
238*4882a593Smuzhiyun * 0xd800 - 0xdfff should never be seen.
239*4882a593Smuzhiyun *
240*4882a593Smuzhiyun * Note that the longest sequence seen with valid usage is 4 bytes,
241*4882a593Smuzhiyun * the same a single UTF-32 character. This makes the UTF-8
242*4882a593Smuzhiyun * representation of Unicode strictly smaller than UTF-32.
243*4882a593Smuzhiyun *
244*4882a593Smuzhiyun * The shortest sequence requirement was introduced by:
245*4882a593Smuzhiyun * Corrigendum #1: UTF-8 Shortest Form
246*4882a593Smuzhiyun * It can be found here:
247*4882a593Smuzhiyun * http://www.unicode.org/versions/corrigendum1.html
248*4882a593Smuzhiyun *
249*4882a593Smuzhiyun */
250*4882a593Smuzhiyun
251*4882a593Smuzhiyun #define UTF8_2_BITS 0xC0
252*4882a593Smuzhiyun #define UTF8_3_BITS 0xE0
253*4882a593Smuzhiyun #define UTF8_4_BITS 0xF0
254*4882a593Smuzhiyun #define UTF8_N_BITS 0x80
255*4882a593Smuzhiyun #define UTF8_2_MASK 0xE0
256*4882a593Smuzhiyun #define UTF8_3_MASK 0xF0
257*4882a593Smuzhiyun #define UTF8_4_MASK 0xF8
258*4882a593Smuzhiyun #define UTF8_N_MASK 0xC0
259*4882a593Smuzhiyun #define UTF8_V_MASK 0x3F
260*4882a593Smuzhiyun #define UTF8_V_SHIFT 6
261*4882a593Smuzhiyun
utf8encode(char * str,unsigned int val)262*4882a593Smuzhiyun static int utf8encode(char *str, unsigned int val)
263*4882a593Smuzhiyun {
264*4882a593Smuzhiyun int len;
265*4882a593Smuzhiyun
266*4882a593Smuzhiyun if (val < 0x80) {
267*4882a593Smuzhiyun str[0] = val;
268*4882a593Smuzhiyun len = 1;
269*4882a593Smuzhiyun } else if (val < 0x800) {
270*4882a593Smuzhiyun str[1] = val & UTF8_V_MASK;
271*4882a593Smuzhiyun str[1] |= UTF8_N_BITS;
272*4882a593Smuzhiyun val >>= UTF8_V_SHIFT;
273*4882a593Smuzhiyun str[0] = val;
274*4882a593Smuzhiyun str[0] |= UTF8_2_BITS;
275*4882a593Smuzhiyun len = 2;
276*4882a593Smuzhiyun } else if (val < 0x10000) {
277*4882a593Smuzhiyun str[2] = val & UTF8_V_MASK;
278*4882a593Smuzhiyun str[2] |= UTF8_N_BITS;
279*4882a593Smuzhiyun val >>= UTF8_V_SHIFT;
280*4882a593Smuzhiyun str[1] = val & UTF8_V_MASK;
281*4882a593Smuzhiyun str[1] |= UTF8_N_BITS;
282*4882a593Smuzhiyun val >>= UTF8_V_SHIFT;
283*4882a593Smuzhiyun str[0] = val;
284*4882a593Smuzhiyun str[0] |= UTF8_3_BITS;
285*4882a593Smuzhiyun len = 3;
286*4882a593Smuzhiyun } else if (val < 0x110000) {
287*4882a593Smuzhiyun str[3] = val & UTF8_V_MASK;
288*4882a593Smuzhiyun str[3] |= UTF8_N_BITS;
289*4882a593Smuzhiyun val >>= UTF8_V_SHIFT;
290*4882a593Smuzhiyun str[2] = val & UTF8_V_MASK;
291*4882a593Smuzhiyun str[2] |= UTF8_N_BITS;
292*4882a593Smuzhiyun val >>= UTF8_V_SHIFT;
293*4882a593Smuzhiyun str[1] = val & UTF8_V_MASK;
294*4882a593Smuzhiyun str[1] |= UTF8_N_BITS;
295*4882a593Smuzhiyun val >>= UTF8_V_SHIFT;
296*4882a593Smuzhiyun str[0] = val;
297*4882a593Smuzhiyun str[0] |= UTF8_4_BITS;
298*4882a593Smuzhiyun len = 4;
299*4882a593Smuzhiyun } else {
300*4882a593Smuzhiyun printf("%#x: illegal val\n", val);
301*4882a593Smuzhiyun len = 0;
302*4882a593Smuzhiyun }
303*4882a593Smuzhiyun return len;
304*4882a593Smuzhiyun }
305*4882a593Smuzhiyun
utf8decode(const char * str)306*4882a593Smuzhiyun static unsigned int utf8decode(const char *str)
307*4882a593Smuzhiyun {
308*4882a593Smuzhiyun const unsigned char *s = (const unsigned char*)str;
309*4882a593Smuzhiyun unsigned int unichar = 0;
310*4882a593Smuzhiyun
311*4882a593Smuzhiyun if (*s < 0x80) {
312*4882a593Smuzhiyun unichar = *s;
313*4882a593Smuzhiyun } else if (*s < UTF8_3_BITS) {
314*4882a593Smuzhiyun unichar = *s++ & 0x1F;
315*4882a593Smuzhiyun unichar <<= UTF8_V_SHIFT;
316*4882a593Smuzhiyun unichar |= *s & 0x3F;
317*4882a593Smuzhiyun } else if (*s < UTF8_4_BITS) {
318*4882a593Smuzhiyun unichar = *s++ & 0x0F;
319*4882a593Smuzhiyun unichar <<= UTF8_V_SHIFT;
320*4882a593Smuzhiyun unichar |= *s++ & 0x3F;
321*4882a593Smuzhiyun unichar <<= UTF8_V_SHIFT;
322*4882a593Smuzhiyun unichar |= *s & 0x3F;
323*4882a593Smuzhiyun } else {
324*4882a593Smuzhiyun unichar = *s++ & 0x0F;
325*4882a593Smuzhiyun unichar <<= UTF8_V_SHIFT;
326*4882a593Smuzhiyun unichar |= *s++ & 0x3F;
327*4882a593Smuzhiyun unichar <<= UTF8_V_SHIFT;
328*4882a593Smuzhiyun unichar |= *s++ & 0x3F;
329*4882a593Smuzhiyun unichar <<= UTF8_V_SHIFT;
330*4882a593Smuzhiyun unichar |= *s & 0x3F;
331*4882a593Smuzhiyun }
332*4882a593Smuzhiyun return unichar;
333*4882a593Smuzhiyun }
334*4882a593Smuzhiyun
utf32valid(unsigned int unichar)335*4882a593Smuzhiyun static int utf32valid(unsigned int unichar)
336*4882a593Smuzhiyun {
337*4882a593Smuzhiyun return unichar < 0x110000;
338*4882a593Smuzhiyun }
339*4882a593Smuzhiyun
340*4882a593Smuzhiyun #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
341*4882a593Smuzhiyun
342*4882a593Smuzhiyun #define NODE 1
343*4882a593Smuzhiyun #define LEAF 0
344*4882a593Smuzhiyun
345*4882a593Smuzhiyun struct tree {
346*4882a593Smuzhiyun void *root;
347*4882a593Smuzhiyun int childnode;
348*4882a593Smuzhiyun const char *type;
349*4882a593Smuzhiyun unsigned int maxage;
350*4882a593Smuzhiyun struct tree *next;
351*4882a593Smuzhiyun int (*leaf_equal)(void *, void *);
352*4882a593Smuzhiyun void (*leaf_print)(void *, int);
353*4882a593Smuzhiyun int (*leaf_mark)(void *);
354*4882a593Smuzhiyun int (*leaf_size)(void *);
355*4882a593Smuzhiyun int *(*leaf_index)(struct tree *, void *);
356*4882a593Smuzhiyun unsigned char *(*leaf_emit)(void *, unsigned char *);
357*4882a593Smuzhiyun int leafindex[0x110000];
358*4882a593Smuzhiyun int index;
359*4882a593Smuzhiyun };
360*4882a593Smuzhiyun
361*4882a593Smuzhiyun struct node {
362*4882a593Smuzhiyun int index;
363*4882a593Smuzhiyun int offset;
364*4882a593Smuzhiyun int mark;
365*4882a593Smuzhiyun int size;
366*4882a593Smuzhiyun struct node *parent;
367*4882a593Smuzhiyun void *left;
368*4882a593Smuzhiyun void *right;
369*4882a593Smuzhiyun unsigned char bitnum;
370*4882a593Smuzhiyun unsigned char nextbyte;
371*4882a593Smuzhiyun unsigned char leftnode;
372*4882a593Smuzhiyun unsigned char rightnode;
373*4882a593Smuzhiyun unsigned int keybits;
374*4882a593Smuzhiyun unsigned int keymask;
375*4882a593Smuzhiyun };
376*4882a593Smuzhiyun
377*4882a593Smuzhiyun /*
378*4882a593Smuzhiyun * Example lookup function for a tree.
379*4882a593Smuzhiyun */
lookup(struct tree * tree,const char * key)380*4882a593Smuzhiyun static void *lookup(struct tree *tree, const char *key)
381*4882a593Smuzhiyun {
382*4882a593Smuzhiyun struct node *node;
383*4882a593Smuzhiyun void *leaf = NULL;
384*4882a593Smuzhiyun
385*4882a593Smuzhiyun node = tree->root;
386*4882a593Smuzhiyun while (!leaf && node) {
387*4882a593Smuzhiyun if (node->nextbyte)
388*4882a593Smuzhiyun key++;
389*4882a593Smuzhiyun if (*key & (1 << (node->bitnum & 7))) {
390*4882a593Smuzhiyun /* Right leg */
391*4882a593Smuzhiyun if (node->rightnode == NODE) {
392*4882a593Smuzhiyun node = node->right;
393*4882a593Smuzhiyun } else if (node->rightnode == LEAF) {
394*4882a593Smuzhiyun leaf = node->right;
395*4882a593Smuzhiyun } else {
396*4882a593Smuzhiyun node = NULL;
397*4882a593Smuzhiyun }
398*4882a593Smuzhiyun } else {
399*4882a593Smuzhiyun /* Left leg */
400*4882a593Smuzhiyun if (node->leftnode == NODE) {
401*4882a593Smuzhiyun node = node->left;
402*4882a593Smuzhiyun } else if (node->leftnode == LEAF) {
403*4882a593Smuzhiyun leaf = node->left;
404*4882a593Smuzhiyun } else {
405*4882a593Smuzhiyun node = NULL;
406*4882a593Smuzhiyun }
407*4882a593Smuzhiyun }
408*4882a593Smuzhiyun }
409*4882a593Smuzhiyun
410*4882a593Smuzhiyun return leaf;
411*4882a593Smuzhiyun }
412*4882a593Smuzhiyun
413*4882a593Smuzhiyun /*
414*4882a593Smuzhiyun * A simple non-recursive tree walker: keep track of visits to the
415*4882a593Smuzhiyun * left and right branches in the leftmask and rightmask.
416*4882a593Smuzhiyun */
tree_walk(struct tree * tree)417*4882a593Smuzhiyun static void tree_walk(struct tree *tree)
418*4882a593Smuzhiyun {
419*4882a593Smuzhiyun struct node *node;
420*4882a593Smuzhiyun unsigned int leftmask;
421*4882a593Smuzhiyun unsigned int rightmask;
422*4882a593Smuzhiyun unsigned int bitmask;
423*4882a593Smuzhiyun int indent = 1;
424*4882a593Smuzhiyun int nodes, singletons, leaves;
425*4882a593Smuzhiyun
426*4882a593Smuzhiyun nodes = singletons = leaves = 0;
427*4882a593Smuzhiyun
428*4882a593Smuzhiyun printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
429*4882a593Smuzhiyun if (tree->childnode == LEAF) {
430*4882a593Smuzhiyun assert(tree->root);
431*4882a593Smuzhiyun tree->leaf_print(tree->root, indent);
432*4882a593Smuzhiyun leaves = 1;
433*4882a593Smuzhiyun } else {
434*4882a593Smuzhiyun assert(tree->childnode == NODE);
435*4882a593Smuzhiyun node = tree->root;
436*4882a593Smuzhiyun leftmask = rightmask = 0;
437*4882a593Smuzhiyun while (node) {
438*4882a593Smuzhiyun printf("%*snode @ %p bitnum %d nextbyte %d"
439*4882a593Smuzhiyun " left %p right %p mask %x bits %x\n",
440*4882a593Smuzhiyun indent, "", node,
441*4882a593Smuzhiyun node->bitnum, node->nextbyte,
442*4882a593Smuzhiyun node->left, node->right,
443*4882a593Smuzhiyun node->keymask, node->keybits);
444*4882a593Smuzhiyun nodes += 1;
445*4882a593Smuzhiyun if (!(node->left && node->right))
446*4882a593Smuzhiyun singletons += 1;
447*4882a593Smuzhiyun
448*4882a593Smuzhiyun while (node) {
449*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
450*4882a593Smuzhiyun if ((leftmask & bitmask) == 0) {
451*4882a593Smuzhiyun leftmask |= bitmask;
452*4882a593Smuzhiyun if (node->leftnode == LEAF) {
453*4882a593Smuzhiyun assert(node->left);
454*4882a593Smuzhiyun tree->leaf_print(node->left,
455*4882a593Smuzhiyun indent+1);
456*4882a593Smuzhiyun leaves += 1;
457*4882a593Smuzhiyun } else if (node->left) {
458*4882a593Smuzhiyun assert(node->leftnode == NODE);
459*4882a593Smuzhiyun indent += 1;
460*4882a593Smuzhiyun node = node->left;
461*4882a593Smuzhiyun break;
462*4882a593Smuzhiyun }
463*4882a593Smuzhiyun }
464*4882a593Smuzhiyun if ((rightmask & bitmask) == 0) {
465*4882a593Smuzhiyun rightmask |= bitmask;
466*4882a593Smuzhiyun if (node->rightnode == LEAF) {
467*4882a593Smuzhiyun assert(node->right);
468*4882a593Smuzhiyun tree->leaf_print(node->right,
469*4882a593Smuzhiyun indent+1);
470*4882a593Smuzhiyun leaves += 1;
471*4882a593Smuzhiyun } else if (node->right) {
472*4882a593Smuzhiyun assert(node->rightnode == NODE);
473*4882a593Smuzhiyun indent += 1;
474*4882a593Smuzhiyun node = node->right;
475*4882a593Smuzhiyun break;
476*4882a593Smuzhiyun }
477*4882a593Smuzhiyun }
478*4882a593Smuzhiyun leftmask &= ~bitmask;
479*4882a593Smuzhiyun rightmask &= ~bitmask;
480*4882a593Smuzhiyun node = node->parent;
481*4882a593Smuzhiyun indent -= 1;
482*4882a593Smuzhiyun }
483*4882a593Smuzhiyun }
484*4882a593Smuzhiyun }
485*4882a593Smuzhiyun printf("nodes %d leaves %d singletons %d\n",
486*4882a593Smuzhiyun nodes, leaves, singletons);
487*4882a593Smuzhiyun }
488*4882a593Smuzhiyun
489*4882a593Smuzhiyun /*
490*4882a593Smuzhiyun * Allocate an initialize a new internal node.
491*4882a593Smuzhiyun */
alloc_node(struct node * parent)492*4882a593Smuzhiyun static struct node *alloc_node(struct node *parent)
493*4882a593Smuzhiyun {
494*4882a593Smuzhiyun struct node *node;
495*4882a593Smuzhiyun int bitnum;
496*4882a593Smuzhiyun
497*4882a593Smuzhiyun node = malloc(sizeof(*node));
498*4882a593Smuzhiyun node->left = node->right = NULL;
499*4882a593Smuzhiyun node->parent = parent;
500*4882a593Smuzhiyun node->leftnode = NODE;
501*4882a593Smuzhiyun node->rightnode = NODE;
502*4882a593Smuzhiyun node->keybits = 0;
503*4882a593Smuzhiyun node->keymask = 0;
504*4882a593Smuzhiyun node->mark = 0;
505*4882a593Smuzhiyun node->index = 0;
506*4882a593Smuzhiyun node->offset = -1;
507*4882a593Smuzhiyun node->size = 4;
508*4882a593Smuzhiyun
509*4882a593Smuzhiyun if (node->parent) {
510*4882a593Smuzhiyun bitnum = parent->bitnum;
511*4882a593Smuzhiyun if ((bitnum & 7) == 0) {
512*4882a593Smuzhiyun node->bitnum = bitnum + 7 + 8;
513*4882a593Smuzhiyun node->nextbyte = 1;
514*4882a593Smuzhiyun } else {
515*4882a593Smuzhiyun node->bitnum = bitnum - 1;
516*4882a593Smuzhiyun node->nextbyte = 0;
517*4882a593Smuzhiyun }
518*4882a593Smuzhiyun } else {
519*4882a593Smuzhiyun node->bitnum = 7;
520*4882a593Smuzhiyun node->nextbyte = 0;
521*4882a593Smuzhiyun }
522*4882a593Smuzhiyun
523*4882a593Smuzhiyun return node;
524*4882a593Smuzhiyun }
525*4882a593Smuzhiyun
526*4882a593Smuzhiyun /*
527*4882a593Smuzhiyun * Insert a new leaf into the tree, and collapse any subtrees that are
528*4882a593Smuzhiyun * fully populated and end in identical leaves. A nextbyte tagged
529*4882a593Smuzhiyun * internal node will not be removed to preserve the tree's integrity.
530*4882a593Smuzhiyun * Note that due to the structure of utf8, no nextbyte tagged node
531*4882a593Smuzhiyun * will be a candidate for removal.
532*4882a593Smuzhiyun */
insert(struct tree * tree,char * key,int keylen,void * leaf)533*4882a593Smuzhiyun static int insert(struct tree *tree, char *key, int keylen, void *leaf)
534*4882a593Smuzhiyun {
535*4882a593Smuzhiyun struct node *node;
536*4882a593Smuzhiyun struct node *parent;
537*4882a593Smuzhiyun void **cursor;
538*4882a593Smuzhiyun int keybits;
539*4882a593Smuzhiyun
540*4882a593Smuzhiyun assert(keylen >= 1 && keylen <= 4);
541*4882a593Smuzhiyun
542*4882a593Smuzhiyun node = NULL;
543*4882a593Smuzhiyun cursor = &tree->root;
544*4882a593Smuzhiyun keybits = 8 * keylen;
545*4882a593Smuzhiyun
546*4882a593Smuzhiyun /* Insert, creating path along the way. */
547*4882a593Smuzhiyun while (keybits) {
548*4882a593Smuzhiyun if (!*cursor)
549*4882a593Smuzhiyun *cursor = alloc_node(node);
550*4882a593Smuzhiyun node = *cursor;
551*4882a593Smuzhiyun if (node->nextbyte)
552*4882a593Smuzhiyun key++;
553*4882a593Smuzhiyun if (*key & (1 << (node->bitnum & 7)))
554*4882a593Smuzhiyun cursor = &node->right;
555*4882a593Smuzhiyun else
556*4882a593Smuzhiyun cursor = &node->left;
557*4882a593Smuzhiyun keybits--;
558*4882a593Smuzhiyun }
559*4882a593Smuzhiyun *cursor = leaf;
560*4882a593Smuzhiyun
561*4882a593Smuzhiyun /* Merge subtrees if possible. */
562*4882a593Smuzhiyun while (node) {
563*4882a593Smuzhiyun if (*key & (1 << (node->bitnum & 7)))
564*4882a593Smuzhiyun node->rightnode = LEAF;
565*4882a593Smuzhiyun else
566*4882a593Smuzhiyun node->leftnode = LEAF;
567*4882a593Smuzhiyun if (node->nextbyte)
568*4882a593Smuzhiyun break;
569*4882a593Smuzhiyun if (node->leftnode == NODE || node->rightnode == NODE)
570*4882a593Smuzhiyun break;
571*4882a593Smuzhiyun assert(node->left);
572*4882a593Smuzhiyun assert(node->right);
573*4882a593Smuzhiyun /* Compare */
574*4882a593Smuzhiyun if (! tree->leaf_equal(node->left, node->right))
575*4882a593Smuzhiyun break;
576*4882a593Smuzhiyun /* Keep left, drop right leaf. */
577*4882a593Smuzhiyun leaf = node->left;
578*4882a593Smuzhiyun /* Check in parent */
579*4882a593Smuzhiyun parent = node->parent;
580*4882a593Smuzhiyun if (!parent) {
581*4882a593Smuzhiyun /* root of tree! */
582*4882a593Smuzhiyun tree->root = leaf;
583*4882a593Smuzhiyun tree->childnode = LEAF;
584*4882a593Smuzhiyun } else if (parent->left == node) {
585*4882a593Smuzhiyun parent->left = leaf;
586*4882a593Smuzhiyun parent->leftnode = LEAF;
587*4882a593Smuzhiyun if (parent->right) {
588*4882a593Smuzhiyun parent->keymask = 0;
589*4882a593Smuzhiyun parent->keybits = 0;
590*4882a593Smuzhiyun } else {
591*4882a593Smuzhiyun parent->keymask |= (1 << node->bitnum);
592*4882a593Smuzhiyun }
593*4882a593Smuzhiyun } else if (parent->right == node) {
594*4882a593Smuzhiyun parent->right = leaf;
595*4882a593Smuzhiyun parent->rightnode = LEAF;
596*4882a593Smuzhiyun if (parent->left) {
597*4882a593Smuzhiyun parent->keymask = 0;
598*4882a593Smuzhiyun parent->keybits = 0;
599*4882a593Smuzhiyun } else {
600*4882a593Smuzhiyun parent->keymask |= (1 << node->bitnum);
601*4882a593Smuzhiyun parent->keybits |= (1 << node->bitnum);
602*4882a593Smuzhiyun }
603*4882a593Smuzhiyun } else {
604*4882a593Smuzhiyun /* internal tree error */
605*4882a593Smuzhiyun assert(0);
606*4882a593Smuzhiyun }
607*4882a593Smuzhiyun free(node);
608*4882a593Smuzhiyun node = parent;
609*4882a593Smuzhiyun }
610*4882a593Smuzhiyun
611*4882a593Smuzhiyun /* Propagate keymasks up along singleton chains. */
612*4882a593Smuzhiyun while (node) {
613*4882a593Smuzhiyun parent = node->parent;
614*4882a593Smuzhiyun if (!parent)
615*4882a593Smuzhiyun break;
616*4882a593Smuzhiyun /* Nix the mask for parents with two children. */
617*4882a593Smuzhiyun if (node->keymask == 0) {
618*4882a593Smuzhiyun parent->keymask = 0;
619*4882a593Smuzhiyun parent->keybits = 0;
620*4882a593Smuzhiyun } else if (parent->left && parent->right) {
621*4882a593Smuzhiyun parent->keymask = 0;
622*4882a593Smuzhiyun parent->keybits = 0;
623*4882a593Smuzhiyun } else {
624*4882a593Smuzhiyun assert((parent->keymask & node->keymask) == 0);
625*4882a593Smuzhiyun parent->keymask |= node->keymask;
626*4882a593Smuzhiyun parent->keymask |= (1 << parent->bitnum);
627*4882a593Smuzhiyun parent->keybits |= node->keybits;
628*4882a593Smuzhiyun if (parent->right)
629*4882a593Smuzhiyun parent->keybits |= (1 << parent->bitnum);
630*4882a593Smuzhiyun }
631*4882a593Smuzhiyun node = parent;
632*4882a593Smuzhiyun }
633*4882a593Smuzhiyun
634*4882a593Smuzhiyun return 0;
635*4882a593Smuzhiyun }
636*4882a593Smuzhiyun
637*4882a593Smuzhiyun /*
638*4882a593Smuzhiyun * Prune internal nodes.
639*4882a593Smuzhiyun *
640*4882a593Smuzhiyun * Fully populated subtrees that end at the same leaf have already
641*4882a593Smuzhiyun * been collapsed. There are still internal nodes that have for both
642*4882a593Smuzhiyun * their left and right branches a sequence of singletons that make
643*4882a593Smuzhiyun * identical choices and end in identical leaves. The keymask and
644*4882a593Smuzhiyun * keybits collected in the nodes describe the choices made in these
645*4882a593Smuzhiyun * singleton chains. When they are identical for the left and right
646*4882a593Smuzhiyun * branch of a node, and the two leaves comare identical, the node in
647*4882a593Smuzhiyun * question can be removed.
648*4882a593Smuzhiyun *
649*4882a593Smuzhiyun * Note that nodes with the nextbyte tag set will not be removed by
650*4882a593Smuzhiyun * this to ensure tree integrity. Note as well that the structure of
651*4882a593Smuzhiyun * utf8 ensures that these nodes would not have been candidates for
652*4882a593Smuzhiyun * removal in any case.
653*4882a593Smuzhiyun */
prune(struct tree * tree)654*4882a593Smuzhiyun static void prune(struct tree *tree)
655*4882a593Smuzhiyun {
656*4882a593Smuzhiyun struct node *node;
657*4882a593Smuzhiyun struct node *left;
658*4882a593Smuzhiyun struct node *right;
659*4882a593Smuzhiyun struct node *parent;
660*4882a593Smuzhiyun void *leftleaf;
661*4882a593Smuzhiyun void *rightleaf;
662*4882a593Smuzhiyun unsigned int leftmask;
663*4882a593Smuzhiyun unsigned int rightmask;
664*4882a593Smuzhiyun unsigned int bitmask;
665*4882a593Smuzhiyun int count;
666*4882a593Smuzhiyun
667*4882a593Smuzhiyun if (verbose > 0)
668*4882a593Smuzhiyun printf("Pruning %s_%x\n", tree->type, tree->maxage);
669*4882a593Smuzhiyun
670*4882a593Smuzhiyun count = 0;
671*4882a593Smuzhiyun if (tree->childnode == LEAF)
672*4882a593Smuzhiyun return;
673*4882a593Smuzhiyun if (!tree->root)
674*4882a593Smuzhiyun return;
675*4882a593Smuzhiyun
676*4882a593Smuzhiyun leftmask = rightmask = 0;
677*4882a593Smuzhiyun node = tree->root;
678*4882a593Smuzhiyun while (node) {
679*4882a593Smuzhiyun if (node->nextbyte)
680*4882a593Smuzhiyun goto advance;
681*4882a593Smuzhiyun if (node->leftnode == LEAF)
682*4882a593Smuzhiyun goto advance;
683*4882a593Smuzhiyun if (node->rightnode == LEAF)
684*4882a593Smuzhiyun goto advance;
685*4882a593Smuzhiyun if (!node->left)
686*4882a593Smuzhiyun goto advance;
687*4882a593Smuzhiyun if (!node->right)
688*4882a593Smuzhiyun goto advance;
689*4882a593Smuzhiyun left = node->left;
690*4882a593Smuzhiyun right = node->right;
691*4882a593Smuzhiyun if (left->keymask == 0)
692*4882a593Smuzhiyun goto advance;
693*4882a593Smuzhiyun if (right->keymask == 0)
694*4882a593Smuzhiyun goto advance;
695*4882a593Smuzhiyun if (left->keymask != right->keymask)
696*4882a593Smuzhiyun goto advance;
697*4882a593Smuzhiyun if (left->keybits != right->keybits)
698*4882a593Smuzhiyun goto advance;
699*4882a593Smuzhiyun leftleaf = NULL;
700*4882a593Smuzhiyun while (!leftleaf) {
701*4882a593Smuzhiyun assert(left->left || left->right);
702*4882a593Smuzhiyun if (left->leftnode == LEAF)
703*4882a593Smuzhiyun leftleaf = left->left;
704*4882a593Smuzhiyun else if (left->rightnode == LEAF)
705*4882a593Smuzhiyun leftleaf = left->right;
706*4882a593Smuzhiyun else if (left->left)
707*4882a593Smuzhiyun left = left->left;
708*4882a593Smuzhiyun else if (left->right)
709*4882a593Smuzhiyun left = left->right;
710*4882a593Smuzhiyun else
711*4882a593Smuzhiyun assert(0);
712*4882a593Smuzhiyun }
713*4882a593Smuzhiyun rightleaf = NULL;
714*4882a593Smuzhiyun while (!rightleaf) {
715*4882a593Smuzhiyun assert(right->left || right->right);
716*4882a593Smuzhiyun if (right->leftnode == LEAF)
717*4882a593Smuzhiyun rightleaf = right->left;
718*4882a593Smuzhiyun else if (right->rightnode == LEAF)
719*4882a593Smuzhiyun rightleaf = right->right;
720*4882a593Smuzhiyun else if (right->left)
721*4882a593Smuzhiyun right = right->left;
722*4882a593Smuzhiyun else if (right->right)
723*4882a593Smuzhiyun right = right->right;
724*4882a593Smuzhiyun else
725*4882a593Smuzhiyun assert(0);
726*4882a593Smuzhiyun }
727*4882a593Smuzhiyun if (! tree->leaf_equal(leftleaf, rightleaf))
728*4882a593Smuzhiyun goto advance;
729*4882a593Smuzhiyun /*
730*4882a593Smuzhiyun * This node has identical singleton-only subtrees.
731*4882a593Smuzhiyun * Remove it.
732*4882a593Smuzhiyun */
733*4882a593Smuzhiyun parent = node->parent;
734*4882a593Smuzhiyun left = node->left;
735*4882a593Smuzhiyun right = node->right;
736*4882a593Smuzhiyun if (parent->left == node)
737*4882a593Smuzhiyun parent->left = left;
738*4882a593Smuzhiyun else if (parent->right == node)
739*4882a593Smuzhiyun parent->right = left;
740*4882a593Smuzhiyun else
741*4882a593Smuzhiyun assert(0);
742*4882a593Smuzhiyun left->parent = parent;
743*4882a593Smuzhiyun left->keymask |= (1 << node->bitnum);
744*4882a593Smuzhiyun node->left = NULL;
745*4882a593Smuzhiyun while (node) {
746*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
747*4882a593Smuzhiyun leftmask &= ~bitmask;
748*4882a593Smuzhiyun rightmask &= ~bitmask;
749*4882a593Smuzhiyun if (node->leftnode == NODE && node->left) {
750*4882a593Smuzhiyun left = node->left;
751*4882a593Smuzhiyun free(node);
752*4882a593Smuzhiyun count++;
753*4882a593Smuzhiyun node = left;
754*4882a593Smuzhiyun } else if (node->rightnode == NODE && node->right) {
755*4882a593Smuzhiyun right = node->right;
756*4882a593Smuzhiyun free(node);
757*4882a593Smuzhiyun count++;
758*4882a593Smuzhiyun node = right;
759*4882a593Smuzhiyun } else {
760*4882a593Smuzhiyun node = NULL;
761*4882a593Smuzhiyun }
762*4882a593Smuzhiyun }
763*4882a593Smuzhiyun /* Propagate keymasks up along singleton chains. */
764*4882a593Smuzhiyun node = parent;
765*4882a593Smuzhiyun /* Force re-check */
766*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
767*4882a593Smuzhiyun leftmask &= ~bitmask;
768*4882a593Smuzhiyun rightmask &= ~bitmask;
769*4882a593Smuzhiyun for (;;) {
770*4882a593Smuzhiyun if (node->left && node->right)
771*4882a593Smuzhiyun break;
772*4882a593Smuzhiyun if (node->left) {
773*4882a593Smuzhiyun left = node->left;
774*4882a593Smuzhiyun node->keymask |= left->keymask;
775*4882a593Smuzhiyun node->keybits |= left->keybits;
776*4882a593Smuzhiyun }
777*4882a593Smuzhiyun if (node->right) {
778*4882a593Smuzhiyun right = node->right;
779*4882a593Smuzhiyun node->keymask |= right->keymask;
780*4882a593Smuzhiyun node->keybits |= right->keybits;
781*4882a593Smuzhiyun }
782*4882a593Smuzhiyun node->keymask |= (1 << node->bitnum);
783*4882a593Smuzhiyun node = node->parent;
784*4882a593Smuzhiyun /* Force re-check */
785*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
786*4882a593Smuzhiyun leftmask &= ~bitmask;
787*4882a593Smuzhiyun rightmask &= ~bitmask;
788*4882a593Smuzhiyun }
789*4882a593Smuzhiyun advance:
790*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
791*4882a593Smuzhiyun if ((leftmask & bitmask) == 0 &&
792*4882a593Smuzhiyun node->leftnode == NODE &&
793*4882a593Smuzhiyun node->left) {
794*4882a593Smuzhiyun leftmask |= bitmask;
795*4882a593Smuzhiyun node = node->left;
796*4882a593Smuzhiyun } else if ((rightmask & bitmask) == 0 &&
797*4882a593Smuzhiyun node->rightnode == NODE &&
798*4882a593Smuzhiyun node->right) {
799*4882a593Smuzhiyun rightmask |= bitmask;
800*4882a593Smuzhiyun node = node->right;
801*4882a593Smuzhiyun } else {
802*4882a593Smuzhiyun leftmask &= ~bitmask;
803*4882a593Smuzhiyun rightmask &= ~bitmask;
804*4882a593Smuzhiyun node = node->parent;
805*4882a593Smuzhiyun }
806*4882a593Smuzhiyun }
807*4882a593Smuzhiyun if (verbose > 0)
808*4882a593Smuzhiyun printf("Pruned %d nodes\n", count);
809*4882a593Smuzhiyun }
810*4882a593Smuzhiyun
811*4882a593Smuzhiyun /*
812*4882a593Smuzhiyun * Mark the nodes in the tree that lead to leaves that must be
813*4882a593Smuzhiyun * emitted.
814*4882a593Smuzhiyun */
mark_nodes(struct tree * tree)815*4882a593Smuzhiyun static void mark_nodes(struct tree *tree)
816*4882a593Smuzhiyun {
817*4882a593Smuzhiyun struct node *node;
818*4882a593Smuzhiyun struct node *n;
819*4882a593Smuzhiyun unsigned int leftmask;
820*4882a593Smuzhiyun unsigned int rightmask;
821*4882a593Smuzhiyun unsigned int bitmask;
822*4882a593Smuzhiyun int marked;
823*4882a593Smuzhiyun
824*4882a593Smuzhiyun marked = 0;
825*4882a593Smuzhiyun if (verbose > 0)
826*4882a593Smuzhiyun printf("Marking %s_%x\n", tree->type, tree->maxage);
827*4882a593Smuzhiyun if (tree->childnode == LEAF)
828*4882a593Smuzhiyun goto done;
829*4882a593Smuzhiyun
830*4882a593Smuzhiyun assert(tree->childnode == NODE);
831*4882a593Smuzhiyun node = tree->root;
832*4882a593Smuzhiyun leftmask = rightmask = 0;
833*4882a593Smuzhiyun while (node) {
834*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
835*4882a593Smuzhiyun if ((leftmask & bitmask) == 0) {
836*4882a593Smuzhiyun leftmask |= bitmask;
837*4882a593Smuzhiyun if (node->leftnode == LEAF) {
838*4882a593Smuzhiyun assert(node->left);
839*4882a593Smuzhiyun if (tree->leaf_mark(node->left)) {
840*4882a593Smuzhiyun n = node;
841*4882a593Smuzhiyun while (n && !n->mark) {
842*4882a593Smuzhiyun marked++;
843*4882a593Smuzhiyun n->mark = 1;
844*4882a593Smuzhiyun n = n->parent;
845*4882a593Smuzhiyun }
846*4882a593Smuzhiyun }
847*4882a593Smuzhiyun } else if (node->left) {
848*4882a593Smuzhiyun assert(node->leftnode == NODE);
849*4882a593Smuzhiyun node = node->left;
850*4882a593Smuzhiyun continue;
851*4882a593Smuzhiyun }
852*4882a593Smuzhiyun }
853*4882a593Smuzhiyun if ((rightmask & bitmask) == 0) {
854*4882a593Smuzhiyun rightmask |= bitmask;
855*4882a593Smuzhiyun if (node->rightnode == LEAF) {
856*4882a593Smuzhiyun assert(node->right);
857*4882a593Smuzhiyun if (tree->leaf_mark(node->right)) {
858*4882a593Smuzhiyun n = node;
859*4882a593Smuzhiyun while (n && !n->mark) {
860*4882a593Smuzhiyun marked++;
861*4882a593Smuzhiyun n->mark = 1;
862*4882a593Smuzhiyun n = n->parent;
863*4882a593Smuzhiyun }
864*4882a593Smuzhiyun }
865*4882a593Smuzhiyun } else if (node->right) {
866*4882a593Smuzhiyun assert(node->rightnode == NODE);
867*4882a593Smuzhiyun node = node->right;
868*4882a593Smuzhiyun continue;
869*4882a593Smuzhiyun }
870*4882a593Smuzhiyun }
871*4882a593Smuzhiyun leftmask &= ~bitmask;
872*4882a593Smuzhiyun rightmask &= ~bitmask;
873*4882a593Smuzhiyun node = node->parent;
874*4882a593Smuzhiyun }
875*4882a593Smuzhiyun
876*4882a593Smuzhiyun /* second pass: left siblings and singletons */
877*4882a593Smuzhiyun
878*4882a593Smuzhiyun assert(tree->childnode == NODE);
879*4882a593Smuzhiyun node = tree->root;
880*4882a593Smuzhiyun leftmask = rightmask = 0;
881*4882a593Smuzhiyun while (node) {
882*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
883*4882a593Smuzhiyun if ((leftmask & bitmask) == 0) {
884*4882a593Smuzhiyun leftmask |= bitmask;
885*4882a593Smuzhiyun if (node->leftnode == LEAF) {
886*4882a593Smuzhiyun assert(node->left);
887*4882a593Smuzhiyun if (tree->leaf_mark(node->left)) {
888*4882a593Smuzhiyun n = node;
889*4882a593Smuzhiyun while (n && !n->mark) {
890*4882a593Smuzhiyun marked++;
891*4882a593Smuzhiyun n->mark = 1;
892*4882a593Smuzhiyun n = n->parent;
893*4882a593Smuzhiyun }
894*4882a593Smuzhiyun }
895*4882a593Smuzhiyun } else if (node->left) {
896*4882a593Smuzhiyun assert(node->leftnode == NODE);
897*4882a593Smuzhiyun node = node->left;
898*4882a593Smuzhiyun if (!node->mark && node->parent->mark) {
899*4882a593Smuzhiyun marked++;
900*4882a593Smuzhiyun node->mark = 1;
901*4882a593Smuzhiyun }
902*4882a593Smuzhiyun continue;
903*4882a593Smuzhiyun }
904*4882a593Smuzhiyun }
905*4882a593Smuzhiyun if ((rightmask & bitmask) == 0) {
906*4882a593Smuzhiyun rightmask |= bitmask;
907*4882a593Smuzhiyun if (node->rightnode == LEAF) {
908*4882a593Smuzhiyun assert(node->right);
909*4882a593Smuzhiyun if (tree->leaf_mark(node->right)) {
910*4882a593Smuzhiyun n = node;
911*4882a593Smuzhiyun while (n && !n->mark) {
912*4882a593Smuzhiyun marked++;
913*4882a593Smuzhiyun n->mark = 1;
914*4882a593Smuzhiyun n = n->parent;
915*4882a593Smuzhiyun }
916*4882a593Smuzhiyun }
917*4882a593Smuzhiyun } else if (node->right) {
918*4882a593Smuzhiyun assert(node->rightnode == NODE);
919*4882a593Smuzhiyun node = node->right;
920*4882a593Smuzhiyun if (!node->mark && node->parent->mark &&
921*4882a593Smuzhiyun !node->parent->left) {
922*4882a593Smuzhiyun marked++;
923*4882a593Smuzhiyun node->mark = 1;
924*4882a593Smuzhiyun }
925*4882a593Smuzhiyun continue;
926*4882a593Smuzhiyun }
927*4882a593Smuzhiyun }
928*4882a593Smuzhiyun leftmask &= ~bitmask;
929*4882a593Smuzhiyun rightmask &= ~bitmask;
930*4882a593Smuzhiyun node = node->parent;
931*4882a593Smuzhiyun }
932*4882a593Smuzhiyun done:
933*4882a593Smuzhiyun if (verbose > 0)
934*4882a593Smuzhiyun printf("Marked %d nodes\n", marked);
935*4882a593Smuzhiyun }
936*4882a593Smuzhiyun
937*4882a593Smuzhiyun /*
938*4882a593Smuzhiyun * Compute the index of each node and leaf, which is the offset in the
939*4882a593Smuzhiyun * emitted trie. These values must be pre-computed because relative
940*4882a593Smuzhiyun * offsets between nodes are used to navigate the tree.
941*4882a593Smuzhiyun */
index_nodes(struct tree * tree,int index)942*4882a593Smuzhiyun static int index_nodes(struct tree *tree, int index)
943*4882a593Smuzhiyun {
944*4882a593Smuzhiyun struct node *node;
945*4882a593Smuzhiyun unsigned int leftmask;
946*4882a593Smuzhiyun unsigned int rightmask;
947*4882a593Smuzhiyun unsigned int bitmask;
948*4882a593Smuzhiyun int count;
949*4882a593Smuzhiyun int indent;
950*4882a593Smuzhiyun
951*4882a593Smuzhiyun /* Align to a cache line (or half a cache line?). */
952*4882a593Smuzhiyun while (index % 64)
953*4882a593Smuzhiyun index++;
954*4882a593Smuzhiyun tree->index = index;
955*4882a593Smuzhiyun indent = 1;
956*4882a593Smuzhiyun count = 0;
957*4882a593Smuzhiyun
958*4882a593Smuzhiyun if (verbose > 0)
959*4882a593Smuzhiyun printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
960*4882a593Smuzhiyun if (tree->childnode == LEAF) {
961*4882a593Smuzhiyun index += tree->leaf_size(tree->root);
962*4882a593Smuzhiyun goto done;
963*4882a593Smuzhiyun }
964*4882a593Smuzhiyun
965*4882a593Smuzhiyun assert(tree->childnode == NODE);
966*4882a593Smuzhiyun node = tree->root;
967*4882a593Smuzhiyun leftmask = rightmask = 0;
968*4882a593Smuzhiyun while (node) {
969*4882a593Smuzhiyun if (!node->mark)
970*4882a593Smuzhiyun goto skip;
971*4882a593Smuzhiyun count++;
972*4882a593Smuzhiyun if (node->index != index)
973*4882a593Smuzhiyun node->index = index;
974*4882a593Smuzhiyun index += node->size;
975*4882a593Smuzhiyun skip:
976*4882a593Smuzhiyun while (node) {
977*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
978*4882a593Smuzhiyun if (node->mark && (leftmask & bitmask) == 0) {
979*4882a593Smuzhiyun leftmask |= bitmask;
980*4882a593Smuzhiyun if (node->leftnode == LEAF) {
981*4882a593Smuzhiyun assert(node->left);
982*4882a593Smuzhiyun *tree->leaf_index(tree, node->left) =
983*4882a593Smuzhiyun index;
984*4882a593Smuzhiyun index += tree->leaf_size(node->left);
985*4882a593Smuzhiyun count++;
986*4882a593Smuzhiyun } else if (node->left) {
987*4882a593Smuzhiyun assert(node->leftnode == NODE);
988*4882a593Smuzhiyun indent += 1;
989*4882a593Smuzhiyun node = node->left;
990*4882a593Smuzhiyun break;
991*4882a593Smuzhiyun }
992*4882a593Smuzhiyun }
993*4882a593Smuzhiyun if (node->mark && (rightmask & bitmask) == 0) {
994*4882a593Smuzhiyun rightmask |= bitmask;
995*4882a593Smuzhiyun if (node->rightnode == LEAF) {
996*4882a593Smuzhiyun assert(node->right);
997*4882a593Smuzhiyun *tree->leaf_index(tree, node->right) = index;
998*4882a593Smuzhiyun index += tree->leaf_size(node->right);
999*4882a593Smuzhiyun count++;
1000*4882a593Smuzhiyun } else if (node->right) {
1001*4882a593Smuzhiyun assert(node->rightnode == NODE);
1002*4882a593Smuzhiyun indent += 1;
1003*4882a593Smuzhiyun node = node->right;
1004*4882a593Smuzhiyun break;
1005*4882a593Smuzhiyun }
1006*4882a593Smuzhiyun }
1007*4882a593Smuzhiyun leftmask &= ~bitmask;
1008*4882a593Smuzhiyun rightmask &= ~bitmask;
1009*4882a593Smuzhiyun node = node->parent;
1010*4882a593Smuzhiyun indent -= 1;
1011*4882a593Smuzhiyun }
1012*4882a593Smuzhiyun }
1013*4882a593Smuzhiyun done:
1014*4882a593Smuzhiyun /* Round up to a multiple of 16 */
1015*4882a593Smuzhiyun while (index % 16)
1016*4882a593Smuzhiyun index++;
1017*4882a593Smuzhiyun if (verbose > 0)
1018*4882a593Smuzhiyun printf("Final index %d\n", index);
1019*4882a593Smuzhiyun return index;
1020*4882a593Smuzhiyun }
1021*4882a593Smuzhiyun
1022*4882a593Smuzhiyun /*
1023*4882a593Smuzhiyun * Mark the nodes in a subtree, helper for size_nodes().
1024*4882a593Smuzhiyun */
mark_subtree(struct node * node)1025*4882a593Smuzhiyun static int mark_subtree(struct node *node)
1026*4882a593Smuzhiyun {
1027*4882a593Smuzhiyun int changed;
1028*4882a593Smuzhiyun
1029*4882a593Smuzhiyun if (!node || node->mark)
1030*4882a593Smuzhiyun return 0;
1031*4882a593Smuzhiyun node->mark = 1;
1032*4882a593Smuzhiyun node->index = node->parent->index;
1033*4882a593Smuzhiyun changed = 1;
1034*4882a593Smuzhiyun if (node->leftnode == NODE)
1035*4882a593Smuzhiyun changed += mark_subtree(node->left);
1036*4882a593Smuzhiyun if (node->rightnode == NODE)
1037*4882a593Smuzhiyun changed += mark_subtree(node->right);
1038*4882a593Smuzhiyun return changed;
1039*4882a593Smuzhiyun }
1040*4882a593Smuzhiyun
1041*4882a593Smuzhiyun /*
1042*4882a593Smuzhiyun * Compute the size of nodes and leaves. We start by assuming that
1043*4882a593Smuzhiyun * each node needs to store a three-byte offset. The indexes of the
1044*4882a593Smuzhiyun * nodes are calculated based on that, and then this function is
1045*4882a593Smuzhiyun * called to see if the sizes of some nodes can be reduced. This is
1046*4882a593Smuzhiyun * repeated until no more changes are seen.
1047*4882a593Smuzhiyun */
size_nodes(struct tree * tree)1048*4882a593Smuzhiyun static int size_nodes(struct tree *tree)
1049*4882a593Smuzhiyun {
1050*4882a593Smuzhiyun struct tree *next;
1051*4882a593Smuzhiyun struct node *node;
1052*4882a593Smuzhiyun struct node *right;
1053*4882a593Smuzhiyun struct node *n;
1054*4882a593Smuzhiyun unsigned int leftmask;
1055*4882a593Smuzhiyun unsigned int rightmask;
1056*4882a593Smuzhiyun unsigned int bitmask;
1057*4882a593Smuzhiyun unsigned int pathbits;
1058*4882a593Smuzhiyun unsigned int pathmask;
1059*4882a593Smuzhiyun unsigned int nbit;
1060*4882a593Smuzhiyun int changed;
1061*4882a593Smuzhiyun int offset;
1062*4882a593Smuzhiyun int size;
1063*4882a593Smuzhiyun int indent;
1064*4882a593Smuzhiyun
1065*4882a593Smuzhiyun indent = 1;
1066*4882a593Smuzhiyun changed = 0;
1067*4882a593Smuzhiyun size = 0;
1068*4882a593Smuzhiyun
1069*4882a593Smuzhiyun if (verbose > 0)
1070*4882a593Smuzhiyun printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071*4882a593Smuzhiyun if (tree->childnode == LEAF)
1072*4882a593Smuzhiyun goto done;
1073*4882a593Smuzhiyun
1074*4882a593Smuzhiyun assert(tree->childnode == NODE);
1075*4882a593Smuzhiyun pathbits = 0;
1076*4882a593Smuzhiyun pathmask = 0;
1077*4882a593Smuzhiyun node = tree->root;
1078*4882a593Smuzhiyun leftmask = rightmask = 0;
1079*4882a593Smuzhiyun while (node) {
1080*4882a593Smuzhiyun if (!node->mark)
1081*4882a593Smuzhiyun goto skip;
1082*4882a593Smuzhiyun offset = 0;
1083*4882a593Smuzhiyun if (!node->left || !node->right) {
1084*4882a593Smuzhiyun size = 1;
1085*4882a593Smuzhiyun } else {
1086*4882a593Smuzhiyun if (node->rightnode == NODE) {
1087*4882a593Smuzhiyun /*
1088*4882a593Smuzhiyun * If the right node is not marked,
1089*4882a593Smuzhiyun * look for a corresponding node in
1090*4882a593Smuzhiyun * the next tree. Such a node need
1091*4882a593Smuzhiyun * not exist.
1092*4882a593Smuzhiyun */
1093*4882a593Smuzhiyun right = node->right;
1094*4882a593Smuzhiyun next = tree->next;
1095*4882a593Smuzhiyun while (!right->mark) {
1096*4882a593Smuzhiyun assert(next);
1097*4882a593Smuzhiyun n = next->root;
1098*4882a593Smuzhiyun while (n->bitnum != node->bitnum) {
1099*4882a593Smuzhiyun nbit = 1 << n->bitnum;
1100*4882a593Smuzhiyun if (!(pathmask & nbit))
1101*4882a593Smuzhiyun break;
1102*4882a593Smuzhiyun if (pathbits & nbit) {
1103*4882a593Smuzhiyun if (n->rightnode == LEAF)
1104*4882a593Smuzhiyun break;
1105*4882a593Smuzhiyun n = n->right;
1106*4882a593Smuzhiyun } else {
1107*4882a593Smuzhiyun if (n->leftnode == LEAF)
1108*4882a593Smuzhiyun break;
1109*4882a593Smuzhiyun n = n->left;
1110*4882a593Smuzhiyun }
1111*4882a593Smuzhiyun }
1112*4882a593Smuzhiyun if (n->bitnum != node->bitnum)
1113*4882a593Smuzhiyun break;
1114*4882a593Smuzhiyun n = n->right;
1115*4882a593Smuzhiyun right = n;
1116*4882a593Smuzhiyun next = next->next;
1117*4882a593Smuzhiyun }
1118*4882a593Smuzhiyun /* Make sure the right node is marked. */
1119*4882a593Smuzhiyun if (!right->mark)
1120*4882a593Smuzhiyun changed += mark_subtree(right);
1121*4882a593Smuzhiyun offset = right->index - node->index;
1122*4882a593Smuzhiyun } else {
1123*4882a593Smuzhiyun offset = *tree->leaf_index(tree, node->right);
1124*4882a593Smuzhiyun offset -= node->index;
1125*4882a593Smuzhiyun }
1126*4882a593Smuzhiyun assert(offset >= 0);
1127*4882a593Smuzhiyun assert(offset <= 0xffffff);
1128*4882a593Smuzhiyun if (offset <= 0xff) {
1129*4882a593Smuzhiyun size = 2;
1130*4882a593Smuzhiyun } else if (offset <= 0xffff) {
1131*4882a593Smuzhiyun size = 3;
1132*4882a593Smuzhiyun } else { /* offset <= 0xffffff */
1133*4882a593Smuzhiyun size = 4;
1134*4882a593Smuzhiyun }
1135*4882a593Smuzhiyun }
1136*4882a593Smuzhiyun if (node->size != size || node->offset != offset) {
1137*4882a593Smuzhiyun node->size = size;
1138*4882a593Smuzhiyun node->offset = offset;
1139*4882a593Smuzhiyun changed++;
1140*4882a593Smuzhiyun }
1141*4882a593Smuzhiyun skip:
1142*4882a593Smuzhiyun while (node) {
1143*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
1144*4882a593Smuzhiyun pathmask |= bitmask;
1145*4882a593Smuzhiyun if (node->mark && (leftmask & bitmask) == 0) {
1146*4882a593Smuzhiyun leftmask |= bitmask;
1147*4882a593Smuzhiyun if (node->leftnode == LEAF) {
1148*4882a593Smuzhiyun assert(node->left);
1149*4882a593Smuzhiyun } else if (node->left) {
1150*4882a593Smuzhiyun assert(node->leftnode == NODE);
1151*4882a593Smuzhiyun indent += 1;
1152*4882a593Smuzhiyun node = node->left;
1153*4882a593Smuzhiyun break;
1154*4882a593Smuzhiyun }
1155*4882a593Smuzhiyun }
1156*4882a593Smuzhiyun if (node->mark && (rightmask & bitmask) == 0) {
1157*4882a593Smuzhiyun rightmask |= bitmask;
1158*4882a593Smuzhiyun pathbits |= bitmask;
1159*4882a593Smuzhiyun if (node->rightnode == LEAF) {
1160*4882a593Smuzhiyun assert(node->right);
1161*4882a593Smuzhiyun } else if (node->right) {
1162*4882a593Smuzhiyun assert(node->rightnode == NODE);
1163*4882a593Smuzhiyun indent += 1;
1164*4882a593Smuzhiyun node = node->right;
1165*4882a593Smuzhiyun break;
1166*4882a593Smuzhiyun }
1167*4882a593Smuzhiyun }
1168*4882a593Smuzhiyun leftmask &= ~bitmask;
1169*4882a593Smuzhiyun rightmask &= ~bitmask;
1170*4882a593Smuzhiyun pathmask &= ~bitmask;
1171*4882a593Smuzhiyun pathbits &= ~bitmask;
1172*4882a593Smuzhiyun node = node->parent;
1173*4882a593Smuzhiyun indent -= 1;
1174*4882a593Smuzhiyun }
1175*4882a593Smuzhiyun }
1176*4882a593Smuzhiyun done:
1177*4882a593Smuzhiyun if (verbose > 0)
1178*4882a593Smuzhiyun printf("Found %d changes\n", changed);
1179*4882a593Smuzhiyun return changed;
1180*4882a593Smuzhiyun }
1181*4882a593Smuzhiyun
1182*4882a593Smuzhiyun /*
1183*4882a593Smuzhiyun * Emit a trie for the given tree into the data array.
1184*4882a593Smuzhiyun */
emit(struct tree * tree,unsigned char * data)1185*4882a593Smuzhiyun static void emit(struct tree *tree, unsigned char *data)
1186*4882a593Smuzhiyun {
1187*4882a593Smuzhiyun struct node *node;
1188*4882a593Smuzhiyun unsigned int leftmask;
1189*4882a593Smuzhiyun unsigned int rightmask;
1190*4882a593Smuzhiyun unsigned int bitmask;
1191*4882a593Smuzhiyun int offlen;
1192*4882a593Smuzhiyun int offset;
1193*4882a593Smuzhiyun int index;
1194*4882a593Smuzhiyun int indent;
1195*4882a593Smuzhiyun int size;
1196*4882a593Smuzhiyun int bytes;
1197*4882a593Smuzhiyun int leaves;
1198*4882a593Smuzhiyun int nodes[4];
1199*4882a593Smuzhiyun unsigned char byte;
1200*4882a593Smuzhiyun
1201*4882a593Smuzhiyun nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202*4882a593Smuzhiyun leaves = 0;
1203*4882a593Smuzhiyun bytes = 0;
1204*4882a593Smuzhiyun index = tree->index;
1205*4882a593Smuzhiyun data += index;
1206*4882a593Smuzhiyun indent = 1;
1207*4882a593Smuzhiyun if (verbose > 0)
1208*4882a593Smuzhiyun printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209*4882a593Smuzhiyun if (tree->childnode == LEAF) {
1210*4882a593Smuzhiyun assert(tree->root);
1211*4882a593Smuzhiyun tree->leaf_emit(tree->root, data);
1212*4882a593Smuzhiyun size = tree->leaf_size(tree->root);
1213*4882a593Smuzhiyun index += size;
1214*4882a593Smuzhiyun leaves++;
1215*4882a593Smuzhiyun goto done;
1216*4882a593Smuzhiyun }
1217*4882a593Smuzhiyun
1218*4882a593Smuzhiyun assert(tree->childnode == NODE);
1219*4882a593Smuzhiyun node = tree->root;
1220*4882a593Smuzhiyun leftmask = rightmask = 0;
1221*4882a593Smuzhiyun while (node) {
1222*4882a593Smuzhiyun if (!node->mark)
1223*4882a593Smuzhiyun goto skip;
1224*4882a593Smuzhiyun assert(node->offset != -1);
1225*4882a593Smuzhiyun assert(node->index == index);
1226*4882a593Smuzhiyun
1227*4882a593Smuzhiyun byte = 0;
1228*4882a593Smuzhiyun if (node->nextbyte)
1229*4882a593Smuzhiyun byte |= NEXTBYTE;
1230*4882a593Smuzhiyun byte |= (node->bitnum & BITNUM);
1231*4882a593Smuzhiyun if (node->left && node->right) {
1232*4882a593Smuzhiyun if (node->leftnode == NODE)
1233*4882a593Smuzhiyun byte |= LEFTNODE;
1234*4882a593Smuzhiyun if (node->rightnode == NODE)
1235*4882a593Smuzhiyun byte |= RIGHTNODE;
1236*4882a593Smuzhiyun if (node->offset <= 0xff)
1237*4882a593Smuzhiyun offlen = 1;
1238*4882a593Smuzhiyun else if (node->offset <= 0xffff)
1239*4882a593Smuzhiyun offlen = 2;
1240*4882a593Smuzhiyun else
1241*4882a593Smuzhiyun offlen = 3;
1242*4882a593Smuzhiyun nodes[offlen]++;
1243*4882a593Smuzhiyun offset = node->offset;
1244*4882a593Smuzhiyun byte |= offlen << OFFLEN_SHIFT;
1245*4882a593Smuzhiyun *data++ = byte;
1246*4882a593Smuzhiyun index++;
1247*4882a593Smuzhiyun while (offlen--) {
1248*4882a593Smuzhiyun *data++ = offset & 0xff;
1249*4882a593Smuzhiyun index++;
1250*4882a593Smuzhiyun offset >>= 8;
1251*4882a593Smuzhiyun }
1252*4882a593Smuzhiyun } else if (node->left) {
1253*4882a593Smuzhiyun if (node->leftnode == NODE)
1254*4882a593Smuzhiyun byte |= TRIENODE;
1255*4882a593Smuzhiyun nodes[0]++;
1256*4882a593Smuzhiyun *data++ = byte;
1257*4882a593Smuzhiyun index++;
1258*4882a593Smuzhiyun } else if (node->right) {
1259*4882a593Smuzhiyun byte |= RIGHTNODE;
1260*4882a593Smuzhiyun if (node->rightnode == NODE)
1261*4882a593Smuzhiyun byte |= TRIENODE;
1262*4882a593Smuzhiyun nodes[0]++;
1263*4882a593Smuzhiyun *data++ = byte;
1264*4882a593Smuzhiyun index++;
1265*4882a593Smuzhiyun } else {
1266*4882a593Smuzhiyun assert(0);
1267*4882a593Smuzhiyun }
1268*4882a593Smuzhiyun skip:
1269*4882a593Smuzhiyun while (node) {
1270*4882a593Smuzhiyun bitmask = 1 << node->bitnum;
1271*4882a593Smuzhiyun if (node->mark && (leftmask & bitmask) == 0) {
1272*4882a593Smuzhiyun leftmask |= bitmask;
1273*4882a593Smuzhiyun if (node->leftnode == LEAF) {
1274*4882a593Smuzhiyun assert(node->left);
1275*4882a593Smuzhiyun data = tree->leaf_emit(node->left,
1276*4882a593Smuzhiyun data);
1277*4882a593Smuzhiyun size = tree->leaf_size(node->left);
1278*4882a593Smuzhiyun index += size;
1279*4882a593Smuzhiyun bytes += size;
1280*4882a593Smuzhiyun leaves++;
1281*4882a593Smuzhiyun } else if (node->left) {
1282*4882a593Smuzhiyun assert(node->leftnode == NODE);
1283*4882a593Smuzhiyun indent += 1;
1284*4882a593Smuzhiyun node = node->left;
1285*4882a593Smuzhiyun break;
1286*4882a593Smuzhiyun }
1287*4882a593Smuzhiyun }
1288*4882a593Smuzhiyun if (node->mark && (rightmask & bitmask) == 0) {
1289*4882a593Smuzhiyun rightmask |= bitmask;
1290*4882a593Smuzhiyun if (node->rightnode == LEAF) {
1291*4882a593Smuzhiyun assert(node->right);
1292*4882a593Smuzhiyun data = tree->leaf_emit(node->right,
1293*4882a593Smuzhiyun data);
1294*4882a593Smuzhiyun size = tree->leaf_size(node->right);
1295*4882a593Smuzhiyun index += size;
1296*4882a593Smuzhiyun bytes += size;
1297*4882a593Smuzhiyun leaves++;
1298*4882a593Smuzhiyun } else if (node->right) {
1299*4882a593Smuzhiyun assert(node->rightnode == NODE);
1300*4882a593Smuzhiyun indent += 1;
1301*4882a593Smuzhiyun node = node->right;
1302*4882a593Smuzhiyun break;
1303*4882a593Smuzhiyun }
1304*4882a593Smuzhiyun }
1305*4882a593Smuzhiyun leftmask &= ~bitmask;
1306*4882a593Smuzhiyun rightmask &= ~bitmask;
1307*4882a593Smuzhiyun node = node->parent;
1308*4882a593Smuzhiyun indent -= 1;
1309*4882a593Smuzhiyun }
1310*4882a593Smuzhiyun }
1311*4882a593Smuzhiyun done:
1312*4882a593Smuzhiyun if (verbose > 0) {
1313*4882a593Smuzhiyun printf("Emitted %d (%d) leaves",
1314*4882a593Smuzhiyun leaves, bytes);
1315*4882a593Smuzhiyun printf(" %d (%d+%d+%d+%d) nodes",
1316*4882a593Smuzhiyun nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317*4882a593Smuzhiyun nodes[0], nodes[1], nodes[2], nodes[3]);
1318*4882a593Smuzhiyun printf(" %d total\n", index - tree->index);
1319*4882a593Smuzhiyun }
1320*4882a593Smuzhiyun }
1321*4882a593Smuzhiyun
1322*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
1323*4882a593Smuzhiyun
1324*4882a593Smuzhiyun /*
1325*4882a593Smuzhiyun * Unicode data.
1326*4882a593Smuzhiyun *
1327*4882a593Smuzhiyun * We need to keep track of the Canonical Combining Class, the Age,
1328*4882a593Smuzhiyun * and decompositions for a code point.
1329*4882a593Smuzhiyun *
1330*4882a593Smuzhiyun * For the Age, we store the index into the ages table. Effectively
1331*4882a593Smuzhiyun * this is a generation number that the table maps to a unicode
1332*4882a593Smuzhiyun * version.
1333*4882a593Smuzhiyun *
1334*4882a593Smuzhiyun * The correction field is used to indicate that this entry is in the
1335*4882a593Smuzhiyun * corrections array, which contains decompositions that were
1336*4882a593Smuzhiyun * corrected in later revisions. The value of the correction field is
1337*4882a593Smuzhiyun * the Unicode version in which the mapping was corrected.
1338*4882a593Smuzhiyun */
1339*4882a593Smuzhiyun struct unicode_data {
1340*4882a593Smuzhiyun unsigned int code;
1341*4882a593Smuzhiyun int ccc;
1342*4882a593Smuzhiyun int gen;
1343*4882a593Smuzhiyun int correction;
1344*4882a593Smuzhiyun unsigned int *utf32nfdi;
1345*4882a593Smuzhiyun unsigned int *utf32nfdicf;
1346*4882a593Smuzhiyun char *utf8nfdi;
1347*4882a593Smuzhiyun char *utf8nfdicf;
1348*4882a593Smuzhiyun };
1349*4882a593Smuzhiyun
1350*4882a593Smuzhiyun struct unicode_data unicode_data[0x110000];
1351*4882a593Smuzhiyun struct unicode_data *corrections;
1352*4882a593Smuzhiyun int corrections_count;
1353*4882a593Smuzhiyun
1354*4882a593Smuzhiyun struct tree *nfdi_tree;
1355*4882a593Smuzhiyun struct tree *nfdicf_tree;
1356*4882a593Smuzhiyun
1357*4882a593Smuzhiyun struct tree *trees;
1358*4882a593Smuzhiyun int trees_count;
1359*4882a593Smuzhiyun
1360*4882a593Smuzhiyun /*
1361*4882a593Smuzhiyun * Check the corrections array to see if this entry was corrected at
1362*4882a593Smuzhiyun * some point.
1363*4882a593Smuzhiyun */
corrections_lookup(struct unicode_data * u)1364*4882a593Smuzhiyun static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365*4882a593Smuzhiyun {
1366*4882a593Smuzhiyun int i;
1367*4882a593Smuzhiyun
1368*4882a593Smuzhiyun for (i = 0; i != corrections_count; i++)
1369*4882a593Smuzhiyun if (u->code == corrections[i].code)
1370*4882a593Smuzhiyun return &corrections[i];
1371*4882a593Smuzhiyun return u;
1372*4882a593Smuzhiyun }
1373*4882a593Smuzhiyun
nfdi_equal(void * l,void * r)1374*4882a593Smuzhiyun static int nfdi_equal(void *l, void *r)
1375*4882a593Smuzhiyun {
1376*4882a593Smuzhiyun struct unicode_data *left = l;
1377*4882a593Smuzhiyun struct unicode_data *right = r;
1378*4882a593Smuzhiyun
1379*4882a593Smuzhiyun if (left->gen != right->gen)
1380*4882a593Smuzhiyun return 0;
1381*4882a593Smuzhiyun if (left->ccc != right->ccc)
1382*4882a593Smuzhiyun return 0;
1383*4882a593Smuzhiyun if (left->utf8nfdi && right->utf8nfdi &&
1384*4882a593Smuzhiyun strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385*4882a593Smuzhiyun return 1;
1386*4882a593Smuzhiyun if (left->utf8nfdi || right->utf8nfdi)
1387*4882a593Smuzhiyun return 0;
1388*4882a593Smuzhiyun return 1;
1389*4882a593Smuzhiyun }
1390*4882a593Smuzhiyun
nfdicf_equal(void * l,void * r)1391*4882a593Smuzhiyun static int nfdicf_equal(void *l, void *r)
1392*4882a593Smuzhiyun {
1393*4882a593Smuzhiyun struct unicode_data *left = l;
1394*4882a593Smuzhiyun struct unicode_data *right = r;
1395*4882a593Smuzhiyun
1396*4882a593Smuzhiyun if (left->gen != right->gen)
1397*4882a593Smuzhiyun return 0;
1398*4882a593Smuzhiyun if (left->ccc != right->ccc)
1399*4882a593Smuzhiyun return 0;
1400*4882a593Smuzhiyun if (left->utf8nfdicf && right->utf8nfdicf &&
1401*4882a593Smuzhiyun strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402*4882a593Smuzhiyun return 1;
1403*4882a593Smuzhiyun if (left->utf8nfdicf && right->utf8nfdicf)
1404*4882a593Smuzhiyun return 0;
1405*4882a593Smuzhiyun if (left->utf8nfdicf || right->utf8nfdicf)
1406*4882a593Smuzhiyun return 0;
1407*4882a593Smuzhiyun if (left->utf8nfdi && right->utf8nfdi &&
1408*4882a593Smuzhiyun strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409*4882a593Smuzhiyun return 1;
1410*4882a593Smuzhiyun if (left->utf8nfdi || right->utf8nfdi)
1411*4882a593Smuzhiyun return 0;
1412*4882a593Smuzhiyun return 1;
1413*4882a593Smuzhiyun }
1414*4882a593Smuzhiyun
nfdi_print(void * l,int indent)1415*4882a593Smuzhiyun static void nfdi_print(void *l, int indent)
1416*4882a593Smuzhiyun {
1417*4882a593Smuzhiyun struct unicode_data *leaf = l;
1418*4882a593Smuzhiyun
1419*4882a593Smuzhiyun printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420*4882a593Smuzhiyun leaf->code, leaf->ccc, leaf->gen);
1421*4882a593Smuzhiyun
1422*4882a593Smuzhiyun if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423*4882a593Smuzhiyun printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424*4882a593Smuzhiyun else if (leaf->utf8nfdi)
1425*4882a593Smuzhiyun printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426*4882a593Smuzhiyun
1427*4882a593Smuzhiyun printf("\n");
1428*4882a593Smuzhiyun }
1429*4882a593Smuzhiyun
nfdicf_print(void * l,int indent)1430*4882a593Smuzhiyun static void nfdicf_print(void *l, int indent)
1431*4882a593Smuzhiyun {
1432*4882a593Smuzhiyun struct unicode_data *leaf = l;
1433*4882a593Smuzhiyun
1434*4882a593Smuzhiyun printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435*4882a593Smuzhiyun leaf->code, leaf->ccc, leaf->gen);
1436*4882a593Smuzhiyun
1437*4882a593Smuzhiyun if (leaf->utf8nfdicf)
1438*4882a593Smuzhiyun printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439*4882a593Smuzhiyun else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440*4882a593Smuzhiyun printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441*4882a593Smuzhiyun else if (leaf->utf8nfdi)
1442*4882a593Smuzhiyun printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443*4882a593Smuzhiyun printf("\n");
1444*4882a593Smuzhiyun }
1445*4882a593Smuzhiyun
nfdi_mark(void * l)1446*4882a593Smuzhiyun static int nfdi_mark(void *l)
1447*4882a593Smuzhiyun {
1448*4882a593Smuzhiyun return 1;
1449*4882a593Smuzhiyun }
1450*4882a593Smuzhiyun
nfdicf_mark(void * l)1451*4882a593Smuzhiyun static int nfdicf_mark(void *l)
1452*4882a593Smuzhiyun {
1453*4882a593Smuzhiyun struct unicode_data *leaf = l;
1454*4882a593Smuzhiyun
1455*4882a593Smuzhiyun if (leaf->utf8nfdicf)
1456*4882a593Smuzhiyun return 1;
1457*4882a593Smuzhiyun return 0;
1458*4882a593Smuzhiyun }
1459*4882a593Smuzhiyun
correction_mark(void * l)1460*4882a593Smuzhiyun static int correction_mark(void *l)
1461*4882a593Smuzhiyun {
1462*4882a593Smuzhiyun struct unicode_data *leaf = l;
1463*4882a593Smuzhiyun
1464*4882a593Smuzhiyun return leaf->correction;
1465*4882a593Smuzhiyun }
1466*4882a593Smuzhiyun
nfdi_size(void * l)1467*4882a593Smuzhiyun static int nfdi_size(void *l)
1468*4882a593Smuzhiyun {
1469*4882a593Smuzhiyun struct unicode_data *leaf = l;
1470*4882a593Smuzhiyun int size = 2;
1471*4882a593Smuzhiyun
1472*4882a593Smuzhiyun if (HANGUL_SYLLABLE(leaf->code))
1473*4882a593Smuzhiyun size += 1;
1474*4882a593Smuzhiyun else if (leaf->utf8nfdi)
1475*4882a593Smuzhiyun size += strlen(leaf->utf8nfdi) + 1;
1476*4882a593Smuzhiyun return size;
1477*4882a593Smuzhiyun }
1478*4882a593Smuzhiyun
nfdicf_size(void * l)1479*4882a593Smuzhiyun static int nfdicf_size(void *l)
1480*4882a593Smuzhiyun {
1481*4882a593Smuzhiyun struct unicode_data *leaf = l;
1482*4882a593Smuzhiyun int size = 2;
1483*4882a593Smuzhiyun
1484*4882a593Smuzhiyun if (HANGUL_SYLLABLE(leaf->code))
1485*4882a593Smuzhiyun size += 1;
1486*4882a593Smuzhiyun else if (leaf->utf8nfdicf)
1487*4882a593Smuzhiyun size += strlen(leaf->utf8nfdicf) + 1;
1488*4882a593Smuzhiyun else if (leaf->utf8nfdi)
1489*4882a593Smuzhiyun size += strlen(leaf->utf8nfdi) + 1;
1490*4882a593Smuzhiyun return size;
1491*4882a593Smuzhiyun }
1492*4882a593Smuzhiyun
nfdi_index(struct tree * tree,void * l)1493*4882a593Smuzhiyun static int *nfdi_index(struct tree *tree, void *l)
1494*4882a593Smuzhiyun {
1495*4882a593Smuzhiyun struct unicode_data *leaf = l;
1496*4882a593Smuzhiyun
1497*4882a593Smuzhiyun return &tree->leafindex[leaf->code];
1498*4882a593Smuzhiyun }
1499*4882a593Smuzhiyun
nfdicf_index(struct tree * tree,void * l)1500*4882a593Smuzhiyun static int *nfdicf_index(struct tree *tree, void *l)
1501*4882a593Smuzhiyun {
1502*4882a593Smuzhiyun struct unicode_data *leaf = l;
1503*4882a593Smuzhiyun
1504*4882a593Smuzhiyun return &tree->leafindex[leaf->code];
1505*4882a593Smuzhiyun }
1506*4882a593Smuzhiyun
nfdi_emit(void * l,unsigned char * data)1507*4882a593Smuzhiyun static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508*4882a593Smuzhiyun {
1509*4882a593Smuzhiyun struct unicode_data *leaf = l;
1510*4882a593Smuzhiyun unsigned char *s;
1511*4882a593Smuzhiyun
1512*4882a593Smuzhiyun *data++ = leaf->gen;
1513*4882a593Smuzhiyun
1514*4882a593Smuzhiyun if (HANGUL_SYLLABLE(leaf->code)) {
1515*4882a593Smuzhiyun *data++ = DECOMPOSE;
1516*4882a593Smuzhiyun *data++ = HANGUL;
1517*4882a593Smuzhiyun } else if (leaf->utf8nfdi) {
1518*4882a593Smuzhiyun *data++ = DECOMPOSE;
1519*4882a593Smuzhiyun s = (unsigned char*)leaf->utf8nfdi;
1520*4882a593Smuzhiyun while ((*data++ = *s++) != 0)
1521*4882a593Smuzhiyun ;
1522*4882a593Smuzhiyun } else {
1523*4882a593Smuzhiyun *data++ = leaf->ccc;
1524*4882a593Smuzhiyun }
1525*4882a593Smuzhiyun return data;
1526*4882a593Smuzhiyun }
1527*4882a593Smuzhiyun
nfdicf_emit(void * l,unsigned char * data)1528*4882a593Smuzhiyun static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529*4882a593Smuzhiyun {
1530*4882a593Smuzhiyun struct unicode_data *leaf = l;
1531*4882a593Smuzhiyun unsigned char *s;
1532*4882a593Smuzhiyun
1533*4882a593Smuzhiyun *data++ = leaf->gen;
1534*4882a593Smuzhiyun
1535*4882a593Smuzhiyun if (HANGUL_SYLLABLE(leaf->code)) {
1536*4882a593Smuzhiyun *data++ = DECOMPOSE;
1537*4882a593Smuzhiyun *data++ = HANGUL;
1538*4882a593Smuzhiyun } else if (leaf->utf8nfdicf) {
1539*4882a593Smuzhiyun *data++ = DECOMPOSE;
1540*4882a593Smuzhiyun s = (unsigned char*)leaf->utf8nfdicf;
1541*4882a593Smuzhiyun while ((*data++ = *s++) != 0)
1542*4882a593Smuzhiyun ;
1543*4882a593Smuzhiyun } else if (leaf->utf8nfdi) {
1544*4882a593Smuzhiyun *data++ = DECOMPOSE;
1545*4882a593Smuzhiyun s = (unsigned char*)leaf->utf8nfdi;
1546*4882a593Smuzhiyun while ((*data++ = *s++) != 0)
1547*4882a593Smuzhiyun ;
1548*4882a593Smuzhiyun } else {
1549*4882a593Smuzhiyun *data++ = leaf->ccc;
1550*4882a593Smuzhiyun }
1551*4882a593Smuzhiyun return data;
1552*4882a593Smuzhiyun }
1553*4882a593Smuzhiyun
utf8_create(struct unicode_data * data)1554*4882a593Smuzhiyun static void utf8_create(struct unicode_data *data)
1555*4882a593Smuzhiyun {
1556*4882a593Smuzhiyun char utf[18*4+1];
1557*4882a593Smuzhiyun char *u;
1558*4882a593Smuzhiyun unsigned int *um;
1559*4882a593Smuzhiyun int i;
1560*4882a593Smuzhiyun
1561*4882a593Smuzhiyun if (data->utf8nfdi) {
1562*4882a593Smuzhiyun assert(data->utf8nfdi[0] == HANGUL);
1563*4882a593Smuzhiyun return;
1564*4882a593Smuzhiyun }
1565*4882a593Smuzhiyun
1566*4882a593Smuzhiyun u = utf;
1567*4882a593Smuzhiyun um = data->utf32nfdi;
1568*4882a593Smuzhiyun if (um) {
1569*4882a593Smuzhiyun for (i = 0; um[i]; i++)
1570*4882a593Smuzhiyun u += utf8encode(u, um[i]);
1571*4882a593Smuzhiyun *u = '\0';
1572*4882a593Smuzhiyun data->utf8nfdi = strdup(utf);
1573*4882a593Smuzhiyun }
1574*4882a593Smuzhiyun u = utf;
1575*4882a593Smuzhiyun um = data->utf32nfdicf;
1576*4882a593Smuzhiyun if (um) {
1577*4882a593Smuzhiyun for (i = 0; um[i]; i++)
1578*4882a593Smuzhiyun u += utf8encode(u, um[i]);
1579*4882a593Smuzhiyun *u = '\0';
1580*4882a593Smuzhiyun if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581*4882a593Smuzhiyun data->utf8nfdicf = strdup(utf);
1582*4882a593Smuzhiyun }
1583*4882a593Smuzhiyun }
1584*4882a593Smuzhiyun
utf8_init(void)1585*4882a593Smuzhiyun static void utf8_init(void)
1586*4882a593Smuzhiyun {
1587*4882a593Smuzhiyun unsigned int unichar;
1588*4882a593Smuzhiyun int i;
1589*4882a593Smuzhiyun
1590*4882a593Smuzhiyun for (unichar = 0; unichar != 0x110000; unichar++)
1591*4882a593Smuzhiyun utf8_create(&unicode_data[unichar]);
1592*4882a593Smuzhiyun
1593*4882a593Smuzhiyun for (i = 0; i != corrections_count; i++)
1594*4882a593Smuzhiyun utf8_create(&corrections[i]);
1595*4882a593Smuzhiyun }
1596*4882a593Smuzhiyun
trees_init(void)1597*4882a593Smuzhiyun static void trees_init(void)
1598*4882a593Smuzhiyun {
1599*4882a593Smuzhiyun struct unicode_data *data;
1600*4882a593Smuzhiyun unsigned int maxage;
1601*4882a593Smuzhiyun unsigned int nextage;
1602*4882a593Smuzhiyun int count;
1603*4882a593Smuzhiyun int i;
1604*4882a593Smuzhiyun int j;
1605*4882a593Smuzhiyun
1606*4882a593Smuzhiyun /* Count the number of different ages. */
1607*4882a593Smuzhiyun count = 0;
1608*4882a593Smuzhiyun nextage = (unsigned int)-1;
1609*4882a593Smuzhiyun do {
1610*4882a593Smuzhiyun maxage = nextage;
1611*4882a593Smuzhiyun nextage = 0;
1612*4882a593Smuzhiyun for (i = 0; i <= corrections_count; i++) {
1613*4882a593Smuzhiyun data = &corrections[i];
1614*4882a593Smuzhiyun if (nextage < data->correction &&
1615*4882a593Smuzhiyun data->correction < maxage)
1616*4882a593Smuzhiyun nextage = data->correction;
1617*4882a593Smuzhiyun }
1618*4882a593Smuzhiyun count++;
1619*4882a593Smuzhiyun } while (nextage);
1620*4882a593Smuzhiyun
1621*4882a593Smuzhiyun /* Two trees per age: nfdi and nfdicf */
1622*4882a593Smuzhiyun trees_count = count * 2;
1623*4882a593Smuzhiyun trees = calloc(trees_count, sizeof(struct tree));
1624*4882a593Smuzhiyun
1625*4882a593Smuzhiyun /* Assign ages to the trees. */
1626*4882a593Smuzhiyun count = trees_count;
1627*4882a593Smuzhiyun nextage = (unsigned int)-1;
1628*4882a593Smuzhiyun do {
1629*4882a593Smuzhiyun maxage = nextage;
1630*4882a593Smuzhiyun trees[--count].maxage = maxage;
1631*4882a593Smuzhiyun trees[--count].maxage = maxage;
1632*4882a593Smuzhiyun nextage = 0;
1633*4882a593Smuzhiyun for (i = 0; i <= corrections_count; i++) {
1634*4882a593Smuzhiyun data = &corrections[i];
1635*4882a593Smuzhiyun if (nextage < data->correction &&
1636*4882a593Smuzhiyun data->correction < maxage)
1637*4882a593Smuzhiyun nextage = data->correction;
1638*4882a593Smuzhiyun }
1639*4882a593Smuzhiyun } while (nextage);
1640*4882a593Smuzhiyun
1641*4882a593Smuzhiyun /* The ages assigned above are off by one. */
1642*4882a593Smuzhiyun for (i = 0; i != trees_count; i++) {
1643*4882a593Smuzhiyun j = 0;
1644*4882a593Smuzhiyun while (ages[j] < trees[i].maxage)
1645*4882a593Smuzhiyun j++;
1646*4882a593Smuzhiyun trees[i].maxage = ages[j-1];
1647*4882a593Smuzhiyun }
1648*4882a593Smuzhiyun
1649*4882a593Smuzhiyun /* Set up the forwarding between trees. */
1650*4882a593Smuzhiyun trees[trees_count-2].next = &trees[trees_count-1];
1651*4882a593Smuzhiyun trees[trees_count-1].leaf_mark = nfdi_mark;
1652*4882a593Smuzhiyun trees[trees_count-2].leaf_mark = nfdicf_mark;
1653*4882a593Smuzhiyun for (i = 0; i != trees_count-2; i += 2) {
1654*4882a593Smuzhiyun trees[i].next = &trees[trees_count-2];
1655*4882a593Smuzhiyun trees[i].leaf_mark = correction_mark;
1656*4882a593Smuzhiyun trees[i+1].next = &trees[trees_count-1];
1657*4882a593Smuzhiyun trees[i+1].leaf_mark = correction_mark;
1658*4882a593Smuzhiyun }
1659*4882a593Smuzhiyun
1660*4882a593Smuzhiyun /* Assign the callouts. */
1661*4882a593Smuzhiyun for (i = 0; i != trees_count; i += 2) {
1662*4882a593Smuzhiyun trees[i].type = "nfdicf";
1663*4882a593Smuzhiyun trees[i].leaf_equal = nfdicf_equal;
1664*4882a593Smuzhiyun trees[i].leaf_print = nfdicf_print;
1665*4882a593Smuzhiyun trees[i].leaf_size = nfdicf_size;
1666*4882a593Smuzhiyun trees[i].leaf_index = nfdicf_index;
1667*4882a593Smuzhiyun trees[i].leaf_emit = nfdicf_emit;
1668*4882a593Smuzhiyun
1669*4882a593Smuzhiyun trees[i+1].type = "nfdi";
1670*4882a593Smuzhiyun trees[i+1].leaf_equal = nfdi_equal;
1671*4882a593Smuzhiyun trees[i+1].leaf_print = nfdi_print;
1672*4882a593Smuzhiyun trees[i+1].leaf_size = nfdi_size;
1673*4882a593Smuzhiyun trees[i+1].leaf_index = nfdi_index;
1674*4882a593Smuzhiyun trees[i+1].leaf_emit = nfdi_emit;
1675*4882a593Smuzhiyun }
1676*4882a593Smuzhiyun
1677*4882a593Smuzhiyun /* Finish init. */
1678*4882a593Smuzhiyun for (i = 0; i != trees_count; i++)
1679*4882a593Smuzhiyun trees[i].childnode = NODE;
1680*4882a593Smuzhiyun }
1681*4882a593Smuzhiyun
trees_populate(void)1682*4882a593Smuzhiyun static void trees_populate(void)
1683*4882a593Smuzhiyun {
1684*4882a593Smuzhiyun struct unicode_data *data;
1685*4882a593Smuzhiyun unsigned int unichar;
1686*4882a593Smuzhiyun char keyval[4];
1687*4882a593Smuzhiyun int keylen;
1688*4882a593Smuzhiyun int i;
1689*4882a593Smuzhiyun
1690*4882a593Smuzhiyun for (i = 0; i != trees_count; i++) {
1691*4882a593Smuzhiyun if (verbose > 0) {
1692*4882a593Smuzhiyun printf("Populating %s_%x\n",
1693*4882a593Smuzhiyun trees[i].type, trees[i].maxage);
1694*4882a593Smuzhiyun }
1695*4882a593Smuzhiyun for (unichar = 0; unichar != 0x110000; unichar++) {
1696*4882a593Smuzhiyun if (unicode_data[unichar].gen < 0)
1697*4882a593Smuzhiyun continue;
1698*4882a593Smuzhiyun keylen = utf8encode(keyval, unichar);
1699*4882a593Smuzhiyun data = corrections_lookup(&unicode_data[unichar]);
1700*4882a593Smuzhiyun if (data->correction <= trees[i].maxage)
1701*4882a593Smuzhiyun data = &unicode_data[unichar];
1702*4882a593Smuzhiyun insert(&trees[i], keyval, keylen, data);
1703*4882a593Smuzhiyun }
1704*4882a593Smuzhiyun }
1705*4882a593Smuzhiyun }
1706*4882a593Smuzhiyun
trees_reduce(void)1707*4882a593Smuzhiyun static void trees_reduce(void)
1708*4882a593Smuzhiyun {
1709*4882a593Smuzhiyun int i;
1710*4882a593Smuzhiyun int size;
1711*4882a593Smuzhiyun int changed;
1712*4882a593Smuzhiyun
1713*4882a593Smuzhiyun for (i = 0; i != trees_count; i++)
1714*4882a593Smuzhiyun prune(&trees[i]);
1715*4882a593Smuzhiyun for (i = 0; i != trees_count; i++)
1716*4882a593Smuzhiyun mark_nodes(&trees[i]);
1717*4882a593Smuzhiyun do {
1718*4882a593Smuzhiyun size = 0;
1719*4882a593Smuzhiyun for (i = 0; i != trees_count; i++)
1720*4882a593Smuzhiyun size = index_nodes(&trees[i], size);
1721*4882a593Smuzhiyun changed = 0;
1722*4882a593Smuzhiyun for (i = 0; i != trees_count; i++)
1723*4882a593Smuzhiyun changed += size_nodes(&trees[i]);
1724*4882a593Smuzhiyun } while (changed);
1725*4882a593Smuzhiyun
1726*4882a593Smuzhiyun utf8data = calloc(size, 1);
1727*4882a593Smuzhiyun utf8data_size = size;
1728*4882a593Smuzhiyun for (i = 0; i != trees_count; i++)
1729*4882a593Smuzhiyun emit(&trees[i], utf8data);
1730*4882a593Smuzhiyun
1731*4882a593Smuzhiyun if (verbose > 0) {
1732*4882a593Smuzhiyun for (i = 0; i != trees_count; i++) {
1733*4882a593Smuzhiyun printf("%s_%x idx %d\n",
1734*4882a593Smuzhiyun trees[i].type, trees[i].maxage, trees[i].index);
1735*4882a593Smuzhiyun }
1736*4882a593Smuzhiyun }
1737*4882a593Smuzhiyun
1738*4882a593Smuzhiyun nfdi = utf8data + trees[trees_count-1].index;
1739*4882a593Smuzhiyun nfdicf = utf8data + trees[trees_count-2].index;
1740*4882a593Smuzhiyun
1741*4882a593Smuzhiyun nfdi_tree = &trees[trees_count-1];
1742*4882a593Smuzhiyun nfdicf_tree = &trees[trees_count-2];
1743*4882a593Smuzhiyun }
1744*4882a593Smuzhiyun
verify(struct tree * tree)1745*4882a593Smuzhiyun static void verify(struct tree *tree)
1746*4882a593Smuzhiyun {
1747*4882a593Smuzhiyun struct unicode_data *data;
1748*4882a593Smuzhiyun utf8leaf_t *leaf;
1749*4882a593Smuzhiyun unsigned int unichar;
1750*4882a593Smuzhiyun char key[4];
1751*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
1752*4882a593Smuzhiyun int report;
1753*4882a593Smuzhiyun int nocf;
1754*4882a593Smuzhiyun
1755*4882a593Smuzhiyun if (verbose > 0)
1756*4882a593Smuzhiyun printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757*4882a593Smuzhiyun nocf = strcmp(tree->type, "nfdicf");
1758*4882a593Smuzhiyun
1759*4882a593Smuzhiyun for (unichar = 0; unichar != 0x110000; unichar++) {
1760*4882a593Smuzhiyun report = 0;
1761*4882a593Smuzhiyun data = corrections_lookup(&unicode_data[unichar]);
1762*4882a593Smuzhiyun if (data->correction <= tree->maxage)
1763*4882a593Smuzhiyun data = &unicode_data[unichar];
1764*4882a593Smuzhiyun utf8encode(key,unichar);
1765*4882a593Smuzhiyun leaf = utf8lookup(tree, hangul, key);
1766*4882a593Smuzhiyun
1767*4882a593Smuzhiyun if (!leaf) {
1768*4882a593Smuzhiyun if (data->gen != -1)
1769*4882a593Smuzhiyun report++;
1770*4882a593Smuzhiyun if (unichar < 0xd800 || unichar > 0xdfff)
1771*4882a593Smuzhiyun report++;
1772*4882a593Smuzhiyun } else {
1773*4882a593Smuzhiyun if (unichar >= 0xd800 && unichar <= 0xdfff)
1774*4882a593Smuzhiyun report++;
1775*4882a593Smuzhiyun if (data->gen == -1)
1776*4882a593Smuzhiyun report++;
1777*4882a593Smuzhiyun if (data->gen != LEAF_GEN(leaf))
1778*4882a593Smuzhiyun report++;
1779*4882a593Smuzhiyun if (LEAF_CCC(leaf) == DECOMPOSE) {
1780*4882a593Smuzhiyun if (HANGUL_SYLLABLE(data->code)) {
1781*4882a593Smuzhiyun if (data->utf8nfdi[0] != HANGUL)
1782*4882a593Smuzhiyun report++;
1783*4882a593Smuzhiyun } else if (nocf) {
1784*4882a593Smuzhiyun if (!data->utf8nfdi) {
1785*4882a593Smuzhiyun report++;
1786*4882a593Smuzhiyun } else if (strcmp(data->utf8nfdi,
1787*4882a593Smuzhiyun LEAF_STR(leaf))) {
1788*4882a593Smuzhiyun report++;
1789*4882a593Smuzhiyun }
1790*4882a593Smuzhiyun } else {
1791*4882a593Smuzhiyun if (!data->utf8nfdicf &&
1792*4882a593Smuzhiyun !data->utf8nfdi) {
1793*4882a593Smuzhiyun report++;
1794*4882a593Smuzhiyun } else if (data->utf8nfdicf) {
1795*4882a593Smuzhiyun if (strcmp(data->utf8nfdicf,
1796*4882a593Smuzhiyun LEAF_STR(leaf)))
1797*4882a593Smuzhiyun report++;
1798*4882a593Smuzhiyun } else if (strcmp(data->utf8nfdi,
1799*4882a593Smuzhiyun LEAF_STR(leaf))) {
1800*4882a593Smuzhiyun report++;
1801*4882a593Smuzhiyun }
1802*4882a593Smuzhiyun }
1803*4882a593Smuzhiyun } else if (data->ccc != LEAF_CCC(leaf)) {
1804*4882a593Smuzhiyun report++;
1805*4882a593Smuzhiyun }
1806*4882a593Smuzhiyun }
1807*4882a593Smuzhiyun if (report) {
1808*4882a593Smuzhiyun printf("%X code %X gen %d ccc %d"
1809*4882a593Smuzhiyun " nfdi -> \"%s\"",
1810*4882a593Smuzhiyun unichar, data->code, data->gen,
1811*4882a593Smuzhiyun data->ccc,
1812*4882a593Smuzhiyun data->utf8nfdi);
1813*4882a593Smuzhiyun if (leaf) {
1814*4882a593Smuzhiyun printf(" gen %d ccc %d"
1815*4882a593Smuzhiyun " nfdi -> \"%s\"",
1816*4882a593Smuzhiyun LEAF_GEN(leaf),
1817*4882a593Smuzhiyun LEAF_CCC(leaf),
1818*4882a593Smuzhiyun LEAF_CCC(leaf) == DECOMPOSE ?
1819*4882a593Smuzhiyun LEAF_STR(leaf) : "");
1820*4882a593Smuzhiyun }
1821*4882a593Smuzhiyun printf("\n");
1822*4882a593Smuzhiyun }
1823*4882a593Smuzhiyun }
1824*4882a593Smuzhiyun }
1825*4882a593Smuzhiyun
trees_verify(void)1826*4882a593Smuzhiyun static void trees_verify(void)
1827*4882a593Smuzhiyun {
1828*4882a593Smuzhiyun int i;
1829*4882a593Smuzhiyun
1830*4882a593Smuzhiyun for (i = 0; i != trees_count; i++)
1831*4882a593Smuzhiyun verify(&trees[i]);
1832*4882a593Smuzhiyun }
1833*4882a593Smuzhiyun
1834*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
1835*4882a593Smuzhiyun
help(void)1836*4882a593Smuzhiyun static void help(void)
1837*4882a593Smuzhiyun {
1838*4882a593Smuzhiyun printf("Usage: %s [options]\n", argv0);
1839*4882a593Smuzhiyun printf("\n");
1840*4882a593Smuzhiyun printf("This program creates an a data trie used for parsing and\n");
1841*4882a593Smuzhiyun printf("normalization of UTF-8 strings. The trie is derived from\n");
1842*4882a593Smuzhiyun printf("a set of input files from the Unicode character database\n");
1843*4882a593Smuzhiyun printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844*4882a593Smuzhiyun printf("\n");
1845*4882a593Smuzhiyun printf("The generated tree supports two normalization forms:\n");
1846*4882a593Smuzhiyun printf("\n");
1847*4882a593Smuzhiyun printf("\tnfdi:\n");
1848*4882a593Smuzhiyun printf("\t- Apply unicode normalization form NFD.\n");
1849*4882a593Smuzhiyun printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850*4882a593Smuzhiyun printf("\n");
1851*4882a593Smuzhiyun printf("\tnfdicf:\n");
1852*4882a593Smuzhiyun printf("\t- Apply unicode normalization form NFD.\n");
1853*4882a593Smuzhiyun printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854*4882a593Smuzhiyun printf("\t- Apply a full casefold (C + F).\n");
1855*4882a593Smuzhiyun printf("\n");
1856*4882a593Smuzhiyun printf("These forms were chosen as being most useful when dealing\n");
1857*4882a593Smuzhiyun printf("with file names: NFD catches most cases where characters\n");
1858*4882a593Smuzhiyun printf("should be considered equivalent. The ignorables are mostly\n");
1859*4882a593Smuzhiyun printf("invisible, making names hard to type.\n");
1860*4882a593Smuzhiyun printf("\n");
1861*4882a593Smuzhiyun printf("The options to specify the files to be used are listed\n");
1862*4882a593Smuzhiyun printf("below with their default values, which are the names used\n");
1863*4882a593Smuzhiyun printf("by version 11.0.0 of the Unicode Character Database.\n");
1864*4882a593Smuzhiyun printf("\n");
1865*4882a593Smuzhiyun printf("The input files:\n");
1866*4882a593Smuzhiyun printf("\t-a %s\n", AGE_NAME);
1867*4882a593Smuzhiyun printf("\t-c %s\n", CCC_NAME);
1868*4882a593Smuzhiyun printf("\t-p %s\n", PROP_NAME);
1869*4882a593Smuzhiyun printf("\t-d %s\n", DATA_NAME);
1870*4882a593Smuzhiyun printf("\t-f %s\n", FOLD_NAME);
1871*4882a593Smuzhiyun printf("\t-n %s\n", NORM_NAME);
1872*4882a593Smuzhiyun printf("\n");
1873*4882a593Smuzhiyun printf("Additionally, the generated tables are tested using:\n");
1874*4882a593Smuzhiyun printf("\t-t %s\n", TEST_NAME);
1875*4882a593Smuzhiyun printf("\n");
1876*4882a593Smuzhiyun printf("Finally, the output file:\n");
1877*4882a593Smuzhiyun printf("\t-o %s\n", UTF8_NAME);
1878*4882a593Smuzhiyun printf("\n");
1879*4882a593Smuzhiyun }
1880*4882a593Smuzhiyun
usage(void)1881*4882a593Smuzhiyun static void usage(void)
1882*4882a593Smuzhiyun {
1883*4882a593Smuzhiyun help();
1884*4882a593Smuzhiyun exit(1);
1885*4882a593Smuzhiyun }
1886*4882a593Smuzhiyun
open_fail(const char * name,int error)1887*4882a593Smuzhiyun static void open_fail(const char *name, int error)
1888*4882a593Smuzhiyun {
1889*4882a593Smuzhiyun printf("Error %d opening %s: %s\n", error, name, strerror(error));
1890*4882a593Smuzhiyun exit(1);
1891*4882a593Smuzhiyun }
1892*4882a593Smuzhiyun
file_fail(const char * filename)1893*4882a593Smuzhiyun static void file_fail(const char *filename)
1894*4882a593Smuzhiyun {
1895*4882a593Smuzhiyun printf("Error parsing %s\n", filename);
1896*4882a593Smuzhiyun exit(1);
1897*4882a593Smuzhiyun }
1898*4882a593Smuzhiyun
line_fail(const char * filename,const char * line)1899*4882a593Smuzhiyun static void line_fail(const char *filename, const char *line)
1900*4882a593Smuzhiyun {
1901*4882a593Smuzhiyun printf("Error parsing %s:%s\n", filename, line);
1902*4882a593Smuzhiyun exit(1);
1903*4882a593Smuzhiyun }
1904*4882a593Smuzhiyun
1905*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
1906*4882a593Smuzhiyun
print_utf32(unsigned int * utf32str)1907*4882a593Smuzhiyun static void print_utf32(unsigned int *utf32str)
1908*4882a593Smuzhiyun {
1909*4882a593Smuzhiyun int i;
1910*4882a593Smuzhiyun
1911*4882a593Smuzhiyun for (i = 0; utf32str[i]; i++)
1912*4882a593Smuzhiyun printf(" %X", utf32str[i]);
1913*4882a593Smuzhiyun }
1914*4882a593Smuzhiyun
print_utf32nfdi(unsigned int unichar)1915*4882a593Smuzhiyun static void print_utf32nfdi(unsigned int unichar)
1916*4882a593Smuzhiyun {
1917*4882a593Smuzhiyun printf(" %X ->", unichar);
1918*4882a593Smuzhiyun print_utf32(unicode_data[unichar].utf32nfdi);
1919*4882a593Smuzhiyun printf("\n");
1920*4882a593Smuzhiyun }
1921*4882a593Smuzhiyun
print_utf32nfdicf(unsigned int unichar)1922*4882a593Smuzhiyun static void print_utf32nfdicf(unsigned int unichar)
1923*4882a593Smuzhiyun {
1924*4882a593Smuzhiyun printf(" %X ->", unichar);
1925*4882a593Smuzhiyun print_utf32(unicode_data[unichar].utf32nfdicf);
1926*4882a593Smuzhiyun printf("\n");
1927*4882a593Smuzhiyun }
1928*4882a593Smuzhiyun
1929*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
1930*4882a593Smuzhiyun
age_init(void)1931*4882a593Smuzhiyun static void age_init(void)
1932*4882a593Smuzhiyun {
1933*4882a593Smuzhiyun FILE *file;
1934*4882a593Smuzhiyun unsigned int first;
1935*4882a593Smuzhiyun unsigned int last;
1936*4882a593Smuzhiyun unsigned int unichar;
1937*4882a593Smuzhiyun unsigned int major;
1938*4882a593Smuzhiyun unsigned int minor;
1939*4882a593Smuzhiyun unsigned int revision;
1940*4882a593Smuzhiyun int gen;
1941*4882a593Smuzhiyun int count;
1942*4882a593Smuzhiyun int ret;
1943*4882a593Smuzhiyun
1944*4882a593Smuzhiyun if (verbose > 0)
1945*4882a593Smuzhiyun printf("Parsing %s\n", age_name);
1946*4882a593Smuzhiyun
1947*4882a593Smuzhiyun file = fopen(age_name, "r");
1948*4882a593Smuzhiyun if (!file)
1949*4882a593Smuzhiyun open_fail(age_name, errno);
1950*4882a593Smuzhiyun count = 0;
1951*4882a593Smuzhiyun
1952*4882a593Smuzhiyun gen = 0;
1953*4882a593Smuzhiyun while (fgets(line, LINESIZE, file)) {
1954*4882a593Smuzhiyun ret = sscanf(line, "# Age=V%d_%d_%d",
1955*4882a593Smuzhiyun &major, &minor, &revision);
1956*4882a593Smuzhiyun if (ret == 3) {
1957*4882a593Smuzhiyun ages_count++;
1958*4882a593Smuzhiyun if (verbose > 1)
1959*4882a593Smuzhiyun printf(" Age V%d_%d_%d\n",
1960*4882a593Smuzhiyun major, minor, revision);
1961*4882a593Smuzhiyun if (!age_valid(major, minor, revision))
1962*4882a593Smuzhiyun line_fail(age_name, line);
1963*4882a593Smuzhiyun continue;
1964*4882a593Smuzhiyun }
1965*4882a593Smuzhiyun ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966*4882a593Smuzhiyun if (ret == 2) {
1967*4882a593Smuzhiyun ages_count++;
1968*4882a593Smuzhiyun if (verbose > 1)
1969*4882a593Smuzhiyun printf(" Age V%d_%d\n", major, minor);
1970*4882a593Smuzhiyun if (!age_valid(major, minor, 0))
1971*4882a593Smuzhiyun line_fail(age_name, line);
1972*4882a593Smuzhiyun continue;
1973*4882a593Smuzhiyun }
1974*4882a593Smuzhiyun }
1975*4882a593Smuzhiyun
1976*4882a593Smuzhiyun /* We must have found something above. */
1977*4882a593Smuzhiyun if (verbose > 1)
1978*4882a593Smuzhiyun printf("%d age entries\n", ages_count);
1979*4882a593Smuzhiyun if (ages_count == 0 || ages_count > MAXGEN)
1980*4882a593Smuzhiyun file_fail(age_name);
1981*4882a593Smuzhiyun
1982*4882a593Smuzhiyun /* There is a 0 entry. */
1983*4882a593Smuzhiyun ages_count++;
1984*4882a593Smuzhiyun ages = calloc(ages_count + 1, sizeof(*ages));
1985*4882a593Smuzhiyun /* And a guard entry. */
1986*4882a593Smuzhiyun ages[ages_count] = (unsigned int)-1;
1987*4882a593Smuzhiyun
1988*4882a593Smuzhiyun rewind(file);
1989*4882a593Smuzhiyun count = 0;
1990*4882a593Smuzhiyun gen = 0;
1991*4882a593Smuzhiyun while (fgets(line, LINESIZE, file)) {
1992*4882a593Smuzhiyun ret = sscanf(line, "# Age=V%d_%d_%d",
1993*4882a593Smuzhiyun &major, &minor, &revision);
1994*4882a593Smuzhiyun if (ret == 3) {
1995*4882a593Smuzhiyun ages[++gen] =
1996*4882a593Smuzhiyun UNICODE_AGE(major, minor, revision);
1997*4882a593Smuzhiyun if (verbose > 1)
1998*4882a593Smuzhiyun printf(" Age V%d_%d_%d = gen %d\n",
1999*4882a593Smuzhiyun major, minor, revision, gen);
2000*4882a593Smuzhiyun if (!age_valid(major, minor, revision))
2001*4882a593Smuzhiyun line_fail(age_name, line);
2002*4882a593Smuzhiyun continue;
2003*4882a593Smuzhiyun }
2004*4882a593Smuzhiyun ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005*4882a593Smuzhiyun if (ret == 2) {
2006*4882a593Smuzhiyun ages[++gen] = UNICODE_AGE(major, minor, 0);
2007*4882a593Smuzhiyun if (verbose > 1)
2008*4882a593Smuzhiyun printf(" Age V%d_%d = %d\n",
2009*4882a593Smuzhiyun major, minor, gen);
2010*4882a593Smuzhiyun if (!age_valid(major, minor, 0))
2011*4882a593Smuzhiyun line_fail(age_name, line);
2012*4882a593Smuzhiyun continue;
2013*4882a593Smuzhiyun }
2014*4882a593Smuzhiyun ret = sscanf(line, "%X..%X ; %d.%d #",
2015*4882a593Smuzhiyun &first, &last, &major, &minor);
2016*4882a593Smuzhiyun if (ret == 4) {
2017*4882a593Smuzhiyun for (unichar = first; unichar <= last; unichar++)
2018*4882a593Smuzhiyun unicode_data[unichar].gen = gen;
2019*4882a593Smuzhiyun count += 1 + last - first;
2020*4882a593Smuzhiyun if (verbose > 1)
2021*4882a593Smuzhiyun printf(" %X..%X gen %d\n", first, last, gen);
2022*4882a593Smuzhiyun if (!utf32valid(first) || !utf32valid(last))
2023*4882a593Smuzhiyun line_fail(age_name, line);
2024*4882a593Smuzhiyun continue;
2025*4882a593Smuzhiyun }
2026*4882a593Smuzhiyun ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027*4882a593Smuzhiyun if (ret == 3) {
2028*4882a593Smuzhiyun unicode_data[unichar].gen = gen;
2029*4882a593Smuzhiyun count++;
2030*4882a593Smuzhiyun if (verbose > 1)
2031*4882a593Smuzhiyun printf(" %X gen %d\n", unichar, gen);
2032*4882a593Smuzhiyun if (!utf32valid(unichar))
2033*4882a593Smuzhiyun line_fail(age_name, line);
2034*4882a593Smuzhiyun continue;
2035*4882a593Smuzhiyun }
2036*4882a593Smuzhiyun }
2037*4882a593Smuzhiyun unicode_maxage = ages[gen];
2038*4882a593Smuzhiyun fclose(file);
2039*4882a593Smuzhiyun
2040*4882a593Smuzhiyun /* Nix surrogate block */
2041*4882a593Smuzhiyun if (verbose > 1)
2042*4882a593Smuzhiyun printf(" Removing surrogate block D800..DFFF\n");
2043*4882a593Smuzhiyun for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044*4882a593Smuzhiyun unicode_data[unichar].gen = -1;
2045*4882a593Smuzhiyun
2046*4882a593Smuzhiyun if (verbose > 0)
2047*4882a593Smuzhiyun printf("Found %d entries\n", count);
2048*4882a593Smuzhiyun if (count == 0)
2049*4882a593Smuzhiyun file_fail(age_name);
2050*4882a593Smuzhiyun }
2051*4882a593Smuzhiyun
ccc_init(void)2052*4882a593Smuzhiyun static void ccc_init(void)
2053*4882a593Smuzhiyun {
2054*4882a593Smuzhiyun FILE *file;
2055*4882a593Smuzhiyun unsigned int first;
2056*4882a593Smuzhiyun unsigned int last;
2057*4882a593Smuzhiyun unsigned int unichar;
2058*4882a593Smuzhiyun unsigned int value;
2059*4882a593Smuzhiyun int count;
2060*4882a593Smuzhiyun int ret;
2061*4882a593Smuzhiyun
2062*4882a593Smuzhiyun if (verbose > 0)
2063*4882a593Smuzhiyun printf("Parsing %s\n", ccc_name);
2064*4882a593Smuzhiyun
2065*4882a593Smuzhiyun file = fopen(ccc_name, "r");
2066*4882a593Smuzhiyun if (!file)
2067*4882a593Smuzhiyun open_fail(ccc_name, errno);
2068*4882a593Smuzhiyun
2069*4882a593Smuzhiyun count = 0;
2070*4882a593Smuzhiyun while (fgets(line, LINESIZE, file)) {
2071*4882a593Smuzhiyun ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072*4882a593Smuzhiyun if (ret == 3) {
2073*4882a593Smuzhiyun for (unichar = first; unichar <= last; unichar++) {
2074*4882a593Smuzhiyun unicode_data[unichar].ccc = value;
2075*4882a593Smuzhiyun count++;
2076*4882a593Smuzhiyun }
2077*4882a593Smuzhiyun if (verbose > 1)
2078*4882a593Smuzhiyun printf(" %X..%X ccc %d\n", first, last, value);
2079*4882a593Smuzhiyun if (!utf32valid(first) || !utf32valid(last))
2080*4882a593Smuzhiyun line_fail(ccc_name, line);
2081*4882a593Smuzhiyun continue;
2082*4882a593Smuzhiyun }
2083*4882a593Smuzhiyun ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084*4882a593Smuzhiyun if (ret == 2) {
2085*4882a593Smuzhiyun unicode_data[unichar].ccc = value;
2086*4882a593Smuzhiyun count++;
2087*4882a593Smuzhiyun if (verbose > 1)
2088*4882a593Smuzhiyun printf(" %X ccc %d\n", unichar, value);
2089*4882a593Smuzhiyun if (!utf32valid(unichar))
2090*4882a593Smuzhiyun line_fail(ccc_name, line);
2091*4882a593Smuzhiyun continue;
2092*4882a593Smuzhiyun }
2093*4882a593Smuzhiyun }
2094*4882a593Smuzhiyun fclose(file);
2095*4882a593Smuzhiyun
2096*4882a593Smuzhiyun if (verbose > 0)
2097*4882a593Smuzhiyun printf("Found %d entries\n", count);
2098*4882a593Smuzhiyun if (count == 0)
2099*4882a593Smuzhiyun file_fail(ccc_name);
2100*4882a593Smuzhiyun }
2101*4882a593Smuzhiyun
ignore_compatibility_form(char * type)2102*4882a593Smuzhiyun static int ignore_compatibility_form(char *type)
2103*4882a593Smuzhiyun {
2104*4882a593Smuzhiyun int i;
2105*4882a593Smuzhiyun char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2106*4882a593Smuzhiyun "final", "isolated", "circle", "super",
2107*4882a593Smuzhiyun "sub", "vertical", "wide", "narrow",
2108*4882a593Smuzhiyun "small", "square", "fraction", "compat"};
2109*4882a593Smuzhiyun
2110*4882a593Smuzhiyun for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2111*4882a593Smuzhiyun if (strcmp(type, ignored_types[i]) == 0)
2112*4882a593Smuzhiyun return 1;
2113*4882a593Smuzhiyun return 0;
2114*4882a593Smuzhiyun }
2115*4882a593Smuzhiyun
nfdi_init(void)2116*4882a593Smuzhiyun static void nfdi_init(void)
2117*4882a593Smuzhiyun {
2118*4882a593Smuzhiyun FILE *file;
2119*4882a593Smuzhiyun unsigned int unichar;
2120*4882a593Smuzhiyun unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2121*4882a593Smuzhiyun char *s;
2122*4882a593Smuzhiyun char *type;
2123*4882a593Smuzhiyun unsigned int *um;
2124*4882a593Smuzhiyun int count;
2125*4882a593Smuzhiyun int i;
2126*4882a593Smuzhiyun int ret;
2127*4882a593Smuzhiyun
2128*4882a593Smuzhiyun if (verbose > 0)
2129*4882a593Smuzhiyun printf("Parsing %s\n", data_name);
2130*4882a593Smuzhiyun file = fopen(data_name, "r");
2131*4882a593Smuzhiyun if (!file)
2132*4882a593Smuzhiyun open_fail(data_name, errno);
2133*4882a593Smuzhiyun
2134*4882a593Smuzhiyun count = 0;
2135*4882a593Smuzhiyun while (fgets(line, LINESIZE, file)) {
2136*4882a593Smuzhiyun ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137*4882a593Smuzhiyun &unichar, buf0);
2138*4882a593Smuzhiyun if (ret != 2)
2139*4882a593Smuzhiyun continue;
2140*4882a593Smuzhiyun if (!utf32valid(unichar))
2141*4882a593Smuzhiyun line_fail(data_name, line);
2142*4882a593Smuzhiyun
2143*4882a593Smuzhiyun s = buf0;
2144*4882a593Smuzhiyun /* skip over <tag> */
2145*4882a593Smuzhiyun if (*s == '<') {
2146*4882a593Smuzhiyun type = ++s;
2147*4882a593Smuzhiyun while (*++s != '>');
2148*4882a593Smuzhiyun *s++ = '\0';
2149*4882a593Smuzhiyun if(ignore_compatibility_form(type))
2150*4882a593Smuzhiyun continue;
2151*4882a593Smuzhiyun }
2152*4882a593Smuzhiyun /* decode the decomposition into UTF-32 */
2153*4882a593Smuzhiyun i = 0;
2154*4882a593Smuzhiyun while (*s) {
2155*4882a593Smuzhiyun mapping[i] = strtoul(s, &s, 16);
2156*4882a593Smuzhiyun if (!utf32valid(mapping[i]))
2157*4882a593Smuzhiyun line_fail(data_name, line);
2158*4882a593Smuzhiyun i++;
2159*4882a593Smuzhiyun }
2160*4882a593Smuzhiyun mapping[i++] = 0;
2161*4882a593Smuzhiyun
2162*4882a593Smuzhiyun um = malloc(i * sizeof(unsigned int));
2163*4882a593Smuzhiyun memcpy(um, mapping, i * sizeof(unsigned int));
2164*4882a593Smuzhiyun unicode_data[unichar].utf32nfdi = um;
2165*4882a593Smuzhiyun
2166*4882a593Smuzhiyun if (verbose > 1)
2167*4882a593Smuzhiyun print_utf32nfdi(unichar);
2168*4882a593Smuzhiyun count++;
2169*4882a593Smuzhiyun }
2170*4882a593Smuzhiyun fclose(file);
2171*4882a593Smuzhiyun if (verbose > 0)
2172*4882a593Smuzhiyun printf("Found %d entries\n", count);
2173*4882a593Smuzhiyun if (count == 0)
2174*4882a593Smuzhiyun file_fail(data_name);
2175*4882a593Smuzhiyun }
2176*4882a593Smuzhiyun
nfdicf_init(void)2177*4882a593Smuzhiyun static void nfdicf_init(void)
2178*4882a593Smuzhiyun {
2179*4882a593Smuzhiyun FILE *file;
2180*4882a593Smuzhiyun unsigned int unichar;
2181*4882a593Smuzhiyun unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2182*4882a593Smuzhiyun char status;
2183*4882a593Smuzhiyun char *s;
2184*4882a593Smuzhiyun unsigned int *um;
2185*4882a593Smuzhiyun int i;
2186*4882a593Smuzhiyun int count;
2187*4882a593Smuzhiyun int ret;
2188*4882a593Smuzhiyun
2189*4882a593Smuzhiyun if (verbose > 0)
2190*4882a593Smuzhiyun printf("Parsing %s\n", fold_name);
2191*4882a593Smuzhiyun file = fopen(fold_name, "r");
2192*4882a593Smuzhiyun if (!file)
2193*4882a593Smuzhiyun open_fail(fold_name, errno);
2194*4882a593Smuzhiyun
2195*4882a593Smuzhiyun count = 0;
2196*4882a593Smuzhiyun while (fgets(line, LINESIZE, file)) {
2197*4882a593Smuzhiyun ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198*4882a593Smuzhiyun if (ret != 3)
2199*4882a593Smuzhiyun continue;
2200*4882a593Smuzhiyun if (!utf32valid(unichar))
2201*4882a593Smuzhiyun line_fail(fold_name, line);
2202*4882a593Smuzhiyun /* Use the C+F casefold. */
2203*4882a593Smuzhiyun if (status != 'C' && status != 'F')
2204*4882a593Smuzhiyun continue;
2205*4882a593Smuzhiyun s = buf0;
2206*4882a593Smuzhiyun if (*s == '<')
2207*4882a593Smuzhiyun while (*s++ != ' ')
2208*4882a593Smuzhiyun ;
2209*4882a593Smuzhiyun i = 0;
2210*4882a593Smuzhiyun while (*s) {
2211*4882a593Smuzhiyun mapping[i] = strtoul(s, &s, 16);
2212*4882a593Smuzhiyun if (!utf32valid(mapping[i]))
2213*4882a593Smuzhiyun line_fail(fold_name, line);
2214*4882a593Smuzhiyun i++;
2215*4882a593Smuzhiyun }
2216*4882a593Smuzhiyun mapping[i++] = 0;
2217*4882a593Smuzhiyun
2218*4882a593Smuzhiyun um = malloc(i * sizeof(unsigned int));
2219*4882a593Smuzhiyun memcpy(um, mapping, i * sizeof(unsigned int));
2220*4882a593Smuzhiyun unicode_data[unichar].utf32nfdicf = um;
2221*4882a593Smuzhiyun
2222*4882a593Smuzhiyun if (verbose > 1)
2223*4882a593Smuzhiyun print_utf32nfdicf(unichar);
2224*4882a593Smuzhiyun count++;
2225*4882a593Smuzhiyun }
2226*4882a593Smuzhiyun fclose(file);
2227*4882a593Smuzhiyun if (verbose > 0)
2228*4882a593Smuzhiyun printf("Found %d entries\n", count);
2229*4882a593Smuzhiyun if (count == 0)
2230*4882a593Smuzhiyun file_fail(fold_name);
2231*4882a593Smuzhiyun }
2232*4882a593Smuzhiyun
ignore_init(void)2233*4882a593Smuzhiyun static void ignore_init(void)
2234*4882a593Smuzhiyun {
2235*4882a593Smuzhiyun FILE *file;
2236*4882a593Smuzhiyun unsigned int unichar;
2237*4882a593Smuzhiyun unsigned int first;
2238*4882a593Smuzhiyun unsigned int last;
2239*4882a593Smuzhiyun unsigned int *um;
2240*4882a593Smuzhiyun int count;
2241*4882a593Smuzhiyun int ret;
2242*4882a593Smuzhiyun
2243*4882a593Smuzhiyun if (verbose > 0)
2244*4882a593Smuzhiyun printf("Parsing %s\n", prop_name);
2245*4882a593Smuzhiyun file = fopen(prop_name, "r");
2246*4882a593Smuzhiyun if (!file)
2247*4882a593Smuzhiyun open_fail(prop_name, errno);
2248*4882a593Smuzhiyun assert(file);
2249*4882a593Smuzhiyun count = 0;
2250*4882a593Smuzhiyun while (fgets(line, LINESIZE, file)) {
2251*4882a593Smuzhiyun ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252*4882a593Smuzhiyun if (ret == 3) {
2253*4882a593Smuzhiyun if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254*4882a593Smuzhiyun continue;
2255*4882a593Smuzhiyun if (!utf32valid(first) || !utf32valid(last))
2256*4882a593Smuzhiyun line_fail(prop_name, line);
2257*4882a593Smuzhiyun for (unichar = first; unichar <= last; unichar++) {
2258*4882a593Smuzhiyun free(unicode_data[unichar].utf32nfdi);
2259*4882a593Smuzhiyun um = malloc(sizeof(unsigned int));
2260*4882a593Smuzhiyun *um = 0;
2261*4882a593Smuzhiyun unicode_data[unichar].utf32nfdi = um;
2262*4882a593Smuzhiyun free(unicode_data[unichar].utf32nfdicf);
2263*4882a593Smuzhiyun um = malloc(sizeof(unsigned int));
2264*4882a593Smuzhiyun *um = 0;
2265*4882a593Smuzhiyun unicode_data[unichar].utf32nfdicf = um;
2266*4882a593Smuzhiyun count++;
2267*4882a593Smuzhiyun }
2268*4882a593Smuzhiyun if (verbose > 1)
2269*4882a593Smuzhiyun printf(" %X..%X Default_Ignorable_Code_Point\n",
2270*4882a593Smuzhiyun first, last);
2271*4882a593Smuzhiyun continue;
2272*4882a593Smuzhiyun }
2273*4882a593Smuzhiyun ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274*4882a593Smuzhiyun if (ret == 2) {
2275*4882a593Smuzhiyun if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276*4882a593Smuzhiyun continue;
2277*4882a593Smuzhiyun if (!utf32valid(unichar))
2278*4882a593Smuzhiyun line_fail(prop_name, line);
2279*4882a593Smuzhiyun free(unicode_data[unichar].utf32nfdi);
2280*4882a593Smuzhiyun um = malloc(sizeof(unsigned int));
2281*4882a593Smuzhiyun *um = 0;
2282*4882a593Smuzhiyun unicode_data[unichar].utf32nfdi = um;
2283*4882a593Smuzhiyun free(unicode_data[unichar].utf32nfdicf);
2284*4882a593Smuzhiyun um = malloc(sizeof(unsigned int));
2285*4882a593Smuzhiyun *um = 0;
2286*4882a593Smuzhiyun unicode_data[unichar].utf32nfdicf = um;
2287*4882a593Smuzhiyun if (verbose > 1)
2288*4882a593Smuzhiyun printf(" %X Default_Ignorable_Code_Point\n",
2289*4882a593Smuzhiyun unichar);
2290*4882a593Smuzhiyun count++;
2291*4882a593Smuzhiyun continue;
2292*4882a593Smuzhiyun }
2293*4882a593Smuzhiyun }
2294*4882a593Smuzhiyun fclose(file);
2295*4882a593Smuzhiyun
2296*4882a593Smuzhiyun if (verbose > 0)
2297*4882a593Smuzhiyun printf("Found %d entries\n", count);
2298*4882a593Smuzhiyun if (count == 0)
2299*4882a593Smuzhiyun file_fail(prop_name);
2300*4882a593Smuzhiyun }
2301*4882a593Smuzhiyun
corrections_init(void)2302*4882a593Smuzhiyun static void corrections_init(void)
2303*4882a593Smuzhiyun {
2304*4882a593Smuzhiyun FILE *file;
2305*4882a593Smuzhiyun unsigned int unichar;
2306*4882a593Smuzhiyun unsigned int major;
2307*4882a593Smuzhiyun unsigned int minor;
2308*4882a593Smuzhiyun unsigned int revision;
2309*4882a593Smuzhiyun unsigned int age;
2310*4882a593Smuzhiyun unsigned int *um;
2311*4882a593Smuzhiyun unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2312*4882a593Smuzhiyun char *s;
2313*4882a593Smuzhiyun int i;
2314*4882a593Smuzhiyun int count;
2315*4882a593Smuzhiyun int ret;
2316*4882a593Smuzhiyun
2317*4882a593Smuzhiyun if (verbose > 0)
2318*4882a593Smuzhiyun printf("Parsing %s\n", norm_name);
2319*4882a593Smuzhiyun file = fopen(norm_name, "r");
2320*4882a593Smuzhiyun if (!file)
2321*4882a593Smuzhiyun open_fail(norm_name, errno);
2322*4882a593Smuzhiyun
2323*4882a593Smuzhiyun count = 0;
2324*4882a593Smuzhiyun while (fgets(line, LINESIZE, file)) {
2325*4882a593Smuzhiyun ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326*4882a593Smuzhiyun &unichar, buf0, buf1,
2327*4882a593Smuzhiyun &major, &minor, &revision);
2328*4882a593Smuzhiyun if (ret != 6)
2329*4882a593Smuzhiyun continue;
2330*4882a593Smuzhiyun if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2331*4882a593Smuzhiyun line_fail(norm_name, line);
2332*4882a593Smuzhiyun count++;
2333*4882a593Smuzhiyun }
2334*4882a593Smuzhiyun corrections = calloc(count, sizeof(struct unicode_data));
2335*4882a593Smuzhiyun corrections_count = count;
2336*4882a593Smuzhiyun rewind(file);
2337*4882a593Smuzhiyun
2338*4882a593Smuzhiyun count = 0;
2339*4882a593Smuzhiyun while (fgets(line, LINESIZE, file)) {
2340*4882a593Smuzhiyun ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341*4882a593Smuzhiyun &unichar, buf0, buf1,
2342*4882a593Smuzhiyun &major, &minor, &revision);
2343*4882a593Smuzhiyun if (ret != 6)
2344*4882a593Smuzhiyun continue;
2345*4882a593Smuzhiyun if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2346*4882a593Smuzhiyun line_fail(norm_name, line);
2347*4882a593Smuzhiyun corrections[count] = unicode_data[unichar];
2348*4882a593Smuzhiyun assert(corrections[count].code == unichar);
2349*4882a593Smuzhiyun age = UNICODE_AGE(major, minor, revision);
2350*4882a593Smuzhiyun corrections[count].correction = age;
2351*4882a593Smuzhiyun
2352*4882a593Smuzhiyun i = 0;
2353*4882a593Smuzhiyun s = buf0;
2354*4882a593Smuzhiyun while (*s) {
2355*4882a593Smuzhiyun mapping[i] = strtoul(s, &s, 16);
2356*4882a593Smuzhiyun if (!utf32valid(mapping[i]))
2357*4882a593Smuzhiyun line_fail(norm_name, line);
2358*4882a593Smuzhiyun i++;
2359*4882a593Smuzhiyun }
2360*4882a593Smuzhiyun mapping[i++] = 0;
2361*4882a593Smuzhiyun
2362*4882a593Smuzhiyun um = malloc(i * sizeof(unsigned int));
2363*4882a593Smuzhiyun memcpy(um, mapping, i * sizeof(unsigned int));
2364*4882a593Smuzhiyun corrections[count].utf32nfdi = um;
2365*4882a593Smuzhiyun
2366*4882a593Smuzhiyun if (verbose > 1)
2367*4882a593Smuzhiyun printf(" %X -> %s -> %s V%d_%d_%d\n",
2368*4882a593Smuzhiyun unichar, buf0, buf1, major, minor, revision);
2369*4882a593Smuzhiyun count++;
2370*4882a593Smuzhiyun }
2371*4882a593Smuzhiyun fclose(file);
2372*4882a593Smuzhiyun
2373*4882a593Smuzhiyun if (verbose > 0)
2374*4882a593Smuzhiyun printf("Found %d entries\n", count);
2375*4882a593Smuzhiyun if (count == 0)
2376*4882a593Smuzhiyun file_fail(norm_name);
2377*4882a593Smuzhiyun }
2378*4882a593Smuzhiyun
2379*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
2380*4882a593Smuzhiyun
2381*4882a593Smuzhiyun /*
2382*4882a593Smuzhiyun * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2383*4882a593Smuzhiyun *
2384*4882a593Smuzhiyun * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385*4882a593Smuzhiyun * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2386*4882a593Smuzhiyun *
2387*4882a593Smuzhiyun * SBase = 0xAC00
2388*4882a593Smuzhiyun * LBase = 0x1100
2389*4882a593Smuzhiyun * VBase = 0x1161
2390*4882a593Smuzhiyun * TBase = 0x11A7
2391*4882a593Smuzhiyun * LCount = 19
2392*4882a593Smuzhiyun * VCount = 21
2393*4882a593Smuzhiyun * TCount = 28
2394*4882a593Smuzhiyun * NCount = 588 (VCount * TCount)
2395*4882a593Smuzhiyun * SCount = 11172 (LCount * NCount)
2396*4882a593Smuzhiyun *
2397*4882a593Smuzhiyun * Decomposition:
2398*4882a593Smuzhiyun * SIndex = s - SBase
2399*4882a593Smuzhiyun *
2400*4882a593Smuzhiyun * LV (Canonical/Full)
2401*4882a593Smuzhiyun * LIndex = SIndex / NCount
2402*4882a593Smuzhiyun * VIndex = (Sindex % NCount) / TCount
2403*4882a593Smuzhiyun * LPart = LBase + LIndex
2404*4882a593Smuzhiyun * VPart = VBase + VIndex
2405*4882a593Smuzhiyun *
2406*4882a593Smuzhiyun * LVT (Canonical)
2407*4882a593Smuzhiyun * LVIndex = (SIndex / TCount) * TCount
2408*4882a593Smuzhiyun * TIndex = (Sindex % TCount)
2409*4882a593Smuzhiyun * LVPart = SBase + LVIndex
2410*4882a593Smuzhiyun * TPart = TBase + TIndex
2411*4882a593Smuzhiyun *
2412*4882a593Smuzhiyun * LVT (Full)
2413*4882a593Smuzhiyun * LIndex = SIndex / NCount
2414*4882a593Smuzhiyun * VIndex = (Sindex % NCount) / TCount
2415*4882a593Smuzhiyun * TIndex = (Sindex % TCount)
2416*4882a593Smuzhiyun * LPart = LBase + LIndex
2417*4882a593Smuzhiyun * VPart = VBase + VIndex
2418*4882a593Smuzhiyun * if (TIndex == 0) {
2419*4882a593Smuzhiyun * d = <LPart, VPart>
2420*4882a593Smuzhiyun * } else {
2421*4882a593Smuzhiyun * TPart = TBase + TIndex
2422*4882a593Smuzhiyun * d = <LPart, VPart, TPart>
2423*4882a593Smuzhiyun * }
2424*4882a593Smuzhiyun *
2425*4882a593Smuzhiyun */
2426*4882a593Smuzhiyun
hangul_decompose(void)2427*4882a593Smuzhiyun static void hangul_decompose(void)
2428*4882a593Smuzhiyun {
2429*4882a593Smuzhiyun unsigned int sb = 0xAC00;
2430*4882a593Smuzhiyun unsigned int lb = 0x1100;
2431*4882a593Smuzhiyun unsigned int vb = 0x1161;
2432*4882a593Smuzhiyun unsigned int tb = 0x11a7;
2433*4882a593Smuzhiyun /* unsigned int lc = 19; */
2434*4882a593Smuzhiyun unsigned int vc = 21;
2435*4882a593Smuzhiyun unsigned int tc = 28;
2436*4882a593Smuzhiyun unsigned int nc = (vc * tc);
2437*4882a593Smuzhiyun /* unsigned int sc = (lc * nc); */
2438*4882a593Smuzhiyun unsigned int unichar;
2439*4882a593Smuzhiyun unsigned int mapping[4];
2440*4882a593Smuzhiyun unsigned int *um;
2441*4882a593Smuzhiyun int count;
2442*4882a593Smuzhiyun int i;
2443*4882a593Smuzhiyun
2444*4882a593Smuzhiyun if (verbose > 0)
2445*4882a593Smuzhiyun printf("Decomposing hangul\n");
2446*4882a593Smuzhiyun /* Hangul */
2447*4882a593Smuzhiyun count = 0;
2448*4882a593Smuzhiyun for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2449*4882a593Smuzhiyun unsigned int si = unichar - sb;
2450*4882a593Smuzhiyun unsigned int li = si / nc;
2451*4882a593Smuzhiyun unsigned int vi = (si % nc) / tc;
2452*4882a593Smuzhiyun unsigned int ti = si % tc;
2453*4882a593Smuzhiyun
2454*4882a593Smuzhiyun i = 0;
2455*4882a593Smuzhiyun mapping[i++] = lb + li;
2456*4882a593Smuzhiyun mapping[i++] = vb + vi;
2457*4882a593Smuzhiyun if (ti)
2458*4882a593Smuzhiyun mapping[i++] = tb + ti;
2459*4882a593Smuzhiyun mapping[i++] = 0;
2460*4882a593Smuzhiyun
2461*4882a593Smuzhiyun assert(!unicode_data[unichar].utf32nfdi);
2462*4882a593Smuzhiyun um = malloc(i * sizeof(unsigned int));
2463*4882a593Smuzhiyun memcpy(um, mapping, i * sizeof(unsigned int));
2464*4882a593Smuzhiyun unicode_data[unichar].utf32nfdi = um;
2465*4882a593Smuzhiyun
2466*4882a593Smuzhiyun assert(!unicode_data[unichar].utf32nfdicf);
2467*4882a593Smuzhiyun um = malloc(i * sizeof(unsigned int));
2468*4882a593Smuzhiyun memcpy(um, mapping, i * sizeof(unsigned int));
2469*4882a593Smuzhiyun unicode_data[unichar].utf32nfdicf = um;
2470*4882a593Smuzhiyun
2471*4882a593Smuzhiyun /*
2472*4882a593Smuzhiyun * Add a cookie as a reminder that the hangul syllable
2473*4882a593Smuzhiyun * decompositions must not be stored in the generated
2474*4882a593Smuzhiyun * trie.
2475*4882a593Smuzhiyun */
2476*4882a593Smuzhiyun unicode_data[unichar].utf8nfdi = malloc(2);
2477*4882a593Smuzhiyun unicode_data[unichar].utf8nfdi[0] = HANGUL;
2478*4882a593Smuzhiyun unicode_data[unichar].utf8nfdi[1] = '\0';
2479*4882a593Smuzhiyun
2480*4882a593Smuzhiyun if (verbose > 1)
2481*4882a593Smuzhiyun print_utf32nfdi(unichar);
2482*4882a593Smuzhiyun
2483*4882a593Smuzhiyun count++;
2484*4882a593Smuzhiyun }
2485*4882a593Smuzhiyun if (verbose > 0)
2486*4882a593Smuzhiyun printf("Created %d entries\n", count);
2487*4882a593Smuzhiyun }
2488*4882a593Smuzhiyun
nfdi_decompose(void)2489*4882a593Smuzhiyun static void nfdi_decompose(void)
2490*4882a593Smuzhiyun {
2491*4882a593Smuzhiyun unsigned int unichar;
2492*4882a593Smuzhiyun unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2493*4882a593Smuzhiyun unsigned int *um;
2494*4882a593Smuzhiyun unsigned int *dc;
2495*4882a593Smuzhiyun int count;
2496*4882a593Smuzhiyun int i;
2497*4882a593Smuzhiyun int j;
2498*4882a593Smuzhiyun int ret;
2499*4882a593Smuzhiyun
2500*4882a593Smuzhiyun if (verbose > 0)
2501*4882a593Smuzhiyun printf("Decomposing nfdi\n");
2502*4882a593Smuzhiyun
2503*4882a593Smuzhiyun count = 0;
2504*4882a593Smuzhiyun for (unichar = 0; unichar != 0x110000; unichar++) {
2505*4882a593Smuzhiyun if (!unicode_data[unichar].utf32nfdi)
2506*4882a593Smuzhiyun continue;
2507*4882a593Smuzhiyun for (;;) {
2508*4882a593Smuzhiyun ret = 1;
2509*4882a593Smuzhiyun i = 0;
2510*4882a593Smuzhiyun um = unicode_data[unichar].utf32nfdi;
2511*4882a593Smuzhiyun while (*um) {
2512*4882a593Smuzhiyun dc = unicode_data[*um].utf32nfdi;
2513*4882a593Smuzhiyun if (dc) {
2514*4882a593Smuzhiyun for (j = 0; dc[j]; j++)
2515*4882a593Smuzhiyun mapping[i++] = dc[j];
2516*4882a593Smuzhiyun ret = 0;
2517*4882a593Smuzhiyun } else {
2518*4882a593Smuzhiyun mapping[i++] = *um;
2519*4882a593Smuzhiyun }
2520*4882a593Smuzhiyun um++;
2521*4882a593Smuzhiyun }
2522*4882a593Smuzhiyun mapping[i++] = 0;
2523*4882a593Smuzhiyun if (ret)
2524*4882a593Smuzhiyun break;
2525*4882a593Smuzhiyun free(unicode_data[unichar].utf32nfdi);
2526*4882a593Smuzhiyun um = malloc(i * sizeof(unsigned int));
2527*4882a593Smuzhiyun memcpy(um, mapping, i * sizeof(unsigned int));
2528*4882a593Smuzhiyun unicode_data[unichar].utf32nfdi = um;
2529*4882a593Smuzhiyun }
2530*4882a593Smuzhiyun /* Add this decomposition to nfdicf if there is no entry. */
2531*4882a593Smuzhiyun if (!unicode_data[unichar].utf32nfdicf) {
2532*4882a593Smuzhiyun um = malloc(i * sizeof(unsigned int));
2533*4882a593Smuzhiyun memcpy(um, mapping, i * sizeof(unsigned int));
2534*4882a593Smuzhiyun unicode_data[unichar].utf32nfdicf = um;
2535*4882a593Smuzhiyun }
2536*4882a593Smuzhiyun if (verbose > 1)
2537*4882a593Smuzhiyun print_utf32nfdi(unichar);
2538*4882a593Smuzhiyun count++;
2539*4882a593Smuzhiyun }
2540*4882a593Smuzhiyun if (verbose > 0)
2541*4882a593Smuzhiyun printf("Processed %d entries\n", count);
2542*4882a593Smuzhiyun }
2543*4882a593Smuzhiyun
nfdicf_decompose(void)2544*4882a593Smuzhiyun static void nfdicf_decompose(void)
2545*4882a593Smuzhiyun {
2546*4882a593Smuzhiyun unsigned int unichar;
2547*4882a593Smuzhiyun unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2548*4882a593Smuzhiyun unsigned int *um;
2549*4882a593Smuzhiyun unsigned int *dc;
2550*4882a593Smuzhiyun int count;
2551*4882a593Smuzhiyun int i;
2552*4882a593Smuzhiyun int j;
2553*4882a593Smuzhiyun int ret;
2554*4882a593Smuzhiyun
2555*4882a593Smuzhiyun if (verbose > 0)
2556*4882a593Smuzhiyun printf("Decomposing nfdicf\n");
2557*4882a593Smuzhiyun count = 0;
2558*4882a593Smuzhiyun for (unichar = 0; unichar != 0x110000; unichar++) {
2559*4882a593Smuzhiyun if (!unicode_data[unichar].utf32nfdicf)
2560*4882a593Smuzhiyun continue;
2561*4882a593Smuzhiyun for (;;) {
2562*4882a593Smuzhiyun ret = 1;
2563*4882a593Smuzhiyun i = 0;
2564*4882a593Smuzhiyun um = unicode_data[unichar].utf32nfdicf;
2565*4882a593Smuzhiyun while (*um) {
2566*4882a593Smuzhiyun dc = unicode_data[*um].utf32nfdicf;
2567*4882a593Smuzhiyun if (dc) {
2568*4882a593Smuzhiyun for (j = 0; dc[j]; j++)
2569*4882a593Smuzhiyun mapping[i++] = dc[j];
2570*4882a593Smuzhiyun ret = 0;
2571*4882a593Smuzhiyun } else {
2572*4882a593Smuzhiyun mapping[i++] = *um;
2573*4882a593Smuzhiyun }
2574*4882a593Smuzhiyun um++;
2575*4882a593Smuzhiyun }
2576*4882a593Smuzhiyun mapping[i++] = 0;
2577*4882a593Smuzhiyun if (ret)
2578*4882a593Smuzhiyun break;
2579*4882a593Smuzhiyun free(unicode_data[unichar].utf32nfdicf);
2580*4882a593Smuzhiyun um = malloc(i * sizeof(unsigned int));
2581*4882a593Smuzhiyun memcpy(um, mapping, i * sizeof(unsigned int));
2582*4882a593Smuzhiyun unicode_data[unichar].utf32nfdicf = um;
2583*4882a593Smuzhiyun }
2584*4882a593Smuzhiyun if (verbose > 1)
2585*4882a593Smuzhiyun print_utf32nfdicf(unichar);
2586*4882a593Smuzhiyun count++;
2587*4882a593Smuzhiyun }
2588*4882a593Smuzhiyun if (verbose > 0)
2589*4882a593Smuzhiyun printf("Processed %d entries\n", count);
2590*4882a593Smuzhiyun }
2591*4882a593Smuzhiyun
2592*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
2593*4882a593Smuzhiyun
2594*4882a593Smuzhiyun int utf8agemax(struct tree *, const char *);
2595*4882a593Smuzhiyun int utf8nagemax(struct tree *, const char *, size_t);
2596*4882a593Smuzhiyun int utf8agemin(struct tree *, const char *);
2597*4882a593Smuzhiyun int utf8nagemin(struct tree *, const char *, size_t);
2598*4882a593Smuzhiyun ssize_t utf8len(struct tree *, const char *);
2599*4882a593Smuzhiyun ssize_t utf8nlen(struct tree *, const char *, size_t);
2600*4882a593Smuzhiyun struct utf8cursor;
2601*4882a593Smuzhiyun int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2602*4882a593Smuzhiyun int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2603*4882a593Smuzhiyun int utf8byte(struct utf8cursor *);
2604*4882a593Smuzhiyun
2605*4882a593Smuzhiyun /*
2606*4882a593Smuzhiyun * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2607*4882a593Smuzhiyun *
2608*4882a593Smuzhiyun * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609*4882a593Smuzhiyun * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2610*4882a593Smuzhiyun *
2611*4882a593Smuzhiyun * SBase = 0xAC00
2612*4882a593Smuzhiyun * LBase = 0x1100
2613*4882a593Smuzhiyun * VBase = 0x1161
2614*4882a593Smuzhiyun * TBase = 0x11A7
2615*4882a593Smuzhiyun * LCount = 19
2616*4882a593Smuzhiyun * VCount = 21
2617*4882a593Smuzhiyun * TCount = 28
2618*4882a593Smuzhiyun * NCount = 588 (VCount * TCount)
2619*4882a593Smuzhiyun * SCount = 11172 (LCount * NCount)
2620*4882a593Smuzhiyun *
2621*4882a593Smuzhiyun * Decomposition:
2622*4882a593Smuzhiyun * SIndex = s - SBase
2623*4882a593Smuzhiyun *
2624*4882a593Smuzhiyun * LV (Canonical/Full)
2625*4882a593Smuzhiyun * LIndex = SIndex / NCount
2626*4882a593Smuzhiyun * VIndex = (Sindex % NCount) / TCount
2627*4882a593Smuzhiyun * LPart = LBase + LIndex
2628*4882a593Smuzhiyun * VPart = VBase + VIndex
2629*4882a593Smuzhiyun *
2630*4882a593Smuzhiyun * LVT (Canonical)
2631*4882a593Smuzhiyun * LVIndex = (SIndex / TCount) * TCount
2632*4882a593Smuzhiyun * TIndex = (Sindex % TCount)
2633*4882a593Smuzhiyun * LVPart = SBase + LVIndex
2634*4882a593Smuzhiyun * TPart = TBase + TIndex
2635*4882a593Smuzhiyun *
2636*4882a593Smuzhiyun * LVT (Full)
2637*4882a593Smuzhiyun * LIndex = SIndex / NCount
2638*4882a593Smuzhiyun * VIndex = (Sindex % NCount) / TCount
2639*4882a593Smuzhiyun * TIndex = (Sindex % TCount)
2640*4882a593Smuzhiyun * LPart = LBase + LIndex
2641*4882a593Smuzhiyun * VPart = VBase + VIndex
2642*4882a593Smuzhiyun * if (TIndex == 0) {
2643*4882a593Smuzhiyun * d = <LPart, VPart>
2644*4882a593Smuzhiyun * } else {
2645*4882a593Smuzhiyun * TPart = TBase + TIndex
2646*4882a593Smuzhiyun * d = <LPart, VPart, TPart>
2647*4882a593Smuzhiyun * }
2648*4882a593Smuzhiyun */
2649*4882a593Smuzhiyun
2650*4882a593Smuzhiyun /* Constants */
2651*4882a593Smuzhiyun #define SB (0xAC00)
2652*4882a593Smuzhiyun #define LB (0x1100)
2653*4882a593Smuzhiyun #define VB (0x1161)
2654*4882a593Smuzhiyun #define TB (0x11A7)
2655*4882a593Smuzhiyun #define LC (19)
2656*4882a593Smuzhiyun #define VC (21)
2657*4882a593Smuzhiyun #define TC (28)
2658*4882a593Smuzhiyun #define NC (VC * TC)
2659*4882a593Smuzhiyun #define SC (LC * NC)
2660*4882a593Smuzhiyun
2661*4882a593Smuzhiyun /* Algorithmic decomposition of hangul syllable. */
utf8hangul(const char * str,unsigned char * hangul)2662*4882a593Smuzhiyun static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2663*4882a593Smuzhiyun {
2664*4882a593Smuzhiyun unsigned int si;
2665*4882a593Smuzhiyun unsigned int li;
2666*4882a593Smuzhiyun unsigned int vi;
2667*4882a593Smuzhiyun unsigned int ti;
2668*4882a593Smuzhiyun unsigned char *h;
2669*4882a593Smuzhiyun
2670*4882a593Smuzhiyun /* Calculate the SI, LI, VI, and TI values. */
2671*4882a593Smuzhiyun si = utf8decode(str) - SB;
2672*4882a593Smuzhiyun li = si / NC;
2673*4882a593Smuzhiyun vi = (si % NC) / TC;
2674*4882a593Smuzhiyun ti = si % TC;
2675*4882a593Smuzhiyun
2676*4882a593Smuzhiyun /* Fill in base of leaf. */
2677*4882a593Smuzhiyun h = hangul;
2678*4882a593Smuzhiyun LEAF_GEN(h) = 2;
2679*4882a593Smuzhiyun LEAF_CCC(h) = DECOMPOSE;
2680*4882a593Smuzhiyun h += 2;
2681*4882a593Smuzhiyun
2682*4882a593Smuzhiyun /* Add LPart, a 3-byte UTF-8 sequence. */
2683*4882a593Smuzhiyun h += utf8encode((char *)h, li + LB);
2684*4882a593Smuzhiyun
2685*4882a593Smuzhiyun /* Add VPart, a 3-byte UTF-8 sequence. */
2686*4882a593Smuzhiyun h += utf8encode((char *)h, vi + VB);
2687*4882a593Smuzhiyun
2688*4882a593Smuzhiyun /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2689*4882a593Smuzhiyun if (ti)
2690*4882a593Smuzhiyun h += utf8encode((char *)h, ti + TB);
2691*4882a593Smuzhiyun
2692*4882a593Smuzhiyun /* Terminate string. */
2693*4882a593Smuzhiyun h[0] = '\0';
2694*4882a593Smuzhiyun
2695*4882a593Smuzhiyun return hangul;
2696*4882a593Smuzhiyun }
2697*4882a593Smuzhiyun
2698*4882a593Smuzhiyun /*
2699*4882a593Smuzhiyun * Use trie to scan s, touching at most len bytes.
2700*4882a593Smuzhiyun * Returns the leaf if one exists, NULL otherwise.
2701*4882a593Smuzhiyun *
2702*4882a593Smuzhiyun * A non-NULL return guarantees that the UTF-8 sequence starting at s
2703*4882a593Smuzhiyun * is well-formed and corresponds to a known unicode code point. The
2704*4882a593Smuzhiyun * shorthand for this will be "is valid UTF-8 unicode".
2705*4882a593Smuzhiyun */
utf8nlookup(struct tree * tree,unsigned char * hangul,const char * s,size_t len)2706*4882a593Smuzhiyun static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2707*4882a593Smuzhiyun const char *s, size_t len)
2708*4882a593Smuzhiyun {
2709*4882a593Smuzhiyun utf8trie_t *trie;
2710*4882a593Smuzhiyun int offlen;
2711*4882a593Smuzhiyun int offset;
2712*4882a593Smuzhiyun int mask;
2713*4882a593Smuzhiyun int node;
2714*4882a593Smuzhiyun
2715*4882a593Smuzhiyun if (!tree)
2716*4882a593Smuzhiyun return NULL;
2717*4882a593Smuzhiyun if (len == 0)
2718*4882a593Smuzhiyun return NULL;
2719*4882a593Smuzhiyun node = 1;
2720*4882a593Smuzhiyun trie = utf8data + tree->index;
2721*4882a593Smuzhiyun while (node) {
2722*4882a593Smuzhiyun offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2723*4882a593Smuzhiyun if (*trie & NEXTBYTE) {
2724*4882a593Smuzhiyun if (--len == 0)
2725*4882a593Smuzhiyun return NULL;
2726*4882a593Smuzhiyun s++;
2727*4882a593Smuzhiyun }
2728*4882a593Smuzhiyun mask = 1 << (*trie & BITNUM);
2729*4882a593Smuzhiyun if (*s & mask) {
2730*4882a593Smuzhiyun /* Right leg */
2731*4882a593Smuzhiyun if (offlen) {
2732*4882a593Smuzhiyun /* Right node at offset of trie */
2733*4882a593Smuzhiyun node = (*trie & RIGHTNODE);
2734*4882a593Smuzhiyun offset = trie[offlen];
2735*4882a593Smuzhiyun while (--offlen) {
2736*4882a593Smuzhiyun offset <<= 8;
2737*4882a593Smuzhiyun offset |= trie[offlen];
2738*4882a593Smuzhiyun }
2739*4882a593Smuzhiyun trie += offset;
2740*4882a593Smuzhiyun } else if (*trie & RIGHTPATH) {
2741*4882a593Smuzhiyun /* Right node after this node */
2742*4882a593Smuzhiyun node = (*trie & TRIENODE);
2743*4882a593Smuzhiyun trie++;
2744*4882a593Smuzhiyun } else {
2745*4882a593Smuzhiyun /* No right node. */
2746*4882a593Smuzhiyun return NULL;
2747*4882a593Smuzhiyun }
2748*4882a593Smuzhiyun } else {
2749*4882a593Smuzhiyun /* Left leg */
2750*4882a593Smuzhiyun if (offlen) {
2751*4882a593Smuzhiyun /* Left node after this node. */
2752*4882a593Smuzhiyun node = (*trie & LEFTNODE);
2753*4882a593Smuzhiyun trie += offlen + 1;
2754*4882a593Smuzhiyun } else if (*trie & RIGHTPATH) {
2755*4882a593Smuzhiyun /* No left node. */
2756*4882a593Smuzhiyun return NULL;
2757*4882a593Smuzhiyun } else {
2758*4882a593Smuzhiyun /* Left node after this node */
2759*4882a593Smuzhiyun node = (*trie & TRIENODE);
2760*4882a593Smuzhiyun trie++;
2761*4882a593Smuzhiyun }
2762*4882a593Smuzhiyun }
2763*4882a593Smuzhiyun }
2764*4882a593Smuzhiyun /*
2765*4882a593Smuzhiyun * Hangul decomposition is done algorithmically. These are the
2766*4882a593Smuzhiyun * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2767*4882a593Smuzhiyun * always 3 bytes long, so s has been advanced twice, and the
2768*4882a593Smuzhiyun * start of the sequence is at s-2.
2769*4882a593Smuzhiyun */
2770*4882a593Smuzhiyun if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2771*4882a593Smuzhiyun trie = utf8hangul(s - 2, hangul);
2772*4882a593Smuzhiyun return trie;
2773*4882a593Smuzhiyun }
2774*4882a593Smuzhiyun
2775*4882a593Smuzhiyun /*
2776*4882a593Smuzhiyun * Use trie to scan s.
2777*4882a593Smuzhiyun * Returns the leaf if one exists, NULL otherwise.
2778*4882a593Smuzhiyun *
2779*4882a593Smuzhiyun * Forwards to trie_nlookup().
2780*4882a593Smuzhiyun */
utf8lookup(struct tree * tree,unsigned char * hangul,const char * s)2781*4882a593Smuzhiyun static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2782*4882a593Smuzhiyun const char *s)
2783*4882a593Smuzhiyun {
2784*4882a593Smuzhiyun return utf8nlookup(tree, hangul, s, (size_t)-1);
2785*4882a593Smuzhiyun }
2786*4882a593Smuzhiyun
2787*4882a593Smuzhiyun /*
2788*4882a593Smuzhiyun * Return the number of bytes used by the current UTF-8 sequence.
2789*4882a593Smuzhiyun * Assumes the input points to the first byte of a valid UTF-8
2790*4882a593Smuzhiyun * sequence.
2791*4882a593Smuzhiyun */
utf8clen(const char * s)2792*4882a593Smuzhiyun static inline int utf8clen(const char *s)
2793*4882a593Smuzhiyun {
2794*4882a593Smuzhiyun unsigned char c = *s;
2795*4882a593Smuzhiyun return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2796*4882a593Smuzhiyun }
2797*4882a593Smuzhiyun
2798*4882a593Smuzhiyun /*
2799*4882a593Smuzhiyun * Maximum age of any character in s.
2800*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
2801*4882a593Smuzhiyun * Return 0 if only non-assigned code points are used.
2802*4882a593Smuzhiyun */
utf8agemax(struct tree * tree,const char * s)2803*4882a593Smuzhiyun int utf8agemax(struct tree *tree, const char *s)
2804*4882a593Smuzhiyun {
2805*4882a593Smuzhiyun utf8leaf_t *leaf;
2806*4882a593Smuzhiyun int age = 0;
2807*4882a593Smuzhiyun int leaf_age;
2808*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
2809*4882a593Smuzhiyun
2810*4882a593Smuzhiyun if (!tree)
2811*4882a593Smuzhiyun return -1;
2812*4882a593Smuzhiyun
2813*4882a593Smuzhiyun while (*s) {
2814*4882a593Smuzhiyun leaf = utf8lookup(tree, hangul, s);
2815*4882a593Smuzhiyun if (!leaf)
2816*4882a593Smuzhiyun return -1;
2817*4882a593Smuzhiyun leaf_age = ages[LEAF_GEN(leaf)];
2818*4882a593Smuzhiyun if (leaf_age <= tree->maxage && leaf_age > age)
2819*4882a593Smuzhiyun age = leaf_age;
2820*4882a593Smuzhiyun s += utf8clen(s);
2821*4882a593Smuzhiyun }
2822*4882a593Smuzhiyun return age;
2823*4882a593Smuzhiyun }
2824*4882a593Smuzhiyun
2825*4882a593Smuzhiyun /*
2826*4882a593Smuzhiyun * Minimum age of any character in s.
2827*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
2828*4882a593Smuzhiyun * Return 0 if non-assigned code points are used.
2829*4882a593Smuzhiyun */
utf8agemin(struct tree * tree,const char * s)2830*4882a593Smuzhiyun int utf8agemin(struct tree *tree, const char *s)
2831*4882a593Smuzhiyun {
2832*4882a593Smuzhiyun utf8leaf_t *leaf;
2833*4882a593Smuzhiyun int age;
2834*4882a593Smuzhiyun int leaf_age;
2835*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
2836*4882a593Smuzhiyun
2837*4882a593Smuzhiyun if (!tree)
2838*4882a593Smuzhiyun return -1;
2839*4882a593Smuzhiyun age = tree->maxage;
2840*4882a593Smuzhiyun while (*s) {
2841*4882a593Smuzhiyun leaf = utf8lookup(tree, hangul, s);
2842*4882a593Smuzhiyun if (!leaf)
2843*4882a593Smuzhiyun return -1;
2844*4882a593Smuzhiyun leaf_age = ages[LEAF_GEN(leaf)];
2845*4882a593Smuzhiyun if (leaf_age <= tree->maxage && leaf_age < age)
2846*4882a593Smuzhiyun age = leaf_age;
2847*4882a593Smuzhiyun s += utf8clen(s);
2848*4882a593Smuzhiyun }
2849*4882a593Smuzhiyun return age;
2850*4882a593Smuzhiyun }
2851*4882a593Smuzhiyun
2852*4882a593Smuzhiyun /*
2853*4882a593Smuzhiyun * Maximum age of any character in s, touch at most len bytes.
2854*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
2855*4882a593Smuzhiyun */
utf8nagemax(struct tree * tree,const char * s,size_t len)2856*4882a593Smuzhiyun int utf8nagemax(struct tree *tree, const char *s, size_t len)
2857*4882a593Smuzhiyun {
2858*4882a593Smuzhiyun utf8leaf_t *leaf;
2859*4882a593Smuzhiyun int age = 0;
2860*4882a593Smuzhiyun int leaf_age;
2861*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
2862*4882a593Smuzhiyun
2863*4882a593Smuzhiyun if (!tree)
2864*4882a593Smuzhiyun return -1;
2865*4882a593Smuzhiyun
2866*4882a593Smuzhiyun while (len && *s) {
2867*4882a593Smuzhiyun leaf = utf8nlookup(tree, hangul, s, len);
2868*4882a593Smuzhiyun if (!leaf)
2869*4882a593Smuzhiyun return -1;
2870*4882a593Smuzhiyun leaf_age = ages[LEAF_GEN(leaf)];
2871*4882a593Smuzhiyun if (leaf_age <= tree->maxage && leaf_age > age)
2872*4882a593Smuzhiyun age = leaf_age;
2873*4882a593Smuzhiyun len -= utf8clen(s);
2874*4882a593Smuzhiyun s += utf8clen(s);
2875*4882a593Smuzhiyun }
2876*4882a593Smuzhiyun return age;
2877*4882a593Smuzhiyun }
2878*4882a593Smuzhiyun
2879*4882a593Smuzhiyun /*
2880*4882a593Smuzhiyun * Maximum age of any character in s, touch at most len bytes.
2881*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
2882*4882a593Smuzhiyun */
utf8nagemin(struct tree * tree,const char * s,size_t len)2883*4882a593Smuzhiyun int utf8nagemin(struct tree *tree, const char *s, size_t len)
2884*4882a593Smuzhiyun {
2885*4882a593Smuzhiyun utf8leaf_t *leaf;
2886*4882a593Smuzhiyun int leaf_age;
2887*4882a593Smuzhiyun int age;
2888*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
2889*4882a593Smuzhiyun
2890*4882a593Smuzhiyun if (!tree)
2891*4882a593Smuzhiyun return -1;
2892*4882a593Smuzhiyun age = tree->maxage;
2893*4882a593Smuzhiyun while (len && *s) {
2894*4882a593Smuzhiyun leaf = utf8nlookup(tree, hangul, s, len);
2895*4882a593Smuzhiyun if (!leaf)
2896*4882a593Smuzhiyun return -1;
2897*4882a593Smuzhiyun leaf_age = ages[LEAF_GEN(leaf)];
2898*4882a593Smuzhiyun if (leaf_age <= tree->maxage && leaf_age < age)
2899*4882a593Smuzhiyun age = leaf_age;
2900*4882a593Smuzhiyun len -= utf8clen(s);
2901*4882a593Smuzhiyun s += utf8clen(s);
2902*4882a593Smuzhiyun }
2903*4882a593Smuzhiyun return age;
2904*4882a593Smuzhiyun }
2905*4882a593Smuzhiyun
2906*4882a593Smuzhiyun /*
2907*4882a593Smuzhiyun * Length of the normalization of s.
2908*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
2909*4882a593Smuzhiyun *
2910*4882a593Smuzhiyun * A string of Default_Ignorable_Code_Point has length 0.
2911*4882a593Smuzhiyun */
utf8len(struct tree * tree,const char * s)2912*4882a593Smuzhiyun ssize_t utf8len(struct tree *tree, const char *s)
2913*4882a593Smuzhiyun {
2914*4882a593Smuzhiyun utf8leaf_t *leaf;
2915*4882a593Smuzhiyun size_t ret = 0;
2916*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
2917*4882a593Smuzhiyun
2918*4882a593Smuzhiyun if (!tree)
2919*4882a593Smuzhiyun return -1;
2920*4882a593Smuzhiyun while (*s) {
2921*4882a593Smuzhiyun leaf = utf8lookup(tree, hangul, s);
2922*4882a593Smuzhiyun if (!leaf)
2923*4882a593Smuzhiyun return -1;
2924*4882a593Smuzhiyun if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925*4882a593Smuzhiyun ret += utf8clen(s);
2926*4882a593Smuzhiyun else if (LEAF_CCC(leaf) == DECOMPOSE)
2927*4882a593Smuzhiyun ret += strlen(LEAF_STR(leaf));
2928*4882a593Smuzhiyun else
2929*4882a593Smuzhiyun ret += utf8clen(s);
2930*4882a593Smuzhiyun s += utf8clen(s);
2931*4882a593Smuzhiyun }
2932*4882a593Smuzhiyun return ret;
2933*4882a593Smuzhiyun }
2934*4882a593Smuzhiyun
2935*4882a593Smuzhiyun /*
2936*4882a593Smuzhiyun * Length of the normalization of s, touch at most len bytes.
2937*4882a593Smuzhiyun * Return -1 if s is not valid UTF-8 unicode.
2938*4882a593Smuzhiyun */
utf8nlen(struct tree * tree,const char * s,size_t len)2939*4882a593Smuzhiyun ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2940*4882a593Smuzhiyun {
2941*4882a593Smuzhiyun utf8leaf_t *leaf;
2942*4882a593Smuzhiyun size_t ret = 0;
2943*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
2944*4882a593Smuzhiyun
2945*4882a593Smuzhiyun if (!tree)
2946*4882a593Smuzhiyun return -1;
2947*4882a593Smuzhiyun while (len && *s) {
2948*4882a593Smuzhiyun leaf = utf8nlookup(tree, hangul, s, len);
2949*4882a593Smuzhiyun if (!leaf)
2950*4882a593Smuzhiyun return -1;
2951*4882a593Smuzhiyun if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952*4882a593Smuzhiyun ret += utf8clen(s);
2953*4882a593Smuzhiyun else if (LEAF_CCC(leaf) == DECOMPOSE)
2954*4882a593Smuzhiyun ret += strlen(LEAF_STR(leaf));
2955*4882a593Smuzhiyun else
2956*4882a593Smuzhiyun ret += utf8clen(s);
2957*4882a593Smuzhiyun len -= utf8clen(s);
2958*4882a593Smuzhiyun s += utf8clen(s);
2959*4882a593Smuzhiyun }
2960*4882a593Smuzhiyun return ret;
2961*4882a593Smuzhiyun }
2962*4882a593Smuzhiyun
2963*4882a593Smuzhiyun /*
2964*4882a593Smuzhiyun * Cursor structure used by the normalizer.
2965*4882a593Smuzhiyun */
2966*4882a593Smuzhiyun struct utf8cursor {
2967*4882a593Smuzhiyun struct tree *tree;
2968*4882a593Smuzhiyun const char *s;
2969*4882a593Smuzhiyun const char *p;
2970*4882a593Smuzhiyun const char *ss;
2971*4882a593Smuzhiyun const char *sp;
2972*4882a593Smuzhiyun unsigned int len;
2973*4882a593Smuzhiyun unsigned int slen;
2974*4882a593Smuzhiyun short int ccc;
2975*4882a593Smuzhiyun short int nccc;
2976*4882a593Smuzhiyun unsigned int unichar;
2977*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF];
2978*4882a593Smuzhiyun };
2979*4882a593Smuzhiyun
2980*4882a593Smuzhiyun /*
2981*4882a593Smuzhiyun * Set up an utf8cursor for use by utf8byte().
2982*4882a593Smuzhiyun *
2983*4882a593Smuzhiyun * s : string.
2984*4882a593Smuzhiyun * len : length of s.
2985*4882a593Smuzhiyun * u8c : pointer to cursor.
2986*4882a593Smuzhiyun * trie : utf8trie_t to use for normalization.
2987*4882a593Smuzhiyun *
2988*4882a593Smuzhiyun * Returns -1 on error, 0 on success.
2989*4882a593Smuzhiyun */
utf8ncursor(struct utf8cursor * u8c,struct tree * tree,const char * s,size_t len)2990*4882a593Smuzhiyun int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2991*4882a593Smuzhiyun size_t len)
2992*4882a593Smuzhiyun {
2993*4882a593Smuzhiyun if (!tree)
2994*4882a593Smuzhiyun return -1;
2995*4882a593Smuzhiyun if (!s)
2996*4882a593Smuzhiyun return -1;
2997*4882a593Smuzhiyun u8c->tree = tree;
2998*4882a593Smuzhiyun u8c->s = s;
2999*4882a593Smuzhiyun u8c->p = NULL;
3000*4882a593Smuzhiyun u8c->ss = NULL;
3001*4882a593Smuzhiyun u8c->sp = NULL;
3002*4882a593Smuzhiyun u8c->len = len;
3003*4882a593Smuzhiyun u8c->slen = 0;
3004*4882a593Smuzhiyun u8c->ccc = STOPPER;
3005*4882a593Smuzhiyun u8c->nccc = STOPPER;
3006*4882a593Smuzhiyun u8c->unichar = 0;
3007*4882a593Smuzhiyun /* Check we didn't clobber the maximum length. */
3008*4882a593Smuzhiyun if (u8c->len != len)
3009*4882a593Smuzhiyun return -1;
3010*4882a593Smuzhiyun /* The first byte of s may not be an utf8 continuation. */
3011*4882a593Smuzhiyun if (len > 0 && (*s & 0xC0) == 0x80)
3012*4882a593Smuzhiyun return -1;
3013*4882a593Smuzhiyun return 0;
3014*4882a593Smuzhiyun }
3015*4882a593Smuzhiyun
3016*4882a593Smuzhiyun /*
3017*4882a593Smuzhiyun * Set up an utf8cursor for use by utf8byte().
3018*4882a593Smuzhiyun *
3019*4882a593Smuzhiyun * s : NUL-terminated string.
3020*4882a593Smuzhiyun * u8c : pointer to cursor.
3021*4882a593Smuzhiyun * trie : utf8trie_t to use for normalization.
3022*4882a593Smuzhiyun *
3023*4882a593Smuzhiyun * Returns -1 on error, 0 on success.
3024*4882a593Smuzhiyun */
utf8cursor(struct utf8cursor * u8c,struct tree * tree,const char * s)3025*4882a593Smuzhiyun int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
3026*4882a593Smuzhiyun {
3027*4882a593Smuzhiyun return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3028*4882a593Smuzhiyun }
3029*4882a593Smuzhiyun
3030*4882a593Smuzhiyun /*
3031*4882a593Smuzhiyun * Get one byte from the normalized form of the string described by u8c.
3032*4882a593Smuzhiyun *
3033*4882a593Smuzhiyun * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3034*4882a593Smuzhiyun *
3035*4882a593Smuzhiyun * The cursor keeps track of the location in the string in u8c->s.
3036*4882a593Smuzhiyun * When a character is decomposed, the current location is stored in
3037*4882a593Smuzhiyun * u8c->p, and u8c->s is set to the start of the decomposition. Note
3038*4882a593Smuzhiyun * that bytes from a decomposition do not count against u8c->len.
3039*4882a593Smuzhiyun *
3040*4882a593Smuzhiyun * Characters are emitted if they match the current CCC in u8c->ccc.
3041*4882a593Smuzhiyun * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3042*4882a593Smuzhiyun * and the function returns 0 in that case.
3043*4882a593Smuzhiyun *
3044*4882a593Smuzhiyun * Sorting by CCC is done by repeatedly scanning the string. The
3045*4882a593Smuzhiyun * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3046*4882a593Smuzhiyun * the start of the scan. The first pass finds the lowest CCC to be
3047*4882a593Smuzhiyun * emitted and stores it in u8c->nccc, the second pass emits the
3048*4882a593Smuzhiyun * characters with this CCC and finds the next lowest CCC. This limits
3049*4882a593Smuzhiyun * the number of passes to 1 + the number of different CCCs in the
3050*4882a593Smuzhiyun * sequence being scanned.
3051*4882a593Smuzhiyun *
3052*4882a593Smuzhiyun * Therefore:
3053*4882a593Smuzhiyun * u8c->p != NULL -> a decomposition is being scanned.
3054*4882a593Smuzhiyun * u8c->ss != NULL -> this is a repeating scan.
3055*4882a593Smuzhiyun * u8c->ccc == -1 -> this is the first scan of a repeating scan.
3056*4882a593Smuzhiyun */
utf8byte(struct utf8cursor * u8c)3057*4882a593Smuzhiyun int utf8byte(struct utf8cursor *u8c)
3058*4882a593Smuzhiyun {
3059*4882a593Smuzhiyun utf8leaf_t *leaf;
3060*4882a593Smuzhiyun int ccc;
3061*4882a593Smuzhiyun
3062*4882a593Smuzhiyun for (;;) {
3063*4882a593Smuzhiyun /* Check for the end of a decomposed character. */
3064*4882a593Smuzhiyun if (u8c->p && *u8c->s == '\0') {
3065*4882a593Smuzhiyun u8c->s = u8c->p;
3066*4882a593Smuzhiyun u8c->p = NULL;
3067*4882a593Smuzhiyun }
3068*4882a593Smuzhiyun
3069*4882a593Smuzhiyun /* Check for end-of-string. */
3070*4882a593Smuzhiyun if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3071*4882a593Smuzhiyun /* There is no next byte. */
3072*4882a593Smuzhiyun if (u8c->ccc == STOPPER)
3073*4882a593Smuzhiyun return 0;
3074*4882a593Smuzhiyun /* End-of-string during a scan counts as a stopper. */
3075*4882a593Smuzhiyun ccc = STOPPER;
3076*4882a593Smuzhiyun goto ccc_mismatch;
3077*4882a593Smuzhiyun } else if ((*u8c->s & 0xC0) == 0x80) {
3078*4882a593Smuzhiyun /* This is a continuation of the current character. */
3079*4882a593Smuzhiyun if (!u8c->p)
3080*4882a593Smuzhiyun u8c->len--;
3081*4882a593Smuzhiyun return (unsigned char)*u8c->s++;
3082*4882a593Smuzhiyun }
3083*4882a593Smuzhiyun
3084*4882a593Smuzhiyun /* Look up the data for the current character. */
3085*4882a593Smuzhiyun if (u8c->p) {
3086*4882a593Smuzhiyun leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3087*4882a593Smuzhiyun } else {
3088*4882a593Smuzhiyun leaf = utf8nlookup(u8c->tree, u8c->hangul,
3089*4882a593Smuzhiyun u8c->s, u8c->len);
3090*4882a593Smuzhiyun }
3091*4882a593Smuzhiyun
3092*4882a593Smuzhiyun /* No leaf found implies that the input is a binary blob. */
3093*4882a593Smuzhiyun if (!leaf)
3094*4882a593Smuzhiyun return -1;
3095*4882a593Smuzhiyun
3096*4882a593Smuzhiyun /* Characters that are too new have CCC 0. */
3097*4882a593Smuzhiyun if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3098*4882a593Smuzhiyun ccc = STOPPER;
3099*4882a593Smuzhiyun } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3100*4882a593Smuzhiyun u8c->len -= utf8clen(u8c->s);
3101*4882a593Smuzhiyun u8c->p = u8c->s + utf8clen(u8c->s);
3102*4882a593Smuzhiyun u8c->s = LEAF_STR(leaf);
3103*4882a593Smuzhiyun /* Empty decomposition implies CCC 0. */
3104*4882a593Smuzhiyun if (*u8c->s == '\0') {
3105*4882a593Smuzhiyun if (u8c->ccc == STOPPER)
3106*4882a593Smuzhiyun continue;
3107*4882a593Smuzhiyun ccc = STOPPER;
3108*4882a593Smuzhiyun goto ccc_mismatch;
3109*4882a593Smuzhiyun }
3110*4882a593Smuzhiyun leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3111*4882a593Smuzhiyun ccc = LEAF_CCC(leaf);
3112*4882a593Smuzhiyun }
3113*4882a593Smuzhiyun u8c->unichar = utf8decode(u8c->s);
3114*4882a593Smuzhiyun
3115*4882a593Smuzhiyun /*
3116*4882a593Smuzhiyun * If this is not a stopper, then see if it updates
3117*4882a593Smuzhiyun * the next canonical class to be emitted.
3118*4882a593Smuzhiyun */
3119*4882a593Smuzhiyun if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3120*4882a593Smuzhiyun u8c->nccc = ccc;
3121*4882a593Smuzhiyun
3122*4882a593Smuzhiyun /*
3123*4882a593Smuzhiyun * Return the current byte if this is the current
3124*4882a593Smuzhiyun * combining class.
3125*4882a593Smuzhiyun */
3126*4882a593Smuzhiyun if (ccc == u8c->ccc) {
3127*4882a593Smuzhiyun if (!u8c->p)
3128*4882a593Smuzhiyun u8c->len--;
3129*4882a593Smuzhiyun return (unsigned char)*u8c->s++;
3130*4882a593Smuzhiyun }
3131*4882a593Smuzhiyun
3132*4882a593Smuzhiyun /* Current combining class mismatch. */
3133*4882a593Smuzhiyun ccc_mismatch:
3134*4882a593Smuzhiyun if (u8c->nccc == STOPPER) {
3135*4882a593Smuzhiyun /*
3136*4882a593Smuzhiyun * Scan forward for the first canonical class
3137*4882a593Smuzhiyun * to be emitted. Save the position from
3138*4882a593Smuzhiyun * which to restart.
3139*4882a593Smuzhiyun */
3140*4882a593Smuzhiyun assert(u8c->ccc == STOPPER);
3141*4882a593Smuzhiyun u8c->ccc = MINCCC - 1;
3142*4882a593Smuzhiyun u8c->nccc = ccc;
3143*4882a593Smuzhiyun u8c->sp = u8c->p;
3144*4882a593Smuzhiyun u8c->ss = u8c->s;
3145*4882a593Smuzhiyun u8c->slen = u8c->len;
3146*4882a593Smuzhiyun if (!u8c->p)
3147*4882a593Smuzhiyun u8c->len -= utf8clen(u8c->s);
3148*4882a593Smuzhiyun u8c->s += utf8clen(u8c->s);
3149*4882a593Smuzhiyun } else if (ccc != STOPPER) {
3150*4882a593Smuzhiyun /* Not a stopper, and not the ccc we're emitting. */
3151*4882a593Smuzhiyun if (!u8c->p)
3152*4882a593Smuzhiyun u8c->len -= utf8clen(u8c->s);
3153*4882a593Smuzhiyun u8c->s += utf8clen(u8c->s);
3154*4882a593Smuzhiyun } else if (u8c->nccc != MAXCCC + 1) {
3155*4882a593Smuzhiyun /* At a stopper, restart for next ccc. */
3156*4882a593Smuzhiyun u8c->ccc = u8c->nccc;
3157*4882a593Smuzhiyun u8c->nccc = MAXCCC + 1;
3158*4882a593Smuzhiyun u8c->s = u8c->ss;
3159*4882a593Smuzhiyun u8c->p = u8c->sp;
3160*4882a593Smuzhiyun u8c->len = u8c->slen;
3161*4882a593Smuzhiyun } else {
3162*4882a593Smuzhiyun /* All done, proceed from here. */
3163*4882a593Smuzhiyun u8c->ccc = STOPPER;
3164*4882a593Smuzhiyun u8c->nccc = STOPPER;
3165*4882a593Smuzhiyun u8c->sp = NULL;
3166*4882a593Smuzhiyun u8c->ss = NULL;
3167*4882a593Smuzhiyun u8c->slen = 0;
3168*4882a593Smuzhiyun }
3169*4882a593Smuzhiyun }
3170*4882a593Smuzhiyun }
3171*4882a593Smuzhiyun
3172*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
3173*4882a593Smuzhiyun
normalize_line(struct tree * tree)3174*4882a593Smuzhiyun static int normalize_line(struct tree *tree)
3175*4882a593Smuzhiyun {
3176*4882a593Smuzhiyun char *s;
3177*4882a593Smuzhiyun char *t;
3178*4882a593Smuzhiyun int c;
3179*4882a593Smuzhiyun struct utf8cursor u8c;
3180*4882a593Smuzhiyun
3181*4882a593Smuzhiyun /* First test: null-terminated string. */
3182*4882a593Smuzhiyun s = buf2;
3183*4882a593Smuzhiyun t = buf3;
3184*4882a593Smuzhiyun if (utf8cursor(&u8c, tree, s))
3185*4882a593Smuzhiyun return -1;
3186*4882a593Smuzhiyun while ((c = utf8byte(&u8c)) > 0)
3187*4882a593Smuzhiyun if (c != (unsigned char)*t++)
3188*4882a593Smuzhiyun return -1;
3189*4882a593Smuzhiyun if (c < 0)
3190*4882a593Smuzhiyun return -1;
3191*4882a593Smuzhiyun if (*t != 0)
3192*4882a593Smuzhiyun return -1;
3193*4882a593Smuzhiyun
3194*4882a593Smuzhiyun /* Second test: length-limited string. */
3195*4882a593Smuzhiyun s = buf2;
3196*4882a593Smuzhiyun /* Replace NUL with a value that will cause an error if seen. */
3197*4882a593Smuzhiyun s[strlen(s) + 1] = -1;
3198*4882a593Smuzhiyun t = buf3;
3199*4882a593Smuzhiyun if (utf8cursor(&u8c, tree, s))
3200*4882a593Smuzhiyun return -1;
3201*4882a593Smuzhiyun while ((c = utf8byte(&u8c)) > 0)
3202*4882a593Smuzhiyun if (c != (unsigned char)*t++)
3203*4882a593Smuzhiyun return -1;
3204*4882a593Smuzhiyun if (c < 0)
3205*4882a593Smuzhiyun return -1;
3206*4882a593Smuzhiyun if (*t != 0)
3207*4882a593Smuzhiyun return -1;
3208*4882a593Smuzhiyun
3209*4882a593Smuzhiyun return 0;
3210*4882a593Smuzhiyun }
3211*4882a593Smuzhiyun
normalization_test(void)3212*4882a593Smuzhiyun static void normalization_test(void)
3213*4882a593Smuzhiyun {
3214*4882a593Smuzhiyun FILE *file;
3215*4882a593Smuzhiyun unsigned int unichar;
3216*4882a593Smuzhiyun struct unicode_data *data;
3217*4882a593Smuzhiyun char *s;
3218*4882a593Smuzhiyun char *t;
3219*4882a593Smuzhiyun int ret;
3220*4882a593Smuzhiyun int ignorables;
3221*4882a593Smuzhiyun int tests = 0;
3222*4882a593Smuzhiyun int failures = 0;
3223*4882a593Smuzhiyun
3224*4882a593Smuzhiyun if (verbose > 0)
3225*4882a593Smuzhiyun printf("Parsing %s\n", test_name);
3226*4882a593Smuzhiyun /* Step one, read data from file. */
3227*4882a593Smuzhiyun file = fopen(test_name, "r");
3228*4882a593Smuzhiyun if (!file)
3229*4882a593Smuzhiyun open_fail(test_name, errno);
3230*4882a593Smuzhiyun
3231*4882a593Smuzhiyun while (fgets(line, LINESIZE, file)) {
3232*4882a593Smuzhiyun ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3233*4882a593Smuzhiyun buf0, buf1);
3234*4882a593Smuzhiyun if (ret != 2 || *line == '#')
3235*4882a593Smuzhiyun continue;
3236*4882a593Smuzhiyun s = buf0;
3237*4882a593Smuzhiyun t = buf2;
3238*4882a593Smuzhiyun while (*s) {
3239*4882a593Smuzhiyun unichar = strtoul(s, &s, 16);
3240*4882a593Smuzhiyun t += utf8encode(t, unichar);
3241*4882a593Smuzhiyun }
3242*4882a593Smuzhiyun *t = '\0';
3243*4882a593Smuzhiyun
3244*4882a593Smuzhiyun ignorables = 0;
3245*4882a593Smuzhiyun s = buf1;
3246*4882a593Smuzhiyun t = buf3;
3247*4882a593Smuzhiyun while (*s) {
3248*4882a593Smuzhiyun unichar = strtoul(s, &s, 16);
3249*4882a593Smuzhiyun data = &unicode_data[unichar];
3250*4882a593Smuzhiyun if (data->utf8nfdi && !*data->utf8nfdi)
3251*4882a593Smuzhiyun ignorables = 1;
3252*4882a593Smuzhiyun else
3253*4882a593Smuzhiyun t += utf8encode(t, unichar);
3254*4882a593Smuzhiyun }
3255*4882a593Smuzhiyun *t = '\0';
3256*4882a593Smuzhiyun
3257*4882a593Smuzhiyun tests++;
3258*4882a593Smuzhiyun if (normalize_line(nfdi_tree) < 0) {
3259*4882a593Smuzhiyun printf("Line %s -> %s", buf0, buf1);
3260*4882a593Smuzhiyun if (ignorables)
3261*4882a593Smuzhiyun printf(" (ignorables removed)");
3262*4882a593Smuzhiyun printf(" failure\n");
3263*4882a593Smuzhiyun failures++;
3264*4882a593Smuzhiyun }
3265*4882a593Smuzhiyun }
3266*4882a593Smuzhiyun fclose(file);
3267*4882a593Smuzhiyun if (verbose > 0)
3268*4882a593Smuzhiyun printf("Ran %d tests with %d failures\n", tests, failures);
3269*4882a593Smuzhiyun if (failures)
3270*4882a593Smuzhiyun file_fail(test_name);
3271*4882a593Smuzhiyun }
3272*4882a593Smuzhiyun
3273*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
3274*4882a593Smuzhiyun
write_file(void)3275*4882a593Smuzhiyun static void write_file(void)
3276*4882a593Smuzhiyun {
3277*4882a593Smuzhiyun FILE *file;
3278*4882a593Smuzhiyun int i;
3279*4882a593Smuzhiyun int j;
3280*4882a593Smuzhiyun int t;
3281*4882a593Smuzhiyun int gen;
3282*4882a593Smuzhiyun
3283*4882a593Smuzhiyun if (verbose > 0)
3284*4882a593Smuzhiyun printf("Writing %s\n", utf8_name);
3285*4882a593Smuzhiyun file = fopen(utf8_name, "w");
3286*4882a593Smuzhiyun if (!file)
3287*4882a593Smuzhiyun open_fail(utf8_name, errno);
3288*4882a593Smuzhiyun
3289*4882a593Smuzhiyun fprintf(file, "/* This file is generated code, do not edit. */\n");
3290*4882a593Smuzhiyun fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3291*4882a593Smuzhiyun fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3292*4882a593Smuzhiyun fprintf(file, "#endif\n");
3293*4882a593Smuzhiyun fprintf(file, "\n");
3294*4882a593Smuzhiyun fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3295*4882a593Smuzhiyun unicode_maxage);
3296*4882a593Smuzhiyun fprintf(file, "\n");
3297*4882a593Smuzhiyun fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3298*4882a593Smuzhiyun for (i = 0; i != ages_count; i++)
3299*4882a593Smuzhiyun fprintf(file, "\t%#x%s\n", ages[i],
3300*4882a593Smuzhiyun ages[i] == unicode_maxage ? "" : ",");
3301*4882a593Smuzhiyun fprintf(file, "};\n");
3302*4882a593Smuzhiyun fprintf(file, "\n");
3303*4882a593Smuzhiyun fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3304*4882a593Smuzhiyun t = 0;
3305*4882a593Smuzhiyun for (gen = 0; gen < ages_count; gen++) {
3306*4882a593Smuzhiyun fprintf(file, "\t{ %#x, %d }%s\n",
3307*4882a593Smuzhiyun ages[gen], trees[t].index,
3308*4882a593Smuzhiyun ages[gen] == unicode_maxage ? "" : ",");
3309*4882a593Smuzhiyun if (trees[t].maxage == ages[gen])
3310*4882a593Smuzhiyun t += 2;
3311*4882a593Smuzhiyun }
3312*4882a593Smuzhiyun fprintf(file, "};\n");
3313*4882a593Smuzhiyun fprintf(file, "\n");
3314*4882a593Smuzhiyun fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3315*4882a593Smuzhiyun t = 1;
3316*4882a593Smuzhiyun for (gen = 0; gen < ages_count; gen++) {
3317*4882a593Smuzhiyun fprintf(file, "\t{ %#x, %d }%s\n",
3318*4882a593Smuzhiyun ages[gen], trees[t].index,
3319*4882a593Smuzhiyun ages[gen] == unicode_maxage ? "" : ",");
3320*4882a593Smuzhiyun if (trees[t].maxage == ages[gen])
3321*4882a593Smuzhiyun t += 2;
3322*4882a593Smuzhiyun }
3323*4882a593Smuzhiyun fprintf(file, "};\n");
3324*4882a593Smuzhiyun fprintf(file, "\n");
3325*4882a593Smuzhiyun fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3326*4882a593Smuzhiyun utf8data_size);
3327*4882a593Smuzhiyun t = 0;
3328*4882a593Smuzhiyun for (i = 0; i != utf8data_size; i += 16) {
3329*4882a593Smuzhiyun if (i == trees[t].index) {
3330*4882a593Smuzhiyun fprintf(file, "\t/* %s_%x */\n",
3331*4882a593Smuzhiyun trees[t].type, trees[t].maxage);
3332*4882a593Smuzhiyun if (t < trees_count-1)
3333*4882a593Smuzhiyun t++;
3334*4882a593Smuzhiyun }
3335*4882a593Smuzhiyun fprintf(file, "\t");
3336*4882a593Smuzhiyun for (j = i; j != i + 16; j++)
3337*4882a593Smuzhiyun fprintf(file, "0x%.2x%s", utf8data[j],
3338*4882a593Smuzhiyun (j < utf8data_size -1 ? "," : ""));
3339*4882a593Smuzhiyun fprintf(file, "\n");
3340*4882a593Smuzhiyun }
3341*4882a593Smuzhiyun fprintf(file, "};\n");
3342*4882a593Smuzhiyun fclose(file);
3343*4882a593Smuzhiyun }
3344*4882a593Smuzhiyun
3345*4882a593Smuzhiyun /* ------------------------------------------------------------------ */
3346*4882a593Smuzhiyun
main(int argc,char * argv[])3347*4882a593Smuzhiyun int main(int argc, char *argv[])
3348*4882a593Smuzhiyun {
3349*4882a593Smuzhiyun unsigned int unichar;
3350*4882a593Smuzhiyun int opt;
3351*4882a593Smuzhiyun
3352*4882a593Smuzhiyun argv0 = argv[0];
3353*4882a593Smuzhiyun
3354*4882a593Smuzhiyun while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3355*4882a593Smuzhiyun switch (opt) {
3356*4882a593Smuzhiyun case 'a':
3357*4882a593Smuzhiyun age_name = optarg;
3358*4882a593Smuzhiyun break;
3359*4882a593Smuzhiyun case 'c':
3360*4882a593Smuzhiyun ccc_name = optarg;
3361*4882a593Smuzhiyun break;
3362*4882a593Smuzhiyun case 'd':
3363*4882a593Smuzhiyun data_name = optarg;
3364*4882a593Smuzhiyun break;
3365*4882a593Smuzhiyun case 'f':
3366*4882a593Smuzhiyun fold_name = optarg;
3367*4882a593Smuzhiyun break;
3368*4882a593Smuzhiyun case 'n':
3369*4882a593Smuzhiyun norm_name = optarg;
3370*4882a593Smuzhiyun break;
3371*4882a593Smuzhiyun case 'o':
3372*4882a593Smuzhiyun utf8_name = optarg;
3373*4882a593Smuzhiyun break;
3374*4882a593Smuzhiyun case 'p':
3375*4882a593Smuzhiyun prop_name = optarg;
3376*4882a593Smuzhiyun break;
3377*4882a593Smuzhiyun case 't':
3378*4882a593Smuzhiyun test_name = optarg;
3379*4882a593Smuzhiyun break;
3380*4882a593Smuzhiyun case 'v':
3381*4882a593Smuzhiyun verbose++;
3382*4882a593Smuzhiyun break;
3383*4882a593Smuzhiyun case 'h':
3384*4882a593Smuzhiyun help();
3385*4882a593Smuzhiyun exit(0);
3386*4882a593Smuzhiyun default:
3387*4882a593Smuzhiyun usage();
3388*4882a593Smuzhiyun }
3389*4882a593Smuzhiyun }
3390*4882a593Smuzhiyun
3391*4882a593Smuzhiyun if (verbose > 1)
3392*4882a593Smuzhiyun help();
3393*4882a593Smuzhiyun for (unichar = 0; unichar != 0x110000; unichar++)
3394*4882a593Smuzhiyun unicode_data[unichar].code = unichar;
3395*4882a593Smuzhiyun age_init();
3396*4882a593Smuzhiyun ccc_init();
3397*4882a593Smuzhiyun nfdi_init();
3398*4882a593Smuzhiyun nfdicf_init();
3399*4882a593Smuzhiyun ignore_init();
3400*4882a593Smuzhiyun corrections_init();
3401*4882a593Smuzhiyun hangul_decompose();
3402*4882a593Smuzhiyun nfdi_decompose();
3403*4882a593Smuzhiyun nfdicf_decompose();
3404*4882a593Smuzhiyun utf8_init();
3405*4882a593Smuzhiyun trees_init();
3406*4882a593Smuzhiyun trees_populate();
3407*4882a593Smuzhiyun trees_reduce();
3408*4882a593Smuzhiyun trees_verify();
3409*4882a593Smuzhiyun /* Prevent "unused function" warning. */
3410*4882a593Smuzhiyun (void)lookup(nfdi_tree, " ");
3411*4882a593Smuzhiyun if (verbose > 2)
3412*4882a593Smuzhiyun tree_walk(nfdi_tree);
3413*4882a593Smuzhiyun if (verbose > 2)
3414*4882a593Smuzhiyun tree_walk(nfdicf_tree);
3415*4882a593Smuzhiyun normalization_test();
3416*4882a593Smuzhiyun write_file();
3417*4882a593Smuzhiyun
3418*4882a593Smuzhiyun return 0;
3419*4882a593Smuzhiyun }
3420