1*4882a593Smuzhiyun /* SPDX-License-Identifier: GPL-2.0-only */ 2*4882a593Smuzhiyun /* 3*4882a593Smuzhiyun * Copyright (c) 2014 SGI. 4*4882a593Smuzhiyun * All rights reserved. 5*4882a593Smuzhiyun */ 6*4882a593Smuzhiyun 7*4882a593Smuzhiyun #ifndef UTF8NORM_H 8*4882a593Smuzhiyun #define UTF8NORM_H 9*4882a593Smuzhiyun 10*4882a593Smuzhiyun #include <linux/types.h> 11*4882a593Smuzhiyun #include <linux/export.h> 12*4882a593Smuzhiyun #include <linux/string.h> 13*4882a593Smuzhiyun #include <linux/module.h> 14*4882a593Smuzhiyun 15*4882a593Smuzhiyun /* Encoding a unicode version number as a single unsigned int. */ 16*4882a593Smuzhiyun #define UNICODE_MAJ_SHIFT (16) 17*4882a593Smuzhiyun #define UNICODE_MIN_SHIFT (8) 18*4882a593Smuzhiyun 19*4882a593Smuzhiyun #define UNICODE_AGE(MAJ, MIN, REV) \ 20*4882a593Smuzhiyun (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ 21*4882a593Smuzhiyun ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ 22*4882a593Smuzhiyun ((unsigned int)(REV))) 23*4882a593Smuzhiyun 24*4882a593Smuzhiyun /* Highest unicode version supported by the data tables. */ 25*4882a593Smuzhiyun extern int utf8version_is_supported(u8 maj, u8 min, u8 rev); 26*4882a593Smuzhiyun extern int utf8version_latest(void); 27*4882a593Smuzhiyun 28*4882a593Smuzhiyun /* 29*4882a593Smuzhiyun * Look for the correct const struct utf8data for a unicode version. 30*4882a593Smuzhiyun * Returns NULL if the version requested is too new. 31*4882a593Smuzhiyun * 32*4882a593Smuzhiyun * Two normalization forms are supported: nfdi and nfdicf. 33*4882a593Smuzhiyun * 34*4882a593Smuzhiyun * nfdi: 35*4882a593Smuzhiyun * - Apply unicode normalization form NFD. 36*4882a593Smuzhiyun * - Remove any Default_Ignorable_Code_Point. 37*4882a593Smuzhiyun * 38*4882a593Smuzhiyun * nfdicf: 39*4882a593Smuzhiyun * - Apply unicode normalization form NFD. 40*4882a593Smuzhiyun * - Remove any Default_Ignorable_Code_Point. 41*4882a593Smuzhiyun * - Apply a full casefold (C + F). 42*4882a593Smuzhiyun */ 43*4882a593Smuzhiyun extern const struct utf8data *utf8nfdi(unsigned int maxage); 44*4882a593Smuzhiyun extern const struct utf8data *utf8nfdicf(unsigned int maxage); 45*4882a593Smuzhiyun 46*4882a593Smuzhiyun /* 47*4882a593Smuzhiyun * Determine the maximum age of any unicode character in the string. 48*4882a593Smuzhiyun * Returns 0 if only unassigned code points are present. 49*4882a593Smuzhiyun * Returns -1 if the input is not valid UTF-8. 50*4882a593Smuzhiyun */ 51*4882a593Smuzhiyun extern int utf8agemax(const struct utf8data *data, const char *s); 52*4882a593Smuzhiyun extern int utf8nagemax(const struct utf8data *data, const char *s, size_t len); 53*4882a593Smuzhiyun 54*4882a593Smuzhiyun /* 55*4882a593Smuzhiyun * Determine the minimum age of any unicode character in the string. 56*4882a593Smuzhiyun * Returns 0 if any unassigned code points are present. 57*4882a593Smuzhiyun * Returns -1 if the input is not valid UTF-8. 58*4882a593Smuzhiyun */ 59*4882a593Smuzhiyun extern int utf8agemin(const struct utf8data *data, const char *s); 60*4882a593Smuzhiyun extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len); 61*4882a593Smuzhiyun 62*4882a593Smuzhiyun /* 63*4882a593Smuzhiyun * Determine the length of the normalized from of the string, 64*4882a593Smuzhiyun * excluding any terminating NULL byte. 65*4882a593Smuzhiyun * Returns 0 if only ignorable code points are present. 66*4882a593Smuzhiyun * Returns -1 if the input is not valid UTF-8. 67*4882a593Smuzhiyun */ 68*4882a593Smuzhiyun extern ssize_t utf8len(const struct utf8data *data, const char *s); 69*4882a593Smuzhiyun extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len); 70*4882a593Smuzhiyun 71*4882a593Smuzhiyun /* Needed in struct utf8cursor below. */ 72*4882a593Smuzhiyun #define UTF8HANGULLEAF (12) 73*4882a593Smuzhiyun 74*4882a593Smuzhiyun /* 75*4882a593Smuzhiyun * Cursor structure used by the normalizer. 76*4882a593Smuzhiyun */ 77*4882a593Smuzhiyun struct utf8cursor { 78*4882a593Smuzhiyun const struct utf8data *data; 79*4882a593Smuzhiyun const char *s; 80*4882a593Smuzhiyun const char *p; 81*4882a593Smuzhiyun const char *ss; 82*4882a593Smuzhiyun const char *sp; 83*4882a593Smuzhiyun unsigned int len; 84*4882a593Smuzhiyun unsigned int slen; 85*4882a593Smuzhiyun short int ccc; 86*4882a593Smuzhiyun short int nccc; 87*4882a593Smuzhiyun unsigned char hangul[UTF8HANGULLEAF]; 88*4882a593Smuzhiyun }; 89*4882a593Smuzhiyun 90*4882a593Smuzhiyun /* 91*4882a593Smuzhiyun * Initialize a utf8cursor to normalize a string. 92*4882a593Smuzhiyun * Returns 0 on success. 93*4882a593Smuzhiyun * Returns -1 on failure. 94*4882a593Smuzhiyun */ 95*4882a593Smuzhiyun extern int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, 96*4882a593Smuzhiyun const char *s); 97*4882a593Smuzhiyun extern int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, 98*4882a593Smuzhiyun const char *s, size_t len); 99*4882a593Smuzhiyun 100*4882a593Smuzhiyun /* 101*4882a593Smuzhiyun * Get the next byte in the normalization. 102*4882a593Smuzhiyun * Returns a value > 0 && < 256 on success. 103*4882a593Smuzhiyun * Returns 0 when the end of the normalization is reached. 104*4882a593Smuzhiyun * Returns -1 if the string being normalized is not valid UTF-8. 105*4882a593Smuzhiyun */ 106*4882a593Smuzhiyun extern int utf8byte(struct utf8cursor *u8c); 107*4882a593Smuzhiyun 108*4882a593Smuzhiyun #endif /* UTF8NORM_H */ 109