1 #include <mbgl/util/i18n.hpp>
2 #include <mbgl/util/utf.hpp>
3 
4 #include <algorithm>
5 #include <map>
6 
7 namespace {
8 
/** Defines a function that returns true if a codepoint is in a named block.

    The macro expands to `inline bool isIn<name>(char16_t codepoint)`. Because
    the parameter is a `char16_t`, only ranges that fit in 16 bits can be
    tested. Both bounds are wrapped in parentheses so that an argument that is
    an expression (rather than a plain literal) keeps its intended meaning.

    @param name The name of the block in CamelCase.
    @param first The first codepoint in the block, inclusive.
    @param last The last codepoint in the block, inclusive.
 */
#define DEFINE_IS_IN_UNICODE_BLOCK(name, first, last)                                              \
    inline bool isIn##name(char16_t codepoint) {                                                   \
        return (codepoint >= (first)) && (codepoint <= (last));                                    \
    }
18 
// The following table comes from <http://www.unicode.org/Public/10.0.0/ucd/Blocks.txt>.
// Keep it synchronized with <http://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt>.
//
// Each uncommented entry defines an isIn<Name>(char16_t) helper through
// DEFINE_IS_IN_UNICODE_BLOCK; both range bounds are inclusive. Entries that
// are commented out define no helper. They are presumably kept so the list
// can be compared line by line against Blocks.txt; uncomment an entry when a
// helper for that block is needed.

// DEFINE_IS_IN_UNICODE_BLOCK(BasicLatin, 0x0000, 0x007F)
DEFINE_IS_IN_UNICODE_BLOCK(Latin1Supplement, 0x0080, 0x00FF)
// DEFINE_IS_IN_UNICODE_BLOCK(LatinExtendedA, 0x0100, 0x017F)
// DEFINE_IS_IN_UNICODE_BLOCK(LatinExtendedB, 0x0180, 0x024F)
// DEFINE_IS_IN_UNICODE_BLOCK(IPAExtensions, 0x0250, 0x02AF)
// DEFINE_IS_IN_UNICODE_BLOCK(SpacingModifierLetters, 0x02B0, 0x02FF)
// DEFINE_IS_IN_UNICODE_BLOCK(CombiningDiacriticalMarks, 0x0300, 0x036F)
// DEFINE_IS_IN_UNICODE_BLOCK(GreekandCoptic, 0x0370, 0x03FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Cyrillic, 0x0400, 0x04FF)
// DEFINE_IS_IN_UNICODE_BLOCK(CyrillicSupplement, 0x0500, 0x052F)
// DEFINE_IS_IN_UNICODE_BLOCK(Armenian, 0x0530, 0x058F)
// DEFINE_IS_IN_UNICODE_BLOCK(Hebrew, 0x0590, 0x05FF)
DEFINE_IS_IN_UNICODE_BLOCK(Arabic, 0x0600, 0x06FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Syriac, 0x0700, 0x074F)
DEFINE_IS_IN_UNICODE_BLOCK(ArabicSupplement, 0x0750, 0x077F)
// DEFINE_IS_IN_UNICODE_BLOCK(Thaana, 0x0780, 0x07BF)
// DEFINE_IS_IN_UNICODE_BLOCK(NKo, 0x07C0, 0x07FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Samaritan, 0x0800, 0x083F)
// DEFINE_IS_IN_UNICODE_BLOCK(Mandaic, 0x0840, 0x085F)
// DEFINE_IS_IN_UNICODE_BLOCK(SyriacSupplement, 0x0860, 0x086F)
DEFINE_IS_IN_UNICODE_BLOCK(ArabicExtendedA, 0x08A0, 0x08FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Devanagari, 0x0900, 0x097F)
// DEFINE_IS_IN_UNICODE_BLOCK(Bengali, 0x0980, 0x09FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Gurmukhi, 0x0A00, 0x0A7F)
// DEFINE_IS_IN_UNICODE_BLOCK(Gujarati, 0x0A80, 0x0AFF)
// DEFINE_IS_IN_UNICODE_BLOCK(Oriya, 0x0B00, 0x0B7F)
// DEFINE_IS_IN_UNICODE_BLOCK(Tamil, 0x0B80, 0x0BFF)
// DEFINE_IS_IN_UNICODE_BLOCK(Telugu, 0x0C00, 0x0C7F)
// DEFINE_IS_IN_UNICODE_BLOCK(Kannada, 0x0C80, 0x0CFF)
// DEFINE_IS_IN_UNICODE_BLOCK(Malayalam, 0x0D00, 0x0D7F)
// DEFINE_IS_IN_UNICODE_BLOCK(Sinhala, 0x0D80, 0x0DFF)
// DEFINE_IS_IN_UNICODE_BLOCK(Thai, 0x0E00, 0x0E7F)
// DEFINE_IS_IN_UNICODE_BLOCK(Lao, 0x0E80, 0x0EFF)
// DEFINE_IS_IN_UNICODE_BLOCK(Tibetan, 0x0F00, 0x0FFF)
// DEFINE_IS_IN_UNICODE_BLOCK(Myanmar, 0x1000, 0x109F)
// DEFINE_IS_IN_UNICODE_BLOCK(Georgian, 0x10A0, 0x10FF)
DEFINE_IS_IN_UNICODE_BLOCK(HangulJamo, 0x1100, 0x11FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Ethiopic, 0x1200, 0x137F)
// DEFINE_IS_IN_UNICODE_BLOCK(EthiopicSupplement, 0x1380, 0x139F)
// DEFINE_IS_IN_UNICODE_BLOCK(Cherokee, 0x13A0, 0x13FF)
DEFINE_IS_IN_UNICODE_BLOCK(UnifiedCanadianAboriginalSyllabics, 0x1400, 0x167F)
// DEFINE_IS_IN_UNICODE_BLOCK(Ogham, 0x1680, 0x169F)
// DEFINE_IS_IN_UNICODE_BLOCK(Runic, 0x16A0, 0x16FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Tagalog, 0x1700, 0x171F)
// DEFINE_IS_IN_UNICODE_BLOCK(Hanunoo, 0x1720, 0x173F)
// DEFINE_IS_IN_UNICODE_BLOCK(Buhid, 0x1740, 0x175F)
// DEFINE_IS_IN_UNICODE_BLOCK(Tagbanwa, 0x1760, 0x177F)
DEFINE_IS_IN_UNICODE_BLOCK(Khmer, 0x1780, 0x17FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Mongolian, 0x1800, 0x18AF)
DEFINE_IS_IN_UNICODE_BLOCK(UnifiedCanadianAboriginalSyllabicsExtended, 0x18B0, 0x18FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Limbu, 0x1900, 0x194F)
// DEFINE_IS_IN_UNICODE_BLOCK(TaiLe, 0x1950, 0x197F)
// DEFINE_IS_IN_UNICODE_BLOCK(NewTaiLue, 0x1980, 0x19DF)
// DEFINE_IS_IN_UNICODE_BLOCK(KhmerSymbols, 0x19E0, 0x19FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Buginese, 0x1A00, 0x1A1F)
// DEFINE_IS_IN_UNICODE_BLOCK(TaiTham, 0x1A20, 0x1AAF)
// DEFINE_IS_IN_UNICODE_BLOCK(CombiningDiacriticalMarksExtended, 0x1AB0, 0x1AFF)
// DEFINE_IS_IN_UNICODE_BLOCK(Balinese, 0x1B00, 0x1B7F)
// DEFINE_IS_IN_UNICODE_BLOCK(Sundanese, 0x1B80, 0x1BBF)
// DEFINE_IS_IN_UNICODE_BLOCK(Batak, 0x1BC0, 0x1BFF)
// DEFINE_IS_IN_UNICODE_BLOCK(Lepcha, 0x1C00, 0x1C4F)
// DEFINE_IS_IN_UNICODE_BLOCK(OlChiki, 0x1C50, 0x1C7F)
// DEFINE_IS_IN_UNICODE_BLOCK(CyrillicExtendedC, 0x1C80, 0x1C8F)
// DEFINE_IS_IN_UNICODE_BLOCK(SundaneseSupplement, 0x1CC0, 0x1CCF)
// DEFINE_IS_IN_UNICODE_BLOCK(VedicExtensions, 0x1CD0, 0x1CFF)
// DEFINE_IS_IN_UNICODE_BLOCK(PhoneticExtensions, 0x1D00, 0x1D7F)
// DEFINE_IS_IN_UNICODE_BLOCK(PhoneticExtensionsSupplement, 0x1D80, 0x1DBF)
// DEFINE_IS_IN_UNICODE_BLOCK(CombiningDiacriticalMarksSupplement, 0x1DC0, 0x1DFF)
// DEFINE_IS_IN_UNICODE_BLOCK(LatinExtendedAdditional, 0x1E00, 0x1EFF)
// DEFINE_IS_IN_UNICODE_BLOCK(GreekExtended, 0x1F00, 0x1FFF)
DEFINE_IS_IN_UNICODE_BLOCK(GeneralPunctuation, 0x2000, 0x206F)
// DEFINE_IS_IN_UNICODE_BLOCK(SuperscriptsandSubscripts, 0x2070, 0x209F)
// DEFINE_IS_IN_UNICODE_BLOCK(CurrencySymbols, 0x20A0, 0x20CF)
// DEFINE_IS_IN_UNICODE_BLOCK(CombiningDiacriticalMarksforSymbols, 0x20D0, 0x20FF)
DEFINE_IS_IN_UNICODE_BLOCK(LetterlikeSymbols, 0x2100, 0x214F)
DEFINE_IS_IN_UNICODE_BLOCK(NumberForms, 0x2150, 0x218F)
// DEFINE_IS_IN_UNICODE_BLOCK(Arrows, 0x2190, 0x21FF)
// DEFINE_IS_IN_UNICODE_BLOCK(MathematicalOperators, 0x2200, 0x22FF)
DEFINE_IS_IN_UNICODE_BLOCK(MiscellaneousTechnical, 0x2300, 0x23FF)
DEFINE_IS_IN_UNICODE_BLOCK(ControlPictures, 0x2400, 0x243F)
DEFINE_IS_IN_UNICODE_BLOCK(OpticalCharacterRecognition, 0x2440, 0x245F)
DEFINE_IS_IN_UNICODE_BLOCK(EnclosedAlphanumerics, 0x2460, 0x24FF)
// DEFINE_IS_IN_UNICODE_BLOCK(BoxDrawing, 0x2500, 0x257F)
// DEFINE_IS_IN_UNICODE_BLOCK(BlockElements, 0x2580, 0x259F)
DEFINE_IS_IN_UNICODE_BLOCK(GeometricShapes, 0x25A0, 0x25FF)
DEFINE_IS_IN_UNICODE_BLOCK(MiscellaneousSymbols, 0x2600, 0x26FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Dingbats, 0x2700, 0x27BF)
// DEFINE_IS_IN_UNICODE_BLOCK(MiscellaneousMathematicalSymbolsA, 0x27C0, 0x27EF)
// DEFINE_IS_IN_UNICODE_BLOCK(SupplementalArrowsA, 0x27F0, 0x27FF)
// DEFINE_IS_IN_UNICODE_BLOCK(BraillePatterns, 0x2800, 0x28FF)
// DEFINE_IS_IN_UNICODE_BLOCK(SupplementalArrowsB, 0x2900, 0x297F)
// DEFINE_IS_IN_UNICODE_BLOCK(MiscellaneousMathematicalSymbolsB, 0x2980, 0x29FF)
// DEFINE_IS_IN_UNICODE_BLOCK(SupplementalMathematicalOperators, 0x2A00, 0x2AFF)
// DEFINE_IS_IN_UNICODE_BLOCK(MiscellaneousSymbolsandArrows, 0x2B00, 0x2BFF)
// DEFINE_IS_IN_UNICODE_BLOCK(Glagolitic, 0x2C00, 0x2C5F)
// DEFINE_IS_IN_UNICODE_BLOCK(LatinExtendedC, 0x2C60, 0x2C7F)
// DEFINE_IS_IN_UNICODE_BLOCK(Coptic, 0x2C80, 0x2CFF)
// DEFINE_IS_IN_UNICODE_BLOCK(GeorgianSupplement, 0x2D00, 0x2D2F)
// DEFINE_IS_IN_UNICODE_BLOCK(Tifinagh, 0x2D30, 0x2D7F)
// DEFINE_IS_IN_UNICODE_BLOCK(EthiopicExtended, 0x2D80, 0x2DDF)
// DEFINE_IS_IN_UNICODE_BLOCK(CyrillicExtendedA, 0x2DE0, 0x2DFF)
// DEFINE_IS_IN_UNICODE_BLOCK(SupplementalPunctuation, 0x2E00, 0x2E7F)
DEFINE_IS_IN_UNICODE_BLOCK(CJKRadicalsSupplement, 0x2E80, 0x2EFF)
DEFINE_IS_IN_UNICODE_BLOCK(KangxiRadicals, 0x2F00, 0x2FDF)
DEFINE_IS_IN_UNICODE_BLOCK(IdeographicDescriptionCharacters, 0x2FF0, 0x2FFF)
DEFINE_IS_IN_UNICODE_BLOCK(CJKSymbolsandPunctuation, 0x3000, 0x303F)
DEFINE_IS_IN_UNICODE_BLOCK(Hiragana, 0x3040, 0x309F)
DEFINE_IS_IN_UNICODE_BLOCK(Katakana, 0x30A0, 0x30FF)
DEFINE_IS_IN_UNICODE_BLOCK(Bopomofo, 0x3100, 0x312F)
DEFINE_IS_IN_UNICODE_BLOCK(HangulCompatibilityJamo, 0x3130, 0x318F)
DEFINE_IS_IN_UNICODE_BLOCK(Kanbun, 0x3190, 0x319F)
DEFINE_IS_IN_UNICODE_BLOCK(BopomofoExtended, 0x31A0, 0x31BF)
DEFINE_IS_IN_UNICODE_BLOCK(CJKStrokes, 0x31C0, 0x31EF)
DEFINE_IS_IN_UNICODE_BLOCK(KatakanaPhoneticExtensions, 0x31F0, 0x31FF)
DEFINE_IS_IN_UNICODE_BLOCK(EnclosedCJKLettersandMonths, 0x3200, 0x32FF)
DEFINE_IS_IN_UNICODE_BLOCK(CJKCompatibility, 0x3300, 0x33FF)
DEFINE_IS_IN_UNICODE_BLOCK(CJKUnifiedIdeographsExtensionA, 0x3400, 0x4DBF)
DEFINE_IS_IN_UNICODE_BLOCK(YijingHexagramSymbols, 0x4DC0, 0x4DFF)
DEFINE_IS_IN_UNICODE_BLOCK(CJKUnifiedIdeographs, 0x4E00, 0x9FFF)
DEFINE_IS_IN_UNICODE_BLOCK(YiSyllables, 0xA000, 0xA48F)
DEFINE_IS_IN_UNICODE_BLOCK(YiRadicals, 0xA490, 0xA4CF)
// DEFINE_IS_IN_UNICODE_BLOCK(Lisu, 0xA4D0, 0xA4FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Vai, 0xA500, 0xA63F)
// DEFINE_IS_IN_UNICODE_BLOCK(CyrillicExtendedB, 0xA640, 0xA69F)
// DEFINE_IS_IN_UNICODE_BLOCK(Bamum, 0xA6A0, 0xA6FF)
// DEFINE_IS_IN_UNICODE_BLOCK(ModifierToneLetters, 0xA700, 0xA71F)
// DEFINE_IS_IN_UNICODE_BLOCK(LatinExtendedD, 0xA720, 0xA7FF)
// DEFINE_IS_IN_UNICODE_BLOCK(SylotiNagri, 0xA800, 0xA82F)
// DEFINE_IS_IN_UNICODE_BLOCK(CommonIndicNumberForms, 0xA830, 0xA83F)
// DEFINE_IS_IN_UNICODE_BLOCK(Phagspa, 0xA840, 0xA87F)
// DEFINE_IS_IN_UNICODE_BLOCK(Saurashtra, 0xA880, 0xA8DF)
// DEFINE_IS_IN_UNICODE_BLOCK(DevanagariExtended, 0xA8E0, 0xA8FF)
// DEFINE_IS_IN_UNICODE_BLOCK(KayahLi, 0xA900, 0xA92F)
// DEFINE_IS_IN_UNICODE_BLOCK(Rejang, 0xA930, 0xA95F)
DEFINE_IS_IN_UNICODE_BLOCK(HangulJamoExtendedA, 0xA960, 0xA97F)
// DEFINE_IS_IN_UNICODE_BLOCK(Javanese, 0xA980, 0xA9DF)
// DEFINE_IS_IN_UNICODE_BLOCK(MyanmarExtendedB, 0xA9E0, 0xA9FF)
// DEFINE_IS_IN_UNICODE_BLOCK(Cham, 0xAA00, 0xAA5F)
// DEFINE_IS_IN_UNICODE_BLOCK(MyanmarExtendedA, 0xAA60, 0xAA7F)
// DEFINE_IS_IN_UNICODE_BLOCK(TaiViet, 0xAA80, 0xAADF)
// DEFINE_IS_IN_UNICODE_BLOCK(MeeteiMayekExtensions, 0xAAE0, 0xAAFF)
// DEFINE_IS_IN_UNICODE_BLOCK(EthiopicExtendedA, 0xAB00, 0xAB2F)
// DEFINE_IS_IN_UNICODE_BLOCK(LatinExtendedE, 0xAB30, 0xAB6F)
// DEFINE_IS_IN_UNICODE_BLOCK(CherokeeSupplement, 0xAB70, 0xABBF)
// DEFINE_IS_IN_UNICODE_BLOCK(MeeteiMayek, 0xABC0, 0xABFF)
DEFINE_IS_IN_UNICODE_BLOCK(HangulSyllables, 0xAC00, 0xD7AF)
DEFINE_IS_IN_UNICODE_BLOCK(HangulJamoExtendedB, 0xD7B0, 0xD7FF)
// DEFINE_IS_IN_UNICODE_BLOCK(HighSurrogates, 0xD800, 0xDB7F)
// DEFINE_IS_IN_UNICODE_BLOCK(HighPrivateUseSurrogates, 0xDB80, 0xDBFF)
// DEFINE_IS_IN_UNICODE_BLOCK(LowSurrogates, 0xDC00, 0xDFFF)
DEFINE_IS_IN_UNICODE_BLOCK(PrivateUseArea, 0xE000, 0xF8FF)
DEFINE_IS_IN_UNICODE_BLOCK(CJKCompatibilityIdeographs, 0xF900, 0xFAFF)
// DEFINE_IS_IN_UNICODE_BLOCK(AlphabeticPresentationForms, 0xFB00, 0xFB4F)
DEFINE_IS_IN_UNICODE_BLOCK(ArabicPresentationFormsA, 0xFB50, 0xFDFF)
// DEFINE_IS_IN_UNICODE_BLOCK(VariationSelectors, 0xFE00, 0xFE0F)
DEFINE_IS_IN_UNICODE_BLOCK(VerticalForms, 0xFE10, 0xFE1F)
// DEFINE_IS_IN_UNICODE_BLOCK(CombiningHalfMarks, 0xFE20, 0xFE2F)
DEFINE_IS_IN_UNICODE_BLOCK(CJKCompatibilityForms, 0xFE30, 0xFE4F)
DEFINE_IS_IN_UNICODE_BLOCK(SmallFormVariants, 0xFE50, 0xFE6F)
DEFINE_IS_IN_UNICODE_BLOCK(ArabicPresentationFormsB, 0xFE70, 0xFEFF)
DEFINE_IS_IN_UNICODE_BLOCK(HalfwidthandFullwidthForms, 0xFF00, 0xFFEF)
// DEFINE_IS_IN_UNICODE_BLOCK(Specials, 0xFFF0, 0xFFFF)
184 // DEFINE_IS_IN_UNICODE_BLOCK(LinearBSyllabary, 0x10000, 0x1007F)
185 // DEFINE_IS_IN_UNICODE_BLOCK(LinearBIdeograms, 0x10080, 0x100FF)
186 // DEFINE_IS_IN_UNICODE_BLOCK(AegeanNumbers, 0x10100, 0x1013F)
187 // DEFINE_IS_IN_UNICODE_BLOCK(AncientGreekNumbers, 0x10140, 0x1018F)
188 // DEFINE_IS_IN_UNICODE_BLOCK(AncientSymbols, 0x10190, 0x101CF)
189 // DEFINE_IS_IN_UNICODE_BLOCK(PhaistosDisc, 0x101D0, 0x101FF)
190 // DEFINE_IS_IN_UNICODE_BLOCK(Lycian, 0x10280, 0x1029F)
191 // DEFINE_IS_IN_UNICODE_BLOCK(Carian, 0x102A0, 0x102DF)
192 // DEFINE_IS_IN_UNICODE_BLOCK(CopticEpactNumbers, 0x102E0, 0x102FF)
193 // DEFINE_IS_IN_UNICODE_BLOCK(OldItalic, 0x10300, 0x1032F)
194 // DEFINE_IS_IN_UNICODE_BLOCK(Gothic, 0x10330, 0x1034F)
195 // DEFINE_IS_IN_UNICODE_BLOCK(OldPermic, 0x10350, 0x1037F)
196 // DEFINE_IS_IN_UNICODE_BLOCK(Ugaritic, 0x10380, 0x1039F)
197 // DEFINE_IS_IN_UNICODE_BLOCK(OldPersian, 0x103A0, 0x103DF)
198 // DEFINE_IS_IN_UNICODE_BLOCK(Deseret, 0x10400, 0x1044F)
199 // DEFINE_IS_IN_UNICODE_BLOCK(Shavian, 0x10450, 0x1047F)
200 // DEFINE_IS_IN_UNICODE_BLOCK(Osmanya, 0x10480, 0x104AF)
201 // DEFINE_IS_IN_UNICODE_BLOCK(Osage, 0x104B0, 0x104FF)
202 // DEFINE_IS_IN_UNICODE_BLOCK(Elbasan, 0x10500, 0x1052F)
203 // DEFINE_IS_IN_UNICODE_BLOCK(CaucasianAlbanian, 0x10530, 0x1056F)
204 // DEFINE_IS_IN_UNICODE_BLOCK(LinearA, 0x10600, 0x1077F)
205 // DEFINE_IS_IN_UNICODE_BLOCK(CypriotSyllabary, 0x10800, 0x1083F)
206 // DEFINE_IS_IN_UNICODE_BLOCK(ImperialAramaic, 0x10840, 0x1085F)
207 // DEFINE_IS_IN_UNICODE_BLOCK(Palmyrene, 0x10860, 0x1087F)
208 // DEFINE_IS_IN_UNICODE_BLOCK(Nabataean, 0x10880, 0x108AF)
209 // DEFINE_IS_IN_UNICODE_BLOCK(Hatran, 0x108E0, 0x108FF)
210 // DEFINE_IS_IN_UNICODE_BLOCK(Phoenician, 0x10900, 0x1091F)
211 // DEFINE_IS_IN_UNICODE_BLOCK(Lydian, 0x10920, 0x1093F)
212 // DEFINE_IS_IN_UNICODE_BLOCK(MeroiticHieroglyphs, 0x10980, 0x1099F)
213 // DEFINE_IS_IN_UNICODE_BLOCK(MeroiticCursive, 0x109A0, 0x109FF)
214 // DEFINE_IS_IN_UNICODE_BLOCK(Kharoshthi, 0x10A00, 0x10A5F)
215 // DEFINE_IS_IN_UNICODE_BLOCK(OldSouthArabian, 0x10A60, 0x10A7F)
216 // DEFINE_IS_IN_UNICODE_BLOCK(OldNorthArabian, 0x10A80, 0x10A9F)
217 // DEFINE_IS_IN_UNICODE_BLOCK(Manichaean, 0x10AC0, 0x10AFF)
218 // DEFINE_IS_IN_UNICODE_BLOCK(Avestan, 0x10B00, 0x10B3F)
219 // DEFINE_IS_IN_UNICODE_BLOCK(InscriptionalParthian, 0x10B40, 0x10B5F)
220 // DEFINE_IS_IN_UNICODE_BLOCK(InscriptionalPahlavi, 0x10B60, 0x10B7F)
221 // DEFINE_IS_IN_UNICODE_BLOCK(PsalterPahlavi, 0x10B80, 0x10BAF)
222 // DEFINE_IS_IN_UNICODE_BLOCK(OldTurkic, 0x10C00, 0x10C4F)
223 // DEFINE_IS_IN_UNICODE_BLOCK(OldHungarian, 0x10C80, 0x10CFF)
224 // DEFINE_IS_IN_UNICODE_BLOCK(RumiNumeralSymbols, 0x10E60, 0x10E7F)
225 // DEFINE_IS_IN_UNICODE_BLOCK(Brahmi, 0x11000, 0x1107F)
226 // DEFINE_IS_IN_UNICODE_BLOCK(Kaithi, 0x11080, 0x110CF)
227 // DEFINE_IS_IN_UNICODE_BLOCK(SoraSompeng, 0x110D0, 0x110FF)
228 // DEFINE_IS_IN_UNICODE_BLOCK(Chakma, 0x11100, 0x1114F)
229 // DEFINE_IS_IN_UNICODE_BLOCK(Mahajani, 0x11150, 0x1117F)
230 // DEFINE_IS_IN_UNICODE_BLOCK(Sharada, 0x11180, 0x111DF)
231 // DEFINE_IS_IN_UNICODE_BLOCK(SinhalaArchaicNumbers, 0x111E0, 0x111FF)
232 // DEFINE_IS_IN_UNICODE_BLOCK(Khojki, 0x11200, 0x1124F)
233 // DEFINE_IS_IN_UNICODE_BLOCK(Multani, 0x11280, 0x112AF)
234 // DEFINE_IS_IN_UNICODE_BLOCK(Khudawadi, 0x112B0, 0x112FF)
235 // DEFINE_IS_IN_UNICODE_BLOCK(Grantha, 0x11300, 0x1137F)
236 // DEFINE_IS_IN_UNICODE_BLOCK(Newa, 0x11400, 0x1147F)
237 // DEFINE_IS_IN_UNICODE_BLOCK(Tirhuta, 0x11480, 0x114DF)
238 // DEFINE_IS_IN_UNICODE_BLOCK(Siddham, 0x11580, 0x115FF)
239 // DEFINE_IS_IN_UNICODE_BLOCK(Modi, 0x11600, 0x1165F)
240 // DEFINE_IS_IN_UNICODE_BLOCK(MongolianSupplement, 0x11660, 0x1167F)
241 // DEFINE_IS_IN_UNICODE_BLOCK(Takri, 0x11680, 0x116CF)
242 // DEFINE_IS_IN_UNICODE_BLOCK(Ahom, 0x11700, 0x1173F)
243 // DEFINE_IS_IN_UNICODE_BLOCK(WarangCiti, 0x118A0, 0x118FF)
244 // DEFINE_IS_IN_UNICODE_BLOCK(ZanabazarSquare, 0x11A00, 0x11A4F)
245 // DEFINE_IS_IN_UNICODE_BLOCK(Soyombo, 0x11A50, 0x11AAF)
246 // DEFINE_IS_IN_UNICODE_BLOCK(PauCinHau, 0x11AC0, 0x11AFF)
247 // DEFINE_IS_IN_UNICODE_BLOCK(Bhaiksuki, 0x11C00, 0x11C6F)
248 // DEFINE_IS_IN_UNICODE_BLOCK(Marchen, 0x11C70, 0x11CBF)
249 // DEFINE_IS_IN_UNICODE_BLOCK(MasaramGondi, 0x11D00, 0x11D5F)
250 // DEFINE_IS_IN_UNICODE_BLOCK(Cuneiform, 0x12000, 0x123FF)
251 // DEFINE_IS_IN_UNICODE_BLOCK(CuneiformNumbersandPunctuation, 0x12400, 0x1247F)
252 // DEFINE_IS_IN_UNICODE_BLOCK(EarlyDynasticCuneiform, 0x12480, 0x1254F)
253 // DEFINE_IS_IN_UNICODE_BLOCK(EgyptianHieroglyphs, 0x13000, 0x1342F)
254 // DEFINE_IS_IN_UNICODE_BLOCK(AnatolianHieroglyphs, 0x14400, 0x1467F)
255 // DEFINE_IS_IN_UNICODE_BLOCK(BamumSupplement, 0x16800, 0x16A3F)
256 // DEFINE_IS_IN_UNICODE_BLOCK(Mro, 0x16A40, 0x16A6F)
257 // DEFINE_IS_IN_UNICODE_BLOCK(BassaVah, 0x16AD0, 0x16AFF)
258 // DEFINE_IS_IN_UNICODE_BLOCK(PahawhHmong, 0x16B00, 0x16B8F)
259 // DEFINE_IS_IN_UNICODE_BLOCK(Miao, 0x16F00, 0x16F9F)
260 // DEFINE_IS_IN_UNICODE_BLOCK(IdeographicSymbolsandPunctuation, 0x16FE0, 0x16FFF)
261 // DEFINE_IS_IN_UNICODE_BLOCK(Tangut, 0x17000, 0x187FF)
262 // DEFINE_IS_IN_UNICODE_BLOCK(TangutComponents, 0x18800, 0x18AFF)
263 // DEFINE_IS_IN_UNICODE_BLOCK(KanaSupplement, 0x1B000, 0x1B0FF)
264 // DEFINE_IS_IN_UNICODE_BLOCK(KanaExtendedA, 0x1B100, 0x1B12F)
265 // DEFINE_IS_IN_UNICODE_BLOCK(Nushu, 0x1B170, 0x1B2FF)
266 // DEFINE_IS_IN_UNICODE_BLOCK(Duployan, 0x1BC00, 0x1BC9F)
267 // DEFINE_IS_IN_UNICODE_BLOCK(ShorthandFormatControls, 0x1BCA0, 0x1BCAF)
268 // DEFINE_IS_IN_UNICODE_BLOCK(ByzantineMusicalSymbols, 0x1D000, 0x1D0FF)
269 // DEFINE_IS_IN_UNICODE_BLOCK(MusicalSymbols, 0x1D100, 0x1D1FF)
270 // DEFINE_IS_IN_UNICODE_BLOCK(AncientGreekMusicalNotation, 0x1D200, 0x1D24F)
271 // DEFINE_IS_IN_UNICODE_BLOCK(TaiXuanJingSymbols, 0x1D300, 0x1D35F)
272 // DEFINE_IS_IN_UNICODE_BLOCK(CountingRodNumerals, 0x1D360, 0x1D37F)
273 // DEFINE_IS_IN_UNICODE_BLOCK(MathematicalAlphanumericSymbols, 0x1D400, 0x1D7FF)
274 // DEFINE_IS_IN_UNICODE_BLOCK(SuttonSignWriting, 0x1D800, 0x1DAAF)
275 // DEFINE_IS_IN_UNICODE_BLOCK(GlagoliticSupplement, 0x1E000, 0x1E02F)
276 // DEFINE_IS_IN_UNICODE_BLOCK(MendeKikakui, 0x1E800, 0x1E8DF)
277 // DEFINE_IS_IN_UNICODE_BLOCK(Adlam, 0x1E900, 0x1E95F)
278 // DEFINE_IS_IN_UNICODE_BLOCK(ArabicMathematicalAlphabeticSymbols, 0x1EE00, 0x1EEFF)
279 // DEFINE_IS_IN_UNICODE_BLOCK(MahjongTiles, 0x1F000, 0x1F02F)
280 // DEFINE_IS_IN_UNICODE_BLOCK(DominoTiles, 0x1F030, 0x1F09F)
281 // DEFINE_IS_IN_UNICODE_BLOCK(PlayingCards, 0x1F0A0, 0x1F0FF)
282 // DEFINE_IS_IN_UNICODE_BLOCK(EnclosedAlphanumericSupplement, 0x1F100, 0x1F1FF)
283 // DEFINE_IS_IN_UNICODE_BLOCK(EnclosedIdeographicSupplement, 0x1F200, 0x1F2FF)
284 // DEFINE_IS_IN_UNICODE_BLOCK(MiscellaneousSymbolsandPictographs, 0x1F300, 0x1F5FF)
285 // DEFINE_IS_IN_UNICODE_BLOCK(Emoticons, 0x1F600, 0x1F64F)
286 // DEFINE_IS_IN_UNICODE_BLOCK(OrnamentalDingbats, 0x1F650, 0x1F67F)
287 // DEFINE_IS_IN_UNICODE_BLOCK(TransportandMapSymbols, 0x1F680, 0x1F6FF)
288 // DEFINE_IS_IN_UNICODE_BLOCK(AlchemicalSymbols, 0x1F700, 0x1F77F)
289 // DEFINE_IS_IN_UNICODE_BLOCK(GeometricShapesExtended, 0x1F780, 0x1F7FF)
290 // DEFINE_IS_IN_UNICODE_BLOCK(SupplementalArrowsC, 0x1F800, 0x1F8FF)
291 // DEFINE_IS_IN_UNICODE_BLOCK(SupplementalSymbolsandPictographs, 0x1F900, 0x1F9FF)
292 // DEFINE_IS_IN_UNICODE_BLOCK(CJKUnifiedIdeographsExtensionB, 0x20000, 0x2A6DF)
293 // DEFINE_IS_IN_UNICODE_BLOCK(CJKUnifiedIdeographsExtensionC, 0x2A700, 0x2B73F)
294 // DEFINE_IS_IN_UNICODE_BLOCK(CJKUnifiedIdeographsExtensionD, 0x2B740, 0x2B81F)
295 // DEFINE_IS_IN_UNICODE_BLOCK(CJKUnifiedIdeographsExtensionE, 0x2B820, 0x2CEAF)
296 // DEFINE_IS_IN_UNICODE_BLOCK(CJKUnifiedIdeographsExtensionF, 0x2CEB0, 0x2EBEF)
297 // DEFINE_IS_IN_UNICODE_BLOCK(CJKCompatibilityIdeographsSupplement, 0x2F800, 0x2FA1F)
298 // DEFINE_IS_IN_UNICODE_BLOCK(Tags, 0xE0000, 0xE007F)
299 // DEFINE_IS_IN_UNICODE_BLOCK(VariationSelectorsSupplement, 0xE0100, 0xE01EF)
300 // DEFINE_IS_IN_UNICODE_BLOCK(SupplementaryPrivateUseAreaA, 0xF0000, 0xFFFFF)
301 // DEFINE_IS_IN_UNICODE_BLOCK(SupplementaryPrivateUseAreaB, 0x100000, 0x10FFFF)
302 
// Maps a punctuation mark (key) to the code point that replaces it in
// vertical text (value). Consulted by verticalizePunctuation().
//
// Every code point in the Halfwidth and Fullwidth Forms block (U+FF00 to
// U+FFEF) is written as a \u escape instead of a literal glyph. If such a
// glyph is folded to its ordinary-width counterpart by an editor or text
// pipeline, an entry silently turns into an identity mapping (for example
// '#' -> '#'), a duplicate key, or an invalid literal such as u'\'.
const std::map<char16_t, char16_t> verticalPunctuation = {
    // ASCII punctuation
    { u'!', u'︕' }, { u'#', u'\uFF03' }, { u'$', u'\uFF04' }, { u'%', u'\uFF05' },
    { u'&', u'\uFF06' }, { u'(', u'︵' }, { u')', u'︶' }, { u'*', u'\uFF0A' },
    { u'+', u'\uFF0B' }, { u',', u'︐' }, { u'-', u'︲' }, { u'.', u'・' },
    { u'/', u'\uFF0F' }, { u':', u'︓' }, { u';', u'︔' }, { u'<', u'︿' },
    { u'=', u'\uFF1D' }, { u'>', u'﹀' }, { u'?', u'︖' }, { u'@', u'\uFF20' },
    { u'[', u'﹇' }, { u'\\', u'\uFF3C' }, { u']', u'﹈' }, { u'^', u'\uFF3E' },
    { u'_', u'︳' }, { u'`', u'\uFF40' }, { u'{', u'︷' }, { u'|', u'―' },
    { u'}', u'︸' }, { u'~', u'\uFF5E' },

    // Latin-1, general punctuation and currency signs
    { u'¢', u'\uFFE0' }, { u'£', u'\uFFE1' }, { u'¥', u'\uFFE5' }, { u'¦', u'\uFFE4' },
    { u'¬', u'\uFFE2' }, { u'¯', u'\uFFE3' }, { u'–', u'︲' }, { u'—', u'︱' },
    { u'‘', u'﹃' }, { u'’', u'﹄' }, { u'“', u'﹁' }, { u'”', u'﹂' },
    { u'…', u'︙' }, { u'‧', u'・' }, { u'₩', u'\uFFE6' },

    // CJK symbols and punctuation
    { u'、', u'︑' }, { u'。', u'︒' }, { u'〈', u'︿' }, { u'〉', u'﹀' },
    { u'《', u'︽' }, { u'》', u'︾' }, { u'「', u'﹁' }, { u'」', u'﹂' },
    { u'『', u'﹃' }, { u'』', u'﹄' }, { u'【', u'︻' }, { u'】', u'︼' },
    { u'〔', u'︹' }, { u'〕', u'︺' }, { u'〖', u'︗' }, { u'〗', u'︘' },

    // Fullwidth forms of ASCII punctuation
    { u'\uFF01', u'︕' }, { u'\uFF08', u'︵' }, { u'\uFF09', u'︶' }, { u'\uFF0C', u'︐' },
    { u'\uFF0D', u'︲' }, { u'\uFF0E', u'・' }, { u'\uFF1A', u'︓' }, { u'\uFF1B', u'︔' },
    { u'\uFF1C', u'︿' }, { u'\uFF1E', u'﹀' }, { u'\uFF1F', u'︖' }, { u'\uFF3B', u'﹇' },
    { u'\uFF3D', u'﹈' }, { u'\uFF3F', u'︳' }, { u'\uFF5B', u'︷' }, { u'\uFF5C', u'―' },
    { u'\uFF5D', u'︸' }, { u'\uFF5F', u'︵' }, { u'\uFF60', u'︶' },

    // Halfwidth forms of CJK punctuation
    { u'\uFF61', u'︒' }, { u'\uFF62', u'﹁' }, { u'\uFF63', u'﹂' },
};
322 } // namespace
323 
324 namespace mbgl {
325 namespace util {
326 namespace i18n {
327 
// Returns true when `chr` is one of the code points listed below, false for
// every other code point.
bool allowsWordBreaking(char16_t chr) {
    switch (chr) {
    case 0x000a: // newline
    case 0x0020: // space
    case 0x0026: // ampersand
    case 0x0028: // open parenthesis
    case 0x0029: // close parenthesis
    case 0x002b: // plus sign
    case 0x002d: // hyphen-minus
    case 0x002f: // solidus
    case 0x00ad: // soft hyphen
    case 0x00b7: // middle dot
    case 0x200b: // zero-width space
    case 0x2010: // hyphen
    case 0x2013: // en dash
        return true;
    default:
        return false;
    }
}
343 
charAllowsLetterSpacing(char16_t chr)344 bool charAllowsLetterSpacing(char16_t chr) {
345     return !(isInArabic(chr) || isInArabicSupplement(chr) || isInArabicExtendedA(chr) || isInArabicPresentationFormsA(chr) || isInArabicPresentationFormsB(chr));
346 }
347 
allowsLetterSpacing(const std::u16string & string)348 bool allowsLetterSpacing(const std::u16string& string) {
349     return std::all_of(string.begin(), string.end(), charAllowsLetterSpacing);
350 }
351 
allowsIdeographicBreaking(const std::u16string & string)352 bool allowsIdeographicBreaking(const std::u16string& string) {
353     for (char16_t chr : string) {
354         if (!allowsIdeographicBreaking(chr)) {
355             return false;
356         }
357     }
358     return true;
359 }
360 
allowsIdeographicBreaking(char16_t chr)361 bool allowsIdeographicBreaking(char16_t chr) {
362     // Allow U+2027 "Interpunct" for hyphenation of Chinese words
363     if (chr == 0x2027)
364         return true;
365 
366     // Return early for characters outside all ideographic ranges.
367     if (chr < 0x2E80)
368         return false;
369 
370     return (isInBopomofo(chr) || isInBopomofoExtended(chr) || isInCJKCompatibility(chr) ||
371             isInCJKCompatibilityForms(chr) || isInCJKCompatibilityIdeographs(chr) ||
372             isInCJKRadicalsSupplement(chr) || isInCJKStrokes(chr) ||
373             isInCJKSymbolsandPunctuation(chr) || isInCJKUnifiedIdeographs(chr) ||
374             isInCJKUnifiedIdeographsExtensionA(chr) || isInEnclosedCJKLettersandMonths(chr) ||
375             isInHalfwidthandFullwidthForms(chr) || isInHiragana(chr) ||
376             isInIdeographicDescriptionCharacters(chr) || isInKangxiRadicals(chr) ||
377             isInKatakana(chr) || isInKatakanaPhoneticExtensions(chr) || isInVerticalForms(chr) ||
378             isInYiRadicals(chr) || isInYiSyllables(chr));
379 
380     // The following blocks also allow ideographic breaking; however, for other
381     // reasons, Mapbox GL lacks support for codepoints beyond U+FFFF.
382     // https://github.com/mapbox/mapbox-gl/issues/29
383     // return (isInTangut(chr)
384     //        || isInTangutComponents(chr)
385     //        || isInIdeographicSymbolsandPunctuation(chr)
386     //        || isInNushu(chr)
387     //        || isInEnclosedIdeographicSupplement(chr)
388     //        || isInCJKUnifiedIdeographsExtensionB(chr)
389     //        || isInCJKUnifiedIdeographsExtensionC(chr)
390     //        || isInCJKUnifiedIdeographsExtensionD(chr)
391     //        || isInCJKUnifiedIdeographsExtensionE(chr)
392     //        || isInCJKUnifiedIdeographsExtensionF(chr)
393     //        || isInCJKCompatibilityIdeographsSupplement(chr));
394 }
395 
allowsFixedWidthGlyphGeneration(char16_t chr)396 bool allowsFixedWidthGlyphGeneration(char16_t chr) {
397     // Mirrors conservative set of characters used in glyph_manager.js/_tinySDF
398     return isInCJKUnifiedIdeographs(chr) || isInHangulSyllables(chr);
399 }
400 
allowsVerticalWritingMode(const std::u16string & string)401 bool allowsVerticalWritingMode(const std::u16string& string) {
402     for (char32_t chr : string) {
403         if (hasUprightVerticalOrientation(chr)) {
404             return true;
405         }
406     }
407     return false;
408 }
409 
410 // The following logic comes from
411 // <http://www.unicode.org/Public/vertical/revision-17/VerticalOrientation-17.txt>.
412 // The data file denotes with “U” or “Tu” any codepoint that may be drawn
413 // upright in vertical text but does not distinguish between upright and
414 // “neutral” characters.
415 
hasUprightVerticalOrientation(char16_t chr)416 bool hasUprightVerticalOrientation(char16_t chr) {
417     if (chr == u'˪' || chr == u'˫')
418         return true;
419 
420     // Return early for characters outside all ranges whose characters remain
421     // upright in vertical writing mode.
422     if (chr < 0x1100)
423         return false;
424 
425     if (isInBopomofo(chr) || isInBopomofoExtended(chr))
426         return true;
427     if (isInCJKCompatibilityForms(chr)) {
428         if (!(chr >= u'﹉' && chr <= u'﹏'))
429             return true;
430     }
431     if (isInCJKCompatibility(chr) || isInCJKCompatibilityIdeographs(chr) ||
432         isInCJKRadicalsSupplement(chr) || isInCJKStrokes(chr))
433         return true;
434     if (isInCJKSymbolsandPunctuation(chr)) {
435         if (!(chr >= u'〈' && chr <= u'】') && !(chr >= u'〔' && chr <= u'〟') && chr != u'〰')
436             return true;
437     }
438     if (isInCJKUnifiedIdeographs(chr) || isInCJKUnifiedIdeographsExtensionA(chr) ||
439         isInEnclosedCJKLettersandMonths(chr) || isInHangulCompatibilityJamo(chr) ||
440         isInHangulJamo(chr) || isInHangulJamoExtendedA(chr) || isInHangulJamoExtendedB(chr) ||
441         isInHangulSyllables(chr) || isInHiragana(chr) ||
442         isInIdeographicDescriptionCharacters(chr) || isInKanbun(chr) || isInKangxiRadicals(chr))
443         return true;
444     if (isInKatakana(chr)) {
445         if (chr != u'ー')
446             return true;
447     }
448     if (isInKatakanaPhoneticExtensions(chr))
449         return true;
450     if (isInHalfwidthandFullwidthForms(chr)) {
451         if (chr != u'(' && chr != u')' && chr != u'-' && !(chr >= u':' && chr <= u'>') &&
452             chr != u'[' && chr != u']' && chr != u'_' && !(chr >= u'{' && chr <= 0xFFDF) &&
453             chr != u' ̄' && !(chr >= u'│' && chr <= 0xFFEF))
454             return true;
455     }
456     if (isInSmallFormVariants(chr)) {
457         if (!(chr >= u'﹘' && chr <= u'﹞') && !(chr >= u'﹣' && chr <= u'﹦'))
458             return true;
459     }
460     if (isInUnifiedCanadianAboriginalSyllabics(chr) ||
461         isInUnifiedCanadianAboriginalSyllabicsExtended(chr) || isInVerticalForms(chr) ||
462         isInYijingHexagramSymbols(chr) || isInYiSyllables(chr) || isInYiRadicals(chr))
463         return true;
464 
465     // https://github.com/mapbox/mapbox-gl/issues/29
466 
467     // if (isInMeroiticHieroglyphs(chr)) return true;
468     // if (isInSiddham(chr)) return true;
469     // if (isInEgyptianHieroglyphs(chr)) return true;
470     // if (isInAnatolianHieroglyphs(chr)) return true;
471     // if (isInIdeographicSymbolsandPunctuation(chr)) return true;
472     // if (isInTangut(chr)) return true;
473     // if (isInTangutComponents(chr)) return true;
474     // if (isInKanaSupplement(chr)) return true;
475     // if (isInKanaExtendedA(chr)) return true;
476     // if (isInNushu(chr)) return true;
477     // if (isInByzantineMusicalSymbols(chr)) return true;
478     // if (isInMusicalSymbols(chr)) return true;
479     // if (isInTaiXuanJingSymbols(chr)) return true;
480     // if (isInCountingRodNumerals(chr)) return true;
481     // if (isInSuttonSignWriting(chr)) return true;
482     // if (isInMahjongTiles(chr)) return true;
483     // if (isInDominoTiles(chr)) return true;
484     // if (isInPlayingCards(chr)) return true;
485     // if (isInEnclosedAlphanumericSupplement(chr)) return true;
486     // if (isInEnclosedIdeographicSupplement(chr)) return true;
487     // if (isInMiscellaneousSymbolsandPictographs(chr)) return true;
488     // if (isInEmoticons(chr)) return true;
489     // if (isInOrnamentalDingbats(chr)) return true;
490     // if (isInTransportandMapSymbols(chr)) return true;
491     // if (isInAlchemicalSymbols(chr)) return true;
492     // if (isInGeometricShapesExtended(chr)) return true;
493     // if (isInSupplementalSymbolsandPictographs(chr)) return true;
494     // if (isInCJKUnifiedIdeographsExtensionB(chr)) return true;
495     // if (isInCJKUnifiedIdeographsExtensionC(chr)) return true;
496     // if (isInCJKUnifiedIdeographsExtensionD(chr)) return true;
497     // if (isInCJKUnifiedIdeographsExtensionE(chr)) return true;
498     // if (isInCJKUnifiedIdeographsExtensionF(chr)) return true;
499     // if (isInCJKCompatibilityIdeographsSupplement(chr)) return true;
500 
501     return false;
502 }
503 
hasNeutralVerticalOrientation(char16_t chr)504 bool hasNeutralVerticalOrientation(char16_t chr) {
505     if (isInLatin1Supplement(chr)) {
506         if (chr == u'§' || chr == u'©' || chr == u'®' || chr == u'±' || chr == u'¼' ||
507             chr == u'½' || chr == u'¾' || chr == u'×' || chr == u'÷') {
508             return true;
509         }
510     }
511     if (isInGeneralPunctuation(chr)) {
512         if (chr == u'‖' || chr == u'†' || chr == u'‡' || chr == u'‰' || chr == u'‱' ||
513             chr == u'※' || chr == u'‼' || chr == u'⁂' || chr == u'⁇' || chr == u'⁈' ||
514             chr == u'⁉' || chr == u'⁑') {
515             return true;
516         }
517     }
518     if (isInLetterlikeSymbols(chr) || isInNumberForms(chr)) {
519         return true;
520     }
521     if (isInMiscellaneousTechnical(chr)) {
522         if ((chr >= u'⌀' && chr <= u'⌇') || (chr >= u'⌌' && chr <= u'⌟') ||
523             (chr >= u'⌤' && chr <= u'⌨') || chr == u'⌫' || (chr >= u'⍽' && chr <= u'⎚') ||
524             (chr >= u'⎾' && chr <= u'⏍') || chr == u'⏏' || (chr >= u'⏑' && chr <= u'⏛') ||
525             (chr >= u'⏢' && chr <= 0x23FF)) {
526             return true;
527         }
528     }
529     if (isInControlPictures(chr) || isInOpticalCharacterRecognition(chr) ||
530         isInEnclosedAlphanumerics(chr) || isInGeometricShapes(chr)) {
531         return true;
532     }
533     if (isInMiscellaneousSymbols(chr)) {
534         if ((chr >= u'⬒' && chr <= u'⬯') ||
535             (chr >= u'⭐' && chr <= 0x2B59 /* heavy circled saltire */) ||
536             (chr >= 0x2BB8 /* upwards white arrow from bar with horizontal bar */ &&
537              chr <= 0x2BEB)) {
538             return true;
539         }
540     }
541     if (isInCJKSymbolsandPunctuation(chr) || isInKatakana(chr) || isInPrivateUseArea(chr) ||
542         isInCJKCompatibilityForms(chr) || isInSmallFormVariants(chr) ||
543         isInHalfwidthandFullwidthForms(chr)) {
544         return true;
545     }
546     if (chr == u'∞' || chr == u'∴' || chr == u'∵' ||
547         (chr >= 0x2700 /* black safety scissors */ && chr <= u'❧') ||
548         (chr >= u'❶' && chr <= u'➓') || chr == 0xFFFC /* object replacement character */ ||
549         chr == 0xFFFD /* replacement character */) {
550         return true;
551     }
552     return false;
553 }
554 
hasRotatedVerticalOrientation(char16_t chr)555 bool hasRotatedVerticalOrientation(char16_t chr) {
556     return !(hasUprightVerticalOrientation(chr) || hasNeutralVerticalOrientation(chr));
557 }
558 
verticalizePunctuation(const std::u16string & input)559 std::u16string verticalizePunctuation(const std::u16string& input) {
560     std::u16string output;
561 
562     for (size_t i = 0; i < input.size(); i++) {
563         char16_t nextCharCode = i < input.size() - 1 ? input[i + 1] : 0;
564         char16_t prevCharCode = i ? input[i - 1] : 0;
565 
566         bool canReplacePunctuation =
567             ((!nextCharCode || !hasRotatedVerticalOrientation(nextCharCode) ||
568               verticalPunctuation.count(input[i + 1])) &&
569              (!prevCharCode || !hasRotatedVerticalOrientation(prevCharCode) ||
570               verticalPunctuation.count(input[i - 1])));
571 
572         if (char16_t repl = canReplacePunctuation ? verticalizePunctuation(input[i]) : 0) {
573             output += repl;
574         } else {
575             output += input[i];
576         }
577     }
578 
579     return output;
580 }
581 
verticalizePunctuation(char16_t chr)582 char16_t verticalizePunctuation(char16_t chr) {
583     return verticalPunctuation.count(chr) ? verticalPunctuation.at(chr) : 0;
584 }
585 
charInSupportedScript(char16_t chr)586 bool charInSupportedScript(char16_t chr) {
587     // This is a rough heuristic: whether we "can render" a script
588     // actually depends on the properties of the font being used
589     // and whether differences from the ideal rendering are considered
590     // semantically significant.
591 
592     // Even in Latin script, we "can't render" combinations such as the fi
593     // ligature, but we don't consider that semantically significant.n false;
594     if ((chr >= 0x0900 && chr <= 0x0DFF) ||
595         // Main blocks for Indic scripts and Sinhala
596         (chr >= 0x0F00 && chr <= 0x109F) ||
597         // Main blocks for Tibetan and Myanmar
598         isInKhmer(chr)) {
599         // These blocks cover common scripts that require
600         // complex text shaping, based on unicode script metadata:
601         // http://www.unicode.org/repos/cldr/trunk/common/properties/scriptMetadata.txt
602         // where "Web Rank <= 32" "Shaping Required = YES"
603         return false;
604     }
605     return true;
606 }
607 
isStringInSupportedScript(const std::string & input)608 bool isStringInSupportedScript(const std::string& input) {
609     auto u16string = util::utf8_to_utf16::convert(input);
610     for (char16_t chr : u16string) {
611         if (!charInSupportedScript(chr)) {
612             return false;
613         }
614     }
615     return true;
616 }
617 
618 } // namespace i18n
619 } // namespace util
620 } // namespace mbgl
621