1 #include <mbgl/text/language_tag.hpp>
2
3 #pragma GCC diagnostic push
4 #pragma GCC diagnostic ignored "-Wunknown-pragmas"
5 #pragma GCC diagnostic ignored "-Wunused-parameter"
6 #pragma GCC diagnostic ignored "-Wshadow"
7 #pragma clang diagnostic push
8 #pragma clang diagnostic ignored "-Wshorten-64-to-32"
9 #pragma clang diagnostic ignored "-Wtautological-constant-compare"
10 #include <boost/spirit/include/qi.hpp>
11 #include <boost/spirit/include/phoenix_core.hpp>
12 #include <boost/spirit/include/phoenix_operator.hpp>
13 #pragma clang diagnostic pop
14 #pragma GCC diagnostic pop
15
16 #include <sstream>
17
18 /*
19 ABNF for BCP 47 from: https://tools.ietf.org/html/bcp47
20
21 Language-Tag = langtag ; normal language tags
22 / privateuse ; private use tag
23 / grandfathered ; grandfathered tags NOT IMPLEMENTED
24
25 langtag = language
26 ["-" script]
27 ["-" region]
28 *("-" variant)
29 *("-" extension)
30 ["-" privateuse]
31
32 language = 2*3ALPHA ; shortest ISO 639 code
33 ["-" extlang] ; sometimes followed by
34 ; extended language subtags
35 / 4ALPHA ; or reserved for future use
36 / 5*8ALPHA ; or registered language subtag
37
38 extlang = 3ALPHA ; selected ISO 639 codes
39 *2("-" 3ALPHA) ; permanently reserved
40
41 script = 4ALPHA ; ISO 15924 code
42
43 region = 2ALPHA ; ISO 3166-1 code
44 / 3DIGIT ; UN M.49 code
45
46 variant = 5*8alphanum ; registered variants
47 / (DIGIT 3alphanum)
48
49 extension = singleton 1*("-" (2*8alphanum))
50
51 ; Single alphanumerics
52 ; "x" reserved for private use
53 singleton = DIGIT ; 0 - 9
54 / %x41-57 ; A - W
55 / %x59-5A ; Y - Z
56 / %x61-77 ; a - w
57 / %x79-7A ; y - z
58
59 privateuse = "x" 1*("-" (1*8alphanum))
60
61 grandfathered = irregular ; non-redundant tags registered
62 / regular ; during the RFC 3066 era
63
64 irregular = "en-GB-oed" ; irregular tags do not match
65 / "i-ami" ; the 'langtag' production and
66 / "i-bnn" ; would not otherwise be
67 / "i-default" ; considered 'well-formed'
68 / "i-enochian" ; These tags are all valid,
69 / "i-hak" ; but most are deprecated
70 / "i-klingon" ; in favor of more modern
71 / "i-lux" ; subtags or subtag
72 / "i-mingo" ; combination
73 / "i-navajo"
74 / "i-pwn"
75 / "i-tao"
76 / "i-tay"
77 / "i-tsu"
78 / "sgn-BE-FR"
79 / "sgn-BE-NL"
80 / "sgn-CH-DE"
81
82 regular = "art-lojban" ; these tags match the 'langtag'
83 / "cel-gaulish" ; production, but their subtags
84 / "no-bok" ; are not extended language
85 / "no-nyn" ; or variant subtags: their meaning
86 / "zh-guoyu" ; is defined by their registration
87 / "zh-hakka" ; and all of these are deprecated
88 / "zh-min" ; in favor of a more modern
89 / "zh-min-nan" ; subtag or sequence of subtags
90 / "zh-xiang"
91
92 alphanum = (ALPHA / DIGIT) ; letters and numbers
93
94 */
95
96 namespace mbgl {
97
98 namespace qi = boost::spirit::qi;
99 namespace phoenix = boost::phoenix;
100 namespace ascii = boost::spirit::ascii;
101
102 template <typename Iterator>
103 struct bcp47_parser : qi::grammar<Iterator>
104 {
bcp47_parsermbgl::bcp47_parser105 bcp47_parser() : bcp47_parser::base_type(start)
106 {
107 using qi::lit;
108 using qi::repeat;
109 using qi::inf;
110 using qi::eoi;
111 using ascii::char_;
112 using ascii::no_case;
113 using ascii::digit;
114 using ascii::alnum;
115 using ascii::alpha;
116
117 using boost::spirit::qi::_1;
118
119 start %= no_case[langtag | privateuse | grandfathered];
120
121 langtag %= (language) [phoenix::ref(languageTag.language) = _1]
122 >> -("-" >> (script)[phoenix::ref(languageTag.script) = _1])
123 >> -("-" >> (region)[phoenix::ref(languageTag.region) = _1])
124 >> *("-" >> variant)
125 >> *("-" >> extension)
126 >> -("-" >> privateuse);
127
128 language %= (repeat(2,3)[alpha] >> -("-" >> extlang)) // shortest ISO 639 code
129 // sometimes followed by extended language subtags
130 | repeat(4)[alpha] // or reserved for future use
131 | repeat(5,8)[alpha]; // or registered language subtag
132
133 // We add lookaheads for "-"/eoi so that spurious matches on subtags don't prevent backtracking
134 extlang = repeat(3)[alpha] >> (&lit('-') | eoi) >> repeat(0,2)["-" >> repeat(3)[alpha] >> (&lit('-') | eoi)];
135
136 script = repeat(4)[alpha] >> (&lit('-') | eoi);
137
138 region = (repeat(2)[alpha] | repeat(3)[digit]) >> (&lit('-') | eoi);
139
140 variant = (repeat(5,8)[alnum] | (digit >> repeat(3,inf)[alnum])) >> (&lit('-') | eoi);
141
142 extension = singleton >> +("-" >> repeat(2,8)[alnum]) >> (&lit('-') | eoi);
143
144 singleton = digit | char_('a','w') | char_('y','z'); // "no-case" handles A-W and Y-Z
145
146 privateuse = "x" >> +("-" >> repeat(1,8)[alnum]) >> (&lit('-') | eoi);
147
148 grandfathered = regular | irregular;
149
150 irregular = lit("en-GB-oed")
151 | "i-ami"
152 | "i-bnn"
153 | "i-default"
154 | "i-enochian"
155 | "i-hak"
156 | "i-klingon"
157 | "i-lux"
158 | "i-mingo"
159 | "i-navajo"
160 | "i-pwn"
161 | "i-tao"
162 | "i-tay"
163 | "i-tsu"
164 | "sgn-BE-FR"
165 | "sgn-BE-NL"
166 | "sgn-CH-DE";
167
168 regular = lit("art-lojban")
169 | "cel-gaulish"
170 | "no-bok"
171 | "no-nyn"
172 | "zh-guoyu"
173 | "zh-hakka"
174 | "zh-min"
175 | "zh-min-nan"
176 | "zh-xiang";
177 }
178
179 qi::rule<Iterator> start;
180 qi::rule<Iterator> langtag;
181 qi::rule<Iterator, std::string()> language;
182 qi::rule<Iterator> extlang;
183 qi::rule<Iterator, std::string()> script;
184 qi::rule<Iterator, std::string()> region;
185 qi::rule<Iterator> variant;
186 qi::rule<Iterator> extension;
187 qi::rule<Iterator> singleton;
188 qi::rule<Iterator> privateuse;
189 qi::rule<Iterator> grandfathered;
190 qi::rule<Iterator> irregular;
191 qi::rule<Iterator> regular;
192
193 LanguageTag languageTag;
194 };
195
fromBCP47(const std::string & bcp47Tag)196 LanguageTag LanguageTag::fromBCP47(const std::string& bcp47Tag) {
197 typedef std::string::const_iterator iterator_type;
198 typedef bcp47_parser<iterator_type> bcp47_parser;
199
200 bcp47_parser parser;
201 std::string::const_iterator iter = bcp47Tag.begin();
202 std::string::const_iterator end = bcp47Tag.end();
203 bool r = parse(iter, end, parser);
204 if (r && iter == end) {
205 return parser.languageTag;
206 } else {
207 // Invalid tags are treated as empty/"default"
208 return LanguageTag();
209 }
210 }
211
LanguageTag(optional<std::string> language_,optional<std::string> script_,optional<std::string> region_)212 LanguageTag::LanguageTag(optional<std::string> language_, optional<std::string> script_, optional<std::string> region_)
213 : language(std::move(language_))
214 , script(std::move(script_))
215 , region(std::move(region_))
216 {}
217
toBCP47() const218 std::string LanguageTag::toBCP47() const {
219 std::stringstream bcp47;
220 if (!language) {
221 // BCP 47 requires a language, but we're matching implementations that accept ""
222 // to mean something like "default"
223 return bcp47.str();
224 } else {
225 bcp47 << *language;
226 }
227
228 if (script) {
229 bcp47 << "-" << *script;
230 }
231
232 if (region) {
233 bcp47 << "-" << *region;
234 }
235 return bcp47.str();
236 }
237 } // end namespace mbgl
238