1 #include <mbgl/text/language_tag.hpp>
2 
3 #pragma GCC diagnostic push
4 #pragma GCC diagnostic ignored "-Wunknown-pragmas"
5 #pragma GCC diagnostic ignored "-Wunused-parameter"
6 #pragma GCC diagnostic ignored "-Wshadow"
7 #pragma clang diagnostic push
8 #pragma clang diagnostic ignored "-Wshorten-64-to-32"
9 #pragma clang diagnostic ignored "-Wtautological-constant-compare"
10 #include <boost/spirit/include/qi.hpp>
11 #include <boost/spirit/include/phoenix_core.hpp>
12 #include <boost/spirit/include/phoenix_operator.hpp>
13 #pragma clang diagnostic pop
14 #pragma GCC diagnostic pop
15 
16 #include <sstream>
17 
18 /*
19  ABNF for BCP 47 from: https://tools.ietf.org/html/bcp47
20 
21  Language-Tag  = langtag             ; normal language tags
22                / privateuse          ; private use tag
23                / grandfathered       ; grandfathered tags  NOT IMPLEMENTED
24 
25  langtag       = language
26                  ["-" script]
27                  ["-" region]
28                  *("-" variant)
29                  *("-" extension)
30                  ["-" privateuse]
31 
32  language      = 2*3ALPHA            ; shortest ISO 639 code
33                  ["-" extlang]       ; sometimes followed by
34                                      ; extended language subtags
35                / 4ALPHA              ; or reserved for future use
36                / 5*8ALPHA            ; or registered language subtag
37 
38  extlang       = 3ALPHA              ; selected ISO 639 codes
39                  *2("-" 3ALPHA)      ; permanently reserved
40 
41  script        = 4ALPHA              ; ISO 15924 code
42 
43  region        = 2ALPHA              ; ISO 3166-1 code
44                / 3DIGIT              ; UN M.49 code
45 
46  variant       = 5*8alphanum         ; registered variants
47                / (DIGIT 3alphanum)
48 
49  extension     = singleton 1*("-" (2*8alphanum))
50 
51  ; Single alphanumerics
52  ; "x" reserved for private use
53  singleton     = DIGIT               ; 0 - 9
54                / %x41-57             ; A - W
55                / %x59-5A             ; Y - Z
56                / %x61-77             ; a - w
57                / %x79-7A             ; y - z
58 
59  privateuse    = "x" 1*("-" (1*8alphanum))
60 
61  grandfathered = irregular           ; non-redundant tags registered
62                / regular             ; during the RFC 3066 era
63 
64  irregular     = "en-GB-oed"         ; irregular tags do not match
65                  / "i-ami"             ; the 'langtag' production and
66                  / "i-bnn"             ; would not otherwise be
67                  / "i-default"         ; considered 'well-formed'
68                  / "i-enochian"        ; These tags are all valid,
69                  / "i-hak"             ; but most are deprecated
70                  / "i-klingon"         ; in favor of more modern
71                  / "i-lux"             ; subtags or subtag
72                  / "i-mingo"           ; combination
73                  / "i-navajo"
74                  / "i-pwn"
75                  / "i-tao"
76                  / "i-tay"
77                  / "i-tsu"
78                  / "sgn-BE-FR"
79                  / "sgn-BE-NL"
80                  / "sgn-CH-DE"
81 
82  regular       = "art-lojban"        ; these tags match the 'langtag'
83                  / "cel-gaulish"       ; production, but their subtags
84                  / "no-bok"            ; are not extended language
85                  / "no-nyn"            ; or variant subtags: their meaning
86                  / "zh-guoyu"          ; is defined by their registration
87                  / "zh-hakka"          ; and all of these are deprecated
88                  / "zh-min"            ; in favor of a more modern
89                  / "zh-min-nan"        ; subtag or sequence of subtags
90                  / "zh-xiang"
91 
92  alphanum      = (ALPHA / DIGIT)     ; letters and numbers
93 
94 */
95 
96 namespace mbgl {
97 
// Short aliases for the Boost.Spirit Qi, Boost.Phoenix and Spirit ASCII-encoding namespaces.
namespace qi = boost::spirit::qi;
namespace phoenix = boost::phoenix;
namespace ascii = boost::spirit::ascii;
101 
102 template <typename Iterator>
103 struct bcp47_parser : qi::grammar<Iterator>
104 {
bcp47_parsermbgl::bcp47_parser105     bcp47_parser() : bcp47_parser::base_type(start)
106     {
107         using qi::lit;
108         using qi::repeat;
109         using qi::inf;
110         using qi::eoi;
111         using ascii::char_;
112         using ascii::no_case;
113         using ascii::digit;
114         using ascii::alnum;
115         using ascii::alpha;
116 
117         using boost::spirit::qi::_1;
118 
119         start %= no_case[langtag | privateuse | grandfathered];
120 
121         langtag %= (language) [phoenix::ref(languageTag.language) = _1]
122             >> -("-" >> (script)[phoenix::ref(languageTag.script) = _1])
123             >> -("-" >> (region)[phoenix::ref(languageTag.region) = _1])
124             >> *("-" >> variant)
125             >> *("-" >> extension)
126             >> -("-" >> privateuse);
127 
128         language %= (repeat(2,3)[alpha] >> -("-" >> extlang))     // shortest ISO 639 code
129                                                                   // sometimes followed by extended language subtags
130             | repeat(4)[alpha]                                    // or reserved for future use
131             | repeat(5,8)[alpha];                                 // or registered language subtag
132 
133         // We add lookaheads for "-"/eoi so that spurious matches on subtags don't prevent backtracking
134         extlang = repeat(3)[alpha] >> (&lit('-') | eoi) >> repeat(0,2)["-" >> repeat(3)[alpha] >> (&lit('-') | eoi)];
135 
136         script = repeat(4)[alpha] >> (&lit('-') | eoi);
137 
138         region = (repeat(2)[alpha] | repeat(3)[digit]) >> (&lit('-') | eoi);
139 
140         variant = (repeat(5,8)[alnum] | (digit >> repeat(3,inf)[alnum])) >> (&lit('-') | eoi);
141 
142         extension = singleton >> +("-" >> repeat(2,8)[alnum]) >> (&lit('-') | eoi);
143 
144         singleton = digit | char_('a','w') | char_('y','z'); // "no-case" handles A-W and Y-Z
145 
146         privateuse = "x" >> +("-" >> repeat(1,8)[alnum]) >> (&lit('-') | eoi);
147 
148         grandfathered = regular | irregular;
149 
150         irregular = lit("en-GB-oed")
151             | "i-ami"
152             | "i-bnn"
153             | "i-default"
154             | "i-enochian"
155             | "i-hak"
156             | "i-klingon"
157             | "i-lux"
158             | "i-mingo"
159             | "i-navajo"
160             | "i-pwn"
161             | "i-tao"
162             | "i-tay"
163             | "i-tsu"
164             | "sgn-BE-FR"
165             | "sgn-BE-NL"
166             | "sgn-CH-DE";
167 
168         regular = lit("art-lojban")
169             | "cel-gaulish"
170             | "no-bok"
171             | "no-nyn"
172             | "zh-guoyu"
173             | "zh-hakka"
174             | "zh-min"
175             | "zh-min-nan"
176             | "zh-xiang";
177     }
178 
179     qi::rule<Iterator> start;
180     qi::rule<Iterator> langtag;
181     qi::rule<Iterator, std::string()> language;
182     qi::rule<Iterator> extlang;
183     qi::rule<Iterator, std::string()> script;
184     qi::rule<Iterator, std::string()> region;
185     qi::rule<Iterator> variant;
186     qi::rule<Iterator> extension;
187     qi::rule<Iterator> singleton;
188     qi::rule<Iterator> privateuse;
189     qi::rule<Iterator> grandfathered;
190     qi::rule<Iterator> irregular;
191     qi::rule<Iterator> regular;
192 
193     LanguageTag languageTag;
194 };
195 
fromBCP47(const std::string & bcp47Tag)196 LanguageTag LanguageTag::fromBCP47(const std::string& bcp47Tag) {
197     typedef std::string::const_iterator iterator_type;
198     typedef bcp47_parser<iterator_type> bcp47_parser;
199 
200     bcp47_parser parser;
201     std::string::const_iterator iter = bcp47Tag.begin();
202     std::string::const_iterator end = bcp47Tag.end();
203     bool r = parse(iter, end, parser);
204     if (r && iter == end) {
205         return parser.languageTag;
206     } else {
207         // Invalid tags are treated as empty/"default"
208         return LanguageTag();
209     }
210 }
211 
LanguageTag(optional<std::string> language_,optional<std::string> script_,optional<std::string> region_)212 LanguageTag::LanguageTag(optional<std::string> language_, optional<std::string> script_, optional<std::string> region_)
213     : language(std::move(language_))
214     , script(std::move(script_))
215     , region(std::move(region_))
216 {}
217 
toBCP47() const218 std::string LanguageTag::toBCP47() const {
219     std::stringstream bcp47;
220     if (!language) {
221         // BCP 47 requires a language, but we're matching implementations that accept ""
222         // to mean something like "default"
223         return bcp47.str();
224     } else {
225         bcp47 << *language;
226     }
227 
228     if (script) {
229         bcp47 << "-" << *script;
230     }
231 
232     if (region) {
233         bcp47 << "-" << *region;
234     }
235     return bcp47.str();
236 }
237 } // end namespace mbgl
238