xref: /OK3568_Linux_fs/kernel/tools/perf/util/demangle-rust.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun #include <string.h>
3*4882a593Smuzhiyun #include "debug.h"
4*4882a593Smuzhiyun 
5*4882a593Smuzhiyun #include "demangle-rust.h"
6*4882a593Smuzhiyun 
/*
 * Mangled Rust symbols look like this:
 *
 *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
 *
 * The original symbol is:
 *
 *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
 *
 * The last component of the path is a 64-bit hash in lowercase hex, prefixed
 * with "h". Rust does not have a global namespace between crates, an illusion
 * which Rust maintains by using the hash to distinguish things that would
 * otherwise have the same symbol.
 *
 * Any path component not starting with a XID_Start character is prefixed with
 * "_".
 *
 * The following escape sequences are used:
 *
 *     ","  =>  $C$
 *     "@"  =>  $SP$
 *     "*"  =>  $BP$
 *     "&"  =>  $RF$
 *     "<"  =>  $LT$
 *     ">"  =>  $GT$
 *     "("  =>  $LP$
 *     ")"  =>  $RP$
 *     " "  =>  $u20$
 *     "'"  =>  $u27$
 *     "["  =>  $u5b$
 *     "]"  =>  $u5d$
 *     "~"  =>  $u7e$
 *
 * A double ".." means "::" and a single "." means "-".
 *
 * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
 */
44*4882a593Smuzhiyun 
/*
 * The trailing hash component of a mangled symbol is "::h" followed by
 * 16 hex digits. The prefix length is derived from the literal so the two
 * cannot drift apart.
 */
static const char hash_prefix[] = "::h";
static const size_t hash_prefix_len = sizeof(hash_prefix) - 1;
static const size_t hash_len = 16;

/* File-local helpers; their definitions appear later in this file. */
static bool is_prefixed_hash(const char *start);
static bool looks_like_rust(const char *sym, size_t len);
static bool unescape(const char **in, char **out, const char *seq, char value);
52*4882a593Smuzhiyun 
53*4882a593Smuzhiyun /*
54*4882a593Smuzhiyun  * INPUT:
55*4882a593Smuzhiyun  *     sym: symbol that has been through BFD-demangling
56*4882a593Smuzhiyun  *
57*4882a593Smuzhiyun  * This function looks for the following indicators:
58*4882a593Smuzhiyun  *
59*4882a593Smuzhiyun  *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
60*4882a593Smuzhiyun  *
61*4882a593Smuzhiyun  *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
62*4882a593Smuzhiyun  *     hex digits. This is true of 99.9998% of hashes so once in your life you
63*4882a593Smuzhiyun  *     may see a false negative. The point is to notice path components that
64*4882a593Smuzhiyun  *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
65*4882a593Smuzhiyun  *     this case a false positive (non-Rust symbol has an important path
66*4882a593Smuzhiyun  *     component removed because it looks like a Rust hash) is worse than a
67*4882a593Smuzhiyun  *     false negative (the rare Rust symbol is not demangled) so this sets the
68*4882a593Smuzhiyun  *     balance in favor of false negatives.
69*4882a593Smuzhiyun  *
70*4882a593Smuzhiyun  *  3. There must be no characters other than a-zA-Z0-9 and _.:$
71*4882a593Smuzhiyun  *
72*4882a593Smuzhiyun  *  4. There must be no unrecognized $-sign sequences.
73*4882a593Smuzhiyun  *
74*4882a593Smuzhiyun  *  5. There must be no sequence of three or more dots in a row ("...").
75*4882a593Smuzhiyun  */
76*4882a593Smuzhiyun bool
rust_is_mangled(const char * sym)77*4882a593Smuzhiyun rust_is_mangled(const char *sym)
78*4882a593Smuzhiyun {
79*4882a593Smuzhiyun 	size_t len, len_without_hash;
80*4882a593Smuzhiyun 
81*4882a593Smuzhiyun 	if (!sym)
82*4882a593Smuzhiyun 		return false;
83*4882a593Smuzhiyun 
84*4882a593Smuzhiyun 	len = strlen(sym);
85*4882a593Smuzhiyun 	if (len <= hash_prefix_len + hash_len)
86*4882a593Smuzhiyun 		/* Not long enough to contain "::h" + hash + something else */
87*4882a593Smuzhiyun 		return false;
88*4882a593Smuzhiyun 
89*4882a593Smuzhiyun 	len_without_hash = len - (hash_prefix_len + hash_len);
90*4882a593Smuzhiyun 	if (!is_prefixed_hash(sym + len_without_hash))
91*4882a593Smuzhiyun 		return false;
92*4882a593Smuzhiyun 
93*4882a593Smuzhiyun 	return looks_like_rust(sym, len_without_hash);
94*4882a593Smuzhiyun }
95*4882a593Smuzhiyun 
96*4882a593Smuzhiyun /*
97*4882a593Smuzhiyun  * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
98*4882a593Smuzhiyun  * digits must comprise between 5 and 15 (inclusive) distinct digits.
99*4882a593Smuzhiyun  */
is_prefixed_hash(const char * str)100*4882a593Smuzhiyun static bool is_prefixed_hash(const char *str)
101*4882a593Smuzhiyun {
102*4882a593Smuzhiyun 	const char *end;
103*4882a593Smuzhiyun 	bool seen[16];
104*4882a593Smuzhiyun 	size_t i;
105*4882a593Smuzhiyun 	int count;
106*4882a593Smuzhiyun 
107*4882a593Smuzhiyun 	if (strncmp(str, hash_prefix, hash_prefix_len))
108*4882a593Smuzhiyun 		return false;
109*4882a593Smuzhiyun 	str += hash_prefix_len;
110*4882a593Smuzhiyun 
111*4882a593Smuzhiyun 	memset(seen, false, sizeof(seen));
112*4882a593Smuzhiyun 	for (end = str + hash_len; str < end; str++)
113*4882a593Smuzhiyun 		if (*str >= '0' && *str <= '9')
114*4882a593Smuzhiyun 			seen[*str - '0'] = true;
115*4882a593Smuzhiyun 		else if (*str >= 'a' && *str <= 'f')
116*4882a593Smuzhiyun 			seen[*str - 'a' + 10] = true;
117*4882a593Smuzhiyun 		else
118*4882a593Smuzhiyun 			return false;
119*4882a593Smuzhiyun 
120*4882a593Smuzhiyun 	/* Count how many distinct digits seen */
121*4882a593Smuzhiyun 	count = 0;
122*4882a593Smuzhiyun 	for (i = 0; i < 16; i++)
123*4882a593Smuzhiyun 		if (seen[i])
124*4882a593Smuzhiyun 			count++;
125*4882a593Smuzhiyun 
126*4882a593Smuzhiyun 	return count >= 5 && count <= 15;
127*4882a593Smuzhiyun }
128*4882a593Smuzhiyun 
/*
 * Check whether the first len bytes of str could be the path part of a
 * mangled Rust symbol:
 *
 *  - only a-zA-Z0-9 and _.:$ may appear;
 *  - every '$' must begin one of the known escape sequences, and the whole
 *    sequence must lie inside the first len bytes;
 *  - three or more consecutive dots inside the range are rejected.
 *
 * Returns true if all checks pass (an empty range passes), false otherwise.
 */
static bool looks_like_rust(const char *str, size_t len)
{
	static const char *const escapes[] = {
		"$C$",
		"$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
		"$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
	};
	const char *end = str + len;

	while (str < end) {
		const size_t left = (size_t)(end - str);
		const char c = *str;

		if (c == '$') {
			size_t matched = 0;
			size_t i;

			for (i = 0; i < sizeof(escapes) / sizeof(escapes[0]); i++) {
				const size_t elen = strlen(escapes[i]);

				/* Never accept an escape that runs past the range. */
				if (elen <= left && !strncmp(str, escapes[i], elen)) {
					matched = elen;
					break;
				}
			}
			if (!matched)
				return false;
			str += matched;
		} else if (c == '.') {
			/* Do not allow three or more consecutive dots */
			if (left >= 3 && str[1] == '.' && str[2] == '.')
				return false;
			str++;
		} else if ((c >= 'a' && c <= 'z') ||
			   (c >= 'A' && c <= 'Z') ||
			   (c >= '0' && c <= '9') ||
			   c == '_' || c == ':') {
			str++;
		} else {
			return false;
		}
	}

	return true;
}
173*4882a593Smuzhiyun 
174*4882a593Smuzhiyun /*
175*4882a593Smuzhiyun  * INPUT:
176*4882a593Smuzhiyun  *     sym: symbol for which rust_is_mangled(sym) returns true
177*4882a593Smuzhiyun  *
178*4882a593Smuzhiyun  * The input is demangled in-place because the mangled name is always longer
179*4882a593Smuzhiyun  * than the demangled one.
180*4882a593Smuzhiyun  */
181*4882a593Smuzhiyun void
rust_demangle_sym(char * sym)182*4882a593Smuzhiyun rust_demangle_sym(char *sym)
183*4882a593Smuzhiyun {
184*4882a593Smuzhiyun 	const char *in;
185*4882a593Smuzhiyun 	char *out;
186*4882a593Smuzhiyun 	const char *end;
187*4882a593Smuzhiyun 
188*4882a593Smuzhiyun 	if (!sym)
189*4882a593Smuzhiyun 		return;
190*4882a593Smuzhiyun 
191*4882a593Smuzhiyun 	in = sym;
192*4882a593Smuzhiyun 	out = sym;
193*4882a593Smuzhiyun 	end = sym + strlen(sym) - (hash_prefix_len + hash_len);
194*4882a593Smuzhiyun 
195*4882a593Smuzhiyun 	while (in < end)
196*4882a593Smuzhiyun 		switch (*in) {
197*4882a593Smuzhiyun 		case '$':
198*4882a593Smuzhiyun 			if (!(unescape(&in, &out, "$C$", ',')
199*4882a593Smuzhiyun 					|| unescape(&in, &out, "$SP$", '@')
200*4882a593Smuzhiyun 					|| unescape(&in, &out, "$BP$", '*')
201*4882a593Smuzhiyun 					|| unescape(&in, &out, "$RF$", '&')
202*4882a593Smuzhiyun 					|| unescape(&in, &out, "$LT$", '<')
203*4882a593Smuzhiyun 					|| unescape(&in, &out, "$GT$", '>')
204*4882a593Smuzhiyun 					|| unescape(&in, &out, "$LP$", '(')
205*4882a593Smuzhiyun 					|| unescape(&in, &out, "$RP$", ')')
206*4882a593Smuzhiyun 					|| unescape(&in, &out, "$u20$", ' ')
207*4882a593Smuzhiyun 					|| unescape(&in, &out, "$u27$", '\'')
208*4882a593Smuzhiyun 					|| unescape(&in, &out, "$u5b$", '[')
209*4882a593Smuzhiyun 					|| unescape(&in, &out, "$u5d$", ']')
210*4882a593Smuzhiyun 					|| unescape(&in, &out, "$u7e$", '~'))) {
211*4882a593Smuzhiyun 				pr_err("demangle-rust: unexpected escape sequence");
212*4882a593Smuzhiyun 				goto done;
213*4882a593Smuzhiyun 			}
214*4882a593Smuzhiyun 			break;
215*4882a593Smuzhiyun 		case '_':
216*4882a593Smuzhiyun 			/*
217*4882a593Smuzhiyun 			 * If this is the start of a path component and the next
218*4882a593Smuzhiyun 			 * character is an escape sequence, ignore the
219*4882a593Smuzhiyun 			 * underscore. The mangler inserts an underscore to make
220*4882a593Smuzhiyun 			 * sure the path component begins with a XID_Start
221*4882a593Smuzhiyun 			 * character.
222*4882a593Smuzhiyun 			 */
223*4882a593Smuzhiyun 			if ((in == sym || in[-1] == ':') && in[1] == '$')
224*4882a593Smuzhiyun 				in++;
225*4882a593Smuzhiyun 			else
226*4882a593Smuzhiyun 				*out++ = *in++;
227*4882a593Smuzhiyun 			break;
228*4882a593Smuzhiyun 		case '.':
229*4882a593Smuzhiyun 			if (in[1] == '.') {
230*4882a593Smuzhiyun 				/* ".." becomes "::" */
231*4882a593Smuzhiyun 				*out++ = ':';
232*4882a593Smuzhiyun 				*out++ = ':';
233*4882a593Smuzhiyun 				in += 2;
234*4882a593Smuzhiyun 			} else {
235*4882a593Smuzhiyun 				/* "." becomes "-" */
236*4882a593Smuzhiyun 				*out++ = '-';
237*4882a593Smuzhiyun 				in++;
238*4882a593Smuzhiyun 			}
239*4882a593Smuzhiyun 			break;
240*4882a593Smuzhiyun 		case 'a' ... 'z':
241*4882a593Smuzhiyun 		case 'A' ... 'Z':
242*4882a593Smuzhiyun 		case '0' ... '9':
243*4882a593Smuzhiyun 		case ':':
244*4882a593Smuzhiyun 			*out++ = *in++;
245*4882a593Smuzhiyun 			break;
246*4882a593Smuzhiyun 		default:
247*4882a593Smuzhiyun 			pr_err("demangle-rust: unexpected character '%c' in symbol\n",
248*4882a593Smuzhiyun 				*in);
249*4882a593Smuzhiyun 			goto done;
250*4882a593Smuzhiyun 		}
251*4882a593Smuzhiyun 
252*4882a593Smuzhiyun done:
253*4882a593Smuzhiyun 	*out = '\0';
254*4882a593Smuzhiyun }
255*4882a593Smuzhiyun 
/*
 * Try to consume the escape sequence seq at the current input position.
 *
 * If the text at *in starts with seq, value is stored at *out, *in is advanced
 * past the sequence, *out is advanced by one byte and true is returned.
 * Otherwise neither cursor moves, nothing is written and false is returned.
 */
static bool unescape(const char **in, char **out, const char *seq, char value)
{
	const size_t len = strlen(seq);

	if (strncmp(*in, seq, len) != 0)
		return false;

	/* Emit the single replacement byte, then skip the whole sequence. */
	*(*out)++ = value;
	*in += len;

	return true;
}
270