xref: /OK3568_Linux_fs/kernel/arch/x86/kernel/kprobes/opt.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *  Kernel Probes Jump Optimization (Optprobes)
4  *
5  * Copyright (C) IBM Corporation, 2002, 2004
6  * Copyright (C) Hitachi Ltd., 2012
7  */
8 #include <linux/kprobes.h>
9 #include <linux/perf_event.h>
10 #include <linux/ptrace.h>
11 #include <linux/string.h>
12 #include <linux/slab.h>
13 #include <linux/hardirq.h>
14 #include <linux/preempt.h>
15 #include <linux/extable.h>
16 #include <linux/kdebug.h>
17 #include <linux/kallsyms.h>
18 #include <linux/ftrace.h>
19 #include <linux/objtool.h>
20 #include <linux/pgtable.h>
21 #include <linux/static_call.h>
22 
23 #include <asm/text-patching.h>
24 #include <asm/cacheflush.h>
25 #include <asm/desc.h>
26 #include <linux/uaccess.h>
27 #include <asm/alternative.h>
28 #include <asm/insn.h>
29 #include <asm/debugreg.h>
30 #include <asm/set_memory.h>
31 #include <asm/sections.h>
32 #include <asm/nospec-branch.h>
33 
34 #include "common.h"
35 
/*
 * Rebuild the original instruction bytes at @addr when they are currently
 * hidden behind the jump of an optimized kprobe.
 *
 * @buf:  scratch buffer that receives MAX_INSN_SIZE opcodes read from @addr,
 *        with the bytes clobbered by the jump replaced by the saved originals.
 * @addr: kernel text address whose instruction bytes are wanted.
 *
 * Returns @addr unchanged when no fully optimized kprobe covers it, 0 when
 * the kernel text cannot be read, and otherwise the address of @buf.
 */
unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
{
	struct optimized_kprobe *op = NULL;
	struct kprobe *kp = NULL;
	int back;

	/*
	 * A jump occupies JMP32_INSN_SIZE bytes, so the owning probe may sit
	 * at @addr itself or at any of the bytes just before it.
	 */
	for (back = 0; back < JMP32_INSN_SIZE; back++) {
		struct optimized_kprobe *cand_op;
		struct kprobe *cand = get_kprobe((void *)addr - back);

		/* This function only handles jump-optimized kprobes */
		if (!cand || !kprobe_optimized(cand))
			continue;

		cand_op = container_of(cand, struct optimized_kprobe, kp);
		/* A non-empty op->list means op is still being optimized */
		if (!list_empty(&cand_op->list))
			continue;

		kp = cand;
		op = cand_op;
		break;
	}

	if (!op)
		return addr;

	/*
	 * The bytes following the probe point have been overwritten by the
	 * jump displacement; start from what is in memory and then patch in
	 * the originals kept in op->optinsn.copied_insn.
	 */
	if (copy_from_kernel_nofault(buf, (void *)addr,
				     MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
		return 0UL;

	if (addr != (unsigned long)kp->addr) {
		/* @addr lies inside the displacement: restore only its tail */
		long skip = addr - (unsigned long)kp->addr - 1;

		memcpy(buf, op->optinsn.copied_insn + skip, DISP32_SIZE - skip);
	} else {
		/* @addr is the probe point: restore opcode and displacement */
		buf[0] = kp->opcode;
		memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
	}

	return (unsigned long)buf;
}
75 
/*
 * Overwrite the three-byte NOP at @addr with a CLAC instruction when the
 * boot CPU supports SMAP; otherwise leave the bytes alone.
 */
static void synthesize_clac(kprobe_opcode_t *addr)
{
	/* Encoding of CLAC, which replaces the NOP3 placeholder */
	static const kprobe_opcode_t clac_insn[] = { 0x0f, 0x01, 0xca };

	/*
	 * Can't be static_cpu_has() due to how objtool treats this feature bit.
	 * This isn't a fast path anyway.
	 */
	if (boot_cpu_has(X86_FEATURE_SMAP))
		memcpy(addr, clac_insn, sizeof(clac_insn));
}
90 
91 /* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
synthesize_set_arg1(kprobe_opcode_t * addr,unsigned long val)92 static void synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
93 {
94 #ifdef CONFIG_X86_64
95 	*addr++ = 0x48;
96 	*addr++ = 0xbf;
97 #else
98 	*addr++ = 0xb8;
99 #endif
100 	*(unsigned long *)addr = val;
101 }
102 
/*
 * Template for the out-of-line code of an optimized kprobe.
 *
 * arch_prepare_optimized_kprobe() copies the bytes between
 * optprobe_template_entry and optprobe_template_end into a scratch buffer
 * and then patches the NOP placeholders found at the optprobe_template_clac,
 * optprobe_template_val and optprobe_template_call labels. The template is
 * emitted into .rodata and serves as the source of that copy.
 */
asm (
			".pushsection .rodata\n"
			"optprobe_template_func:\n"
			/* Record the template in the non-standard stack frame list */
			".pushsection .discard.func_stack_frame_non_standard\n"
			"__func_stack_frame_non_standard_optprobe_template_func:\n"
#ifdef CONFIG_64BIT
		        ".quad optprobe_template_func\n"
#else
			".long optprobe_template_func\n"
#endif
			".popsection\n"
			".global optprobe_template_entry\n"
			"optprobe_template_entry:\n"
#ifdef CONFIG_X86_64
			/* We don't bother saving the ss register */
			"	pushq %rsp\n"
			"	pushfq\n"
			".global optprobe_template_clac\n"
			"optprobe_template_clac:\n"
			/* Placeholder: synthesize_clac() may replace this with CLAC */
			ASM_NOP3
			SAVE_REGS_STRING
			/* rsi = saved registers (presumably the regs argument of the callback) */
			"	movq %rsp, %rsi\n"
			".global optprobe_template_val\n"
			"optprobe_template_val:\n"
			/* Placeholder: synthesize_set_arg1() writes its move here */
			ASM_NOP5
			ASM_NOP5
			".global optprobe_template_call\n"
			"optprobe_template_call:\n"
			/* Placeholder: replaced by a relative call to optimized_callback() */
			ASM_NOP5
			/* Move flags to rsp */
			"	movq 18*8(%rsp), %rdx\n"
			"	movq %rdx, 19*8(%rsp)\n"
			RESTORE_REGS_STRING
			/* Skip flags entry */
			"	addq $8, %rsp\n"
			"	popfq\n"
#else /* CONFIG_X86_32 */
			"	pushl %esp\n"
			"	pushfl\n"
			".global optprobe_template_clac\n"
			"optprobe_template_clac:\n"
			/* Placeholder: synthesize_clac() may replace this with CLAC */
			ASM_NOP3
			SAVE_REGS_STRING
			/* edx = saved registers (presumably the regs argument of the callback) */
			"	movl %esp, %edx\n"
			".global optprobe_template_val\n"
			"optprobe_template_val:\n"
			/* Placeholder: synthesize_set_arg1() writes its move here */
			ASM_NOP5
			".global optprobe_template_call\n"
			"optprobe_template_call:\n"
			/* Placeholder: replaced by a relative call to optimized_callback() */
			ASM_NOP5
			/* Move flags into esp */
			"	movl 14*4(%esp), %edx\n"
			"	movl %edx, 15*4(%esp)\n"
			RESTORE_REGS_STRING
			/* Skip flags entry */
			"	addl $4, %esp\n"
			"	popfl\n"
#endif
			".global optprobe_template_end\n"
			"optprobe_template_end:\n"
			".popsection\n");
164 
/*
 * Byte offsets of the patchable spots inside the template, all measured
 * from optprobe_template_entry:
 *   TMPL_CLAC_IDX - NOP3 that synthesize_clac() may turn into CLAC
 *   TMPL_MOVE_IDX - where synthesize_set_arg1() writes the argument load
 *   TMPL_CALL_IDX - where the call to optimized_callback() is written
 *   TMPL_END_IDX  - template length; copied instructions are placed here
 */
#define TMPL_CLAC_IDX \
	((long)optprobe_template_clac - (long)optprobe_template_entry)
#define TMPL_MOVE_IDX \
	((long)optprobe_template_val - (long)optprobe_template_entry)
#define TMPL_CALL_IDX \
	((long)optprobe_template_call - (long)optprobe_template_entry)
#define TMPL_END_IDX \
	((long)optprobe_template_end - (long)optprobe_template_entry)
173 
174 /* Optimized kprobe call back function: called from optinsn */
175 static void
optimized_callback(struct optimized_kprobe * op,struct pt_regs * regs)176 optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
177 {
178 	/* This is possible if op is under delayed unoptimizing */
179 	if (kprobe_disabled(&op->kp))
180 		return;
181 
182 	preempt_disable();
183 	if (kprobe_running()) {
184 		kprobes_inc_nmissed_count(&op->kp);
185 	} else {
186 		struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
187 		/* Save skipped registers */
188 		regs->cs = __KERNEL_CS;
189 #ifdef CONFIG_X86_32
190 		regs->gs = 0;
191 #endif
192 		regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
193 		regs->orig_ax = ~0UL;
194 
195 		__this_cpu_write(current_kprobe, &op->kp);
196 		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
197 		opt_pre_handler(&op->kp, regs);
198 		__this_cpu_write(current_kprobe, NULL);
199 	}
200 	preempt_enable();
201 }
202 NOKPROBE_SYMBOL(optimized_callback);
203 
/*
 * Copy whole instructions starting at @src until at least JMP32_INSN_SIZE
 * bytes are covered, i.e. everything a relative jump at @src would clobber.
 *
 * @dest: buffer receiving the copied instructions.
 * @src:  probed address the instructions are read from.
 * @real: address handed to __copy_instruction() alongside @dest (the caller
 *        passes the matching position inside the instruction slot).
 *
 * Returns the number of bytes copied, -EINVAL when an instruction cannot be
 * copied or boosted, or -EBUSY when the covered text is reserved by another
 * code-patching user.
 */
static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
{
	struct insn insn;
	int total, copied;
	u8 *last;

	for (total = 0; total < JMP32_INSN_SIZE; total += copied) {
		copied = __copy_instruction(dest + total, src + total,
					    real + total, &insn);
		if (!copied)
			return -EINVAL;
		if (!can_boost(&insn, src + total))
			return -EINVAL;
	}

	/* Refuse ranges that other text-patching mechanisms have reserved */
	last = src + total - 1;
	if (ftrace_text_reserved(src, last))
		return -EBUSY;
	if (alternatives_text_reserved(src, last))
		return -EBUSY;
	if (jump_label_text_reserved(src, last))
		return -EBUSY;
	if (static_call_text_reserved(src, last))
		return -EBUSY;

	return total;
}
224 
225 /* Check whether insn is indirect jump */
__insn_is_indirect_jump(struct insn * insn)226 static int __insn_is_indirect_jump(struct insn *insn)
227 {
228 	return ((insn->opcode.bytes[0] == 0xff &&
229 		(X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
230 		insn->opcode.bytes[0] == 0xea);	/* Segment based jump */
231 }
232 
233 /* Check whether insn jumps into specified address range */
insn_jump_into_range(struct insn * insn,unsigned long start,int len)234 static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
235 {
236 	unsigned long target = 0;
237 
238 	switch (insn->opcode.bytes[0]) {
239 	case 0xe0:	/* loopne */
240 	case 0xe1:	/* loope */
241 	case 0xe2:	/* loop */
242 	case 0xe3:	/* jcxz */
243 	case 0xe9:	/* near relative jump */
244 	case 0xeb:	/* short relative jump */
245 		break;
246 	case 0x0f:
247 		if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
248 			break;
249 		return 0;
250 	default:
251 		if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
252 			break;
253 		return 0;
254 	}
255 	target = (unsigned long)insn->next_byte + insn->immediate.value;
256 
257 	return (start <= target && target <= start + len);
258 }
259 
/*
 * Return non-zero when @insn must be treated as an indirect jump. With
 * CONFIG_RETPOLINE this also covers relative branches that land inside the
 * indirect thunk area.
 */
static int insn_is_indirect_jump(struct insn *insn)
{
	int indirect = __insn_is_indirect_jump(insn);

#ifdef CONFIG_RETPOLINE
	/*
	 * Jump to x86_indirect_thunk_* is treated as an indirect jump.
	 * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with
	 * older gcc may use indirect jump. So this check is added on top of
	 * the plain indirect-jump check rather than replacing it.
	 */
	if (!indirect) {
		unsigned long thunk_start = (unsigned long)__indirect_thunk_start;
		unsigned long thunk_size = (unsigned long)__indirect_thunk_end -
					   thunk_start;

		indirect = insn_jump_into_range(insn, thunk_start, thunk_size);
	}
#endif
	return indirect;
}
279 
is_padding_int3(unsigned long addr,unsigned long eaddr)280 static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
281 {
282 	unsigned char ops;
283 
284 	for (; addr < eaddr; addr++) {
285 		if (get_kernel_nofault(ops, (void *)addr) < 0 ||
286 		    ops != INT3_INSN_OPCODE)
287 			return false;
288 	}
289 
290 	return true;
291 }
292 
293 /* Decode whole function to ensure any instructions don't jump into target */
can_optimize(unsigned long paddr)294 static int can_optimize(unsigned long paddr)
295 {
296 	unsigned long addr, size = 0, offset = 0;
297 	struct insn insn;
298 	kprobe_opcode_t buf[MAX_INSN_SIZE];
299 
300 	/* Lookup symbol including addr */
301 	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
302 		return 0;
303 
304 	/*
305 	 * Do not optimize in the entry code due to the unstable
306 	 * stack handling and registers setup.
307 	 */
308 	if (((paddr >= (unsigned long)__entry_text_start) &&
309 	     (paddr <  (unsigned long)__entry_text_end)))
310 		return 0;
311 
312 	/* Check there is enough space for a relative jump. */
313 	if (size - offset < JMP32_INSN_SIZE)
314 		return 0;
315 
316 	/* Decode instructions */
317 	addr = paddr - offset;
318 	while (addr < paddr - offset + size) { /* Decode until function end */
319 		unsigned long recovered_insn;
320 		if (search_exception_tables(addr))
321 			/*
322 			 * Since some fixup code will jumps into this function,
323 			 * we can't optimize kprobe in this function.
324 			 */
325 			return 0;
326 		recovered_insn = recover_probed_instruction(buf, addr);
327 		if (!recovered_insn)
328 			return 0;
329 		kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
330 		insn_get_length(&insn);
331 		/*
332 		 * In the case of detecting unknown breakpoint, this could be
333 		 * a padding INT3 between functions. Let's check that all the
334 		 * rest of the bytes are also INT3.
335 		 */
336 		if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
337 			return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
338 
339 		/* Recover address */
340 		insn.kaddr = (void *)addr;
341 		insn.next_byte = (void *)(addr + insn.length);
342 		/* Check any instructions don't jump into target */
343 		if (insn_is_indirect_jump(&insn) ||
344 		    insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
345 					 DISP32_SIZE))
346 			return 0;
347 		addr += insn.length;
348 	}
349 
350 	return 1;
351 }
352 
353 /* Check optimized_kprobe can actually be optimized. */
arch_check_optimized_kprobe(struct optimized_kprobe * op)354 int arch_check_optimized_kprobe(struct optimized_kprobe *op)
355 {
356 	int i;
357 	struct kprobe *p;
358 
359 	for (i = 1; i < op->optinsn.size; i++) {
360 		p = get_kprobe(op->kp.addr + i);
361 		if (p && !kprobe_disabled(p))
362 			return -EEXIST;
363 	}
364 
365 	return 0;
366 }
367 
368 /* Check the addr is within the optimized instructions. */
arch_within_optimized_kprobe(struct optimized_kprobe * op,unsigned long addr)369 int arch_within_optimized_kprobe(struct optimized_kprobe *op,
370 				 unsigned long addr)
371 {
372 	return ((unsigned long)op->kp.addr <= addr &&
373 		(unsigned long)op->kp.addr + op->optinsn.size > addr);
374 }
375 
376 /* Free optimized instruction slot */
377 static
__arch_remove_optimized_kprobe(struct optimized_kprobe * op,int dirty)378 void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
379 {
380 	u8 *slot = op->optinsn.insn;
381 	if (slot) {
382 		int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;
383 
384 		/* Record the perf event before freeing the slot */
385 		if (dirty)
386 			perf_event_text_poke(slot, slot, len, NULL, 0);
387 
388 		free_optinsn_slot(slot, dirty);
389 		op->optinsn.insn = NULL;
390 		op->optinsn.size = 0;
391 	}
392 }
393 
/* Free the instruction slot of @op, treating the slot as dirty. */
void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
{
	const int dirty = 1;

	__arch_remove_optimized_kprobe(op, dirty);
}
398 
399 /*
400  * Copy replacing target instructions
401  * Target instructions MUST be relocatable (checked inside)
402  * This is called when new aggr(opt)probe is allocated or reused.
403  */
arch_prepare_optimized_kprobe(struct optimized_kprobe * op,struct kprobe * __unused)404 int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
405 				  struct kprobe *__unused)
406 {
407 	u8 *buf = NULL, *slot;
408 	int ret, len;
409 	long rel;
410 
411 	if (!can_optimize((unsigned long)op->kp.addr))
412 		return -EILSEQ;
413 
414 	buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
415 	if (!buf)
416 		return -ENOMEM;
417 
418 	op->optinsn.insn = slot = get_optinsn_slot();
419 	if (!slot) {
420 		ret = -ENOMEM;
421 		goto out;
422 	}
423 
424 	/*
425 	 * Verify if the address gap is in 2GB range, because this uses
426 	 * a relative jump.
427 	 */
428 	rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
429 	if (abs(rel) > 0x7fffffff) {
430 		ret = -ERANGE;
431 		goto err;
432 	}
433 
434 	/* Copy arch-dep-instance from template */
435 	memcpy(buf, optprobe_template_entry, TMPL_END_IDX);
436 
437 	/* Copy instructions into the out-of-line buffer */
438 	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
439 					  slot + TMPL_END_IDX);
440 	if (ret < 0)
441 		goto err;
442 	op->optinsn.size = ret;
443 	len = TMPL_END_IDX + op->optinsn.size;
444 
445 	synthesize_clac(buf + TMPL_CLAC_IDX);
446 
447 	/* Set probe information */
448 	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
449 
450 	/* Set probe function call */
451 	synthesize_relcall(buf + TMPL_CALL_IDX,
452 			   slot + TMPL_CALL_IDX, optimized_callback);
453 
454 	/* Set returning jmp instruction at the tail of out-of-line buffer */
455 	synthesize_reljump(buf + len, slot + len,
456 			   (u8 *)op->kp.addr + op->optinsn.size);
457 	len += JMP32_INSN_SIZE;
458 
459 	/*
460 	 * Note	len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
461 	 * used in __arch_remove_optimized_kprobe().
462 	 */
463 
464 	/* We have to use text_poke() for instruction buffer because it is RO */
465 	perf_event_text_poke(slot, NULL, 0, buf, len);
466 	text_poke(slot, buf, len);
467 
468 	ret = 0;
469 out:
470 	kfree(buf);
471 	return ret;
472 
473 err:
474 	__arch_remove_optimized_kprobe(op, 0);
475 	goto out;
476 }
477 
478 /*
479  * Replace breakpoints (INT3) with relative jumps (JMP.d32).
480  * Caller must call with locking kprobe_mutex and text_mutex.
481  *
482  * The caller will have installed a regular kprobe and after that issued
483  * syncrhonize_rcu_tasks(), this ensures that the instruction(s) that live in
484  * the 4 bytes after the INT3 are unused and can now be overwritten.
485  */
arch_optimize_kprobes(struct list_head * oplist)486 void arch_optimize_kprobes(struct list_head *oplist)
487 {
488 	struct optimized_kprobe *op, *tmp;
489 	u8 insn_buff[JMP32_INSN_SIZE];
490 
491 	list_for_each_entry_safe(op, tmp, oplist, list) {
492 		s32 rel = (s32)((long)op->optinsn.insn -
493 			((long)op->kp.addr + JMP32_INSN_SIZE));
494 
495 		WARN_ON(kprobe_disabled(&op->kp));
496 
497 		/* Backup instructions which will be replaced by jump address */
498 		memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
499 		       DISP32_SIZE);
500 
501 		insn_buff[0] = JMP32_INSN_OPCODE;
502 		*(s32 *)(&insn_buff[1]) = rel;
503 
504 		text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
505 
506 		list_del_init(&op->list);
507 	}
508 }
509 
510 /*
511  * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
512  *
513  * After that, we can restore the 4 bytes after the INT3 to undo what
514  * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
515  * unused once the INT3 lands.
516  */
arch_unoptimize_kprobe(struct optimized_kprobe * op)517 void arch_unoptimize_kprobe(struct optimized_kprobe *op)
518 {
519 	u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
520 	u8 old[JMP32_INSN_SIZE];
521 	u8 *addr = op->kp.addr;
522 
523 	memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
524 	memcpy(new + INT3_INSN_SIZE,
525 	       op->optinsn.copied_insn,
526 	       JMP32_INSN_SIZE - INT3_INSN_SIZE);
527 
528 	text_poke(addr, new, INT3_INSN_SIZE);
529 	text_poke_sync();
530 	text_poke(addr + INT3_INSN_SIZE,
531 		  new + INT3_INSN_SIZE,
532 		  JMP32_INSN_SIZE - INT3_INSN_SIZE);
533 	text_poke_sync();
534 
535 	perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
536 }
537 
538 /*
539  * Recover original instructions and breakpoints from relative jumps.
540  * Caller must call with locking kprobe_mutex.
541  */
arch_unoptimize_kprobes(struct list_head * oplist,struct list_head * done_list)542 extern void arch_unoptimize_kprobes(struct list_head *oplist,
543 				    struct list_head *done_list)
544 {
545 	struct optimized_kprobe *op, *tmp;
546 
547 	list_for_each_entry_safe(op, tmp, oplist, list) {
548 		arch_unoptimize_kprobe(op);
549 		list_move(&op->list, done_list);
550 	}
551 }
552 
setup_detour_execution(struct kprobe * p,struct pt_regs * regs,int reenter)553 int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
554 {
555 	struct optimized_kprobe *op;
556 
557 	if (p->flags & KPROBE_FLAG_OPTIMIZED) {
558 		/* This kprobe is really able to run optimized path. */
559 		op = container_of(p, struct optimized_kprobe, kp);
560 		/* Detour through copied instructions */
561 		regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
562 		if (!reenter)
563 			reset_current_kprobe();
564 		return 1;
565 	}
566 	return 0;
567 }
568 NOKPROBE_SYMBOL(setup_detour_execution);
569