| author | Ingo Molnar <mingo@kernel.org> | 2014-06-05 13:18:03 +0400 | 
|---|---|---|
| committer | Ingo Molnar <mingo@kernel.org> | 2014-06-05 14:26:27 +0400 | 
| commit | c56d34064b6eb9f9cde9e35bbfe16eedf3d81f94 (patch) | |
| tree | 9877ef9e1b238e14a1878f10d51ea55fbca5f619 | |
| parent | b13fa91421213a8d1fd05086050f05e994f3b72d (diff) | |
| parent | a03b1e1c372b60183b8141cdd161316429fab5ac (diff) | |
| download | linux-c56d34064b6eb9f9cde9e35bbfe16eedf3d81f94.tar.xz | |
Merge branch 'perf/uprobes' into perf/core
These bits from Oleg are fully cooked, ship them to Linus.
Signed-off-by: Ingo Molnar <mingo@kernel.org>
| -rw-r--r-- | arch/x86/include/asm/traps.h   |   1 |
| -rw-r--r-- | arch/x86/include/asm/uprobes.h |  10 |
| -rw-r--r-- | arch/x86/kernel/process_64.c   |   7 |
| -rw-r--r-- | arch/x86/kernel/traps.c        | 110 |
| -rw-r--r-- | arch/x86/kernel/uprobes.c      | 506 |
| -rw-r--r-- | include/linux/uprobes.h        |   4 |
| -rw-r--r-- | kernel/events/uprobes.c        |  35 |
| -rw-r--r-- | kernel/trace/trace_uprobe.c    |  46 |
8 files changed, 402 insertions, 317 deletions
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 58d66fe06b61..a7b212db9e04 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -98,7 +98,6 @@ static inline int get_si_code(unsigned long condition)  extern int panic_on_unrecovered_nmi; -void math_error(struct pt_regs *, int, int);  void math_emulate(struct math_emu_info *);  #ifndef CONFIG_X86_32  asmlinkage void smp_thermal_interrupt(void); diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 93bee7b93854..7be3c079e389 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -41,18 +41,18 @@ struct arch_uprobe {  		u8			ixol[MAX_UINSN_BYTES];  	}; -	u16				fixups;  	const struct uprobe_xol_ops	*ops;  	union { -#ifdef CONFIG_X86_64 -		unsigned long			rip_rela_target_address; -#endif  		struct {  			s32	offs;  			u8	ilen;  			u8	opc1; -		}				branch; +		}			branch; +		struct { +			u8	fixups; +			u8	ilen; +		} 			def;  	};  }; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 9c0280f93d05..9b53940981b7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -413,12 +413,11 @@ void set_personality_ia32(bool x32)  	set_thread_flag(TIF_ADDR32);  	/* Mark the associated mm as containing 32-bit tasks. */ -	if (current->mm) -		current->mm->context.ia32_compat = 1; -  	if (x32) {  		clear_thread_flag(TIF_IA32);  		set_thread_flag(TIF_X32); +		if (current->mm) +			current->mm->context.ia32_compat = TIF_X32;  		current->personality &= ~READ_IMPLIES_EXEC;  		/* is_compat_task() uses the presence of the x32  		   syscall bit flag to determine compat status */ @@ -426,6 +425,8 @@ void set_personality_ia32(bool x32)  	} else {  		set_thread_flag(TIF_IA32);  		clear_thread_flag(TIF_X32); +		if (current->mm) +			current->mm->context.ia32_compat = TIF_IA32;  		current->personality |= force_personality32;  		/* Prepare the first "return" to user space */  		current_thread_info()->status |= TS_COMPAT; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 57409f6b8c62..3fdb20548c4b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -23,6 +23,7 @@  #include <linux/kernel.h>  #include <linux/module.h>  #include <linux/ptrace.h> +#include <linux/uprobes.h>  #include <linux/string.h>  #include <linux/delay.h>  #include <linux/errno.h> @@ -136,6 +137,37 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,  	return -1;  } +static siginfo_t *fill_trap_info(struct pt_regs *regs, int signr, int trapnr, +				siginfo_t *info) +{ +	unsigned long siaddr; +	int sicode; + +	switch (trapnr) { +	default: +		return SEND_SIG_PRIV; + +	case X86_TRAP_DE: +		sicode = FPE_INTDIV; +		siaddr = uprobe_get_trap_addr(regs); +		break; +	case X86_TRAP_UD: +		sicode = ILL_ILLOPN; +		siaddr = uprobe_get_trap_addr(regs); +		break; +	case X86_TRAP_AC: +		sicode = BUS_ADRALN; +		siaddr = 0; +		break; +	} + +	info->si_signo = signr; +	info->si_errno = 0; +	info->si_code = sicode; +	info->si_addr = (void __user *)siaddr; +	return info; +} +  static void __kprobes  do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,  	long error_code, siginfo_t *info) @@ -168,60 +200,42 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,  	}  #endif -	if (info) -		force_sig_info(signr, info, tsk); -	else -		force_sig(signr, tsk); +	force_sig_info(signr, info ?: SEND_SIG_PRIV, tsk);  } -#define DO_ERROR(trapnr, signr, str, name)				\ -dotraplinkage void do_##name(struct 
pt_regs *regs, long error_code)	\ -{									\ -	enum ctx_state prev_state;					\ -									\ -	prev_state = exception_enter();					\ -	if (notify_die(DIE_TRAP, str, regs, error_code,			\ -			trapnr, signr) == NOTIFY_STOP) {		\ -		exception_exit(prev_state);				\ -		return;							\ -	}								\ -	conditional_sti(regs);						\ -	do_trap(trapnr, signr, str, regs, error_code, NULL);		\ -	exception_exit(prev_state);					\ +static void do_error_trap(struct pt_regs *regs, long error_code, char *str, +			  unsigned long trapnr, int signr) +{ +	enum ctx_state prev_state = exception_enter(); +	siginfo_t info; + +	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != +			NOTIFY_STOP) { +		conditional_sti(regs); +		do_trap(trapnr, signr, str, regs, error_code, +			fill_trap_info(regs, signr, trapnr, &info)); +	} + +	exception_exit(prev_state);  } -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)		\ +#define DO_ERROR(trapnr, signr, str, name)				\  dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\  {									\ -	siginfo_t info;							\ -	enum ctx_state prev_state;					\ -									\ -	info.si_signo = signr;						\ -	info.si_errno = 0;						\ -	info.si_code = sicode;						\ -	info.si_addr = (void __user *)siaddr;				\ -	prev_state = exception_enter();					\ -	if (notify_die(DIE_TRAP, str, regs, error_code,			\ -			trapnr, signr) == NOTIFY_STOP) {		\ -		exception_exit(prev_state);				\ -		return;							\ -	}								\ -	conditional_sti(regs);						\ -	do_trap(trapnr, signr, str, regs, error_code, &info);		\ -	exception_exit(prev_state);					\ +	do_error_trap(regs, error_code, str, trapnr, signr);		\  } -DO_ERROR_INFO(X86_TRAP_DE,     SIGFPE,  "divide error",			divide_error,		     FPE_INTDIV, regs->ip ) -DO_ERROR     (X86_TRAP_OF,     SIGSEGV, "overflow",			overflow					  ) -DO_ERROR     (X86_TRAP_BR,     SIGSEGV, "bounds",			bounds						  ) -DO_ERROR_INFO(X86_TRAP_UD,     SIGILL,  "invalid opcode",		invalid_op,		     ILL_ILLOPN, regs->ip ) -DO_ERROR     (X86_TRAP_OLD_MF, SIGFPE,  "coprocessor segment overrun",	coprocessor_segment_overrun			  ) -DO_ERROR     (X86_TRAP_TS,     SIGSEGV, "invalid TSS",			invalid_TSS					  ) -DO_ERROR     (X86_TRAP_NP,     SIGBUS,  "segment not present",		segment_not_present				  ) +DO_ERROR(X86_TRAP_DE,     SIGFPE,  "divide error",		divide_error) +DO_ERROR(X86_TRAP_OF,     SIGSEGV, "overflow",			overflow) +DO_ERROR(X86_TRAP_BR,     SIGSEGV, "bounds",			bounds) +DO_ERROR(X86_TRAP_UD,     SIGILL,  "invalid opcode",		invalid_op) +DO_ERROR(X86_TRAP_OLD_MF, SIGFPE,  "coprocessor segment overrun",coprocessor_segment_overrun) +DO_ERROR(X86_TRAP_TS,     SIGSEGV, "invalid TSS",		invalid_TSS) +DO_ERROR(X86_TRAP_NP,     SIGBUS,  "segment not present",	segment_not_present)  #ifdef CONFIG_X86_32 -DO_ERROR     (X86_TRAP_SS,     SIGBUS,  "stack segment",		stack_segment					  ) +DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",		stack_segment)  #endif -DO_ERROR_INFO(X86_TRAP_AC,     SIGBUS,  "alignment check",		alignment_check,	     BUS_ADRALN, 0	  ) +DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",		alignment_check)  #ifdef CONFIG_X86_64  /* Runs on IST stack */ @@ -305,7 +319,7 @@ do_general_protection(struct pt_regs *regs, long error_code)  		pr_cont("\n");  	} -	force_sig(SIGSEGV, tsk); +	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);  exit:  	exception_exit(prev_state);  } @@ -488,7 +502,7 @@ exit:   * the correct behaviour even in the presence of the asynchronous   * IRQ13 behaviour   */ -void math_error(struct pt_regs *regs, int 
error_code, int trapnr) +static void math_error(struct pt_regs *regs, int error_code, int trapnr)  {  	struct task_struct *task = current;  	siginfo_t info; @@ -518,7 +532,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)  	task->thread.error_code = error_code;  	info.si_signo = SIGFPE;  	info.si_errno = 0; -	info.si_addr = (void __user *)regs->ip; +	info.si_addr = (void __user *)uprobe_get_trap_addr(regs);  	if (trapnr == X86_TRAP_MF) {  		unsigned short cwd, swd;  		/* @@ -645,7 +659,7 @@ void math_state_restore(void)  	 */  	if (unlikely(restore_fpu_checking(tsk))) {  		drop_init_fpu(tsk); -		force_sig(SIGSEGV, tsk); +		force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);  		return;  	} diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index ace22916ade3..159ca520ef5b 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -32,20 +32,20 @@  /* Post-execution fixups. */ -/* No fixup needed */ -#define UPROBE_FIX_NONE		0x0 -  /* Adjust IP back to vicinity of actual insn */ -#define UPROBE_FIX_IP		0x1 +#define UPROBE_FIX_IP		0x01  /* Adjust the return address of a call insn */ -#define UPROBE_FIX_CALL	0x2 +#define UPROBE_FIX_CALL		0x02  /* Instruction will modify TF, don't change it */ -#define UPROBE_FIX_SETF	0x4 +#define UPROBE_FIX_SETF		0x04 -#define UPROBE_FIX_RIP_AX	0x8000 -#define UPROBE_FIX_RIP_CX	0x4000 +#define UPROBE_FIX_RIP_SI	0x08 +#define UPROBE_FIX_RIP_DI	0x10 +#define UPROBE_FIX_RIP_BX	0x20 +#define UPROBE_FIX_RIP_MASK	\ +	(UPROBE_FIX_RIP_SI | UPROBE_FIX_RIP_DI | UPROBE_FIX_RIP_BX)  #define	UPROBE_TRAP_NR		UINT_MAX @@ -67,6 +67,7 @@   * to keep gcc from statically optimizing it out, as variable_test_bit makes   * some versions of gcc to think only *(unsigned long*) is used.   */ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)  static volatile u32 good_insns_32[256 / 32] = {  	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */  	/*      ----------------------------------------------         */ @@ -89,33 +90,12 @@ static volatile u32 good_insns_32[256 / 32] = {  	/*      ----------------------------------------------         */  	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */  }; +#else +#define good_insns_32	NULL +#endif -/* Using this for both 64-bit and 32-bit apps */ -static volatile u32 good_2byte_insns[256 / 32] = { -	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */ -	/*      ----------------------------------------------         */ -	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ -	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ -	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ -	W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ -	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ -	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ -	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ -	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ -	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ -	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ -	W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ -	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ -	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ -	W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ -	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ -	W(0xf0, 0, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* f0 */ -	/*      ----------------------------------------------         */ -	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */ -}; - -#ifdef CONFIG_X86_64  /* Good-instruction tables for 64-bit apps */ +#if defined(CONFIG_X86_64)  static volatile u32 good_insns_64[256 / 32] = {  	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */  	/*      ----------------------------------------------         */ @@ -138,7 +118,33 @@ static volatile u32 good_insns_64[256 / 32] = {  	/*      ----------------------------------------------         */  	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */  }; +#else +#define good_insns_64	NULL  #endif + +/* Using this for both 64-bit and 32-bit apps */ +static volatile u32 good_2byte_insns[256 / 32] = { +	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */ +	/*      ----------------------------------------------         */ +	W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */ +	W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */ +	W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */ +	W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ +	W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ +	W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */ +	W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */ +	W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ +	W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ +	W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ +	W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */ +	W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */ +	W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ +	W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ +	W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */ +	W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0)   /* f0 */ +	/*      ----------------------------------------------         */ +	/*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f         */ +};  #undef W  /* @@ -209,16 +215,25 @@ static bool is_prefix_bad(struct insn *insn)  	return false;  } -static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn) +static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)  { -	insn_init(insn, auprobe->insn, false); +	u32 volatile *good_insns; + +	insn_init(insn, auprobe->insn, x86_64); +	/* has the side-effect of processing the entire instruction */ +	insn_get_length(insn); +	if (WARN_ON_ONCE(!insn_complete(insn))) +		return -ENOEXEC; -	/* Skip good instruction prefixes; reject "bad" ones. */ -	insn_get_opcode(insn);  	if (is_prefix_bad(insn))  		return -ENOTSUPP; -	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32)) +	if (x86_64) +		good_insns = good_insns_64; +	else +		good_insns = good_insns_32; + +	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns))  		return 0;  	if (insn->opcode.nbytes == 2) { @@ -230,14 +245,18 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)  }  #ifdef CONFIG_X86_64 +static inline bool is_64bit_mm(struct mm_struct *mm) +{ +	return	!config_enabled(CONFIG_IA32_EMULATION) || +		!(mm->context.ia32_compat == TIF_IA32); +}  /*   * If arch_uprobe->insn doesn't use rip-relative addressing, return   * immediately.  
Otherwise, rewrite the instruction so that it accesses   * its memory operand indirectly through a scratch register.  Set - * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address - * accordingly.  (The contents of the scratch register will be saved - * before we single-step the modified instruction, and restored - * afterward.) + * def->fixups accordingly. (The contents of the scratch register + * will be saved before we single-step the modified instruction, + * and restored afterward).   *   * We do this because a rip-relative instruction can access only a   * relatively small area (+/- 2 GB from the instruction), and the XOL @@ -248,164 +267,192 @@ static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)   *   * Some useful facts about rip-relative instructions:   * - *  - There's always a modrm byte. + *  - There's always a modrm byte with bit layout "00 reg 101".   *  - There's never a SIB byte.   *  - The displacement is always 4 bytes. + *  - REX.B=1 bit in REX prefix, which normally extends r/m field, + *    has no effect on rip-relative mode. It doesn't make modrm byte + *    with r/m=101 refer to register 1101 = R13.   */ -static void -handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)  {  	u8 *cursor;  	u8 reg; +	u8 reg2;  	if (!insn_rip_relative(insn))  		return;  	/* -	 * insn_rip_relative() would have decoded rex_prefix, modrm. +	 * insn_rip_relative() would have decoded rex_prefix, vex_prefix, modrm.  	 * Clear REX.b bit (extension of MODRM.rm field): -	 * we want to encode rax/rcx, not r8/r9. +	 * we want to encode low numbered reg, not r8+.  	 */  	if (insn->rex_prefix.nbytes) {  		cursor = auprobe->insn + insn_offset_rex_prefix(insn); -		*cursor &= 0xfe;	/* Clearing REX.B bit */ +		/* REX byte has 0100wrxb layout, clearing REX.b bit */ +		*cursor &= 0xfe; +	} +	/* +	 * Similar treatment for VEX3 prefix. +	 * TODO: add XOP/EVEX treatment when insn decoder supports them +	 */ +	if (insn->vex_prefix.nbytes == 3) { +		/* +		 * vex2:     c5    rvvvvLpp   (has no b bit) +		 * vex3/xop: c4/8f rxbmmmmm wvvvvLpp +		 * evex:     62    rxbR00mm wvvvv1pp zllBVaaa +		 *   (evex will need setting of both b and x since +		 *   in non-sib encoding evex.x is 4th bit of MODRM.rm) +		 * Setting VEX3.b (setting because it has inverted meaning): +		 */ +		cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; +		*cursor |= 0x20;  	}  	/* +	 * Convert from rip-relative addressing to register-relative addressing +	 * via a scratch register. +	 * +	 * This is tricky since there are insns with modrm byte +	 * which also use registers not encoded in modrm byte: +	 * [i]div/[i]mul: implicitly use dx:ax +	 * shift ops: implicitly use cx +	 * cmpxchg: implicitly uses ax +	 * cmpxchg8/16b: implicitly uses dx:ax and bx:cx +	 *   Encoding: 0f c7/1 modrm +	 *   The code below thinks that reg=1 (cx), chooses si as scratch. +	 * mulx: implicitly uses dx: mulx r/m,r1,r2 does r1:r2 = dx * r/m. +	 *   First appeared in Haswell (BMI2 insn). It is vex-encoded. +	 *   Example where none of bx,cx,dx can be used as scratch reg: +	 *   c4 e2 63 f6 0d disp32   mulx disp32(%rip),%ebx,%ecx +	 * [v]pcmpistri: implicitly uses cx, xmm0 +	 * [v]pcmpistrm: implicitly uses xmm0 +	 * [v]pcmpestri: implicitly uses ax, dx, cx, xmm0 +	 * [v]pcmpestrm: implicitly uses ax, dx, xmm0 +	 *   Evil SSE4.2 string comparison ops from hell. +	 * maskmovq/[v]maskmovdqu: implicitly uses (ds:rdi) as destination. 
+	 *   Encoding: 0f f7 modrm, 66 0f f7 modrm, vex-encoded: c5 f9 f7 modrm. +	 *   Store op1, byte-masked by op2 msb's in each byte, to (ds:rdi). +	 *   AMD says it has no 3-operand form (vex.vvvv must be 1111) +	 *   and that it can have only register operands, not mem +	 *   (its modrm byte must have mode=11). +	 *   If these restrictions will ever be lifted, +	 *   we'll need code to prevent selection of di as scratch reg! +	 * +	 * Summary: I don't know any insns with modrm byte which +	 * use SI register implicitly. DI register is used only +	 * by one insn (maskmovq) and BX register is used +	 * only by one too (cmpxchg8b). +	 * BP is stack-segment based (may be a problem?). +	 * AX, DX, CX are off-limits (many implicit users). +	 * SP is unusable (it's stack pointer - think about "pop mem"; +	 * also, rsp+disp32 needs sib encoding -> insn length change). +	 */ + +	reg = MODRM_REG(insn);	/* Fetch modrm.reg */ +	reg2 = 0xff;		/* Fetch vex.vvvv */ +	if (insn->vex_prefix.nbytes == 2) +		reg2 = insn->vex_prefix.bytes[1]; +	else if (insn->vex_prefix.nbytes == 3) +		reg2 = insn->vex_prefix.bytes[2]; +	/* +	 * TODO: add XOP, EXEV vvvv reading. +	 * +	 * vex.vvvv field is in bits 6-3, bits are inverted. +	 * But in 32-bit mode, high-order bit may be ignored. +	 * Therefore, let's consider only 3 low-order bits. +	 */ +	reg2 = ((reg2 >> 3) & 0x7) ^ 0x7; +	/* +	 * Register numbering is ax,cx,dx,bx, sp,bp,si,di, r8..r15. +	 * +	 * Choose scratch reg. Order is important: must not select bx +	 * if we can use si (cmpxchg8b case!) +	 */ +	if (reg != 6 && reg2 != 6) { +		reg2 = 6; +		auprobe->def.fixups |= UPROBE_FIX_RIP_SI; +	} else if (reg != 7 && reg2 != 7) { +		reg2 = 7; +		auprobe->def.fixups |= UPROBE_FIX_RIP_DI; +		/* TODO (paranoia): force maskmovq to not use di */ +	} else { +		reg2 = 3; +		auprobe->def.fixups |= UPROBE_FIX_RIP_BX; +	} +	/*  	 * Point cursor at the modrm byte.  The next 4 bytes are the  	 * displacement.  Beyond the displacement, for some instructions,  	 * is the immediate operand.  	 */  	cursor = auprobe->insn + insn_offset_modrm(insn); -	insn_get_length(insn); -  	/* -	 * Convert from rip-relative addressing to indirect addressing -	 * via a scratch register.  Change the r/m field from 0x5 (%rip) -	 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field. +	 * Change modrm from "00 reg 101" to "10 reg reg2". Example: +	 * 89 05 disp32  mov %eax,disp32(%rip) becomes +	 * 89 86 disp32  mov %eax,disp32(%rsi)  	 */ -	reg = MODRM_REG(insn); -	if (reg == 0) { -		/* -		 * The register operand (if any) is either the A register -		 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the -		 * REX prefix) %r8.  In any case, we know the C register -		 * is NOT the register operand, so we use %rcx (register -		 * #1) for the scratch register. -		 */ -		auprobe->fixups = UPROBE_FIX_RIP_CX; -		/* Change modrm from 00 000 101 to 00 000 001. */ -		*cursor = 0x1; -	} else { -		/* Use %rax (register #0) for the scratch register. */ -		auprobe->fixups = UPROBE_FIX_RIP_AX; -		/* Change modrm from 00 xxx 101 to 00 xxx 000 */ -		*cursor = (reg << 3); -	} - -	/* Target address = address of next instruction + (signed) offset */ -	auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value; +	*cursor = 0x80 | (reg << 3) | reg2; +} -	/* Displacement field is gone; slide immediate field (if any) over. 
*/ -	if (insn->immediate.nbytes) { -		cursor++; -		memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes); -	} +static inline unsigned long * +scratch_reg(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +	if (auprobe->def.fixups & UPROBE_FIX_RIP_SI) +		return ®s->si; +	if (auprobe->def.fixups & UPROBE_FIX_RIP_DI) +		return ®s->di; +	return ®s->bx;  }  /*   * If we're emulating a rip-relative instruction, save the contents   * of the scratch register and store the target address in that register.   */ -static void -pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, -				struct arch_uprobe_task *autask) -{ -	if (auprobe->fixups & UPROBE_FIX_RIP_AX) { -		autask->saved_scratch_register = regs->ax; -		regs->ax = current->utask->vaddr; -		regs->ax += auprobe->rip_rela_target_address; -	} else if (auprobe->fixups & UPROBE_FIX_RIP_CX) { -		autask->saved_scratch_register = regs->cx; -		regs->cx = current->utask->vaddr; -		regs->cx += auprobe->rip_rela_target_address; -	} -} - -static void -handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction) +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  { -	if (auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) { -		struct arch_uprobe_task *autask; - -		autask = ¤t->utask->autask; -		if (auprobe->fixups & UPROBE_FIX_RIP_AX) -			regs->ax = autask->saved_scratch_register; -		else -			regs->cx = autask->saved_scratch_register; +	if (auprobe->def.fixups & UPROBE_FIX_RIP_MASK) { +		struct uprobe_task *utask = current->utask; +		unsigned long *sr = scratch_reg(auprobe, regs); -		/* -		 * The original instruction includes a displacement, and so -		 * is 4 bytes longer than what we've just single-stepped. -		 * Caller may need to apply other fixups to handle stuff -		 * like "jmpq *...(%rip)" and "callq *...(%rip)". -		 */ -		if (correction) -			*correction += 4; +		utask->autask.saved_scratch_register = *sr; +		*sr = utask->vaddr + auprobe->def.ilen;  	}  } -static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn) +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  { -	insn_init(insn, auprobe->insn, true); - -	/* Skip good instruction prefixes; reject "bad" ones. 
*/ -	insn_get_opcode(insn); -	if (is_prefix_bad(insn)) -		return -ENOTSUPP; +	if (auprobe->def.fixups & UPROBE_FIX_RIP_MASK) { +		struct uprobe_task *utask = current->utask; +		unsigned long *sr = scratch_reg(auprobe, regs); -	if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64)) -		return 0; - -	if (insn->opcode.nbytes == 2) { -		if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns)) -			return 0; +		*sr = utask->autask.saved_scratch_register;  	} -	return -ENOTSUPP;  } - -static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn) +#else /* 32-bit: */ +static inline bool is_64bit_mm(struct mm_struct *mm)  { -	if (mm->context.ia32_compat) -		return validate_insn_32bits(auprobe, insn); -	return validate_insn_64bits(auprobe, insn); +	return false;  } -#else /* 32-bit: */  /*   * No RIP-relative addressing on 32-bit   */ -static void handle_riprel_insn(struct arch_uprobe *auprobe, struct insn *insn) +static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn)  {  } -static void pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs, -				struct arch_uprobe_task *autask) +static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  {  } -static void handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, -					long *correction) +static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  {  } - -static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm,  struct insn *insn) -{ -	return validate_insn_32bits(auprobe, insn); -}  #endif /* CONFIG_X86_64 */  struct uprobe_xol_ops {  	bool	(*emulate)(struct arch_uprobe *, struct pt_regs *);  	int	(*pre_xol)(struct arch_uprobe *, struct pt_regs *);  	int	(*post_xol)(struct arch_uprobe *, struct pt_regs *); +	void	(*abort)(struct arch_uprobe *, struct pt_regs *);  };  static inline int sizeof_long(void) @@ -415,50 +462,67 @@ static inline int sizeof_long(void)  static int default_pre_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  { -	pre_xol_rip_insn(auprobe, regs, ¤t->utask->autask); +	riprel_pre_xol(auprobe, regs);  	return 0;  } -/* - * Adjust the return address pushed by a call insn executed out of line. - */ -static int adjust_ret_addr(unsigned long sp, long correction) +static int push_ret_address(struct pt_regs *regs, unsigned long ip)  { -	int rasize = sizeof_long(); -	long ra; - -	if (copy_from_user(&ra, (void __user *)sp, rasize)) -		return -EFAULT; +	unsigned long new_sp = regs->sp - sizeof_long(); -	ra += correction; -	if (copy_to_user((void __user *)sp, &ra, rasize)) +	if (copy_to_user((void __user *)new_sp, &ip, sizeof_long()))  		return -EFAULT; +	regs->sp = new_sp;  	return 0;  } +/* + * We have to fix things up as follows: + * + * Typically, the new ip is relative to the copied instruction.  We need + * to make it relative to the original instruction (FIX_IP).  Exceptions + * are return instructions and absolute or indirect jump or call instructions. + * + * If the single-stepped instruction was a call, the return address that + * is atop the stack is the address following the copied instruction.  We + * need to make it the address following the original instruction (FIX_CALL). + * + * If the original instruction was a rip-relative instruction such as + * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent + * instruction using a scratch register -- e.g., "movl %edx,0xnnnn(%rsi)". 
+ * We need to restore the contents of the scratch register + * (FIX_RIP_reg). + */  static int default_post_xol_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  {  	struct uprobe_task *utask = current->utask; -	long correction = (long)(utask->vaddr - utask->xol_vaddr); -	handle_riprel_post_xol(auprobe, regs, &correction); -	if (auprobe->fixups & UPROBE_FIX_IP) +	riprel_post_xol(auprobe, regs); +	if (auprobe->def.fixups & UPROBE_FIX_IP) { +		long correction = utask->vaddr - utask->xol_vaddr;  		regs->ip += correction; - -	if (auprobe->fixups & UPROBE_FIX_CALL) { -		if (adjust_ret_addr(regs->sp, correction)) { -			regs->sp += sizeof_long(); +	} else if (auprobe->def.fixups & UPROBE_FIX_CALL) { +		regs->sp += sizeof_long(); +		if (push_ret_address(regs, utask->vaddr + auprobe->def.ilen))  			return -ERESTART; -		}  	} +	/* popf; tell the caller to not touch TF */ +	if (auprobe->def.fixups & UPROBE_FIX_SETF) +		utask->autask.saved_tf = true;  	return 0;  } +static void default_abort_op(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +	riprel_post_xol(auprobe, regs); +} +  static struct uprobe_xol_ops default_xol_ops = {  	.pre_xol  = default_pre_xol_op,  	.post_xol = default_post_xol_op, +	.abort	  = default_abort_op,  };  static bool branch_is_call(struct arch_uprobe *auprobe) @@ -520,7 +584,6 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  	unsigned long offs = (long)auprobe->branch.offs;  	if (branch_is_call(auprobe)) { -		unsigned long new_sp = regs->sp - sizeof_long();  		/*  		 * If it fails we execute this (mangled, see the comment in  		 * branch_clear_offset) insn out-of-line. In the likely case @@ -530,9 +593,8 @@ static bool branch_emulate_op(struct arch_uprobe *auprobe, struct pt_regs *regs)  		 *  		 * But there is corner case, see the comment in ->post_xol().  		 */ -		if (copy_to_user((void __user *)new_sp, &new_ip, sizeof_long())) +		if (push_ret_address(regs, new_ip))  			return false; -		regs->sp = new_sp;  	} else if (!check_jmp_cond(auprobe, regs)) {  		offs = 0;  	} @@ -583,11 +645,7 @@ static struct uprobe_xol_ops branch_xol_ops = {  static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)  {  	u8 opc1 = OPCODE1(insn); - -	/* has the side-effect of processing the entire instruction */ -	insn_get_length(insn); -	if (WARN_ON_ONCE(!insn_complete(insn))) -		return -ENOEXEC; +	int i;  	switch (opc1) {  	case 0xeb:	/* jmp 8 */ @@ -612,6 +670,16 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)  			return -ENOSYS;  	} +	/* +	 * 16-bit overrides such as CALLW (66 e8 nn nn) are not supported. +	 * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. +	 * No one uses these insns, reject any branch insns with such prefix. 
+	 */ +	for (i = 0; i < insn->prefixes.nbytes; i++) { +		if (insn->prefixes.bytes[i] == 0x66) +			return -ENOTSUPP; +	} +  	auprobe->branch.opc1 = opc1;  	auprobe->branch.ilen = insn->length;  	auprobe->branch.offs = insn->immediate.value; @@ -630,10 +698,10 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)  int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)  {  	struct insn insn; -	bool fix_ip = true, fix_call = false; +	u8 fix_ip_or_call = UPROBE_FIX_IP;  	int ret; -	ret = validate_insn_bits(auprobe, mm, &insn); +	ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));  	if (ret)  		return ret; @@ -642,44 +710,40 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm,  		return ret;  	/* -	 * Figure out which fixups arch_uprobe_post_xol() will need to perform, -	 * and annotate arch_uprobe->fixups accordingly. To start with, ->fixups -	 * is either zero or it reflects rip-related fixups. +	 * Figure out which fixups default_post_xol_op() will need to perform, +	 * and annotate def->fixups accordingly. To start with, ->fixups is +	 * either zero or it reflects rip-related fixups.  	 */  	switch (OPCODE1(&insn)) {  	case 0x9d:		/* popf */ -		auprobe->fixups |= UPROBE_FIX_SETF; +		auprobe->def.fixups |= UPROBE_FIX_SETF;  		break;  	case 0xc3:		/* ret or lret -- ip is correct */  	case 0xcb:  	case 0xc2:  	case 0xca: -		fix_ip = false; +	case 0xea:		/* jmp absolute -- ip is correct */ +		fix_ip_or_call = 0;  		break;  	case 0x9a:		/* call absolute - Fix return addr, not ip */ -		fix_call = true; -		fix_ip = false; -		break; -	case 0xea:		/* jmp absolute -- ip is correct */ -		fix_ip = false; +		fix_ip_or_call = UPROBE_FIX_CALL;  		break;  	case 0xff: -		insn_get_modrm(&insn);  		switch (MODRM_REG(&insn)) {  		case 2: case 3:			/* call or lcall, indirect */ -			fix_call = true; +			fix_ip_or_call = UPROBE_FIX_CALL; +			break;  		case 4: case 5:			/* jmp or ljmp, indirect */ -			fix_ip = false; +			fix_ip_or_call = 0; +			break;  		}  		/* fall through */  	default: -		handle_riprel_insn(auprobe, &insn); +		riprel_analyze(auprobe, &insn);  	} -	if (fix_ip) -		auprobe->fixups |= UPROBE_FIX_IP; -	if (fix_call) -		auprobe->fixups |= UPROBE_FIX_CALL; +	auprobe->def.ilen = insn.length; +	auprobe->def.fixups |= fix_ip_or_call;  	auprobe->ops = &default_xol_ops;  	return 0; @@ -694,6 +758,12 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  {  	struct uprobe_task *utask = current->utask; +	if (auprobe->ops->pre_xol) { +		int err = auprobe->ops->pre_xol(auprobe, regs); +		if (err) +			return err; +	} +  	regs->ip = utask->xol_vaddr;  	utask->autask.saved_trap_nr = current->thread.trap_nr;  	current->thread.trap_nr = UPROBE_TRAP_NR; @@ -703,8 +773,6 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  	if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))  		set_task_blockstep(current, false); -	if (auprobe->ops->pre_xol) -		return auprobe->ops->pre_xol(auprobe, regs);  	return 0;  } @@ -732,56 +800,42 @@ bool arch_uprobe_xol_was_trapped(struct task_struct *t)   * single-step, we single-stepped a copy of the instruction.   *   * This function prepares to resume execution after the single-step. - * We have to fix things up as follows: - * - * Typically, the new ip is relative to the copied instruction.  We need - * to make it relative to the original instruction (FIX_IP).  
Exceptions - * are return instructions and absolute or indirect jump or call instructions. - * - * If the single-stepped instruction was a call, the return address that - * is atop the stack is the address following the copied instruction.  We - * need to make it the address following the original instruction (FIX_CALL). - * - * If the original instruction was a rip-relative instruction such as - * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent - * instruction using a scratch register -- e.g., "movl %edx,(%rax)". - * We need to restore the contents of the scratch register and adjust - * the ip, keeping in mind that the instruction we executed is 4 bytes - * shorter than the original instruction (since we squeezed out the offset - * field).  (FIX_RIP_AX or FIX_RIP_CX)   */  int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  {  	struct uprobe_task *utask = current->utask; +	bool send_sigtrap = utask->autask.saved_tf; +	int err = 0;  	WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR); +	current->thread.trap_nr = utask->autask.saved_trap_nr;  	if (auprobe->ops->post_xol) { -		int err = auprobe->ops->post_xol(auprobe, regs); +		err = auprobe->ops->post_xol(auprobe, regs);  		if (err) { -			arch_uprobe_abort_xol(auprobe, regs);  			/* -			 * Restart the probed insn. ->post_xol() must ensure -			 * this is really possible if it returns -ERESTART. +			 * Restore ->ip for restart or post mortem analysis. +			 * ->post_xol() must not return -ERESTART unless this +			 * is really possible.  			 */ +			regs->ip = utask->vaddr;  			if (err == -ERESTART) -				return 0; -			return err; +				err = 0; +			send_sigtrap = false;  		}  	} - -	current->thread.trap_nr = utask->autask.saved_trap_nr;  	/*  	 * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP  	 * so we can get an extra SIGTRAP if we do not clear TF. We need  	 * to examine the opcode to make it right.  	 */ -	if (utask->autask.saved_tf) +	if (send_sigtrap)  		send_sig(SIGTRAP, current, 0); -	else if (!(auprobe->fixups & UPROBE_FIX_SETF)) + +	if (!utask->autask.saved_tf)  		regs->flags &= ~X86_EFLAGS_TF; -	return 0; +	return err;  }  /* callback routine for handling exceptions. */ @@ -815,18 +869,18 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,  /*   * This function gets called when XOL instruction either gets trapped or - * the thread has a fatal signal, or if arch_uprobe_post_xol() failed. - * Reset the instruction pointer to its probed address for the potential - * restart or for post mortem analysis. + * the thread has a fatal signal. Reset the instruction pointer to its + * probed address for the potential restart or for post mortem analysis.   
*/  void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)  {  	struct uprobe_task *utask = current->utask; -	current->thread.trap_nr = utask->autask.saved_trap_nr; -	handle_riprel_post_xol(auprobe, regs, NULL); -	instruction_pointer_set(regs, utask->vaddr); +	if (auprobe->ops->abort) +		auprobe->ops->abort(auprobe, regs); +	current->thread.trap_nr = utask->autask.saved_trap_nr; +	regs->ip = utask->vaddr;  	/* clear TF if it was set by us in arch_uprobe_pre_xol() */  	if (!utask->autask.saved_tf)  		regs->flags &= ~X86_EFLAGS_TF; diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index edff2b97b864..88c3b7e8b384 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -102,6 +102,7 @@ extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, u  extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);  extern bool __weak is_trap_insn(uprobe_opcode_t *insn);  extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);  extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);  extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);  extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); @@ -130,6 +131,9 @@ extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *r  #else /* !CONFIG_UPROBES */  struct uprobes_state {  }; + +#define uprobe_get_trap_addr(regs)	instruction_pointer(regs) +  static inline int  uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)  { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d1edc5e6fd03..3b02c72938a8 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -127,7 +127,7 @@ struct xol_area {   */  static bool valid_vma(struct vm_area_struct *vma, bool is_register)  { -	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED; +	vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;  	if (is_register)  		flags |= VM_WRITE; @@ -279,18 +279,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t   * supported by that architecture then we need to modify is_trap_at_addr and   * uprobe_write_opcode accordingly. This would never be a problem for archs   * that have fixed length instructions. - */ - -/* + *   * uprobe_write_opcode - write the opcode at a given virtual address.   * @mm: the probed process address space.   * @vaddr: the virtual address to store the opcode.   * @opcode: opcode to be written at @vaddr.   * - * Called with mm->mmap_sem held (for read and with a reference to - * mm). - * - * For mm @mm, write the opcode at @vaddr. + * Called with mm->mmap_sem held for write.   * Return 0 (success) or a negative errno.   
*/  int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, @@ -310,21 +305,25 @@ retry:  	if (ret <= 0)  		goto put_old; +	ret = anon_vma_prepare(vma); +	if (ret) +		goto put_old; +  	ret = -ENOMEM;  	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);  	if (!new_page)  		goto put_old; -	__SetPageUptodate(new_page); +	if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) +		goto put_new; +	__SetPageUptodate(new_page);  	copy_highpage(new_page, old_page);  	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); -	ret = anon_vma_prepare(vma); -	if (ret) -		goto put_new; -  	ret = __replace_page(vma, vaddr, old_page, new_page); +	if (ret) +		mem_cgroup_uncharge_page(new_page);  put_new:  	page_cache_release(new_page); @@ -1352,6 +1351,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)  	return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;  } +unsigned long uprobe_get_trap_addr(struct pt_regs *regs) +{ +	struct uprobe_task *utask = current->utask; + +	if (unlikely(utask && utask->active_uprobe)) +		return utask->vaddr; + +	return instruction_pointer(regs); +} +  /*   * Called with no locks held.   * Called in context of a exiting or a exec-ing thread. diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c082a7441345..5a7f1a6b3b8b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1009,56 +1009,60 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)  	return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);  } -static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)  {  	bool done;  	write_lock(&tu->filter.rwlock);  	if (event->hw.tp_target) { -		/* -		 * event->parent != NULL means copy_process(), we can avoid -		 * uprobe_apply(). current->mm must be probed and we can rely -		 * on dup_mmap() which preserves the already installed bp's. -		 * -		 * attr.enable_on_exec means that exec/mmap will install the -		 * breakpoints we need. -		 */ +		list_del(&event->hw.tp_list);  		done = tu->filter.nr_systemwide || -			event->parent || event->attr.enable_on_exec || +			(event->hw.tp_target->flags & PF_EXITING) ||  			uprobe_filter_event(tu, event); -		list_add(&event->hw.tp_list, &tu->filter.perf_events);  	} else { +		tu->filter.nr_systemwide--;  		done = tu->filter.nr_systemwide; -		tu->filter.nr_systemwide++;  	}  	write_unlock(&tu->filter.rwlock);  	if (!done) -		uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); +		return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);  	return 0;  } -static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) +static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)  {  	bool done; +	int err;  	write_lock(&tu->filter.rwlock);  	if (event->hw.tp_target) { -		list_del(&event->hw.tp_list); +		/* +		 * event->parent != NULL means copy_process(), we can avoid +		 * uprobe_apply(). current->mm must be probed and we can rely +		 * on dup_mmap() which preserves the already installed bp's. +		 * +		 * attr.enable_on_exec means that exec/mmap will install the +		 * breakpoints we need. 
+		 */  		done = tu->filter.nr_systemwide || -			(event->hw.tp_target->flags & PF_EXITING) || +			event->parent || event->attr.enable_on_exec ||  			uprobe_filter_event(tu, event); +		list_add(&event->hw.tp_list, &tu->filter.perf_events);  	} else { -		tu->filter.nr_systemwide--;  		done = tu->filter.nr_systemwide; +		tu->filter.nr_systemwide++;  	}  	write_unlock(&tu->filter.rwlock); -	if (!done) -		uprobe_apply(tu->inode, tu->offset, &tu->consumer, false); - -	return 0; +	err = 0; +	if (!done) { +		err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true); +		if (err) +			uprobe_perf_close(tu, event); +	} +	return err;  }  static bool uprobe_perf_filter(struct uprobe_consumer *uc,  | 
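The largest part of this merge reworks how arch/x86/kernel/uprobes.c handles rip-relative instructions: riprel_analyze() now rewrites the ModRM byte from "00 reg 101" (rip-relative) to "10 reg reg2" (disp32 off a scratch register chosen among si/di/bx). The sketch below is a standalone user-space illustration of that byte rewrite, not kernel code; the example bytes are the ones quoted in the patch comment (`89 05 disp32`, i.e. `mov %eax,disp32(%rip)`, becoming `89 86 disp32`, i.e. `mov %eax,disp32(%rsi)`).

```c
#include <stdio.h>
#include <stdint.h>

/*
 * Standalone illustration (not kernel code) of the ModRM rewrite that
 * riprel_analyze() performs: "00 reg 101" (rip-relative) is turned into
 * "10 reg reg2" (disp32 relative to a scratch register).  Register
 * numbering: ax=0, cx=1, dx=2, bx=3, sp=4, bp=5, si=6, di=7.
 */
static uint8_t rewrite_modrm(uint8_t modrm, uint8_t scratch)
{
	uint8_t reg = (modrm >> 3) & 0x7;	/* MODRM.reg is left untouched */

	return 0x80 | (reg << 3) | scratch;	/* mode=10, r/m=scratch reg */
}

int main(void)
{
	/* 89 05 xx xx xx xx: mov %eax,disp32(%rip) */
	uint8_t insn[6] = { 0x89, 0x05, 0x78, 0x56, 0x34, 0x12 };

	insn[1] = rewrite_modrm(insn[1], 6 /* %rsi as scratch */);

	/* Prints "89 86 78 56 34 12": mov %eax,disp32(%rsi) */
	for (int i = 0; i < 6; i++)
		printf("%02x ", insn[i]);
	printf("\n");
	return 0;
}
```

At single-step time the kernel loads the scratch register with the probed instruction's original address plus its length (so disp32(%rsi) resolves to the same target the rip-relative form would have reached) and restores the register afterwards, as riprel_pre_xol()/riprel_post_xol() in the patch show.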
