Diffstat (limited to 'kernel'): 115 files changed, 7042 insertions, 3218 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f2a8b6246ce9..dc5c77544fd6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -3,12 +3,11 @@  #  obj-y     = fork.o exec_domain.o panic.o \ -	    cpu.o exit.o itimer.o time.o softirq.o resource.o \ -	    sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ +	    cpu.o exit.o softirq.o resource.o \ +	    sysctl.o sysctl_binary.o capability.o ptrace.o user.o \  	    signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ -	    extable.o params.o posix-timers.o \ -	    kthread.o sys_ni.o posix-cpu-timers.o \ -	    hrtimer.o nsproxy.o \ +	    extable.o params.o \ +	    kthread.o sys_ni.o nsproxy.o \  	    notifier.o ksysfs.o cred.o reboot.o \  	    async.o range.o groups.o smpboot.o @@ -87,6 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/  obj-$(CONFIG_TRACEPOINTS) += trace/  obj-$(CONFIG_IRQ_WORK) += irq_work.o  obj-$(CONFIG_CPU_PM) += cpu_pm.o +obj-$(CONFIG_NET) += bpf/  obj-$(CONFIG_PERF_EVENTS) += events/ @@ -105,27 +105,11 @@ targets += config_data.gz  $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE  	$(call if_changed,gzip) -      filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") +      filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")  targets += config_data.h  $(obj)/config_data.h: $(obj)/config_data.gz FORCE  	$(call filechk,ikconfiggz) -$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE  $@ -      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE -	$(call if_changed,hzfile) - -quiet_cmd_bc  = BC      $@ -      cmd_bc  = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE -	$(call if_changed,bc) -  ###############################################################################  #  # Roll all the X.509 certificates that we can find together and pull them into diff --git a/kernel/acct.c b/kernel/acct.c index 808a86ff229d..51793520566f 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -141,12 +141,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)  	if (acct->active) {  		if (act < 0) {  			acct->active = 0; -			printk(KERN_INFO "Process accounting paused\n"); +			pr_info("Process accounting paused\n");  		}  	} else {  		if (act > 0) {  			acct->active = 1; -			printk(KERN_INFO "Process accounting resumed\n"); +			pr_info("Process accounting resumed\n");  		}  	} @@ -261,6 +261,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)  	if (name) {  		struct filename *tmp = getname(name); +  		if (IS_ERR(tmp))  			return PTR_ERR(tmp);  		error = acct_on(tmp); @@ -376,7 +377,7 @@ static comp_t encode_comp_t(unsigned long value)  	return exp;  } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2  /*   * encode an u64 into a comp2_t (24 bits)   * @@ -389,7 +390,7 @@ static comp_t encode_comp_t(unsigned long value)  #define MANTSIZE2       20                      /* 20 bit mantissa. */  #define EXPSIZE2        5                       /* 5 bit base 2 exponent. */  #define MAXFRACT2       ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2         ((1 <<EXPSIZE2) - 1)    /* Maximum exponent. */ +#define MAXEXP2         ((1 << EXPSIZE2) - 1)    /* Maximum exponent. 
*/  static comp2_t encode_comp2_t(u64 value)  { @@ -420,7 +421,7 @@ static comp2_t encode_comp2_t(u64 value)  }  #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3  /*   * encode an u64 into a 32 bit IEEE float   */ @@ -429,8 +430,9 @@ static u32 encode_float(u64 value)  	unsigned exp = 190;  	unsigned u; -	if (value==0) return 0; -	while ((s64)value > 0){ +	if (value == 0) +		return 0; +	while ((s64)value > 0) {  		value <<= 1;  		exp--;  	} @@ -458,9 +460,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,  	acct_t ac;  	mm_segment_t fs;  	unsigned long flim; -	u64 elapsed; -	u64 run_time; -	struct timespec uptime; +	u64 elapsed, run_time;  	struct tty_struct *tty;  	const struct cred *orig_cred; @@ -484,22 +484,21 @@ static void do_acct_process(struct bsd_acct_struct *acct,  	strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));  	/* calculate run_time in nsec*/ -	do_posix_clock_monotonic_gettime(&uptime); -	run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; -	run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC -		       + current->group_leader->start_time.tv_nsec; +	run_time = ktime_get_ns(); +	run_time -= current->group_leader->start_time;  	/* convert nsec -> AHZ */  	elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3  	ac.ac_etime = encode_float(elapsed);  #else  	ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? -	                       (unsigned long) elapsed : (unsigned long) -1l); +				(unsigned long) elapsed : (unsigned long) -1l);  #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2  	{  		/* new enlarged etime field */  		comp2_t etime = encode_comp2_t(elapsed); +  		ac.ac_etime_hi = etime >> 16;  		ac.ac_etime_lo = (u16) etime;  	} @@ -509,15 +508,15 @@ static void do_acct_process(struct bsd_acct_struct *acct,  	/* we really need to bite the bullet and change layout */  	ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);  	ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 +#if ACCT_VERSION == 2  	ac.ac_ahz = AHZ;  #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2  	/* backward-compatible 16 bit fields */  	ac.ac_uid16 = ac.ac_uid;  	ac.ac_gid16 = ac.ac_gid;  #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3  	ac.ac_pid = task_tgid_nr_ns(current, ns);  	rcu_read_lock();  	ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@ -578,6 +577,7 @@ void acct_collect(long exitcode, int group_dead)  	if (group_dead && current->mm) {  		struct vm_area_struct *vma; +  		down_read(¤t->mm->mmap_sem);  		vma = current->mm->mmap;  		while (vma) { diff --git a/kernel/audit.c b/kernel/audit.c index 3ef2e0e797e8..ba2ff5a5c600 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1677,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)  	audit_log_format(ab, " %s=", prefix);  	CAP_FOR_EACH_U32(i) {  		audit_log_format(ab, "%08x", -				 cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); +				 cap->cap[CAP_LAST_U32 - i]);  	}  } diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c3dbb7..c447cd9848d1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count)  	if (unlikely(!entry))  		return NULL; -	fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); +	fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);  	if 
(unlikely(!fields)) {  		kfree(entry);  		return NULL; @@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES];  int __init audit_register_class(int class, unsigned *list)  { -	__u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); +	__u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);  	if (!p)  		return -ENOMEM;  	while (*list != ~0U) { diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246b04b8..e1d1d1952bfa 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,7 +9,6 @@  #include <linux/page-flags.h>  #include <linux/mmzone.h>  #include <linux/kbuild.h> -#include <linux/page_cgroup.h>  #include <linux/log2.h>  #include <linux/spinlock_types.h> @@ -18,7 +17,6 @@ void foo(void)  	/* The enum constants to put into include/generated/bounds.h */  	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);  	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); -	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);  #ifdef CONFIG_SMP  	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));  #endif diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile new file mode 100644 index 000000000000..6a71145e2769 --- /dev/null +++ b/kernel/bpf/Makefile @@ -0,0 +1 @@ +obj-y := core.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c new file mode 100644 index 000000000000..7f0dbcbb34af --- /dev/null +++ b/kernel/bpf/core.c @@ -0,0 +1,534 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Based on the design of the Berkeley Packet Filter. The new + * internal format has been designed by PLUMgrid: + * + *	Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + *	Jay Schulist <jschlst@samba.org> + *	Alexei Starovoitov <ast@plumgrid.com> + *	Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in bpf_check_classic() + */ +#include <linux/filter.h> +#include <linux/skbuff.h> +#include <asm/unaligned.h> + +/* Registers */ +#define BPF_R0	regs[BPF_REG_0] +#define BPF_R1	regs[BPF_REG_1] +#define BPF_R2	regs[BPF_REG_2] +#define BPF_R3	regs[BPF_REG_3] +#define BPF_R4	regs[BPF_REG_4] +#define BPF_R5	regs[BPF_REG_5] +#define BPF_R6	regs[BPF_REG_6] +#define BPF_R7	regs[BPF_REG_7] +#define BPF_R8	regs[BPF_REG_8] +#define BPF_R9	regs[BPF_REG_9] +#define BPF_R10	regs[BPF_REG_10] + +/* Named registers */ +#define DST	regs[insn->dst_reg] +#define SRC	regs[insn->src_reg] +#define FP	regs[BPF_REG_FP] +#define ARG1	regs[BPF_REG_ARG1] +#define CTX	regs[BPF_REG_CTX] +#define IMM	insn->imm + +/* No hurry in this branch + * + * Exported for the bpf jit load helper. + */ +void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) +{ +	u8 *ptr = NULL; + +	if (k >= SKF_NET_OFF) +		ptr = skb_network_header(skb) + k - SKF_NET_OFF; +	else if (k >= SKF_LL_OFF) +		ptr = skb_mac_header(skb) + k - SKF_LL_OFF; +	if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) +		return ptr; + +	return NULL; +} + +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. 
+ */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ +	return 0; +} + +/** + *	__bpf_prog_run - run eBPF program on a given context + *	@ctx: is the data we are operating on + *	@insn: is the array of eBPF instructions + * + * Decode and execute eBPF instructions. + */ +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +{ +	u64 stack[MAX_BPF_STACK / sizeof(u64)]; +	u64 regs[MAX_BPF_REG], tmp; +	static const void *jumptable[256] = { +		[0 ... 255] = &&default_label, +		/* Now overwrite non-defaults ... */ +		/* 32 bit ALU operations */ +		[BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, +		[BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, +		[BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, +		[BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, +		[BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, +		[BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, +		[BPF_ALU | BPF_OR | BPF_X]  = &&ALU_OR_X, +		[BPF_ALU | BPF_OR | BPF_K]  = &&ALU_OR_K, +		[BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, +		[BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, +		[BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, +		[BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, +		[BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, +		[BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, +		[BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, +		[BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, +		[BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, +		[BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, +		[BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, +		[BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, +		[BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, +		[BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, +		[BPF_ALU | BPF_NEG] = &&ALU_NEG, +		[BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, +		[BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, +		/* 64 bit ALU operations */ +		[BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, +		[BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, +		[BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, +		[BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, +		[BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, +		[BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, +		[BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, +		[BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, +		[BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, +		[BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, +		[BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, +		[BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, +		[BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, +		[BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, +		[BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, +		[BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, +		[BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, +		[BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, +		[BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, +		[BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, +		[BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, +		[BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, +		[BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, +		[BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, +		[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, +		/* Call instruction */ +		[BPF_JMP | BPF_CALL] = &&JMP_CALL, +		/* Jumps */ +		[BPF_JMP | BPF_JA] = &&JMP_JA, +		[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, +		[BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, +		[BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, +		[BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, +		[BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, +		[BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, +		[BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, +		[BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, +		[BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, +		
[BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, +		[BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, +		[BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, +		[BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, +		[BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, +		/* Program return */ +		[BPF_JMP | BPF_EXIT] = &&JMP_EXIT, +		/* Store instructions */ +		[BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, +		[BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, +		[BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, +		[BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, +		[BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, +		[BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, +		[BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, +		[BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, +		[BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, +		[BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, +		/* Load instructions */ +		[BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, +		[BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, +		[BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, +		[BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, +		[BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, +		[BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, +		[BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, +		[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, +		[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, +		[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, +	}; +	void *ptr; +	int off; + +#define CONT	 ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) + +	FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; +	ARG1 = (u64) (unsigned long) ctx; + +	/* Registers used in classic BPF programs need to be reset first. */ +	regs[BPF_REG_A] = 0; +	regs[BPF_REG_X] = 0; + +select_insn: +	goto *jumptable[insn->code]; + +	/* ALU */ +#define ALU(OPCODE, OP)			\ +	ALU64_##OPCODE##_X:		\ +		DST = DST OP SRC;	\ +		CONT;			\ +	ALU_##OPCODE##_X:		\ +		DST = (u32) DST OP (u32) SRC;	\ +		CONT;			\ +	ALU64_##OPCODE##_K:		\ +		DST = DST OP IMM;		\ +		CONT;			\ +	ALU_##OPCODE##_K:		\ +		DST = (u32) DST OP (u32) IMM;	\ +		CONT; + +	ALU(ADD,  +) +	ALU(SUB,  -) +	ALU(AND,  &) +	ALU(OR,   |) +	ALU(LSH, <<) +	ALU(RSH, >>) +	ALU(XOR,  ^) +	ALU(MUL,  *) +#undef ALU +	ALU_NEG: +		DST = (u32) -DST; +		CONT; +	ALU64_NEG: +		DST = -DST; +		CONT; +	ALU_MOV_X: +		DST = (u32) SRC; +		CONT; +	ALU_MOV_K: +		DST = (u32) IMM; +		CONT; +	ALU64_MOV_X: +		DST = SRC; +		CONT; +	ALU64_MOV_K: +		DST = IMM; +		CONT; +	ALU64_ARSH_X: +		(*(s64 *) &DST) >>= SRC; +		CONT; +	ALU64_ARSH_K: +		(*(s64 *) &DST) >>= IMM; +		CONT; +	ALU64_MOD_X: +		if (unlikely(SRC == 0)) +			return 0; +		tmp = DST; +		DST = do_div(tmp, SRC); +		CONT; +	ALU_MOD_X: +		if (unlikely(SRC == 0)) +			return 0; +		tmp = (u32) DST; +		DST = do_div(tmp, (u32) SRC); +		CONT; +	ALU64_MOD_K: +		tmp = DST; +		DST = do_div(tmp, IMM); +		CONT; +	ALU_MOD_K: +		tmp = (u32) DST; +		DST = do_div(tmp, (u32) IMM); +		CONT; +	ALU64_DIV_X: +		if (unlikely(SRC == 0)) +			return 0; +		do_div(DST, SRC); +		CONT; +	ALU_DIV_X: +		if (unlikely(SRC == 0)) +			return 0; +		tmp = (u32) DST; +		do_div(tmp, (u32) SRC); +		DST = (u32) tmp; +		CONT; +	ALU64_DIV_K: +		do_div(DST, IMM); +		CONT; +	ALU_DIV_K: +		tmp = (u32) DST; +		do_div(tmp, (u32) IMM); +		DST = (u32) tmp; +		CONT; +	ALU_END_TO_BE: +		switch (IMM) { +		case 16: +			DST = (__force u16) cpu_to_be16(DST); +			break; +		case 32: +			DST = (__force u32) cpu_to_be32(DST); +			break; +		case 64: +			DST = (__force u64) cpu_to_be64(DST); +			break; +		} +		CONT; +	ALU_END_TO_LE: +		switch (IMM) { +		case 16: +			DST = (__force u16) cpu_to_le16(DST); +			break; +		case 32: +			DST = (__force u32) cpu_to_le32(DST); +			
break; +		case 64: +			DST = (__force u64) cpu_to_le64(DST); +			break; +		} +		CONT; + +	/* CALL */ +	JMP_CALL: +		/* Function call scratches BPF_R1-BPF_R5 registers, +		 * preserves BPF_R6-BPF_R9, and stores return value +		 * into BPF_R0. +		 */ +		BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, +						       BPF_R4, BPF_R5); +		CONT; + +	/* JMP */ +	JMP_JA: +		insn += insn->off; +		CONT; +	JMP_JEQ_X: +		if (DST == SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JEQ_K: +		if (DST == IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JNE_X: +		if (DST != SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JNE_K: +		if (DST != IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGT_X: +		if (DST > SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGT_K: +		if (DST > IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGE_X: +		if (DST >= SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JGE_K: +		if (DST >= IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGT_X: +		if (((s64) DST) > ((s64) SRC)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGT_K: +		if (((s64) DST) > ((s64) IMM)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGE_X: +		if (((s64) DST) >= ((s64) SRC)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSGE_K: +		if (((s64) DST) >= ((s64) IMM)) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSET_X: +		if (DST & SRC) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_JSET_K: +		if (DST & IMM) { +			insn += insn->off; +			CONT_JMP; +		} +		CONT; +	JMP_EXIT: +		return BPF_R0; + +	/* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE)						\ +	STX_MEM_##SIZEOP:						\ +		*(SIZE *)(unsigned long) (DST + insn->off) = SRC;	\ +		CONT;							\ +	ST_MEM_##SIZEOP:						\ +		*(SIZE *)(unsigned long) (DST + insn->off) = IMM;	\ +		CONT;							\ +	LDX_MEM_##SIZEOP:						\ +		DST = *(SIZE *)(unsigned long) (SRC + insn->off);	\ +		CONT; + +	LDST(B,   u8) +	LDST(H,  u16) +	LDST(W,  u32) +	LDST(DW, u64) +#undef LDST +	STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ +		atomic_add((u32) SRC, (atomic_t *)(unsigned long) +			   (DST + insn->off)); +		CONT; +	STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ +		atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) +			     (DST + insn->off)); +		CONT; +	LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ +		off = IMM; +load_word: +		/* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are +		 * only appearing in the programs where ctx == +		 * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] +		 * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, +		 * internal BPF verifier will check that BPF_R6 == +		 * ctx. +		 * +		 * BPF_ABS and BPF_IND are wrappers of function calls, +		 * so they scratch BPF_R1-BPF_R5 registers, preserve +		 * BPF_R6-BPF_R9, and store return value into BPF_R0. 
+		 * +		 * Implicit input: +		 *   ctx == skb == BPF_R6 == CTX +		 * +		 * Explicit input: +		 *   SRC == any register +		 *   IMM == 32-bit immediate +		 * +		 * Output: +		 *   BPF_R0 - 8/16/32-bit skb data converted to cpu endianness +		 */ + +		ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); +		if (likely(ptr != NULL)) { +			BPF_R0 = get_unaligned_be32(ptr); +			CONT; +		} + +		return 0; +	LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ +		off = IMM; +load_half: +		ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); +		if (likely(ptr != NULL)) { +			BPF_R0 = get_unaligned_be16(ptr); +			CONT; +		} + +		return 0; +	LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ +		off = IMM; +load_byte: +		ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); +		if (likely(ptr != NULL)) { +			BPF_R0 = *(u8 *)ptr; +			CONT; +		} + +		return 0; +	LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ +		off = IMM + SRC; +		goto load_word; +	LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ +		off = IMM + SRC; +		goto load_half; +	LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ +		off = IMM + SRC; +		goto load_byte; + +	default_label: +		/* If we ever reach this, we have a bug somewhere. */ +		WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); +		return 0; +} + +void __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ +} + +/** + *	bpf_prog_select_runtime - select execution runtime for BPF program + *	@fp: bpf_prog populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via BPF_PROG_RUN() macro + */ +void bpf_prog_select_runtime(struct bpf_prog *fp) +{ +	fp->bpf_func = (void *) __bpf_prog_run; + +	/* Probe if internal BPF can be JITed */ +	bpf_int_jit_compile(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); + +/* free internal BPF program */ +void bpf_prog_free(struct bpf_prog *fp) +{ +	bpf_jit_free(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_free); diff --git a/kernel/capability.c b/kernel/capability.c index a5cf13c018ce..989f5bfc57dc 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -258,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)  		i++;  	} +	effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; +	permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; +	inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; +  	new = prepare_creds();  	if (!new)  		return -ENOMEM; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 70776aec2562..7dc8788cfd52 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root;   */  static bool cgrp_dfl_root_visible; +/* + * Set by the boot param of the same name and makes subsystems with NULL + * ->dfl_files to use ->legacy_files on the default hierarchy. 
+ */ +static bool cgroup_legacy_files_on_dfl; +  /* some controllers are not supported in the default hierarchy */ -static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 -#ifdef CONFIG_CGROUP_DEBUG -	| (1 << debug_cgrp_id) -#endif -	; +static unsigned int cgrp_dfl_root_inhibit_ss_mask;  /* The list of hierarchy roots */ @@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1;   */  static int need_forkexit_callback __read_mostly; -static struct cftype cgroup_base_files[]; +static struct cftype cgroup_dfl_base_files[]; +static struct cftype cgroup_legacy_base_files[];  static void cgroup_put(struct cgroup *cgrp);  static int rebind_subsystems(struct cgroup_root *dst_root,  			     unsigned int ss_mask);  static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, +		      bool visible);  static void css_release(struct percpu_ref *ref);  static void kill_css(struct cgroup_subsys_state *css);  static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp)  }  /** + * cgroup_refresh_child_subsys_mask - update child_subsys_mask + * @cgrp: the target cgroup + * + * On the default hierarchy, a subsystem may request other subsystems to be + * enabled together through its ->depends_on mask.  In such cases, more + * subsystems than specified in "cgroup.subtree_control" may be enabled. + * + * This function determines which subsystems need to be enabled given the + * current @cgrp->subtree_control and records it in + * @cgrp->child_subsys_mask.  The resulting mask is always a superset of + * @cgrp->subtree_control and follows the usual hierarchy rules. + */ +static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) +{ +	struct cgroup *parent = cgroup_parent(cgrp); +	unsigned int cur_ss_mask = cgrp->subtree_control; +	struct cgroup_subsys *ss; +	int ssid; + +	lockdep_assert_held(&cgroup_mutex); + +	if (!cgroup_on_dfl(cgrp)) { +		cgrp->child_subsys_mask = cur_ss_mask; +		return; +	} + +	while (true) { +		unsigned int new_ss_mask = cur_ss_mask; + +		for_each_subsys(ss, ssid) +			if (cur_ss_mask & (1 << ssid)) +				new_ss_mask |= ss->depends_on; + +		/* +		 * Mask out subsystems which aren't available.  This can +		 * happen only if some depended-upon subsystems were bound +		 * to non-default hierarchies. 
+		 */ +		if (parent) +			new_ss_mask &= parent->child_subsys_mask; +		else +			new_ss_mask &= cgrp->root->subsys_mask; + +		if (new_ss_mask == cur_ss_mask) +			break; +		cur_ss_mask = new_ss_mask; +	} + +	cgrp->child_subsys_mask = cur_ss_mask; +} + +/**   * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods   * @kn: the kernfs_node being serviced   * @@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)  		up_write(&css_set_rwsem);  		src_root->subsys_mask &= ~(1 << ssid); -		src_root->cgrp.child_subsys_mask &= ~(1 << ssid); +		src_root->cgrp.subtree_control &= ~(1 << ssid); +		cgroup_refresh_child_subsys_mask(&src_root->cgrp);  		/* default hierarchy doesn't enable controllers by default */  		dst_root->subsys_mask |= 1 << ssid; -		if (dst_root != &cgrp_dfl_root) -			dst_root->cgrp.child_subsys_mask |= 1 << ssid; +		if (dst_root != &cgrp_dfl_root) { +			dst_root->cgrp.subtree_control |= 1 << ssid; +			cgroup_refresh_child_subsys_mask(&dst_root->cgrp); +		}  		if (ss->bind)  			ss->bind(css); @@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq,  	for_each_subsys(ss, ssid)  		if (root->subsys_mask & (1 << ssid))  			seq_printf(seq, ",%s", ss->name); -	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) -		seq_puts(seq, ",sane_behavior");  	if (root->flags & CGRP_ROOT_NOPREFIX)  		seq_puts(seq, ",noprefix");  	if (root->flags & CGRP_ROOT_XATTR) @@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	bool all_ss = false, one_ss = false;  	unsigned int mask = -1U;  	struct cgroup_subsys *ss; +	int nr_opts = 0;  	int i;  #ifdef CONFIG_CPUSETS @@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	memset(opts, 0, sizeof(*opts));  	while ((token = strsep(&o, ",")) != NULL) { +		nr_opts++; +  		if (!*token)  			return -EINVAL;  		if (!strcmp(token, "none")) { @@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  			return -ENOENT;  	} -	/* Consistency checks */ -  	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {  		pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - -		if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || -		    opts->cpuset_clone_children || opts->release_agent || -		    opts->name) { -			pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); +		if (nr_opts != 1) { +			pr_err("sane_behavior: no other mount options allowed\n");  			return -EINVAL;  		} -	} else { -		/* -		 * If the 'all' option was specified select all the -		 * subsystems, otherwise if 'none', 'name=' and a subsystem -		 * name options were not specified, let's default to 'all' -		 */ -		if (all_ss || (!one_ss && !opts->none && !opts->name)) -			for_each_subsys(ss, i) -				if (!ss->disabled) -					opts->subsys_mask |= (1 << i); - -		/* -		 * We either have to specify by name or by subsystems. (So -		 * all empty hierarchies must have a name). 
-		 */ -		if (!opts->subsys_mask && !opts->name) -			return -EINVAL; +		return 0;  	}  	/* +	 * If the 'all' option was specified select all the subsystems, +	 * otherwise if 'none', 'name=' and a subsystem name options were +	 * not specified, let's default to 'all' +	 */ +	if (all_ss || (!one_ss && !opts->none && !opts->name)) +		for_each_subsys(ss, i) +			if (!ss->disabled) +				opts->subsys_mask |= (1 << i); + +	/* +	 * We either have to specify by name or by subsystems. (So all +	 * empty hierarchies must have a name). +	 */ +	if (!opts->subsys_mask && !opts->name) +		return -EINVAL; + +	/*  	 * Option noprefix was introduced just for backward compatibility  	 * with the old cpuset, so we allow noprefix only if mounting just  	 * the cpuset subsystem. @@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)  	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))  		return -EINVAL; -  	/* Can't specify "none" and some subsystems */  	if (opts->subsys_mask && opts->none)  		return -EINVAL; @@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)  	struct cgroup_sb_opts opts;  	unsigned int added_mask, removed_mask; -	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { -		pr_err("sane_behavior: remount is not allowed\n"); +	if (root == &cgrp_dfl_root) { +		pr_err("remount is not allowed\n");  		return -EINVAL;  	} @@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)  	removed_mask = root->subsys_mask & ~opts.subsys_mask;  	/* Don't allow flags or name to change at remount */ -	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || +	if ((opts.flags ^ root->flags) ||  	    (opts.name && strcmp(opts.name, root->name))) {  		pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", -		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", -		       root->flags & CGRP_ROOT_OPTION_MASK, root->name); +		       opts.flags, opts.name ?: "", root->flags, root->name);  		ret = -EINVAL;  		goto out_unlock;  	} @@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)  {  	LIST_HEAD(tmp_links);  	struct cgroup *root_cgrp = &root->cgrp; +	struct cftype *base_files;  	struct css_set *cset;  	int i, ret; @@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)  	}  	root_cgrp->kn = root->kf_root->kn; -	ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); +	if (root == &cgrp_dfl_root) +		base_files = cgroup_dfl_base_files; +	else +		base_files = cgroup_legacy_base_files; + +	ret = cgroup_addrm_files(root_cgrp, base_files, true);  	if (ret)  		goto destroy_root; @@ -1638,7 +1698,7 @@ destroy_root:  exit_root_id:  	cgroup_exit_root_id(root);  cancel_ref: -	percpu_ref_cancel_init(&root_cgrp->self.refcnt); +	percpu_ref_exit(&root_cgrp->self.refcnt);  out:  	free_cgrp_cset_links(&tmp_links);  	return ret; @@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  		goto out_unlock;  	/* look for a matching existing root */ -	if (!opts.subsys_mask && !opts.none && !opts.name) { +	if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {  		cgrp_dfl_root_visible = true;  		root = &cgrp_dfl_root;  		cgroup_get(&root->cgrp); @@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,  			goto out_unlock;  		} -		if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { -			if ((root->flags | opts.flags) & 
CGRP_ROOT_SANE_BEHAVIOR) { -				pr_err("sane_behavior: new mount options should match the existing superblock\n"); -				ret = -EINVAL; -				goto out_unlock; -			} else { -				pr_warn("new mount options do not match the existing superblock, will be ignored\n"); -			} -		} +		if (root->flags ^ opts.flags) +			pr_warn("new mount options do not match the existing superblock, will be ignored\n");  		/*  		 * We want to reuse @root whose lifetime is governed by its @@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v)  static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)  { -	struct cgroup *cgrp = seq_css(seq)->cgroup; - -	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); +	seq_puts(seq, "0\n");  	return 0;  } @@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v)  {  	struct cgroup *cgrp = seq_css(seq)->cgroup; -	cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); +	cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);  	return 0;  } @@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)  {  	struct cgroup *cgrp = seq_css(seq)->cgroup; -	cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); +	cgroup_print_ss_mask(seq, cgrp->subtree_control);  	return 0;  } @@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,  					    loff_t off)  {  	unsigned int enable = 0, disable = 0; +	unsigned int css_enable, css_disable, old_ctrl, new_ctrl;  	struct cgroup *cgrp, *child;  	struct cgroup_subsys *ss;  	char *tok; @@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,  	for_each_subsys(ss, ssid) {  		if (enable & (1 << ssid)) { -			if (cgrp->child_subsys_mask & (1 << ssid)) { +			if (cgrp->subtree_control & (1 << ssid)) {  				enable &= ~(1 << ssid);  				continue;  			} +			/* unavailable or not enabled on the parent? */ +			if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || +			    (cgroup_parent(cgrp) && +			     !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { +				ret = -ENOENT; +				goto out_unlock; +			} + +			/* +			 * @ss is already enabled through dependency and +			 * we'll just make it visible.  Skip draining. +			 */ +			if (cgrp->child_subsys_mask & (1 << ssid)) +				continue; +  			/*  			 * Because css offlining is asynchronous, userland  			 * might try to re-enable the same controller while @@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,  				return restart_syscall();  			} - -			/* unavailable or not enabled on the parent? */ -			if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || -			    (cgroup_parent(cgrp) && -			     !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { -				ret = -ENOENT; -				goto out_unlock; -			}  		} else if (disable & (1 << ssid)) { -			if (!(cgrp->child_subsys_mask & (1 << ssid))) { +			if (!(cgrp->subtree_control & (1 << ssid))) {  				disable &= ~(1 << ssid);  				continue;  			}  			/* a child has it enabled? 
*/  			cgroup_for_each_live_child(child, cgrp) { -				if (child->child_subsys_mask & (1 << ssid)) { +				if (child->subtree_control & (1 << ssid)) {  					ret = -EBUSY;  					goto out_unlock;  				} @@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,  	}  	/* -	 * Except for the root, child_subsys_mask must be zero for a cgroup +	 * Except for the root, subtree_control must be zero for a cgroup  	 * with tasks so that child cgroups don't compete against tasks.  	 */  	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { @@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,  	}  	/* -	 * Create csses for enables and update child_subsys_mask.  This -	 * changes cgroup_e_css() results which in turn makes the -	 * subsequent cgroup_update_dfl_csses() associate all tasks in the -	 * subtree to the updated csses. +	 * Update subsys masks and calculate what needs to be done.  More +	 * subsystems than specified may need to be enabled or disabled +	 * depending on subsystem dependencies. +	 */ +	cgrp->subtree_control |= enable; +	cgrp->subtree_control &= ~disable; + +	old_ctrl = cgrp->child_subsys_mask; +	cgroup_refresh_child_subsys_mask(cgrp); +	new_ctrl = cgrp->child_subsys_mask; + +	css_enable = ~old_ctrl & new_ctrl; +	css_disable = old_ctrl & ~new_ctrl; +	enable |= css_enable; +	disable |= css_disable; + +	/* +	 * Create new csses or make the existing ones visible.  A css is +	 * created invisible if it's being implicitly enabled through +	 * dependency.  An invisible css is made visible when the userland +	 * explicitly enables it.  	 */  	for_each_subsys(ss, ssid) {  		if (!(enable & (1 << ssid)))  			continue;  		cgroup_for_each_live_child(child, cgrp) { -			ret = create_css(child, ss); +			if (css_enable & (1 << ssid)) +				ret = create_css(child, ss, +					cgrp->subtree_control & (1 << ssid)); +			else +				ret = cgroup_populate_dir(child, 1 << ssid);  			if (ret)  				goto err_undo_css;  		}  	} -	cgrp->child_subsys_mask |= enable; -	cgrp->child_subsys_mask &= ~disable; - +	/* +	 * At this point, cgroup_e_css() results reflect the new csses +	 * making the following cgroup_update_dfl_csses() properly update +	 * css associations of all tasks in the subtree. +	 */  	ret = cgroup_update_dfl_csses(cgrp);  	if (ret)  		goto err_undo_css; -	/* all tasks are now migrated away from the old csses, kill them */ +	/* +	 * All tasks are migrated out of disabled csses.  Kill or hide +	 * them.  A css is hidden when the userland requests it to be +	 * disabled while other subsystems are still depending on it.  The +	 * css must not actively control resources and be in the vanilla +	 * state if it's made visible again later.  Controllers which may +	 * be depended upon should provide ->css_reset() for this purpose. 
+	 */  	for_each_subsys(ss, ssid) {  		if (!(disable & (1 << ssid)))  			continue; -		cgroup_for_each_live_child(child, cgrp) -			kill_css(cgroup_css(child, ss)); +		cgroup_for_each_live_child(child, cgrp) { +			struct cgroup_subsys_state *css = cgroup_css(child, ss); + +			if (css_disable & (1 << ssid)) { +				kill_css(css); +			} else { +				cgroup_clear_dir(child, 1 << ssid); +				if (ss->css_reset) +					ss->css_reset(css); +			} +		}  	}  	kernfs_activate(cgrp->kn); @@ -2755,8 +2853,9 @@ out_unlock:  	return ret ?: nbytes;  err_undo_css: -	cgrp->child_subsys_mask &= ~enable; -	cgrp->child_subsys_mask |= disable; +	cgrp->subtree_control &= ~enable; +	cgrp->subtree_control |= disable; +	cgroup_refresh_child_subsys_mask(cgrp);  	for_each_subsys(ss, ssid) {  		if (!(enable & (1 << ssid))) @@ -2764,8 +2863,14 @@ err_undo_css:  		cgroup_for_each_live_child(child, cgrp) {  			struct cgroup_subsys_state *css = cgroup_css(child, ss); -			if (css) + +			if (!css) +				continue; + +			if (css_enable & (1 << ssid))  				kill_css(css); +			else +				cgroup_clear_dir(child, 1 << ssid);  		}  	}  	goto out_unlock; @@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,  	/*  	 * This isn't a proper migration and its usefulness is very -	 * limited.  Disallow if sane_behavior. +	 * limited.  Disallow on the default hierarchy.  	 */ -	if (cgroup_sane_behavior(cgrp)) +	if (cgroup_on_dfl(cgrp))  		return -EPERM;  	/* @@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],  	for (cft = cfts; cft->name[0] != '\0'; cft++) {  		/* does cft->flags tell us to skip this file on @cgrp? */ -		if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) +		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))  			continue; -		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) +		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))  			continue;  		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))  			continue; @@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts)  			kfree(cft->kf_ops);  		cft->kf_ops = NULL;  		cft->ss = NULL; + +		/* revert flags set by cgroup core while adding @cfts */ +		cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);  	}  } @@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)   * function currently returns 0 as long as @cfts registration is successful   * even if some file creation attempts on existing cgroups fail.   */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  {  	int ret; @@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)  }  /** + * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the default hierarchy. 
+ */ +int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ +	struct cftype *cft; + +	for (cft = cfts; cft && cft->name[0] != '\0'; cft++) +		cft->flags |= __CFTYPE_ONLY_ON_DFL; +	return cgroup_add_cftypes(ss, cfts); +} + +/** + * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Similar to cgroup_add_cftypes() but the added files are only used for + * the legacy hierarchies. + */ +int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ +	struct cftype *cft; + +	for (cft = cfts; cft && cft->name[0] != '\0'; cft++) +		cft->flags |= __CFTYPE_NOT_ON_DFL; +	return cgroup_add_cftypes(ss, cfts); +} + +/**   * cgroup_task_count - count the number of tasks in a cgroup.   * @cgrp: the cgroup in question   * @@ -3699,8 +3841,9 @@ after:   *   * All this extra complexity was caused by the original implementation   * committing to an entirely unnecessary property.  In the long term, we - * want to do away with it.  Explicitly scramble sort order if - * sane_behavior so that no such expectation exists in the new interface. + * want to do away with it.  Explicitly scramble sort order if on the + * default hierarchy so that no such expectation exists in the new + * interface.   *   * Scrambling is done by swapping every two consecutive bits, which is   * non-identity one-to-one mapping which disturbs sort order sufficiently. @@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid)  static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)  { -	if (cgroup_sane_behavior(cgrp)) +	if (cgroup_on_dfl(cgrp))  		return pid_fry(pid);  	else  		return pid; @@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,  	css_task_iter_end(&it);  	length = n;  	/* now sort & (if procs) strip out duplicates */ -	if (cgroup_sane_behavior(cgrp)) +	if (cgroup_on_dfl(cgrp))  		sort(array, length, sizeof(pid_t), fried_cmppid, NULL);  	else  		sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,  	return 0;  } -static struct cftype cgroup_base_files[] = { +/* cgroup core interface files for the default hierarchy */ +static struct cftype cgroup_dfl_base_files[] = {  	{  		.name = "cgroup.procs",  		.seq_start = cgroup_pidlist_start, @@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = {  		.mode = S_IRUGO | S_IWUSR,  	},  	{ -		.name = "cgroup.clone_children", -		.flags = CFTYPE_INSANE, -		.read_u64 = cgroup_clone_children_read, -		.write_u64 = cgroup_clone_children_write, -	}, -	{ -		.name = "cgroup.sane_behavior", -		.flags = CFTYPE_ONLY_ON_ROOT, -		.seq_show = cgroup_sane_behavior_show, -	}, -	{  		.name = "cgroup.controllers", -		.flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, +		.flags = CFTYPE_ONLY_ON_ROOT,  		.seq_show = cgroup_root_controllers_show,  	},  	{  		.name = "cgroup.controllers", -		.flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, +		.flags = CFTYPE_NOT_ON_ROOT,  		.seq_show = cgroup_controllers_show,  	},  	{  		.name = "cgroup.subtree_control", -		.flags = CFTYPE_ONLY_ON_DFL,  		.seq_show = cgroup_subtree_control_show,  		.write = cgroup_subtree_control_write,  	},  	{  		.name = "cgroup.populated", -		.flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, +		.flags = CFTYPE_NOT_ON_ROOT,  		.seq_show = cgroup_populated_show,  	}, +	{ }	/* terminate */ +}; -	/* -	 * Historical crazy stuff.  
These don't have "cgroup."  prefix and -	 * don't exist if sane_behavior.  If you're depending on these, be -	 * prepared to be burned. -	 */ +/* cgroup core interface files for the legacy hierarchies */ +static struct cftype cgroup_legacy_base_files[] = { +	{ +		.name = "cgroup.procs", +		.seq_start = cgroup_pidlist_start, +		.seq_next = cgroup_pidlist_next, +		.seq_stop = cgroup_pidlist_stop, +		.seq_show = cgroup_pidlist_show, +		.private = CGROUP_FILE_PROCS, +		.write = cgroup_procs_write, +		.mode = S_IRUGO | S_IWUSR, +	}, +	{ +		.name = "cgroup.clone_children", +		.read_u64 = cgroup_clone_children_read, +		.write_u64 = cgroup_clone_children_write, +	}, +	{ +		.name = "cgroup.sane_behavior", +		.flags = CFTYPE_ONLY_ON_ROOT, +		.seq_show = cgroup_sane_behavior_show, +	},  	{  		.name = "tasks", -		.flags = CFTYPE_INSANE,		/* use "procs" instead */  		.seq_start = cgroup_pidlist_start,  		.seq_next = cgroup_pidlist_next,  		.seq_stop = cgroup_pidlist_stop, @@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = {  	},  	{  		.name = "notify_on_release", -		.flags = CFTYPE_INSANE,  		.read_u64 = cgroup_read_notify_on_release,  		.write_u64 = cgroup_write_notify_on_release,  	},  	{  		.name = "release_agent", -		.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, +		.flags = CFTYPE_ONLY_ON_ROOT,  		.seq_show = cgroup_release_agent_show,  		.write = cgroup_release_agent_write,  		.max_write_len = PATH_MAX - 1, @@ -4175,6 +4324,8 @@ static void css_free_work_fn(struct work_struct *work)  		container_of(work, struct cgroup_subsys_state, destroy_work);  	struct cgroup *cgrp = css->cgroup; +	percpu_ref_exit(&css->refcnt); +  	if (css->ss) {  		/* css free path */  		if (css->parent) @@ -4314,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css)   * create_css - create a cgroup_subsys_state   * @cgrp: the cgroup new css will be associated with   * @ss: the subsys of new css + * @visible: whether to create control knobs for the new css or not   *   * Create a new css associated with @cgrp - @ss pair.  On success, the new - * css is online and installed in @cgrp with all interface files created. - * Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp with all interface files created if + * @visible.  Returns 0 on success, -errno on failure.   
*/ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) +static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, +		      bool visible)  {  	struct cgroup *parent = cgroup_parent(cgrp);  	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4343,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)  		goto err_free_percpu_ref;  	css->id = err; -	err = cgroup_populate_dir(cgrp, 1 << ss->id); -	if (err) -		goto err_free_id; +	if (visible) { +		err = cgroup_populate_dir(cgrp, 1 << ss->id); +		if (err) +			goto err_free_id; +	}  	/* @css is ready to be brought online now, make it visible */  	list_add_tail_rcu(&css->sibling, &parent_css->children); @@ -4372,7 +4527,7 @@ err_list_del:  err_free_id:  	cgroup_idr_remove(&ss->css_idr, css->id);  err_free_percpu_ref: -	percpu_ref_cancel_init(&css->refcnt); +	percpu_ref_exit(&css->refcnt);  err_free_css:  	call_rcu(&css->rcu_head, css_free_rcu_fn);  	return err; @@ -4385,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,  	struct cgroup_root *root;  	struct cgroup_subsys *ss;  	struct kernfs_node *kn; +	struct cftype *base_files;  	int ssid, ret;  	parent = cgroup_kn_lock_live(parent_kn); @@ -4455,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,  	if (ret)  		goto out_destroy; -	ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); +	if (cgroup_on_dfl(cgrp)) +		base_files = cgroup_dfl_base_files; +	else +		base_files = cgroup_legacy_base_files; + +	ret = cgroup_addrm_files(cgrp, base_files, true);  	if (ret)  		goto out_destroy;  	/* let's create and online css's */  	for_each_subsys(ss, ssid) {  		if (parent->child_subsys_mask & (1 << ssid)) { -			ret = create_css(cgrp, ss); +			ret = create_css(cgrp, ss, +					 parent->subtree_control & (1 << ssid));  			if (ret)  				goto out_destroy;  		} @@ -4470,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,  	/*  	 * On the default hierarchy, a child doesn't automatically inherit -	 * child_subsys_mask from the parent.  Each is configured manually. +	 * subtree_control from the parent.  Each is configured manually.  	 */ -	if (!cgroup_on_dfl(cgrp)) -		cgrp->child_subsys_mask = parent->child_subsys_mask; +	if (!cgroup_on_dfl(cgrp)) { +		cgrp->subtree_control = parent->subtree_control; +		cgroup_refresh_child_subsys_mask(cgrp); +	}  	kernfs_activate(kn); @@ -4483,7 +4647,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,  out_free_id:  	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);  out_cancel_ref: -	percpu_ref_cancel_init(&cgrp->self.refcnt); +	percpu_ref_exit(&cgrp->self.refcnt);  out_free_cgrp:  	kfree(cgrp);  out_unlock: @@ -4736,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)   */  int __init cgroup_init_early(void)  { -	static struct cgroup_sb_opts __initdata opts = -		{ .flags = CGRP_ROOT_SANE_BEHAVIOR }; +	static struct cgroup_sb_opts __initdata opts;  	struct cgroup_subsys *ss;  	int i; @@ -4775,7 +4938,8 @@ int __init cgroup_init(void)  	unsigned long key;  	int ssid, err; -	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); +	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); +	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));  	mutex_lock(&cgroup_mutex); @@ -4807,9 +4971,22 @@ int __init cgroup_init(void)  		 * disabled flag and cftype registration needs kmalloc,  		 * both of which aren't available during early_init.  		
 */ -		if (!ss->disabled) { -			cgrp_dfl_root.subsys_mask |= 1 << ss->id; -			WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); +		if (ss->disabled) +			continue; + +		cgrp_dfl_root.subsys_mask |= 1 << ss->id; + +		if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) +			ss->dfl_cftypes = ss->legacy_cftypes; + +		if (!ss->dfl_cftypes) +			cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + +		if (ss->dfl_cftypes == ss->legacy_cftypes) { +			WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); +		} else { +			WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); +			WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));  		}  	} @@ -5205,6 +5382,14 @@ static int __init cgroup_disable(char *str)  }  __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_set_legacy_files_on_dfl(char *str) +{ +	printk("cgroup: using legacy files on the default hierarchy\n"); +	cgroup_legacy_files_on_dfl = true; +	return 0; +} +__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); +  /**   * css_tryget_online_from_dir - get corresponding css from a cgroup dentry   * @dentry: directory dentry of interest @@ -5399,6 +5584,6 @@ static struct cftype debug_files[] =  {  struct cgroup_subsys debug_cgrp_subsys = {  	.css_alloc = debug_css_alloc,  	.css_free = debug_css_free, -	.base_cftypes = debug_files, +	.legacy_cftypes = debug_files,  };  #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index a79e40f9d700..92b98cc0ee76 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = {  	.css_free	= freezer_css_free,  	.attach		= freezer_attach,  	.fork		= freezer_fork, -	.base_cftypes	= files, +	.legacy_cftypes	= files,  }; diff --git a/kernel/cpu.c b/kernel/cpu.c index a343bde710b1..81e2a388a0f6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)  	rcu_read_unlock();  } -static inline void check_for_tasks(int cpu) +static inline void check_for_tasks(int dead_cpu)  { -	struct task_struct *p; -	cputime_t utime, stime; +	struct task_struct *g, *p; -	write_lock_irq(&tasklist_lock); -	for_each_process(p) { -		task_cputime(p, &utime, &stime); -		if (task_cpu(p) == cpu && p->state == TASK_RUNNING && -		    (utime || stime)) -			pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", -				p->comm, task_pid_nr(p), cpu, -				p->state, p->flags); -	} -	write_unlock_irq(&tasklist_lock); +	read_lock_irq(&tasklist_lock); +	do_each_thread(g, p) { +		if (!p->on_rq) +			continue; +		/* +		 * We do the check with unlocked task_rq(p)->lock. +		 * Order the reading to do not warn about a task, +		 * which was running on this cpu in the past, and +		 * it's just been woken on another cpu. 
+		 */ +		rmb(); +		if (task_cpu(p) != dead_cpu) +			continue; + +		pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n", +			p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags); +	} while_each_thread(g, p); +	read_unlock_irq(&tasklist_lock);  }  struct take_cpu_down_param { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 116a4164720a..22874d7cf2c0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -76,8 +76,34 @@ struct cpuset {  	struct cgroup_subsys_state css;  	unsigned long flags;		/* "unsigned long" so bitops work */ -	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */ -	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */ + +	/* +	 * On default hierarchy: +	 * +	 * The user-configured masks can only be changed by writing to +	 * cpuset.cpus and cpuset.mems, and won't be limited by the +	 * parent masks. +	 * +	 * The effective masks is the real masks that apply to the tasks +	 * in the cpuset. They may be changed if the configured masks are +	 * changed or hotplug happens. +	 * +	 * effective_mask == configured_mask & parent's effective_mask, +	 * and if it ends up empty, it will inherit the parent's mask. +	 * +	 * +	 * On legacy hierachy: +	 * +	 * The user-configured masks are always the same with effective masks. +	 */ + +	/* user-configured CPUs and Memory Nodes allow to tasks */ +	cpumask_var_t cpus_allowed; +	nodemask_t mems_allowed; + +	/* effective CPUs and Memory Nodes allow to tasks */ +	cpumask_var_t effective_cpus; +	nodemask_t effective_mems;  	/*  	 * This is old Memory Nodes tasks took on. @@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {   */  static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)  { -	while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) +	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))  		cs = parent_cs(cs); -	cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); +	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);  }  /* @@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)   */  static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)  { -	while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) +	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))  		cs = parent_cs(cs); -	nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); +	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);  }  /* @@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)  	if (!trial)  		return NULL; -	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { -		kfree(trial); -		return NULL; -	} -	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); +	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) +		goto free_cs; +	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) +		goto free_cpus; +	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); +	cpumask_copy(trial->effective_cpus, cs->effective_cpus);  	return trial; + +free_cpus: +	free_cpumask_var(trial->cpus_allowed); +free_cs: +	kfree(trial); +	return NULL;  }  /** @@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)   */  static void free_trial_cpuset(struct cpuset *trial)  { +	free_cpumask_var(trial->effective_cpus);  	free_cpumask_var(trial->cpus_allowed);  	kfree(trial);  } @@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)  	par = parent_cs(cur); -	/* We must be a subset of our parent cpuset */ +	
/* On legacy hiearchy, we must be a subset of our parent cpuset. */  	ret = -EACCES; -	if (!is_cpuset_subset(trial, par)) +	if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))  		goto out;  	/* @@ -480,11 +514,11 @@ out:  #ifdef CONFIG_SMP  /*   * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping cpus_allowed masks? + * Do cpusets a, b have overlapping effective cpus_allowed masks?   */  static int cpusets_overlap(struct cpuset *a, struct cpuset *b)  { -	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); +	return cpumask_intersects(a->effective_cpus, b->effective_cpus);  }  static void @@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains,  			*dattr = SD_ATTR_INIT;  			update_domain_attr_tree(dattr, &top_cpuset);  		} -		cpumask_copy(doms[0], top_cpuset.cpus_allowed); +		cpumask_copy(doms[0], top_cpuset.effective_cpus);  		goto done;  	} @@ -705,7 +739,7 @@ restart:  			struct cpuset *b = csa[j];  			if (apn == b->pn) { -				cpumask_or(dp, dp, b->cpus_allowed); +				cpumask_or(dp, dp, b->effective_cpus);  				if (dattr)  					update_domain_attr_tree(dattr + nslot, b); @@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void)  	 * passing doms with offlined cpu to partition_sched_domains().  	 * Anyways, hotplug work item will rebuild sched domains.  	 */ -	if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) +	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))  		goto out;  	/* Generate domain masks and attrs */ @@ -781,45 +815,6 @@ void rebuild_sched_domains(void)  	mutex_unlock(&cpuset_mutex);  } -/* - * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus - * @cs: the cpuset in interest - * - * A cpuset's effective cpumask is the cpumask of the nearest ancestor - * with non-empty cpus. We use effective cpumask whenever: - * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask - *   if the cpuset they reside in has no cpus) - * - we want to retrieve task_cs(tsk)'s cpus_allowed. - * - * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an - * exception. See comments there. - */ -static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) -{ -	while (cpumask_empty(cs->cpus_allowed)) -		cs = parent_cs(cs); -	return cs; -} - -/* - * effective_nodemask_cpuset - return nearest ancestor with non-empty mems - * @cs: the cpuset in interest - * - * A cpuset's effective nodemask is the nodemask of the nearest ancestor - * with non-empty memss. We use effective nodemask whenever: - * - we update tasks' mems_allowed. (they take on the ancestor's nodemask - *   if the cpuset they reside in has no mems) - * - we want to retrieve task_cs(tsk)'s mems_allowed. - * - * Called with cpuset_mutex held. - */ -static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) -{ -	while (nodes_empty(cs->mems_allowed)) -		cs = parent_cs(cs); -	return cs; -} -  /**   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.   
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed @@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)   */  static void update_tasks_cpumask(struct cpuset *cs)  { -	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);  	struct css_task_iter it;  	struct task_struct *task;  	css_task_iter_start(&cs->css, &it);  	while ((task = css_task_iter_next(&it))) -		set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); +		set_cpus_allowed_ptr(task, cs->effective_cpus);  	css_task_iter_end(&it);  }  /* - * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. - * @root_cs: the root cpuset of the hierarchy - * @update_root: update root cpuset or not? + * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_cpus: temp variable for calculating new effective_cpus + * + * When congifured cpumask is changed, the effective cpumasks of this cpuset + * and all its descendants need to be updated.   * - * This will update cpumasks of tasks in @root_cs and all other empty cpusets - * which take on cpumask of @root_cs. + * On legacy hierachy, effective_cpus will be the same with cpu_allowed.   *   * Called with cpuset_mutex held   */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) +static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)  {  	struct cpuset *cp;  	struct cgroup_subsys_state *pos_css; +	bool need_rebuild_sched_domains = false;  	rcu_read_lock(); -	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { -		if (cp == root_cs) { -			if (!update_root) -				continue; -		} else { -			/* skip the whole subtree if @cp have some CPU */ -			if (!cpumask_empty(cp->cpus_allowed)) { -				pos_css = css_rightmost_descendant(pos_css); -				continue; -			} +	cpuset_for_each_descendant_pre(cp, pos_css, cs) { +		struct cpuset *parent = parent_cs(cp); + +		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); + +		/* +		 * If it becomes empty, inherit the effective mask of the +		 * parent, which is guaranteed to have some CPUs. +		 */ +		if (cpumask_empty(new_cpus)) +			cpumask_copy(new_cpus, parent->effective_cpus); + +		/* Skip the whole subtree if the cpumask remains the same. */ +		if (cpumask_equal(new_cpus, cp->effective_cpus)) { +			pos_css = css_rightmost_descendant(pos_css); +			continue;  		} +  		if (!css_tryget_online(&cp->css))  			continue;  		rcu_read_unlock(); +		mutex_lock(&callback_mutex); +		cpumask_copy(cp->effective_cpus, new_cpus); +		mutex_unlock(&callback_mutex); + +		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && +			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); +  		update_tasks_cpumask(cp); +		/* +		 * If the effective cpumask of any non-empty cpuset is changed, +		 * we need to rebuild sched domains. 
+		 */ +		if (!cpumask_empty(cp->cpus_allowed) && +		    is_sched_load_balance(cp)) +			need_rebuild_sched_domains = true; +  		rcu_read_lock();  		css_put(&cp->css);  	}  	rcu_read_unlock(); + +	if (need_rebuild_sched_domains) +		rebuild_sched_domains_locked();  }  /** @@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,  			  const char *buf)  {  	int retval; -	int is_load_balanced;  	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */  	if (cs == &top_cpuset) @@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,  		if (retval < 0)  			return retval; -		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) +		if (!cpumask_subset(trialcs->cpus_allowed, +				    top_cpuset.cpus_allowed))  			return -EINVAL;  	} @@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,  	if (retval < 0)  		return retval; -	is_load_balanced = is_sched_load_balance(trialcs); -  	mutex_lock(&callback_mutex);  	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);  	mutex_unlock(&callback_mutex); -	update_tasks_cpumask_hier(cs, true); - -	if (is_load_balanced) -		rebuild_sched_domains_locked(); +	/* use trialcs->cpus_allowed as a temp variable */ +	update_cpumasks_hier(cs, trialcs->cpus_allowed);  	return 0;  } @@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,  							const nodemask_t *to)  {  	struct task_struct *tsk = current; -	struct cpuset *mems_cs;  	tsk->mems_allowed = *to;  	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);  	rcu_read_lock(); -	mems_cs = effective_nodemask_cpuset(task_cs(tsk)); -	guarantee_online_mems(mems_cs, &tsk->mems_allowed); +	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);  	rcu_read_unlock();  } @@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound;  static void update_tasks_nodemask(struct cpuset *cs)  {  	static nodemask_t newmems;	/* protected by cpuset_mutex */ -	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);  	struct css_task_iter it;  	struct task_struct *task;  	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */ -	guarantee_online_mems(mems_cs, &newmems); +	guarantee_online_mems(cs, &newmems);  	/*  	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't @@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs)  }  /* - * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. - * @cs: the root cpuset of the hierarchy - * @update_root: update the root cpuset or not? + * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree + * @cs: the cpuset to consider + * @new_mems: a temp variable for calculating new effective_mems   * - * This will update nodemasks of tasks in @root_cs and all other empty cpusets - * which take on nodemask of @root_cs. + * When configured nodemask is changed, the effective nodemasks of this cpuset + * and all its descendants need to be updated. + * + * On legacy hiearchy, effective_mems will be the same with mems_allowed.   
*   * Called with cpuset_mutex held   */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) +static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)  {  	struct cpuset *cp;  	struct cgroup_subsys_state *pos_css;  	rcu_read_lock(); -	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { -		if (cp == root_cs) { -			if (!update_root) -				continue; -		} else { -			/* skip the whole subtree if @cp have some CPU */ -			if (!nodes_empty(cp->mems_allowed)) { -				pos_css = css_rightmost_descendant(pos_css); -				continue; -			} +	cpuset_for_each_descendant_pre(cp, pos_css, cs) { +		struct cpuset *parent = parent_cs(cp); + +		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); + +		/* +		 * If it becomes empty, inherit the effective mask of the +		 * parent, which is guaranteed to have some MEMs. +		 */ +		if (nodes_empty(*new_mems)) +			*new_mems = parent->effective_mems; + +		/* Skip the whole subtree if the nodemask remains the same. */ +		if (nodes_equal(*new_mems, cp->effective_mems)) { +			pos_css = css_rightmost_descendant(pos_css); +			continue;  		} +  		if (!css_tryget_online(&cp->css))  			continue;  		rcu_read_unlock(); +		mutex_lock(&callback_mutex); +		cp->effective_mems = *new_mems; +		mutex_unlock(&callback_mutex); + +		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && +			!nodes_equal(cp->mems_allowed, cp->effective_mems)); +  		update_tasks_nodemask(cp);  		rcu_read_lock(); @@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  			goto done;  		if (!nodes_subset(trialcs->mems_allowed, -				node_states[N_MEMORY])) { -			retval =  -EINVAL; +				  top_cpuset.mems_allowed)) { +			retval = -EINVAL;  			goto done;  		}  	} @@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,  	cs->mems_allowed = trialcs->mems_allowed;  	mutex_unlock(&callback_mutex); -	update_tasks_nodemask_hier(cs, true); +	/* use trialcs->mems_allowed as a temp variable */ +	update_nodemasks_hier(cs, &cs->mems_allowed);  done:  	return retval;  } @@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,  	mutex_lock(&cpuset_mutex); -	/* -	 * We allow to move tasks into an empty cpuset if sane_behavior -	 * flag is set. -	 */ +	/* allow moving tasks into an empty cpuset if on default hierarchy */  	ret = -ENOSPC; -	if (!cgroup_sane_behavior(css->cgroup) && +	if (!cgroup_on_dfl(css->cgroup) &&  	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))  		goto out_unlock; @@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,  	struct task_struct *leader = cgroup_taskset_first(tset);  	struct cpuset *cs = css_cs(css);  	struct cpuset *oldcs = cpuset_attach_old_cs; -	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); -	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);  	mutex_lock(&cpuset_mutex); @@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,  	if (cs == &top_cpuset)  		cpumask_copy(cpus_attach, cpu_possible_mask);  	else -		guarantee_online_cpus(cpus_cs, cpus_attach); +		guarantee_online_cpus(cs, cpus_attach); -	guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); +	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);  	cgroup_taskset_for_each(task, tset) {  		/* @@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,  	 * Change mm, possibly for multiple threads in a threadgroup. This is  	 * expensive and may sleep.  	 
*/ -	cpuset_attach_nodemask_to = cs->mems_allowed; +	cpuset_attach_nodemask_to = cs->effective_mems;  	mm = get_task_mm(leader);  	if (mm) { -		struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); -  		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);  		/* @@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,  		 * mm from.  		 */  		if (is_memory_migrate(cs)) { -			cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, +			cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,  					  &cpuset_attach_nodemask_to);  		}  		mmput(mm); @@ -1516,6 +1541,8 @@ typedef enum {  	FILE_MEMORY_MIGRATE,  	FILE_CPULIST,  	FILE_MEMLIST, +	FILE_EFFECTIVE_CPULIST, +	FILE_EFFECTIVE_MEMLIST,  	FILE_CPU_EXCLUSIVE,  	FILE_MEM_EXCLUSIVE,  	FILE_MEM_HARDWALL, @@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)  	case FILE_MEMLIST:  		s += nodelist_scnprintf(s, count, cs->mems_allowed);  		break; +	case FILE_EFFECTIVE_CPULIST: +		s += cpulist_scnprintf(s, count, cs->effective_cpus); +		break; +	case FILE_EFFECTIVE_MEMLIST: +		s += nodelist_scnprintf(s, count, cs->effective_mems); +		break;  	default:  		ret = -EINVAL;  		goto out_unlock; @@ -1779,6 +1812,18 @@ static struct cftype files[] = {  	},  	{ +		.name = "effective_cpus", +		.seq_show = cpuset_common_seq_show, +		.private = FILE_EFFECTIVE_CPULIST, +	}, + +	{ +		.name = "effective_mems", +		.seq_show = cpuset_common_seq_show, +		.private = FILE_EFFECTIVE_MEMLIST, +	}, + +	{  		.name = "cpu_exclusive",  		.read_u64 = cpuset_read_u64,  		.write_u64 = cpuset_write_u64, @@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)  	cs = kzalloc(sizeof(*cs), GFP_KERNEL);  	if (!cs)  		return ERR_PTR(-ENOMEM); -	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { -		kfree(cs); -		return ERR_PTR(-ENOMEM); -	} +	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) +		goto free_cs; +	if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) +		goto free_cpus;  	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);  	cpumask_clear(cs->cpus_allowed);  	nodes_clear(cs->mems_allowed); +	cpumask_clear(cs->effective_cpus); +	nodes_clear(cs->effective_mems);  	fmeter_init(&cs->fmeter);  	cs->relax_domain_level = -1;  	return &cs->css; + +free_cpus: +	free_cpumask_var(cs->cpus_allowed); +free_cs: +	kfree(cs); +	return ERR_PTR(-ENOMEM);  }  static int cpuset_css_online(struct cgroup_subsys_state *css) @@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)  	cpuset_inc(); +	mutex_lock(&callback_mutex); +	if (cgroup_on_dfl(cs->css.cgroup)) { +		cpumask_copy(cs->effective_cpus, parent->effective_cpus); +		cs->effective_mems = parent->effective_mems; +	} +	mutex_unlock(&callback_mutex); +  	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))  		goto out_unlock; @@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)  {  	struct cpuset *cs = css_cs(css); +	free_cpumask_var(cs->effective_cpus);  	free_cpumask_var(cs->cpus_allowed);  	kfree(cs);  } +static void cpuset_bind(struct cgroup_subsys_state *root_css) +{ +	mutex_lock(&cpuset_mutex); +	mutex_lock(&callback_mutex); + +	if (cgroup_on_dfl(root_css->cgroup)) { +		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); +		top_cpuset.mems_allowed = node_possible_map; +	} else { +		cpumask_copy(top_cpuset.cpus_allowed, +			     top_cpuset.effective_cpus); +		top_cpuset.mems_allowed = top_cpuset.effective_mems; +	} + +	mutex_unlock(&callback_mutex); +	mutex_unlock(&cpuset_mutex); +} 
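The cpuset hunks above replace the old "walk up to the nearest non-empty ancestor" helpers with per-cpuset effective masks that update_cpumasks_hier()/update_nodemasks_hier() propagate top-down: effective = configured & parent's effective, and an empty intersection falls back to the parent's effective mask so tasks are never left with nothing to run on. Below is a rough, self-contained userspace sketch of just that rule; struct cs, propagate(), the three-level hierarchy and the example masks are invented here for illustration (plain uint64_t words stand in for cpumask_t/nodemask_t) and are not part of the patch or of any kernel API.

#include <stdint.h>
#include <stdio.h>

struct cs {
	const char *name;
	uint64_t configured;	/* what the user wrote to cpuset.cpus */
	uint64_t effective;	/* what tasks in the cpuset actually get */
	struct cs *parent;
};

static void propagate(struct cs *c)
{
	uint64_t eff = c->configured & c->parent->effective;

	/* Empty intersection: inherit the parent's effective mask. */
	c->effective = eff ? eff : c->parent->effective;
}

int main(void)
{
	struct cs root = { "root", 0xff, 0xff, NULL  };	/* CPUs 0-7 online */
	struct cs mid  = { "mid",  0x0f, 0,    &root };	/* asks for CPUs 0-3 */
	struct cs leaf = { "leaf", 0xf0, 0,    &mid  };	/* asks for CPUs 4-7 */

	propagate(&mid);	/* 0x0f & 0xff -> 0x0f */
	propagate(&leaf);	/* 0xf0 & 0x0f is empty -> inherits 0x0f */

	printf("mid:  effective=%#llx\n", (unsigned long long)mid.effective);
	printf("leaf: effective=%#llx\n", (unsigned long long)leaf.effective);
	return 0;
}

The patch exposes the computed masks read-only through the new effective_cpus and effective_mems files (their cftypes have only .seq_show), while cpus_allowed/mems_allowed keep whatever the user configured; on the legacy hierarchy the two are kept equal, which is what the WARN_ON() calls in update_cpumasks_hier()/update_nodemasks_hier() assert.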
+  struct cgroup_subsys cpuset_cgrp_subsys = { -	.css_alloc = cpuset_css_alloc, -	.css_online = cpuset_css_online, -	.css_offline = cpuset_css_offline, -	.css_free = cpuset_css_free, -	.can_attach = cpuset_can_attach, -	.cancel_attach = cpuset_cancel_attach, -	.attach = cpuset_attach, -	.base_cftypes = files, -	.early_init = 1, +	.css_alloc	= cpuset_css_alloc, +	.css_online	= cpuset_css_online, +	.css_offline	= cpuset_css_offline, +	.css_free	= cpuset_css_free, +	.can_attach	= cpuset_can_attach, +	.cancel_attach	= cpuset_cancel_attach, +	.attach		= cpuset_attach, +	.bind		= cpuset_bind, +	.legacy_cftypes	= files, +	.early_init	= 1,  };  /** @@ -1990,9 +2070,13 @@ int __init cpuset_init(void)  	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))  		BUG(); +	if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) +		BUG();  	cpumask_setall(top_cpuset.cpus_allowed);  	nodes_setall(top_cpuset.mems_allowed); +	cpumask_setall(top_cpuset.effective_cpus); +	nodes_setall(top_cpuset.effective_mems);  	fmeter_init(&top_cpuset.fmeter);  	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); @@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)  	}  } +static void +hotplug_update_tasks_legacy(struct cpuset *cs, +			    struct cpumask *new_cpus, nodemask_t *new_mems, +			    bool cpus_updated, bool mems_updated) +{ +	bool is_empty; + +	mutex_lock(&callback_mutex); +	cpumask_copy(cs->cpus_allowed, new_cpus); +	cpumask_copy(cs->effective_cpus, new_cpus); +	cs->mems_allowed = *new_mems; +	cs->effective_mems = *new_mems; +	mutex_unlock(&callback_mutex); + +	/* +	 * Don't call update_tasks_cpumask() if the cpuset becomes empty, +	 * as the tasks will be migratecd to an ancestor. +	 */ +	if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) +		update_tasks_cpumask(cs); +	if (mems_updated && !nodes_empty(cs->mems_allowed)) +		update_tasks_nodemask(cs); + +	is_empty = cpumask_empty(cs->cpus_allowed) || +		   nodes_empty(cs->mems_allowed); + +	mutex_unlock(&cpuset_mutex); + +	/* +	 * Move tasks to the nearest ancestor with execution resources, +	 * This is full cgroup operation which will also call back into +	 * cpuset. Should be done outside any lock. 
+	 */ +	if (is_empty) +		remove_tasks_in_empty_cpuset(cs); + +	mutex_lock(&cpuset_mutex); +} + +static void +hotplug_update_tasks(struct cpuset *cs, +		     struct cpumask *new_cpus, nodemask_t *new_mems, +		     bool cpus_updated, bool mems_updated) +{ +	if (cpumask_empty(new_cpus)) +		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); +	if (nodes_empty(*new_mems)) +		*new_mems = parent_cs(cs)->effective_mems; + +	mutex_lock(&callback_mutex); +	cpumask_copy(cs->effective_cpus, new_cpus); +	cs->effective_mems = *new_mems; +	mutex_unlock(&callback_mutex); + +	if (cpus_updated) +		update_tasks_cpumask(cs); +	if (mems_updated) +		update_tasks_nodemask(cs); +} +  /**   * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug   * @cs: cpuset in interest @@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)   */  static void cpuset_hotplug_update_tasks(struct cpuset *cs)  { -	static cpumask_t off_cpus; -	static nodemask_t off_mems; -	bool is_empty; -	bool sane = cgroup_sane_behavior(cs->css.cgroup); - +	static cpumask_t new_cpus; +	static nodemask_t new_mems; +	bool cpus_updated; +	bool mems_updated;  retry:  	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -2064,51 +2207,20 @@ retry:  		goto retry;  	} -	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); -	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); - -	mutex_lock(&callback_mutex); -	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); -	mutex_unlock(&callback_mutex); - -	/* -	 * If sane_behavior flag is set, we need to update tasks' cpumask -	 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't -	 * call update_tasks_cpumask() if the cpuset becomes empty, as -	 * the tasks in it will be migrated to an ancestor. -	 */ -	if ((sane && cpumask_empty(cs->cpus_allowed)) || -	    (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) -		update_tasks_cpumask(cs); +	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus); +	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems); -	mutex_lock(&callback_mutex); -	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); -	mutex_unlock(&callback_mutex); - -	/* -	 * If sane_behavior flag is set, we need to update tasks' nodemask -	 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't -	 * call update_tasks_nodemask() if the cpuset becomes empty, as -	 * the tasks in it will be migratd to an ancestor. -	 */ -	if ((sane && nodes_empty(cs->mems_allowed)) || -	    (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) -		update_tasks_nodemask(cs); +	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); +	mems_updated = !nodes_equal(new_mems, cs->effective_mems); -	is_empty = cpumask_empty(cs->cpus_allowed) || -		nodes_empty(cs->mems_allowed); +	if (cgroup_on_dfl(cs->css.cgroup)) +		hotplug_update_tasks(cs, &new_cpus, &new_mems, +				     cpus_updated, mems_updated); +	else +		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, +					    cpus_updated, mems_updated);  	mutex_unlock(&cpuset_mutex); - -	/* -	 * If sane_behavior flag is set, we'll keep tasks in empty cpusets. -	 * -	 * Otherwise move tasks to the nearest ancestor with execution -	 * resources.  This is full cgroup operation which will -	 * also call back into cpuset.  Should be done outside any lock. 
-	 */ -	if (!sane && is_empty) -		remove_tasks_in_empty_cpuset(cs);  }  /** @@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	static cpumask_t new_cpus;  	static nodemask_t new_mems;  	bool cpus_updated, mems_updated; +	bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);  	mutex_lock(&cpuset_mutex); @@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	cpumask_copy(&new_cpus, cpu_active_mask);  	new_mems = node_states[N_MEMORY]; -	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); -	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); +	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus); +	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);  	/* synchronize cpus_allowed to cpu_active_mask */  	if (cpus_updated) {  		mutex_lock(&callback_mutex); -		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); +		if (!on_dfl) +			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); +		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);  		mutex_unlock(&callback_mutex);  		/* we don't mess with cpumasks of tasks in top_cpuset */  	} @@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)  	/* synchronize mems_allowed to N_MEMORY */  	if (mems_updated) {  		mutex_lock(&callback_mutex); -		top_cpuset.mems_allowed = new_mems; +		if (!on_dfl) +			top_cpuset.mems_allowed = new_mems; +		top_cpuset.effective_mems = new_mems;  		mutex_unlock(&callback_mutex);  		update_tasks_nodemask(&top_cpuset);  	} @@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void)  	top_cpuset.mems_allowed = node_states[N_MEMORY];  	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed; +	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); +	top_cpuset.effective_mems = node_states[N_MEMORY]; +  	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);  } @@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void)  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)  { -	struct cpuset *cpus_cs; -  	mutex_lock(&callback_mutex);  	rcu_read_lock(); -	cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); -	guarantee_online_cpus(cpus_cs, pmask); +	guarantee_online_cpus(task_cs(tsk), pmask);  	rcu_read_unlock();  	mutex_unlock(&callback_mutex);  }  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)  { -	struct cpuset *cpus_cs; -  	rcu_read_lock(); -	cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); -	do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed); +	do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);  	rcu_read_unlock();  	/* @@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void)  nodemask_t cpuset_mems_allowed(struct task_struct *tsk)  { -	struct cpuset *mems_cs;  	nodemask_t mask;  	mutex_lock(&callback_mutex);  	rcu_read_lock(); -	mems_cs = effective_nodemask_cpuset(task_cs(tsk)); -	guarantee_online_mems(mems_cs, &mask); +	guarantee_online_mems(task_cs(tsk), &mask);  	rcu_read_unlock();  	mutex_unlock(&callback_mutex); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2f7c760305ca..379650b984f8 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2472,7 +2472,7 @@ static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)  static void kdb_sysinfo(struct sysinfo *val)  {  	struct timespec uptime; -	do_posix_clock_monotonic_gettime(&uptime); +	ktime_get_ts(&uptime);  	memset(val, 0, sizeof(*val));  	val->uptime = uptime.tv_sec;  	val->loads[0] = avenrun[0]; diff --git a/kernel/delayacct.c 
b/kernel/delayacct.c index 54996b71e66d..ef90b04d783f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -46,42 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk)  }  /* - * Start accounting for a delay statistic using - * its starting timestamp (@start) + * Finish delay accounting for a statistic using its timestamps (@start), + * accumalator (@total) and @count   */ - -static inline void delayacct_start(struct timespec *start) +static void delayacct_end(u64 *start, u64 *total, u32 *count)  { -	do_posix_clock_monotonic_gettime(start); -} - -/* - * Finish delay accounting for a statistic using - * its timestamps (@start, @end), accumalator (@total) and @count - */ - -static void delayacct_end(struct timespec *start, struct timespec *end, -				u64 *total, u32 *count) -{ -	struct timespec ts; -	s64 ns; +	s64 ns = ktime_get_ns() - *start;  	unsigned long flags; -	do_posix_clock_monotonic_gettime(end); -	ts = timespec_sub(*end, *start); -	ns = timespec_to_ns(&ts); -	if (ns < 0) -		return; - -	spin_lock_irqsave(&current->delays->lock, flags); -	*total += ns; -	(*count)++; -	spin_unlock_irqrestore(&current->delays->lock, flags); +	if (ns > 0) { +		spin_lock_irqsave(&current->delays->lock, flags); +		*total += ns; +		(*count)++; +		spin_unlock_irqrestore(&current->delays->lock, flags); +	}  }  void __delayacct_blkio_start(void)  { -	delayacct_start(&current->delays->blkio_start); +	current->delays->blkio_start = ktime_get_ns();  }  void __delayacct_blkio_end(void) @@ -89,35 +72,29 @@ void __delayacct_blkio_end(void)  	if (current->delays->flags & DELAYACCT_PF_SWAPIN)  		/* Swapin block I/O */  		delayacct_end(&current->delays->blkio_start, -			&current->delays->blkio_end,  			&current->delays->swapin_delay,  			&current->delays->swapin_count);  	else	/* Other block I/O */  		delayacct_end(&current->delays->blkio_start, -			&current->delays->blkio_end,  			&current->delays->blkio_delay,  			&current->delays->blkio_count);  }  int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)  { -	s64 tmp; -	unsigned long t1; -	unsigned long long t2, t3; -	unsigned long flags; -	struct timespec ts;  	cputime_t utime, stime, stimescaled, utimescaled; +	unsigned long long t2, t3; +	unsigned long flags, t1; +	s64 tmp; -	tmp = (s64)d->cpu_run_real_total;  	task_cputime(tsk, &utime, &stime); -	cputime_to_timespec(utime + stime, &ts); -	tmp += timespec_to_ns(&ts); +	tmp = (s64)d->cpu_run_real_total; +	tmp += cputime_to_nsecs(utime + stime);  	d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; -	tmp = (s64)d->cpu_scaled_run_real_total;  	task_cputime_scaled(tsk, &utimescaled, &stimescaled); -	cputime_to_timespec(utimescaled + stimescaled, &ts); -	tmp += timespec_to_ns(&ts); +	tmp = (s64)d->cpu_scaled_run_real_total; +	tmp += cputime_to_nsecs(utimescaled + stimescaled);  	d->cpu_scaled_run_real_total =  		(tmp < (s64)d->cpu_scaled_run_real_total) ? 
0 : tmp; @@ -169,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)  void __delayacct_freepages_start(void)  { -	delayacct_start(&current->delays->freepages_start); +	current->delays->freepages_start = ktime_get_ns();  }  void __delayacct_freepages_end(void)  {  	delayacct_end(&current->delays->freepages_start, -			&current->delays->freepages_end,  			&current->delays->freepages_delay,  			&current->delays->freepages_count);  } diff --git a/kernel/events/core.c b/kernel/events/core.c index 6b17ac1b0c2a..1cf24b3e42ec 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5266,6 +5266,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)  		goto got_name;  	} else { +		if (vma->vm_ops && vma->vm_ops->name) { +			name = (char *) vma->vm_ops->name(vma); +			if (name) +				goto cpy_name; +		} +  		name = (char *)arch_vma_name(vma);  		if (name)  			goto cpy_name; @@ -7804,7 +7810,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,  /*   * Initialize the perf_event context in task_struct   */ -int perf_event_init_context(struct task_struct *child, int ctxn) +static int perf_event_init_context(struct task_struct *child, int ctxn)  {  	struct perf_event_context *child_ctx, *parent_ctx;  	struct perf_event_context *cloned_ctx; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e8c137..1d0af8a2c646 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	/* For mmu_notifiers */  	const unsigned long mmun_start = addr;  	const unsigned long mmun_end   = addr + PAGE_SIZE; +	struct mem_cgroup *memcg; + +	err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); +	if (err) +		return err;  	/* For try_to_free_swap() and munlock_vma_page() below */  	lock_page(page); @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	get_page(kpage);  	page_add_new_anon_rmap(kpage, vma, addr); +	mem_cgroup_commit_charge(kpage, memcg, false); +	lru_cache_add_active_or_unevictable(kpage, vma);  	if (!PageAnon(page)) {  		dec_mm_counter(mm, MM_FILEPAGES); @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,  	err = 0;   unlock: +	mem_cgroup_cancel_charge(kpage, memcg);  	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);  	unlock_page(page);  	return err; @@ -315,18 +323,11 @@ retry:  	if (!new_page)  		goto put_old; -	if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) -		goto put_new; -  	__SetPageUptodate(new_page);  	copy_highpage(new_page, old_page);  	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);  	ret = __replace_page(vma, vaddr, old_page, new_page); -	if (ret) -		mem_cgroup_uncharge_page(new_page); - -put_new:  	page_cache_release(new_page);  put_old:  	put_page(old_page); diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668f1799..32c58f7433a3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,7 +59,7 @@  #include <asm/pgtable.h>  #include <asm/mmu_context.h> -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk);  static void __unhash_process(struct task_struct *p, bool group_dead)  { @@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk)  	spin_unlock(&sighand->siglock);  	__cleanup_sighand(sighand); -	clear_tsk_thread_flag(tsk,TIF_SIGPENDING); +	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);  	if (group_dead) {  		flush_sigqueue(&sig->shared_pending);  		
tty_kref_put(tty); @@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)  } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p)  {  	struct task_struct *leader;  	int zap_leader; @@ -192,7 +192,8 @@ repeat:  	 */  	zap_leader = 0;  	leader = p->group_leader; -	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { +	if (leader != p && thread_group_empty(leader) +			&& leader->exit_state == EXIT_ZOMBIE) {  		/*  		 * If we were the last child thread and the leader has  		 * exited already, and the leader's parent ignores SIGCHLD, @@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp)   *   * "I ask you, have you ever known what it is to be an orphan?"   */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, +					struct task_struct *ignored_task)  {  	struct task_struct *p; @@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)  	struct task_struct *ignored_task = tsk;  	if (!parent) -		 /* exit: our father is in a different pgrp than -		  * we are and we were the only connection outside. -		  */ +		/* exit: our father is in a different pgrp than +		 * we are and we were the only connection outside. +		 */  		parent = tsk->real_parent;  	else  		/* reparent: our child is in a different pgrp than @@ -405,7 +407,7 @@ assign_new_owner:   * Turn us into a lazy TLB process if we   * aren't already..   */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk)  {  	struct mm_struct *mm = tsk->mm;  	struct core_state *core_state; @@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk)  	core_state = mm->core_state;  	if (core_state) {  		struct core_thread self; +  		up_read(&mm->mmap_sem);  		self.task = tsk; @@ -455,6 +458,7 @@ static void exit_mm(struct task_struct * tsk)  	task_unlock(tsk);  	mm_update_next_owner(mm);  	mmput(mm); +	clear_thread_flag(TIF_MEMDIE);  }  /* @@ -565,6 +569,7 @@ static void forget_original_parent(struct task_struct *father)  	list_for_each_entry_safe(p, n, &father->children, sibling) {  		struct task_struct *t = p; +  		do {  			t->real_parent = reaper;  			if (t->parent == father) { @@ -598,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)  	/*  	 * This does two things:  	 * -  	 * A.  Make init inherit all the child processes +	 * A.  Make init inherit all the child processes  	 * B.  Check to see if any process groups have become orphaned  	 *	as a result of our exiting, and if they have any stopped  	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2) @@ -648,9 +653,8 @@ static void check_stack_usage(void)  	spin_lock(&low_water_lock);  	if (free < lowest_to_date) { -		printk(KERN_WARNING "%s (%d) used greatest stack depth: " -				"%lu bytes left\n", -				current->comm, task_pid_nr(current), free); +		pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", +			current->comm, task_pid_nr(current), free);  		lowest_to_date = free;  	}  	spin_unlock(&low_water_lock); @@ -691,8 +695,7 @@ void do_exit(long code)  	 * leave this task alone and wait for reboot.  	 */  	if (unlikely(tsk->flags & PF_EXITING)) { -		printk(KERN_ALERT -			"Fixing recursive fault but reboot is needed!\n"); +		pr_alert("Fixing recursive fault but reboot is needed!\n");  		/*  		 * We can do this unlocked here. 
The futex code uses  		 * this flag just to verify whether the pi state @@ -716,9 +719,9 @@ void do_exit(long code)  	raw_spin_unlock_wait(&tsk->pi_lock);  	if (unlikely(in_atomic())) -		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", -				current->comm, task_pid_nr(current), -				preempt_count()); +		pr_info("note: %s[%d] exited with preempt_count %d\n", +			current->comm, task_pid_nr(current), +			preempt_count());  	acct_update_integrals(tsk);  	/* sync mm's RSS info before statistics gathering */ @@ -836,7 +839,6 @@ void do_exit(long code)  	for (;;)  		cpu_relax();	/* For when BUG is null */  } -  EXPORT_SYMBOL_GPL(do_exit);  void complete_and_exit(struct completion *comp, long code) @@ -846,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code)  	do_exit(code);  } -  EXPORT_SYMBOL(complete_and_exit);  SYSCALL_DEFINE1(exit, int, error_code) @@ -869,6 +870,7 @@ do_group_exit(int exit_code)  		exit_code = sig->group_exit_code;  	else if (!thread_group_empty(current)) {  		struct sighand_struct *const sighand = current->sighand; +  		spin_lock_irq(&sighand->siglock);  		if (signal_group_exit(sig))  			/* Another thread got here before we took the lock.  */ @@ -1033,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)  		 * as other threads in the parent group can be right  		 * here reaping other children at the same time.  		 * -		 * We use thread_group_cputime_adjusted() to get times for the thread -		 * group, which consolidates times for all threads in the -		 * group including the group leader. +		 * We use thread_group_cputime_adjusted() to get times for +		 * the thread group, which consolidates times for all threads +		 * in the group including the group leader.  		 */  		thread_group_cputime_adjusted(p, &tgutime, &tgstime);  		spin_lock_irq(&p->real_parent->sighand->siglock); @@ -1417,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)  	list_for_each_entry(p, &tsk->children, sibling) {  		int ret = wait_consider_task(wo, 0, p); +  		if (ret)  			return ret;  	} @@ -1430,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)  	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {  		int ret = wait_consider_task(wo, 1, p); +  		if (ret)  			return ret;  	} diff --git a/kernel/fork.c b/kernel/fork.c index 6a13c46cd87d..1380d8ace334 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)  		goto free_ti;  	tsk->stack = ti; +#ifdef CONFIG_SECCOMP +	/* +	 * We must handle setting up seccomp filters once we're under +	 * the sighand lock in case orig has changed between now and +	 * then. Until then, filter must be NULL to avoid messing up +	 * the usage counts on the error path calling free_task. 
+	 */ +	tsk->seccomp.filter = NULL; +#endif  	setup_thread_stack(tsk, orig);  	clear_user_return_notifier(tsk); @@ -365,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)  	 */  	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); -	mm->locked_vm = 0; -	mm->mmap = NULL; -	mm->vmacache_seqnum = 0; -	mm->map_count = 0; -	cpumask_clear(mm_cpumask(mm)); -	mm->mm_rb = RB_ROOT; +	mm->total_vm = oldmm->total_vm; +	mm->shared_vm = oldmm->shared_vm; +	mm->exec_vm = oldmm->exec_vm; +	mm->stack_vm = oldmm->stack_vm; +  	rb_link = &mm->mm_rb.rb_node;  	rb_parent = NULL;  	pprev = &mm->mmap; @@ -421,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)  				atomic_dec(&inode->i_writecount);  			mutex_lock(&mapping->i_mmap_mutex);  			if (tmp->vm_flags & VM_SHARED) -				mapping->i_mmap_writable++; +				atomic_inc(&mapping->i_mmap_writable);  			flush_dcache_mmap_lock(mapping);  			/* insert tmp into the share list, just after mpnt */  			if (unlikely(tmp->vm_flags & VM_NONLINEAR)) @@ -527,19 +535,37 @@ static void mm_init_aio(struct mm_struct *mm)  #endif  } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG +	mm->owner = p; +#endif +} +  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)  { +	mm->mmap = NULL; +	mm->mm_rb = RB_ROOT; +	mm->vmacache_seqnum = 0;  	atomic_set(&mm->mm_users, 1);  	atomic_set(&mm->mm_count, 1);  	init_rwsem(&mm->mmap_sem);  	INIT_LIST_HEAD(&mm->mmlist);  	mm->core_state = NULL;  	atomic_long_set(&mm->nr_ptes, 0); +	mm->map_count = 0; +	mm->locked_vm = 0; +	mm->pinned_vm = 0;  	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));  	spin_lock_init(&mm->page_table_lock); +	mm_init_cpumask(mm);  	mm_init_aio(mm);  	mm_init_owner(mm, p); +	mmu_notifier_mm_init(mm);  	clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS +	mm->pmd_huge_pte = NULL; +#endif  	if (current->mm) {  		mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -549,11 +575,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)  		mm->def_flags = 0;  	} -	if (likely(!mm_alloc_pgd(mm))) { -		mmu_notifier_mm_init(mm); -		return mm; -	} +	if (mm_alloc_pgd(mm)) +		goto fail_nopgd; + +	if (init_new_context(p, mm)) +		goto fail_nocontext; +	return mm; + +fail_nocontext: +	mm_free_pgd(mm); +fail_nopgd:  	free_mm(mm);  	return NULL;  } @@ -587,7 +619,6 @@ struct mm_struct *mm_alloc(void)  		return NULL;  	memset(mm, 0, sizeof(*mm)); -	mm_init_cpumask(mm);  	return mm_init(mm, current);  } @@ -819,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)  		goto fail_nomem;  	memcpy(mm, oldmm, sizeof(*mm)); -	mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS -	mm->pmd_huge_pte = NULL; -#endif  	if (!mm_init(mm, tsk))  		goto fail_nomem; -	if (init_new_context(tsk, mm)) -		goto fail_nocontext; -  	dup_mm_exe_file(oldmm, mm);  	err = dup_mmap(mm, oldmm); @@ -851,15 +875,6 @@ free_pt:  fail_nomem:  	return NULL; - -fail_nocontext: -	/* -	 * If init_new_context() failed, we cannot use mmput() to free the mm -	 * because it calls destroy_context() -	 */ -	mm_free_pgd(mm); -	free_mm(mm); -	return NULL;  }  static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@ -1081,6 +1096,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)  	return 0;  } +static void copy_seccomp(struct task_struct *p) +{ +#ifdef CONFIG_SECCOMP +	/* +	 * Must be 
called with sighand->lock held, which is common to +	 * all threads in the group. Holding cred_guard_mutex is not +	 * needed because this new task is not yet running and cannot +	 * be racing exec. +	 */ +	BUG_ON(!spin_is_locked(&current->sighand->siglock)); + +	/* Ref-count the new filter user, and assign it. */ +	get_seccomp_filter(current); +	p->seccomp = current->seccomp; + +	/* +	 * Explicitly enable no_new_privs here in case it got set +	 * between the task_struct being duplicated and holding the +	 * sighand lock. The seccomp state and nnp must be in sync. +	 */ +	if (task_no_new_privs(current)) +		task_set_no_new_privs(p); + +	/* +	 * If the parent gained a seccomp mode after copying thread +	 * flags and between before we held the sighand lock, we have +	 * to manually enable the seccomp thread flag here. +	 */ +	if (p->seccomp.mode != SECCOMP_MODE_DISABLED) +		set_tsk_thread_flag(p, TIF_SECCOMP); +#endif +} +  SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)  {  	current->clear_child_tid = tidptr; @@ -1095,17 +1143,9 @@ static void rt_mutex_init_task(struct task_struct *p)  	p->pi_waiters = RB_ROOT;  	p->pi_waiters_leftmost = NULL;  	p->pi_blocked_on = NULL; -	p->pi_top_task = NULL;  #endif  } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ -	mm->owner = p; -} -#endif /* CONFIG_MEMCG */ -  /*   * Initialize POSIX timer handling for a single task.   */ @@ -1196,7 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,  		goto fork_out;  	ftrace_graph_init_task(p); -	get_seccomp_filter(p);  	rt_mutex_init_task(p); @@ -1262,9 +1301,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	posix_cpu_timers_init(p); -	do_posix_clock_monotonic_gettime(&p->start_time); -	p->real_start_time = p->start_time; -	monotonic_to_bootbased(&p->real_start_time); +	p->start_time = ktime_get_ns(); +	p->real_start_time = ktime_get_boot_ns();  	p->io_context = NULL;  	p->audit_context = NULL;  	if (clone_flags & CLONE_THREAD) @@ -1307,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,  #ifdef CONFIG_DEBUG_MUTEXES  	p->blocked_on = NULL; /* not blocked yet */  #endif -#ifdef CONFIG_MEMCG -	p->memcg_batch.do_batch = 0; -	p->memcg_batch.memcg = NULL; -#endif  #ifdef CONFIG_BCACHE  	p->sequential_io	= 0;  	p->sequential_io_avg	= 0; @@ -1328,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	if (retval)  		goto bad_fork_cleanup_policy;  	/* copy all the process information */ +	shm_init_task(p);  	retval = copy_semundo(clone_flags, p);  	if (retval)  		goto bad_fork_cleanup_audit; @@ -1437,6 +1472,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	spin_lock(&current->sighand->siglock);  	/* +	 * Copy seccomp details explicitly here, in case they were changed +	 * before holding sighand lock. +	 */ +	copy_seccomp(p); + +	/*  	 * Process group and session signals need to be delivered to just the  	 * parent before the fork or both the parent and the child after the  	 * fork. Restart if a signal comes in before we add the new process to @@ -1873,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)  			 */  			exit_sem(current);  		} +		if (unshare_flags & CLONE_NEWIPC) { +			/* Orphan segments in old ns (see sem above). 
*/ +			exit_shm(current); +			shm_init_task(current); +		}  		if (new_nsproxy)  			switch_task_namespaces(current, new_nsproxy); diff --git a/kernel/futex.c b/kernel/futex.c index b632b5f3f094..d3a9d946d0b7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -792,94 +792,91 @@ void exit_pi_state_list(struct task_struct *curr)   * [10] There is no transient state which leaves owner and user space   *	TID out of sync.   */ -static int -lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, -		union futex_key *key, struct futex_pi_state **ps) + +/* + * Validate that the existing waiter has a pi_state and sanity check + * the pi_state against the user space value. If correct, attach to + * it. + */ +static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, +			      struct futex_pi_state **ps)  { -	struct futex_pi_state *pi_state = NULL; -	struct futex_q *this, *next; -	struct task_struct *p;  	pid_t pid = uval & FUTEX_TID_MASK; -	plist_for_each_entry_safe(this, next, &hb->chain, list) { -		if (match_futex(&this->key, key)) { -			/* -			 * Sanity check the waiter before increasing -			 * the refcount and attaching to it. -			 */ -			pi_state = this->pi_state; -			/* -			 * Userspace might have messed up non-PI and -			 * PI futexes [3] -			 */ -			if (unlikely(!pi_state)) -				return -EINVAL; +	/* +	 * Userspace might have messed up non-PI and PI futexes [3] +	 */ +	if (unlikely(!pi_state)) +		return -EINVAL; -			WARN_ON(!atomic_read(&pi_state->refcount)); +	WARN_ON(!atomic_read(&pi_state->refcount)); +	/* +	 * Handle the owner died case: +	 */ +	if (uval & FUTEX_OWNER_DIED) { +		/* +		 * exit_pi_state_list sets owner to NULL and wakes the +		 * topmost waiter. The task which acquires the +		 * pi_state->rt_mutex will fixup owner. +		 */ +		if (!pi_state->owner) {  			/* -			 * Handle the owner died case: +			 * No pi state owner, but the user space TID +			 * is not 0. Inconsistent state. [5]  			 */ -			if (uval & FUTEX_OWNER_DIED) { -				/* -				 * exit_pi_state_list sets owner to NULL and -				 * wakes the topmost waiter. The task which -				 * acquires the pi_state->rt_mutex will fixup -				 * owner. -				 */ -				if (!pi_state->owner) { -					/* -					 * No pi state owner, but the user -					 * space TID is not 0. Inconsistent -					 * state. [5] -					 */ -					if (pid) -						return -EINVAL; -					/* -					 * Take a ref on the state and -					 * return. [4] -					 */ -					goto out_state; -				} - -				/* -				 * If TID is 0, then either the dying owner -				 * has not yet executed exit_pi_state_list() -				 * or some waiter acquired the rtmutex in the -				 * pi state, but did not yet fixup the TID in -				 * user space. -				 * -				 * Take a ref on the state and return. [6] -				 */ -				if (!pid) -					goto out_state; -			} else { -				/* -				 * If the owner died bit is not set, -				 * then the pi_state must have an -				 * owner. [7] -				 */ -				if (!pi_state->owner) -					return -EINVAL; -			} - +			if (pid) +				return -EINVAL;  			/* -			 * Bail out if user space manipulated the -			 * futex value. If pi state exists then the -			 * owner TID must be the same as the user -			 * space TID. [9/10] +			 * Take a ref on the state and return success. 
[4]  			 */ -			if (pid != task_pid_vnr(pi_state->owner)) -				return -EINVAL; - -		out_state: -			atomic_inc(&pi_state->refcount); -			*ps = pi_state; -			return 0; +			goto out_state;  		} + +		/* +		 * If TID is 0, then either the dying owner has not +		 * yet executed exit_pi_state_list() or some waiter +		 * acquired the rtmutex in the pi state, but did not +		 * yet fixup the TID in user space. +		 * +		 * Take a ref on the state and return success. [6] +		 */ +		if (!pid) +			goto out_state; +	} else { +		/* +		 * If the owner died bit is not set, then the pi_state +		 * must have an owner. [7] +		 */ +		if (!pi_state->owner) +			return -EINVAL;  	}  	/* +	 * Bail out if user space manipulated the futex value. If pi +	 * state exists then the owner TID must be the same as the +	 * user space TID. [9/10] +	 */ +	if (pid != task_pid_vnr(pi_state->owner)) +		return -EINVAL; +out_state: +	atomic_inc(&pi_state->refcount); +	*ps = pi_state; +	return 0; +} + +/* + * Lookup the task for the TID provided from user space and attach to + * it after doing proper sanity checks. + */ +static int attach_to_pi_owner(u32 uval, union futex_key *key, +			      struct futex_pi_state **ps) +{ +	pid_t pid = uval & FUTEX_TID_MASK; +	struct futex_pi_state *pi_state; +	struct task_struct *p; + +	/*  	 * We are the first waiter - try to look up the real owner and attach  	 * the new pi_state to it, but bail out when TID = 0 [1]  	 */ @@ -920,7 +917,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  	pi_state = alloc_pi_state();  	/* -	 * Initialize the pi_mutex in locked state and make 'p' +	 * Initialize the pi_mutex in locked state and make @p  	 * the owner of it:  	 */  	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); @@ -940,6 +937,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,  	return 0;  } +static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, +			   union futex_key *key, struct futex_pi_state **ps) +{ +	struct futex_q *match = futex_top_waiter(hb, key); + +	/* +	 * If there is a waiter on that futex, validate it and +	 * attach to the pi_state when the validation succeeds. +	 */ +	if (match) +		return attach_to_pi_state(uval, match->pi_state, ps); + +	/* +	 * We are the first waiter - try to look up the owner based on +	 * @uval and attach to it. +	 */ +	return attach_to_pi_owner(uval, key, ps); +} + +static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) +{ +	u32 uninitialized_var(curval); + +	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) +		return -EFAULT; + +	/*If user space value changed, let the caller retry */ +	return curval != uval ? -EAGAIN : 0; +} +  /**   * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex   * @uaddr:		the pi futex user address @@ -963,113 +990,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,  				struct futex_pi_state **ps,  				struct task_struct *task, int set_waiters)  { -	int lock_taken, ret, force_take = 0; -	u32 uval, newval, curval, vpid = task_pid_vnr(task); - -retry: -	ret = lock_taken = 0; +	u32 uval, newval, vpid = task_pid_vnr(task); +	struct futex_q *match; +	int ret;  	/* -	 * To avoid races, we attempt to take the lock here again -	 * (by doing a 0 -> TID atomic cmpxchg), while holding all -	 * the locks. It will most likely not succeed. +	 * Read the user space value first so we can validate a few +	 * things before proceeding further.  	 
*/ -	newval = vpid; -	if (set_waiters) -		newval |= FUTEX_WAITERS; - -	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval))) +	if (get_futex_value_locked(&uval, uaddr))  		return -EFAULT;  	/*  	 * Detect deadlocks.  	 */ -	if ((unlikely((curval & FUTEX_TID_MASK) == vpid))) +	if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))  		return -EDEADLK;  	/* -	 * Surprise - we got the lock, but we do not trust user space at all. -	 */ -	if (unlikely(!curval)) { -		/* -		 * We verify whether there is kernel state for this -		 * futex. If not, we can safely assume, that the 0 -> -		 * TID transition is correct. If state exists, we do -		 * not bother to fixup the user space state as it was -		 * corrupted already. -		 */ -		return futex_top_waiter(hb, key) ? -EINVAL : 1; -	} - -	uval = curval; - -	/* -	 * Set the FUTEX_WAITERS flag, so the owner will know it has someone -	 * to wake at the next unlock. +	 * Lookup existing state first. If it exists, try to attach to +	 * its pi_state.  	 */ -	newval = curval | FUTEX_WAITERS; +	match = futex_top_waiter(hb, key); +	if (match) +		return attach_to_pi_state(uval, match->pi_state, ps);  	/* -	 * Should we force take the futex? See below. +	 * No waiter and user TID is 0. We are here because the +	 * waiters or the owner died bit is set or called from +	 * requeue_cmp_pi or for whatever reason something took the +	 * syscall.  	 */ -	if (unlikely(force_take)) { +	if (!(uval & FUTEX_TID_MASK)) {  		/* -		 * Keep the OWNER_DIED and the WAITERS bit and set the -		 * new TID value. +		 * We take over the futex. No other waiters and the user space +		 * TID is 0. We preserve the owner died bit.  		 */ -		newval = (curval & ~FUTEX_TID_MASK) | vpid; -		force_take = 0; -		lock_taken = 1; -	} +		newval = uval & FUTEX_OWNER_DIED; +		newval |= vpid; -	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) -		return -EFAULT; -	if (unlikely(curval != uval)) -		goto retry; +		/* The futex requeue_pi code can enforce the waiters bit */ +		if (set_waiters) +			newval |= FUTEX_WAITERS; + +		ret = lock_pi_update_atomic(uaddr, uval, newval); +		/* If the take over worked, return 1 */ +		return ret < 0 ? ret : 1; +	}  	/* -	 * We took the lock due to forced take over. +	 * First waiter. Set the waiters bit before attaching ourself to +	 * the owner. If owner tries to unlock, it will be forced into +	 * the kernel and blocked on hb->lock.  	 */ -	if (unlikely(lock_taken)) -		return 1; - +	newval = uval | FUTEX_WAITERS; +	ret = lock_pi_update_atomic(uaddr, uval, newval); +	if (ret) +		return ret;  	/* -	 * We dont have the lock. Look up the PI state (or create it if -	 * we are the first waiter): +	 * If the update of the user space value succeeded, we try to +	 * attach to the owner. If that fails, no harm done, we only +	 * set the FUTEX_WAITERS bit in the user space variable.  	 */ -	ret = lookup_pi_state(uval, hb, key, ps); - -	if (unlikely(ret)) { -		switch (ret) { -		case -ESRCH: -			/* -			 * We failed to find an owner for this -			 * futex. So we have no pi_state to block -			 * on. This can happen in two cases: -			 * -			 * 1) The owner died -			 * 2) A stale FUTEX_WAITERS bit -			 * -			 * Re-read the futex value. -			 */ -			if (get_futex_value_locked(&curval, uaddr)) -				return -EFAULT; - -			/* -			 * If the owner died or we have a stale -			 * WAITERS bit the owner TID in the user space -			 * futex is 0. 
-			 */ -			if (!(curval & FUTEX_TID_MASK)) { -				force_take = 1; -				goto retry; -			} -		default: -			break; -		} -	} - -	return ret; +	return attach_to_pi_owner(uval, key, ps);  }  /** @@ -1186,22 +1169,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)  	return 0;  } -static int unlock_futex_pi(u32 __user *uaddr, u32 uval) -{ -	u32 uninitialized_var(oldval); - -	/* -	 * There is no waiter, so we unlock the futex. The owner died -	 * bit has not to be preserved here. We are the owner: -	 */ -	if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0)) -		return -EFAULT; -	if (oldval != uval) -		return -EAGAIN; - -	return 0; -} -  /*   * Express the locking dependencies for lockdep:   */ @@ -1659,7 +1626,12 @@ retry_private:  				goto retry;  			goto out;  		case -EAGAIN: -			/* The owner was exiting, try again. */ +			/* +			 * Two reasons for this: +			 * - Owner is exiting and we just wait for the +			 *   exit to complete. +			 * - The user space value changed. +			 */  			double_unlock_hb(hb1, hb2);  			hb_waiters_dec(hb2);  			put_futex_key(&key2); @@ -1718,7 +1690,7 @@ retry_private:  			this->pi_state = pi_state;  			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,  							this->rt_waiter, -							this->task, 1); +							this->task);  			if (ret == 1) {  				/* We got the lock. */  				requeue_pi_wake_futex(this, &key2, hb2); @@ -2316,8 +2288,10 @@ retry_private:  			goto uaddr_faulted;  		case -EAGAIN:  			/* -			 * Task is exiting and we just wait for the -			 * exit to complete. +			 * Two reasons for this: +			 * - Task is exiting and we just wait for the +			 *   exit to complete. +			 * - The user space value changed.  			 */  			queue_unlock(hb);  			put_futex_key(&q.key); @@ -2337,9 +2311,9 @@ retry_private:  	/*  	 * Block on the PI mutex:  	 */ -	if (!trylock) -		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); -	else { +	if (!trylock) { +		ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); +	} else {  		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);  		/* Fixup the trylock return value: */  		ret = ret ? 0 : -EWOULDBLOCK; @@ -2401,10 +2375,10 @@ uaddr_faulted:   */  static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)  { -	struct futex_hash_bucket *hb; -	struct futex_q *this, *next; +	u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);  	union futex_key key = FUTEX_KEY_INIT; -	u32 uval, vpid = task_pid_vnr(current); +	struct futex_hash_bucket *hb; +	struct futex_q *match;  	int ret;  retry: @@ -2417,57 +2391,47 @@ retry:  		return -EPERM;  	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE); -	if (unlikely(ret != 0)) -		goto out; +	if (ret) +		return ret;  	hb = hash_futex(&key);  	spin_lock(&hb->lock);  	/* -	 * To avoid races, try to do the TID -> 0 atomic transition -	 * again. If it succeeds then we can return without waking -	 * anyone else up. We only try this if neither the waiters nor -	 * the owner died bit are set. -	 */ -	if (!(uval & ~FUTEX_TID_MASK) && -	    cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) -		goto pi_faulted; -	/* -	 * Rare case: we managed to release the lock atomically, -	 * no need to wake anyone else up: -	 */ -	if (unlikely(uval == vpid)) -		goto out_unlock; - -	/* -	 * Ok, other tasks may need to be woken up - check waiters -	 * and do the wakeup if necessary: +	 * Check waiters first. We do not trust user space values at +	 * all and we at least want to know if user space fiddled +	 * with the futex value instead of blindly unlocking.  	
 */ -	plist_for_each_entry_safe(this, next, &hb->chain, list) { -		if (!match_futex (&this->key, &key)) -			continue; -		ret = wake_futex_pi(uaddr, uval, this); +	match = futex_top_waiter(hb, &key); +	if (match) { +		ret = wake_futex_pi(uaddr, uval, match);  		/* -		 * The atomic access to the futex value -		 * generated a pagefault, so retry the -		 * user-access and the wakeup: +		 * The atomic access to the futex value generated a +		 * pagefault, so retry the user-access and the wakeup:  		 */  		if (ret == -EFAULT)  			goto pi_faulted;  		goto out_unlock;  	} +  	/* -	 * No waiters - kernel unlocks the futex: +	 * We have no kernel internal state, i.e. no waiters in the +	 * kernel. Waiters which are about to queue themselves are stuck +	 * on hb->lock. So we can safely ignore them. We do neither +	 * preserve the WAITERS bit not the OWNER_DIED one. We are the +	 * owner.  	 */ -	ret = unlock_futex_pi(uaddr, uval); -	if (ret == -EFAULT) +	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))  		goto pi_faulted; +	/* +	 * If uval has changed, let user space handle it. +	 */ +	ret = (curval == uval) ? 0 : -EAGAIN; +  out_unlock:  	spin_unlock(&hb->lock);  	put_futex_key(&key); - -out:  	return ret;  pi_faulted: @@ -2669,7 +2633,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,  		 */  		WARN_ON(!q.pi_state);  		pi_mutex = &q.pi_state->pi_mutex; -		ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); +		ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);  		debug_rt_mutex_free_waiter(&rt_waiter);  		spin_lock(q.lock_ptr); diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a76379..edf67c493a8e 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void)  err_remove:  	pr_err("init failed\n"); -	if (root_node.dentry) -		debugfs_remove(root_node.dentry); +	debugfs_remove(root_node.dentry);  	return rc;  } diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 452d6f2ba21d..cf80e7b0ddab 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class;  /*   * irq_map_generic_chip - Map a generic chip for an irq domain   */ -static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, -				irq_hw_number_t hw_irq) +int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, +			 irq_hw_number_t hw_irq)  {  	struct irq_data *data = irq_get_irq_data(virq);  	struct irq_domain_chip_generic *dgc = d->gc; @@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,  	irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);  	return 0;  } +EXPORT_SYMBOL_GPL(irq_map_generic_chip);  struct irq_domain_ops irq_generic_chip_ops = {  	.map	= irq_map_generic_chip, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index eb5e10e32e05..6534ff6ce02e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain)  }  EXPORT_SYMBOL_GPL(irq_set_default_host); -static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) +void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)  {  	struct irq_data *irq_data = irq_get_irq_data(irq);  	irq_hw_number_t hwirq; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index a82170e2fa78..e6bcbe756663 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -16,11 +16,12 @@  #include 
<linux/tick.h>  #include <linux/cpu.h>  #include <linux/notifier.h> +#include <linux/smp.h>  #include <asm/processor.h> -static DEFINE_PER_CPU(struct llist_head, irq_work_list); -static DEFINE_PER_CPU(int, irq_work_raised); +static DEFINE_PER_CPU(struct llist_head, raised_list); +static DEFINE_PER_CPU(struct llist_head, lazy_list);  /*   * Claim the entry so that no one else will poke at it. @@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)  	 */  } +#ifdef CONFIG_SMP  /* - * Enqueue the irq_work @entry unless it's already pending + * Enqueue the irq_work @work on @cpu unless it's already pending   * somewhere.   *   * Can be re-enqueued while the callback is still in progress.   */ +bool irq_work_queue_on(struct irq_work *work, int cpu) +{ +	/* All work should have been flushed before going offline */ +	WARN_ON_ONCE(cpu_is_offline(cpu)); + +	/* Arch remote IPI send/receive backend aren't NMI safe */ +	WARN_ON_ONCE(in_nmi()); + +	/* Only queue if not already pending */ +	if (!irq_work_claim(work)) +		return false; + +	if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) +		arch_send_call_function_single_ipi(cpu); + +	return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue_on); +#endif + +/* Enqueue the irq work @work on the current CPU */  bool irq_work_queue(struct irq_work *work)  {  	/* Only queue if not already pending */ @@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)  	/* Queue the entry and raise the IPI if needed. */  	preempt_disable(); -	llist_add(&work->llnode, &__get_cpu_var(irq_work_list)); - -	/* -	 * If the work is not "lazy" or the tick is stopped, raise the irq -	 * work interrupt (if supported by the arch), otherwise, just wait -	 * for the next tick. -	 */ -	if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) { -		if (!this_cpu_cmpxchg(irq_work_raised, 0, 1)) +	/* If the work is "lazy", handle it from next tick if any */ +	if (work->flags & IRQ_WORK_LAZY) { +		if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && +		    tick_nohz_tick_stopped()) +			arch_irq_work_raise(); +	} else { +		if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))  			arch_irq_work_raise();  	} @@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);  bool irq_work_needs_cpu(void)  { -	struct llist_head *this_list; +	struct llist_head *raised, *lazy; -	this_list = &__get_cpu_var(irq_work_list); -	if (llist_empty(this_list)) +	raised = &__get_cpu_var(raised_list); +	lazy = &__get_cpu_var(lazy_list); +	if (llist_empty(raised) && llist_empty(lazy))  		return false;  	/* All work should have been flushed before going offline */ @@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void)  	return true;  } -static void __irq_work_run(void) +static void irq_work_run_list(struct llist_head *list)  {  	unsigned long flags;  	struct irq_work *work; -	struct llist_head *this_list;  	struct llist_node *llnode; +	BUG_ON(!irqs_disabled()); -	/* -	 * Reset the "raised" state right before we check the list because -	 * an NMI may enqueue after we find the list empty from the runner. -	 */ -	__this_cpu_write(irq_work_raised, 0); -	barrier(); - -	this_list = &__get_cpu_var(irq_work_list); -	if (llist_empty(this_list)) +	if (llist_empty(list))  		return; -	BUG_ON(!irqs_disabled()); - -	llnode = llist_del_all(this_list); +	llnode = llist_del_all(list);  	while (llnode != NULL) {  		work = llist_entry(llnode, struct irq_work, llnode); @@ -149,13 +161,13 @@ static void __irq_work_run(void)  }  /* - * Run the irq_work entries on this cpu. 
Requires to be ran from hardirq - * context with local IRQs disabled. + * hotplug calls this through: + *  hotplug_cfd() -> flush_smp_call_function_queue()   */  void irq_work_run(void)  { -	BUG_ON(!in_irq()); -	__irq_work_run(); +	irq_work_run_list(&__get_cpu_var(raised_list)); +	irq_work_run_list(&__get_cpu_var(lazy_list));  }  EXPORT_SYMBOL_GPL(irq_work_run); @@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work)  		cpu_relax();  }  EXPORT_SYMBOL_GPL(irq_work_sync); - -#ifdef CONFIG_HOTPLUG_CPU -static int irq_work_cpu_notify(struct notifier_block *self, -			       unsigned long action, void *hcpu) -{ -	long cpu = (long)hcpu; - -	switch (action) { -	case CPU_DYING: -		/* Called from stop_machine */ -		if (WARN_ON_ONCE(cpu != smp_processor_id())) -			break; -		__irq_work_run(); -		break; -	default: -		break; -	} -	return NOTIFY_OK; -} - -static struct notifier_block cpu_notify; - -static __init int irq_work_init_cpu_notifier(void) -{ -	cpu_notify.notifier_call = irq_work_cpu_notify; -	cpu_notify.priority = 0; -	register_cpu_notifier(&cpu_notify); -	return 0; -} -device_initcall(irq_work_init_cpu_notifier); - -#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37dac3a..ae5167087845 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address,  	address += symbol_offset;  	name = kallsyms_lookup(address, &size, &offset, &modname, buffer);  	if (!name) -		return sprintf(buffer, "0x%lx", address); +		return sprintf(buffer, "0x%lx", address - symbol_offset);  	if (name != buffer)  		strcpy(buffer, name); diff --git a/kernel/kexec.c b/kernel/kexec.c index 4b8f0c925884..0b49a0a58102 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@   * Version 2.  See the file COPYING for more details.   */ +#define pr_fmt(fmt)	"kexec: " fmt +  #include <linux/capability.h>  #include <linux/mm.h>  #include <linux/file.h> @@ -40,6 +42,9 @@  #include <asm/io.h>  #include <asm/sections.h> +#include <crypto/hash.h> +#include <crypto/sha.h> +  /* Per cpu memory for storing cpu states in case of system crash. */  note_buf_t __percpu *crash_notes; @@ -52,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);  /* Flag to indicate we are going to kexec a new kernel */  bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. 
+ */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); +  /* Location of the reserved area for the crash kernel */  struct resource crashk_res = {  	.name  = "Crash kernel", @@ -125,45 +139,27 @@ static struct page *kimage_alloc_page(struct kimage *image,  				       gfp_t gfp_mask,  				       unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, -			   unsigned long nr_segments, -			   struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, +				  unsigned long nr_segments, +				  struct kexec_segment __user *segments)  { +	int ret;  	size_t segment_bytes; -	struct kimage *image; -	unsigned long i; -	int result; - -	/* Allocate a controlling structure */ -	result = -ENOMEM; -	image = kzalloc(sizeof(*image), GFP_KERNEL); -	if (!image) -		goto out; - -	image->head = 0; -	image->entry = &image->head; -	image->last_entry = &image->head; -	image->control_page = ~0; /* By default this does not apply */ -	image->start = entry; -	image->type = KEXEC_TYPE_DEFAULT; - -	/* Initialize the list of control pages */ -	INIT_LIST_HEAD(&image->control_pages); - -	/* Initialize the list of destination pages */ -	INIT_LIST_HEAD(&image->dest_pages); - -	/* Initialize the list of unusable pages */ -	INIT_LIST_HEAD(&image->unuseable_pages);  	/* Read in the segments */  	image->nr_segments = nr_segments;  	segment_bytes = nr_segments * sizeof(*segments); -	result = copy_from_user(image->segment, segments, segment_bytes); -	if (result) { -		result = -EFAULT; -		goto out; -	} +	ret = copy_from_user(image->segment, segments, segment_bytes); +	if (ret) +		ret = -EFAULT; + +	return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ +	int result, i; +	unsigned long nr_segments = image->nr_segments;  	/*  	 * Verify we have good destination addresses.  The caller is @@ -185,9 +181,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,  		mstart = image->segment[i].mem;  		mend   = mstart + image->segment[i].memsz;  		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) -			goto out; +			return result;  		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) -			goto out; +			return result;  	}  	/* Verify our destination addresses do not overlap. @@ -208,7 +204,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,  			pend   = pstart + image->segment[j].memsz;  			/* Do the segments overlap ? */  			if ((mend > pstart) && (mstart < pend)) -				goto out; +				return result;  		}  	} @@ -220,130 +216,401 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,  	result = -EINVAL;  	for (i = 0; i < nr_segments; i++) {  		if (image->segment[i].bufsz > image->segment[i].memsz) -			goto out; +			return result;  	} -	result = 0; -out: -	if (result == 0) -		*rimage = image; -	else -		kfree(image); +	/* +	 * Verify we have good destination addresses.  Normally +	 * the caller is responsible for making certain we don't +	 * attempt to load the new image into invalid or reserved +	 * areas of RAM.  But crash kernels are preloaded into a +	 * reserved area of ram.  We must ensure the addresses +	 * are in the reserved area otherwise preloading the +	 * kernel could corrupt things. 
+	 */ -	return result; +	if (image->type == KEXEC_TYPE_CRASH) { +		result = -EADDRNOTAVAIL; +		for (i = 0; i < nr_segments; i++) { +			unsigned long mstart, mend; + +			mstart = image->segment[i].mem; +			mend = mstart + image->segment[i].memsz - 1; +			/* Ensure we are within the crash kernel limits */ +			if ((mstart < crashk_res.start) || +			    (mend > crashk_res.end)) +				return result; +		} +	} +	return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ +	struct kimage *image; + +	/* Allocate a controlling structure */ +	image = kzalloc(sizeof(*image), GFP_KERNEL); +	if (!image) +		return NULL; + +	image->head = 0; +	image->entry = &image->head; +	image->last_entry = &image->head; +	image->control_page = ~0; /* By default this does not apply */ +	image->type = KEXEC_TYPE_DEFAULT; + +	/* Initialize the list of control pages */ +	INIT_LIST_HEAD(&image->control_pages); + +	/* Initialize the list of destination pages */ +	INIT_LIST_HEAD(&image->dest_pages); + +	/* Initialize the list of unusable pages */ +	INIT_LIST_HEAD(&image->unusable_pages); + +	return image;  }  static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, -				unsigned long nr_segments, -				struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, +			     unsigned long nr_segments, +			     struct kexec_segment __user *segments, +			     unsigned long flags)  { -	int result; +	int ret;  	struct kimage *image; +	bool kexec_on_panic = flags & KEXEC_ON_CRASH; + +	if (kexec_on_panic) { +		/* Verify we have a valid entry point */ +		if ((entry < crashk_res.start) || (entry > crashk_res.end)) +			return -EADDRNOTAVAIL; +	}  	/* Allocate and initialize a controlling structure */ -	image = NULL; -	result = do_kimage_alloc(&image, entry, nr_segments, segments); -	if (result) -		goto out; +	image = do_kimage_alloc_init(); +	if (!image) +		return -ENOMEM; + +	image->start = entry; + +	ret = copy_user_segment_list(image, nr_segments, segments); +	if (ret) +		goto out_free_image; + +	ret = sanity_check_segment_list(image); +	if (ret) +		goto out_free_image; + +	 /* Enable the special crash kernel control page allocation policy. */ +	if (kexec_on_panic) { +		image->control_page = crashk_res.start; +		image->type = KEXEC_TYPE_CRASH; +	}  	/*  	 * Find a location for the control code buffer, and add it  	 * the vector of segments so that it's pages will also be  	 * counted as destination pages.  	 
*/ -	result = -ENOMEM; +	ret = -ENOMEM;  	image->control_code_page = kimage_alloc_control_pages(image,  					   get_order(KEXEC_CONTROL_PAGE_SIZE));  	if (!image->control_code_page) {  		pr_err("Could not allocate control_code_buffer\n"); -		goto out_free; +		goto out_free_image;  	} -	image->swap_page = kimage_alloc_control_pages(image, 0); -	if (!image->swap_page) { -		pr_err("Could not allocate swap buffer\n"); -		goto out_free; +	if (!kexec_on_panic) { +		image->swap_page = kimage_alloc_control_pages(image, 0); +		if (!image->swap_page) { +			pr_err("Could not allocate swap buffer\n"); +			goto out_free_control_pages; +		}  	}  	*rimage = image;  	return 0; - -out_free: +out_free_control_pages:  	kimage_free_page_list(&image->control_pages); +out_free_image:  	kfree(image); -out: -	return result; +	return ret;  } -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, -				unsigned long nr_segments, -				struct kexec_segment __user *segments) +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)  { -	int result; -	struct kimage *image; -	unsigned long i; +	struct fd f = fdget(fd); +	int ret; +	struct kstat stat; +	loff_t pos; +	ssize_t bytes = 0; -	image = NULL; -	/* Verify we have a valid entry point */ -	if ((entry < crashk_res.start) || (entry > crashk_res.end)) { -		result = -EADDRNOTAVAIL; +	if (!f.file) +		return -EBADF; + +	ret = vfs_getattr(&f.file->f_path, &stat); +	if (ret) +		goto out; + +	if (stat.size > INT_MAX) { +		ret = -EFBIG;  		goto out;  	} -	/* Allocate and initialize a controlling structure */ -	result = do_kimage_alloc(&image, entry, nr_segments, segments); -	if (result) +	/* Don't hand 0 to vmalloc, it whines. */ +	if (stat.size == 0) { +		ret = -EINVAL;  		goto out; +	} -	/* Enable the special crash kernel control page -	 * allocation policy. -	 */ -	image->control_page = crashk_res.start; -	image->type = KEXEC_TYPE_CRASH; +	*buf = vmalloc(stat.size); +	if (!*buf) { +		ret = -ENOMEM; +		goto out; +	} -	/* -	 * Verify we have good destination addresses.  Normally -	 * the caller is responsible for making certain we don't -	 * attempt to load the new image into invalid or reserved -	 * areas of RAM.  But crash kernels are preloaded into a -	 * reserved area of ram.  We must ensure the addresses -	 * are in the reserved area otherwise preloading the -	 * kernel could corrupt things. 
-	 */ -	result = -EADDRNOTAVAIL; -	for (i = 0; i < nr_segments; i++) { -		unsigned long mstart, mend; +	pos = 0; +	while (pos < stat.size) { +		bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, +				    stat.size - pos); +		if (bytes < 0) { +			vfree(*buf); +			ret = bytes; +			goto out; +		} -		mstart = image->segment[i].mem; -		mend = mstart + image->segment[i].memsz - 1; -		/* Ensure we are within the crash kernel limits */ -		if ((mstart < crashk_res.start) || (mend > crashk_res.end)) -			goto out_free; +		if (bytes == 0) +			break; +		pos += bytes;  	} +	if (pos != stat.size) { +		ret = -EBADF; +		vfree(*buf); +		goto out; +	} + +	*buf_len = pos; +out: +	fdput(f); +	return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, +					 unsigned long buf_len) +{ +	return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ +	return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, +					unsigned long buf_len) +{ +	return -EKEYREJECTED; +} + +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, +				 unsigned int relsec) +{ +	pr_err("RELA relocation unsupported.\n"); +	return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, +			     unsigned int relsec) +{ +	pr_err("REL relocation unsupported.\n"); +	return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and comand line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ +	struct purgatory_info *pi = &image->purgatory_info; + +	vfree(image->kernel_buf); +	image->kernel_buf = NULL; + +	vfree(image->initrd_buf); +	image->initrd_buf = NULL; + +	kfree(image->cmdline_buf); +	image->cmdline_buf = NULL; + +	vfree(pi->purgatory_buf); +	pi->purgatory_buf = NULL; + +	vfree(pi->sechdrs); +	pi->sechdrs = NULL; + +	/* See if architecture has anything to cleanup post load */ +	arch_kimage_file_post_load_cleanup(image); +  	/* -	 * Find a location for the control code buffer, and add -	 * the vector of segments so that it's pages will also be -	 * counted as destination pages. +	 * Above call should have called into bootloader to free up +	 * any data stored in kimage->image_loader_data. It should +	 * be ok now to free it up.  	 */ -	result = -ENOMEM; +	kfree(image->image_loader_data); +	image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. 
Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, +			     const char __user *cmdline_ptr, +			     unsigned long cmdline_len, unsigned flags) +{ +	int ret = 0; +	void *ldata; + +	ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, +				&image->kernel_buf_len); +	if (ret) +		return ret; + +	/* Call arch image probe handlers */ +	ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, +					    image->kernel_buf_len); + +	if (ret) +		goto out; + +#ifdef CONFIG_KEXEC_VERIFY_SIG +	ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf, +					   image->kernel_buf_len); +	if (ret) { +		pr_debug("kernel signature verification failed.\n"); +		goto out; +	} +	pr_debug("kernel signature verification successful.\n"); +#endif +	/* It is possible that there no initramfs is being loaded */ +	if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { +		ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, +					&image->initrd_buf_len); +		if (ret) +			goto out; +	} + +	if (cmdline_len) { +		image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); +		if (!image->cmdline_buf) { +			ret = -ENOMEM; +			goto out; +		} + +		ret = copy_from_user(image->cmdline_buf, cmdline_ptr, +				     cmdline_len); +		if (ret) { +			ret = -EFAULT; +			goto out; +		} + +		image->cmdline_buf_len = cmdline_len; + +		/* command line should be a string with last byte null */ +		if (image->cmdline_buf[cmdline_len - 1] != '\0') { +			ret = -EINVAL; +			goto out; +		} +	} + +	/* Call arch image load handlers */ +	ldata = arch_kexec_kernel_image_load(image); + +	if (IS_ERR(ldata)) { +		ret = PTR_ERR(ldata); +		goto out; +	} + +	image->image_loader_data = ldata; +out: +	/* In case of error, free up all allocated memory in this function */ +	if (ret) +		kimage_file_post_load_cleanup(image); +	return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, +		       int initrd_fd, const char __user *cmdline_ptr, +		       unsigned long cmdline_len, unsigned long flags) +{ +	int ret; +	struct kimage *image; +	bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + +	image = do_kimage_alloc_init(); +	if (!image) +		return -ENOMEM; + +	image->file_mode = 1; + +	if (kexec_on_panic) { +		/* Enable special crash kernel control page alloc policy. 
*/ +		image->control_page = crashk_res.start; +		image->type = KEXEC_TYPE_CRASH; +	} + +	ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, +					   cmdline_ptr, cmdline_len, flags); +	if (ret) +		goto out_free_image; + +	ret = sanity_check_segment_list(image); +	if (ret) +		goto out_free_post_load_bufs; + +	ret = -ENOMEM;  	image->control_code_page = kimage_alloc_control_pages(image,  					   get_order(KEXEC_CONTROL_PAGE_SIZE));  	if (!image->control_code_page) {  		pr_err("Could not allocate control_code_buffer\n"); -		goto out_free; +		goto out_free_post_load_bufs; +	} + +	if (!kexec_on_panic) { +		image->swap_page = kimage_alloc_control_pages(image, 0); +		if (!image->swap_page) { +			pr_err(KERN_ERR "Could not allocate swap buffer\n"); +			goto out_free_control_pages; +		}  	}  	*rimage = image;  	return 0; - -out_free: +out_free_control_pages: +	kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: +	kimage_file_post_load_cleanup(image); +out_free_image:  	kfree(image); -out: -	return result; +	return ret;  }  static int kimage_is_destination_range(struct kimage *image, @@ -609,7 +876,7 @@ static void kimage_free_extra_pages(struct kimage *image)  	kimage_free_page_list(&image->dest_pages);  	/* Walk through and free any unusable pages I have cached */ -	kimage_free_page_list(&image->unuseable_pages); +	kimage_free_page_list(&image->unusable_pages);  }  static void kimage_terminate(struct kimage *image) @@ -663,6 +930,14 @@ static void kimage_free(struct kimage *image)  	/* Free the kexec control pages... */  	kimage_free_page_list(&image->control_pages); + +	/* +	 * Free up any temporary buffers allocated. This might hit if +	 * error occurred much later after buffer allocation. +	 */ +	if (image->file_mode) +		kimage_file_post_load_cleanup(image); +  	kfree(image);  } @@ -732,7 +1007,7 @@ static struct page *kimage_alloc_page(struct kimage *image,  		/* If the page cannot be used file it away */  		if (page_to_pfn(page) >  				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { -			list_add(&page->lru, &image->unuseable_pages); +			list_add(&page->lru, &image->unusable_pages);  			continue;  		}  		addr = page_to_pfn(page) << PAGE_SHIFT; @@ -791,10 +1066,14 @@ static int kimage_load_normal_segment(struct kimage *image,  	unsigned long maddr;  	size_t ubytes, mbytes;  	int result; -	unsigned char __user *buf; +	unsigned char __user *buf = NULL; +	unsigned char *kbuf = NULL;  	result = 0; -	buf = segment->buf; +	if (image->file_mode) +		kbuf = segment->kbuf; +	else +		buf = segment->buf;  	ubytes = segment->bufsz;  	mbytes = segment->memsz;  	maddr = segment->mem; @@ -826,7 +1105,11 @@ static int kimage_load_normal_segment(struct kimage *image,  				PAGE_SIZE - (maddr & ~PAGE_MASK));  		uchunk = min(ubytes, mchunk); -		result = copy_from_user(ptr, buf, uchunk); +		/* For file based kexec, source pages are in kernel memory */ +		if (image->file_mode) +			memcpy(ptr, kbuf, uchunk); +		else +			result = copy_from_user(ptr, buf, uchunk);  		kunmap(page);  		if (result) {  			result = -EFAULT; @@ -834,7 +1117,10 @@ static int kimage_load_normal_segment(struct kimage *image,  		}  		ubytes -= uchunk;  		maddr  += mchunk; -		buf    += mchunk; +		if (image->file_mode) +			kbuf += mchunk; +		else +			buf += mchunk;  		mbytes -= mchunk;  	}  out: @@ -851,10 +1137,14 @@ static int kimage_load_crash_segment(struct kimage *image,  	unsigned long maddr;  	size_t ubytes, mbytes;  	int result; -	unsigned char __user *buf; +	unsigned char __user *buf = NULL; +	unsigned char 
*kbuf = NULL;  	result = 0; -	buf = segment->buf; +	if (image->file_mode) +		kbuf = segment->kbuf; +	else +		buf = segment->buf;  	ubytes = segment->bufsz;  	mbytes = segment->memsz;  	maddr = segment->mem; @@ -877,7 +1167,12 @@ static int kimage_load_crash_segment(struct kimage *image,  			/* Zero the trailing part of the page */  			memset(ptr + uchunk, 0, mchunk - uchunk);  		} -		result = copy_from_user(ptr, buf, uchunk); + +		/* For file based kexec, source pages are in kernel memory */ +		if (image->file_mode) +			memcpy(ptr, kbuf, uchunk); +		else +			result = copy_from_user(ptr, buf, uchunk);  		kexec_flush_icache_page(page);  		kunmap(page);  		if (result) { @@ -886,7 +1181,10 @@ static int kimage_load_crash_segment(struct kimage *image,  		}  		ubytes -= uchunk;  		maddr  += mchunk; -		buf    += mchunk; +		if (image->file_mode) +			kbuf += mchunk; +		else +			buf += mchunk;  		mbytes -= mchunk;  	}  out: @@ -986,16 +1284,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,  		/* Loading another kernel to reboot into */  		if ((flags & KEXEC_ON_CRASH) == 0) -			result = kimage_normal_alloc(&image, entry, -							nr_segments, segments); +			result = kimage_alloc_init(&image, entry, nr_segments, +						   segments, flags);  		/* Loading another kernel to switch to if this one crashes */  		else if (flags & KEXEC_ON_CRASH) {  			/* Free any current crash dump kernel before  			 * we corrupt it.  			 */  			kimage_free(xchg(&kexec_crash_image, NULL)); -			result = kimage_crash_alloc(&image, entry, -						     nr_segments, segments); +			result = kimage_alloc_init(&image, entry, nr_segments, +						   segments, flags);  			crash_map_reserved_pages();  		}  		if (result) @@ -1077,6 +1375,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,  }  #endif +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, +		unsigned long, cmdline_len, const char __user *, cmdline_ptr, +		unsigned long, flags) +{ +	int ret = 0, i; +	struct kimage **dest_image, *image; + +	/* We only trust the superuser with rebooting the system. */ +	if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) +		return -EPERM; + +	/* Make sure we have a legal set of flags */ +	if (flags != (flags & KEXEC_FILE_FLAGS)) +		return -EINVAL; + +	image = NULL; + +	if (!mutex_trylock(&kexec_mutex)) +		return -EBUSY; + +	dest_image = &kexec_image; +	if (flags & KEXEC_FILE_ON_CRASH) +		dest_image = &kexec_crash_image; + +	if (flags & KEXEC_FILE_UNLOAD) +		goto exchange; + +	/* +	 * In case of crash, new kernel gets loaded in reserved region. It is +	 * same memory where old crash kernel might be loaded. Free any +	 * current crash dump kernel before we corrupt it. 
+	 */ +	if (flags & KEXEC_FILE_ON_CRASH) +		kimage_free(xchg(&kexec_crash_image, NULL)); + +	ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, +				     cmdline_len, flags); +	if (ret) +		goto out; + +	ret = machine_kexec_prepare(image); +	if (ret) +		goto out; + +	ret = kexec_calculate_store_digests(image); +	if (ret) +		goto out; + +	for (i = 0; i < image->nr_segments; i++) { +		struct kexec_segment *ksegment; + +		ksegment = &image->segment[i]; +		pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", +			 i, ksegment->buf, ksegment->bufsz, ksegment->mem, +			 ksegment->memsz); + +		ret = kimage_load_segment(image, &image->segment[i]); +		if (ret) +			goto out; +	} + +	kimage_terminate(image); + +	/* +	 * Free up any temporary buffers allocated which are not needed +	 * after image has been loaded +	 */ +	kimage_file_post_load_cleanup(image); +exchange: +	image = xchg(dest_image, image); +out: +	mutex_unlock(&kexec_mutex); +	kimage_free(image); +	return ret; +} +  void crash_kexec(struct pt_regs *regs)  {  	/* Take the kexec_mutex here to prevent sys_kexec_load @@ -1632,6 +2006,683 @@ static int __init crash_save_vmcoreinfo_init(void)  subsys_initcall(crash_save_vmcoreinfo_init); +static int __kexec_add_segment(struct kimage *image, char *buf, +			       unsigned long bufsz, unsigned long mem, +			       unsigned long memsz) +{ +	struct kexec_segment *ksegment; + +	ksegment = &image->segment[image->nr_segments]; +	ksegment->kbuf = buf; +	ksegment->bufsz = bufsz; +	ksegment->mem = mem; +	ksegment->memsz = memsz; +	image->nr_segments++; + +	return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, +				    struct kexec_buf *kbuf) +{ +	struct kimage *image = kbuf->image; +	unsigned long temp_start, temp_end; + +	temp_end = min(end, kbuf->buf_max); +	temp_start = temp_end - kbuf->memsz; + +	do { +		/* align down start */ +		temp_start = temp_start & (~(kbuf->buf_align - 1)); + +		if (temp_start < start || temp_start < kbuf->buf_min) +			return 0; + +		temp_end = temp_start + kbuf->memsz - 1; + +		/* +		 * Make sure this does not conflict with any of existing +		 * segments +		 */ +		if (kimage_is_destination_range(image, temp_start, temp_end)) { +			temp_start = temp_start - PAGE_SIZE; +			continue; +		} + +		/* We found a suitable memory range */ +		break; +	} while (1); + +	/* If we are here, we found a suitable memory range */ +	__kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, +			    kbuf->memsz); + +	/* Success, stop navigating through remaining System RAM ranges */ +	return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, +				     struct kexec_buf *kbuf) +{ +	struct kimage *image = kbuf->image; +	unsigned long temp_start, temp_end; + +	temp_start = max(start, kbuf->buf_min); + +	do { +		temp_start = ALIGN(temp_start, kbuf->buf_align); +		temp_end = temp_start + kbuf->memsz - 1; + +		if (temp_end > end || temp_end > kbuf->buf_max) +			return 0; +		/* +		 * Make sure this does not conflict with any of existing +		 * segments +		 */ +		if (kimage_is_destination_range(image, temp_start, temp_end)) { +			temp_start = temp_start + PAGE_SIZE; +			continue; +		} + +		/* We found a suitable memory range */ +		break; +	} while (1); + +	/* If we are here, we found a suitable memory range */ +	__kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, +			    kbuf->memsz); + +	/* Success, stop navigating through remaining System RAM ranges */ +	return 1; +} + 
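For illustration, here is a stand-alone user-space sketch of the bottom-up search that kexec_add_buffer() drives through locate_mem_hole_bottom_up(): align the candidate start, check that the buffer still fits inside the current RAM range, and step past conflicting areas one page at a time. It is only a model of the loop, not the kernel API: buf_min/buf_max are folded into a single range, kimage_is_destination_range() is replaced by a made-up conflicts() stub, and the helper name, range values and constants are invented for the example.

/*
 * Simplified model of locate_mem_hole_bottom_up().  The conflict test is
 * a stand-in for kimage_is_destination_range(); the range plays the role
 * of one System RAM resource clamped by buf_min/buf_max.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

struct range { unsigned long start, end; };	/* inclusive end */

/* Pretend an earlier segment already occupies [1M, 2M). */
static bool conflicts(unsigned long start, unsigned long end)
{
	return start < 0x200000UL && end >= 0x100000UL;
}

static bool find_hole_bottom_up(const struct range *r, unsigned long memsz,
				unsigned long align, unsigned long *load_addr)
{
	unsigned long start = ALIGN_UP(r->start, align);

	/* Walk upwards until the buffer fits without overlapping anything. */
	while (start + memsz - 1 <= r->end) {
		if (conflicts(start, start + memsz - 1)) {
			start = ALIGN_UP(start + PAGE_SIZE, align);
			continue;
		}
		*load_addr = start;
		return true;
	}
	return false;
}

int main(void)
{
	struct range ram = { 0x100000UL, 0xffffffUL };
	unsigned long addr;

	if (find_hole_bottom_up(&ram, 64 * PAGE_SIZE, PAGE_SIZE, &addr))
		printf("buffer placed at 0x%lx\n", addr);
	return 0;
}

The top-down variant added in the same hunk is the mirror image of this loop: it starts at the high end of the range and steps downwards by one page whenever the candidate area collides with an existing segment.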
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ +	struct kexec_buf *kbuf = (struct kexec_buf *)arg; +	unsigned long sz = end - start + 1; + +	/* Returning 0 will take to next memory range */ +	if (sz < kbuf->memsz) +		return 0; + +	if (end < kbuf->buf_min || start > kbuf->buf_max) +		return 0; + +	/* +	 * Allocate memory top down with-in ram range. Otherwise bottom up +	 * allocation. +	 */ +	if (kbuf->top_down) +		return locate_mem_hole_top_down(start, end, kbuf); +	return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, +		     unsigned long memsz, unsigned long buf_align, +		     unsigned long buf_min, unsigned long buf_max, +		     bool top_down, unsigned long *load_addr) +{ + +	struct kexec_segment *ksegment; +	struct kexec_buf buf, *kbuf; +	int ret; + +	/* Currently adding segment this way is allowed only in file mode */ +	if (!image->file_mode) +		return -EINVAL; + +	if (image->nr_segments >= KEXEC_SEGMENT_MAX) +		return -EINVAL; + +	/* +	 * Make sure we are not trying to add buffer after allocating +	 * control pages. All segments need to be placed first before +	 * any control pages are allocated. As control page allocation +	 * logic goes through list of segments to make sure there are +	 * no destination overlaps. +	 */ +	if (!list_empty(&image->control_pages)) { +		WARN_ON(1); +		return -EINVAL; +	} + +	memset(&buf, 0, sizeof(struct kexec_buf)); +	kbuf = &buf; +	kbuf->image = image; +	kbuf->buffer = buffer; +	kbuf->bufsz = bufsz; + +	kbuf->memsz = ALIGN(memsz, PAGE_SIZE); +	kbuf->buf_align = max(buf_align, PAGE_SIZE); +	kbuf->buf_min = buf_min; +	kbuf->buf_max = buf_max; +	kbuf->top_down = top_down; + +	/* Walk the RAM ranges and allocate a suitable range for the buffer */ +	if (image->type == KEXEC_TYPE_CRASH) +		ret = walk_iomem_res("Crash kernel", +				     IORESOURCE_MEM | IORESOURCE_BUSY, +				     crashk_res.start, crashk_res.end, kbuf, +				     locate_mem_hole_callback); +	else +		ret = walk_system_ram_res(0, -1, kbuf, +					  locate_mem_hole_callback); +	if (ret != 1) { +		/* A suitable memory range could not be found for buffer */ +		return -EADDRNOTAVAIL; +	} + +	/* Found a suitable memory range */ +	ksegment = &image->segment[image->nr_segments - 1]; +	*load_addr = ksegment->mem; +	return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ +	struct crypto_shash *tfm; +	struct shash_desc *desc; +	int ret = 0, i, j, zero_buf_sz, sha_region_sz; +	size_t desc_size, nullsz; +	char *digest; +	void *zero_buf; +	struct kexec_sha_region *sha_regions; +	struct purgatory_info *pi = &image->purgatory_info; + +	zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); +	zero_buf_sz = PAGE_SIZE; + +	tfm = crypto_alloc_shash("sha256", 0, 0); +	if (IS_ERR(tfm)) { +		ret = PTR_ERR(tfm); +		goto out; +	} + +	desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); +	desc = kzalloc(desc_size, GFP_KERNEL); +	if (!desc) { +		ret = -ENOMEM; +		goto out_free_tfm; +	} + +	sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); +	sha_regions = vzalloc(sha_region_sz); +	if (!sha_regions) +		goto out_free_desc; + +	desc->tfm   = tfm; +	desc->flags = 0; + +	ret = crypto_shash_init(desc); +	if (ret < 0) +		goto out_free_sha_regions; + +	digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); +	if 
(!digest) { +		ret = -ENOMEM; +		goto out_free_sha_regions; +	} + +	for (j = i = 0; i < image->nr_segments; i++) { +		struct kexec_segment *ksegment; + +		ksegment = &image->segment[i]; +		/* +		 * Skip purgatory as it will be modified once we put digest +		 * info in purgatory. +		 */ +		if (ksegment->kbuf == pi->purgatory_buf) +			continue; + +		ret = crypto_shash_update(desc, ksegment->kbuf, +					  ksegment->bufsz); +		if (ret) +			break; + +		/* +		 * Assume rest of the buffer is filled with zero and +		 * update digest accordingly. +		 */ +		nullsz = ksegment->memsz - ksegment->bufsz; +		while (nullsz) { +			unsigned long bytes = nullsz; + +			if (bytes > zero_buf_sz) +				bytes = zero_buf_sz; +			ret = crypto_shash_update(desc, zero_buf, bytes); +			if (ret) +				break; +			nullsz -= bytes; +		} + +		if (ret) +			break; + +		sha_regions[j].start = ksegment->mem; +		sha_regions[j].len = ksegment->memsz; +		j++; +	} + +	if (!ret) { +		ret = crypto_shash_final(desc, digest); +		if (ret) +			goto out_free_digest; +		ret = kexec_purgatory_get_set_symbol(image, "sha_regions", +						sha_regions, sha_region_sz, 0); +		if (ret) +			goto out_free_digest; + +		ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", +						digest, SHA256_DIGEST_SIZE, 0); +		if (ret) +			goto out_free_digest; +	} + +out_free_digest: +	kfree(digest); +out_free_sha_regions: +	vfree(sha_regions); +out_free_desc: +	kfree(desc); +out_free_tfm: +	kfree(tfm); +out: +	return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, +				  unsigned long max, int top_down) +{ +	struct purgatory_info *pi = &image->purgatory_info; +	unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; +	unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; +	unsigned char *buf_addr, *src; +	int i, ret = 0, entry_sidx = -1; +	const Elf_Shdr *sechdrs_c; +	Elf_Shdr *sechdrs = NULL; +	void *purgatory_buf = NULL; + +	/* +	 * sechdrs_c points to section headers in purgatory and are read +	 * only. No modifications allowed. +	 */ +	sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + +	/* +	 * We can not modify sechdrs_c[] and its fields. It is read only. +	 * Copy it over to a local copy where one can store some temporary +	 * data and free it at the end. We need to modify ->sh_addr and +	 * ->sh_offset fields to keep track of permanent and temporary +	 * locations of sections. +	 */ +	sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); +	if (!sechdrs) +		return -ENOMEM; + +	memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + +	/* +	 * We seem to have multiple copies of sections. First copy is which +	 * is embedded in kernel in read only section. Some of these sections +	 * will be copied to a temporary buffer and relocated. And these +	 * sections will finally be copied to their final destination at +	 * segment load time. +	 * +	 * Use ->sh_offset to reflect section address in memory. It will +	 * point to original read only copy if section is not allocatable. +	 * Otherwise it will point to temporary copy which will be relocated. +	 * +	 * Use ->sh_addr to contain final address of the section where it +	 * will go during execution time. 
+	 */ +	for (i = 0; i < pi->ehdr->e_shnum; i++) { +		if (sechdrs[i].sh_type == SHT_NOBITS) +			continue; + +		sechdrs[i].sh_offset = (unsigned long)pi->ehdr + +						sechdrs[i].sh_offset; +	} + +	/* +	 * Identify entry point section and make entry relative to section +	 * start. +	 */ +	entry = pi->ehdr->e_entry; +	for (i = 0; i < pi->ehdr->e_shnum; i++) { +		if (!(sechdrs[i].sh_flags & SHF_ALLOC)) +			continue; + +		if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) +			continue; + +		/* Make entry section relative */ +		if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && +		    ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > +		     pi->ehdr->e_entry)) { +			entry_sidx = i; +			entry -= sechdrs[i].sh_addr; +			break; +		} +	} + +	/* Determine how much memory is needed to load relocatable object. */ +	buf_align = 1; +	bss_align = 1; +	buf_sz = 0; +	bss_sz = 0; + +	for (i = 0; i < pi->ehdr->e_shnum; i++) { +		if (!(sechdrs[i].sh_flags & SHF_ALLOC)) +			continue; + +		align = sechdrs[i].sh_addralign; +		if (sechdrs[i].sh_type != SHT_NOBITS) { +			if (buf_align < align) +				buf_align = align; +			buf_sz = ALIGN(buf_sz, align); +			buf_sz += sechdrs[i].sh_size; +		} else { +			/* bss section */ +			if (bss_align < align) +				bss_align = align; +			bss_sz = ALIGN(bss_sz, align); +			bss_sz += sechdrs[i].sh_size; +		} +	} + +	/* Determine the bss padding required to align bss properly */ +	bss_pad = 0; +	if (buf_sz & (bss_align - 1)) +		bss_pad = bss_align - (buf_sz & (bss_align - 1)); + +	memsz = buf_sz + bss_pad + bss_sz; + +	/* Allocate buffer for purgatory */ +	purgatory_buf = vzalloc(buf_sz); +	if (!purgatory_buf) { +		ret = -ENOMEM; +		goto out; +	} + +	if (buf_align < bss_align) +		buf_align = bss_align; + +	/* Add buffer to segment list */ +	ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, +				buf_align, min, max, top_down, +				&pi->purgatory_load_addr); +	if (ret) +		goto out; + +	/* Load SHF_ALLOC sections */ +	buf_addr = purgatory_buf; +	load_addr = curr_load_addr = pi->purgatory_load_addr; +	bss_addr = load_addr + buf_sz + bss_pad; + +	for (i = 0; i < pi->ehdr->e_shnum; i++) { +		if (!(sechdrs[i].sh_flags & SHF_ALLOC)) +			continue; + +		align = sechdrs[i].sh_addralign; +		if (sechdrs[i].sh_type != SHT_NOBITS) { +			curr_load_addr = ALIGN(curr_load_addr, align); +			offset = curr_load_addr - load_addr; +			/* We already modifed ->sh_offset to keep src addr */ +			src = (char *) sechdrs[i].sh_offset; +			memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + +			/* Store load address and source address of section */ +			sechdrs[i].sh_addr = curr_load_addr; + +			/* +			 * This section got copied to temporary buffer. Update +			 * ->sh_offset accordingly. +			 */ +			sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + +			/* Advance to the next address */ +			curr_load_addr += sechdrs[i].sh_size; +		} else { +			bss_addr = ALIGN(bss_addr, align); +			sechdrs[i].sh_addr = bss_addr; +			bss_addr += sechdrs[i].sh_size; +		} +	} + +	/* Update entry point based on load address of text section */ +	if (entry_sidx >= 0) +		entry += sechdrs[entry_sidx].sh_addr; + +	/* Make kernel jump to purgatory after shutdown */ +	image->start = entry; + +	/* Used later to get/set symbol values */ +	pi->sechdrs = sechdrs; + +	/* +	 * Used later to identify which section is purgatory and skip it +	 * from checksumming. 
+	 */ +	pi->purgatory_buf = purgatory_buf; +	return ret; +out: +	vfree(sechdrs); +	vfree(purgatory_buf); +	return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ +	int i, ret; +	struct purgatory_info *pi = &image->purgatory_info; +	Elf_Shdr *sechdrs = pi->sechdrs; + +	/* Apply relocations */ +	for (i = 0; i < pi->ehdr->e_shnum; i++) { +		Elf_Shdr *section, *symtab; + +		if (sechdrs[i].sh_type != SHT_RELA && +		    sechdrs[i].sh_type != SHT_REL) +			continue; + +		/* +		 * For section of type SHT_RELA/SHT_REL, +		 * ->sh_link contains section header index of associated +		 * symbol table. And ->sh_info contains section header +		 * index of section to which relocations apply. +		 */ +		if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || +		    sechdrs[i].sh_link >= pi->ehdr->e_shnum) +			return -ENOEXEC; + +		section = &sechdrs[sechdrs[i].sh_info]; +		symtab = &sechdrs[sechdrs[i].sh_link]; + +		if (!(section->sh_flags & SHF_ALLOC)) +			continue; + +		/* +		 * symtab->sh_link contain section header index of associated +		 * string table. +		 */ +		if (symtab->sh_link >= pi->ehdr->e_shnum) +			/* Invalid section number? */ +			continue; + +		/* +		 * Respective archicture needs to provide support for applying +		 * relocations of type SHT_RELA/SHT_REL. +		 */ +		if (sechdrs[i].sh_type == SHT_RELA) +			ret = arch_kexec_apply_relocations_add(pi->ehdr, +							       sechdrs, i); +		else if (sechdrs[i].sh_type == SHT_REL) +			ret = arch_kexec_apply_relocations(pi->ehdr, +							   sechdrs, i); +		if (ret) +			return ret; +	} + +	return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, +			 unsigned long max, int top_down, +			 unsigned long *load_addr) +{ +	struct purgatory_info *pi = &image->purgatory_info; +	int ret; + +	if (kexec_purgatory_size <= 0) +		return -EINVAL; + +	if (kexec_purgatory_size < sizeof(Elf_Ehdr)) +		return -ENOEXEC; + +	pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + +	if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 +	    || pi->ehdr->e_type != ET_REL +	    || !elf_check_arch(pi->ehdr) +	    || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) +		return -ENOEXEC; + +	if (pi->ehdr->e_shoff >= kexec_purgatory_size +	    || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > +	    kexec_purgatory_size - pi->ehdr->e_shoff)) +		return -ENOEXEC; + +	ret = __kexec_load_purgatory(image, min, max, top_down); +	if (ret) +		return ret; + +	ret = kexec_apply_relocations(image); +	if (ret) +		goto out; + +	*load_addr = pi->purgatory_load_addr; +	return 0; +out: +	vfree(pi->sechdrs); +	vfree(pi->purgatory_buf); +	return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, +					    const char *name) +{ +	Elf_Sym *syms; +	Elf_Shdr *sechdrs; +	Elf_Ehdr *ehdr; +	int i, k; +	const char *strtab; + +	if (!pi->sechdrs || !pi->ehdr) +		return NULL; + +	sechdrs = pi->sechdrs; +	ehdr = pi->ehdr; + +	for (i = 0; i < ehdr->e_shnum; i++) { +		if (sechdrs[i].sh_type != SHT_SYMTAB) +			continue; + +		if (sechdrs[i].sh_link >= ehdr->e_shnum) +			/* Invalid strtab section number */ +			continue; +		strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; +		syms = (Elf_Sym *)sechdrs[i].sh_offset; + +		/* Go through symbols for a match */ +		for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { +			if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) +				continue; + +			if (strcmp(strtab + syms[k].st_name, name) != 0) +				continue; + +			if (syms[k].st_shndx == SHN_UNDEF || +			    
syms[k].st_shndx >= ehdr->e_shnum) { +				pr_debug("Symbol: %s has bad section index %d.\n", +						name, syms[k].st_shndx); +				return NULL; +			} + +			/* Found the symbol we are looking for */ +			return &syms[k]; +		} +	} + +	return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ +	struct purgatory_info *pi = &image->purgatory_info; +	Elf_Sym *sym; +	Elf_Shdr *sechdr; + +	sym = kexec_purgatory_find_symbol(pi, name); +	if (!sym) +		return ERR_PTR(-EINVAL); + +	sechdr = &pi->sechdrs[sym->st_shndx]; + +	/* +	 * Returns the address where symbol will finally be loaded after +	 * kexec_load_segment() +	 */ +	return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, +				   void *buf, unsigned int size, bool get_value) +{ +	Elf_Sym *sym; +	Elf_Shdr *sechdrs; +	struct purgatory_info *pi = &image->purgatory_info; +	char *sym_buf; + +	sym = kexec_purgatory_find_symbol(pi, name); +	if (!sym) +		return -EINVAL; + +	if (sym->st_size != size) { +		pr_err("symbol %s size mismatch: expected %lu actual %u\n", +		       name, (unsigned long)sym->st_size, size); +		return -EINVAL; +	} + +	sechdrs = pi->sechdrs; + +	if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { +		pr_err("symbol %s is in a bss section. Cannot %s\n", name, +		       get_value ? "get" : "set"); +		return -EINVAL; +	} + +	sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + +					sym->st_value; + +	if (get_value) +		memcpy((void *)buf, sym_buf, size); +	else +		memcpy((void *)sym_buf, buf, size); + +	return 0; +} +  /*   * Move into place and start executing a preloaded standalone   * executable.  If nothing was preloaded return an error. 
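Before the remaining per-subsystem changes, a stand-alone user-space sketch of the lookup that kexec_purgatory_find_symbol() performs may help: walk every SHT_SYMTAB section, resolve its string table through sh_link, and return the first defined global symbol with a matching name. The sketch uses plain Elf64 types from <elf.h> and mmap()s an object named on the command line, so it illustrates the algorithm rather than the kernel's Elf_* wrappers or its error handling; the program and function names are invented for the example.

/*
 * Minimal illustration of the symbol scan in kexec_purgatory_find_symbol():
 * for every SHT_SYMTAB section, locate its string table via sh_link and
 * compare the names of STB_GLOBAL symbols.  Undefined symbols are simply
 * skipped here, whereas the kernel variant treats a bad section index as
 * an error.  No ELF class/endianness validation is done - a native ELF64
 * relocatable object is assumed.
 */
#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static const Elf64_Sym *find_global_symbol(const Elf64_Ehdr *ehdr,
					   const char *name)
{
	const Elf64_Shdr *shdrs =
		(const Elf64_Shdr *)((const char *)ehdr + ehdr->e_shoff);
	int i;
	size_t k;

	for (i = 0; i < ehdr->e_shnum; i++) {
		const Elf64_Sym *syms;
		const char *strtab;
		size_t nsyms;

		if (shdrs[i].sh_type != SHT_SYMTAB ||
		    shdrs[i].sh_link >= ehdr->e_shnum)
			continue;

		syms = (const Elf64_Sym *)((const char *)ehdr +
					   shdrs[i].sh_offset);
		strtab = (const char *)ehdr + shdrs[shdrs[i].sh_link].sh_offset;
		nsyms = shdrs[i].sh_size / sizeof(Elf64_Sym);

		for (k = 0; k < nsyms; k++) {
			if (ELF64_ST_BIND(syms[k].st_info) != STB_GLOBAL)
				continue;
			if (syms[k].st_shndx == SHN_UNDEF)
				continue;
			if (!strcmp(strtab + syms[k].st_name, name))
				return &syms[k];
		}
	}
	return NULL;
}

int main(int argc, char **argv)
{
	struct stat st;
	const Elf64_Sym *sym;
	void *map;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <elf-object> <symbol>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0)
		return 1;

	map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	sym = find_global_symbol(map, argv[2]);
	if (sym)
		printf("%s: section %u, value 0x%llx, size %llu\n", argv[2],
		       (unsigned int)sym->st_shndx,
		       (unsigned long long)sym->st_value,
		       (unsigned long long)sym->st_size);
	else
		printf("%s: not found\n", argv[2]);

	munmap(map, st.st_size);
	close(fd);
	return 0;
}

kexec_purgatory_get_set_symbol() in the hunk above builds on exactly this lookup: once the symbol is found it memcpy()s to or from sechdrs[sym->st_shndx].sh_offset + sym->st_value, which is why purgatory symbols such as sha_regions and sha256_digest must live in initialized data rather than in a SHT_NOBITS (.bss) section.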
diff --git a/kernel/kthread.c b/kernel/kthread.c index c2390f41307b..ef483220e855 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,  	list_add_tail(&work->node, pos);  	work->worker = worker; -	if (likely(worker->task)) +	if (!worker->current_work && likely(worker->task))  		wake_up_process(worker->task);  } diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d24e4339b46d..88d0d4420ad2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg)  {  	printk(KERN_DEBUG "%s\n", bug_msg);  	printk(KERN_DEBUG "turning off the locking correctness validator.\n"); +#ifdef CONFIG_LOCK_STAT  	printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +#endif  }  static int save_trace(struct stack_trace *trace) diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c index be9ee1559fca..9887a905a762 100644 --- a/kernel/locking/mcs_spinlock.c +++ b/kernel/locking/mcs_spinlock.c @@ -1,6 +1,4 @@ -  #include <linux/percpu.h> -#include <linux/mutex.h>  #include <linux/sched.h>  #include "mcs_spinlock.h" @@ -79,7 +77,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,  				break;  		} -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  	}  	return next; @@ -120,7 +118,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)  		if (need_resched())  			goto unqueue; -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  	}  	return true; @@ -146,7 +144,7 @@ unqueue:  		if (smp_load_acquire(&node->locked))  			return true; -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  		/*  		 * Or we race against a concurrent unqueue()'s step-B, in which diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 74356dc0ce29..23e89c5930e9 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -27,7 +27,7 @@ struct mcs_spinlock {  #define arch_mcs_spin_lock_contended(l)					\  do {									\  	while (!(smp_load_acquire(l)))					\ -		arch_mutex_cpu_relax();					\ +		cpu_relax_lowlatency();					\  } while (0)  #endif @@ -104,7 +104,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)  			return;  		/* Wait until the next pointer is set */  		while (!(next = ACCESS_ONCE(node->next))) -			arch_mutex_cpu_relax(); +			cpu_relax_lowlatency();  	}  	/* Pass lock to next waiter. */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index acca2c1a3c5e..ae712b25e492 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -46,12 +46,6 @@  # include <asm/mutex.h>  #endif -/* - * A negative mutex count indicates that waiters are sleeping waiting for the - * mutex. - */ -#define	MUTEX_SHOW_NO_WAITER(mutex)	(atomic_read(&(mutex)->count) >= 0) -  void  __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)  { @@ -152,7 +146,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)  		if (need_resched())  			break; -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  	}  	rcu_read_unlock(); @@ -388,12 +382,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	/*  	 * Optimistic spinning.  	 * -	 * We try to spin for acquisition when we find that there are no -	 * pending waiters and the lock owner is currently running on a -	 * (different) CPU. 
-	 * -	 * The rationale is that if the lock owner is running, it is likely to -	 * release the lock soon. +	 * We try to spin for acquisition when we find that the lock owner +	 * is currently running on a (different) CPU and while we don't +	 * need to reschedule. The rationale is that if the lock owner is +	 * running, it is likely to release the lock soon.  	 *  	 * Since this needs the lock owner, and this mutex implementation  	 * doesn't track the owner atomically in the lock field, we need to @@ -440,7 +432,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		if (owner && !mutex_spin_on_owner(lock, owner))  			break; -		if ((atomic_read(&lock->count) == 1) && +		/* Try to acquire the mutex if it is unlocked. */ +		if (!mutex_is_locked(lock) &&  		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {  			lock_acquired(&lock->dep_map, ip);  			if (use_ww_ctx) { @@ -471,7 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		 * memory barriers as we'll eventually observe the right  		 * values at the cost of a few extra spins.  		 */ -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  	}  	osq_unlock(&lock->osq);  slowpath: @@ -485,8 +478,11 @@ slowpath:  #endif  	spin_lock_mutex(&lock->wait_lock, flags); -	/* once more, can we acquire the lock? */ -	if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) +	/* +	 * Once more, try to acquire the lock. Only try-lock the mutex if +	 * it is unlocked to reduce unnecessary xchg() operations. +	 */ +	if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))  		goto skip_wait;  	debug_mutex_lock_common(lock, &waiter); @@ -506,9 +502,10 @@ slowpath:  		 * it's unlocked. Later on, if we sleep, this is the  		 * operation that gives us the lock. We xchg it to -1, so  		 * that when we release the lock, we properly wake up the -		 * other waiters: +		 * other waiters. We only attempt the xchg if the count is +		 * non-negative in order to avoid unnecessary xchg operations:  		 */ -		if (MUTEX_SHOW_NO_WAITER(lock) && +		if (atomic_read(&lock->count) >= 0 &&  		    (atomic_xchg(&lock->count, -1) == 1))  			break; @@ -823,6 +820,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)  	unsigned long flags;  	int prev; +	/* No need to trylock if the mutex is locked. */ +	if (mutex_is_locked(lock)) +		return 0; +  	spin_lock_mutex(&lock->wait_lock, flags);  	prev = atomic_xchg(&lock->count, -1); diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fb5b8ac411a5..f956ede7f90d 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -20,7 +20,6 @@  #include <linux/cpumask.h>  #include <linux/percpu.h>  #include <linux/hardirq.h> -#include <linux/mutex.h>  #include <asm/qrwlock.h>  /** @@ -35,7 +34,7 @@ static __always_inline void  rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)  {  	while ((cnts & _QW_WMASK) == _QW_LOCKED) { -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  		cnts = smp_load_acquire((u32 *)&lock->cnts);  	}  } @@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock)  	 * to make sure that the write lock isn't taken.  	 
*/  	while (atomic_read(&lock->cnts) & _QW_WMASK) -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  	cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;  	rspin_until_writer_unlock(lock, cnts); @@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)  				    cnts | _QW_WAITING) == cnts))  			break; -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  	}  	/* When no more readers, set the locked flag */ @@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)  				    _QW_LOCKED) == _QW_WAITING))  			break; -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  	}  unlock:  	arch_spin_unlock(&lock->lock); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 49b2ed3dced8..62b6cee8ea7f 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task)   * the deadlock. We print when we return. act_waiter can be NULL in   * case of a remove waiter operation.   */ -void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, +void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, +			     struct rt_mutex_waiter *act_waiter,  			     struct rt_mutex *lock)  {  	struct task_struct *task; -	if (!debug_locks || detect || !act_waiter) +	if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)  		return;  	task = rt_mutex_owner(act_waiter->lock); diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index ab29b6a22669..d0519c3432b6 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock);  extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,  				      struct task_struct *powner);  extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, +extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, +				    struct rt_mutex_waiter *waiter,  				    struct rt_mutex *lock);  extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);  # define debug_rt_mutex_reset_waiter(w)			\  	do { (w)->deadlock_lock = NULL; } while (0) -static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, -						 int detect) +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, +						  enum rtmutex_chainwalk walk)  {  	return (waiter != NULL);  } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index fc605941b9b8..a0ea2a141b3b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task)  }  /* + * Deadlock detection is conditional: + * + * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted + * if the detect argument is == RT_MUTEX_FULL_CHAINWALK. + * + * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always + * conducted independent of the detect argument. + * + * If the waiter argument is NULL this indicates the deboost path and + * deadlock detection is disabled independent of the detect argument + * and the config settings. 
+ */ +static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, +					  enum rtmutex_chainwalk chwalk) +{ +	/* +	 * This is just a wrapper function for the following call, +	 * because debug_rt_mutex_detect_deadlock() smells like a magic +	 * debug feature and I wanted to keep the cond function in the +	 * main source file along with the comments instead of having +	 * two of the same in the headers. +	 */ +	return debug_rt_mutex_detect_deadlock(waiter, chwalk); +} + +/*   * Max number of times we'll walk the boosting chain:   */  int max_lock_depth = 1024; @@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)   * @top_task:	the current top waiter   *   * Returns 0 or -EDEADLK. + * + * Chain walk basics and protection scope + * + * [R] refcount on task + * [P] task->pi_lock held + * [L] rtmutex->wait_lock held + * + * Step	Description				Protected by + *	function arguments: + *	@task					[R] + *	@orig_lock if != NULL			@top_task is blocked on it + *	@next_lock				Unprotected. Cannot be + *						dereferenced. Only used for + *						comparison. + *	@orig_waiter if != NULL			@top_task is blocked on it + *	@top_task				current, or in case of proxy + *						locking protected by calling + *						code + *	again: + *	  loop_sanity_check(); + *	retry: + * [1]	  lock(task->pi_lock);			[R] acquire [P] + * [2]	  waiter = task->pi_blocked_on;		[P] + * [3]	  check_exit_conditions_1();		[P] + * [4]	  lock = waiter->lock;			[P] + * [5]	  if (!try_lock(lock->wait_lock)) {	[P] try to acquire [L] + *	    unlock(task->pi_lock);		release [P] + *	    goto retry; + *	  } + * [6]	  check_exit_conditions_2();		[P] + [L] + * [7]	  requeue_lock_waiter(lock, waiter);	[P] + [L] + * [8]	  unlock(task->pi_lock);		release [P] + *	  put_task_struct(task);		release [R] + * [9]	  check_exit_conditions_3();		[L] + * [10]	  task = owner(lock);			[L] + *	  get_task_struct(task);		[L] acquire [R] + *	  lock(task->pi_lock);			[L] acquire [P] + * [11]	  requeue_pi_waiter(tsk, waiters(lock));[P] + [L] + * [12]	  check_exit_conditions_4();		[P] + [L] + * [13]	  unlock(task->pi_lock);		release [P] + *	  unlock(lock->wait_lock);		release [L] + *	  goto again;   */  static int rt_mutex_adjust_prio_chain(struct task_struct *task, -				      int deadlock_detect, +				      enum rtmutex_chainwalk chwalk,  				      struct rt_mutex *orig_lock,  				      struct rt_mutex *next_lock,  				      struct rt_mutex_waiter *orig_waiter,  				      struct task_struct *top_task)  { -	struct rt_mutex *lock;  	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; -	int detect_deadlock, ret = 0, depth = 0; +	struct rt_mutex_waiter *prerequeue_top_waiter; +	int ret = 0, depth = 0; +	struct rt_mutex *lock; +	bool detect_deadlock;  	unsigned long flags; +	bool requeue = true; -	detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, -							 deadlock_detect); +	detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);  	/*  	 * The (de)boosting is a step by step approach with a lot of @@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,  	 * carefully whether things change under us.  	 */   again: +	/* +	 * We limit the lock chain length for each invocation. +	 */  	if (++depth > max_lock_depth) {  		static int prev_max; @@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,  		return -EDEADLK;  	} + +	/* +	 * We are fully preemptible here and only hold the refcount on +	 * @task. 
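Step [5] of the walk relies on a classic inversion-avoidance idiom: the natural order is wait_lock before pi_lock, but the walker already holds pi_lock, so it may only trylock wait_lock and must back off and retry on failure. A standalone sketch of the idiom with pthreads (names are illustrative, not kernel API):

#include <pthread.h>
#include <sched.h>

static void lock_in_reverse_order(pthread_mutex_t *pi_lock,
                                  pthread_mutex_t *wait_lock)
{
        for (;;) {
                pthread_mutex_lock(pi_lock);
                if (pthread_mutex_trylock(wait_lock) == 0)
                        return;                  /* both held without risking ABBA */
                pthread_mutex_unlock(pi_lock);   /* drop, give the other side room, */
                sched_yield();                   /* and retry the whole sequence     */
        }
}
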
So everything can have changed under us since the +	 * caller or our own code below (goto retry/again) dropped all +	 * locks. +	 */   retry:  	/* -	 * Task can not go away as we did a get_task() before ! +	 * [1] Task cannot go away as we did a get_task() before !  	 */  	raw_spin_lock_irqsave(&task->pi_lock, flags); +	/* +	 * [2] Get the waiter on which @task is blocked on. +	 */  	waiter = task->pi_blocked_on; + +	/* +	 * [3] check_exit_conditions_1() protected by task->pi_lock. +	 */ +  	/*  	 * Check whether the end of the boosting chain has been  	 * reached or the state of the chain has changed while we @@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,  			goto out_unlock_pi;  		/*  		 * If deadlock detection is off, we stop here if we -		 * are not the top pi waiter of the task. +		 * are not the top pi waiter of the task. If deadlock +		 * detection is enabled we continue, but stop the +		 * requeueing in the chain walk.  		 */ -		if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) -			goto out_unlock_pi; +		if (top_waiter != task_top_pi_waiter(task)) { +			if (!detect_deadlock) +				goto out_unlock_pi; +			else +				requeue = false; +		}  	}  	/* -	 * When deadlock detection is off then we check, if further -	 * priority adjustment is necessary. +	 * If the waiter priority is the same as the task priority +	 * then there is no further priority adjustment necessary.  If +	 * deadlock detection is off, we stop the chain walk. If its +	 * enabled we continue, but stop the requeueing in the chain +	 * walk.  	 */ -	if (!detect_deadlock && waiter->prio == task->prio) -		goto out_unlock_pi; +	if (waiter->prio == task->prio) { +		if (!detect_deadlock) +			goto out_unlock_pi; +		else +			requeue = false; +	} +	/* +	 * [4] Get the next lock +	 */  	lock = waiter->lock; +	/* +	 * [5] We need to trylock here as we are holding task->pi_lock, +	 * which is the reverse lock order versus the other rtmutex +	 * operations. +	 */  	if (!raw_spin_trylock(&lock->wait_lock)) {  		raw_spin_unlock_irqrestore(&task->pi_lock, flags);  		cpu_relax(); @@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,  	}  	/* +	 * [6] check_exit_conditions_2() protected by task->pi_lock and +	 * lock->wait_lock. +	 *  	 * Deadlock detection. If the lock is the same as the original  	 * lock which caused us to walk the lock chain or if the  	 * current lock is owned by the task which initiated the chain  	 * walk, we detected a deadlock.  	 */  	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { -		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); +		debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);  		raw_spin_unlock(&lock->wait_lock);  		ret = -EDEADLK;  		goto out_unlock_pi;  	} -	top_waiter = rt_mutex_top_waiter(lock); +	/* +	 * If we just follow the lock chain for deadlock detection, no +	 * need to do all the requeue operations. To avoid a truckload +	 * of conditionals around the various places below, just do the +	 * minimum chain walk checks. +	 */ +	if (!requeue) { +		/* +		 * No requeue[7] here. Just release @task [8] +		 */ +		raw_spin_unlock_irqrestore(&task->pi_lock, flags); +		put_task_struct(task); + +		/* +		 * [9] check_exit_conditions_3 protected by lock->wait_lock. +		 * If there is no owner of the lock, end of chain. +		 */ +		if (!rt_mutex_owner(lock)) { +			raw_spin_unlock(&lock->wait_lock); +			return 0; +		} + +		/* [10] Grab the next task, i.e. 
owner of @lock */ +		task = rt_mutex_owner(lock); +		get_task_struct(task); +		raw_spin_lock_irqsave(&task->pi_lock, flags); + +		/* +		 * No requeue [11] here. We just do deadlock detection. +		 * +		 * [12] Store whether owner is blocked +		 * itself. Decision is made after dropping the locks +		 */ +		next_lock = task_blocked_on_lock(task); +		/* +		 * Get the top waiter for the next iteration +		 */ +		top_waiter = rt_mutex_top_waiter(lock); + +		/* [13] Drop locks */ +		raw_spin_unlock_irqrestore(&task->pi_lock, flags); +		raw_spin_unlock(&lock->wait_lock); + +		/* If owner is not blocked, end of chain. */ +		if (!next_lock) +			goto out_put_task; +		goto again; +	} -	/* Requeue the waiter */ +	/* +	 * Store the current top waiter before doing the requeue +	 * operation on @lock. We need it for the boost/deboost +	 * decision below. +	 */ +	prerequeue_top_waiter = rt_mutex_top_waiter(lock); + +	/* [7] Requeue the waiter in the lock waiter list. */  	rt_mutex_dequeue(lock, waiter);  	waiter->prio = task->prio;  	rt_mutex_enqueue(lock, waiter); -	/* Release the task */ +	/* [8] Release the task */  	raw_spin_unlock_irqrestore(&task->pi_lock, flags); +	put_task_struct(task); + +	/* +	 * [9] check_exit_conditions_3 protected by lock->wait_lock. +	 * +	 * We must abort the chain walk if there is no lock owner even +	 * in the dead lock detection case, as we have nothing to +	 * follow here. This is the end of the chain we are walking. +	 */  	if (!rt_mutex_owner(lock)) {  		/* -		 * If the requeue above changed the top waiter, then we need -		 * to wake the new top waiter up to try to get the lock. +		 * If the requeue [7] above changed the top waiter, +		 * then we need to wake the new top waiter up to try +		 * to get the lock.  		 */ - -		if (top_waiter != rt_mutex_top_waiter(lock)) +		if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))  			wake_up_process(rt_mutex_top_waiter(lock)->task);  		raw_spin_unlock(&lock->wait_lock); -		goto out_put_task; +		return 0;  	} -	put_task_struct(task); -	/* Grab the next task */ +	/* [10] Grab the next task, i.e. the owner of @lock */  	task = rt_mutex_owner(lock);  	get_task_struct(task);  	raw_spin_lock_irqsave(&task->pi_lock, flags); +	/* [11] requeue the pi waiters if necessary */  	if (waiter == rt_mutex_top_waiter(lock)) { -		/* Boost the owner */ -		rt_mutex_dequeue_pi(task, top_waiter); +		/* +		 * The waiter became the new top (highest priority) +		 * waiter on the lock. Replace the previous top waiter +		 * in the owner tasks pi waiters list with this waiter +		 * and adjust the priority of the owner. +		 */ +		rt_mutex_dequeue_pi(task, prerequeue_top_waiter);  		rt_mutex_enqueue_pi(task, waiter);  		__rt_mutex_adjust_prio(task); -	} else if (top_waiter == waiter) { -		/* Deboost the owner */ +	} else if (prerequeue_top_waiter == waiter) { +		/* +		 * The waiter was the top waiter on the lock, but is +		 * no longer the top prority waiter. Replace waiter in +		 * the owner tasks pi waiters list with the new top +		 * (highest priority) waiter and adjust the priority +		 * of the owner. +		 * The new top waiter is stored in @waiter so that +		 * @waiter == @top_waiter evaluates to true below and +		 * we continue to deboost the rest of the chain. +		 */  		rt_mutex_dequeue_pi(task, waiter);  		waiter = rt_mutex_top_waiter(lock);  		rt_mutex_enqueue_pi(task, waiter);  		__rt_mutex_adjust_prio(task); +	} else { +		/* +		 * Nothing changed. No need to do any priority +		 * adjustment. 
+		 */  	}  	/* +	 * [12] check_exit_conditions_4() protected by task->pi_lock +	 * and lock->wait_lock. The actual decisions are made after we +	 * dropped the locks. +	 *  	 * Check whether the task which owns the current lock is pi  	 * blocked itself. If yes we store a pointer to the lock for  	 * the lock chain change detection above. After we dropped  	 * task->pi_lock next_lock cannot be dereferenced anymore.  	 */  	next_lock = task_blocked_on_lock(task); +	/* +	 * Store the top waiter of @lock for the end of chain walk +	 * decision below. +	 */ +	top_waiter = rt_mutex_top_waiter(lock); +	/* [13] Drop the locks */  	raw_spin_unlock_irqrestore(&task->pi_lock, flags); - -	top_waiter = rt_mutex_top_waiter(lock);  	raw_spin_unlock(&lock->wait_lock);  	/* +	 * Make the actual exit decisions [12], based on the stored +	 * values. +	 *  	 * We reached the end of the lock chain. Stop right here. No  	 * point to go back just to figure that out.  	 */  	if (!next_lock)  		goto out_put_task; +	/* +	 * If the current waiter is not the top waiter on the lock, +	 * then we can stop the chain walk here if we are not in full +	 * deadlock detection mode. +	 */  	if (!detect_deadlock && waiter != top_waiter)  		goto out_put_task; @@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,   *   * Must be called with lock->wait_lock held.   * - * @lock:   the lock to be acquired. - * @task:   the task which wants to acquire the lock - * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) + * @lock:   The lock to be acquired. + * @task:   The task which wants to acquire the lock + * @waiter: The waiter that is queued to the lock's wait list if the + *	    callsite called task_blocked_on_lock(), otherwise NULL   */  static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, -		struct rt_mutex_waiter *waiter) +				struct rt_mutex_waiter *waiter)  { +	unsigned long flags; +  	/* -	 * We have to be careful here if the atomic speedups are -	 * enabled, such that, when -	 *  - no other waiter is on the lock -	 *  - the lock has been released since we did the cmpxchg -	 * the lock can be released or taken while we are doing the -	 * checks and marking the lock with RT_MUTEX_HAS_WAITERS. +	 * Before testing whether we can acquire @lock, we set the +	 * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all +	 * other tasks which try to modify @lock into the slow path +	 * and they serialize on @lock->wait_lock. +	 * +	 * The RT_MUTEX_HAS_WAITERS bit can have a transitional state +	 * as explained at the top of this file if and only if:  	 * -	 * The atomic acquire/release aware variant of -	 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting -	 * the WAITERS bit, the atomic release / acquire can not -	 * happen anymore and lock->wait_lock protects us from the -	 * non-atomic case. +	 * - There is a lock owner. The caller must fixup the +	 *   transient state if it does a trylock or leaves the lock +	 *   function due to a signal or timeout.  	 * -	 * Note, that this might set lock->owner = -	 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended -	 * any more. This is fixed up when we take the ownership. -	 * This is the transitional state explained at the top of this file. +	 * - @task acquires the lock and there are no other +	 *   waiters. This is undone in rt_mutex_set_owner(@task) at +	 *   the end of this function.  	 */  	mark_rt_mutex_waiters(lock); +	/* +	 * If @lock has an owner, give up. 
+	 */  	if (rt_mutex_owner(lock))  		return 0;  	/* -	 * It will get the lock because of one of these conditions: -	 * 1) there is no waiter -	 * 2) higher priority than waiters -	 * 3) it is top waiter +	 * If @waiter != NULL, @task has already enqueued the waiter +	 * into @lock waiter list. If @waiter == NULL then this is a +	 * trylock attempt.  	 */ -	if (rt_mutex_has_waiters(lock)) { -		if (task->prio >= rt_mutex_top_waiter(lock)->prio) { -			if (!waiter || waiter != rt_mutex_top_waiter(lock)) -				return 0; -		} -	} - -	if (waiter || rt_mutex_has_waiters(lock)) { -		unsigned long flags; -		struct rt_mutex_waiter *top; - -		raw_spin_lock_irqsave(&task->pi_lock, flags); +	if (waiter) { +		/* +		 * If waiter is not the highest priority waiter of +		 * @lock, give up. +		 */ +		if (waiter != rt_mutex_top_waiter(lock)) +			return 0; -		/* remove the queued waiter. */ -		if (waiter) { -			rt_mutex_dequeue(lock, waiter); -			task->pi_blocked_on = NULL; -		} +		/* +		 * We can acquire the lock. Remove the waiter from the +		 * lock waiters list. +		 */ +		rt_mutex_dequeue(lock, waiter); +	} else {  		/* -		 * We have to enqueue the top waiter(if it exists) into -		 * task->pi_waiters list. +		 * If the lock has waiters already we check whether @task is +		 * eligible to take over the lock. +		 * +		 * If there are no other waiters, @task can acquire +		 * the lock.  @task->pi_blocked_on is NULL, so it does +		 * not need to be dequeued.  		 */  		if (rt_mutex_has_waiters(lock)) { -			top = rt_mutex_top_waiter(lock); -			rt_mutex_enqueue_pi(task, top); +			/* +			 * If @task->prio is greater than or equal to +			 * the top waiter priority (kernel view), +			 * @task lost. +			 */ +			if (task->prio >= rt_mutex_top_waiter(lock)->prio) +				return 0; + +			/* +			 * The current top waiter stays enqueued. We +			 * don't have to change anything in the lock +			 * waiters order. +			 */ +		} else { +			/* +			 * No waiters. Take the lock without the +			 * pi_lock dance.@task->pi_blocked_on is NULL +			 * and we have no waiters to enqueue in @task +			 * pi waiters list. +			 */ +			goto takeit;  		} -		raw_spin_unlock_irqrestore(&task->pi_lock, flags);  	} +	/* +	 * Clear @task->pi_blocked_on. Requires protection by +	 * @task->pi_lock. Redundant operation for the @waiter == NULL +	 * case, but conditionals are more expensive than a redundant +	 * store. +	 */ +	raw_spin_lock_irqsave(&task->pi_lock, flags); +	task->pi_blocked_on = NULL; +	/* +	 * Finish the lock acquisition. @task is the new owner. If +	 * other waiters exist we have to insert the highest priority +	 * waiter into @task->pi_waiters list. +	 */ +	if (rt_mutex_has_waiters(lock)) +		rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); +	raw_spin_unlock_irqrestore(&task->pi_lock, flags); + +takeit:  	/* We got the lock. */  	debug_rt_mutex_lock(lock); +	/* +	 * This either preserves the RT_MUTEX_HAS_WAITERS bit if there +	 * are still waiters or clears it. 
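The new comment explains that mark_rt_mutex_waiters() sets RT_MUTEX_HAS_WAITERS in @lock->owner so every fast path is diverted into the wait_lock-serialized slow path; the removed text notes that the atomic-aware variant does this with a cmpxchg loop. A hedged userspace sketch of that "OR a flag into a pointer-sized word" loop:

#include <stdatomic.h>
#include <stdint.h>

#define HAS_WAITERS 1UL     /* lowest bit of the owner word, as in rtmutex */

static void mark_waiters(_Atomic uintptr_t *owner)
{
        uintptr_t old = atomic_load_explicit(owner, memory_order_relaxed);

        /* On failure the CAS reloads 'old', so this simply retries until
         * the bit is set regardless of concurrent owner changes. */
        while (!atomic_compare_exchange_weak(owner, &old, old | HAS_WAITERS))
                ;
}
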
+	 */  	rt_mutex_set_owner(lock, task);  	rt_mutex_deadlock_account_lock(lock, task); @@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,  static int task_blocks_on_rt_mutex(struct rt_mutex *lock,  				   struct rt_mutex_waiter *waiter,  				   struct task_struct *task, -				   int detect_deadlock) +				   enum rtmutex_chainwalk chwalk)  {  	struct task_struct *owner = rt_mutex_owner(lock);  	struct rt_mutex_waiter *top_waiter = waiter; @@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,  		__rt_mutex_adjust_prio(owner);  		if (owner->pi_blocked_on)  			chain_walk = 1; -	} else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { +	} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {  		chain_walk = 1;  	} @@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,  	raw_spin_unlock(&lock->wait_lock); -	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, +	res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,  					 next_lock, waiter, task);  	raw_spin_lock(&lock->wait_lock); @@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock)  static void remove_waiter(struct rt_mutex *lock,  			  struct rt_mutex_waiter *waiter)  { -	int first = (waiter == rt_mutex_top_waiter(lock)); +	bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));  	struct task_struct *owner = rt_mutex_owner(lock); -	struct rt_mutex *next_lock = NULL; +	struct rt_mutex *next_lock;  	unsigned long flags;  	raw_spin_lock_irqsave(¤t->pi_lock, flags); @@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock,  	current->pi_blocked_on = NULL;  	raw_spin_unlock_irqrestore(¤t->pi_lock, flags); -	if (!owner) +	/* +	 * Only update priority if the waiter was the highest priority +	 * waiter of the lock and there is an owner to update. +	 */ +	if (!owner || !is_top_waiter)  		return; -	if (first) { - -		raw_spin_lock_irqsave(&owner->pi_lock, flags); +	raw_spin_lock_irqsave(&owner->pi_lock, flags); -		rt_mutex_dequeue_pi(owner, waiter); +	rt_mutex_dequeue_pi(owner, waiter); -		if (rt_mutex_has_waiters(lock)) { -			struct rt_mutex_waiter *next; +	if (rt_mutex_has_waiters(lock)) +		rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); -			next = rt_mutex_top_waiter(lock); -			rt_mutex_enqueue_pi(owner, next); -		} -		__rt_mutex_adjust_prio(owner); +	__rt_mutex_adjust_prio(owner); -		/* Store the lock on which owner is blocked or NULL */ -		next_lock = task_blocked_on_lock(owner); +	/* Store the lock on which owner is blocked or NULL */ +	next_lock = task_blocked_on_lock(owner); -		raw_spin_unlock_irqrestore(&owner->pi_lock, flags); -	} +	raw_spin_unlock_irqrestore(&owner->pi_lock, flags); +	/* +	 * Don't walk the chain, if the owner task is not blocked +	 * itself. +	 */  	if (!next_lock)  		return; @@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock,  	raw_spin_unlock(&lock->wait_lock); -	rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current); +	rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, +				   next_lock, NULL, current);  	raw_spin_lock(&lock->wait_lock);  } @@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)  	/* gets dropped in rt_mutex_adjust_prio_chain()! 
*/  	get_task_struct(task); -	rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task); +	rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, +				   next_lock, NULL, task);  }  /** @@ -902,7 +1159,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,  static int __sched  rt_mutex_slowlock(struct rt_mutex *lock, int state,  		  struct hrtimer_sleeper *timeout, -		  int detect_deadlock) +		  enum rtmutex_chainwalk chwalk)  {  	struct rt_mutex_waiter waiter;  	int ret = 0; @@ -928,7 +1185,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,  			timeout->task = NULL;  	} -	ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); +	ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);  	if (likely(!ret))  		ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); @@ -937,7 +1194,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,  	if (unlikely(ret)) {  		remove_waiter(lock, &waiter); -		rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter); +		rt_mutex_handle_deadlock(ret, chwalk, &waiter);  	}  	/* @@ -960,22 +1217,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,  /*   * Slow path try-lock function:   */ -static inline int -rt_mutex_slowtrylock(struct rt_mutex *lock) +static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)  { -	int ret = 0; +	int ret; + +	/* +	 * If the lock already has an owner we fail to get the lock. +	 * This can be done without taking the @lock->wait_lock as +	 * it is only being read, and this is a trylock anyway. +	 */ +	if (rt_mutex_owner(lock)) +		return 0; +	/* +	 * The mutex has currently no owner. Lock the wait lock and +	 * try to acquire the lock. +	 */  	raw_spin_lock(&lock->wait_lock); -	if (likely(rt_mutex_owner(lock) != current)) { +	ret = try_to_take_rt_mutex(lock, current, NULL); -		ret = try_to_take_rt_mutex(lock, current, NULL); -		/* -		 * try_to_take_rt_mutex() sets the lock waiters -		 * bit unconditionally. Clean this up. -		 */ -		fixup_rt_mutex_waiters(lock); -	} +	/* +	 * try_to_take_rt_mutex() sets the lock waiters bit +	 * unconditionally. Clean this up. 
+	 */ +	fixup_rt_mutex_waiters(lock);  	raw_spin_unlock(&lock->wait_lock); @@ -1053,30 +1319,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock)   */  static inline int  rt_mutex_fastlock(struct rt_mutex *lock, int state, -		  int detect_deadlock,  		  int (*slowfn)(struct rt_mutex *lock, int state,  				struct hrtimer_sleeper *timeout, -				int detect_deadlock)) +				enum rtmutex_chainwalk chwalk))  { -	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { +	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {  		rt_mutex_deadlock_account_lock(lock, current);  		return 0;  	} else -		return slowfn(lock, state, NULL, detect_deadlock); +		return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);  }  static inline int  rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, -			struct hrtimer_sleeper *timeout, int detect_deadlock, +			struct hrtimer_sleeper *timeout, +			enum rtmutex_chainwalk chwalk,  			int (*slowfn)(struct rt_mutex *lock, int state,  				      struct hrtimer_sleeper *timeout, -				      int detect_deadlock)) +				      enum rtmutex_chainwalk chwalk))  { -	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { +	if (chwalk == RT_MUTEX_MIN_CHAINWALK && +	    likely(rt_mutex_cmpxchg(lock, NULL, current))) {  		rt_mutex_deadlock_account_lock(lock, current);  		return 0;  	} else -		return slowfn(lock, state, timeout, detect_deadlock); +		return slowfn(lock, state, timeout, chwalk);  }  static inline int @@ -1109,54 +1376,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)  {  	might_sleep(); -	rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); +	rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);  }  EXPORT_SYMBOL_GPL(rt_mutex_lock);  /**   * rt_mutex_lock_interruptible - lock a rt_mutex interruptible   * - * @lock: 		the rt_mutex to be locked - * @detect_deadlock:	deadlock detection on/off + * @lock:		the rt_mutex to be locked   *   * Returns: - *  0 		on success - * -EINTR 	when interrupted by a signal - * -EDEADLK	when the lock would deadlock (when deadlock detection is on) + *  0		on success + * -EINTR	when interrupted by a signal   */ -int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, -						 int detect_deadlock) +int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)  {  	might_sleep(); -	return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, -				 detect_deadlock, rt_mutex_slowlock); +	return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);  }  EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); +/* + * Futex variant with full deadlock detection. 
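rt_mutex_fastlock() and rt_mutex_timed_fastlock() above reduce the uncontended case to a single cmpxchg of the owner word from NULL to current and fall back to the sleeping slow path otherwise. The generic shape of that split, as a sketch with C11 atomics (function name is illustrative):

#include <stdatomic.h>

static int fastlock(_Atomic(void *) *owner, void *current_task,
                    int (*slowfn)(void))
{
        void *expected = NULL;

        if (atomic_compare_exchange_strong(owner, &expected, current_task))
                return 0;       /* uncontended: ownership taken lock-free */
        return slowfn();        /* contended: enqueue, boost, possibly sleep */
}
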
+ */ +int rt_mutex_timed_futex_lock(struct rt_mutex *lock, +			      struct hrtimer_sleeper *timeout) +{ +	might_sleep(); + +	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, +				       RT_MUTEX_FULL_CHAINWALK, +				       rt_mutex_slowlock); +} +  /**   * rt_mutex_timed_lock - lock a rt_mutex interruptible   *			the timeout structure is provided   *			by the caller   * - * @lock: 		the rt_mutex to be locked + * @lock:		the rt_mutex to be locked   * @timeout:		timeout structure or NULL (no timeout) - * @detect_deadlock:	deadlock detection on/off   *   * Returns: - *  0 		on success - * -EINTR 	when interrupted by a signal + *  0		on success + * -EINTR	when interrupted by a signal   * -ETIMEDOUT	when the timeout expired - * -EDEADLK	when the lock would deadlock (when deadlock detection is on)   */  int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, -		    int detect_deadlock) +rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)  {  	might_sleep();  	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, -				       detect_deadlock, rt_mutex_slowlock); +				       RT_MUTEX_MIN_CHAINWALK, +				       rt_mutex_slowlock);  }  EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); @@ -1262,7 +1536,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,   * @lock:		the rt_mutex to take   * @waiter:		the pre-initialized rt_mutex_waiter   * @task:		the task to prepare - * @detect_deadlock:	perform deadlock detection (1) or not (0)   *   * Returns:   *  0 - task blocked on lock @@ -1273,7 +1546,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,   */  int rt_mutex_start_proxy_lock(struct rt_mutex *lock,  			      struct rt_mutex_waiter *waiter, -			      struct task_struct *task, int detect_deadlock) +			      struct task_struct *task)  {  	int ret; @@ -1285,7 +1558,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,  	}  	/* We enforce deadlock detection for futexes */ -	ret = task_blocks_on_rt_mutex(lock, waiter, task, 1); +	ret = task_blocks_on_rt_mutex(lock, waiter, task, +				      RT_MUTEX_FULL_CHAINWALK);  	if (ret && !rt_mutex_owner(lock)) {  		/* @@ -1331,22 +1605,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)   * rt_mutex_finish_proxy_lock() - Complete lock acquisition   * @lock:		the rt_mutex we were woken on   * @to:			the timeout, null if none. hrtimer should already have - * 			been started. + *			been started.   * @waiter:		the pre-initialized rt_mutex_waiter - * @detect_deadlock:	perform deadlock detection (1) or not (0)   *   * Complete the lock acquisition started our behalf by another thread.   
*   * Returns:   *  0 - success - * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * <0 - error, one of -EINTR, -ETIMEDOUT   *   * Special API call for PI-futex requeue support   */  int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,  			       struct hrtimer_sleeper *to, -			       struct rt_mutex_waiter *waiter, -			       int detect_deadlock) +			       struct rt_mutex_waiter *waiter)  {  	int ret; diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index f6a1f3c133b1..c4060584c407 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -22,10 +22,15 @@  #define debug_rt_mutex_init(m, n)			do { } while (0)  #define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0)  #define debug_rt_mutex_print_deadlock(w)		do { } while (0) -#define debug_rt_mutex_detect_deadlock(w,d)		(d)  #define debug_rt_mutex_reset_waiter(w)			do { } while (0)  static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)  {  	WARN(1, "rtmutex deadlock detected\n");  } + +static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, +						  enum rtmutex_chainwalk walk) +{ +	return walk == RT_MUTEX_FULL_CHAINWALK; +} diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 7431a9c86f35..855212501407 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)  }  /* + * Constants for rt mutex functions which have a selectable deadlock + * detection. + * + * RT_MUTEX_MIN_CHAINWALK:	Stops the lock chain walk when there are + *				no further PI adjustments to be made. + * + * RT_MUTEX_FULL_CHAINWALK:	Invoke deadlock detection with a full + *				walk of the lock chain. + */ +enum rtmutex_chainwalk { +	RT_MUTEX_MIN_CHAINWALK, +	RT_MUTEX_FULL_CHAINWALK, +}; + +/*   * PI-futex support (proxy locking functions, etc.):   */  extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); @@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,  				  struct task_struct *proxy_owner);  extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,  				     struct rt_mutex_waiter *waiter, -				     struct task_struct *task, -				     int detect_deadlock); +				     struct task_struct *task);  extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,  				      struct hrtimer_sleeper *to, -				      struct rt_mutex_waiter *waiter, -				      int detect_deadlock); +				      struct rt_mutex_waiter *waiter); +extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);  #ifdef CONFIG_DEBUG_RT_MUTEXES  # include "rtmutex-debug.h" diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a2391ac135c8..d6203faf2eb1 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -329,7 +329,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)  		if (need_resched())  			break; -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  	}  	rcu_read_unlock(); @@ -381,7 +381,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)  		 * memory barriers as we'll eventually observe the right  		 * values at the cost of a few extra spins.  		 
*/ -		arch_mutex_cpu_relax(); +		cpu_relax_lowlatency();  	}  	osq_unlock(&sem->osq);  done: diff --git a/kernel/module.c b/kernel/module.c index 81e727cf6df9..ae79ce615cb9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -60,7 +60,6 @@  #include <linux/jump_label.h>  #include <linux/pfn.h>  #include <linux/bsearch.h> -#include <linux/fips.h>  #include <uapi/linux/module.h>  #include "module-internal.h" @@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info)  	}  	/* Not having a signature is only an error if we're strict. */ -	if (err < 0 && fips_enabled) -		panic("Module verification failed with error %d in FIPS mode\n", -		      err);  	if (err == -ENOKEY && !sig_enforce)  		err = 0; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 8e7811086b82..ef42d0ab3115 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)  	might_sleep(); +	task_lock(p);  	ns = p->nsproxy; +	p->nsproxy = new; +	task_unlock(p); -	rcu_assign_pointer(p->nsproxy, new); - -	if (ns && atomic_dec_and_test(&ns->count)) { -		/* -		 * wait for others to get what they want from this nsproxy. -		 * -		 * cannot release this nsproxy via the call_rcu() since -		 * put_mnt_ns() will want to sleep -		 */ -		synchronize_rcu(); +	if (ns && atomic_dec_and_test(&ns->count))  		free_nsproxy(ns); -	}  }  void exit_task_namespaces(struct task_struct *p) diff --git a/kernel/panic.c b/kernel/panic.c index 62e16cef9cc2..d09dc5c32c67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -224,6 +224,7 @@ static const struct tnt tnts[] = {  	{ TAINT_FIRMWARE_WORKAROUND,	'I', ' ' },  	{ TAINT_OOT_MODULE,		'O', ' ' },  	{ TAINT_UNSIGNED_MODULE,	'E', ' ' }, +	{ TAINT_SOFTLOCKUP,		'L', ' ' },  };  /** diff --git a/kernel/params.c b/kernel/params.c index 1e52ca233fd9..34f527023794 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -256,6 +256,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);  STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);  STANDARD_PARAM_DEF(long, long, "%li", kstrtol);  STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul); +STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);  int param_set_charp(const char *val, const struct kernel_param *kp)  { diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9a83d780facd..e4e4121fa327 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -253,9 +253,6 @@ config APM_EMULATION  	  anything, try disabling/enabling this option (or disabling/enabling  	  APM in your BIOS). 
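With the STANDARD_PARAM_DEF(ullong, ...) line added to params.c above, module parameters can be declared as unsigned long long directly. A minimal example (the parameter name is made up for illustration):

#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned long long size_bytes = 1ULL << 32;
module_param(size_bytes, ullong, 0444);
MODULE_PARM_DESC(size_bytes, "example 64-bit tunable, settable as size_bytes=<n>");
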
-config ARCH_HAS_OPP -	bool -  config PM_OPP  	bool  	---help--- diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index fcc2611d3f14..a9dfa79b6bab 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -371,7 +371,6 @@ int hibernation_snapshot(int platform_mode)  	}  	suspend_console(); -	ftrace_stop();  	pm_restrict_gfp_mask();  	error = dpm_suspend(PMSG_FREEZE); @@ -397,7 +396,6 @@ int hibernation_snapshot(int platform_mode)  	if (error || !in_suspend)  		pm_restore_gfp_mask(); -	ftrace_start();  	resume_console();  	dpm_complete(msg); @@ -500,7 +498,6 @@ int hibernation_restore(int platform_mode)  	pm_prepare_console();  	suspend_console(); -	ftrace_stop();  	pm_restrict_gfp_mask();  	error = dpm_suspend_start(PMSG_QUIESCE);  	if (!error) { @@ -508,7 +505,6 @@ int hibernation_restore(int platform_mode)  		dpm_resume_end(PMSG_RECOVER);  	}  	pm_restore_gfp_mask(); -	ftrace_start();  	resume_console();  	pm_restore_console();  	return error; @@ -535,7 +531,6 @@ int hibernation_platform_enter(void)  	entering_platform_hibernation = true;  	suspend_console(); -	ftrace_stop();  	error = dpm_suspend_start(PMSG_HIBERNATE);  	if (error) {  		if (hibernation_ops->recover) @@ -579,7 +574,6 @@ int hibernation_platform_enter(void)   Resume_devices:  	entering_platform_hibernation = false;  	dpm_resume_end(PMSG_RESTORE); -	ftrace_start();  	resume_console();   Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 8e90f330f139..9a59d042ea84 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,  	suspend_state_t i;  	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) -		if (pm_states[i].state) -			s += sprintf(s,"%s ", pm_states[i].label); +		if (pm_states[i]) +			s += sprintf(s,"%s ", pm_states[i]);  #endif  	if (hibernation_available()) @@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,  static suspend_state_t decode_state(const char *buf, size_t n)  {  #ifdef CONFIG_SUSPEND -	suspend_state_t state = PM_SUSPEND_MIN; -	struct pm_sleep_state *s; +	suspend_state_t state;  #endif  	char *p;  	int len; @@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n)  		return PM_SUSPEND_MAX;  #ifdef CONFIG_SUSPEND -	for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) -		if (s->state && len == strlen(s->label) -		    && !strncmp(buf, s->label, len)) -			return s->state; +	for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { +		const char *label = pm_states[state]; + +		if (label && len == strlen(label) && !strncmp(buf, label, len)) +			return state; +	}  #endif  	return PM_SUSPEND_ON; @@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj,  #ifdef CONFIG_SUSPEND  	if (state < PM_SUSPEND_MAX) -		return sprintf(buf, "%s\n", pm_states[state].state ? -					pm_states[state].label : "error"); +		return sprintf(buf, "%s\n", pm_states[state] ? +					pm_states[state] : "error");  #endif  #ifdef CONFIG_HIBERNATION  	return sprintf(buf, "disk\n"); @@ -615,7 +616,6 @@ static struct attribute_group attr_group = {  	.attrs = g,  }; -#ifdef CONFIG_PM_RUNTIME  struct workqueue_struct *pm_wq;  EXPORT_SYMBOL_GPL(pm_wq); @@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void)  	return pm_wq ? 
0 : -ENOMEM;  } -#else -static inline int pm_start_workqueue(void) { return 0; } -#endif  static int __init pm_init(void)  { diff --git a/kernel/power/power.h b/kernel/power/power.h index c60f13b5270a..5d49dcac2537 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,  				unsigned int, char *);  #ifdef CONFIG_SUSPEND -struct pm_sleep_state { -	const char *label; -	suspend_state_t state; -}; -  /* kernel/power/suspend.c */ -extern struct pm_sleep_state pm_states[]; +extern const char *pm_states[];  extern int suspend_devices_and_enter(suspend_state_t state);  #else /* !CONFIG_SUSPEND */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1ea328aafdc9..4fc5c32422b3 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)   *	information is stored (in the form of a block of bitmap)   *	It also contains the pfns that correspond to the start and end of   *	the represented memory area. + * + *	The memory bitmap is organized as a radix tree to guarantee fast random + *	access to the bits. There is one radix tree for each zone (as returned + *	from create_mem_extents). + * + *	One radix tree is represented by one struct mem_zone_bm_rtree. There are + *	two linked lists for the nodes of the tree, one for the inner nodes and + *	one for the leave nodes. The linked leave nodes are used for fast linear + *	access of the memory bitmap. + * + *	The struct rtree_node represents one node of the radix tree.   */  #define BM_END_OF_MAP	(~0UL)  #define BM_BITS_PER_BLOCK	(PAGE_SIZE * BITS_PER_BYTE) +#define BM_BLOCK_SHIFT		(PAGE_SHIFT + 3) +#define BM_BLOCK_MASK		((1UL << BM_BLOCK_SHIFT) - 1) -struct bm_block { -	struct list_head hook;	/* hook into a list of bitmap blocks */ -	unsigned long start_pfn;	/* pfn represented by the first bit */ -	unsigned long end_pfn;	/* pfn represented by the last bit plus 1 */ -	unsigned long *data;	/* bitmap representing pages */ +/* + * struct rtree_node is a wrapper struct to link the nodes + * of the rtree together for easy linear iteration over + * bits and easy freeing + */ +struct rtree_node { +	struct list_head list; +	unsigned long *data;  }; -static inline unsigned long bm_block_bits(struct bm_block *bb) -{ -	return bb->end_pfn - bb->start_pfn; -} +/* + * struct mem_zone_bm_rtree represents a bitmap used for one + * populated memory zone. 
+ */ +struct mem_zone_bm_rtree { +	struct list_head list;		/* Link Zones together         */ +	struct list_head nodes;		/* Radix Tree inner nodes      */ +	struct list_head leaves;	/* Radix Tree leaves           */ +	unsigned long start_pfn;	/* Zone start page frame       */ +	unsigned long end_pfn;		/* Zone end page frame + 1     */ +	struct rtree_node *rtree;	/* Radix Tree Root             */ +	int levels;			/* Number of Radix Tree Levels */ +	unsigned int blocks;		/* Number of Bitmap Blocks     */ +};  /* strcut bm_position is used for browsing memory bitmaps */  struct bm_position { -	struct bm_block *block; -	int bit; +	struct mem_zone_bm_rtree *zone; +	struct rtree_node *node; +	unsigned long node_pfn; +	int node_bit;  };  struct memory_bitmap { -	struct list_head blocks;	/* list of bitmap blocks */ +	struct list_head zones;  	struct linked_page *p_list;	/* list of pages used to store zone  					 * bitmap objects and bitmap block  					 * objects @@ -284,38 +312,178 @@ struct memory_bitmap {  /* Functions that operate on memory bitmaps */ -static void memory_bm_position_reset(struct memory_bitmap *bm) +#define BM_ENTRIES_PER_LEVEL	(PAGE_SIZE / sizeof(unsigned long)) +#if BITS_PER_LONG == 32 +#define BM_RTREE_LEVEL_SHIFT	(PAGE_SHIFT - 2) +#else +#define BM_RTREE_LEVEL_SHIFT	(PAGE_SHIFT - 3) +#endif +#define BM_RTREE_LEVEL_MASK	((1UL << BM_RTREE_LEVEL_SHIFT) - 1) + +/* + *	alloc_rtree_node - Allocate a new node and add it to the radix tree. + * + *	This function is used to allocate inner nodes as well as the + *	leave nodes of the radix tree. It also adds the node to the + *	corresponding linked list passed in by the *list parameter. + */ +static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed, +					   struct chain_allocator *ca, +					   struct list_head *list)  { -	bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); -	bm->cur.bit = 0; -} +	struct rtree_node *node; -static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); +	node = chain_alloc(ca, sizeof(struct rtree_node)); +	if (!node) +		return NULL; -/** - *	create_bm_block_list - create a list of block bitmap objects - *	@pages - number of pages to track - *	@list - list to put the allocated blocks into - *	@ca - chain allocator to be used for allocating memory +	node->data = get_image_page(gfp_mask, safe_needed); +	if (!node->data) +		return NULL; + +	list_add_tail(&node->list, list); + +	return node; +} + +/* + *	add_rtree_block - Add a new leave node to the radix tree + * + *	The leave nodes need to be allocated in order to keep the leaves + *	linked list in order. This is guaranteed by the zone->blocks + *	counter.   */ -static int create_bm_block_list(unsigned long pages, -				struct list_head *list, -				struct chain_allocator *ca) +static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask, +			   int safe_needed, struct chain_allocator *ca)  { -	unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); +	struct rtree_node *node, *block, **dst; +	unsigned int levels_needed, block_nr; +	int i; -	while (nr_blocks-- > 0) { -		struct bm_block *bb; +	block_nr = zone->blocks; +	levels_needed = 0; -		bb = chain_alloc(ca, sizeof(struct bm_block)); -		if (!bb) +	/* How many levels do we need for this block nr? 
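The constants above fix the tree geometry: each leaf page holds BM_BITS_PER_BLOCK bits and each inner node BM_ENTRIES_PER_LEVEL pointers. A worked decomposition, assuming 4 KiB pages on 64-bit (so BM_BLOCK_SHIFT = 15, BM_RTREE_LEVEL_SHIFT = 9), for a pfn that lies 0x12345 pages into its zone:

/*
 *   block_nr = 0x12345 >> BM_BLOCK_SHIFT       = 2        (third leaf page)
 *   bit      = 0x12345 &  BM_BLOCK_MASK        = 0x2345   (bit inside that leaf)
 *   index    = (block_nr >> ((level - 1) * BM_RTREE_LEVEL_SHIFT))
 *               & BM_RTREE_LEVEL_MASK          = 2        (child slot, one-level tree)
 *
 * This is the same arithmetic that add_rtree_block() and memory_bm_find_bit()
 * perform when walking the tree.
 */
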
*/ +	while (block_nr) { +		levels_needed += 1; +		block_nr >>= BM_RTREE_LEVEL_SHIFT; +	} + +	/* Make sure the rtree has enough levels */ +	for (i = zone->levels; i < levels_needed; i++) { +		node = alloc_rtree_node(gfp_mask, safe_needed, ca, +					&zone->nodes); +		if (!node)  			return -ENOMEM; -		list_add(&bb->hook, list); + +		node->data[0] = (unsigned long)zone->rtree; +		zone->rtree = node; +		zone->levels += 1; +	} + +	/* Allocate new block */ +	block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves); +	if (!block) +		return -ENOMEM; + +	/* Now walk the rtree to insert the block */ +	node = zone->rtree; +	dst = &zone->rtree; +	block_nr = zone->blocks; +	for (i = zone->levels; i > 0; i--) { +		int index; + +		if (!node) { +			node = alloc_rtree_node(gfp_mask, safe_needed, ca, +						&zone->nodes); +			if (!node) +				return -ENOMEM; +			*dst = node; +		} + +		index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); +		index &= BM_RTREE_LEVEL_MASK; +		dst = (struct rtree_node **)&((*dst)->data[index]); +		node = *dst;  	} +	zone->blocks += 1; +	*dst = block; +  	return 0;  } +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, +			       int clear_nosave_free); + +/* + *	create_zone_bm_rtree - create a radix tree for one zone + * + *	Allocated the mem_zone_bm_rtree structure and initializes it. + *	This function also allocated and builds the radix tree for the + *	zone. + */ +static struct mem_zone_bm_rtree * +create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed, +		     struct chain_allocator *ca, +		     unsigned long start, unsigned long end) +{ +	struct mem_zone_bm_rtree *zone; +	unsigned int i, nr_blocks; +	unsigned long pages; + +	pages = end - start; +	zone  = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree)); +	if (!zone) +		return NULL; + +	INIT_LIST_HEAD(&zone->nodes); +	INIT_LIST_HEAD(&zone->leaves); +	zone->start_pfn = start; +	zone->end_pfn = end; +	nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + +	for (i = 0; i < nr_blocks; i++) { +		if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) { +			free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR); +			return NULL; +		} +	} + +	return zone; +} + +/* + *	free_zone_bm_rtree - Free the memory of the radix tree + * + *	Free all node pages of the radix tree. The mem_zone_bm_rtree + *	structure itself is not freed here nor are the rtree_node + *	structs. 
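The level bookkeeping in add_rtree_block() implies a concrete capacity per tree depth; roughly, assuming 4 KiB pages and 64-bit pointers:

/*
 *   one leaf:       32768 bits -> covers 32768 pages = 128 MiB of a zone
 *   one inner node: 512 slots  -> a 1-level tree covers 512 leaves  =  64 GiB
 *                                 a 2-level tree covers 512^2 leaves = 32 TiB
 *
 * A 4 GiB zone therefore needs 32 leaves and a single inner level, and the
 * "how many levels" loop above typically runs only once or twice.
 */
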
+ */ +static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone, +			       int clear_nosave_free) +{ +	struct rtree_node *node; + +	list_for_each_entry(node, &zone->nodes, list) +		free_image_page(node->data, clear_nosave_free); + +	list_for_each_entry(node, &zone->leaves, list) +		free_image_page(node->data, clear_nosave_free); +} + +static void memory_bm_position_reset(struct memory_bitmap *bm) +{ +	bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree, +				  list); +	bm->cur.node = list_entry(bm->cur.zone->leaves.next, +				  struct rtree_node, list); +	bm->cur.node_pfn = 0; +	bm->cur.node_bit = 0; +} + +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); +  struct mem_extent {  	struct list_head hook;  	unsigned long start; @@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)  	int error;  	chain_init(&ca, gfp_mask, safe_needed); -	INIT_LIST_HEAD(&bm->blocks); +	INIT_LIST_HEAD(&bm->zones);  	error = create_mem_extents(&mem_extents, gfp_mask);  	if (error)  		return error;  	list_for_each_entry(ext, &mem_extents, hook) { -		struct bm_block *bb; -		unsigned long pfn = ext->start; -		unsigned long pages = ext->end - ext->start; - -		bb = list_entry(bm->blocks.prev, struct bm_block, hook); +		struct mem_zone_bm_rtree *zone; -		error = create_bm_block_list(pages, bm->blocks.prev, &ca); -		if (error) +		zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca, +					    ext->start, ext->end); +		if (!zone) { +			error = -ENOMEM;  			goto Error; - -		list_for_each_entry_continue(bb, &bm->blocks, hook) { -			bb->data = get_image_page(gfp_mask, safe_needed); -			if (!bb->data) { -				error = -ENOMEM; -				goto Error; -			} - -			bb->start_pfn = pfn; -			if (pages >= BM_BITS_PER_BLOCK) { -				pfn += BM_BITS_PER_BLOCK; -				pages -= BM_BITS_PER_BLOCK; -			} else { -				/* This is executed only once in the loop */ -				pfn += pages; -			} -			bb->end_pfn = pfn;  		} +		list_add_tail(&zone->list, &bm->zones);  	}  	bm->p_list = ca.chain; @@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)    */  static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)  { -	struct bm_block *bb; +	struct mem_zone_bm_rtree *zone; -	list_for_each_entry(bb, &bm->blocks, hook) -		if (bb->data) -			free_image_page(bb->data, clear_nosave_free); +	list_for_each_entry(zone, &bm->zones, list) +		free_zone_bm_rtree(zone, clear_nosave_free);  	free_list_of_pages(bm->p_list, clear_nosave_free); -	INIT_LIST_HEAD(&bm->blocks); +	INIT_LIST_HEAD(&bm->zones);  }  /** - *	memory_bm_find_bit - find the bit in the bitmap @bm that corresponds - *	to given pfn.  The cur_zone_bm member of @bm and the cur_block member - *	of @bm->cur_zone_bm are updated. + *	memory_bm_find_bit - Find the bit for pfn in the memory + *			     bitmap + * + *	Find the bit in the bitmap @bm that corresponds to given pfn. + *	The cur.zone, cur.block and cur.node_pfn member of @bm are + *	updated. + *	It walks the radix tree to find the page which contains the bit for + *	pfn and returns the bit position in **addr and *bit_nr.   
*/  static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, -				void **addr, unsigned int *bit_nr) +			      void **addr, unsigned int *bit_nr)  { -	struct bm_block *bb; +	struct mem_zone_bm_rtree *curr, *zone; +	struct rtree_node *node; +	int i, block_nr; +	zone = bm->cur.zone; + +	if (pfn >= zone->start_pfn && pfn < zone->end_pfn) +		goto zone_found; + +	zone = NULL; + +	/* Find the right zone */ +	list_for_each_entry(curr, &bm->zones, list) { +		if (pfn >= curr->start_pfn && pfn < curr->end_pfn) { +			zone = curr; +			break; +		} +	} + +	if (!zone) +		return -EFAULT; + +zone_found:  	/* -	 * Check if the pfn corresponds to the current bitmap block and find -	 * the block where it fits if this is not the case. +	 * We have a zone. Now walk the radix tree to find the leave +	 * node for our pfn.  	 */ -	bb = bm->cur.block; -	if (pfn < bb->start_pfn) -		list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) -			if (pfn >= bb->start_pfn) -				break; -	if (pfn >= bb->end_pfn) -		list_for_each_entry_continue(bb, &bm->blocks, hook) -			if (pfn >= bb->start_pfn && pfn < bb->end_pfn) -				break; +	node = bm->cur.node; +	if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn) +		goto node_found; -	if (&bb->hook == &bm->blocks) -		return -EFAULT; +	node      = zone->rtree; +	block_nr  = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT; + +	for (i = zone->levels; i > 0; i--) { +		int index; + +		index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT); +		index &= BM_RTREE_LEVEL_MASK; +		BUG_ON(node->data[index] == 0); +		node = (struct rtree_node *)node->data[index]; +	} + +node_found: +	/* Update last position */ +	bm->cur.zone = zone; +	bm->cur.node = node; +	bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK; + +	/* Set return values */ +	*addr = node->data; +	*bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK; -	/* The block has been found */ -	bm->cur.block = bb; -	pfn -= bb->start_pfn; -	bm->cur.bit = pfn + 1; -	*bit_nr = pfn; -	*addr = bb->data;  	return 0;  } @@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)  	error = memory_bm_find_bit(bm, pfn, &addr, &bit);  	if (!error)  		set_bit(bit, addr); +  	return error;  } @@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)  	clear_bit(bit, addr);  } +static void memory_bm_clear_current(struct memory_bitmap *bm) +{ +	int bit; + +	bit = max(bm->cur.node_bit - 1, 0); +	clear_bit(bit, bm->cur.node->data); +} +  static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)  {  	void *addr; @@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)  	return !memory_bm_find_bit(bm, pfn, &addr, &bit);  } -/** - *	memory_bm_next_pfn - find the pfn that corresponds to the next set bit - *	in the bitmap @bm.  If the pfn cannot be found, BM_END_OF_MAP is - *	returned. +/* + *	rtree_next_node - Jumps to the next leave node + * + *	Sets the position to the beginning of the next node in the + *	memory bitmap. This is either the next node in the current + *	zone's radix tree or the first node in the radix tree of the + *	next zone.   * - *	It is required to run memory_bm_position_reset() before the first call to - *	this function. + *	Returns true if there is a next node, false otherwise.   
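memory_bm_clear_current() leans on an invariant of the iterator defined next: memory_bm_next_pfn() leaves cur.node_bit pointing one past the bit it just returned, so clearing bit (cur.node_bit - 1) clears the bit for the pfn the caller is currently looking at. Roughly the shape of the iterate-and-clear usage (should_drop() is a hypothetical predicate; swsusp_free() further down does a two-bitmap variant of this):

unsigned long pfn;

memory_bm_position_reset(bm);
for (pfn = memory_bm_next_pfn(bm); pfn != BM_END_OF_MAP;
     pfn = memory_bm_next_pfn(bm)) {
        if (should_drop(pfn))
                memory_bm_clear_current(bm);    /* clears the bit for 'pfn' */
}
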
*/ +static bool rtree_next_node(struct memory_bitmap *bm) +{ +	bm->cur.node = list_entry(bm->cur.node->list.next, +				  struct rtree_node, list); +	if (&bm->cur.node->list != &bm->cur.zone->leaves) { +		bm->cur.node_pfn += BM_BITS_PER_BLOCK; +		bm->cur.node_bit  = 0; +		touch_softlockup_watchdog(); +		return true; +	} + +	/* No more nodes, goto next zone */ +	bm->cur.zone = list_entry(bm->cur.zone->list.next, +				  struct mem_zone_bm_rtree, list); +	if (&bm->cur.zone->list != &bm->zones) { +		bm->cur.node = list_entry(bm->cur.zone->leaves.next, +					  struct rtree_node, list); +		bm->cur.node_pfn = 0; +		bm->cur.node_bit = 0; +		return true; +	} +	/* No more zones */ +	return false; +} + +/** + *	memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm + * + *	Starting from the last returned position this function searches + *	for the next set bit in the memory bitmap and returns its + *	number. If no more bit is set BM_END_OF_MAP is returned. + * + *	It is required to run memory_bm_position_reset() before the + *	first call to this function. + */  static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)  { -	struct bm_block *bb; +	unsigned long bits, pfn, pages;  	int bit; -	bb = bm->cur.block;  	do { -		bit = bm->cur.bit; -		bit = find_next_bit(bb->data, bm_block_bits(bb), bit); -		if (bit < bm_block_bits(bb)) -			goto Return_pfn; - -		bb = list_entry(bb->hook.next, struct bm_block, hook); -		bm->cur.block = bb; -		bm->cur.bit = 0; -	} while (&bb->hook != &bm->blocks); +		pages	  = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn; +		bits      = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK); +		bit	  = find_next_bit(bm->cur.node->data, bits, +					  bm->cur.node_bit); +		if (bit < bits) { +			pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit; +			bm->cur.node_bit = bit + 1; +			return pfn; +		} +	} while (rtree_next_node(bm)); -	memory_bm_position_reset(bm);  	return BM_END_OF_MAP; - - Return_pfn: -	bm->cur.bit = bit + 1; -	return bb->start_pfn + bit;  }  /** @@ -816,12 +1039,17 @@ void free_basic_memory_bitmaps(void)  unsigned int snapshot_additional_pages(struct zone *zone)  { -	unsigned int res; +	unsigned int rtree, nodes; + +	rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); +	rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node), +			      LINKED_PAGE_DATA_SIZE); +	while (nodes > 1) { +		nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL); +		rtree += nodes; +	} -	res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); -	res += DIV_ROUND_UP(res * sizeof(struct bm_block), -			    LINKED_PAGE_DATA_SIZE); -	return 2 * res; +	return 2 * rtree;  }  #ifdef CONFIG_HIGHMEM @@ -1094,23 +1322,35 @@ static struct memory_bitmap copy_bm;  void swsusp_free(void)  { -	struct zone *zone; -	unsigned long pfn, max_zone_pfn; +	unsigned long fb_pfn, fr_pfn; -	for_each_populated_zone(zone) { -		max_zone_pfn = zone_end_pfn(zone); -		for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) -			if (pfn_valid(pfn)) { -				struct page *page = pfn_to_page(pfn); - -				if (swsusp_page_is_forbidden(page) && -				    swsusp_page_is_free(page)) { -					swsusp_unset_page_forbidden(page); -					swsusp_unset_page_free(page); -					__free_page(page); -				} -			} +	memory_bm_position_reset(forbidden_pages_map); +	memory_bm_position_reset(free_pages_map); + +loop: +	fr_pfn = memory_bm_next_pfn(free_pages_map); +	fb_pfn = memory_bm_next_pfn(forbidden_pages_map); + +	/* +	 * Find the next bit set in both bitmaps. 
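The rewritten swsusp_free() in this hunk walks the forbidden and free bitmaps as two sorted pfn streams and frees only pfns present in both, advancing whichever cursor lags. The same merge-style intersection in a standalone sketch (names illustrative; END plays the role of BM_END_OF_MAP, so the loop always terminates):

#define END (~0UL)

static unsigned long next_common(unsigned long (*next_a)(void),
                                 unsigned long (*next_b)(void))
{
        unsigned long a = next_a(), b = next_b();

        while (a != b) {
                if (a < b)
                        a = next_a();
                else
                        b = next_b();
        }
        return a;       /* a common pfn, or END once either stream is exhausted */
}
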
This is guaranteed to +	 * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP. +	 */ +	do { +		if (fb_pfn < fr_pfn) +			fb_pfn = memory_bm_next_pfn(forbidden_pages_map); +		if (fr_pfn < fb_pfn) +			fr_pfn = memory_bm_next_pfn(free_pages_map); +	} while (fb_pfn != fr_pfn); + +	if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) { +		struct page *page = pfn_to_page(fr_pfn); + +		memory_bm_clear_current(forbidden_pages_map); +		memory_bm_clear_current(free_pages_map); +		__free_page(page); +		goto loop;  	} +  	nr_copy_pages = 0;  	nr_meta_pages = 0;  	restore_pblist = NULL; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ed35a4790afe..6dadb25cb0d8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -31,20 +31,11 @@  #include "power.h" -struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = { -	[PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE }, -	[PM_SUSPEND_STANDBY] = { .label = "standby", }, -	[PM_SUSPEND_MEM] = { .label = "mem", }, -}; +static const char *pm_labels[] = { "mem", "standby", "freeze", }; +const char *pm_states[PM_SUSPEND_MAX];  static const struct platform_suspend_ops *suspend_ops;  static const struct platform_freeze_ops *freeze_ops; - -static bool need_suspend_ops(suspend_state_t state) -{ -	return state > PM_SUSPEND_FREEZE; -} -  static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);  static bool suspend_freeze_wake; @@ -97,10 +88,7 @@ static bool relative_states;  static int __init sleep_states_setup(char *str)  {  	relative_states = !strncmp(str, "1", 1); -	if (relative_states) { -		pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE; -		pm_states[PM_SUSPEND_FREEZE].state = 0; -	} +	pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];  	return 1;  } @@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup);  void suspend_set_ops(const struct platform_suspend_ops *ops)  {  	suspend_state_t i; -	int j = PM_SUSPEND_MAX - 1; +	int j = 0;  	lock_system_sleep();  	suspend_ops = ops;  	for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) -		if (valid_state(i)) -			pm_states[j--].state = i; -		else if (!relative_states) -			pm_states[j--].state = 0; +		if (valid_state(i)) { +			pm_states[i] = pm_labels[j++]; +		} else if (!relative_states) { +			pm_states[i] = NULL; +			j++; +		} -	pm_states[j--].state = PM_SUSPEND_FREEZE; -	while (j >= PM_SUSPEND_MIN) -		pm_states[j--].state = 0; +	pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];  	unlock_system_sleep();  } @@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state)  }  EXPORT_SYMBOL_GPL(suspend_valid_only_mem); +static bool sleep_state_supported(suspend_state_t state) +{ +	return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter); +} + +static int platform_suspend_prepare(suspend_state_t state) +{ +	return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ? +		suspend_ops->prepare() : 0; +} + +static int platform_suspend_prepare_late(suspend_state_t state) +{ +	return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? 
+		suspend_ops->prepare_late() : 0; +} + +static void platform_suspend_wake(suspend_state_t state) +{ +	if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) +		suspend_ops->wake(); +} + +static void platform_suspend_finish(suspend_state_t state) +{ +	if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) +		suspend_ops->finish(); +} + +static int platform_suspend_begin(suspend_state_t state) +{ +	if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) +		return freeze_ops->begin(); +	else if (suspend_ops->begin) +		return suspend_ops->begin(state); +	else +		return 0; +} + +static void platform_suspend_end(suspend_state_t state) +{ +	if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) +		freeze_ops->end(); +	else if (suspend_ops->end) +		suspend_ops->end(); +} + +static void platform_suspend_recover(suspend_state_t state) +{ +	if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) +		suspend_ops->recover(); +} + +static bool platform_suspend_again(suspend_state_t state) +{ +	return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ? +		suspend_ops->suspend_again() : false; +} +  static int suspend_test(int level)  {  #ifdef CONFIG_PM_DEBUG @@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state)  {  	int error; -	if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter)) +	if (!sleep_state_supported(state))  		return -EPERM;  	pm_prepare_console(); @@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  {  	int error; -	if (need_suspend_ops(state) && suspend_ops->prepare) { -		error = suspend_ops->prepare(); -		if (error) -			goto Platform_finish; -	} +	error = platform_suspend_prepare(state); +	if (error) +		goto Platform_finish;  	error = dpm_suspend_end(PMSG_SUSPEND);  	if (error) {  		printk(KERN_ERR "PM: Some devices failed to power down\n");  		goto Platform_finish;  	} - -	if (need_suspend_ops(state) && suspend_ops->prepare_late) { -		error = suspend_ops->prepare_late(); -		if (error) -			goto Platform_wake; -	} +	error = platform_suspend_prepare_late(state); +	if (error) +		goto Platform_wake;  	if (suspend_test(TEST_PLATFORM))  		goto Platform_wake; @@ -248,7 +290,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)  		goto Platform_wake;  	} -	ftrace_stop();  	error = disable_nonboot_cpus();  	if (error || suspend_test(TEST_CPUS))  		goto Enable_cpus; @@ -275,18 +316,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)   Enable_cpus:  	enable_nonboot_cpus(); -	ftrace_start();   Platform_wake: -	if (need_suspend_ops(state) && suspend_ops->wake) -		suspend_ops->wake(); - +	platform_suspend_wake(state);  	dpm_resume_start(PMSG_RESUME);   Platform_finish: -	if (need_suspend_ops(state) && suspend_ops->finish) -		suspend_ops->finish(); - +	platform_suspend_finish(state);  	return error;  } @@ -299,18 +335,13 @@ int suspend_devices_and_enter(suspend_state_t state)  	int error;  	bool wakeup = false; -	if (need_suspend_ops(state) && !suspend_ops) +	if (!sleep_state_supported(state))  		return -ENOSYS; -	if (need_suspend_ops(state) && suspend_ops->begin) { -		error = suspend_ops->begin(state); -		if (error) -			goto Close; -	} else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) { -		error = freeze_ops->begin(); -		if (error) -			goto Close; -	} +	error = platform_suspend_begin(state); +	if (error) +		goto Close; +  	suspend_console();  	suspend_test_start();  	error = dpm_suspend_start(PMSG_SUSPEND); @@ -324,25 +355,20 @@ int 
suspend_devices_and_enter(suspend_state_t state)  	do {  		error = suspend_enter(state, &wakeup); -	} while (!error && !wakeup && need_suspend_ops(state) -		&& suspend_ops->suspend_again && suspend_ops->suspend_again()); +	} while (!error && !wakeup && platform_suspend_again(state));   Resume_devices:  	suspend_test_start();  	dpm_resume_end(PMSG_RESUME);  	suspend_test_finish("resume devices");  	resume_console(); - Close: -	if (need_suspend_ops(state) && suspend_ops->end) -		suspend_ops->end(); -	else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) -		freeze_ops->end(); + Close: +	platform_suspend_end(state);  	return error;   Recover_platform: -	if (need_suspend_ops(state) && suspend_ops->recover) -		suspend_ops->recover(); +	platform_suspend_recover(state);  	goto Resume_devices;  } @@ -395,7 +421,7 @@ static int enter_state(suspend_state_t state)  	printk("done.\n");  	trace_suspend_resume(TPS("sync_filesystems"), 0, false); -	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label); +	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);  	error = suspend_prepare(state);  	if (error)  		goto Unlock; @@ -404,7 +430,7 @@ static int enter_state(suspend_state_t state)  		goto Finish;  	trace_suspend_resume(TPS("suspend_enter"), state, false); -	pr_debug("PM: Entering %s sleep\n", pm_states[state].label); +	pr_debug("PM: Entering %s sleep\n", pm_states[state]);  	pm_restrict_gfp_mask();  	error = suspend_devices_and_enter(state);  	pm_restore_gfp_mask(); diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 269b097e78ea..2f524928b6aa 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c @@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)  	}  	if (state == PM_SUSPEND_MEM) { -		printk(info_test, pm_states[state].label); +		printk(info_test, pm_states[state]);  		status = pm_suspend(state);  		if (status == -ENODEV)  			state = PM_SUSPEND_STANDBY;  	}  	if (state == PM_SUSPEND_STANDBY) { -		printk(info_test, pm_states[state].label); +		printk(info_test, pm_states[state]);  		status = pm_suspend(state);  	}  	if (status < 0) @@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value)  	/* "=mem" ==> "mem" */  	value++;  	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) -		if (!strcmp(pm_states[i].label, value)) { -			test_state = pm_states[i].state; +		if (!strcmp(pm_states[i], value)) { +			test_state = i;  			return 0;  		} @@ -162,8 +162,8 @@ static int __init test_suspend(void)  	/* PM is initialized by now; is that state testable? 
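
For reference, a stand-alone model of how the reworked pm_states[] table ends up populated by suspend_set_ops() above. The enum values mirror the usual suspend_state_t ordering, and valid_state() here is only a stand-in for the real platform check, so treat this as a sketch rather than kernel code:

/* Stand-alone model of the new pm_states[] setup. */
#include <stdio.h>

enum { PM_SUSPEND_ON, PM_SUSPEND_FREEZE, PM_SUSPEND_STANDBY,
       PM_SUSPEND_MEM, PM_SUSPEND_MAX };

static const char *pm_labels[] = { "mem", "standby", "freeze" };
static const char *pm_states[PM_SUSPEND_MAX];
static int relative_states;		/* relative_sleep_states= boot option */

static int valid_state(int state)	/* stand-in: only "mem" is supported */
{
	return state == PM_SUSPEND_MEM;
}

static void model_suspend_set_ops(void)
{
	int i, j = 0;

	for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) {
		if (valid_state(i)) {
			pm_states[i] = pm_labels[j++];
		} else if (!relative_states) {
			pm_states[i] = NULL;
			j++;
		}
	}
	pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
}

int main(void)
{
	int i;

	model_suspend_set_ops();
	for (i = PM_SUSPEND_FREEZE; i < PM_SUSPEND_MAX; i++)
		printf("state %d -> %s\n", i,
		       pm_states[i] ? pm_states[i] : "(none)");
	return 0;	/* prints freeze / (none) / mem with the stub above */
}
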
*/  	if (test_state == PM_SUSPEND_ON)  		goto done; -	if (!pm_states[test_state].state) { -		printk(warn_bad_state, pm_states[test_state].label); +	if (!pm_states[test_state]) { +		printk(warn_bad_state, pm_states[test_state]);  		goto done;  	} diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839dbca07..de1a6bb6861d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@  #include <linux/poll.h>  #include <linux/irq_work.h>  #include <linux/utsname.h> +#include <linux/ctype.h>  #include <asm/uaccess.h> @@ -56,7 +57,7 @@  int console_printk[4] = {  	CONSOLE_LOGLEVEL_DEFAULT,	/* console_loglevel */ -	DEFAULT_MESSAGE_LOGLEVEL,	/* default_message_loglevel */ +	MESSAGE_LOGLEVEL_DEFAULT,	/* default_message_loglevel */  	CONSOLE_LOGLEVEL_MIN,		/* minimum_console_loglevel */  	CONSOLE_LOGLEVEL_DEFAULT,	/* default_console_loglevel */  }; @@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip)   * This is used for debugging the mess that is the VT code by   * keeping track if we have the console semaphore held. It's   * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held).   */  static int console_locked, console_suspended; @@ -146,8 +147,8 @@ static int console_may_schedule;   * the overall length of the record.   *   * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored.   *   * If the heads indicate available messages, the length in the header   * tells the start next message. 
A length == 0 for the next message @@ -257,7 +258,7 @@ static u64 clear_seq;  static u32 clear_idx;  #define PREFIX_MAX		32 -#define LOG_LINE_MAX		1024 - PREFIX_MAX +#define LOG_LINE_MAX		(1024 - PREFIX_MAX)  /* record buffer */  #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -266,6 +267,7 @@ static u32 clear_idx;  #define LOG_ALIGN __alignof__(struct printk_log)  #endif  #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)  static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);  static char *log_buf = __log_buf;  static u32 log_buf_len = __LOG_BUF_LEN; @@ -344,7 +346,7 @@ static int log_make_free_space(u32 msg_size)  	while (log_first_seq < log_next_seq) {  		if (logbuf_has_space(msg_size, false))  			return 0; -		/* drop old messages until we have enough continuous space */ +		/* drop old messages until we have enough contiguous space */  		log_first_idx = log_next(log_first_idx);  		log_first_seq++;  	} @@ -453,11 +455,7 @@ static int log_store(int facility, int level,  	return msg->text_len;  } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);  static int syslog_action_restricted(int type)  { @@ -828,34 +826,74 @@ void log_buf_kexec_setup(void)  /* requested log_buf_len from kernel cmdline */  static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size)  { -	unsigned size = memparse(str, &str); -  	if (size)  		size = roundup_pow_of_two(size);  	if (size > log_buf_len)  		new_log_buf_len = size; +} + +/* save requested log_buf_len since it's too early to process it */ +static int __init log_buf_len_setup(char *str) +{ +	unsigned size = memparse(str, &str); + +	log_buf_len_update(size);  	return 0;  }  early_param("log_buf_len", log_buf_len_setup); +static void __init log_buf_add_cpu(void) +{ +	unsigned int cpu_extra; + +	/* +	 * archs should set up cpu_possible_bits properly with +	 * set_cpu_possible() after setup_arch() but just in +	 * case lets ensure this is valid. 
+	 */ +	if (num_possible_cpus() == 1) +		return; + +	cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + +	/* by default this will only continue through for large > 64 CPUs */ +	if (cpu_extra <= __LOG_BUF_LEN / 2) +		return; + +	pr_info("log_buf_len individual max cpu contribution: %d bytes\n", +		__LOG_CPU_MAX_BUF_LEN); +	pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", +		cpu_extra); +	pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + +	log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} +  void __init setup_log_buf(int early)  {  	unsigned long flags;  	char *new_log_buf;  	int free; +	if (log_buf != __log_buf) +		return; + +	if (!early && !new_log_buf_len) +		log_buf_add_cpu(); +  	if (!new_log_buf_len)  		return;  	if (early) {  		new_log_buf = -			memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); +			memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);  	} else { -		new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); +		new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, +							  LOG_ALIGN);  	}  	if (unlikely(!new_log_buf)) { @@ -872,7 +910,7 @@ void __init setup_log_buf(int early)  	memcpy(log_buf, __log_buf, __LOG_BUF_LEN);  	raw_spin_unlock_irqrestore(&logbuf_lock, flags); -	pr_info("log_buf_len: %d\n", log_buf_len); +	pr_info("log_buf_len: %d bytes\n", log_buf_len);  	pr_info("early log buf free: %d(%d%%)\n",  		free, (free * 100) / __LOG_BUF_LEN);  } @@ -881,7 +919,7 @@ static bool __read_mostly ignore_loglevel;  static int __init ignore_loglevel_setup(char *str)  { -	ignore_loglevel = 1; +	ignore_loglevel = true;  	pr_info("debug: ignoring loglevel setting.\n");  	return 0; @@ -947,11 +985,7 @@ static inline void boot_delay_msec(int level)  }  #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);  module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);  static size_t print_time(u64 ts, char *buf) @@ -1310,7 +1344,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)  			 * for pending data, not the size; return the count of  			 * records, not the length.  			 */ -			error = log_next_idx - syslog_idx; +			error = log_next_seq - syslog_seq;  		} else {  			u64 seq = syslog_seq;  			u32 idx = syslog_idx; @@ -1416,10 +1450,9 @@ static int have_callable_console(void)  /*   * Can we actually use the console at this time on this cpu?   * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up.   */  static inline int can_use_console(unsigned int cpu)  { @@ -1432,8 +1465,10 @@ static inline int can_use_console(unsigned int cpu)   * console_lock held, and 'console_locked' set) if it   * is successful, false otherwise.   
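
To make the sizing rule in log_buf_add_cpu() above concrete, here is a small user-space model of the same arithmetic; the shift values are example figures chosen for illustration, not the Kconfig defaults:

/* Model of the per-CPU log buffer scaling: the extra space is only added
 * when it exceeds half of the static buffer, and the result is rounded
 * up to a power of two. */
#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	unsigned int log_buf_len = 1 << 18;	/* __LOG_BUF_LEN: 256 KiB */
	unsigned int cpu_max = 1 << 12;		/* __LOG_CPU_MAX_BUF_LEN: 4 KiB */
	unsigned int cpus;

	for (cpus = 1; cpus <= 256; cpus *= 4) {
		unsigned int cpu_extra = (cpus - 1) * cpu_max;
		unsigned long new_len = log_buf_len;

		if (cpus > 1 && cpu_extra > log_buf_len / 2)
			new_len = roundup_pow_of_two(cpu_extra + log_buf_len);
		printf("%3u cpus -> %7lu bytes\n", cpus, new_len);
	}
	return 0;	/* stays at 256 KiB until the CPU count gets large */
}
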
*/ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void)  { +	unsigned int cpu = smp_processor_id(); +  	if (!console_trylock())  		return 0;  	/* @@ -1476,7 +1511,7 @@ static struct cont {  	struct task_struct *owner;	/* task of first print*/  	u64 ts_nsec;			/* time of first print */  	u8 level;			/* log level of first message */ -	u8 facility;			/* log level of first message */ +	u8 facility;			/* log facility of first message */  	enum log_flags flags;		/* prefix, newline flags */  	bool flushed:1;			/* buffer sealed and committed */  } cont; @@ -1608,7 +1643,8 @@ asmlinkage int vprintk_emit(int facility, int level,  		 */  		if (!oops_in_progress && !lockdep_recursing(current)) {  			recursion_bug = 1; -			goto out_restore_irqs; +			local_irq_restore(flags); +			return 0;  		}  		zap_locks();  	} @@ -1716,21 +1752,30 @@ asmlinkage int vprintk_emit(int facility, int level,  	logbuf_cpu = UINT_MAX;  	raw_spin_unlock(&logbuf_lock); +	lockdep_on(); +	local_irq_restore(flags);  	/* If called from the scheduler, we can not call up(). */  	if (!in_sched) { +		lockdep_off(); +		/* +		 * Disable preemption to avoid being preempted while holding +		 * console_sem which would prevent anyone from printing to +		 * console +		 */ +		preempt_disable(); +  		/*  		 * Try to acquire and then immediately release the console  		 * semaphore.  The release will print out buffers and wake up  		 * /dev/kmsg and syslog() users.  		 */ -		if (console_trylock_for_printk(this_cpu)) +		if (console_trylock_for_printk())  			console_unlock(); +		preempt_enable(); +		lockdep_on();  	} -	lockdep_on(); -out_restore_irqs: -	local_irq_restore(flags);  	return printed_len;  }  EXPORT_SYMBOL(vprintk_emit); @@ -1802,7 +1847,7 @@ EXPORT_SYMBOL(printk);  #define LOG_LINE_MAX		0  #define PREFIX_MAX		0 -#define LOG_LINE_MAX 0 +  static u64 syslog_seq;  static u32 syslog_idx;  static u64 console_seq; @@ -1881,11 +1926,12 @@ static int __add_preferred_console(char *name, int idx, char *options,  	return 0;  }  /* - * Set up a list of consoles.  Called from init/main.c + * Set up a console.  Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line.   
*/  static int __init console_setup(char *str)  { -	char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ +	char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */  	char *s, *options, *brl_options = NULL;  	int idx; @@ -1902,7 +1948,8 @@ static int __init console_setup(char *str)  		strncpy(buf, str, sizeof(buf) - 1);  	}  	buf[sizeof(buf) - 1] = 0; -	if ((options = strchr(str, ',')) != NULL) +	options = strchr(str, ','); +	if (options)  		*(options++) = 0;  #ifdef __sparc__  	if (!strcmp(str, "ttya")) @@ -1911,7 +1958,7 @@ static int __init console_setup(char *str)  		strcpy(buf, "ttyS1");  #endif  	for (s = buf; *s; s++) -		if ((*s >= '0' && *s <= '9') || *s == ',') +		if (isdigit(*s) || *s == ',')  			break;  	idx = simple_strtoul(s, NULL, 10);  	*s = 0; @@ -1950,7 +1997,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha  	     i++, c++)  		if (strcmp(c->name, name) == 0 && c->index == idx) {  			strlcpy(c->name, name_new, sizeof(c->name)); -			c->name[sizeof(c->name) - 1] = 0;  			c->options = options;  			c->index = idx_new;  			return i; @@ -1959,12 +2005,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha  	return -1;  } -bool console_suspend_enabled = 1; +bool console_suspend_enabled = true;  EXPORT_SYMBOL(console_suspend_enabled);  static int __init console_suspend_disable(char *str)  { -	console_suspend_enabled = 0; +	console_suspend_enabled = false;  	return 1;  }  __setup("no_console_suspend", console_suspend_disable); @@ -2045,8 +2091,8 @@ EXPORT_SYMBOL(console_lock);  /**   * console_trylock - try to lock the console system for exclusive use.   * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list.   *   * returns 1 on success, and 0 on failure to acquire the lock.   */ @@ -2618,14 +2664,13 @@ EXPORT_SYMBOL(__printk_ratelimit);  bool printk_timed_ratelimit(unsigned long *caller_jiffies,  			unsigned int interval_msecs)  { -	if (*caller_jiffies == 0 -			|| !time_in_range(jiffies, *caller_jiffies, -					*caller_jiffies -					+ msecs_to_jiffies(interval_msecs))) { -		*caller_jiffies = jiffies; -		return true; -	} -	return false; +	unsigned long elapsed = jiffies - *caller_jiffies; + +	if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) +		return false; + +	*caller_jiffies = jiffies; +	return true;  }  EXPORT_SYMBOL(printk_timed_ratelimit); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index adf98622cb32..54e75226c2c4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -28,12 +28,6 @@  #include <linux/compat.h> -static int ptrace_trapping_sleep_fn(void *flags) -{ -	schedule(); -	return 0; -} -  /*   * ptrace a task: make the debugger its new parent and   * move it to the ptrace list. 
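
The printk_timed_ratelimit() rewrite in the printk.c hunk just above (before the ptrace.c diff) relies on wrap-safe unsigned jiffies subtraction instead of time_in_range(). A user-space model with a simulated jiffies counter and a 1:1 msecs_to_jiffies() shows the resulting print/skip pattern; everything outside the ratelimit logic itself is illustrative:

#include <stdio.h>
#include <stdbool.h>

static unsigned long jiffies;			/* simulated clock */

static unsigned long msecs_to_jiffies(unsigned int ms)
{
	return ms;				/* 1:1 for the model only */
}

static bool timed_ratelimit(unsigned long *caller_jiffies,
			    unsigned int interval_msecs)
{
	unsigned long elapsed = jiffies - *caller_jiffies;	/* wrap-safe */

	if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
		return false;

	*caller_jiffies = jiffies;
	return true;
}

int main(void)
{
	unsigned long last = 0;
	unsigned long t;

	for (t = 100000; t <= 102500; t += 500) {
		jiffies = t;
		printf("t=%lu -> %s\n", t,
		       timed_ratelimit(&last, 1000) ? "print" : "skip");
	}
	return 0;	/* prints at t=100000 and t=101500, skips otherwise */
}
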
@@ -371,7 +365,7 @@ unlock_creds:  out:  	if (!retval) {  		wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, -			    ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE); +			    TASK_UNINTERRUPTIBLE);  		proc_ptrace_connector(task, PTRACE_ATTACH);  	} diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index bfda2726ca45..ff1a6de62f17 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)  void kfree(const void *); +/* + * Reclaim the specified callback, either by invoking it (non-lazy case) + * or freeing it directly (lazy case).  Return true if lazy, false otherwise. + */  static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)  {  	unsigned long offset = (unsigned long)head->func; @@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)  		RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));  		kfree((void *)head - offset);  		rcu_lock_release(&rcu_callback_map); -		return 1; +		return true;  	} else {  		RCU_TRACE(trace_rcu_invoke_callback(rn, head));  		head->func(head);  		rcu_lock_release(&rcu_callback_map); -		return 0; +		return false;  	}  } diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index c639556f3fa0..e037f3eb2f7b 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)  	idx = ACCESS_ONCE(sp->completed) & 0x1;  	preempt_disable(); -	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; +	__this_cpu_inc(sp->per_cpu_ref->c[idx]);  	smp_mb(); /* B */  /* Avoid leaking the critical section. */ -	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; +	__this_cpu_inc(sp->per_cpu_ref->seq[idx]);  	preempt_enable();  	return idx;  } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 625d0b0cd75a..1b70cb6fbe3c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1013,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)  }  /* - * Dump stacks of all tasks running on stalled CPUs.  This is a fallback - * for architectures that do not implement trigger_all_cpu_backtrace(). - * The NMI-triggered stack traces are more accurate because they are - * printed by the target CPU. + * Dump stacks of all tasks running on stalled CPUs.   */  static void rcu_dump_cpu_stacks(struct rcu_state *rsp)  { @@ -1094,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)  	       (long)rsp->gpnum, (long)rsp->completed, totqlen);  	if (ndetected == 0)  		pr_err("INFO: Stall ended before state dump start\n"); -	else if (!trigger_all_cpu_backtrace()) +	else  		rcu_dump_cpu_stacks(rsp);  	/* Complain about tasks blocking the grace period. */ @@ -1125,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp)  	pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",  		jiffies - rsp->gp_start,  		(long)rsp->gpnum, (long)rsp->completed, totqlen); -	if (!trigger_all_cpu_backtrace()) -		dump_stack(); +	rcu_dump_cpu_stacks(rsp);  	raw_spin_lock_irqsave(&rnp->lock, flags);  	if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall))) @@ -1305,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,  	 * believe that a grace period is in progress, then we must wait  	 * for the one following, which is in "c".  Because our request  	 * will be noticed at the end of the current grace period, we don't -	 * need to explicitly start one. +	 * need to explicitly start one.  
We only do the lockless check +	 * of rnp_root's fields if the current rcu_node structure thinks +	 * there is no grace period in flight, and because we hold rnp->lock, +	 * the only possible change is when rnp_root's two fields are +	 * equal, in which case rnp_root->gpnum might be concurrently +	 * incremented.  But that is OK, as it will just result in our +	 * doing some extra useless work.  	 */  	if (rnp->gpnum != rnp->completed || -	    ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { +	    ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {  		rnp->need_future_gp[c & 0x1]++;  		trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));  		goto out; @@ -1645,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp)  					    rnp->level, rnp->grplo,  					    rnp->grphi, rnp->qsmask);  		raw_spin_unlock_irq(&rnp->lock); -#ifdef CONFIG_PROVE_RCU_DELAY -		if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && -		    system_state == SYSTEM_RUNNING) -			udelay(200); -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */  		cond_resched();  	} @@ -2347,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  	}  	smp_mb(); /* List handling before counting for rcu_barrier(). */  	rdp->qlen_lazy -= count_lazy; -	ACCESS_ONCE(rdp->qlen) -= count; +	ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;  	rdp->n_cbs_invoked += count;  	/* Reinstate batch limit if we have worked down the excess. */ @@ -2485,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp)  	struct rcu_node *rnp_old = NULL;  	/* Funnel through hierarchy to reduce memory contention. */ -	rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; +	rnp = __this_cpu_read(rsp->rda->mynode);  	for (; rnp != NULL; rnp = rnp->parent) {  		ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||  		      !raw_spin_trylock(&rnp->fqslock);  		if (rnp_old != NULL)  			raw_spin_unlock(&rnp_old->fqslock);  		if (ret) { -			ACCESS_ONCE(rsp->n_force_qs_lh)++; +			rsp->n_force_qs_lh++;  			return;  		}  		rnp_old = rnp; @@ -2504,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp)  	smp_mb__after_unlock_lock();  	raw_spin_unlock(&rnp_old->fqslock);  	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { -		ACCESS_ONCE(rsp->n_force_qs_lh)++; +		rsp->n_force_qs_lh++;  		raw_spin_unlock_irqrestore(&rnp_old->lock, flags);  		return;  /* Someone beat us to it. */  	} @@ -2662,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),  	unsigned long flags;  	struct rcu_data *rdp; -	WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ +	WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */  	if (debug_rcu_head_queue(head)) {  		/* Probable double call_rcu(), so leak the callback. */  		ACCESS_ONCE(head->func) = rcu_leak_callback; @@ -2693,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),  		local_irq_restore(flags);  		return;  	} -	ACCESS_ONCE(rdp->qlen)++; +	ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;  	if (lazy)  		rdp->qlen_lazy++;  	else @@ -3257,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp)  	 * ACCESS_ONCE() to prevent the compiler from speculating  	 * the increment to precede the early-exit check.  	 */ -	ACCESS_ONCE(rsp->n_barrier_done)++; +	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;  	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);  	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);  	smp_mb(); /* Order ->n_barrier_done increment with below mechanism. 
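
Several of the RCU hunks above replace ACCESS_ONCE(x)++ with a plain load plus a marked store. The user-space sketch below illustrates that pattern; the rationale assumed here (the update is already serialized by its owner, so only the store needs to be marked for lockless readers) is an editor's reading, not text from the patch, and the counter name is a stand-in:

#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long qlen;		/* stand-in for a per-CPU counter */

static void owner_enqueue(void)		/* runs with updates serialized */
{
	/* plain load, marked store -- instead of ACCESS_ONCE(qlen)++ */
	ACCESS_ONCE(qlen) = qlen + 1;
}

static unsigned long lockless_read(void)
{
	return ACCESS_ONCE(qlen);	/* e.g. tracing or stall printouts */
}

int main(void)
{
	owner_enqueue();
	printf("qlen=%lu\n", lockless_read());
	return 0;
}
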
*/ @@ -3307,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp)  	/* Increment ->n_barrier_done to prevent duplicate work. */  	smp_mb(); /* Keep increment after above mechanism. */ -	ACCESS_ONCE(rsp->n_barrier_done)++; +	ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;  	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);  	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);  	smp_mb(); /* Keep increment before caller's subsequent code. */ @@ -3564,14 +3561,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)  static void __init rcu_init_one(struct rcu_state *rsp,  		struct rcu_data __percpu *rda)  { -	static char *buf[] = { "rcu_node_0", -			       "rcu_node_1", -			       "rcu_node_2", -			       "rcu_node_3" };  /* Match MAX_RCU_LVLS */ -	static char *fqs[] = { "rcu_node_fqs_0", -			       "rcu_node_fqs_1", -			       "rcu_node_fqs_2", -			       "rcu_node_fqs_3" };  /* Match MAX_RCU_LVLS */ +	static const char * const buf[] = { +		"rcu_node_0", +		"rcu_node_1", +		"rcu_node_2", +		"rcu_node_3" };  /* Match MAX_RCU_LVLS */ +	static const char * const fqs[] = { +		"rcu_node_fqs_0", +		"rcu_node_fqs_1", +		"rcu_node_fqs_2", +		"rcu_node_fqs_3" };  /* Match MAX_RCU_LVLS */  	static u8 fl_mask = 0x1;  	int cpustride = 1;  	int i; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 0f69a79c5b7d..71e64c718f75 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -172,6 +172,14 @@ struct rcu_node {  				/*  queued on this rcu_node structure that */  				/*  are blocking the current grace period, */  				/*  there can be no such task. */ +	struct completion boost_completion; +				/* Used to ensure that the rt_mutex used */ +				/*  to carry out the boosting is fully */ +				/*  released with no future boostee accesses */ +				/*  before that rt_mutex is re-initialized. */ +	struct rt_mutex boost_mtx; +				/* Used only for the priority-boosting */ +				/*  side effect, not as a lock. */  	unsigned long boost_time;  				/* When to start boosting (jiffies). */  	struct task_struct *boost_kthread_task; @@ -334,11 +342,29 @@ struct rcu_data {  	struct rcu_head **nocb_tail;  	atomic_long_t nocb_q_count;	/* # CBs waiting for kthread */  	atomic_long_t nocb_q_count_lazy; /*  (approximate). */ +	struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ +	struct rcu_head **nocb_follower_tail; +	atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */ +	atomic_long_t nocb_follower_count_lazy; /*  (approximate). */  	int nocb_p_count;		/* # CBs being invoked by kthread */  	int nocb_p_count_lazy;		/*  (approximate). */  	wait_queue_head_t nocb_wq;	/* For nocb kthreads to sleep on. */  	struct task_struct *nocb_kthread;  	bool nocb_defer_wakeup;		/* Defer wakeup of nocb_kthread. */ + +	/* The following fields are used by the leader, hence own cacheline. */ +	struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; +					/* CBs waiting for GP. */ +	struct rcu_head **nocb_gp_tail; +	long nocb_gp_count; +	long nocb_gp_count_lazy; +	bool nocb_leader_wake;		/* Is the nocb leader thread awake? */ +	struct rcu_data *nocb_next_follower; +					/* Next follower in wakeup chain. */ + +	/* The following fields are used by the follower, hence new cachline. */ +	struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp; +					/* Leader CPU takes GP-end wakeups. */  #endif /* #ifdef CONFIG_RCU_NOCB_CPU */  	/* 8) RCU CPU stall data. 
*/ @@ -587,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);  /* Sum up queue lengths for tracing. */  static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)  { -	*ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; -	*qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; +	*ql = atomic_long_read(&rdp->nocb_q_count) + +	      rdp->nocb_p_count + +	      atomic_long_read(&rdp->nocb_follower_count) + +	      rdp->nocb_p_count + rdp->nocb_gp_count; +	*qll = atomic_long_read(&rdp->nocb_q_count_lazy) + +	       rdp->nocb_p_count_lazy + +	       atomic_long_read(&rdp->nocb_follower_count_lazy) + +	       rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;  }  #else /* #ifdef CONFIG_RCU_NOCB_CPU */  static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 02ac0fb186b8..00dc411e9676 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -33,6 +33,7 @@  #define RCU_KTHREAD_PRIO 1  #ifdef CONFIG_RCU_BOOST +#include "../locking/rtmutex_common.h"  #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO  #else  #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO @@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t)  	unsigned long flags;  	struct list_head *np;  #ifdef CONFIG_RCU_BOOST -	struct rt_mutex *rbmp = NULL; +	bool drop_boost_mutex = false;  #endif /* #ifdef CONFIG_RCU_BOOST */  	struct rcu_node *rnp;  	int special; @@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)  #ifdef CONFIG_RCU_BOOST  		if (&t->rcu_node_entry == rnp->boost_tasks)  			rnp->boost_tasks = np; -		/* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ -		if (t->rcu_boost_mutex) { -			rbmp = t->rcu_boost_mutex; -			t->rcu_boost_mutex = NULL; -		} +		/* Snapshot ->boost_mtx ownership with rcu_node lock held. */ +		drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;  #endif /* #ifdef CONFIG_RCU_BOOST */  		/* @@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t)  #ifdef CONFIG_RCU_BOOST  		/* Unboost if we were boosted. */ -		if (rbmp) -			rt_mutex_unlock(rbmp); +		if (drop_boost_mutex) { +			rt_mutex_unlock(&rnp->boost_mtx); +			complete(&rnp->boost_completion); +		}  #endif /* #ifdef CONFIG_RCU_BOOST */  		/* @@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)  /* Because preemptible RCU does not exist, no quieting of tasks. */  static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) +	__releases(rnp->lock)  {  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  } @@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status)  static int rcu_boost(struct rcu_node *rnp)  {  	unsigned long flags; -	struct rt_mutex mtx;  	struct task_struct *t;  	struct list_head *tb; @@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp)  	 * section.  	 */  	t = container_of(tb, struct task_struct, rcu_node_entry); -	rt_mutex_init_proxy_locked(&mtx, t); -	t->rcu_boost_mutex = &mtx; +	rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); +	init_completion(&rnp->boost_completion);  	raw_spin_unlock_irqrestore(&rnp->lock, flags); -	rt_mutex_lock(&mtx);  /* Side effect: boosts task t's priority. */ -	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */ +	/* Lock only for side effect: boosts task t's priority. */ +	rt_mutex_lock(&rnp->boost_mtx); +	rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. 
*/ + +	/* Wait for boostee to be done w/boost_mtx before reinitializing. */ +	wait_for_completion(&rnp->boost_completion);  	return ACCESS_ONCE(rnp->exp_tasks) != NULL ||  	       ACCESS_ONCE(rnp->boost_tasks) != NULL; @@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg)   * about it going away.   */  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +	__releases(rnp->lock)  {  	struct task_struct *t; @@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu)  #else /* #ifdef CONFIG_RCU_BOOST */  static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +	__releases(rnp->lock)  {  	raw_spin_unlock_irqrestore(&rnp->lock, flags);  } @@ -2060,6 +2066,22 @@ bool rcu_is_nocb_cpu(int cpu)  #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */  /* + * Kick the leader kthread for this NOCB group. + */ +static void wake_nocb_leader(struct rcu_data *rdp, bool force) +{ +	struct rcu_data *rdp_leader = rdp->nocb_leader; + +	if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) +		return; +	if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { +		/* Prior xchg orders against prior callback enqueue. */ +		ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; +		wake_up(&rdp_leader->nocb_wq); +	} +} + +/*   * Enqueue the specified string of rcu_head structures onto the specified   * CPU's no-CBs lists.  The CPU is specified by rdp, the head of the   * string by rhp, and the tail of the string by rhtp.  The non-lazy/lazy @@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,  	len = atomic_long_read(&rdp->nocb_q_count);  	if (old_rhpp == &rdp->nocb_head) {  		if (!irqs_disabled_flags(flags)) { -			wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */ +			/* ... if queue was empty ... */ +			wake_nocb_leader(rdp, false);  			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,  					    TPS("WakeEmpty"));  		} else { @@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,  		}  		rdp->qlen_last_fqs_check = 0;  	} else if (len > rdp->qlen_last_fqs_check + qhimark) { -		wake_up_process(t); /* ... or if many callbacks queued. */ +		/* ... or if many callbacks queued. */ +		wake_nocb_leader(rdp, true);  		rdp->qlen_last_fqs_check = LONG_MAX / 2;  		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));  	} else { @@ -2213,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)  }  /* + * Leaders come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_leader_wait(struct rcu_data *my_rdp) +{ +	bool firsttime = true; +	bool gotcbs; +	struct rcu_data *rdp; +	struct rcu_head **tail; + +wait_again: + +	/* Wait for callbacks to appear. */ +	if (!rcu_nocb_poll) { +		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); +		wait_event_interruptible(my_rdp->nocb_wq, +					 ACCESS_ONCE(my_rdp->nocb_leader_wake)); +		/* Memory barrier handled by smp_mb() calls below and repoll. */ +	} else if (firsttime) { +		firsttime = false; /* Don't drown trace log with "Poll"! */ +		trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); +	} + +	/* +	 * Each pass through the following loop checks a follower for CBs. +	 * We are our own first follower.  Any CBs found are moved to +	 * nocb_gp_head, where they await a grace period. +	 */ +	gotcbs = false; +	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { +		rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head); +		if (!rdp->nocb_gp_head) +			continue;  /* No CBs here, try next follower. 
*/ + +		/* Move callbacks to wait-for-GP list, which is empty. */ +		ACCESS_ONCE(rdp->nocb_head) = NULL; +		rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); +		rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0); +		rdp->nocb_gp_count_lazy = +			atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); +		gotcbs = true; +	} + +	/* +	 * If there were no callbacks, sleep a bit, rescan after a +	 * memory barrier, and go retry. +	 */ +	if (unlikely(!gotcbs)) { +		if (!rcu_nocb_poll) +			trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, +					    "WokeEmpty"); +		flush_signals(current); +		schedule_timeout_interruptible(1); + +		/* Rescan in case we were a victim of memory ordering. */ +		my_rdp->nocb_leader_wake = false; +		smp_mb();  /* Ensure _wake false before scan. */ +		for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) +			if (ACCESS_ONCE(rdp->nocb_head)) { +				/* Found CB, so short-circuit next wait. */ +				my_rdp->nocb_leader_wake = true; +				break; +			} +		goto wait_again; +	} + +	/* Wait for one grace period. */ +	rcu_nocb_wait_gp(my_rdp); + +	/* +	 * We left ->nocb_leader_wake set to reduce cache thrashing. +	 * We clear it now, but recheck for new callbacks while +	 * traversing our follower list. +	 */ +	my_rdp->nocb_leader_wake = false; +	smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ + +	/* Each pass through the following loop wakes a follower, if needed. */ +	for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { +		if (ACCESS_ONCE(rdp->nocb_head)) +			my_rdp->nocb_leader_wake = true; /* No need to wait. */ +		if (!rdp->nocb_gp_head) +			continue; /* No CBs, so no need to wake follower. */ + +		/* Append callbacks to follower's "done" list. */ +		tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); +		*tail = rdp->nocb_gp_head; +		atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); +		atomic_long_add(rdp->nocb_gp_count_lazy, +				&rdp->nocb_follower_count_lazy); +		if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { +			/* +			 * List was empty, wake up the follower. +			 * Memory barriers supplied by atomic_long_add(). +			 */ +			wake_up(&rdp->nocb_wq); +		} +	} + +	/* If we (the leader) don't have CBs, go wait some more. */ +	if (!my_rdp->nocb_follower_head) +		goto wait_again; +} + +/* + * Followers come here to wait for additional callbacks to show up. + * This function does not return until callbacks appear. + */ +static void nocb_follower_wait(struct rcu_data *rdp) +{ +	bool firsttime = true; + +	for (;;) { +		if (!rcu_nocb_poll) { +			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, +					    "FollowerSleep"); +			wait_event_interruptible(rdp->nocb_wq, +						 ACCESS_ONCE(rdp->nocb_follower_head)); +		} else if (firsttime) { +			/* Don't drown trace log with "Poll"! */ +			firsttime = false; +			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll"); +		} +		if (smp_load_acquire(&rdp->nocb_follower_head)) { +			/* ^^^ Ensure CB invocation follows _head test. */ +			return; +		} +		if (!rcu_nocb_poll) +			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, +					    "WokeEmpty"); +		flush_signals(current); +		schedule_timeout_interruptible(1); +	} +} + +/*   * Per-rcu_data kthread, but only for no-CBs CPUs.  Each kthread invokes - * callbacks queued by the corresponding no-CBs CPU. + * callbacks queued by the corresponding no-CBs CPU, however, there is + * an optional leader-follower relationship so that the grace-period + * kthreads don't have to do quite so many wakeups.   
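
A single-threaded, user-space sketch of the list hand-off used by the leader above: each callback list is kept as a head plus a tail pointer-to-pointer, so appending a whole batch is two pointer updates. The kernel performs the tail update with xchg() because the follower drains its list concurrently; that synchronization is deliberately left out here and all names are illustrative:

#include <stdio.h>
#include <stddef.h>

struct cb {
	struct cb *next;
	const char *name;
};

struct cb_list {
	struct cb *head;
	struct cb **tail;	/* points at head, or at the last ->next */
};

static void cb_list_init(struct cb_list *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

static void cb_list_add(struct cb_list *l, struct cb *cb)
{
	cb->next = NULL;
	*l->tail = cb;
	l->tail = &cb->next;
}

/* Move everything queued on 'src' to the end of 'dst', the way the
 * leader moves ->nocb_gp_head onto a follower's ->nocb_follower_head. */
static void cb_list_splice(struct cb_list *dst, struct cb_list *src)
{
	if (!src->head)
		return;
	*dst->tail = src->head;
	dst->tail = src->tail;
	cb_list_init(src);
}

int main(void)
{
	struct cb a = { .name = "a" }, b = { .name = "b" }, c = { .name = "c" };
	struct cb_list pending, done;
	struct cb *p;

	cb_list_init(&pending);
	cb_list_init(&done);
	cb_list_add(&pending, &a);
	cb_list_add(&pending, &b);
	cb_list_add(&done, &c);

	cb_list_splice(&done, &pending);
	for (p = done.head; p; p = p->next)
		printf("%s\n", p->name);	/* c a b */
	return 0;
}
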
*/  static int rcu_nocb_kthread(void *arg)  {  	int c, cl; -	bool firsttime = 1;  	struct rcu_head *list;  	struct rcu_head *next;  	struct rcu_head **tail; @@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg)  	/* Each pass through this loop invokes one batch of callbacks */  	for (;;) { -		/* If not polling, wait for next batch of callbacks. */ -		if (!rcu_nocb_poll) { -			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, -					    TPS("Sleep")); -			wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); -			/* Memory barrier provide by xchg() below. */ -		} else if (firsttime) { -			firsttime = 0; -			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, -					    TPS("Poll")); -		} -		list = ACCESS_ONCE(rdp->nocb_head); -		if (!list) { -			if (!rcu_nocb_poll) -				trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, -						    TPS("WokeEmpty")); -			schedule_timeout_interruptible(1); -			flush_signals(current); -			continue; -		} -		firsttime = 1; -		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, -				    TPS("WokeNonEmpty")); - -		/* -		 * Extract queued callbacks, update counts, and wait -		 * for a grace period to elapse. -		 */ -		ACCESS_ONCE(rdp->nocb_head) = NULL; -		tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); -		c = atomic_long_xchg(&rdp->nocb_q_count, 0); -		cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); -		ACCESS_ONCE(rdp->nocb_p_count) += c; -		ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; -		rcu_nocb_wait_gp(rdp); +		/* Wait for callbacks. */ +		if (rdp->nocb_leader == rdp) +			nocb_leader_wait(rdp); +		else +			nocb_follower_wait(rdp); + +		/* Pull the ready-to-invoke callbacks onto local list. */ +		list = ACCESS_ONCE(rdp->nocb_follower_head); +		BUG_ON(!list); +		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); +		ACCESS_ONCE(rdp->nocb_follower_head) = NULL; +		tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head); +		c = atomic_long_xchg(&rdp->nocb_follower_count, 0); +		cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0); +		rdp->nocb_p_count += c; +		rdp->nocb_p_count_lazy += cl;  		/* Each pass through the following loop invokes a callback. */  		trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); @@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)  	if (!rcu_nocb_need_deferred_wakeup(rdp))  		return;  	ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; -	wake_up(&rdp->nocb_wq); +	wake_nocb_leader(rdp, false);  	trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));  } @@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)  {  	rdp->nocb_tail = &rdp->nocb_head;  	init_waitqueue_head(&rdp->nocb_wq); +	rdp->nocb_follower_tail = &rdp->nocb_follower_head;  } -/* Create a kthread for each RCU flavor for each no-CBs CPU. */ +/* How many follower CPU IDs per leader?  Default of -1 for sqrt(nr_cpu_ids). */ +static int rcu_nocb_leader_stride = -1; +module_param(rcu_nocb_leader_stride, int, 0444); + +/* + * Create a kthread for each RCU flavor for each no-CBs CPU. + * Also initialize leader-follower relationships. + */  static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)  {  	int cpu; +	int ls = rcu_nocb_leader_stride; +	int nl = 0;  /* Next leader. */  	struct rcu_data *rdp; +	struct rcu_data *rdp_leader = NULL;  /* Suppress misguided gcc warn. 
*/ +	struct rcu_data *rdp_prev = NULL;  	struct task_struct *t;  	if (rcu_nocb_mask == NULL)  		return; +#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) +	if (tick_nohz_full_running) +		cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask); +#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */ +	if (ls == -1) { +		ls = int_sqrt(nr_cpu_ids); +		rcu_nocb_leader_stride = ls; +	} + +	/* +	 * Each pass through this loop sets up one rcu_data structure and +	 * spawns one rcu_nocb_kthread(). +	 */  	for_each_cpu(cpu, rcu_nocb_mask) {  		rdp = per_cpu_ptr(rsp->rda, cpu); +		if (rdp->cpu >= nl) { +			/* New leader, set up for followers & next leader. */ +			nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; +			rdp->nocb_leader = rdp; +			rdp_leader = rdp; +		} else { +			/* Another follower, link to previous leader. */ +			rdp->nocb_leader = rdp_leader; +			rdp_prev->nocb_next_follower = rdp; +		} +		rdp_prev = rdp; + +		/* Spawn the kthread for this CPU. */  		t = kthread_run(rcu_nocb_kthread, rdp,  				"rcuo%c/%d", rsp->abbr, cpu);  		BUG_ON(IS_ERR(t)); @@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)   */  static void rcu_bind_gp_kthread(void)  { -#ifdef CONFIG_NO_HZ_FULL -	int cpu = ACCESS_ONCE(tick_do_timer_cpu); +	int __maybe_unused cpu; -	if (cpu < 0 || cpu >= nr_cpu_ids) +	if (!tick_nohz_full_enabled())  		return; -	if (raw_smp_processor_id() != cpu) +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +	cpu = tick_do_timer_cpu; +	if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)  		set_cpus_allowed_ptr(current, cpumask_of(cpu)); -#endif /* #ifdef CONFIG_NO_HZ_FULL */ +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +	if (!is_housekeeping_cpu(raw_smp_processor_id())) +		housekeeping_affine(current); +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */  } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index bc7883570530..4056d7992a6c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -90,9 +90,6 @@ void __rcu_read_unlock(void)  	} else {  		barrier();  /* critical section before exit code. */  		t->rcu_read_lock_nesting = INT_MIN; -#ifdef CONFIG_PROVE_RCU_DELAY -		udelay(10); /* Make preemption more probable. 
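
A stand-alone model of the leader/follower CPU grouping built in rcu_spawn_nocb_kthreads() above; the CPU count and the stride are made-up example values (the kernel defaults the stride to int_sqrt(nr_cpu_ids) unless rcu_nocb_leader_stride= is given):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define NR_CPUS			16

int main(void)
{
	int ls = 4;		/* example stride, ~int_sqrt(16) */
	int nl = 0;		/* next leader */
	int leader = -1;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu >= nl) {	/* this CPU starts a new group */
			nl = DIV_ROUND_UP(cpu + 1, ls) * ls;
			leader = cpu;
		}
		printf("cpu %2d -> leader %2d\n", cpu, leader);
	}
	return 0;		/* leaders: 0, 4, 8, 12 */
}
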
*/ -#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */  		barrier();  /* assign before ->rcu_read_unlock_special load */  		if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))  			rcu_read_unlock_special(t); diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237ac32db..da14b8d09296 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock);  static struct resource *bootmem_resource_free;  static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only)  { -	struct resource *p = v; -	(*pos)++; +	/* Caller wants to traverse through siblings only */ +	if (sibling_only) +		return p->sibling; +  	if (p->child)  		return p->child;  	while (!p->sibling && p->parent) @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)  	return p->sibling;  } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ +	struct resource *p = v; +	(*pos)++; +	return (void *)next_resource(p, false); +} +  #ifdef CONFIG_PROC_FS  enum { MAX_IORES_LEVEL = 5 }; @@ -322,16 +331,19 @@ int release_resource(struct resource *old)  EXPORT_SYMBOL(release_resource); -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)  /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end)   * the caller must specify res->start, res->end, res->flags and "name".   * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true.   */ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, +			       bool first_level_children_only)  {  	resource_size_t start, end;  	struct resource *p; +	bool sibling_only = false;  	BUG_ON(!res); @@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name)  	BUG_ON(start >= end);  	read_lock(&resource_lock); -	for (p = iomem_resource.child; p ; p = p->sibling) { -		/* system ram is just marked as IORESOURCE_MEM */ + +	if (first_level_children_only) { +		p = iomem_resource.child; +		sibling_only = true; +	} else +		p = &iomem_resource; + +	while ((p = next_resource(p, sibling_only))) {  		if (p->flags != res->flags)  			continue;  		if (name && strcmp(p->name, name)) @@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name)  		if ((p->end >= start) && (p->start < end))  			break;  	} +  	read_unlock(&resource_lock);  	if (!p)  		return -1; @@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name)  }  /* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. 
+ * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, +		void *arg, int (*func)(u64, u64, void *)) +{ +	struct resource res; +	u64 orig_end; +	int ret = -1; + +	res.start = start; +	res.end = end; +	res.flags = flags; +	orig_end = res.end; +	while ((res.start < res.end) && +		(!find_next_iomem_res(&res, name, false))) { +		ret = (*func)(res.start, res.end, arg); +		if (ret) +			break; +		res.start = res.end + 1; +		res.end = orig_end; +	} +	return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, +				int (*func)(u64, u64, void *)) +{ +	struct resource res; +	u64 orig_end; +	int ret = -1; + +	res.start = start; +	res.end = end; +	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; +	orig_end = res.end; +	while ((res.start < res.end) && +		(!find_next_iomem_res(&res, "System RAM", true))) { +		ret = (*func)(res.start, res.end, arg); +		if (ret) +			break; +		res.start = res.end + 1; +		res.end = orig_end; +	} +	return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + +/*   * This function calls callback against all memory range of "System RAM"   * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.   * Now, this function is only for "System RAM". @@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,  	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;  	orig_end = res.end;  	while ((res.start < res.end) && -		(find_next_system_ram(&res, "System RAM") >= 0)) { +		(find_next_iomem_res(&res, "System RAM", true) >= 0)) {  		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;  		end_pfn = (res.end + 1) >> PAGE_SHIFT;  		if (end_pfn > pfn) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc1638b33449..1211575a2208 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)  		return;  	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; +	if (delta < 0) +		return;  	rq->clock += delta;  	update_rq_clock_task(rq, delta);  } @@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,  	char buf[64];  	char *cmp;  	int i; +	struct inode *inode;  	if (cnt > 63)  		cnt = 63; @@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,  	buf[cnt] = 0;  	cmp = strstrip(buf); +	/* Ensure the static_key remains in a consistent state */ +	inode = file_inode(filp); +	mutex_lock(&inode->i_mutex);  	i = sched_feat_set(cmp); +	mutex_unlock(&inode->i_mutex);  	if (i == __SCHED_FEAT_NR)  		return -EINVAL; @@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)  #endif  /* - * resched_task - mark a task 'to be rescheduled now'. + * resched_curr - mark rq's current task 'to be rescheduled now'.   *   * On UP this means the setting of the need_resched flag, on SMP it   * might also involve a cross-CPU call to trigger the scheduler on   * the target CPU.   
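
A hypothetical caller of the walk_system_ram_res() helper added to kernel/resource.c above, summing the bytes of "System RAM" in a range. Only the helper's signature and return convention are taken from the patch; the function names here are illustrative and the declaration is assumed to live in linux/ioport.h as in mainline:

#include <linux/kernel.h>
#include <linux/ioport.h>

static int count_ram_bytes(u64 start, u64 end, void *arg)
{
	u64 *total = arg;

	*total += end - start + 1;	/* resource ranges are inclusive */
	return 0;			/* non-zero would stop the walk */
}

static u64 ram_bytes_in_range(u64 start, u64 end)
{
	u64 total = 0;

	walk_system_ram_res(start, end, &total, count_ram_bytes);
	return total;
}
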
*/ -void resched_task(struct task_struct *p) +void resched_curr(struct rq *rq)  { +	struct task_struct *curr = rq->curr;  	int cpu; -	lockdep_assert_held(&task_rq(p)->lock); +	lockdep_assert_held(&rq->lock); -	if (test_tsk_need_resched(p)) +	if (test_tsk_need_resched(curr))  		return; -	cpu = task_cpu(p); +	cpu = cpu_of(rq);  	if (cpu == smp_processor_id()) { -		set_tsk_need_resched(p); +		set_tsk_need_resched(curr);  		set_preempt_need_resched();  		return;  	} -	if (set_nr_and_not_polling(p)) +	if (set_nr_and_not_polling(curr))  		smp_send_reschedule(cpu);  	else  		trace_sched_wake_idle_without_ipi(cpu); @@ -623,7 +631,7 @@ void resched_cpu(int cpu)  	if (!raw_spin_trylock_irqsave(&rq->lock, flags))  		return; -	resched_task(cpu_curr(cpu)); +	resched_curr(rq);  	raw_spin_unlock_irqrestore(&rq->lock, flags);  } @@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)  static bool wake_up_full_nohz_cpu(int cpu)  { +	/* +	 * We just need the target to call irq_exit() and re-evaluate +	 * the next tick. The nohz full kick at least implies that. +	 * If needed we can still optimize that later with an +	 * empty IRQ. +	 */  	if (tick_nohz_full_cpu(cpu)) {  		if (cpu != smp_processor_id() ||  		    tick_nohz_tick_stopped()) -			smp_send_reschedule(cpu); +			tick_nohz_full_kick_cpu(cpu);  		return true;  	} @@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)  #ifdef CONFIG_NO_HZ_FULL  bool sched_can_stop_tick(void)  { -       struct rq *rq; - -       rq = this_rq(); - -       /* Make sure rq->nr_running update is visible after the IPI */ -       smp_rmb(); - -       /* More than one running task need preemption */ -       if (rq->nr_running > 1) -               return false; +	/* +	 * More than one running task need preemption. +	 * nr_running update is assumed to be visible +	 * after IPI is sent from wakers. +	 */ +	if (this_rq()->nr_running > 1) +		return false; -       return true; +	return true;  }  #endif /* CONFIG_NO_HZ_FULL */ @@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)  			if (class == rq->curr->sched_class)  				break;  			if (class == p->sched_class) { -				resched_task(rq->curr); +				resched_curr(rq);  				break;  			}  		} @@ -1568,9 +1579,7 @@ void scheduler_ipi(void)  	 */  	preempt_fold_need_resched(); -	if (llist_empty(&this_rq()->wake_list) -			&& !tick_nohz_full_cpu(smp_processor_id()) -			&& !got_nohz_idle_kick()) +	if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())  		return;  	/* @@ -1587,7 +1596,6 @@ void scheduler_ipi(void)  	 * somewhat pessimize the simple resched case.  	 */  	irq_enter(); -	tick_nohz_full_check();  	sched_ttwu_pending();  	/* @@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)  {  	u64 ns = 0; -	if (task_current(rq, p)) { +	/* +	 * Must be ->curr _and_ ->on_rq.  If dequeued, we would +	 * project cycles that may never be accounted to this +	 * thread, breaking clock_gettime(). +	 */ +	if (task_current(rq, p) && p->on_rq) {  		update_rq_clock(rq);  		ns = rq_clock_task(rq) - p->se.exec_start;  		if ((s64)ns < 0) @@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)  	 * If we race with it leaving cpu, we'll take a lock. So we're correct.  	 * If we race with it entering cpu, unaccounted time is 0. This is  	 * indistinguishable from the read occurring a few cycles earlier. +	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has +	 * been accounted, so we're correct here as well.  	 
*/ -	if (!p->on_cpu) +	if (!p->on_cpu || !p->on_rq)  		return p->se.sum_exec_runtime;  #endif @@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	}  	trace_sched_pi_setprio(p, prio); -	p->pi_top_task = rt_mutex_get_top_task(p);  	oldprio = p->prio;  	prev_class = p->sched_class;  	on_rq = p->on_rq; @@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)  	 *          running task  	 */  	if (dl_prio(prio)) { -		if (!dl_prio(p->normal_prio) || (p->pi_top_task && -			dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { +		struct task_struct *pi_task = rt_mutex_get_top_task(p); +		if (!dl_prio(p->normal_prio) || +		    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {  			p->dl.dl_boosted = 1;  			p->dl.dl_throttled = 0;  			enqueue_flag = ENQUEUE_REPLENISH; @@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)  		 * lowered its priority, then reschedule its CPU:  		 */  		if (delta < 0 || (delta > 0 && task_running(rq, p))) -			resched_task(rq->curr); +			resched_curr(rq);  	}  out_unlock:  	task_rq_unlock(rq, p, &flags); @@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)  	dl_se->dl_yielded = 0;  } +/* + * sched_setparam() passes in -1 for its policy, to let the functions + * it calls know not to change it. + */ +#define SETPARAM_POLICY	-1 +  static void __setscheduler_params(struct task_struct *p,  		const struct sched_attr *attr)  {  	int policy = attr->sched_policy; -	if (policy == -1) /* setparam */ +	if (policy == SETPARAM_POLICY)  		policy = p->policy;  	p->policy = policy; @@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,  		.sched_nice	= PRIO_TO_NICE(p->static_prio),  	}; -	/* -	 * Fixup the legacy SCHED_RESET_ON_FORK hack -	 */ -	if (policy & SCHED_RESET_ON_FORK) { +	/* Fixup the legacy SCHED_RESET_ON_FORK hack. */ +	if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {  		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;  		policy &= ~SCHED_RESET_ON_FORK;  		attr.sched_policy = policy; @@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,   */  SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)  { -	return do_sched_setscheduler(pid, -1, param); +	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);  }  /** @@ -4285,7 +4304,7 @@ again:  		 * fairness.  		 */  		if (preempt && rq != p_rq) -			resched_task(p_rq->curr); +			resched_curr(p_rq);  	}  out_unlock: @@ -6465,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,  		sched_domain_level_max = max(sched_domain_level_max, sd->level);  		child->parent = sd;  		sd->child = child; + +		if (!cpumask_subset(sched_domain_span(child), +				    sched_domain_span(sd))) { +			pr_err("BUG: arch topology borken\n"); +#ifdef CONFIG_SCHED_DEBUG +			pr_err("     the %s domain not a subset of the %s domain\n", +					child->name, sd->name); +#endif +			/* Fixup, ensure @sd has at least @child cpus. 
*/ +			cpumask_or(sched_domain_span(sd), +				   sched_domain_span(sd), +				   sched_domain_span(child)); +		} +  	}  	set_domain_attribute(sd, attr); @@ -7092,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)  	__setscheduler(rq, p, &attr);  	if (on_rq) {  		enqueue_task(rq, p, 0); -		resched_task(rq->curr); +		resched_curr(rq);  	}  	check_class_changed(rq, p, prev_class, old_prio); @@ -7803,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  	if (period > max_cfs_quota_period)  		return -EINVAL; +	/* +	 * Prevent race between setting of cfs_rq->runtime_enabled and +	 * unthrottle_offline_cfs_rqs(). +	 */ +	get_online_cpus();  	mutex_lock(&cfs_constraints_mutex);  	ret = __cfs_schedulable(tg, period, quota);  	if (ret) @@ -7828,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  	}  	raw_spin_unlock_irq(&cfs_b->lock); -	for_each_possible_cpu(i) { +	for_each_online_cpu(i) {  		struct cfs_rq *cfs_rq = tg->cfs_rq[i];  		struct rq *rq = cfs_rq->rq; @@ -7844,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)  		cfs_bandwidth_usage_dec();  out_unlock:  	mutex_unlock(&cfs_constraints_mutex); +	put_online_cpus();  	return ret;  } @@ -8083,7 +8122,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {  	.can_attach	= cpu_cgroup_can_attach,  	.attach		= cpu_cgroup_attach,  	.exit		= cpu_cgroup_exit, -	.base_cftypes	= cpu_files, +	.legacy_cftypes	= cpu_files,  	.early_init	= 1,  }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9cf350c94ec4..dd7cbb55bbf2 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)  struct cgroup_subsys cpuacct_cgrp_subsys = {  	.css_alloc	= cpuacct_css_alloc,  	.css_free	= cpuacct_css_free, -	.base_cftypes	= files, +	.legacy_cftypes	= files,  	.early_init	= 1,  }; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fc4f98b1258f..255ce138b652 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,   * the overrunning entity can't interfere with other entity in the system and   * can't make them miss their deadlines. Reasons why this kind of overruns   * could happen are, typically, a entity voluntarily trying to overcome its - * runtime, or it just underestimated it during sched_setscheduler_ex(). + * runtime, or it just underestimated it during sched_setattr().   
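The corrected comment above refers to sched_setattr(), the interface through which a task declares the runtime, deadline and period that replenish_dl_entity() later enforces. glibc has no wrapper for it, so the sketch below declares struct sched_attr by hand and uses the raw syscall; it assumes 3.14-or-later headers that define SYS_sched_setattr, and the hand-copied layout is an assumption of this example rather than something taken from the diff.

/* Sketch: request SCHED_DEADLINE with a 10ms budget every 100ms.
 * struct sched_attr is declared locally because glibc has no wrapper. */
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	=  10 * 1000 * 1000,	/* 10ms  */
		.sched_deadline	=  30 * 1000 * 1000,	/* 30ms  */
		.sched_period	= 100 * 1000 * 1000,	/* 100ms */
	};

	if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0) {
		perror("sched_setattr");
		return 1;
	}
	puts("running as SCHED_DEADLINE");
	return 0;
}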
*/  static void replenish_dl_entity(struct sched_dl_entity *dl_se,  				struct sched_dl_entity *pi_se) @@ -535,7 +535,7 @@ again:  		if (task_has_dl_policy(rq->curr))  			check_preempt_curr_dl(rq, p, 0);  		else -			resched_task(rq->curr); +			resched_curr(rq);  #ifdef CONFIG_SMP  		/*  		 * Queueing this task back might have overloaded rq, @@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq)  			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);  		if (!is_leftmost(curr, &rq->dl)) -			resched_task(curr); +			resched_curr(rq);  	}  	/* @@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)  	    cpudl_find(&rq->rd->cpudl, p, NULL) != -1)  		return; -	resched_task(rq->curr); +	resched_curr(rq);  }  static int pull_dl_task(struct rq *this_rq); @@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,  				  int flags)  {  	if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { -		resched_task(rq->curr); +		resched_curr(rq);  		return;  	} @@ -1333,7 +1333,7 @@ retry:  	if (dl_task(rq->curr) &&  	    dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&  	    rq->curr->nr_cpus_allowed > 1) { -		resched_task(rq->curr); +		resched_curr(rq);  		return 0;  	} @@ -1373,7 +1373,7 @@ retry:  	set_task_cpu(next_task, later_rq->cpu);  	activate_task(later_rq, next_task, 0); -	resched_task(later_rq->curr); +	resched_curr(later_rq);  	double_unlock_balance(rq, later_rq); @@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,  		 */  		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&  		    rq->curr == p) -			resched_task(p); +			resched_curr(rq);  #else  		/*  		 * Again, we don't know if p has a earlier  		 * or later deadline, so let's blindly set a  		 * (maybe not needed) rescheduling point.  		 */ -		resched_task(p); +		resched_curr(rq);  #endif /* CONFIG_SMP */  	} else  		switched_to_dl(rq, p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fea7d3335e1f..bfa3c86d0d68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)  	if (!cpus)  		return; -	ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;  	ns->task_capacity =  		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);  	ns->has_free_capacity = (ns->nr_running < ns->task_capacity); @@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,  	env->best_cpu = env->dst_cpu;  } -static bool load_too_imbalanced(long orig_src_load, long orig_dst_load, -				long src_load, long dst_load, +static bool load_too_imbalanced(long src_load, long dst_load,  				struct task_numa_env *env)  {  	long imb, old_imb; +	long orig_src_load, orig_dst_load; +	long src_capacity, dst_capacity; + +	/* +	 * The load is corrected for the CPU capacity available on each node. +	 * +	 * src_load        dst_load +	 * ------------ vs --------- +	 * src_capacity    dst_capacity +	 */ +	src_capacity = env->src_stats.compute_capacity; +	dst_capacity = env->dst_stats.compute_capacity;  	/* We care about the slope of the imbalance, not the direction. */  	if (dst_load < src_load)  		swap(dst_load, src_load);  	/* Is the difference below the threshold? 
*/ -	imb = dst_load * 100 - src_load * env->imbalance_pct; +	imb = dst_load * src_capacity * 100 - +	      src_load * dst_capacity * env->imbalance_pct;  	if (imb <= 0)  		return false; @@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,  	 * The imbalance is above the allowed threshold.  	 * Compare it with the old imbalance.  	 */ +	orig_src_load = env->src_stats.load; +	orig_dst_load = env->dst_stats.load; +  	if (orig_dst_load < orig_src_load)  		swap(orig_dst_load, orig_src_load); -	old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct; +	old_imb = orig_dst_load * src_capacity * 100 - +		  orig_src_load * dst_capacity * env->imbalance_pct;  	/* Would this change make things worse? */  	return (imb > old_imb); @@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,  	struct rq *src_rq = cpu_rq(env->src_cpu);  	struct rq *dst_rq = cpu_rq(env->dst_cpu);  	struct task_struct *cur; -	long orig_src_load, src_load; -	long orig_dst_load, dst_load; +	long src_load, dst_load;  	long load; -	long imp = (groupimp > 0) ? groupimp : taskimp; +	long imp = env->p->numa_group ? groupimp : taskimp; +	long moveimp = imp;  	rcu_read_lock();  	cur = ACCESS_ONCE(dst_rq->curr); @@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,  			 * itself (not part of a group), use the task weight  			 * instead.  			 */ -			if (env->p->numa_group) -				imp = groupimp; -			else -				imp = taskimp; -  			if (cur->numa_group)  				imp += group_weight(cur, env->src_nid) -  				       group_weight(cur, env->dst_nid); @@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,  		}  	} -	if (imp < env->best_imp) +	if (imp <= env->best_imp && moveimp <= env->best_imp)  		goto unlock;  	if (!cur) { @@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,  	}  	/* Balance doesn't matter much if we're running a task per cpu */ -	if (src_rq->nr_running == 1 && dst_rq->nr_running == 1) +	if (imp > env->best_imp && src_rq->nr_running == 1 && +			dst_rq->nr_running == 1)  		goto assign;  	/*  	 * In the overloaded case, try and keep the load balanced.  	 */  balance: -	orig_dst_load = env->dst_stats.load; -	orig_src_load = env->src_stats.load; - -	/* XXX missing capacity terms */  	load = task_h_load(env->p); -	dst_load = orig_dst_load + load; -	src_load = orig_src_load - load; +	dst_load = env->dst_stats.load + load; +	src_load = env->src_stats.load - load; + +	if (moveimp > imp && moveimp > env->best_imp) { +		/* +		 * If the improvement from just moving env->p direction is +		 * better than swapping tasks around, check if a move is +		 * possible. Store a slightly smaller score than moveimp, +		 * so an actually idle CPU will win. +		 */ +		if (!load_too_imbalanced(src_load, dst_load, env)) { +			imp = moveimp - 1; +			cur = NULL; +			goto assign; +		} +	} + +	if (imp <= env->best_imp) +		goto unlock;  	if (cur) {  		load = task_h_load(cur); @@ -1225,8 +1249,7 @@ balance:  		src_load += load;  	} -	if (load_too_imbalanced(orig_src_load, orig_dst_load, -				src_load, dst_load, env)) +	if (load_too_imbalanced(src_load, dst_load, env))  		goto unlock;  assign: @@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)  	groupimp = group_weight(p, env.dst_nid) - groupweight;  	update_numa_stats(&env.dst_stats, env.dst_nid); -	/* If the preferred nid has free capacity, try to use it. 
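The capacity terms added to load_too_imbalanced() above cross-multiply the two load/capacity fractions so they can be compared with integer math. The standalone model below mirrors just that threshold test (the kernel additionally compares the result against the pre-move imbalance); the numbers are hypothetical and the helper name is this example's own:

/* Model of the cross-multiplied comparison
 *   src_load/src_capacity  vs  dst_load/dst_capacity
 * with imbalance_pct percent of slack on the source side. */
#include <stdbool.h>
#include <stdio.h>

static bool too_imbalanced(long src_load, long dst_load,
			   long src_capacity, long dst_capacity,
			   int imbalance_pct)
{
	long tmp;

	/* Like the kernel code, only the magnitude of the gap matters. */
	if (dst_load < src_load) {
		tmp = dst_load; dst_load = src_load; src_load = tmp;
	}

	return dst_load * src_capacity * 100 >
	       src_load * dst_capacity * imbalance_pct;
}

int main(void)
{
	/* Destination node has twice the compute capacity of the source. */
	printf("%d\n", too_imbalanced(800, 2200, 512, 1024, 125)); /* 1: too imbalanced */
	printf("%d\n", too_imbalanced(800, 1800, 512, 1024, 125)); /* 0: acceptable */
	return 0;
}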
*/ -	if (env.dst_stats.has_free_capacity) -		task_numa_find_cpu(&env, taskimp, groupimp); +	/* Try to find a spot on the preferred nid. */ +	task_numa_find_cpu(&env, taskimp, groupimp);  	/* No space available on the preferred nid. Look elsewhere. */  	if (env.best_cpu == -1) { @@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)  		}  	} -	/* No better CPU than the current one was found. */ -	if (env.best_cpu == -1) -		return -EAGAIN; -  	/*  	 * If the task is part of a workload that spans multiple NUMA nodes,  	 * and is migrating into one of the workload's active nodes, remember @@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)  	 * A task that migrated to a second choice node will be better off  	 * trying for a better one later. Do not set the preferred node here.  	 */ -	if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes)) -		sched_setnuma(p, env.dst_nid); +	if (p->numa_group) { +		if (env.best_cpu == -1) +			nid = env.src_nid; +		else +			nid = env.dst_nid; + +		if (node_isset(nid, p->numa_group->active_nodes)) +			sched_setnuma(p, env.dst_nid); +	} + +	/* No better CPU than the current one was found. */ +	if (env.best_cpu == -1) +		return -EAGAIN;  	/*  	 * Reset the scan period if the task is being rescheduled on an @@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)  /*   * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS   * increments. The more local the fault statistics are, the higher the scan - * period will be for the next scan window. If local/remote ratio is below - * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the - * scan period will decrease + * period will be for the next scan window. If local/(local+remote) ratio is + * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) + * the scan period will decrease. Aim for 70% local accesses.   */  #define NUMA_PERIOD_SLOTS 10 -#define NUMA_PERIOD_THRESHOLD 3 +#define NUMA_PERIOD_THRESHOLD 7  /*   * Increase the scan period (slow down scanning) if the majority of @@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)  	if (p->numa_group) {  		update_numa_active_node_mask(p->numa_group); -		/* -		 * If the preferred task and group nids are different, -		 * iterate over the nodes again to find the best place. 
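Bumping NUMA_PERIOD_THRESHOLD from 3 to 7 moves the target to roughly 70% local faults, since the threshold is expressed in tenths (NUMA_PERIOD_SLOTS is 10) of the local/(local+remote) ratio the comment describes. A toy model of that slot arithmetic, with invented fault counts and a deliberately simplified decision (the real update_task_scan_period() does more than this):

/* Model: scanning slows down once local faults fill at least
 * NUMA_PERIOD_THRESHOLD of the NUMA_PERIOD_SLOTS slots. */
#include <stdio.h>

#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	7	/* aim for ~70% local accesses */

int main(void)
{
	unsigned long local = 720, remote = 280;	/* hypothetical counters */
	unsigned long ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);

	if (ratio >= NUMA_PERIOD_THRESHOLD)
		puts("mostly local: lengthen the scan period (scan less)");
	else
		puts("too many remote faults: shorten the scan period");
	return 0;
}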
-		 */ -		if (max_nid != max_group_nid) { -			unsigned long weight, max_weight = 0; - -			for_each_online_node(nid) { -				weight = task_weight(p, nid) + group_weight(p, nid); -				if (weight > max_weight) { -					max_weight = weight; -					max_nid = nid; -				} -			} -		} -  		spin_unlock_irq(group_lock); +		max_nid = max_group_nid;  	} -	/* Preferred node as the node with the most faults */ -	if (max_faults && max_nid != p->numa_preferred_nid) { -		/* Update the preferred nid and migrate task if possible */ -		sched_setnuma(p, max_nid); -		numa_migrate_preferred(p); +	if (max_faults) { +		/* Set the new preferred node */ +		if (max_nid != p->numa_preferred_nid) +			sched_setnuma(p, max_nid); + +		if (task_node(p) != p->numa_preferred_nid) +			numa_migrate_preferred(p);  	}  } @@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)  	ideal_runtime = sched_slice(cfs_rq, curr);  	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;  	if (delta_exec > ideal_runtime) { -		resched_task(rq_of(cfs_rq)->curr); +		resched_curr(rq_of(cfs_rq));  		/*  		 * The current task ran long enough, ensure it doesn't get  		 * re-elected due to buddy favours. @@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)  		return;  	if (delta > ideal_runtime) -		resched_task(rq_of(cfs_rq)->curr); +		resched_curr(rq_of(cfs_rq));  }  static void @@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)  	 * validating it and just reschedule.  	 */  	if (queued) { -		resched_task(rq_of(cfs_rq)->curr); +		resched_curr(rq_of(cfs_rq));  		return;  	}  	/* @@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)  	 * hierarchy can be throttled  	 */  	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) -		resched_task(rq_of(cfs_rq)->curr); +		resched_curr(rq_of(cfs_rq));  }  static __always_inline @@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)  	cfs_rq->throttled = 1;  	cfs_rq->throttled_clock = rq_clock(rq);  	raw_spin_lock(&cfs_b->lock); -	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); +	/* +	 * Add to the _head_ of the list, so that an already-started +	 * distribute_cfs_runtime will not see us +	 */ +	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);  	if (!cfs_b->timer_active)  		__start_cfs_bandwidth(cfs_b, false);  	raw_spin_unlock(&cfs_b->lock); @@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)  	/* determine whether we need to wake up potentially idle cpu */  	if (rq->curr == rq->idle && rq->cfs.nr_running) -		resched_task(rq->curr); +		resched_curr(rq);  }  static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,  		u64 remaining, u64 expires)  {  	struct cfs_rq *cfs_rq; -	u64 runtime = remaining; +	u64 runtime; +	u64 starting_runtime = remaining;  	rcu_read_lock();  	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -3448,7 +3469,7 @@ next:  	}  	rcu_read_unlock(); -	return remaining; +	return starting_runtime - remaining;  }  /* @@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)  	/* account preceding periods in which throttling occurred */  	cfs_b->nr_throttled += overrun; -	/* -	 * There are throttled entities so we must first use the new bandwidth -	 * to unthrottle them before making it generally available.  
This -	 * ensures that all existing debts will be paid before a new cfs_rq is -	 * allowed to run. -	 */ -	runtime = cfs_b->runtime;  	runtime_expires = cfs_b->runtime_expires; -	cfs_b->runtime = 0;  	/* -	 * This check is repeated as we are holding onto the new bandwidth -	 * while we unthrottle.  This can potentially race with an unthrottled -	 * group trying to acquire new bandwidth from the global pool. +	 * This check is repeated as we are holding onto the new bandwidth while +	 * we unthrottle. This can potentially race with an unthrottled group +	 * trying to acquire new bandwidth from the global pool. This can result +	 * in us over-using our runtime if it is all used during this loop, but +	 * only by limited amounts in that extreme case.  	 */ -	while (throttled && runtime > 0) { +	while (throttled && cfs_b->runtime > 0) { +		runtime = cfs_b->runtime;  		raw_spin_unlock(&cfs_b->lock);  		/* we can't nest cfs_b->lock while distributing bandwidth */  		runtime = distribute_cfs_runtime(cfs_b, runtime, @@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)  		raw_spin_lock(&cfs_b->lock);  		throttled = !list_empty(&cfs_b->throttled_cfs_rq); + +		cfs_b->runtime -= min(runtime, cfs_b->runtime);  	} -	/* return (any) remaining runtime */ -	cfs_b->runtime = runtime;  	/*  	 * While we are ensured activity in the period following an  	 * unthrottle, this also covers the case in which the new bandwidth is @@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  		return;  	} -	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { +	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)  		runtime = cfs_b->runtime; -		cfs_b->runtime = 0; -	} +  	expires = cfs_b->runtime_expires;  	raw_spin_unlock(&cfs_b->lock); @@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)  	raw_spin_lock(&cfs_b->lock);  	if (expires == cfs_b->runtime_expires) -		cfs_b->runtime = runtime; +		cfs_b->runtime -= min(runtime, cfs_b->runtime);  	raw_spin_unlock(&cfs_b->lock);  } @@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)  	hrtimer_cancel(&cfs_b->slack_timer);  } +static void __maybe_unused update_runtime_enabled(struct rq *rq) +{ +	struct cfs_rq *cfs_rq; + +	for_each_leaf_cfs_rq(rq, cfs_rq) { +		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + +		raw_spin_lock(&cfs_b->lock); +		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF; +		raw_spin_unlock(&cfs_b->lock); +	} +} +  static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)  {  	struct cfs_rq *cfs_rq; @@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)  		 * there's some valid quota amount  		 */  		cfs_rq->runtime_remaining = 1; +		/* +		 * Offline rq is schedulable till cpu is completely disabled +		 * in take_cpu_down(), so we prevent new cfs throttling here. 
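The bandwidth rework above stops the period timer from emptying the global pool before distributing: distribute_cfs_runtime() now returns how much it actually handed out, and the caller charges that back with a min() clamp, accepting the bounded over-use the new comment mentions. A simplified, lock-free model of the new accounting, with made-up numbers and no kernel types:

/* Model: distribute() returns the amount consumed; the caller then
 * charges it against the global pool, clamped to what is left. */
#include <stdio.h>

typedef unsigned long long u64;

/* Pretend three throttled groups each still need this much runtime. */
static u64 need[] = { 30, 50, 40 };

static u64 distribute(u64 budget)
{
	u64 remaining = budget;

	for (unsigned int i = 0; i < sizeof(need) / sizeof(need[0]); i++) {
		u64 grant = need[i] < remaining ? need[i] : remaining;

		remaining -= grant;
		need[i] -= grant;
		if (!remaining)
			break;
	}
	return budget - remaining;		/* amount handed out */
}

int main(void)
{
	u64 pool = 100;				/* stands in for cfs_b->runtime */
	u64 used = distribute(pool);

	pool -= used < pool ? used : pool;	/* pool -= min(used, pool) */
	printf("distributed %llu, %llu left in the pool\n", used, pool);
	return 0;
}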
+		 */ +		cfs_rq->runtime_enabled = 0; +  		if (cfs_rq_throttled(cfs_rq))  			unthrottle_cfs_rq(cfs_rq);  	} @@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)  	return NULL;  }  static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +static inline void update_runtime_enabled(struct rq *rq) {}  static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}  #endif /* CONFIG_CFS_BANDWIDTH */ @@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)  		if (delta < 0) {  			if (rq->curr == p) -				resched_task(p); +				resched_curr(rq);  			return;  		} @@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_  	return;  preempt: -	resched_task(curr); +	resched_curr(rq);  	/*  	 * Only set the backward buddy when the current task is still  	 * on the rq. This can happen when a wakeup gets interleaved @@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)  /*   * Is this task likely cache-hot:   */ -static int -task_hot(struct task_struct *p, u64 now) +static int task_hot(struct task_struct *p, struct lb_env *env)  {  	s64 delta; @@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)  	/*  	 * Buddy candidates are cache hot:  	 */ -	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && +	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&  			(&p->se == cfs_rq_of(&p->se)->next ||  			 &p->se == cfs_rq_of(&p->se)->last))  		return 1; @@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)  	if (sysctl_sched_migration_cost == 0)  		return 0; -	delta = now - p->se.exec_start; +	delta = rq_clock_task(env->src_rq) - p->se.exec_start;  	return delta < (s64)sysctl_sched_migration_cost;  } @@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)  	 * 2) task is cache cold, or  	 * 3) too many balance attempts have failed.  	 */ -	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); +	tsk_cache_hot = task_hot(p, env);  	if (!tsk_cache_hot)  		tsk_cache_hot = migrate_degrades_locality(p, env); @@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro   * @load_idx: Load index of sched_domain of this_cpu for load calc.   * @local_group: Does group contain this_cpu.   * @sgs: variable to hold the statistics for this group. + * @overload: Indicate more than one runnable task for any CPU.   
*/  static inline void update_sg_lb_stats(struct lb_env *env,  			struct sched_group *group, int load_idx, -			int local_group, struct sg_lb_stats *sgs) +			int local_group, struct sg_lb_stats *sgs, +			bool *overload)  {  	unsigned long load;  	int i; @@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,  		sgs->group_load += load;  		sgs->sum_nr_running += rq->nr_running; + +		if (rq->nr_running > 1) +			*overload = true; +  #ifdef CONFIG_NUMA_BALANCING  		sgs->nr_numa_running += rq->nr_numa_running;  		sgs->nr_preferred_running += rq->nr_preferred_running; @@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd  	struct sched_group *sg = env->sd->groups;  	struct sg_lb_stats tmp_sgs;  	int load_idx, prefer_sibling = 0; +	bool overload = false;  	if (child && child->flags & SD_PREFER_SIBLING)  		prefer_sibling = 1; @@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd  				update_group_capacity(env->sd, env->dst_cpu);  		} -		update_sg_lb_stats(env, sg, load_idx, local_group, sgs); +		update_sg_lb_stats(env, sg, load_idx, local_group, sgs, +						&overload);  		if (local_group)  			goto next_group; @@ -6049,6 +6091,13 @@ next_group:  	if (env->sd->flags & SD_NUMA)  		env->fbq_type = fbq_classify_group(&sds->busiest_stat); + +	if (!env->sd->parent) { +		/* update overload indicator if we are at root domain */ +		if (env->dst_rq->rd->overload != overload) +			env->dst_rq->rd->overload = overload; +	} +  }  /** @@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq)  	 */  	this_rq->idle_stamp = rq_clock(this_rq); -	if (this_rq->avg_idle < sysctl_sched_migration_cost) { +	if (this_rq->avg_idle < sysctl_sched_migration_cost || +	    !this_rq->rd->overload) {  		rcu_read_lock();  		sd = rcu_dereference_check_sched_domain(this_rq->sd);  		if (sd) @@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)  static void rq_online_fair(struct rq *rq)  {  	update_sysctl(); + +	update_runtime_enabled(rq);  }  static void rq_offline_fair(struct rq *rq) @@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)  		 * 'current' within the tree based on its new key value.  		 */  		swap(curr->vruntime, se->vruntime); -		resched_task(rq->curr); +		resched_curr(rq);  	}  	se->vruntime -= cfs_rq->min_vruntime; @@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)  	 */  	if (rq->curr == p) {  		if (p->prio > oldprio) -			resched_task(rq->curr); +			resched_curr(rq);  	} else  		check_preempt_curr(rq, p, 0);  } @@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)  	 * if we can still preempt the current task.  	 */  	if (rq->curr == p) -		resched_task(rq->curr); +		resched_curr(rq);  	else  		check_preempt_curr(rq, p, 0);  } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cf009fb0bc25..11e7bc434f43 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -79,7 +79,7 @@ static void cpuidle_idle_call(void)  	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);  	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);  	int next_state, entered_state; -	bool broadcast; +	unsigned int broadcast;  	/*  	 * Check if the idle task must be rescheduled. 
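The new rd->overload flag exists so idle_balance() can bail out immediately when no CPU in the root domain has more than one runnable task, because there is nothing worth pulling. A toy model of how the flag falls out of the per-CPU run-queue lengths (the array stands in for the rq iteration; not kernel code):

/* Model: overload is true iff some CPU has more than one runnable task;
 * while it is false an idle CPU can skip the pull attempt entirely. */
#include <stdbool.h>
#include <stdio.h>

static bool root_domain_overloaded(const unsigned int *nr_running, int ncpus)
{
	for (int i = 0; i < ncpus; i++)
		if (nr_running[i] > 1)
			return true;
	return false;
}

int main(void)
{
	unsigned int quiet[] = { 1, 0, 1, 1 };	/* at most one task per CPU */
	unsigned int busy[]  = { 1, 3, 0, 1 };	/* CPU1 has work to give away */

	printf("quiet: overload=%d, idle_balance can return early\n",
	       root_domain_overloaded(quiet, 4));
	printf("busy:  overload=%d, worth searching for work\n",
	       root_domain_overloaded(busy, 4));
	return 0;
}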
If it is the @@ -135,7 +135,7 @@ use_default:  		goto exit_idle;  	} -	broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); +	broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;  	/*  	 * Tell the time framework to switch to a broadcast timer @@ -147,8 +147,6 @@ use_default:  	    clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))  		goto use_default; -	trace_cpu_idle_rcuidle(next_state, dev->cpu); -  	/*  	 * Enter the idle state previously returned by the governor decision.  	 * This function will block until an interrupt occurs and will take @@ -156,8 +154,6 @@ use_default:  	 */  	entered_state = cpuidle_enter(drv, dev, next_state); -	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); -  	if (broadcast)  		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 879f2b75266a..67ad4e7f506a 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)   */  static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)  { -	resched_task(rq->idle); +	resched_curr(rq);  }  static struct task_struct * diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a49083192c64..5f6edca4fafd 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);  static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  {  	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; +	struct rq *rq = rq_of_rt_rq(rt_rq);  	struct sched_rt_entity *rt_se; -	int cpu = cpu_of(rq_of_rt_rq(rt_rq)); +	int cpu = cpu_of(rq);  	rt_se = rt_rq->tg->rt_se[cpu]; @@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  			enqueue_rt_entity(rt_se, false);  		if (rt_rq->highest_prio.curr < curr->prio) -			resched_task(curr); +			resched_curr(rq);  	}  } @@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)  		return;  	enqueue_top_rt_rq(rt_rq); -	resched_task(rq->curr); +	resched_curr(rq);  }  static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) @@ -740,6 +741,9 @@ balanced:  		rt_rq->rt_throttled = 0;  		raw_spin_unlock(&rt_rq->rt_runtime_lock);  		raw_spin_unlock(&rt_b->rt_runtime_lock); + +		/* Make rt_rq available for pick_next_task() */ +		sched_rt_rq_enqueue(rt_rq);  	}  } @@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq)  			raw_spin_lock(&rt_rq->rt_runtime_lock);  			rt_rq->rt_time += delta_exec;  			if (sched_rt_runtime_exceeded(rt_rq)) -				resched_task(curr); +				resched_curr(rq);  			raw_spin_unlock(&rt_rq->rt_runtime_lock);  		}  	} @@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)  	 * to try and push current away:  	 */  	requeue_task_rt(rq, p, 1); -	resched_task(rq->curr); +	resched_curr(rq);  }  #endif /* CONFIG_SMP */ @@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)  static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)  {  	if (p->prio < rq->curr->prio) { -		resched_task(rq->curr); +		resched_curr(rq);  		return;  	} @@ -1690,7 +1694,7 @@ retry:  	 * just reschedule current.  	 
*/  	if (unlikely(next_task->prio < rq->curr->prio)) { -		resched_task(rq->curr); +		resched_curr(rq);  		return 0;  	} @@ -1737,7 +1741,7 @@ retry:  	activate_task(lowest_rq, next_task, 0);  	ret = 1; -	resched_task(lowest_rq->curr); +	resched_curr(lowest_rq);  	double_unlock_balance(rq, lowest_rq); @@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)  		return;  	if (pull_rt_task(rq)) -		resched_task(rq->curr); +		resched_curr(rq);  }  void __init init_sched_rt_class(void) @@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)  			check_resched = 0;  #endif /* CONFIG_SMP */  		if (check_resched && p->prio < rq->curr->prio) -			resched_task(rq->curr); +			resched_curr(rq);  	}  } @@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)  		 * Only reschedule if p is still on the same runqueue.  		 */  		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) -			resched_task(p); +			resched_curr(rq);  #else  		/* For UP simply resched on drop of prio */  		if (oldprio < p->prio) -			resched_task(p); +			resched_curr(rq);  #endif /* CONFIG_SMP */  	} else {  		/* @@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)  		 * then reschedule.  		 */  		if (p->prio < rq->curr->prio) -			resched_task(rq->curr); +			resched_curr(rq);  	}  } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 31cc02ebc54e..579712f4e9d5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -477,6 +477,9 @@ struct root_domain {  	cpumask_var_t span;  	cpumask_var_t online; +	/* Indicate more than one runnable task for any CPU */ +	bool overload; +  	/*  	 * The bit corresponding to a CPU gets set here if such CPU has more  	 * than one runnable -deadline task (as it is below for RT tasks). @@ -884,20 +887,10 @@ enum {  #undef SCHED_FEAT  #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct static_key *key) -{ -	return static_key_true(key); /* Not out of line branch. */ -} - -static __always_inline bool static_branch__false(struct static_key *key) -{ -	return static_key_false(key); /* Out of line branch. */ -} -  #define SCHED_FEAT(name, enabled)					\  static __always_inline bool static_branch_##name(struct static_key *key) \  {									\ -	return static_branch__##enabled(key);				\ +	return static_key_##enabled(key);				\  }  #include "features.h" @@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void);  extern void init_sched_fair_class(void);  extern void init_sched_dl_class(void); -extern void resched_task(struct task_struct *p); +extern void resched_curr(struct rq *rq);  extern void resched_cpu(int cpu);  extern struct rt_bandwidth def_rt_bandwidth; @@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count)  	rq->nr_running = prev_nr + count; -#ifdef CONFIG_NO_HZ_FULL  	if (prev_nr < 2 && rq->nr_running >= 2) { +#ifdef CONFIG_SMP +		if (!rq->rd->overload) +			rq->rd->overload = true; +#endif + +#ifdef CONFIG_NO_HZ_FULL  		if (tick_nohz_full_cpu(rq->cpu)) { -			/* Order rq->nr_running write against the IPI */ -			smp_wmb(); -			smp_send_reschedule(rq->cpu); +			/* +			 * Tick is needed if more than one task runs on a CPU. +			 * Send the target an IPI to kick it out of nohz mode. +			 * +			 * We assume that IPI implies full memory barrier and the +			 * new value of rq->nr_running is visible on reception +			 * from the target. 
+			 */ +			tick_nohz_full_kick_cpu(rq->cpu);  		} -       }  #endif +	}  }  static inline void sub_nr_running(struct rq *rq, unsigned count) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 0ffa20ae657b..15cab1a4f84e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function);   */  int __sched  __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, -			int (*action)(void *), unsigned mode) +	      wait_bit_action_f *action, unsigned mode)  {  	int ret = 0;  	do {  		prepare_to_wait(wq, &q->wait, mode);  		if (test_bit(q->key.bit_nr, q->key.flags)) -			ret = (*action)(q->key.flags); +			ret = (*action)(&q->key);  	} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);  	finish_wait(wq, &q->wait);  	return ret; @@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,  EXPORT_SYMBOL(__wait_on_bit);  int __sched out_of_line_wait_on_bit(void *word, int bit, -					int (*action)(void *), unsigned mode) +				    wait_bit_action_f *action, unsigned mode)  {  	wait_queue_head_t *wq = bit_waitqueue(word, bit);  	DEFINE_WAIT_BIT(wait, word, bit); @@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit);  int __sched  __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, -			int (*action)(void *), unsigned mode) +			wait_bit_action_f *action, unsigned mode)  {  	do {  		int ret; @@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,  		prepare_to_wait_exclusive(wq, &q->wait, mode);  		if (!test_bit(q->key.bit_nr, q->key.flags))  			continue; -		ret = action(q->key.flags); +		ret = action(&q->key);  		if (!ret)  			continue;  		abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,  EXPORT_SYMBOL(__wait_on_bit_lock);  int __sched out_of_line_wait_on_bit_lock(void *word, int bit, -					int (*action)(void *), unsigned mode) +					 wait_bit_action_f *action, unsigned mode)  {  	wait_queue_head_t *wq = bit_waitqueue(word, bit);  	DEFINE_WAIT_BIT(wait, word, bit); @@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p)  	__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);  }  EXPORT_SYMBOL(wake_up_atomic_t); + +__sched int bit_wait(struct wait_bit_key *word) +{ +	if (signal_pending_state(current->state, current)) +		return 1; +	schedule(); +	return 0; +} +EXPORT_SYMBOL(bit_wait); + +__sched int bit_wait_io(struct wait_bit_key *word) +{ +	if (signal_pending_state(current->state, current)) +		return 1; +	io_schedule(); +	return 0; +} +EXPORT_SYMBOL(bit_wait_io); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 301bbc24739c..25b0043f4755 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -18,15 +18,17 @@  #include <linux/compat.h>  #include <linux/sched.h>  #include <linux/seccomp.h> +#include <linux/slab.h> +#include <linux/syscalls.h>  /* #define SECCOMP_DEBUG 1 */  #ifdef CONFIG_SECCOMP_FILTER  #include <asm/syscall.h>  #include <linux/filter.h> +#include <linux/pid.h>  #include <linux/ptrace.h>  #include <linux/security.h> -#include <linux/slab.h>  #include <linux/tracehook.h>  #include <linux/uaccess.h> @@ -54,7 +56,7 @@  struct seccomp_filter {  	atomic_t usage;  	struct seccomp_filter *prev; -	struct sk_filter *prog; +	struct bpf_prog *prog;  };  /* Limit any path through the tree to 256KB worth of instructions. 
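The wait.c hunks above change the bit-wait action callback from int (*)(void *) to wait_bit_action_f, so actions such as the new bit_wait_io() receive the whole struct wait_bit_key rather than a bare flags pointer. A minimal userspace model of that signature change, using stand-in types (only the flags and bit_nr fields are taken from the diff):

/* Model: hand the action a descriptive key instead of a void pointer. */
#include <stdio.h>

struct wait_bit_key {
	void *flags;	/* the word being waited on */
	int bit_nr;	/* which bit within that word */
};

typedef int wait_bit_action_f(struct wait_bit_key *key);

static int my_action(struct wait_bit_key *key)
{
	printf("waiting on bit %d of word %p\n", key->bit_nr, key->flags);
	return 0;	/* 0: no signal pending, caller may keep waiting */
}

static int run_action(wait_bit_action_f *action, struct wait_bit_key *key)
{
	return action(key);	/* mirrors ret = (*action)(&q->key) */
}

int main(void)
{
	unsigned long word = 0;
	struct wait_bit_key key = { .flags = &word, .bit_nr = 3 };

	return run_action(my_action, &key);
}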
*/ @@ -87,7 +89,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)   *	@filter: filter to verify   *	@flen: length of filter   * - * Takes a previously checked filter (by sk_chk_filter) and + * Takes a previously checked filter (by bpf_check_classic) and   * redirects all filter code that loads struct sk_buff data   * and related data through seccomp_bpf_load.  It also   * enforces length and alignment checking of those loads. @@ -172,51 +174,184 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)   */  static u32 seccomp_run_filters(int syscall)  { -	struct seccomp_filter *f; +	struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);  	struct seccomp_data sd;  	u32 ret = SECCOMP_RET_ALLOW;  	/* Ensure unexpected behavior doesn't result in failing open. */ -	if (WARN_ON(current->seccomp.filter == NULL)) +	if (unlikely(WARN_ON(f == NULL)))  		return SECCOMP_RET_KILL; +	/* Make sure cross-thread synced filter points somewhere sane. */ +	smp_read_barrier_depends(); +  	populate_seccomp_data(&sd);  	/*  	 * All filters in the list are evaluated and the lowest BPF return  	 * value always takes priority (ignoring the DATA).  	 */ -	for (f = current->seccomp.filter; f; f = f->prev) { -		u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd); +	for (; f; f = f->prev) { +		u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);  		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))  			ret = cur_ret;  	}  	return ret;  } +#endif /* CONFIG_SECCOMP_FILTER */ + +static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) +{ +	BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + +	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) +		return false; + +	return true; +} + +static inline void seccomp_assign_mode(struct task_struct *task, +				       unsigned long seccomp_mode) +{ +	BUG_ON(!spin_is_locked(&task->sighand->siglock)); + +	task->seccomp.mode = seccomp_mode; +	/* +	 * Make sure TIF_SECCOMP cannot be set before the mode (and +	 * filter) is set. +	 */ +	smp_mb__before_atomic(); +	set_tsk_thread_flag(task, TIF_SECCOMP); +} + +#ifdef CONFIG_SECCOMP_FILTER +/* Returns 1 if the parent is an ancestor of the child. */ +static int is_ancestor(struct seccomp_filter *parent, +		       struct seccomp_filter *child) +{ +	/* NULL is the root ancestor. */ +	if (parent == NULL) +		return 1; +	for (; child; child = child->prev) +		if (child == parent) +			return 1; +	return 0; +}  /** - * seccomp_attach_filter: Attaches a seccomp filter to current. + * seccomp_can_sync_threads: checks if all threads can be synchronized + * + * Expects sighand and cred_guard_mutex locks to be held. + * + * Returns 0 on success, -ve on error, or the pid of a thread which was + * either not in the correct seccomp mode or it did not have an ancestral + * seccomp filter. + */ +static inline pid_t seccomp_can_sync_threads(void) +{ +	struct task_struct *thread, *caller; + +	BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); +	BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + +	/* Validate all threads being eligible for synchronization. */ +	caller = current; +	for_each_thread(caller, thread) { +		pid_t failed; + +		/* Skip current, since it is initiating the sync. 
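is_ancestor() above carries the TSYNC eligibility rule: a sibling thread may be synchronized only if its current filter sits somewhere on the caller's filter chain, with NULL counting as the root ancestor. A standalone model of that walk over a singly linked chain (hypothetical node type, not kernel code):

/* Model of is_ancestor(): filters form a chain via ->prev and
 * NULL is treated as the ancestor of every chain. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct filter {
	const char *name;
	struct filter *prev;
};

static bool is_ancestor(struct filter *parent, struct filter *child)
{
	if (parent == NULL)
		return true;
	for (; child; child = child->prev)
		if (child == parent)
			return true;
	return false;
}

int main(void)
{
	struct filter a = { "a", NULL };
	struct filter b = { "b", &a };		/* b was stacked on top of a */
	struct filter c = { "c", NULL };	/* an unrelated chain */

	printf("a ancestor of b: %d\n", is_ancestor(&a, &b));	/* 1 */
	printf("b ancestor of a: %d\n", is_ancestor(&b, &a));	/* 0 */
	printf("a ancestor of c: %d\n", is_ancestor(&a, &c));	/* 0 */
	return 0;
}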
*/ +		if (thread == caller) +			continue; + +		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || +		    (thread->seccomp.mode == SECCOMP_MODE_FILTER && +		     is_ancestor(thread->seccomp.filter, +				 caller->seccomp.filter))) +			continue; + +		/* Return the first thread that cannot be synchronized. */ +		failed = task_pid_vnr(thread); +		/* If the pid cannot be resolved, then return -ESRCH */ +		if (unlikely(WARN_ON(failed == 0))) +			failed = -ESRCH; +		return failed; +	} + +	return 0; +} + +/** + * seccomp_sync_threads: sets all threads to use current's filter + * + * Expects sighand and cred_guard_mutex locks to be held, and for + * seccomp_can_sync_threads() to have returned success already + * without dropping the locks. + * + */ +static inline void seccomp_sync_threads(void) +{ +	struct task_struct *thread, *caller; + +	BUG_ON(!mutex_is_locked(¤t->signal->cred_guard_mutex)); +	BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + +	/* Synchronize all threads. */ +	caller = current; +	for_each_thread(caller, thread) { +		/* Skip current, since it needs no changes. */ +		if (thread == caller) +			continue; + +		/* Get a task reference for the new leaf node. */ +		get_seccomp_filter(caller); +		/* +		 * Drop the task reference to the shared ancestor since +		 * current's path will hold a reference.  (This also +		 * allows a put before the assignment.) +		 */ +		put_seccomp_filter(thread); +		smp_store_release(&thread->seccomp.filter, +				  caller->seccomp.filter); +		/* +		 * Opt the other thread into seccomp if needed. +		 * As threads are considered to be trust-realm +		 * equivalent (see ptrace_may_access), it is safe to +		 * allow one thread to transition the other. +		 */ +		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { +			/* +			 * Don't let an unprivileged task work around +			 * the no_new_privs restriction by creating +			 * a thread that sets it up, enters seccomp, +			 * then dies. +			 */ +			if (task_no_new_privs(caller)) +				task_set_no_new_privs(thread); + +			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); +		} +	} +} + +/** + * seccomp_prepare_filter: Prepares a seccomp filter for use.   * @fprog: BPF program to install   * - * Returns 0 on success or an errno on failure. + * Returns filter on success or an ERR_PTR on failure.   */ -static long seccomp_attach_filter(struct sock_fprog *fprog) +static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)  {  	struct seccomp_filter *filter; -	unsigned long fp_size = fprog->len * sizeof(struct sock_filter); -	unsigned long total_insns = fprog->len; +	unsigned long fp_size;  	struct sock_filter *fp;  	int new_len;  	long ret;  	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) -		return -EINVAL; - -	for (filter = current->seccomp.filter; filter; filter = filter->prev) -		total_insns += filter->prog->len + 4;  /* include a 4 instr penalty */ -	if (total_insns > MAX_INSNS_PER_PATH) -		return -ENOMEM; +		return ERR_PTR(-EINVAL); +	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); +	fp_size = fprog->len * sizeof(struct sock_filter);  	/*  	 * Installing a seccomp filter requires that the task has @@ -224,14 +359,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)  	 * This avoids scenarios where unprivileged tasks can affect the  	 * behavior of privileged children.  	 
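The no_new_privs handling above is also why the usual userspace sequence sets that flag right before loading a filter: without it, an unprivileged task may not attach a filter at all. For reference, the standard prctl() calls involved (illustrative, independent of the kernel-side plumbing in this diff):

/* Illustration: set and read back the one-way no_new_privs flag. */
#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS	38
#endif
#ifndef PR_GET_NO_NEW_PRIVS
#define PR_GET_NO_NEW_PRIVS	39
#endif

int main(void)
{
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
		perror("PR_SET_NO_NEW_PRIVS");
		return 1;
	}
	/* The flag sticks and is inherited across fork() and execve(). */
	printf("no_new_privs = %d\n", prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0));
	return 0;
}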
*/ -	if (!current->no_new_privs && +	if (!task_no_new_privs(current) &&  	    security_capable_noaudit(current_cred(), current_user_ns(),  				     CAP_SYS_ADMIN) != 0) -		return -EACCES; +		return ERR_PTR(-EACCES);  	fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);  	if (!fp) -		return -ENOMEM; +		return ERR_PTR(-ENOMEM);  	/* Copy the instructions from fprog. */  	ret = -EFAULT; @@ -239,7 +374,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)  		goto free_prog;  	/* Check and rewrite the fprog via the skb checker */ -	ret = sk_chk_filter(fp, fprog->len); +	ret = bpf_check_classic(fp, fprog->len);  	if (ret)  		goto free_prog; @@ -248,8 +383,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)  	if (ret)  		goto free_prog; -	/* Convert 'sock_filter' insns to 'sock_filter_int' insns */ -	ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); +	/* Convert 'sock_filter' insns to 'bpf_insn' insns */ +	ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);  	if (ret)  		goto free_prog; @@ -260,12 +395,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)  	if (!filter)  		goto free_prog; -	filter->prog = kzalloc(sk_filter_size(new_len), +	filter->prog = kzalloc(bpf_prog_size(new_len),  			       GFP_KERNEL|__GFP_NOWARN);  	if (!filter->prog)  		goto free_filter; -	ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); +	ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);  	if (ret)  		goto free_filter_prog;  	kfree(fp); @@ -273,15 +408,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)  	atomic_set(&filter->usage, 1);  	filter->prog->len = new_len; -	sk_filter_select_runtime(filter->prog); +	bpf_prog_select_runtime(filter->prog); -	/* -	 * If there is an existing filter, make it the prev and don't drop its -	 * task reference. -	 */ -	filter->prev = current->seccomp.filter; -	current->seccomp.filter = filter; -	return 0; +	return filter;  free_filter_prog:  	kfree(filter->prog); @@ -289,19 +418,20 @@ free_filter:  	kfree(filter);  free_prog:  	kfree(fp); -	return ret; +	return ERR_PTR(ret);  }  /** - * seccomp_attach_user_filter - attaches a user-supplied sock_fprog + * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog   * @user_filter: pointer to the user data containing a sock_fprog.   *   * Returns 0 on success and non-zero otherwise.   */ -static long seccomp_attach_user_filter(char __user *user_filter) +static struct seccomp_filter * +seccomp_prepare_user_filter(const char __user *user_filter)  {  	struct sock_fprog fprog; -	long ret = -EFAULT; +	struct seccomp_filter *filter = ERR_PTR(-EFAULT);  #ifdef CONFIG_COMPAT  	if (is_compat_task()) { @@ -314,9 +444,56 @@ static long seccomp_attach_user_filter(char __user *user_filter)  #endif  	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))  		goto out; -	ret = seccomp_attach_filter(&fprog); +	filter = seccomp_prepare_filter(&fprog);  out: -	return ret; +	return filter; +} + +/** + * seccomp_attach_filter: validate and attach filter + * @flags:  flags to change filter behavior + * @filter: seccomp filter to add to the current process + * + * Caller must be holding current->sighand->siglock lock. + * + * Returns 0 on success, -ve on error. + */ +static long seccomp_attach_filter(unsigned int flags, +				  struct seccomp_filter *filter) +{ +	unsigned long total_insns; +	struct seccomp_filter *walker; + +	BUG_ON(!spin_is_locked(¤t->sighand->siglock)); + +	/* Validate resulting filter length. 
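What seccomp_prepare_user_filter() copies in is an ordinary classic-BPF sock_fprog. The smallest useful program, shown below going through the pre-existing prctl() path, allows every syscall; a real filter would inspect arch and nr from struct seccomp_data first. Illustrative only, and it assumes headers that define SECCOMP_MODE_FILTER, SECCOMP_RET_ALLOW and PR_SET_NO_NEW_PRIVS:

/* Illustration: a one-instruction allow-all classic BPF filter,
 * installed with prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, ...). */
#include <stdio.h>
#include <sys/prctl.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0 ||
	    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) {
		perror("install seccomp filter");
		return 1;
	}
	puts("filter installed; every syscall is still allowed");
	return 0;
}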
*/ +	total_insns = filter->prog->len; +	for (walker = current->seccomp.filter; walker; walker = walker->prev) +		total_insns += walker->prog->len + 4;  /* 4 instr penalty */ +	if (total_insns > MAX_INSNS_PER_PATH) +		return -ENOMEM; + +	/* If thread sync has been requested, check that it is possible. */ +	if (flags & SECCOMP_FILTER_FLAG_TSYNC) { +		int ret; + +		ret = seccomp_can_sync_threads(); +		if (ret) +			return ret; +	} + +	/* +	 * If there is an existing filter, make it the prev and don't drop its +	 * task reference. +	 */ +	filter->prev = current->seccomp.filter; +	current->seccomp.filter = filter; + +	/* Now that the new filter is in place, synchronize to all threads. */ +	if (flags & SECCOMP_FILTER_FLAG_TSYNC) +		seccomp_sync_threads(); + +	return 0;  }  /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -329,6 +506,14 @@ void get_seccomp_filter(struct task_struct *tsk)  	atomic_inc(&orig->usage);  } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ +	if (filter) { +		bpf_prog_free(filter->prog); +		kfree(filter); +	} +} +  /* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */  void put_seccomp_filter(struct task_struct *tsk)  { @@ -337,8 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk)  	while (orig && atomic_dec_and_test(&orig->usage)) {  		struct seccomp_filter *freeme = orig;  		orig = orig->prev; -		sk_filter_free(freeme->prog); -		kfree(freeme); +		seccomp_filter_free(freeme);  	}  } @@ -382,12 +566,17 @@ static int mode1_syscalls_32[] = {  int __secure_computing(int this_syscall)  { -	int mode = current->seccomp.mode;  	int exit_sig = 0;  	int *syscall;  	u32 ret; -	switch (mode) { +	/* +	 * Make sure that any changes to mode from another thread have +	 * been seen after TIF_SECCOMP was seen. +	 */ +	rmb(); + +	switch (current->seccomp.mode) {  	case SECCOMP_MODE_STRICT:  		syscall = mode1_syscalls;  #ifdef CONFIG_COMPAT @@ -473,47 +662,152 @@ long prctl_get_seccomp(void)  }  /** - * prctl_set_seccomp: configures current->seccomp.mode - * @seccomp_mode: requested mode to use - * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * seccomp_set_mode_strict: internal function for setting strict seccomp   * - * This function may be called repeatedly with a @seccomp_mode of - * SECCOMP_MODE_FILTER to install additional filters.  Every filter - * successfully installed will be evaluated (in reverse order) for each system - * call the task makes. + * Once current->seccomp.mode is non-zero, it may not be changed. + * + * Returns 0 on success or -EINVAL on failure. + */ +static long seccomp_set_mode_strict(void) +{ +	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT; +	long ret = -EINVAL; + +	spin_lock_irq(¤t->sighand->siglock); + +	if (!seccomp_may_assign_mode(seccomp_mode)) +		goto out; + +#ifdef TIF_NOTSC +	disable_TSC(); +#endif +	seccomp_assign_mode(current, seccomp_mode); +	ret = 0; + +out: +	spin_unlock_irq(¤t->sighand->siglock); + +	return ret; +} + +#ifdef CONFIG_SECCOMP_FILTER +/** + * seccomp_set_mode_filter: internal function for setting seccomp filter + * @flags:  flags to change filter behavior + * @filter: struct sock_fprog containing filter + * + * This function may be called repeatedly to install additional filters. + * Every filter successfully installed will be evaluated (in reverse order) + * for each system call the task makes.   *   * Once current->seccomp.mode is non-zero, it may not be changed.   *   * Returns 0 on success or -EINVAL on failure.  
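The TSYNC behaviour wired up above is only reachable through the new seccomp(2) syscall, since the prctl() entry point always passes flags of zero. A hedged sketch of the multi-threaded install follows; it assumes 3.17-era headers that provide __NR_seccomp, SECCOMP_SET_MODE_FILTER and SECCOMP_FILTER_FLAG_TSYNC, and it relies on the behaviour above where a failed sync returns the TID of the offending thread:

/* Sketch: install one filter across all threads with TSYNC. */
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter insns[] = {
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};
	long ret;

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
		perror("PR_SET_NO_NEW_PRIVS");
		return 1;
	}

	ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
		      SECCOMP_FILTER_FLAG_TSYNC, &prog);
	if (ret > 0)
		fprintf(stderr, "thread %ld cannot be synchronized\n", ret);
	else if (ret < 0)
		perror("seccomp");
	else
		puts("filter active on every thread");
	return ret ? 1 : 0;
}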
 */ -long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +static long seccomp_set_mode_filter(unsigned int flags, +				    const char __user *filter)  { +	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER; +	struct seccomp_filter *prepared = NULL;  	long ret = -EINVAL; -	if (current->seccomp.mode && -	    current->seccomp.mode != seccomp_mode) +	/* Validate flags. */ +	if (flags & ~SECCOMP_FILTER_FLAG_MASK) +		return -EINVAL; + +	/* Prepare the new filter before holding any locks. */ +	prepared = seccomp_prepare_user_filter(filter); +	if (IS_ERR(prepared)) +		return PTR_ERR(prepared); + +	/* +	 * Make sure we cannot change seccomp or nnp state via TSYNC +	 * while another thread is in the middle of calling exec. +	 */ +	if (flags & SECCOMP_FILTER_FLAG_TSYNC && +	    mutex_lock_killable(¤t->signal->cred_guard_mutex)) +		goto out_free; + +	spin_lock_irq(¤t->sighand->siglock); + +	if (!seccomp_may_assign_mode(seccomp_mode)) +		goto out; + +	ret = seccomp_attach_filter(flags, prepared); +	if (ret)  		goto out; +	/* Do not free the successfully attached filter. */ +	prepared = NULL; + +	seccomp_assign_mode(current, seccomp_mode); +out: +	spin_unlock_irq(¤t->sighand->siglock); +	if (flags & SECCOMP_FILTER_FLAG_TSYNC) +		mutex_unlock(¤t->signal->cred_guard_mutex); +out_free: +	seccomp_filter_free(prepared); +	return ret; +} +#else +static inline long seccomp_set_mode_filter(unsigned int flags, +					   const char __user *filter) +{ +	return -EINVAL; +} +#endif + +/* Common entry point for both prctl and syscall. */ +static long do_seccomp(unsigned int op, unsigned int flags, +		       const char __user *uargs) +{ +	switch (op) { +	case SECCOMP_SET_MODE_STRICT: +		if (flags != 0 || uargs != NULL) +			return -EINVAL; +		return seccomp_set_mode_strict(); +	case SECCOMP_SET_MODE_FILTER: +		return seccomp_set_mode_filter(flags, uargs); +	default: +		return -EINVAL; +	} +} + +SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags, +			 const char __user *, uargs) +{ +	return do_seccomp(op, flags, uargs); +} + +/** + * prctl_set_seccomp: configures current->seccomp.mode + * @seccomp_mode: requested mode to use + * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER + * + * Returns 0 on success or -EINVAL on failure. + */ +long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) +{ +	unsigned int op; +	char __user *uargs;  	switch (seccomp_mode) {  	case SECCOMP_MODE_STRICT: -		ret = 0; -#ifdef TIF_NOTSC -		disable_TSC(); -#endif +		op = SECCOMP_SET_MODE_STRICT; +		/* +		 * Setting strict mode through prctl always ignored filter, +		 * so make sure it is always NULL here to pass the internal +		 * check in do_seccomp(). +		 */ +		uargs = NULL;  		break; -#ifdef CONFIG_SECCOMP_FILTER  	case SECCOMP_MODE_FILTER: -		ret = seccomp_attach_user_filter(filter); -		if (ret) -			goto out; +		op = SECCOMP_SET_MODE_FILTER; +		uargs = filter;  		break; -#endif  	default: -		goto out; +		return -EINVAL;  	} -	current->seccomp.mode = seccomp_mode; -	set_thread_flag(TIF_SECCOMP); -out: -	return ret; +	/* prctl interface doesn't have flags, so they are always zero. */ +	return do_seccomp(op, 0, uargs);  } diff --git a/kernel/signal.c b/kernel/signal.c index a4077e90f19f..8f0876f9f6dd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,  	struct sighand_struct *sighand;  	for (;;) { +		/* +		 * Disable interrupts early to avoid deadlocks. 
+		 * See rcu_read_unlock() comment header for details. +		 */  		local_irq_save(*flags);  		rcu_read_lock();  		sighand = rcu_dereference(tsk->sighand); @@ -2166,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info)  	return signr;  } -int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, -			  struct pt_regs *regs, void *cookie) +int get_signal(struct ksignal *ksig)  {  	struct sighand_struct *sighand = current->sighand;  	struct signal_struct *signal = current->signal; @@ -2237,13 +2240,13 @@ relock:  			goto relock;  		} -		signr = dequeue_signal(current, ¤t->blocked, info); +		signr = dequeue_signal(current, ¤t->blocked, &ksig->info);  		if (!signr)  			break; /* will return 0 */  		if (unlikely(current->ptrace) && signr != SIGKILL) { -			signr = ptrace_signal(signr, info); +			signr = ptrace_signal(signr, &ksig->info);  			if (!signr)  				continue;  		} @@ -2251,13 +2254,13 @@ relock:  		ka = &sighand->action[signr-1];  		/* Trace actually delivered signals. */ -		trace_signal_deliver(signr, info, ka); +		trace_signal_deliver(signr, &ksig->info, ka);  		if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */  			continue;  		if (ka->sa.sa_handler != SIG_DFL) {  			/* Run the handler.  */ -			*return_ka = *ka; +			ksig->ka = *ka;  			if (ka->sa.sa_flags & SA_ONESHOT)  				ka->sa.sa_handler = SIG_DFL; @@ -2307,7 +2310,7 @@ relock:  				spin_lock_irq(&sighand->siglock);  			} -			if (likely(do_signal_stop(info->si_signo))) { +			if (likely(do_signal_stop(ksig->info.si_signo))) {  				/* It released the siglock.  */  				goto relock;  			} @@ -2328,7 +2331,7 @@ relock:  		if (sig_kernel_coredump(signr)) {  			if (print_fatal_signals) -				print_fatal_signal(info->si_signo); +				print_fatal_signal(ksig->info.si_signo);  			proc_coredump_connector(current);  			/*  			 * If it was able to dump core, this kills all @@ -2338,34 +2341,32 @@ relock:  			 * first and our do_group_exit call below will use  			 * that value and ignore the one we pass it.  			 */ -			do_coredump(info); +			do_coredump(&ksig->info);  		}  		/*  		 * Death signals, no core dump.  		 */ -		do_group_exit(info->si_signo); +		do_group_exit(ksig->info.si_signo);  		/* NOTREACHED */  	}  	spin_unlock_irq(&sighand->siglock); -	return signr; + +	ksig->sig = signr; +	return ksig->sig > 0;  }  /**   * signal_delivered -  - * @sig:		number of signal being delivered - * @info:		siginfo_t of signal being delivered - * @ka:			sigaction setting that chose the handler - * @regs:		user register state + * @ksig:		kernel signal struct   * @stepping:		nonzero if debugger single-step or block-step in use   *   * This function should be called when a signal has successfully been - * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask + * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask   * is always blocked, and the signal itself is blocked unless %SA_NODEFER - * is set in @ka->sa.sa_flags.  Tracing is notified. + * is set in @ksig->ka.sa.sa_flags.  Tracing is notified.   */ -void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, -			struct pt_regs *regs, int stepping) +static void signal_delivered(struct ksignal *ksig, int stepping)  {  	sigset_t blocked; @@ -2375,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,  	   simply clear the restore sigmask flag.  
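The signal_delivered() rewrite keeps the semantics its comment spells out: while a handler runs, sa_mask plus the delivered signal are blocked unless SA_NODEFER was requested. The userspace half of that contract, for reference only (plain sigaction API, independent of the ksignal plumbing):

/* Illustration: SA_NODEFER keeps the delivered signal unblocked while
 * its handler runs, so a new SIGUSR1 could nest inside the handler. */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void handler(int sig)
{
	/* Async-signal-safe: use write() only. */
	const char msg[] = "in handler\n";
	write(2, msg, sizeof(msg) - 1);
	(void)sig;
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = handler;
	sigemptyset(&sa.sa_mask);	/* extra signals to block while handling (none) */
	sa.sa_flags = SA_NODEFER;	/* do not auto-block SIGUSR1 itself */

	if (sigaction(SIGUSR1, &sa, NULL) != 0) {
		perror("sigaction");
		return 1;
	}
	raise(SIGUSR1);
	return 0;
}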
*/  	clear_restore_sigmask(); -	sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); -	if (!(ka->sa.sa_flags & SA_NODEFER)) -		sigaddset(&blocked, sig); +	sigorsets(&blocked, ¤t->blocked, &ksig->ka.sa.sa_mask); +	if (!(ksig->ka.sa.sa_flags & SA_NODEFER)) +		sigaddset(&blocked, ksig->sig);  	set_current_blocked(&blocked); -	tracehook_signal_handler(sig, info, ka, regs, stepping); +	tracehook_signal_handler(stepping);  }  void signal_setup_done(int failed, struct ksignal *ksig, int stepping) @@ -2387,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping)  	if (failed)  		force_sigsegv(ksig->sig, current);  	else -		signal_delivered(ksig->sig, &ksig->info, &ksig->ka, -			signal_pt_regs(), stepping); +		signal_delivered(ksig, stepping);  }  /* diff --git a/kernel/smp.c b/kernel/smp.c index 80c33f8de14f..aff8aa14f547 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -3,6 +3,7 @@   *   * (C) Jens Axboe <jens.axboe@oracle.com> 2008   */ +#include <linux/irq_work.h>  #include <linux/rcupdate.h>  #include <linux/rculist.h>  #include <linux/kernel.h> @@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)  		csd->func(csd->info);  		csd_unlock(csd);  	} + +	/* +	 * Handle irq works queued remotely by irq_work_queue_on(). +	 * Smp functions above are typically synchronous so they +	 * better run first since some other CPUs may be busy waiting +	 * for them. +	 */ +	irq_work_run();  }  /* @@ -661,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),  			if (cond_func(cpu, info)) {  				ret = smp_call_function_single(cpu, func,  								info, wait); -				WARN_ON_ONCE(!ret); +				WARN_ON_ONCE(ret);  			}  		preempt_enable();  	} diff --git a/kernel/sys.c b/kernel/sys.c index 66a751ebf9d9..ce8129192a26 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  		if (arg2 != 1 || arg3 || arg4 || arg5)  			return -EINVAL; -		current->no_new_privs = 1; +		task_set_no_new_privs(current);  		break;  	case PR_GET_NO_NEW_PRIVS:  		if (arg2 || arg3 || arg4 || arg5)  			return -EINVAL; -		return current->no_new_privs ? 1 : 0; +		return task_no_new_privs(current) ? 
1 : 0;  	case PR_GET_THP_DISABLE:  		if (arg2 || arg3 || arg4 || arg5)  			return -EINVAL; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 36441b51b5df..391d4ddb6f4b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon);  cond_syscall(sys_swapoff);  cond_syscall(sys_kexec_load);  cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_kexec_file_load);  cond_syscall(sys_init_module);  cond_syscall(sys_finit_module);  cond_syscall(sys_delete_module); @@ -197,6 +198,7 @@ cond_syscall(compat_sys_timerfd_settime);  cond_syscall(compat_sys_timerfd_gettime);  cond_syscall(sys_eventfd);  cond_syscall(sys_eventfd2); +cond_syscall(sys_memfd_create);  /* performance counters: */  cond_syscall(sys_perf_event_open); @@ -213,3 +215,6 @@ cond_syscall(compat_sys_open_by_handle_at);  /* compare kernel pointers */  cond_syscall(sys_kcmp); + +/* operate on Secure Computing state */ +cond_syscall(sys_seccomp); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e22a72c..75875a741b5e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = {  		.maxlen		= sizeof(unsigned long),  		.mode		= 0644,  		.proc_handler	= hugetlb_sysctl_handler, -		.extra1		= (void *)&hugetlb_zero, -		.extra2		= (void *)&hugetlb_infinity, +		.extra1		= &zero,  	},  #ifdef CONFIG_NUMA  	{ @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = {  		.maxlen         = sizeof(unsigned long),  		.mode           = 0644,  		.proc_handler   = &hugetlb_mempolicy_sysctl_handler, -		.extra1		= (void *)&hugetlb_zero, -		.extra2		= (void *)&hugetlb_infinity, +		.extra1		= &zero,  	},  #endif  	 { @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = {  		.maxlen		= sizeof(unsigned long),  		.mode		= 0644,  		.proc_handler	= hugetlb_overcommit_handler, -		.extra1		= (void *)&hugetlb_zero, -		.extra2		= (void *)&hugetlb_infinity, +		.extra1		= &zero,  	},  #endif  	{ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 653cbbd9e7ad..e4ba9a5a5ccb 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = {  	{ CTL_INT,	NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN,	"accept_ra_rt_info_max_plen" },  	{ CTL_INT,	NET_IPV6_PROXY_NDP,			"proxy_ndp" },  	{ CTL_INT,	NET_IPV6_ACCEPT_SOURCE_ROUTE,		"accept_source_route" }, +	{ CTL_INT,	NET_IPV6_ACCEPT_RA_FROM_LOCAL,		"accept_ra_from_local" },  	{}  }; diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c index 52ebc70263f4..875f64e8935b 100644 --- a/kernel/system_keyring.c +++ b/kernel/system_keyring.c @@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void)  			pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",  			       PTR_ERR(key));  		} else { +			set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);  			pr_notice("Loaded X.509 cert '%s'\n",  				  key_ref_to_ptr(key)->description);  			key_ref_put(key); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbfdd83..0dbab6d1acb4 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -14,6 +14,8 @@   * the GNU General Public License for more details.   
*/ +#define pr_fmt(fmt) "Kprobe smoke test: " fmt +  #include <linux/kernel.h>  #include <linux/kprobes.h>  #include <linux/random.h> @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,  {  	if (preh_val != (rand1 / div_factor)) {  		handler_errors++; -		printk(KERN_ERR "Kprobe smoke test failed: " -				"incorrect value in post_handler\n"); +		pr_err("incorrect value in post_handler\n");  	}  	posth_val = preh_val + div_factor;  } @@ -59,8 +60,7 @@ static int test_kprobe(void)  	ret = register_kprobe(&kp);  	if (ret < 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"register_kprobe returned %d\n", ret); +		pr_err("register_kprobe returned %d\n", ret);  		return ret;  	} @@ -68,14 +68,12 @@ static int test_kprobe(void)  	unregister_kprobe(&kp);  	if (preh_val == 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"kprobe pre_handler not called\n"); +		pr_err("kprobe pre_handler not called\n");  		handler_errors++;  	}  	if (posth_val == 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"kprobe post_handler not called\n"); +		pr_err("kprobe post_handler not called\n");  		handler_errors++;  	} @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,  {  	if (preh_val != (rand1 / div_factor) + 1) {  		handler_errors++; -		printk(KERN_ERR "Kprobe smoke test failed: " -				"incorrect value in post_handler2\n"); +		pr_err("incorrect value in post_handler2\n");  	}  	posth_val = preh_val + div_factor;  } @@ -120,8 +117,7 @@ static int test_kprobes(void)  	kp.flags = 0;  	ret = register_kprobes(kps, 2);  	if (ret < 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"register_kprobes returned %d\n", ret); +		pr_err("register_kprobes returned %d\n", ret);  		return ret;  	} @@ -130,14 +126,12 @@ static int test_kprobes(void)  	ret = target(rand1);  	if (preh_val == 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"kprobe pre_handler not called\n"); +		pr_err("kprobe pre_handler not called\n");  		handler_errors++;  	}  	if (posth_val == 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"kprobe post_handler not called\n"); +		pr_err("kprobe post_handler not called\n");  		handler_errors++;  	} @@ -146,14 +140,12 @@ static int test_kprobes(void)  	ret = target2(rand1);  	if (preh_val == 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"kprobe pre_handler2 not called\n"); +		pr_err("kprobe pre_handler2 not called\n");  		handler_errors++;  	}  	if (posth_val == 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"kprobe post_handler2 not called\n"); +		pr_err("kprobe post_handler2 not called\n");  		handler_errors++;  	} @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value)  {  	if (value != rand1) {  		handler_errors++; -		printk(KERN_ERR "Kprobe smoke test failed: " -				"incorrect value in jprobe handler\n"); +		pr_err("incorrect value in jprobe handler\n");  	}  	jph_val = rand1; @@ -186,16 +177,14 @@ static int test_jprobe(void)  	ret = register_jprobe(&jp);  	if (ret < 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"register_jprobe returned %d\n", ret); +		pr_err("register_jprobe returned %d\n", ret);  		return ret;  	}  	ret = target(rand1);  	unregister_jprobe(&jp);  	if (jph_val == 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"jprobe handler not called\n"); +		pr_err("jprobe handler not called\n");  		handler_errors++;  	} @@ -217,24 +206,21 @@ static int test_jprobes(void)  	jp.kp.flags = 0;  	ret = register_jprobes(jps, 2);  	if (ret < 0) 
{ -		printk(KERN_ERR "Kprobe smoke test failed: " -				"register_jprobes returned %d\n", ret); +		pr_err("register_jprobes returned %d\n", ret);  		return ret;  	}  	jph_val = 0;  	ret = target(rand1);  	if (jph_val == 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"jprobe handler not called\n"); +		pr_err("jprobe handler not called\n");  		handler_errors++;  	}  	jph_val = 0;  	ret = target2(rand1);  	if (jph_val == 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"jprobe handler2 not called\n"); +		pr_err("jprobe handler2 not called\n");  		handler_errors++;  	}  	unregister_jprobes(jps, 2); @@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)  	if (ret != (rand1 / div_factor)) {  		handler_errors++; -		printk(KERN_ERR "Kprobe smoke test failed: " -				"incorrect value in kretprobe handler\n"); +		pr_err("incorrect value in kretprobe handler\n");  	}  	if (krph_val == 0) {  		handler_errors++; -		printk(KERN_ERR "Kprobe smoke test failed: " -				"call to kretprobe entry handler failed\n"); +		pr_err("call to kretprobe entry handler failed\n");  	}  	krph_val = rand1; @@ -281,16 +265,14 @@ static int test_kretprobe(void)  	ret = register_kretprobe(&rp);  	if (ret < 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"register_kretprobe returned %d\n", ret); +		pr_err("register_kretprobe returned %d\n", ret);  		return ret;  	}  	ret = target(rand1);  	unregister_kretprobe(&rp);  	if (krph_val != rand1) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"kretprobe handler not called\n"); +		pr_err("kretprobe handler not called\n");  		handler_errors++;  	} @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)  	if (ret != (rand1 / div_factor) + 1) {  		handler_errors++; -		printk(KERN_ERR "Kprobe smoke test failed: " -				"incorrect value in kretprobe handler2\n"); +		pr_err("incorrect value in kretprobe handler2\n");  	}  	if (krph_val == 0) {  		handler_errors++; -		printk(KERN_ERR "Kprobe smoke test failed: " -				"call to kretprobe entry handler failed\n"); +		pr_err("call to kretprobe entry handler failed\n");  	}  	krph_val = rand1; @@ -332,24 +312,21 @@ static int test_kretprobes(void)  	rp.kp.flags = 0;  	ret = register_kretprobes(rps, 2);  	if (ret < 0) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"register_kretprobe returned %d\n", ret); +		pr_err("register_kretprobe returned %d\n", ret);  		return ret;  	}  	krph_val = 0;  	ret = target(rand1);  	if (krph_val != rand1) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"kretprobe handler not called\n"); +		pr_err("kretprobe handler not called\n");  		handler_errors++;  	}  	krph_val = 0;  	ret = target2(rand1);  	if (krph_val != rand1) { -		printk(KERN_ERR "Kprobe smoke test failed: " -				"kretprobe handler2 not called\n"); +		pr_err("kretprobe handler2 not called\n");  		handler_errors++;  	}  	unregister_kretprobes(rps, 2); @@ -368,7 +345,7 @@ int init_test_probes(void)  		rand1 = prandom_u32();  	} while (rand1 <= div_factor); -	printk(KERN_INFO "Kprobe smoke test started\n"); +	pr_info("started\n");  	num_tests++;  	ret = test_kprobe();  	if (ret < 0) @@ -402,13 +379,11 @@ int init_test_probes(void)  #endif /* CONFIG_KRETPROBES */  	if (errors) -		printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " -				"%d tests failed\n", errors, num_tests); +		pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);  	else if (handler_errors) -		printk(KERN_ERR "BUG: Kprobe smoke test: %d 
error(s) " -				"running handlers\n", handler_errors); +		pr_err("BUG: %d error(s) running handlers\n", handler_errors);  	else -		printk(KERN_INFO "Kprobe smoke test passed successfully\n"); +		pr_info("passed successfully\n");  	return 0;  } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index f448513a45ed..d626dc98e8df 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG  config ARCH_CLOCKSOURCE_DATA  	bool +# Clocksources require validation of the clocksource against the last +# cycle update - x86/TSC misfeature +config CLOCKSOURCE_VALIDATE_LAST_CYCLE +	bool +  # Timekeeping vsyscall support  config GENERIC_TIME_VSYSCALL  	bool @@ -20,10 +25,6 @@ config GENERIC_TIME_VSYSCALL  config GENERIC_TIME_VSYSCALL_OLD  	bool -# ktime_t scalar 64bit nsec representation -config KTIME_SCALAR -	bool -  # Old style timekeeping  config ARCH_USES_GETTIMEOFFSET  	bool diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 57a413fd0ebf..7347426fa68d 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,3 +1,4 @@ +obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o  obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o  obj-y += timeconv.o posix-clock.o alarmtimer.o @@ -12,3 +13,21 @@ obj-$(CONFIG_TICK_ONESHOT)			+= tick-oneshot.o  obj-$(CONFIG_TICK_ONESHOT)			+= tick-sched.o  obj-$(CONFIG_TIMER_STATS)			+= timer_stats.o  obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o +obj-$(CONFIG_TEST_UDELAY)			+= udelay_test.o + +$(obj)/time.o: $(obj)/timeconst.h + +quiet_cmd_hzfile = HZFILE  $@ +      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ + +targets += hz.bc +$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE +	$(call if_changed,hzfile) + +quiet_cmd_bc  = BC      $@ +      cmd_bc  = bc -q $(filter-out FORCE,$^) > $@ + +targets += timeconst.h +$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE +	$(call if_changed,bc) + diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index fe75444ae7ec..4aec4a457431 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)  	return ret;  } - +EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);  static int alarmtimer_rtc_add_device(struct device *dev,  				struct class_interface *class_intf) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ba3e502c955a..2e949cc9c9f1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -32,6 +32,7 @@  #include <linux/kthread.h>  #include "tick-internal.h" +#include "timekeeping_internal.h"  void timecounter_init(struct timecounter *tc,  		      const struct cyclecounter *cc, @@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs)  static void clocksource_watchdog(unsigned long data)  {  	struct clocksource *cs; -	cycle_t csnow, wdnow; +	cycle_t csnow, wdnow, delta;  	int64_t wd_nsec, cs_nsec;  	int next_cpu, reset_pending; @@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data)  			continue;  		} -		wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask, -					     watchdog->mult, watchdog->shift); +		delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask); +		wd_nsec = clocksource_cyc2ns(delta, watchdog->mult, +					     watchdog->shift); -		cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) & -					     cs->mask, cs->mult, cs->shift); +		delta = clocksource_delta(csnow, cs->cs_last, cs->mask); +		cs_nsec = clocksource_cyc2ns(delta, cs->mult, 
cs->shift);  		cs->cs_last = csnow;  		cs->wd_last = wdnow; diff --git a/kernel/hrtimer.c b/kernel/time/hrtimer.c index 3ab28993f6e0..1c2fe7de2842 100644 --- a/kernel/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -54,6 +54,8 @@  #include <trace/events/timer.h> +#include "timekeeping.h" +  /*   * The timer bases:   * @@ -114,21 +116,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)   */  static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)  { -	ktime_t xtim, mono, boot; -	struct timespec xts, tom, slp; -	s32 tai_offset; +	ktime_t xtim, mono, boot, tai; +	ktime_t off_real, off_boot, off_tai; -	get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); -	tai_offset = timekeeping_get_tai_offset(); +	mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); +	boot = ktime_add(mono, off_boot); +	xtim = ktime_add(mono, off_real); +	tai = ktime_add(xtim, off_tai); -	xtim = timespec_to_ktime(xts); -	mono = ktime_add(xtim, timespec_to_ktime(tom)); -	boot = ktime_add(mono, timespec_to_ktime(slp));  	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;  	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;  	base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; -	base->clock_base[HRTIMER_BASE_TAI].softirq_time = -				ktime_add(xtim,	ktime_set(tai_offset, 0)); +	base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;  }  /* @@ -264,60 +263,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)   * too large for inlining:   */  #if BITS_PER_LONG < 64 -# ifndef CONFIG_KTIME_SCALAR -/** - * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * @kt:		addend - * @nsec:	the scalar nsec value to add - * - * Returns the sum of kt and nsec in ktime_t format - */ -ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) -{ -	ktime_t tmp; - -	if (likely(nsec < NSEC_PER_SEC)) { -		tmp.tv64 = nsec; -	} else { -		unsigned long rem = do_div(nsec, NSEC_PER_SEC); - -		/* Make sure nsec fits into long */ -		if (unlikely(nsec > KTIME_SEC_MAX)) -			return (ktime_t){ .tv64 = KTIME_MAX }; - -		tmp = ktime_set((long)nsec, rem); -	} - -	return ktime_add(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_add_ns); - -/** - * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable - * @kt:		minuend - * @nsec:	the scalar nsec value to subtract - * - * Returns the subtraction of @nsec from @kt in ktime_t format - */ -ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) -{ -	ktime_t tmp; - -	if (likely(nsec < NSEC_PER_SEC)) { -		tmp.tv64 = nsec; -	} else { -		unsigned long rem = do_div(nsec, NSEC_PER_SEC); - -		tmp = ktime_set((long)nsec, rem); -	} - -	return ktime_sub(kt, tmp); -} - -EXPORT_SYMBOL_GPL(ktime_sub_ns); -# endif /* !CONFIG_KTIME_SCALAR */ -  /*   * Divide a ktime value by a nanosecond value   */ @@ -337,6 +282,7 @@ u64 ktime_divns(const ktime_t kt, s64 div)  	return dclc;  } +EXPORT_SYMBOL_GPL(ktime_divns);  #endif /* BITS_PER_LONG >= 64 */  /* @@ -602,6 +548,11 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)   * timers, we have to check, whether it expires earlier than the timer for   * which the clock event device was armed.   * + * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming + * and no expiry check happens. The timer gets enqueued into the rbtree. The + * reprogramming and expiry check is done in the hrtimer_interrupt or in the + * softirq. 
+ *   * Called with interrupts disabled and base->cpu_base.lock held   */  static int hrtimer_reprogram(struct hrtimer *timer, @@ -662,25 +613,13 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)  	base->hres_active = 0;  } -/* - * When High resolution timers are active, try to reprogram. Note, that in case - * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry - * check happens. The timer gets enqueued into the rbtree. The reprogramming - * and expiry check is done in the hrtimer_interrupt or in the softirq. - */ -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, -					    struct hrtimer_clock_base *base) -{ -	return base->cpu_base->hres_active && hrtimer_reprogram(timer, base); -} -  static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)  {  	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;  	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;  	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; -	return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); +	return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);  }  /* @@ -755,8 +694,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }  static inline int hrtimer_switch_to_hres(void) { return 0; }  static inline void  hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, -					    struct hrtimer_clock_base *base) +static inline int hrtimer_reprogram(struct hrtimer *timer, +				    struct hrtimer_clock_base *base)  {  	return 0;  } @@ -1013,14 +952,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,  	leftmost = enqueue_hrtimer(timer, new_base); -	/* -	 * Only allow reprogramming if the new base is on this CPU. -	 * (it might still be on another CPU if the timer was pending) -	 * -	 * XXX send_remote_softirq() ? -	 */ -	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases) -		&& hrtimer_enqueue_reprogram(timer, new_base)) { +	if (!leftmost) { +		unlock_hrtimer_base(timer, &flags); +		return ret; +	} + +	if (!hrtimer_is_hres_active(timer)) { +		/* +		 * Kick to reschedule the next tick to handle the new timer +		 * on dynticks target. +		 */ +		wake_up_nohz_cpu(new_base->cpu_base->cpu); +	} else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && +			hrtimer_reprogram(timer, new_base)) { +		/* +		 * Only allow reprogramming if the new base is on this CPU. +		 * (it might still be on another CPU if the timer was pending) +		 * +		 * XXX send_remote_softirq() ? 
+		 */  		if (wakeup) {  			/*  			 * We need to drop cpu_base->lock to avoid a @@ -1680,6 +1630,7 @@ static void init_hrtimers_cpu(int cpu)  		timerqueue_init_head(&cpu_base->clock_base[i].active);  	} +	cpu_base->cpu = cpu;  	hrtimer_init_hres(cpu_base);  } diff --git a/kernel/itimer.c b/kernel/time/itimer.c index 8d262b467573..8d262b467573 100644 --- a/kernel/itimer.c +++ b/kernel/time/itimer.c diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 33db43a39515..87a346fd6d61 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -466,7 +466,8 @@ static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);  static void sync_cmos_clock(struct work_struct *work)  { -	struct timespec now, next; +	struct timespec64 now; +	struct timespec next;  	int fail = 1;  	/* @@ -485,9 +486,9 @@ static void sync_cmos_clock(struct work_struct *work)  		return;  	} -	getnstimeofday(&now); +	getnstimeofday64(&now);  	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { -		struct timespec adjust = now; +		struct timespec adjust = timespec64_to_timespec(now);  		fail = -ENODEV;  		if (persistent_clock_is_local) @@ -531,7 +532,7 @@ void ntp_notify_cmos_timer(void) { }  /*   * Propagate a new txc->status value into the NTP state:   */ -static inline void process_adj_status(struct timex *txc, struct timespec *ts) +static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)  {  	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {  		time_state = TIME_OK; @@ -554,7 +555,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)  static inline void process_adjtimex_modes(struct timex *txc, -						struct timespec *ts, +						struct timespec64 *ts,  						s32 *time_tai)  {  	if (txc->modes & ADJ_STATUS) @@ -640,7 +641,7 @@ int ntp_validate_timex(struct timex *txc)   * adjtimex mainly allows reading (and writing, if superuser) of   * kernel time-keeping variables. used by xntpd.   */ -int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) +int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)  {  	int result; @@ -684,7 +685,7 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)  	/* fill PPS status fields */  	pps_fill_timex(txc); -	txc->time.tv_sec = ts->tv_sec; +	txc->time.tv_sec = (time_t)ts->tv_sec;  	txc->time.tv_usec = ts->tv_nsec;  	if (!(time_status & STA_NANO))  		txc->time.tv_usec /= NSEC_PER_USEC; diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 1950cb4ca2a4..bbd102ad9df7 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -7,6 +7,6 @@ extern void ntp_clear(void);  extern u64 ntp_tick_length(void);  extern int second_overflow(unsigned long secs);  extern int ntp_validate_timex(struct timex *); -extern int __do_adjtimex(struct timex *, struct timespec *, s32 *); +extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);  extern void __hardpps(const struct timespec *, const struct timespec *);  #endif /* _LINUX_NTP_INTERNAL_H */ diff --git a/kernel/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 3b8946416a5f..3b8946416a5f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c diff --git a/kernel/posix-timers.c b/kernel/time/posix-timers.c index 424c2d4265c9..42b463ad90f2 100644 --- a/kernel/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -49,6 +49,8 @@  #include <linux/export.h>  #include <linux/hashtable.h> +#include "timekeeping.h" +  /*   * Management arrays for POSIX timers. 
Timers are now kept in static hash table   * with 512 entries. diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7ab92b19965a..c19c1d84b6f3 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@  #include <linux/hrtimer.h>  #include <linux/tick.h> +#include "timekeeping.h" +  extern seqlock_t jiffies_lock;  #define CS_NAME_LEN	32 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6558b7ac112d..99aa6ee3908f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)  #ifdef CONFIG_NO_HZ_FULL  cpumask_var_t tick_nohz_full_mask; +cpumask_var_t housekeeping_mask;  bool tick_nohz_full_running;  static bool can_stop_full_tick(void) @@ -224,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {  };  /* - * Kick the current CPU if it's full dynticks in order to force it to + * Kick the CPU if it's full dynticks in order to force it to   * re-evaluate its dependency on the tick and restart it if necessary.   */ -void tick_nohz_full_kick(void) +void tick_nohz_full_kick_cpu(int cpu)  { -	if (tick_nohz_full_cpu(smp_processor_id())) -		irq_work_queue(&__get_cpu_var(nohz_full_kick_work)); +	if (!tick_nohz_full_cpu(cpu)) +		return; + +	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);  }  static void nohz_full_kick_ipi(void *info) @@ -281,6 +284,7 @@ static int __init tick_nohz_full_setup(char *str)  	int cpu;  	alloc_bootmem_cpumask_var(&tick_nohz_full_mask); +	alloc_bootmem_cpumask_var(&housekeeping_mask);  	if (cpulist_parse(str, tick_nohz_full_mask) < 0) {  		pr_warning("NOHZ: Incorrect nohz_full cpumask\n");  		return 1; @@ -291,6 +295,8 @@ static int __init tick_nohz_full_setup(char *str)  		pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);  		cpumask_clear_cpu(cpu, tick_nohz_full_mask);  	} +	cpumask_andnot(housekeeping_mask, +		       cpu_possible_mask, tick_nohz_full_mask);  	tick_nohz_full_running = true;  	return 1; @@ -332,9 +338,15 @@ static int tick_nohz_init_all(void)  		pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");  		return err;  	} +	if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) { +		pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n"); +		return err; +	}  	err = 0;  	cpumask_setall(tick_nohz_full_mask);  	cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); +	cpumask_clear(housekeeping_mask); +	cpumask_set_cpu(smp_processor_id(), housekeeping_mask);  	tick_nohz_full_running = true;  #endif  	return err; diff --git a/kernel/time.c b/kernel/time/time.c index 7c7964c33ae7..f0294ba14634 100644 --- a/kernel/time.c +++ b/kernel/time/time.c @@ -42,6 +42,7 @@  #include <asm/unistd.h>  #include "timeconst.h" +#include "timekeeping.h"  /*   * The timezone where the local system is located.  Used as a default by some @@ -420,6 +421,68 @@ struct timeval ns_to_timeval(const s64 nsec)  }  EXPORT_SYMBOL(ns_to_timeval); +#if BITS_PER_LONG == 32 +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts:		pointer to timespec variable to be set + * @sec:	seconds to set + * @nsec:	nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + *	0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! 
+ */ +void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec) +{ +	while (nsec >= NSEC_PER_SEC) { +		/* +		 * The following asm() prevents the compiler from +		 * optimising this loop into a modulo operation. See +		 * also __iter_div_u64_rem() in include/linux/time.h +		 */ +		asm("" : "+rm"(nsec)); +		nsec -= NSEC_PER_SEC; +		++sec; +	} +	while (nsec < 0) { +		asm("" : "+rm"(nsec)); +		nsec += NSEC_PER_SEC; +		--sec; +	} +	ts->tv_sec = sec; +	ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec64); + +/** + * ns_to_timespec64 - Convert nanoseconds to timespec64 + * @nsec:       the nanoseconds value to be converted + * + * Returns the timespec64 representation of the nsec parameter. + */ +struct timespec64 ns_to_timespec64(const s64 nsec) +{ +	struct timespec64 ts; +	s32 rem; + +	if (!nsec) +		return (struct timespec64) {0, 0}; + +	ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); +	if (unlikely(rem < 0)) { +		ts.tv_sec--; +		rem += NSEC_PER_SEC; +	} +	ts.tv_nsec = rem; + +	return ts; +} +EXPORT_SYMBOL(ns_to_timespec64); +#endif  /*   * When we convert to jiffies then we interpret incoming values   * the following way: @@ -694,6 +757,7 @@ unsigned long nsecs_to_jiffies(u64 n)  {  	return (unsigned long)nsecs_to_jiffies64(n);  } +EXPORT_SYMBOL_GPL(nsecs_to_jiffies);  /*   * Add two timespec values and do a safety check for overflow. diff --git a/kernel/timeconst.bc b/kernel/time/timeconst.bc index 511bdf2cafda..511bdf2cafda 100644 --- a/kernel/timeconst.bc +++ b/kernel/time/timeconst.bc diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 32d8d6aaedb8..f36b02838a47 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -32,11 +32,34 @@  #define TK_MIRROR		(1 << 1)  #define TK_CLOCK_WAS_SET	(1 << 2) -static struct timekeeper timekeeper; +/* + * The most important data for readout fits into a single 64 byte + * cache line. + */ +static struct { +	seqcount_t		seq; +	struct timekeeper	timekeeper; +} tk_core ____cacheline_aligned; +  static DEFINE_RAW_SPINLOCK(timekeeper_lock); -static seqcount_t timekeeper_seq;  static struct timekeeper shadow_timekeeper; +/** + * struct tk_fast - NMI safe timekeeper + * @seq:	Sequence counter for protecting updates. The lowest bit + *		is the index for the tk_read_base array + * @base:	tk_read_base array. Access is indexed by the lowest bit of + *		@seq. + * + * See @update_fast_timekeeper() below. 
+ */ +struct tk_fast { +	seqcount_t		seq; +	struct tk_read_base	base[2]; +}; + +static struct tk_fast tk_fast_mono ____cacheline_aligned; +  /* flag for if timekeeping is suspended */  int __read_mostly timekeeping_suspended; @@ -45,49 +68,54 @@ bool __read_mostly persistent_clock_exist = false;  static inline void tk_normalize_xtime(struct timekeeper *tk)  { -	while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) { -		tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift; +	while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { +		tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;  		tk->xtime_sec++;  	}  } -static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts) +static inline struct timespec64 tk_xtime(struct timekeeper *tk) +{ +	struct timespec64 ts; + +	ts.tv_sec = tk->xtime_sec; +	ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); +	return ts; +} + +static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)  {  	tk->xtime_sec = ts->tv_sec; -	tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift; +	tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;  } -static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts) +static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)  {  	tk->xtime_sec += ts->tv_sec; -	tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift; +	tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;  	tk_normalize_xtime(tk);  } -static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) +static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)  { -	struct timespec tmp; +	struct timespec64 tmp;  	/*  	 * Verify consistency of: offset_real = -wall_to_monotonic  	 * before modifying anything  	 */ -	set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec, +	set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,  					-tk->wall_to_monotonic.tv_nsec); -	WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64); +	WARN_ON_ONCE(tk->offs_real.tv64 != timespec64_to_ktime(tmp).tv64);  	tk->wall_to_monotonic = wtm; -	set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); -	tk->offs_real = timespec_to_ktime(tmp); +	set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec); +	tk->offs_real = timespec64_to_ktime(tmp);  	tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));  } -static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) +static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)  { -	/* Verify consistency before modifying */ -	WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64); - -	tk->total_sleep_time	= t; -	tk->offs_boot		= timespec_to_ktime(t); +	tk->offs_boot = ktime_add(tk->offs_boot, delta);  }  /** @@ -107,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)  	u64 tmp, ntpinterval;  	struct clocksource *old_clock; -	old_clock = tk->clock; -	tk->clock = clock; -	tk->cycle_last = clock->cycle_last = clock->read(clock); +	old_clock = tk->tkr.clock; +	tk->tkr.clock = clock; +	tk->tkr.read = clock->read; +	tk->tkr.mask = clock->mask; +	tk->tkr.cycle_last = tk->tkr.read(clock);  	/* Do the ns -> cycle conversion first, using original mult */  	tmp = NTP_INTERVAL_LENGTH; @@ -133,78 +163,212 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)  	if (old_clock) {  		int shift_change = clock->shift - old_clock->shift;  		if (shift_change < 0) -			tk->xtime_nsec >>= 
-shift_change; +			tk->tkr.xtime_nsec >>= -shift_change;  		else -			tk->xtime_nsec <<= shift_change; +			tk->tkr.xtime_nsec <<= shift_change;  	} -	tk->shift = clock->shift; +	tk->tkr.shift = clock->shift;  	tk->ntp_error = 0;  	tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; +	tk->ntp_tick = ntpinterval << tk->ntp_error_shift;  	/*  	 * The timekeeper keeps its own mult values for the currently  	 * active clocksource. These value will be adjusted via NTP  	 * to counteract clock drifting.  	 */ -	tk->mult = clock->mult; +	tk->tkr.mult = clock->mult; +	tk->ntp_err_mult = 0;  }  /* Timekeeper helper functions. */  #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -u32 (*arch_gettimeoffset)(void); - -u32 get_arch_timeoffset(void) -{ -	if (likely(arch_gettimeoffset)) -		return arch_gettimeoffset(); -	return 0; -} +static u32 default_arch_gettimeoffset(void) { return 0; } +u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;  #else -static inline u32 get_arch_timeoffset(void) { return 0; } +static inline u32 arch_gettimeoffset(void) { return 0; }  #endif -static inline s64 timekeeping_get_ns(struct timekeeper *tk) +static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)  { -	cycle_t cycle_now, cycle_delta; -	struct clocksource *clock; +	cycle_t cycle_now, delta;  	s64 nsec;  	/* read clocksource: */ -	clock = tk->clock; -	cycle_now = clock->read(clock); +	cycle_now = tkr->read(tkr->clock);  	/* calculate the delta since the last update_wall_time: */ -	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +	delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); -	nsec = cycle_delta * tk->mult + tk->xtime_nsec; -	nsec >>= tk->shift; +	nsec = delta * tkr->mult + tkr->xtime_nsec; +	nsec >>= tkr->shift;  	/* If arch requires, add in get_arch_timeoffset() */ -	return nsec + get_arch_timeoffset(); +	return nsec + arch_gettimeoffset();  }  static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)  { -	cycle_t cycle_now, cycle_delta; -	struct clocksource *clock; +	struct clocksource *clock = tk->tkr.clock; +	cycle_t cycle_now, delta;  	s64 nsec;  	/* read clocksource: */ -	clock = tk->clock; -	cycle_now = clock->read(clock); +	cycle_now = tk->tkr.read(clock);  	/* calculate the delta since the last update_wall_time: */ -	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +	delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);  	/* convert delta to nanoseconds. */ -	nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); +	nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);  	/* If arch requires, add in get_arch_timeoffset() */ -	return nsec + get_arch_timeoffset(); +	return nsec + arch_gettimeoffset(); +} + +/** + * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. + * @tk:		The timekeeper from which we take the update + * @tkf:	The fast timekeeper to update + * @tbase:	The time base for the fast timekeeper (mono/raw) + * + * We want to use this from any context including NMI and tracing / + * instrumenting the timekeeping code itself. + * + * So we handle this differently than the other timekeeping accessor + * functions which retry when the sequence count has changed. 
The + * update side does: + * + * smp_wmb();	<- Ensure that the last base[1] update is visible + * tkf->seq++; + * smp_wmb();	<- Ensure that the seqcount update is visible + * update(tkf->base[0], tk); + * smp_wmb();	<- Ensure that the base[0] update is visible + * tkf->seq++; + * smp_wmb();	<- Ensure that the seqcount update is visible + * update(tkf->base[1], tk); + * + * The reader side does: + * + * do { + *	seq = tkf->seq; + *	smp_rmb(); + *	idx = seq & 0x01; + *	now = now(tkf->base[idx]); + *	smp_rmb(); + * } while (seq != tkf->seq) + * + * As long as we update base[0] readers are forced off to + * base[1]. Once base[0] is updated readers are redirected to base[0] + * and the base[1] update takes place. + * + * So if a NMI hits the update of base[0] then it will use base[1] + * which is still consistent. In the worst case this can result is a + * slightly wrong timestamp (a few nanoseconds). See + * @ktime_get_mono_fast_ns. + */ +static void update_fast_timekeeper(struct timekeeper *tk) +{ +	struct tk_read_base *base = tk_fast_mono.base; + +	/* Force readers off to base[1] */ +	raw_write_seqcount_latch(&tk_fast_mono.seq); + +	/* Update base[0] */ +	memcpy(base, &tk->tkr, sizeof(*base)); + +	/* Force readers back to base[0] */ +	raw_write_seqcount_latch(&tk_fast_mono.seq); + +	/* Update base[1] */ +	memcpy(base + 1, base, sizeof(*base));  } +/** + * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic + * + * This timestamp is not guaranteed to be monotonic across an update. + * The timestamp is calculated by: + * + *	now = base_mono + clock_delta * slope + * + * So if the update lowers the slope, readers who are forced to the + * not yet updated second array are still using the old steeper slope. + * + * tmono + * ^ + * |    o  n + * |   o n + * |  u + * | o + * |o + * |12345678---> reader order + * + * o = old slope + * u = update + * n = new slope + * + * So reader 6 will observe time going backwards versus reader 5. + * + * While other CPUs are likely to be able observe that, the only way + * for a CPU local observation is when an NMI hits in the middle of + * the update. Timestamps taken from that NMI context might be ahead + * of the following timestamps. Callers need to be aware of that and + * deal with it. + */ +u64 notrace ktime_get_mono_fast_ns(void) +{ +	struct tk_read_base *tkr; +	unsigned int seq; +	u64 now; + +	do { +		seq = raw_read_seqcount(&tk_fast_mono.seq); +		tkr = tk_fast_mono.base + (seq & 0x01); +		now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); + +	} while (read_seqcount_retry(&tk_fast_mono.seq, seq)); +	return now; +} +EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); + +#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD + +static inline void update_vsyscall(struct timekeeper *tk) +{ +	struct timespec xt; + +	xt = timespec64_to_timespec(tk_xtime(tk)); +	update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, +			    tk->tkr.cycle_last); +} + +static inline void old_vsyscall_fixup(struct timekeeper *tk) +{ +	s64 remainder; + +	/* +	* Store only full nanoseconds into xtime_nsec after rounding +	* it up and add the remainder to the error difference. +	* XXX - This is necessary to avoid small 1ns inconsistnecies caused +	* by truncating the remainder in vsyscalls. However, it causes +	* additional work to be done in timekeeping_adjust(). Once +	* the vsyscall implementations are converted to use xtime_nsec +	* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD +	* users are removed, this can be killed. 
+	*/ +	remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); +	tk->tkr.xtime_nsec -= remainder; +	tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; +	tk->ntp_error += remainder << tk->ntp_error_shift; +	tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; +} +#else +#define old_vsyscall_fixup(tk) +#endif +  static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);  static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) @@ -217,7 +381,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)   */  int pvclock_gtod_register_notifier(struct notifier_block *nb)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long flags;  	int ret; @@ -247,6 +411,29 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)  }  EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); +/* + * Update the ktime_t based scalar nsec members of the timekeeper + */ +static inline void tk_update_ktime_data(struct timekeeper *tk) +{ +	s64 nsec; + +	/* +	 * The xtime based monotonic readout is: +	 *	nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now(); +	 * The ktime based monotonic readout is: +	 *	nsec = base_mono + now(); +	 * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec +	 */ +	nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); +	nsec *= NSEC_PER_SEC; +	nsec += tk->wall_to_monotonic.tv_nsec; +	tk->tkr.base_mono = ns_to_ktime(nsec); + +	/* Update the monotonic raw base */ +	tk->base_raw = timespec64_to_ktime(tk->raw_time); +} +  /* must hold timekeeper_lock */  static void timekeeping_update(struct timekeeper *tk, unsigned int action)  { @@ -257,8 +444,13 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)  	update_vsyscall(tk);  	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); +	tk_update_ktime_data(tk); +  	if (action & TK_MIRROR) -		memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); +		memcpy(&shadow_timekeeper, &tk_core.timekeeper, +		       sizeof(tk_core.timekeeper)); + +	update_fast_timekeeper(tk);  }  /** @@ -270,49 +462,48 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)   */  static void timekeeping_forward_now(struct timekeeper *tk)  { -	cycle_t cycle_now, cycle_delta; -	struct clocksource *clock; +	struct clocksource *clock = tk->tkr.clock; +	cycle_t cycle_now, delta;  	s64 nsec; -	clock = tk->clock; -	cycle_now = clock->read(clock); -	cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; -	tk->cycle_last = clock->cycle_last = cycle_now; +	cycle_now = tk->tkr.read(clock); +	delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); +	tk->tkr.cycle_last = cycle_now; -	tk->xtime_nsec += cycle_delta * tk->mult; +	tk->tkr.xtime_nsec += delta * tk->tkr.mult;  	/* If arch requires, add in get_arch_timeoffset() */ -	tk->xtime_nsec += (u64)get_arch_timeoffset() << tk->shift; +	tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;  	tk_normalize_xtime(tk); -	nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); -	timespec_add_ns(&tk->raw_time, nsec); +	nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); +	timespec64_add_ns(&tk->raw_time, nsec);  }  /** - * __getnstimeofday - Returns the time of day in a timespec. + * __getnstimeofday64 - Returns the time of day in a timespec64.   * @ts:		pointer to the timespec to be set   *   * Updates the time of day in the timespec.   * Returns 0 on success, or -ve when suspended (timespec will be undefined).   
*/ -int __getnstimeofday(struct timespec *ts) +int __getnstimeofday64(struct timespec64 *ts)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long seq;  	s64 nsecs = 0;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq);  		ts->tv_sec = tk->xtime_sec; -		nsecs = timekeeping_get_ns(tk); +		nsecs = timekeeping_get_ns(&tk->tkr); -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq));  	ts->tv_nsec = 0; -	timespec_add_ns(ts, nsecs); +	timespec64_add_ns(ts, nsecs);  	/*  	 * Do not bail out early, in case there were callers still using @@ -322,116 +513,138 @@ int __getnstimeofday(struct timespec *ts)  		return -EAGAIN;  	return 0;  } -EXPORT_SYMBOL(__getnstimeofday); +EXPORT_SYMBOL(__getnstimeofday64);  /** - * getnstimeofday - Returns the time of day in a timespec. + * getnstimeofday64 - Returns the time of day in a timespec64.   * @ts:		pointer to the timespec to be set   *   * Returns the time of day in a timespec (WARN if suspended).   */ -void getnstimeofday(struct timespec *ts) +void getnstimeofday64(struct timespec64 *ts)  { -	WARN_ON(__getnstimeofday(ts)); +	WARN_ON(__getnstimeofday64(ts));  } -EXPORT_SYMBOL(getnstimeofday); +EXPORT_SYMBOL(getnstimeofday64);  ktime_t ktime_get(void)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned int seq; -	s64 secs, nsecs; +	ktime_t base; +	s64 nsecs;  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqcount_begin(&timekeeper_seq); -		secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; -		nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; +		seq = read_seqcount_begin(&tk_core.seq); +		base = tk->tkr.base_mono; +		nsecs = timekeeping_get_ns(&tk->tkr); -	} while (read_seqcount_retry(&timekeeper_seq, seq)); -	/* -	 * Use ktime_set/ktime_add_ns to create a proper ktime on -	 * 32-bit architectures without CONFIG_KTIME_SCALAR. -	 */ -	return ktime_add_ns(ktime_set(secs, 0), nsecs); +	} while (read_seqcount_retry(&tk_core.seq, seq)); + +	return ktime_add_ns(base, nsecs);  }  EXPORT_SYMBOL_GPL(ktime_get); -/** - * ktime_get_ts - get the monotonic clock in timespec format - * @ts:		pointer to timespec variable - * - * The function calculates the monotonic clock from the realtime - * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by @ts. 
- */ -void ktime_get_ts(struct timespec *ts) +static ktime_t *offsets[TK_OFFS_MAX] = { +	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real, +	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot, +	[TK_OFFS_TAI]	= &tk_core.timekeeper.offs_tai, +}; + +ktime_t ktime_get_with_offset(enum tk_offsets offs)  { -	struct timekeeper *tk = &timekeeper; -	struct timespec tomono; -	s64 nsec; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned int seq; +	ktime_t base, *offset = offsets[offs]; +	s64 nsecs;  	WARN_ON(timekeeping_suspended);  	do { -		seq = read_seqcount_begin(&timekeeper_seq); -		ts->tv_sec = tk->xtime_sec; -		nsec = timekeeping_get_ns(tk); -		tomono = tk->wall_to_monotonic; +		seq = read_seqcount_begin(&tk_core.seq); +		base = ktime_add(tk->tkr.base_mono, *offset); +		nsecs = timekeeping_get_ns(&tk->tkr); -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq)); -	ts->tv_sec += tomono.tv_sec; -	ts->tv_nsec = 0; -	timespec_add_ns(ts, nsec + tomono.tv_nsec); -} -EXPORT_SYMBOL_GPL(ktime_get_ts); +	return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_with_offset);  /** - * timekeeping_clocktai - Returns the TAI time of day in a timespec - * @ts:		pointer to the timespec to be set - * - * Returns the time of day in a timespec. + * ktime_mono_to_any() - convert mononotic time to any other time + * @tmono:	time to convert. + * @offs:	which offset to use   */ -void timekeeping_clocktai(struct timespec *ts) +ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)  { -	struct timekeeper *tk = &timekeeper; +	ktime_t *offset = offsets[offs];  	unsigned long seq; -	u64 nsecs; - -	WARN_ON(timekeeping_suspended); +	ktime_t tconv;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq); +		tconv = ktime_add(tmono, *offset); +	} while (read_seqcount_retry(&tk_core.seq, seq)); -		ts->tv_sec = tk->xtime_sec + tk->tai_offset; -		nsecs = timekeeping_get_ns(tk); +	return tconv; +} +EXPORT_SYMBOL_GPL(ktime_mono_to_any); -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +/** + * ktime_get_raw - Returns the raw monotonic time in ktime_t format + */ +ktime_t ktime_get_raw(void) +{ +	struct timekeeper *tk = &tk_core.timekeeper; +	unsigned int seq; +	ktime_t base; +	s64 nsecs; -	ts->tv_nsec = 0; -	timespec_add_ns(ts, nsecs); +	do { +		seq = read_seqcount_begin(&tk_core.seq); +		base = tk->base_raw; +		nsecs = timekeeping_get_ns_raw(tk); -} -EXPORT_SYMBOL(timekeeping_clocktai); +	} while (read_seqcount_retry(&tk_core.seq, seq)); +	return ktime_add_ns(base, nsecs); +} +EXPORT_SYMBOL_GPL(ktime_get_raw);  /** - * ktime_get_clocktai - Returns the TAI time of day in a ktime + * ktime_get_ts64 - get the monotonic clock in timespec64 format + * @ts:		pointer to timespec variable   * - * Returns the time of day in a ktime. + * The function calculates the monotonic clock from the realtime + * clock and the wall_to_monotonic offset and stores the result + * in normalized timespec format in the variable pointed to by @ts.   
*/ -ktime_t ktime_get_clocktai(void) +void ktime_get_ts64(struct timespec64 *ts)  { -	struct timespec ts; +	struct timekeeper *tk = &tk_core.timekeeper; +	struct timespec64 tomono; +	s64 nsec; +	unsigned int seq; + +	WARN_ON(timekeeping_suspended); -	timekeeping_clocktai(&ts); -	return timespec_to_ktime(ts); +	do { +		seq = read_seqcount_begin(&tk_core.seq); +		ts->tv_sec = tk->xtime_sec; +		nsec = timekeeping_get_ns(&tk->tkr); +		tomono = tk->wall_to_monotonic; + +	} while (read_seqcount_retry(&tk_core.seq, seq)); + +	ts->tv_sec += tomono.tv_sec; +	ts->tv_nsec = 0; +	timespec64_add_ns(ts, nsec + tomono.tv_nsec);  } -EXPORT_SYMBOL(ktime_get_clocktai); +EXPORT_SYMBOL_GPL(ktime_get_ts64);  #ifdef CONFIG_NTP_PPS @@ -446,23 +659,23 @@ EXPORT_SYMBOL(ktime_get_clocktai);   */  void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long seq;  	s64 nsecs_raw, nsecs_real;  	WARN_ON_ONCE(timekeeping_suspended);  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq); -		*ts_raw = tk->raw_time; +		*ts_raw = timespec64_to_timespec(tk->raw_time);  		ts_real->tv_sec = tk->xtime_sec;  		ts_real->tv_nsec = 0;  		nsecs_raw = timekeeping_get_ns_raw(tk); -		nsecs_real = timekeeping_get_ns(tk); +		nsecs_real = timekeeping_get_ns(&tk->tkr); -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq));  	timespec_add_ns(ts_raw, nsecs_raw);  	timespec_add_ns(ts_real, nsecs_real); @@ -479,9 +692,9 @@ EXPORT_SYMBOL(getnstime_raw_and_real);   */  void do_gettimeofday(struct timeval *tv)  { -	struct timespec now; +	struct timespec64 now; -	getnstimeofday(&now); +	getnstimeofday64(&now);  	tv->tv_sec = now.tv_sec;  	tv->tv_usec = now.tv_nsec/1000;  } @@ -495,15 +708,15 @@ EXPORT_SYMBOL(do_gettimeofday);   */  int do_settimeofday(const struct timespec *tv)  { -	struct timekeeper *tk = &timekeeper; -	struct timespec ts_delta, xt; +	struct timekeeper *tk = &tk_core.timekeeper; +	struct timespec64 ts_delta, xt, tmp;  	unsigned long flags;  	if (!timespec_valid_strict(tv))  		return -EINVAL;  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	timekeeping_forward_now(tk); @@ -511,13 +724,14 @@ int do_settimeofday(const struct timespec *tv)  	ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;  	ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec; -	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta)); +	tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); -	tk_set_xtime(tk, tv); +	tmp = timespec_to_timespec64(*tv); +	tk_set_xtime(tk, &tmp);  	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	/* signal hrtimers about time change */ @@ -535,33 +749,35 @@ EXPORT_SYMBOL(do_settimeofday);   */  int timekeeping_inject_offset(struct timespec *ts)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long flags; -	struct timespec tmp; +	struct timespec64 ts64, tmp;  	int ret = 0;  	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)  		return -EINVAL; +	ts64 = timespec_to_timespec64(*ts); +  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	
timekeeping_forward_now(tk);  	/* Make sure the proposed value is valid */ -	tmp = timespec_add(tk_xtime(tk),  *ts); -	if (!timespec_valid_strict(&tmp)) { +	tmp = timespec64_add(tk_xtime(tk),  ts64); +	if (!timespec64_valid_strict(&tmp)) {  		ret = -EINVAL;  		goto error;  	} -	tk_xtime_add(tk, ts); -	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts)); +	tk_xtime_add(tk, &ts64); +	tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));  error: /* even if we error out, we forwarded the time, so call update */  	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	/* signal hrtimers about time change */ @@ -578,14 +794,14 @@ EXPORT_SYMBOL(timekeeping_inject_offset);   */  s32 timekeeping_get_tai_offset(void)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned int seq;  	s32 ret;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq);  		ret = tk->tai_offset; -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq));  	return ret;  } @@ -606,14 +822,14 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)   */  void timekeeping_set_tai_offset(s32 tai_offset)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long flags;  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	__timekeeping_set_tai_offset(tk, tai_offset);  	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	clock_was_set();  } @@ -625,14 +841,14 @@ void timekeeping_set_tai_offset(s32 tai_offset)   */  static int change_clocksource(void *data)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	struct clocksource *new, *old;  	unsigned long flags;  	new = (struct clocksource *) data;  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	timekeeping_forward_now(tk);  	/* @@ -641,7 +857,7 @@ static int change_clocksource(void *data)  	 */  	if (try_module_get(new->owner)) {  		if (!new->enable || new->enable(new) == 0) { -			old = tk->clock; +			old = tk->tkr.clock;  			tk_setup_internals(tk, new);  			if (old->disable)  				old->disable(old); @@ -652,7 +868,7 @@ static int change_clocksource(void *data)  	}  	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	return 0; @@ -667,29 +883,14 @@ static int change_clocksource(void *data)   */  int timekeeping_notify(struct clocksource *clock)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper; -	if (tk->clock == clock) +	if (tk->tkr.clock == clock)  		return 0;  	stop_machine(change_clocksource, clock, NULL);  	tick_clock_notify(); -	return tk->clock == clock ? 
0 : -1; -} - -/** - * ktime_get_real - get the real (wall-) time in ktime_t format - * - * returns the time in ktime_t format - */ -ktime_t ktime_get_real(void) -{ -	struct timespec now; - -	getnstimeofday(&now); - -	return timespec_to_ktime(now); +	return tk->tkr.clock == clock ? 0 : -1;  } -EXPORT_SYMBOL_GPL(ktime_get_real);  /**   * getrawmonotonic - Returns the raw monotonic time in a timespec @@ -699,18 +900,20 @@ EXPORT_SYMBOL_GPL(ktime_get_real);   */  void getrawmonotonic(struct timespec *ts)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper; +	struct timespec64 ts64;  	unsigned long seq;  	s64 nsecs;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq);  		nsecs = timekeeping_get_ns_raw(tk); -		*ts = tk->raw_time; +		ts64 = tk->raw_time; -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq)); -	timespec_add_ns(ts, nsecs); +	timespec64_add_ns(&ts64, nsecs); +	*ts = timespec64_to_timespec(ts64);  }  EXPORT_SYMBOL(getrawmonotonic); @@ -719,16 +922,16 @@ EXPORT_SYMBOL(getrawmonotonic);   */  int timekeeping_valid_for_hres(void)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long seq;  	int ret;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq); -		ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; +		ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq));  	return ret;  } @@ -738,16 +941,16 @@ int timekeeping_valid_for_hres(void)   */  u64 timekeeping_max_deferment(void)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long seq;  	u64 ret;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq); -		ret = tk->clock->max_idle_ns; +		ret = tk->tkr.clock->max_idle_ns; -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq));  	return ret;  } @@ -787,14 +990,15 @@ void __weak read_boot_clock(struct timespec *ts)   */  void __init timekeeping_init(void)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	struct clocksource *clock;  	unsigned long flags; -	struct timespec now, boot, tmp; - -	read_persistent_clock(&now); +	struct timespec64 now, boot, tmp; +	struct timespec ts; -	if (!timespec_valid_strict(&now)) { +	read_persistent_clock(&ts); +	now = timespec_to_timespec64(ts); +	if (!timespec64_valid_strict(&now)) {  		pr_warn("WARNING: Persistent clock returned invalid value!\n"  			"         Check your CMOS/BIOS settings.\n");  		now.tv_sec = 0; @@ -802,8 +1006,9 @@ void __init timekeeping_init(void)  	} else if (now.tv_sec || now.tv_nsec)  		persistent_clock_exist = true; -	read_boot_clock(&boot); -	if (!timespec_valid_strict(&boot)) { +	read_boot_clock(&ts); +	boot = timespec_to_timespec64(ts); +	if (!timespec64_valid_strict(&boot)) {  		pr_warn("WARNING: Boot clock returned invalid value!\n"  			"         Check your CMOS/BIOS settings.\n");  		boot.tv_sec = 0; @@ -811,7 +1016,7 @@ void __init timekeeping_init(void)  	}  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	ntp_init();  	clock = clocksource_default_clock(); @@ -822,24 +1027,21 @@ void __init timekeeping_init(void)  	
tk_set_xtime(tk, &now);  	tk->raw_time.tv_sec = 0;  	tk->raw_time.tv_nsec = 0; +	tk->base_raw.tv64 = 0;  	if (boot.tv_sec == 0 && boot.tv_nsec == 0)  		boot = tk_xtime(tk); -	set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec); +	set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);  	tk_set_wall_to_mono(tk, tmp); -	tmp.tv_sec = 0; -	tmp.tv_nsec = 0; -	tk_set_sleep_time(tk, tmp); - -	memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper)); +	timekeeping_update(tk, TK_MIRROR); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  }  /* time in seconds when suspend began */ -static struct timespec timekeeping_suspend_time; +static struct timespec64 timekeeping_suspend_time;  /**   * __timekeeping_inject_sleeptime - Internal function to add sleep interval @@ -849,17 +1051,17 @@ static struct timespec timekeeping_suspend_time;   * adds the sleep offset to the timekeeping variables.   */  static void __timekeeping_inject_sleeptime(struct timekeeper *tk, -							struct timespec *delta) +					   struct timespec64 *delta)  { -	if (!timespec_valid_strict(delta)) { +	if (!timespec64_valid_strict(delta)) {  		printk_deferred(KERN_WARNING  				"__timekeeping_inject_sleeptime: Invalid "  				"sleep delta value!\n");  		return;  	}  	tk_xtime_add(tk, delta); -	tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta)); -	tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta)); +	tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta)); +	tk_update_sleep_time(tk, timespec64_to_ktime(*delta));  	tk_debug_account_sleep_time(delta);  } @@ -875,7 +1077,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,   */  void timekeeping_inject_sleeptime(struct timespec *delta)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper; +	struct timespec64 tmp;  	unsigned long flags;  	/* @@ -886,15 +1089,16 @@ void timekeeping_inject_sleeptime(struct timespec *delta)  		return;  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	timekeeping_forward_now(tk); -	__timekeeping_inject_sleeptime(tk, delta); +	tmp = timespec_to_timespec64(*delta); +	__timekeeping_inject_sleeptime(tk, &tmp);  	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	/* signal hrtimers about time change */ @@ -910,20 +1114,22 @@ void timekeeping_inject_sleeptime(struct timespec *delta)   */  static void timekeeping_resume(void)  { -	struct timekeeper *tk = &timekeeper; -	struct clocksource *clock = tk->clock; +	struct timekeeper *tk = &tk_core.timekeeper; +	struct clocksource *clock = tk->tkr.clock;  	unsigned long flags; -	struct timespec ts_new, ts_delta; +	struct timespec64 ts_new, ts_delta; +	struct timespec tmp;  	cycle_t cycle_now, cycle_delta;  	bool suspendtime_found = false; -	read_persistent_clock(&ts_new); +	read_persistent_clock(&tmp); +	ts_new = timespec_to_timespec64(tmp);  	clockevents_resume();  	clocksource_resume();  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	/*  	 * After system resumes, we need to calculate the suspended time and @@ -937,15 +1143,16 @@ static void timekeeping_resume(void)  	 * The less preferred source will only be tried if there 
is no better  	 * usable source. The rtc part is handled separately in rtc core code.  	 */ -	cycle_now = clock->read(clock); +	cycle_now = tk->tkr.read(clock);  	if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && -		cycle_now > clock->cycle_last) { +		cycle_now > tk->tkr.cycle_last) {  		u64 num, max = ULLONG_MAX;  		u32 mult = clock->mult;  		u32 shift = clock->shift;  		s64 nsec = 0; -		cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; +		cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, +						tk->tkr.mask);  		/*  		 * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -960,10 +1167,10 @@ static void timekeeping_resume(void)  		}  		nsec += ((u64) cycle_delta * mult) >> shift; -		ts_delta = ns_to_timespec(nsec); +		ts_delta = ns_to_timespec64(nsec);  		suspendtime_found = true; -	} else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) { -		ts_delta = timespec_sub(ts_new, timekeeping_suspend_time); +	} else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { +		ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);  		suspendtime_found = true;  	} @@ -971,11 +1178,11 @@ static void timekeeping_resume(void)  		__timekeeping_inject_sleeptime(tk, &ts_delta);  	/* Re-base the last cycle value */ -	tk->cycle_last = clock->cycle_last = cycle_now; +	tk->tkr.cycle_last = cycle_now;  	tk->ntp_error = 0;  	timekeeping_suspended = 0;  	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	touch_softlockup_watchdog(); @@ -988,12 +1195,14 @@ static void timekeeping_resume(void)  static int timekeeping_suspend(void)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long flags; -	struct timespec		delta, delta_delta; -	static struct timespec	old_delta; +	struct timespec64		delta, delta_delta; +	static struct timespec64	old_delta; +	struct timespec tmp; -	read_persistent_clock(&timekeeping_suspend_time); +	read_persistent_clock(&tmp); +	timekeeping_suspend_time = timespec_to_timespec64(tmp);  	/*  	 * On some systems the persistent_clock can not be detected at @@ -1004,7 +1213,7 @@ static int timekeeping_suspend(void)  		persistent_clock_exist = true;  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	timekeeping_forward_now(tk);  	timekeeping_suspended = 1; @@ -1014,8 +1223,8 @@ static int timekeeping_suspend(void)  	 * try to compensate so the difference in system time  	 * and persistent_clock time stays close to constant.  	 
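The resume path above converts a potentially huge cycle delta to nanoseconds with the clocksource's mult/shift pair, chunking the work so the 64-bit multiply cannot overflow. A stand-alone sketch of the idea (the helper name and loop are illustrative, not the patch's exact code; mult is assumed non-zero):

	/* Convert cycles to ns via mult/shift without overflowing 64 bits. */
	static u64 cycles_to_ns_safe(u64 cycles, u32 mult, u32 shift)
	{
		/* largest chunk whose product with mult still fits in 64 bits */
		u64 max_cycles = div_u64(ULLONG_MAX, mult);
		u64 nsec = 0;

		while (cycles > max_cycles) {
			nsec += (max_cycles * mult) >> shift;
			cycles -= max_cycles;
		}
		nsec += (cycles * mult) >> shift;
		return nsec;	/* truncation costs under 1 ns per chunk */
	}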
*/ -	delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time); -	delta_delta = timespec_sub(delta, old_delta); +	delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); +	delta_delta = timespec64_sub(delta, old_delta);  	if (abs(delta_delta.tv_sec)  >= 2) {  		/*  		 * if delta_delta is too large, assume time correction @@ -1025,11 +1234,11 @@ static int timekeeping_suspend(void)  	} else {  		/* Otherwise try to adjust old_system to compensate */  		timekeeping_suspend_time = -			timespec_add(timekeeping_suspend_time, delta_delta); +			timespec64_add(timekeeping_suspend_time, delta_delta);  	}  	timekeeping_update(tk, TK_MIRROR); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); @@ -1050,125 +1259,34 @@ static int __init timekeeping_init_ops(void)  	register_syscore_ops(&timekeeping_syscore_ops);  	return 0;  } -  device_initcall(timekeeping_init_ops);  /* - * If the error is already larger, we look ahead even further - * to compensate for late or lost adjustments. + * Apply a multiplier adjustment to the timekeeper   */ -static __always_inline int timekeeping_bigadjust(struct timekeeper *tk, -						 s64 error, s64 *interval, -						 s64 *offset) +static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, +							 s64 offset, +							 bool negative, +							 int adj_scale)  { -	s64 tick_error, i; -	u32 look_ahead, adj; -	s32 error2, mult; - -	/* -	 * Use the current error value to determine how much to look ahead. -	 * The larger the error the slower we adjust for it to avoid problems -	 * with losing too many ticks, otherwise we would overadjust and -	 * produce an even larger error.  The smaller the adjustment the -	 * faster we try to adjust for it, as lost ticks can do less harm -	 * here.  This is tuned so that an error of about 1 msec is adjusted -	 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). -	 */ -	error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); -	error2 = abs(error2); -	for (look_ahead = 0; error2 > 0; look_ahead++) -		error2 >>= 2; +	s64 interval = tk->cycle_interval; +	s32 mult_adj = 1; -	/* -	 * Now calculate the error in (1 << look_ahead) ticks, but first -	 * remove the single look ahead already included in the error. -	 */ -	tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1); -	tick_error -= tk->xtime_interval >> 1; -	error = ((error - tick_error) >> look_ahead) + tick_error; - -	/* Finally calculate the adjustment shift value.  */ -	i = *interval; -	mult = 1; -	if (error < 0) { -		error = -error; -		*interval = -*interval; -		*offset = -*offset; -		mult = -1; +	if (negative) { +		mult_adj = -mult_adj; +		interval = -interval; +		offset  = -offset;  	} -	for (adj = 0; error > i; adj++) -		error >>= 1; - -	*interval <<= adj; -	*offset <<= adj; -	return mult << adj; -} - -/* - * Adjust the multiplier to reduce the error value, - * this is optimized for the most common adjustments of -1,0,1, - * for other values we can do a bit more work. - */ -static void timekeeping_adjust(struct timekeeper *tk, s64 offset) -{ -	s64 error, interval = tk->cycle_interval; -	int adj; +	mult_adj <<= adj_scale; +	interval <<= adj_scale; +	offset <<= adj_scale;  	/* -	 * The point of this is to check if the error is greater than half -	 * an interval. -	 * -	 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 
-	 * -	 * Note we subtract one in the shift, so that error is really error*2. -	 * This "saves" dividing(shifting) interval twice, but keeps the -	 * (error > interval) comparison as still measuring if error is -	 * larger than half an interval. -	 * -	 * Note: It does not "save" on aggravation when reading the code. -	 */ -	error = tk->ntp_error >> (tk->ntp_error_shift - 1); -	if (error > interval) { -		/* -		 * We now divide error by 4(via shift), which checks if -		 * the error is greater than twice the interval. -		 * If it is greater, we need a bigadjust, if its smaller, -		 * we can adjust by 1. -		 */ -		error >>= 2; -		if (likely(error <= interval)) -			adj = 1; -		else -			adj = timekeeping_bigadjust(tk, error, &interval, &offset); -	} else { -		if (error < -interval) { -			/* See comment above, this is just switched for the negative */ -			error >>= 2; -			if (likely(error >= -interval)) { -				adj = -1; -				interval = -interval; -				offset = -offset; -			} else { -				adj = timekeeping_bigadjust(tk, error, &interval, &offset); -			} -		} else { -			goto out_adjust; -		} -	} - -	if (unlikely(tk->clock->maxadj && -		(tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { -		printk_deferred_once(KERN_WARNING -			"Adjusting %s more than 11%% (%ld vs %ld)\n", -			tk->clock->name, (long)tk->mult + adj, -			(long)tk->clock->mult + tk->clock->maxadj); -	} -	/*  	 * So the following can be confusing.  	 * -	 * To keep things simple, lets assume adj == 1 for now. +	 * To keep things simple, lets assume mult_adj == 1 for now.  	 * -	 * When adj != 1, remember that the interval and offset values +	 * When mult_adj != 1, remember that the interval and offset values  	 * have been appropriately scaled so the math is the same.  	 *  	 * The basic idea here is that we're increasing the multiplier @@ -1212,12 +1330,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)  	 *  	 * XXX - TODO: Doc ntp_error calculation.  	 */ -	tk->mult += adj; +	tk->tkr.mult += mult_adj;  	tk->xtime_interval += interval; -	tk->xtime_nsec -= offset; +	tk->tkr.xtime_nsec -= offset;  	tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; +} + +/* + * Calculate the multiplier adjustment needed to match the frequency + * specified by NTP + */ +static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, +							s64 offset) +{ +	s64 interval = tk->cycle_interval; +	s64 xinterval = tk->xtime_interval; +	s64 tick_error; +	bool negative; +	u32 adj; + +	/* Remove any current error adj from freq calculation */ +	if (tk->ntp_err_mult) +		xinterval -= tk->cycle_interval; + +	tk->ntp_tick = ntp_tick_length(); + +	/* Calculate current error per tick */ +	tick_error = ntp_tick_length() >> tk->ntp_error_shift; +	tick_error -= (xinterval + tk->xtime_remainder); + +	/* Don't worry about correcting it if its small */ +	if (likely((tick_error >= 0) && (tick_error <= interval))) +		return; + +	/* preserve the direction of correction */ +	negative = (tick_error < 0); + +	/* Sort out the magnitude of the correction */ +	tick_error = abs(tick_error); +	for (adj = 0; tick_error > interval; adj++) +		tick_error >>= 1; + +	/* scale the corrections */ +	timekeeping_apply_adjustment(tk, offset, negative, adj); +} + +/* + * Adjust the timekeeper's multiplier to the correct frequency + * and also to reduce the accumulated error value. 
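timekeeping_freqadjust() above reduces the per-tick error to a power-of-two scale and then applies a single multiplier step of 1 << adj_scale in the right direction. A small illustration of how that scale is chosen (hypothetical helper, same arithmetic as the loop above):

	static u32 example_adj_scale(s64 tick_error, s64 cycle_interval)
	{
		u32 adj = 0;

		tick_error = abs(tick_error);
		/* every halving of the error doubles the multiplier step */
		while (tick_error > cycle_interval) {
			tick_error >>= 1;
			adj++;
		}
		return adj;	/* mult then changes by +/-(1 << adj) */
	}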
+ */ +static void timekeeping_adjust(struct timekeeper *tk, s64 offset) +{ +	/* Correct for the current frequency error */ +	timekeeping_freqadjust(tk, offset); + +	/* Next make a small adjustment to fix any cumulative error */ +	if (!tk->ntp_err_mult && (tk->ntp_error > 0)) { +		tk->ntp_err_mult = 1; +		timekeeping_apply_adjustment(tk, offset, 0, 0); +	} else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) { +		/* Undo any existing error adjustment */ +		timekeeping_apply_adjustment(tk, offset, 1, 0); +		tk->ntp_err_mult = 0; +	} + +	if (unlikely(tk->tkr.clock->maxadj && +		(tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) { +		printk_once(KERN_WARNING +			"Adjusting %s more than 11%% (%ld vs %ld)\n", +			tk->tkr.clock->name, (long)tk->tkr.mult, +			(long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); +	} -out_adjust:  	/*  	 * It may be possible that when we entered this function, xtime_nsec  	 * was very small.  Further, if we're slightly speeding the clocksource @@ -1232,12 +1416,11 @@ out_adjust:  	 * We'll correct this error next time through this function, when  	 * xtime_nsec is not as small.  	 */ -	if (unlikely((s64)tk->xtime_nsec < 0)) { -		s64 neg = -(s64)tk->xtime_nsec; -		tk->xtime_nsec = 0; +	if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { +		s64 neg = -(s64)tk->tkr.xtime_nsec; +		tk->tkr.xtime_nsec = 0;  		tk->ntp_error += neg << tk->ntp_error_shift;  	} -  }  /** @@ -1250,26 +1433,26 @@ out_adjust:   */  static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)  { -	u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; +	u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;  	unsigned int clock_set = 0; -	while (tk->xtime_nsec >= nsecps) { +	while (tk->tkr.xtime_nsec >= nsecps) {  		int leap; -		tk->xtime_nsec -= nsecps; +		tk->tkr.xtime_nsec -= nsecps;  		tk->xtime_sec++;  		/* Figure out if its a leap sec and apply if needed */  		leap = second_overflow(tk->xtime_sec);  		if (unlikely(leap)) { -			struct timespec ts; +			struct timespec64 ts;  			tk->xtime_sec += leap;  			ts.tv_sec = leap;  			ts.tv_nsec = 0;  			tk_set_wall_to_mono(tk, -				timespec_sub(tk->wall_to_monotonic, ts)); +				timespec64_sub(tk->wall_to_monotonic, ts));  			__timekeeping_set_tai_offset(tk, tk->tai_offset - leap); @@ -1301,9 +1484,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,  	/* Accumulate one shifted interval */  	offset -= interval; -	tk->cycle_last += interval; +	tk->tkr.cycle_last += interval; -	tk->xtime_nsec += tk->xtime_interval << shift; +	tk->tkr.xtime_nsec += tk->xtime_interval << shift;  	*clock_set |= accumulate_nsecs_to_secs(tk);  	/* Accumulate raw time */ @@ -1317,48 +1500,20 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,  	tk->raw_time.tv_nsec = raw_nsecs;  	/* Accumulate error between NTP and clock interval */ -	tk->ntp_error += ntp_tick_length() << shift; +	tk->ntp_error += tk->ntp_tick << shift;  	tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<  						(tk->ntp_error_shift + shift);  	return offset;  } -#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD -static inline void old_vsyscall_fixup(struct timekeeper *tk) -{ -	s64 remainder; - -	/* -	* Store only full nanoseconds into xtime_nsec after rounding -	* it up and add the remainder to the error difference. -	* XXX - This is necessary to avoid small 1ns inconsistnecies caused -	* by truncating the remainder in vsyscalls. However, it causes -	* additional work to be done in timekeeping_adjust(). 
Once -	* the vsyscall implementations are converted to use xtime_nsec -	* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD -	* users are removed, this can be killed. -	*/ -	remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1); -	tk->xtime_nsec -= remainder; -	tk->xtime_nsec += 1ULL << tk->shift; -	tk->ntp_error += remainder << tk->ntp_error_shift; -	tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift; -} -#else -#define old_vsyscall_fixup(tk) -#endif - - -  /**   * update_wall_time - Uses the current clocksource to increment the wall time   *   */  void update_wall_time(void)  { -	struct clocksource *clock; -	struct timekeeper *real_tk = &timekeeper; +	struct timekeeper *real_tk = &tk_core.timekeeper;  	struct timekeeper *tk = &shadow_timekeeper;  	cycle_t offset;  	int shift = 0, maxshift; @@ -1371,12 +1526,11 @@ void update_wall_time(void)  	if (unlikely(timekeeping_suspended))  		goto out; -	clock = real_tk->clock; -  #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET  	offset = real_tk->cycle_interval;  #else -	offset = (clock->read(clock) - clock->cycle_last) & clock->mask; +	offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), +				   tk->tkr.cycle_last, tk->tkr.mask);  #endif  	/* Check if there's really nothing to do */ @@ -1418,9 +1572,7 @@ void update_wall_time(void)  	 */  	clock_set |= accumulate_nsecs_to_secs(tk); -	write_seqcount_begin(&timekeeper_seq); -	/* Update clock->cycle_last with the new value */ -	clock->cycle_last = tk->cycle_last; +	write_seqcount_begin(&tk_core.seq);  	/*  	 * Update the real timekeeper.  	 * @@ -1428,12 +1580,12 @@ void update_wall_time(void)  	 * requires changes to all other timekeeper usage sites as  	 * well, i.e. move the timekeeper pointer getter into the  	 * spinlocked/seqcount protected sections. And we trade this -	 * memcpy under the timekeeper_seq against one before we start +	 * memcpy under the tk_core.seq against one before we start  	 * updating.  	 */  	memcpy(real_tk, tk, sizeof(*tk));  	timekeeping_update(real_tk, clock_set); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  out:  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	if (clock_set) @@ -1454,83 +1606,16 @@ out:   */  void getboottime(struct timespec *ts)  { -	struct timekeeper *tk = &timekeeper; -	struct timespec boottime = { -		.tv_sec = tk->wall_to_monotonic.tv_sec + -				tk->total_sleep_time.tv_sec, -		.tv_nsec = tk->wall_to_monotonic.tv_nsec + -				tk->total_sleep_time.tv_nsec -	}; - -	set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); -} -EXPORT_SYMBOL_GPL(getboottime); - -/** - * get_monotonic_boottime - Returns monotonic time since boot - * @ts:		pointer to the timespec to be set - * - * Returns the monotonic time since boot in a timespec. - * - * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also - * includes the time spent in suspend. 
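getboottime() is rebuilt below purely from the two ktime offsets the timekeeper already maintains (monotonic to realtime and monotonic to boottime, as the hrtimer helper kerneldoc later in this file spells out). Writing the bookkeeping out shows the one-liner matches the removed wall_to_monotonic + total_sleep_time arithmetic:

	/*
	 *   realtime = monotonic + offs_real
	 *   boottime = monotonic + offs_boot
	 * therefore
	 *   offs_real - offs_boot = realtime - boottime
	 * which is the wall-clock time at which the boottime clock read zero,
	 * i.e. the value getboottime() has always reported.
	 */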
- */ -void get_monotonic_boottime(struct timespec *ts) -{ -	struct timekeeper *tk = &timekeeper; -	struct timespec tomono, sleep; -	s64 nsec; -	unsigned int seq; - -	WARN_ON(timekeeping_suspended); - -	do { -		seq = read_seqcount_begin(&timekeeper_seq); -		ts->tv_sec = tk->xtime_sec; -		nsec = timekeeping_get_ns(tk); -		tomono = tk->wall_to_monotonic; -		sleep = tk->total_sleep_time; - -	} while (read_seqcount_retry(&timekeeper_seq, seq)); - -	ts->tv_sec += tomono.tv_sec + sleep.tv_sec; -	ts->tv_nsec = 0; -	timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec); -} -EXPORT_SYMBOL_GPL(get_monotonic_boottime); - -/** - * ktime_get_boottime - Returns monotonic time since boot in a ktime - * - * Returns the monotonic time since boot in a ktime - * - * This is similar to CLOCK_MONTONIC/ktime_get, but also - * includes the time spent in suspend. - */ -ktime_t ktime_get_boottime(void) -{ -	struct timespec ts; - -	get_monotonic_boottime(&ts); -	return timespec_to_ktime(ts); -} -EXPORT_SYMBOL_GPL(ktime_get_boottime); - -/** - * monotonic_to_bootbased - Convert the monotonic time to boot based. - * @ts:		pointer to the timespec to be converted - */ -void monotonic_to_bootbased(struct timespec *ts) -{ -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper; +	ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot); -	*ts = timespec_add(*ts, tk->total_sleep_time); +	*ts = ktime_to_timespec(t);  } -EXPORT_SYMBOL_GPL(monotonic_to_bootbased); +EXPORT_SYMBOL_GPL(getboottime);  unsigned long get_seconds(void)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	return tk->xtime_sec;  } @@ -1538,43 +1623,44 @@ EXPORT_SYMBOL(get_seconds);  struct timespec __current_kernel_time(void)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper; -	return tk_xtime(tk); +	return timespec64_to_timespec(tk_xtime(tk));  }  struct timespec current_kernel_time(void)  { -	struct timekeeper *tk = &timekeeper; -	struct timespec now; +	struct timekeeper *tk = &tk_core.timekeeper; +	struct timespec64 now;  	unsigned long seq;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq);  		now = tk_xtime(tk); -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq)); -	return now; +	return timespec64_to_timespec(now);  }  EXPORT_SYMBOL(current_kernel_time);  struct timespec get_monotonic_coarse(void)  { -	struct timekeeper *tk = &timekeeper; -	struct timespec now, mono; +	struct timekeeper *tk = &tk_core.timekeeper; +	struct timespec64 now, mono;  	unsigned long seq;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq);  		now = tk_xtime(tk);  		mono = tk->wall_to_monotonic; -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq)); -	set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, +	set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,  				now.tv_nsec + mono.tv_nsec); -	return now; + +	return timespec64_to_timespec(now);  }  /* @@ -1587,29 +1673,38 @@ void do_timer(unsigned long ticks)  }  /** - * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic, - *    and sleep offsets. 
- * @xtim:	pointer to timespec to be set with xtime - * @wtom:	pointer to timespec to be set with wall_to_monotonic - * @sleep:	pointer to timespec to be set with time in suspend + * ktime_get_update_offsets_tick - hrtimer helper + * @offs_real:	pointer to storage for monotonic -> realtime offset + * @offs_boot:	pointer to storage for monotonic -> boottime offset + * @offs_tai:	pointer to storage for monotonic -> clock tai offset + * + * Returns monotonic time at last tick and various offsets   */ -void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, -				struct timespec *wtom, struct timespec *sleep) +ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, +							ktime_t *offs_tai)  { -	struct timekeeper *tk = &timekeeper; -	unsigned long seq; +	struct timekeeper *tk = &tk_core.timekeeper; +	unsigned int seq; +	ktime_t base; +	u64 nsecs;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); -		*xtim = tk_xtime(tk); -		*wtom = tk->wall_to_monotonic; -		*sleep = tk->total_sleep_time; -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +		seq = read_seqcount_begin(&tk_core.seq); + +		base = tk->tkr.base_mono; +		nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; + +		*offs_real = tk->offs_real; +		*offs_boot = tk->offs_boot; +		*offs_tai = tk->offs_tai; +	} while (read_seqcount_retry(&tk_core.seq, seq)); + +	return ktime_add_ns(base, nsecs);  }  #ifdef CONFIG_HIGH_RES_TIMERS  /** - * ktime_get_update_offsets - hrtimer helper + * ktime_get_update_offsets_now - hrtimer helper   * @offs_real:	pointer to storage for monotonic -> realtime offset   * @offs_boot:	pointer to storage for monotonic -> boottime offset   * @offs_tai:	pointer to storage for monotonic -> clock tai offset @@ -1617,57 +1712,37 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,   * Returns current monotonic time and updates the offsets   * Called from hrtimer_interrupt() or retrigger_next_event()   */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, +ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,  							ktime_t *offs_tai)  { -	struct timekeeper *tk = &timekeeper; -	ktime_t now; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned int seq; -	u64 secs, nsecs; +	ktime_t base; +	u64 nsecs;  	do { -		seq = read_seqcount_begin(&timekeeper_seq); +		seq = read_seqcount_begin(&tk_core.seq); -		secs = tk->xtime_sec; -		nsecs = timekeeping_get_ns(tk); +		base = tk->tkr.base_mono; +		nsecs = timekeeping_get_ns(&tk->tkr);  		*offs_real = tk->offs_real;  		*offs_boot = tk->offs_boot;  		*offs_tai = tk->offs_tai; -	} while (read_seqcount_retry(&timekeeper_seq, seq)); +	} while (read_seqcount_retry(&tk_core.seq, seq)); -	now = ktime_add_ns(ktime_set(secs, 0), nsecs); -	now = ktime_sub(now, *offs_real); -	return now; +	return ktime_add_ns(base, nsecs);  }  #endif  /** - * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format - */ -ktime_t ktime_get_monotonic_offset(void) -{ -	struct timekeeper *tk = &timekeeper; -	unsigned long seq; -	struct timespec wtom; - -	do { -		seq = read_seqcount_begin(&timekeeper_seq); -		wtom = tk->wall_to_monotonic; -	} while (read_seqcount_retry(&timekeeper_seq, seq)); - -	return timespec_to_ktime(wtom); -} -EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - -/**   * do_adjtimex() - Accessor function to NTP __do_adjtimex function   */  int do_adjtimex(struct timex *txc)  { -	struct timekeeper *tk = &timekeeper; +	struct timekeeper *tk = &tk_core.timekeeper;  	unsigned long flags; -	
struct timespec ts; +	struct timespec64 ts;  	s32 orig_tai, tai;  	int ret; @@ -1687,10 +1762,10 @@ int do_adjtimex(struct timex *txc)  			return ret;  	} -	getnstimeofday(&ts); +	getnstimeofday64(&ts);  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	orig_tai = tai = tk->tai_offset;  	ret = __do_adjtimex(txc, &ts, &tai); @@ -1699,7 +1774,7 @@ int do_adjtimex(struct timex *txc)  		__timekeeping_set_tai_offset(tk, tai);  		timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);  	} -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  	if (tai != orig_tai) @@ -1719,11 +1794,11 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)  	unsigned long flags;  	raw_spin_lock_irqsave(&timekeeper_lock, flags); -	write_seqcount_begin(&timekeeper_seq); +	write_seqcount_begin(&tk_core.seq);  	__hardpps(phase_ts, raw_ts); -	write_seqcount_end(&timekeeper_seq); +	write_seqcount_end(&tk_core.seq);  	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);  }  EXPORT_SYMBOL(hardpps); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h new file mode 100644 index 000000000000..adc1fc98bde3 --- /dev/null +++ b/kernel/time/timekeeping.h @@ -0,0 +1,20 @@ +#ifndef _KERNEL_TIME_TIMEKEEPING_H +#define _KERNEL_TIME_TIMEKEEPING_H +/* + * Internal interfaces for kernel/time/ + */ +extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, +						ktime_t *offs_boot, +						ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, +						ktime_t *offs_boot, +						ktime_t *offs_tai); + +extern int timekeeping_valid_for_hres(void); +extern u64 timekeeping_max_deferment(void); +extern int timekeeping_inject_offset(struct timespec *ts); +extern s32 timekeeping_get_tai_offset(void); +extern void timekeeping_set_tai_offset(s32 tai_offset); +extern void timekeeping_clocktai(struct timespec *ts); + +#endif diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 4d54f97558df..f6bd65236712 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -67,7 +67,7 @@ static int __init tk_debug_sleep_time_init(void)  }  late_initcall(tk_debug_sleep_time_init); -void tk_debug_account_sleep_time(struct timespec *t) +void tk_debug_account_sleep_time(struct timespec64 *t)  {  	sleep_time_bin[fls(t->tv_sec)]++;  } diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 13323ea08ffa..4ea005a7f9da 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -3,12 +3,27 @@  /*   * timekeeping debug functions   */ +#include <linux/clocksource.h>  #include <linux/time.h>  #ifdef CONFIG_DEBUG_FS -extern void tk_debug_account_sleep_time(struct timespec *t); +extern void tk_debug_account_sleep_time(struct timespec64 *t);  #else  #define tk_debug_account_sleep_time(x)  #endif +#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ +	cycle_t ret = (now - last) & mask; + +	return (s64) ret > 0 ? 
ret : 0; +} +#else +static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) +{ +	return (now - last) & mask; +} +#endif +  #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/timer.c b/kernel/time/timer.c index 3bb01a323b2a..aca5dfe2fa3d 100644 --- a/kernel/timer.c +++ b/kernel/time/timer.c @@ -82,6 +82,7 @@ struct tvec_base {  	unsigned long next_timer;  	unsigned long active_timers;  	unsigned long all_timers; +	int cpu;  	struct tvec_root tv1;  	struct tvec tv2;  	struct tvec tv3; @@ -409,6 +410,22 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)  			base->next_timer = timer->expires;  	}  	base->all_timers++; + +	/* +	 * Check whether the other CPU is in dynticks mode and needs +	 * to be triggered to reevaluate the timer wheel. +	 * We are protected against the other CPU fiddling +	 * with the timer by holding the timer base lock. This also +	 * makes sure that a CPU on the way to stop its tick can not +	 * evaluate the timer wheel. +	 * +	 * Spare the IPI for deferrable timers on idle targets though. +	 * The next busy ticks will take care of it. Except full dynticks +	 * require special care against races with idle_cpu(), lets deal +	 * with that later. +	 */ +	if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) +		wake_up_nohz_cpu(base->cpu);  }  #ifdef CONFIG_TIMER_STATS @@ -948,22 +965,6 @@ void add_timer_on(struct timer_list *timer, int cpu)  	timer_set_base(timer, base);  	debug_activate(timer, timer->expires);  	internal_add_timer(base, timer); -	/* -	 * Check whether the other CPU is in dynticks mode and needs -	 * to be triggered to reevaluate the timer wheel. -	 * We are protected against the other CPU fiddling -	 * with the timer by holding the timer base lock. This also -	 * makes sure that a CPU on the way to stop its tick can not -	 * evaluate the timer wheel. -	 * -	 * Spare the IPI for deferrable timers on idle targets though. -	 * The next busy ticks will take care of it. Except full dynticks -	 * require special care against races with idle_cpu(), lets deal -	 * with that later. -	 */ -	if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) -		wake_up_nohz_cpu(cpu); -  	spin_unlock_irqrestore(&base->lock, flags);  }  EXPORT_SYMBOL_GPL(add_timer_on); @@ -1568,6 +1569,7 @@ static int init_timers_cpu(int cpu)  		}  		spin_lock_init(&base->lock);  		tvec_base_done[cpu] = 1; +		base->cpu = cpu;  	} else {  		base = per_cpu(tvec_bases, cpu);  	} diff --git a/kernel/time/udelay_test.c b/kernel/time/udelay_test.c new file mode 100644 index 000000000000..e622ba365a13 --- /dev/null +++ b/kernel/time/udelay_test.c @@ -0,0 +1,168 @@ +/* + * udelay() test kernel module + * + * Test is executed by writing and reading to /sys/kernel/debug/udelay_test + * Tests are configured by writing: USECS ITERATIONS + * Tests are executed by reading from the same file. + * Specifying usecs of 0 or negative values will run multiples tests. + * + * Copyright (C) 2014 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. 
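For the clocksource_delta() pair added to timekeeping_internal.h above, a worked example (hypothetical values) shows what the CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE variant filters out:

	/*
	 * Full 64-bit counter, mask = ~0ULL, hardware steps backwards:
	 *   last = 1000, now = 990
	 *   (now - last) & mask = 0xfffffffffffffff6, i.e. -10 as an s64
	 *   plain variant      -> returns that huge bogus delta
	 *   validating variant -> (s64)ret > 0 is false, returns 0
	 * For counters narrower than 64 bits the masked result can never have
	 * bit 63 set, so the two variants behave identically there.
	 */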
+ */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/ktime.h> +#include <linux/module.h> +#include <linux/uaccess.h> + +#define DEFAULT_ITERATIONS 100 + +#define DEBUGFS_FILENAME "udelay_test" + +static DEFINE_MUTEX(udelay_test_lock); +static struct dentry *udelay_test_debugfs_file; +static int udelay_test_usecs; +static int udelay_test_iterations = DEFAULT_ITERATIONS; + +static int udelay_test_single(struct seq_file *s, int usecs, uint32_t iters) +{ +	int min = 0, max = 0, fail_count = 0; +	uint64_t sum = 0; +	uint64_t avg; +	int i; +	/* Allow udelay to be up to 0.5% fast */ +	int allowed_error_ns = usecs * 5; + +	for (i = 0; i < iters; ++i) { +		struct timespec ts1, ts2; +		int time_passed; + +		ktime_get_ts(&ts1); +		udelay(usecs); +		ktime_get_ts(&ts2); +		time_passed = timespec_to_ns(&ts2) - timespec_to_ns(&ts1); + +		if (i == 0 || time_passed < min) +			min = time_passed; +		if (i == 0 || time_passed > max) +			max = time_passed; +		if ((time_passed + allowed_error_ns) / 1000 < usecs) +			++fail_count; +		WARN_ON(time_passed < 0); +		sum += time_passed; +	} + +	avg = sum; +	do_div(avg, iters); +	seq_printf(s, "%d usecs x %d: exp=%d allowed=%d min=%d avg=%lld max=%d", +			usecs, iters, usecs * 1000, +			(usecs * 1000) - allowed_error_ns, min, avg, max); +	if (fail_count) +		seq_printf(s, " FAIL=%d", fail_count); +	seq_puts(s, "\n"); + +	return 0; +} + +static int udelay_test_show(struct seq_file *s, void *v) +{ +	int usecs; +	int iters; +	int ret = 0; + +	mutex_lock(&udelay_test_lock); +	usecs = udelay_test_usecs; +	iters = udelay_test_iterations; +	mutex_unlock(&udelay_test_lock); + +	if (usecs > 0 && iters > 0) { +		return udelay_test_single(s, usecs, iters); +	} else if (usecs == 0) { +		struct timespec ts; + +		ktime_get_ts(&ts); +		seq_printf(s, "udelay() test (lpj=%ld kt=%ld.%09ld)\n", +				loops_per_jiffy, ts.tv_sec, ts.tv_nsec); +		seq_puts(s, "usage:\n"); +		seq_puts(s, "echo USECS [ITERS] > " DEBUGFS_FILENAME "\n"); +		seq_puts(s, "cat " DEBUGFS_FILENAME "\n"); +	} + +	return ret; +} + +static int udelay_test_open(struct inode *inode, struct file *file) +{ +	return single_open(file, udelay_test_show, inode->i_private); +} + +static ssize_t udelay_test_write(struct file *file, const char __user *buf, +		size_t count, loff_t *pos) +{ +	char lbuf[32]; +	int ret; +	int usecs; +	int iters; + +	if (count >= sizeof(lbuf)) +		return -EINVAL; + +	if (copy_from_user(lbuf, buf, count)) +		return -EFAULT; +	lbuf[count] = '\0'; + +	ret = sscanf(lbuf, "%d %d", &usecs, &iters); +	if (ret < 1) +		return -EINVAL; +	else if (ret < 2) +		iters = DEFAULT_ITERATIONS; + +	mutex_lock(&udelay_test_lock); +	udelay_test_usecs = usecs; +	udelay_test_iterations = iters; +	mutex_unlock(&udelay_test_lock); + +	return count; +} + +static const struct file_operations udelay_test_debugfs_ops = { +	.owner = THIS_MODULE, +	.open = udelay_test_open, +	.read = seq_read, +	.write = udelay_test_write, +	.llseek = seq_lseek, +	.release = single_release, +}; + +static int __init udelay_test_init(void) +{ +	mutex_lock(&udelay_test_lock); +	udelay_test_debugfs_file = debugfs_create_file(DEBUGFS_FILENAME, +			S_IRUSR, NULL, NULL, &udelay_test_debugfs_ops); +	mutex_unlock(&udelay_test_lock); + +	return 0; +} + +module_init(udelay_test_init); + +static void __exit udelay_test_exit(void) +{ +	mutex_lock(&udelay_test_lock); +	debugfs_remove(udelay_test_debugfs_file); +	mutex_unlock(&udelay_test_lock); +} + +module_exit(udelay_test_exit); + +MODULE_AUTHOR("David Riley <davidriley@chromium.org>"); 
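Going by the module's own header comment and the usage text it prints, a typical session looks like this (paths assume debugfs is mounted at /sys/kernel/debug):

	echo "100 1000" > /sys/kernel/debug/udelay_test   # 100 usec delays, 1000 iterations
	cat /sys/kernel/debug/udelay_test                  # runs the test, prints min/avg/max and any FAIL count
	echo "0" > /sys/kernel/debug/udelay_test           # usecs of 0 makes the next read print the usage help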
+MODULE_LICENSE("GPL"); diff --git a/kernel/torture.c b/kernel/torture.c index 40bb511cca48..d600af21f022 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -708,7 +708,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,  	int ret = 0;  	VERBOSE_TOROUT_STRING(m); -	*tp = kthread_run(fn, arg, s); +	*tp = kthread_run(fn, arg, "%s", s);  	if (IS_ERR(*tp)) {  		ret = PTR_ERR(*tp);  		VERBOSE_TOROUT_ERRSTRING(f); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d4409356f40d..a5da09c899dd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST  	help  	  See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_TRACE_MCOUNT_TEST -	bool -	help -	  See Documentation/trace/ftrace-design.txt -  config HAVE_DYNAMIC_FTRACE  	bool  	help diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2611613f14f1..67d6369ddf83 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o  obj-$(CONFIG_TRACING) += trace.o  obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_seq.o  obj-$(CONFIG_TRACING) += trace_stat.o  obj-$(CONFIG_TRACING) += trace_printk.o  obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ac9d1dad630b..1654b12c891a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -80,9 +80,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {  int ftrace_enabled __read_mostly;  static int last_ftrace_enabled; -/* Quick disabling of function tracer. */ -int function_trace_stop __read_mostly; -  /* Current function tracing op */  struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;  /* What to set function_trace_op to */ @@ -1042,6 +1039,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;  #ifdef CONFIG_DYNAMIC_FTRACE +static struct ftrace_ops *removed_ops; +  #ifndef CONFIG_FTRACE_MCOUNT_RECORD  # error Dynamic ftrace depends on MCOUNT_RECORD  #endif @@ -1304,25 +1303,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  	struct ftrace_hash *new_hash;  	int size = src->count;  	int bits = 0; -	int ret;  	int i;  	/* -	 * Remove the current set, update the hash and add -	 * them back. -	 */ -	ftrace_hash_rec_disable(ops, enable); - -	/*  	 * If the new source is empty, just free dst and assign it  	 * the empty_hash.  	 */  	if (!src->count) { -		free_ftrace_hash_rcu(*dst); -		rcu_assign_pointer(*dst, EMPTY_HASH); -		/* still need to update the function records */ -		ret = 0; -		goto out; +		new_hash = EMPTY_HASH; +		goto update;  	}  	/* @@ -1335,10 +1324,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  	if (bits > FTRACE_HASH_MAX_BITS)  		bits = FTRACE_HASH_MAX_BITS; -	ret = -ENOMEM;  	new_hash = alloc_ftrace_hash(bits);  	if (!new_hash) -		goto out; +		return -ENOMEM;  	size = 1 << src->size_bits;  	for (i = 0; i < size; i++) { @@ -1349,20 +1337,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,  		}  	} +update: +	/* +	 * Remove the current set, update the hash and add +	 * them back. +	 */ +	ftrace_hash_rec_disable(ops, enable); +  	old_hash = *dst;  	rcu_assign_pointer(*dst, new_hash);  	free_ftrace_hash_rcu(old_hash); -	ret = 0; - out: -	/* -	 * Enable regardless of ret: -	 *  On success, we enable the new hash. -	 *  On failure, we re-enable the original hash. 
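The reordering above turns ftrace_hash_move() into a prepare-then-commit sequence: the replacement hash is fully built (or EMPTY_HASH chosen) before any record accounting is touched, so an allocation failure returns -ENOMEM with the live hash undisturbed. In outline (condensed, not the exact function body):

	new_hash = src->count ? alloc_ftrace_hash(bits) : EMPTY_HASH;
	if (!new_hash)
		return -ENOMEM;			/* nothing disabled, nothing to undo */
	/* ... copy the entries from src into new_hash ... */

	ftrace_hash_rec_disable(ops, enable);	/* the commit starts here */
	old_hash = *dst;
	rcu_assign_pointer(*dst, new_hash);
	free_ftrace_hash_rcu(old_hash);
	ftrace_hash_rec_enable(ops, enable);
	return 0;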
-	 */  	ftrace_hash_rec_enable(ops, enable); -	return ret; +	return 0;  }  /* @@ -1492,6 +1480,53 @@ int ftrace_text_reserved(const void *start, const void *end)  	return (int)!!ret;  } +/* Test if ops registered to this rec needs regs */ +static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) +{ +	struct ftrace_ops *ops; +	bool keep_regs = false; + +	for (ops = ftrace_ops_list; +	     ops != &ftrace_list_end; ops = ops->next) { +		/* pass rec in as regs to have non-NULL val */ +		if (ftrace_ops_test(ops, rec->ip, rec)) { +			if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { +				keep_regs = true; +				break; +			} +		} +	} + +	return  keep_regs; +} + +static void ftrace_remove_tramp(struct ftrace_ops *ops, +				struct dyn_ftrace *rec) +{ +	struct ftrace_func_entry *entry; + +	entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip); +	if (!entry) +		return; + +	/* +	 * The tramp_hash entry will be removed at time +	 * of update. +	 */ +	ops->nr_trampolines--; +	rec->flags &= ~FTRACE_FL_TRAMP; +} + +static void ftrace_clear_tramps(struct dyn_ftrace *rec) +{ +	struct ftrace_ops *op; + +	do_for_each_ftrace_op(op, ftrace_ops_list) { +		if (op->nr_trampolines) +			ftrace_remove_tramp(op, rec); +	} while_for_each_ftrace_op(op); +} +  static void __ftrace_hash_rec_update(struct ftrace_ops *ops,  				     int filter_hash,  				     bool inc) @@ -1572,8 +1607,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,  		if (inc) {  			rec->flags++; -			if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) +			if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))  				return; + +			/* +			 * If there's only a single callback registered to a +			 * function, and the ops has a trampoline registered +			 * for it, then we can call it directly. +			 */ +			if (ftrace_rec_count(rec) == 1 && ops->trampoline) { +				rec->flags |= FTRACE_FL_TRAMP; +				ops->nr_trampolines++; +			} else { +				/* +				 * If we are adding another function callback +				 * to this function, and the previous had a +				 * trampoline used, then we need to go back to +				 * the default trampoline. +				 */ +				rec->flags &= ~FTRACE_FL_TRAMP; + +				/* remove trampolines from any ops for this rec */ +				ftrace_clear_tramps(rec); +			} +  			/*  			 * If any ops wants regs saved for this function  			 * then all ops will get saved regs. @@ -1581,9 +1638,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,  			if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)  				rec->flags |= FTRACE_FL_REGS;  		} else { -			if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) +			if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))  				return;  			rec->flags--; + +			if (ops->trampoline && !ftrace_rec_count(rec)) +				ftrace_remove_tramp(ops, rec); + +			/* +			 * If the rec had REGS enabled and the ops that is +			 * being removed had REGS set, then see if there is +			 * still any ops for this record that wants regs. +			 * If not, we can stop recording them. +			 */ +			if (ftrace_rec_count(rec) > 0 && +			    rec->flags & FTRACE_FL_REGS && +			    ops->flags & FTRACE_OPS_FL_SAVE_REGS) { +				if (!test_rec_ops_needs_regs(rec)) +					rec->flags &= ~FTRACE_FL_REGS; +			} + +			/* +			 * flags will be cleared in ftrace_check_record() +			 * if rec count is zero. +			 */  		}  		count++;  		/* Shortcut, if we handled all records, we are done. 
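The trampoline accounting added above boils down to one rule: a record may point at an ops-private trampoline only while that ops is the sole callback attached to it. Roughly:

	ftrace_rec_count(rec) == 1 && ops->trampoline   ->  set FTRACE_FL_TRAMP, bump ops->nr_trampolines
	a second callback attaches to the same record   ->  clear FTRACE_FL_TRAMP and drop the trampoline
	                                                    counts, falling back to the shared list function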
*/ @@ -1668,17 +1746,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)  	 * If we are disabling calls, then disable all records that  	 * are enabled.  	 */ -	if (enable && (rec->flags & ~FTRACE_FL_MASK)) +	if (enable && ftrace_rec_count(rec))  		flag = FTRACE_FL_ENABLED;  	/* -	 * If enabling and the REGS flag does not match the REGS_EN, then -	 * do not ignore this record. Set flags to fail the compare against -	 * ENABLED. +	 * If enabling and the REGS flag does not match the REGS_EN, or +	 * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore +	 * this record. Set flags to fail the compare against ENABLED.  	 */ -	if (flag && -	    (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) -		flag |= FTRACE_FL_REGS; +	if (flag) { +		if (!(rec->flags & FTRACE_FL_REGS) !=  +		    !(rec->flags & FTRACE_FL_REGS_EN)) +			flag |= FTRACE_FL_REGS; + +		if (!(rec->flags & FTRACE_FL_TRAMP) !=  +		    !(rec->flags & FTRACE_FL_TRAMP_EN)) +			flag |= FTRACE_FL_TRAMP; +	}  	/* If the state of this record hasn't changed, then do nothing */  	if ((rec->flags & FTRACE_FL_ENABLED) == flag) @@ -1696,6 +1780,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)  				else  					rec->flags &= ~FTRACE_FL_REGS_EN;  			} +			if (flag & FTRACE_FL_TRAMP) { +				if (rec->flags & FTRACE_FL_TRAMP) +					rec->flags |= FTRACE_FL_TRAMP_EN; +				else +					rec->flags &= ~FTRACE_FL_TRAMP_EN; +			}  		}  		/* @@ -1704,7 +1794,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)  		 * Otherwise,  		 *   return UPDATE_MODIFY_CALL to tell the caller to convert  		 *   from the save regs, to a non-save regs function or -		 *   vice versa. +		 *   vice versa, or from a trampoline call.  		 
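Both of the tests added above rely on the !(flags & A) != !(flags & B) idiom: each side is normalized to 0 or 1 before the comparison, so it asks whether the two flag bits disagree without caring where the bits sit. A tiny self-contained illustration (flag names are hypothetical):

	#define FL_WANT		(1 << 3)	/* what the attached ops request   */
	#define FL_HAVE		(1 << 7)	/* what the callsite currently has */

	static bool flags_disagree(unsigned long flags)
	{
		/* ! maps "bit set" to 0 and "bit clear" to 1 on both sides */
		return !(flags & FL_WANT) != !(flags & FL_HAVE);
	}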
*/  		if (flag & FTRACE_FL_ENABLED)  			return FTRACE_UPDATE_MAKE_CALL; @@ -1714,7 +1804,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)  	if (update) {  		/* If there's no more users, clear all flags */ -		if (!(rec->flags & ~FTRACE_FL_MASK)) +		if (!ftrace_rec_count(rec))  			rec->flags = 0;  		else  			/* Just disable the record (keep REGS state) */ @@ -1751,6 +1841,43 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)  	return ftrace_check_record(rec, enable, 0);  } +static struct ftrace_ops * +ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) +{ +	struct ftrace_ops *op; + +	/* Removed ops need to be tested first */ +	if (removed_ops && removed_ops->tramp_hash) { +		if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) +			return removed_ops; +	} + +	do_for_each_ftrace_op(op, ftrace_ops_list) { +		if (!op->tramp_hash) +			continue; + +		if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) +			return op; + +	} while_for_each_ftrace_op(op); + +	return NULL; +} + +static struct ftrace_ops * +ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) +{ +	struct ftrace_ops *op; + +	do_for_each_ftrace_op(op, ftrace_ops_list) { +		/* pass rec in as regs to have non-NULL val */ +		if (ftrace_ops_test(op, rec->ip, rec)) +			return op; +	} while_for_each_ftrace_op(op); + +	return NULL; +} +  /**   * ftrace_get_addr_new - Get the call address to set to   * @rec:  The ftrace record descriptor @@ -1763,6 +1890,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)   */  unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)  { +	struct ftrace_ops *ops; + +	/* Trampolines take precedence over regs */ +	if (rec->flags & FTRACE_FL_TRAMP) { +		ops = ftrace_find_tramp_ops_new(rec); +		if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { +			pr_warning("Bad trampoline accounting at: %p (%pS)\n", +				    (void *)rec->ip, (void *)rec->ip); +			/* Ftrace is shutting down, return anything */ +			return (unsigned long)FTRACE_ADDR; +		} +		return ops->trampoline; +	} +  	if (rec->flags & FTRACE_FL_REGS)  		return (unsigned long)FTRACE_REGS_ADDR;  	else @@ -1781,6 +1922,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)   */  unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)  { +	struct ftrace_ops *ops; + +	/* Trampolines take precedence over regs */ +	if (rec->flags & FTRACE_FL_TRAMP_EN) { +		ops = ftrace_find_tramp_ops_curr(rec); +		if (FTRACE_WARN_ON(!ops)) { +			pr_warning("Bad trampoline accounting at: %p (%pS)\n", +				    (void *)rec->ip, (void *)rec->ip); +			/* Ftrace is shutting down, return anything */ +			return (unsigned long)FTRACE_ADDR; +		} +		return ops->trampoline; +	} +  	if (rec->flags & FTRACE_FL_REGS_EN)  		return (unsigned long)FTRACE_REGS_ADDR;  	else @@ -2023,6 +2178,89 @@ void __weak arch_ftrace_update_code(int command)  	ftrace_run_stop_machine(command);  } +static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops) +{ +	struct ftrace_page *pg; +	struct dyn_ftrace *rec; +	int size, bits; +	int ret; + +	size = ops->nr_trampolines; +	bits = 0; +	/* +	 * Make the hash size about 1/2 the # found +	 */ +	for (size /= 2; size; size >>= 1) +		bits++; + +	ops->tramp_hash = alloc_ftrace_hash(bits); +	/* +	 * TODO: a failed allocation is going to screw up +	 * the accounting of what needs to be modified +	 * and not. For now, we kill ftrace if we fail +	 * to allocate here. But there are ways around this, +	 * but that will take a little more work. 
+	 */ +	if (!ops->tramp_hash) +		return -ENOMEM; + +	do_for_each_ftrace_rec(pg, rec) { +		if (ftrace_rec_count(rec) == 1 && +		    ftrace_ops_test(ops, rec->ip, rec)) { + +			/* +			 * If another ops adds to a rec, the rec will +			 * lose its trampoline and never get it back +			 * until all ops are off of it. +			 */ +			if (!(rec->flags & FTRACE_FL_TRAMP)) +				continue; + +			/* This record had better have a trampoline */ +			if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN))) +				return -1; + +			ret = add_hash_entry(ops->tramp_hash, rec->ip); +			if (ret < 0) +				return ret; +		} +	} while_for_each_ftrace_rec(); + +	/* The number of recs in the hash must match nr_trampolines */ +	FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines); + +	return 0; +} + +static int ftrace_save_tramp_hashes(void) +{ +	struct ftrace_ops *op; +	int ret; + +	/* +	 * Now that any trampoline is being used, we need to save the +	 * hashes for the ops that have them. This allows the mapping +	 * back from the record to the ops that has the trampoline to +	 * know what code is being replaced. Modifying code must always +	 * verify what it is changing. +	 */ +	do_for_each_ftrace_op(op, ftrace_ops_list) { + +		/* The tramp_hash is recreated each time. */ +		free_ftrace_hash(op->tramp_hash); +		op->tramp_hash = NULL; + +		if (op->nr_trampolines) { +			ret = ftrace_save_ops_tramp_hash(op); +			if (ret) +				return ret; +		} + +	} while_for_each_ftrace_op(op); + +	return 0; +} +  static void ftrace_run_update_code(int command)  {  	int ret; @@ -2031,11 +2269,6 @@ static void ftrace_run_update_code(int command)  	FTRACE_WARN_ON(ret);  	if (ret)  		return; -	/* -	 * Do not call function tracer while we update the code. -	 * We are in stop machine. -	 */ -	function_trace_stop++;  	/*  	 * By default we use stop_machine() to modify the code. @@ -2045,15 +2278,15 @@ static void ftrace_run_update_code(int command)  	 */  	arch_ftrace_update_code(command); -	function_trace_stop--; -  	ret = ftrace_arch_code_modify_post_process();  	FTRACE_WARN_ON(ret); + +	ret = ftrace_save_tramp_hashes(); +	FTRACE_WARN_ON(ret);  }  static ftrace_func_t saved_ftrace_func;  static int ftrace_start_up; -static int global_start_up;  static void control_ops_free(struct ftrace_ops *ops)  { @@ -2117,8 +2350,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)  	ftrace_hash_rec_disable(ops, 1); -	if (!global_start_up) -		ops->flags &= ~FTRACE_OPS_FL_ENABLED; +	ops->flags &= ~FTRACE_OPS_FL_ENABLED;  	command |= FTRACE_UPDATE_CALLS; @@ -2139,8 +2371,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)  		return 0;  	} +	/* +	 * If the ops uses a trampoline, then it needs to be +	 * tested first on update. +	 */ +	removed_ops = ops; +  	ftrace_run_update_code(command); +	removed_ops = NULL; +  	/*  	 * Dynamic ops may be freed, we must make sure that all  	 * callers are done before leaving this function. @@ -2398,7 +2638,8 @@ ftrace_allocate_pages(unsigned long num_to_init)  	return start_pg;   free_pages: -	while (start_pg) { +	pg = start_pg; +	while (pg) {  		order = get_count_order(pg->size / ENTRIES_PER_PAGE);  		free_pages((unsigned long)pg->records, order);  		start_pg = pg->next; @@ -2595,8 +2836,10 @@ static void *t_start(struct seq_file *m, loff_t *pos)  	 * off, we can short cut and just print out that all  	 * functions are enabled.  	 
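The extra condition added just below exists because an empty hash means opposite things for the two interfaces: an empty filter_hash says "trace everything", while an empty notrace_hash says "exclude nothing". Both cases short-circuit to the PRINTALL path, and the t_show() change further down picks the matching banner, so the files read roughly like this (illustrative):

	# cat set_ftrace_filter      (empty filter hash)
	#### all functions enabled ####
	# cat set_ftrace_notrace     (empty notrace hash)
	#### no functions disabled ####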
*/ -	if (iter->flags & FTRACE_ITER_FILTER && -	    ftrace_hash_empty(ops->filter_hash)) { +	if ((iter->flags & FTRACE_ITER_FILTER && +	     ftrace_hash_empty(ops->filter_hash)) || +	    (iter->flags & FTRACE_ITER_NOTRACE && +	     ftrace_hash_empty(ops->notrace_hash))) {  		if (*pos > 0)  			return t_hash_start(m, pos);  		iter->flags |= FTRACE_ITER_PRINTALL; @@ -2641,7 +2884,10 @@ static int t_show(struct seq_file *m, void *v)  		return t_hash_show(m, iter);  	if (iter->flags & FTRACE_ITER_PRINTALL) { -		seq_printf(m, "#### all functions enabled ####\n"); +		if (iter->flags & FTRACE_ITER_NOTRACE) +			seq_printf(m, "#### no functions disabled ####\n"); +		else +			seq_printf(m, "#### all functions enabled ####\n");  		return 0;  	} @@ -2651,10 +2897,22 @@ static int t_show(struct seq_file *m, void *v)  		return 0;  	seq_printf(m, "%ps", (void *)rec->ip); -	if (iter->flags & FTRACE_ITER_ENABLED) +	if (iter->flags & FTRACE_ITER_ENABLED) {  		seq_printf(m, " (%ld)%s", -			   rec->flags & ~FTRACE_FL_MASK, -			   rec->flags & FTRACE_FL_REGS ? " R" : ""); +			   ftrace_rec_count(rec), +			   rec->flags & FTRACE_FL_REGS ? " R" : "  "); +		if (rec->flags & FTRACE_FL_TRAMP_EN) { +			struct ftrace_ops *ops; + +			ops = ftrace_find_tramp_ops_curr(rec); +			if (ops && ops->trampoline) +				seq_printf(m, "\ttramp: %pS", +					   (void *)ops->trampoline); +			else +				seq_printf(m, "\ttramp: ERROR!"); +		} +	}	 +  	seq_printf(m, "\n");  	return 0; @@ -2702,13 +2960,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)  	return iter ? 0 : -ENOMEM;  } -static void ftrace_filter_reset(struct ftrace_hash *hash) -{ -	mutex_lock(&ftrace_lock); -	ftrace_hash_clear(hash); -	mutex_unlock(&ftrace_lock); -} -  /**   * ftrace_regex_open - initialize function tracer filter files   * @ops: The ftrace_ops that hold the hash filters @@ -2758,7 +3009,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,  		hash = ops->filter_hash;  	if (file->f_mode & FMODE_WRITE) { -		iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); +		const int size_bits = FTRACE_HASH_DEFAULT_BITS; + +		if (file->f_flags & O_TRUNC) +			iter->hash = alloc_ftrace_hash(size_bits); +		else +			iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash); +  		if (!iter->hash) {  			trace_parser_put(&iter->parser);  			kfree(iter); @@ -2767,10 +3024,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,  		}  	} -	if ((file->f_mode & FMODE_WRITE) && -	    (file->f_flags & O_TRUNC)) -		ftrace_filter_reset(iter->hash); -  	if (file->f_mode & FMODE_READ) {  		iter->pg = ftrace_pages_start; @@ -3471,14 +3724,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,  	else  		orig_hash = &ops->notrace_hash; -	hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +	if (reset) +		hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); +	else +		hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); +  	if (!hash) {  		ret = -ENOMEM;  		goto out_regex_unlock;  	} -	if (reset) -		ftrace_filter_reset(hash);  	if (buf && !ftrace_match_records(hash, buf, len)) {  		ret = -EINVAL;  		goto out_regex_unlock; @@ -3630,6 +3885,7 @@ __setup("ftrace_filter=", set_ftrace_filter);  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;  static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);  static int __init set_graph_function(char *str) @@ -3639,16 
+3895,29 @@ static int __init set_graph_function(char *str)  }  __setup("ftrace_graph_filter=", set_graph_function); -static void __init set_ftrace_early_graph(char *buf) +static int __init set_graph_notrace_function(char *str) +{ +	strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE); +	return 1; +} +__setup("ftrace_graph_notrace=", set_graph_notrace_function); + +static void __init set_ftrace_early_graph(char *buf, int enable)  {  	int ret;  	char *func; +	unsigned long *table = ftrace_graph_funcs; +	int *count = &ftrace_graph_count; + +	if (!enable) { +		table = ftrace_graph_notrace_funcs; +		count = &ftrace_graph_notrace_count; +	}  	while (buf) {  		func = strsep(&buf, ",");  		/* we allow only one expression at a time */ -		ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, -				      FTRACE_GRAPH_MAX_FUNCS, func); +		ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);  		if (ret)  			printk(KERN_DEBUG "ftrace: function %s not "  					  "traceable\n", func); @@ -3677,7 +3946,9 @@ static void __init set_ftrace_early_filters(void)  		ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  	if (ftrace_graph_buf[0]) -		set_ftrace_early_graph(ftrace_graph_buf); +		set_ftrace_early_graph(ftrace_graph_buf, 1); +	if (ftrace_graph_notrace_buf[0]) +		set_ftrace_early_graph(ftrace_graph_notrace_buf, 0);  #endif /* CONFIG_FUNCTION_GRAPH_TRACER */  } @@ -3819,7 +4090,12 @@ static int g_show(struct seq_file *m, void *v)  		return 0;  	if (ptr == (unsigned long *)1) { -		seq_printf(m, "#### all functions enabled ####\n"); +		struct ftrace_graph_data *fgd = m->private; + +		if (fgd->table == ftrace_graph_funcs) +			seq_printf(m, "#### all functions enabled ####\n"); +		else +			seq_printf(m, "#### no functions disabled ####\n");  		return 0;  	} @@ -4447,9 +4723,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  	struct ftrace_ops *op;  	int bit; -	if (function_trace_stop) -		return; -  	bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);  	if (bit < 0)  		return; @@ -4461,9 +4734,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,  	preempt_disable_notrace();  	do_for_each_ftrace_op(op, ftrace_ops_list) {  		if (ftrace_ops_test(op, ip, regs)) { -			if (WARN_ON(!op->func)) { -				function_trace_stop = 1; -				printk("op=%p %pS\n", op, op); +			if (FTRACE_WARN_ON(!op->func)) { +				pr_warn("op=%p %pS\n", op, op);  				goto out;  			}  			op->func(ip, parent_ip, op, regs); @@ -5084,6 +5356,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,  	/* Function graph doesn't use the .func field of global_ops */  	global_ops.flags |= FTRACE_OPS_FL_STUB; +#ifdef CONFIG_DYNAMIC_FTRACE +	/* Optimize function graph calling (if implemented by arch) */ +	if (FTRACE_GRAPH_TRAMP_ADDR != 0) +		global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR; +#endif +  	ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);  out: @@ -5104,6 +5382,10 @@ void unregister_ftrace_graph(void)  	__ftrace_graph_entry = ftrace_graph_entry_stub;  	ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);  	global_ops.flags &= ~FTRACE_OPS_FL_STUB; +#ifdef CONFIG_DYNAMIC_FTRACE +	if (FTRACE_GRAPH_TRAMP_ADDR != 0) +		global_ops.trampoline = 0; +#endif  	unregister_pm_notifier(&ftrace_suspend_notifier);  	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); @@ -5183,9 +5465,4 @@ void ftrace_graph_exit_task(struct task_struct *t)  	kfree(ret_stack);  } - -void ftrace_graph_stop(void) -{ -	
ftrace_stop(); -}  #endif diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b95381ebdd5e..afb04b9b818a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1689,22 +1689,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,  			if (!cpu_buffer->nr_pages_to_update)  				continue; -			/* The update must run on the CPU that is being updated. */ -			preempt_disable(); -			if (cpu == smp_processor_id() || !cpu_online(cpu)) { +			/* Can't run something on an offline CPU. */ +			if (!cpu_online(cpu)) {  				rb_update_pages(cpu_buffer);  				cpu_buffer->nr_pages_to_update = 0;  			} else { -				/* -				 * Can not disable preemption for schedule_work_on() -				 * on PREEMPT_RT. -				 */ -				preempt_enable();  				schedule_work_on(cpu,  						&cpu_buffer->update_pages_work); -				preempt_disable();  			} -			preempt_enable();  		}  		/* wait for all the updates to complete */ @@ -1742,22 +1734,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,  		get_online_cpus(); -		preempt_disable(); -		/* The update must run on the CPU that is being updated. */ -		if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) +		/* Can't run something on an offline CPU. */ +		if (!cpu_online(cpu_id))  			rb_update_pages(cpu_buffer);  		else { -			/* -			 * Can not disable preemption for schedule_work_on() -			 * on PREEMPT_RT. -			 */ -			preempt_enable();  			schedule_work_on(cpu_id,  					 &cpu_buffer->update_pages_work);  			wait_for_completion(&cpu_buffer->update_done); -			preempt_disable();  		} -		preempt_enable();  		cpu_buffer->nr_pages_to_update = 0;  		put_online_cpus(); @@ -3772,7 +3756,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)  	if (rb_per_cpu_empty(cpu_buffer))  		return NULL; -	if (iter->head >= local_read(&iter->head_page->page->commit)) { +	if (iter->head >= rb_page_size(iter->head_page)) {  		rb_inc_iter(iter);  		goto again;  	} diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 291397e66669..8a528392b1f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -820,11 +820,12 @@ static struct {  	const char *name;  	int in_ns;		/* is this clock in nanoseconds? 
*/  } trace_clocks[] = { -	{ trace_clock_local,	"local",	1 }, -	{ trace_clock_global,	"global",	1 }, -	{ trace_clock_counter,	"counter",	0 }, -	{ trace_clock_jiffies,	"uptime",	0 }, -	{ trace_clock,		"perf",		1 }, +	{ trace_clock_local,		"local",	1 }, +	{ trace_clock_global,		"global",	1 }, +	{ trace_clock_counter,		"counter",	0 }, +	{ trace_clock_jiffies,		"uptime",	0 }, +	{ trace_clock,			"perf",		1 }, +	{ ktime_get_mono_fast_ns,	"mono",		1 },  	ARCH_TRACE_CLOCKS  }; @@ -937,30 +938,6 @@ out:  	return ret;  } -ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) -{ -	int len; -	int ret; - -	if (!cnt) -		return 0; - -	if (s->len <= s->readpos) -		return -EBUSY; - -	len = s->len - s->readpos; -	if (cnt > len) -		cnt = len; -	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); -	if (ret == cnt) -		return -EFAULT; - -	cnt -= ret; - -	s->readpos += cnt; -	return cnt; -} -  static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)  {  	int len; @@ -3699,6 +3676,7 @@ static const char readme_msg[] =  #endif  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  	"  set_graph_function\t- Trace the nested calls of a function (function_graph)\n" +	"  set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"  	"  max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"  #endif  #ifdef CONFIG_TRACER_SNAPSHOT @@ -4238,10 +4216,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,  }  static ssize_t -tracing_max_lat_read(struct file *filp, char __user *ubuf, -		     size_t cnt, loff_t *ppos) +tracing_nsecs_read(unsigned long *ptr, char __user *ubuf, +		   size_t cnt, loff_t *ppos)  { -	unsigned long *ptr = filp->private_data;  	char buf[64];  	int r; @@ -4253,10 +4230,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,  }  static ssize_t -tracing_max_lat_write(struct file *filp, const char __user *ubuf, -		      size_t cnt, loff_t *ppos) +tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf, +		    size_t cnt, loff_t *ppos)  { -	unsigned long *ptr = filp->private_data;  	unsigned long val;  	int ret; @@ -4269,6 +4245,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,  	return cnt;  } +static ssize_t +tracing_thresh_read(struct file *filp, char __user *ubuf, +		    size_t cnt, loff_t *ppos) +{ +	return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos); +} + +static ssize_t +tracing_thresh_write(struct file *filp, const char __user *ubuf, +		     size_t cnt, loff_t *ppos) +{ +	struct trace_array *tr = filp->private_data; +	int ret; + +	mutex_lock(&trace_types_lock); +	ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos); +	if (ret < 0) +		goto out; + +	if (tr->current_trace->update_thresh) { +		ret = tr->current_trace->update_thresh(tr); +		if (ret < 0) +			goto out; +	} + +	ret = cnt; +out: +	mutex_unlock(&trace_types_lock); + +	return ret; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, +		     size_t cnt, loff_t *ppos) +{ +	return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, +		      size_t cnt, loff_t *ppos) +{ +	return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); +} +  static int tracing_open_pipe(struct inode *inode, struct file *filp)  {  	struct trace_array *tr = inode->i_private; @@ -5170,6 +5192,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)  #endif /* 
CONFIG_TRACER_SNAPSHOT */ +static const struct file_operations tracing_thresh_fops = { +	.open		= tracing_open_generic, +	.read		= tracing_thresh_read, +	.write		= tracing_thresh_write, +	.llseek		= generic_file_llseek, +}; +  static const struct file_operations tracing_max_lat_fops = {  	.open		= tracing_open_generic,  	.read		= tracing_max_lat_read, @@ -6107,10 +6136,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts)  	if (!topts)  		return; -	for (cnt = 0; topts[cnt].opt; cnt++) { -		if (topts[cnt].entry) -			debugfs_remove(topts[cnt].entry); -	} +	for (cnt = 0; topts[cnt].opt; cnt++) +		debugfs_remove(topts[cnt].entry);  	kfree(topts);  } @@ -6533,7 +6560,7 @@ static __init int tracer_init_debugfs(void)  	init_tracer_debugfs(&global_trace, d_tracer);  	trace_create_file("tracing_thresh", 0644, d_tracer, -			&tracing_thresh, &tracing_max_lat_fops); +			&global_trace, &tracing_thresh_fops);  	trace_create_file("README", 0444, d_tracer,  			NULL, &tracing_readme_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9258f5a815db..385391fb1d3b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -339,6 +339,7 @@ struct tracer_flags {   * @reset: called when one switches to another tracer   * @start: called when tracing is unpaused (echo 1 > tracing_enabled)   * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @update_thresh: called when tracing_thresh is updated   * @open: called when the trace file is opened   * @pipe_open: called when the trace_pipe file is opened   * @close: called when the trace file is released @@ -357,6 +358,7 @@ struct tracer {  	void			(*reset)(struct trace_array *tr);  	void			(*start)(struct trace_array *tr);  	void			(*stop)(struct trace_array *tr); +	int			(*update_thresh)(struct trace_array *tr);  	void			(*open)(struct trace_iterator *iter);  	void			(*pipe_open)(struct trace_iterator *iter);  	void			(*close)(struct trace_iterator *iter); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 5d12bb407b44..4b9c114ee9de 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -30,6 +30,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,  			return ret;  	} +	/* +	 * We checked and allowed to create parent, +	 * allow children without checking. +	 */ +	if (p_event->parent) +		return 0; + +	/* +	 * It's ok to check current process (owner) permissions in here, +	 * because code below is called only via perf_event_open syscall. +	 */ +  	/* The ftrace function trace is allowed only for root. 
*/  	if (ftrace_event_is_function(tp_event)) {  		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2de53628689f..ef06ce7e9cf8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,6 +8,8 @@   *   */ +#define pr_fmt(fmt) fmt +  #include <linux/workqueue.h>  #include <linux/spinlock.h>  #include <linux/kthread.h> @@ -1491,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,  	dir->entry = debugfs_create_dir(name, parent);  	if (!dir->entry) { -		pr_warning("Failed to create system directory %s\n", name); +		pr_warn("Failed to create system directory %s\n", name);  		__put_system(system);  		goto out_free;  	} @@ -1507,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,  	if (!entry) {  		kfree(system->filter);  		system->filter = NULL; -		pr_warning("Could not create debugfs '%s/filter' entry\n", name); +		pr_warn("Could not create debugfs '%s/filter' entry\n", name);  	}  	trace_create_file("enable", 0644, dir->entry, dir, @@ -1522,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,   out_fail:  	/* Only print this message if failed on memory allocation */  	if (!dir || !system) -		pr_warning("No memory to create event subsystem %s\n", -			   name); +		pr_warn("No memory to create event subsystem %s\n", name);  	return NULL;  } @@ -1551,8 +1552,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)  	name = ftrace_event_name(call);  	file->dir = debugfs_create_dir(name, d_events);  	if (!file->dir) { -		pr_warning("Could not create debugfs '%s' directory\n", -			   name); +		pr_warn("Could not create debugfs '%s' directory\n", name);  		return -1;  	} @@ -1575,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)  	if (list_empty(head)) {  		ret = call->class->define_fields(call);  		if (ret < 0) { -			pr_warning("Could not initialize trace point" -				   " events/%s\n", name); +			pr_warn("Could not initialize trace point events/%s\n", +				name);  			return -1;  		}  	} @@ -1621,7 +1621,6 @@ static void event_remove(struct ftrace_event_call *call)  		if (file->event_call != call)  			continue;  		ftrace_event_enable_disable(file, 0); -		destroy_preds(file);  		/*  		 * The do_for_each_event_file() is  		 * a double loop. 
After finding the call for this @@ -1649,8 +1648,7 @@ static int event_init(struct ftrace_event_call *call)  	if (call->class->raw_init) {  		ret = call->class->raw_init(call);  		if (ret < 0 && ret != -ENOSYS) -			pr_warn("Could not initialize trace events/%s\n", -				name); +			pr_warn("Could not initialize trace events/%s\n", name);  	}  	return ret; @@ -1749,7 +1747,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)  {  	event_remove(call);  	trace_destroy_fields(call); -	destroy_call_preds(call); +	free_event_filter(call->filter); +	call->filter = NULL;  }  static int probe_remove_event_call(struct ftrace_event_call *call) @@ -1895,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr)  	list_for_each_entry(call, &ftrace_events, list) {  		ret = __trace_add_new_event(call, tr);  		if (ret < 0) -			pr_warning("Could not create directory for event %s\n", -				   ftrace_event_name(call)); +			pr_warn("Could not create directory for event %s\n", +				ftrace_event_name(call));  	}  } @@ -2208,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr)  	list_for_each_entry(file, &tr->events, list) {  		ret = event_create_dir(tr->event_dir, file);  		if (ret < 0) -			pr_warning("Could not create directory for event %s\n", -				   ftrace_event_name(file->event_call)); +			pr_warn("Could not create directory for event %s\n", +				ftrace_event_name(file->event_call));  	}  } @@ -2232,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr)  		ret = __trace_early_add_new_event(call, tr);  		if (ret < 0) -			pr_warning("Could not create early event %s\n", -				   ftrace_event_name(call)); +			pr_warn("Could not create early event %s\n", +				ftrace_event_name(call));  	}  } @@ -2280,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)  	entry = debugfs_create_file("set_event", 0644, parent,  				    tr, &ftrace_set_event_fops);  	if (!entry) { -		pr_warning("Could not create debugfs 'set_event' entry\n"); +		pr_warn("Could not create debugfs 'set_event' entry\n");  		return -ENOMEM;  	}  	d_events = debugfs_create_dir("events", parent);  	if (!d_events) { -		pr_warning("Could not create debugfs 'events' directory\n"); +		pr_warn("Could not create debugfs 'events' directory\n");  		return -ENOMEM;  	} @@ -2462,11 +2461,10 @@ static __init int event_trace_init(void)  	entry = debugfs_create_file("available_events", 0444, d_tracer,  				    tr, &ftrace_avail_fops);  	if (!entry) -		pr_warning("Could not create debugfs " -			   "'available_events' entry\n"); +		pr_warn("Could not create debugfs 'available_events' entry\n");  	if (trace_define_common_fields()) -		pr_warning("tracing: Failed to allocate common fields"); +		pr_warn("tracing: Failed to allocate common fields");  	ret = early_event_add_tracer(d_tracer, tr);  	if (ret) @@ -2475,7 +2473,7 @@ static __init int event_trace_init(void)  #ifdef CONFIG_MODULES  	ret = register_module_notifier(&trace_module_nb);  	if (ret) -		pr_warning("Failed to register trace events module notifier\n"); +		pr_warn("Failed to register trace events module notifier\n");  #endif  	return 0;  } @@ -2579,7 +2577,7 @@ static __init void event_trace_self_tests(void)  		 * it and the self test should not be on.  		 
*/  		if (file->flags & FTRACE_EVENT_FL_ENABLED) { -			pr_warning("Enabled event during self test!\n"); +			pr_warn("Enabled event during self test!\n");  			WARN_ON_ONCE(1);  			continue;  		} @@ -2607,8 +2605,8 @@ static __init void event_trace_self_tests(void)  		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);  		if (WARN_ON_ONCE(ret)) { -			pr_warning("error enabling system %s\n", -				   system->name); +			pr_warn("error enabling system %s\n", +				system->name);  			continue;  		} @@ -2616,8 +2614,8 @@ static __init void event_trace_self_tests(void)  		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);  		if (WARN_ON_ONCE(ret)) { -			pr_warning("error disabling system %s\n", -				   system->name); +			pr_warn("error disabling system %s\n", +				system->name);  			continue;  		} @@ -2631,7 +2629,7 @@ static __init void event_trace_self_tests(void)  	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error enabling all events\n"); +		pr_warn("error enabling all events\n");  		return;  	} @@ -2640,7 +2638,7 @@ static __init void event_trace_self_tests(void)  	/* reset sysname */  	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);  	if (WARN_ON_ONCE(ret)) { -		pr_warning("error disabling all events\n"); +		pr_warn("error disabling all events\n");  		return;  	} diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8a8631926a07..7a8c1528e141 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -774,17 +774,12 @@ static void __free_preds(struct event_filter *filter)  	filter->n_preds = 0;  } -static void call_filter_disable(struct ftrace_event_call *call) -{ -	call->flags &= ~TRACE_EVENT_FL_FILTERED; -} -  static void filter_disable(struct ftrace_event_file *file)  {  	struct ftrace_event_call *call = file->event_call;  	if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) -		call_filter_disable(call); +		call->flags &= ~TRACE_EVENT_FL_FILTERED;  	else  		file->flags &= ~FTRACE_EVENT_FL_FILTERED;  } @@ -804,32 +799,6 @@ void free_event_filter(struct event_filter *filter)  	__free_filter(filter);  } -void destroy_call_preds(struct ftrace_event_call *call) -{ -	__free_filter(call->filter); -	call->filter = NULL; -} - -static void destroy_file_preds(struct ftrace_event_file *file) -{ -	__free_filter(file->filter); -	file->filter = NULL; -} - -/* - * Called when destroying the ftrace_event_file. - * The file is being freed, so we do not need to worry about - * the file being currently used. This is for module code removing - * the tracepoints from within it. 
- */ -void destroy_preds(struct ftrace_event_file *file) -{ -	if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER) -		destroy_call_preds(file->event_call); -	else -		destroy_file_preds(file); -} -  static struct event_filter *__alloc_filter(void)  {  	struct event_filter *filter; @@ -873,17 +842,14 @@ static inline void __remove_filter(struct ftrace_event_file *file)  		remove_filter_string(file->filter);  } -static void filter_free_subsystem_preds(struct event_subsystem *system, +static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,  					struct trace_array *tr)  {  	struct ftrace_event_file *file; -	struct ftrace_event_call *call;  	list_for_each_entry(file, &tr->events, list) { -		call = file->event_call; -		if (strcmp(call->class->system, system->name) != 0) +		if (file->system != dir)  			continue; -  		__remove_filter(file);  	}  } @@ -901,15 +867,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)  	}  } -static void filter_free_subsystem_filters(struct event_subsystem *system, +static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,  					  struct trace_array *tr)  {  	struct ftrace_event_file *file; -	struct ftrace_event_call *call;  	list_for_each_entry(file, &tr->events, list) { -		call = file->event_call; -		if (strcmp(call->class->system, system->name) != 0) +		if (file->system != dir)  			continue;  		__free_subsystem_filter(file);  	} @@ -1582,7 +1546,6 @@ static int fold_pred_tree(struct event_filter *filter,  static int replace_preds(struct ftrace_event_call *call,  			 struct event_filter *filter,  			 struct filter_parse_state *ps, -			 char *filter_string,  			 bool dry_run)  {  	char *operand1 = NULL, *operand2 = NULL; @@ -1755,13 +1718,12 @@ struct filter_list {  	struct event_filter	*filter;  }; -static int replace_system_preds(struct event_subsystem *system, +static int replace_system_preds(struct ftrace_subsystem_dir *dir,  				struct trace_array *tr,  				struct filter_parse_state *ps,  				char *filter_string)  {  	struct ftrace_event_file *file; -	struct ftrace_event_call *call;  	struct filter_list *filter_item;  	struct filter_list *tmp;  	LIST_HEAD(filter_list); @@ -1769,15 +1731,14 @@ static int replace_system_preds(struct event_subsystem *system,  	int err;  	list_for_each_entry(file, &tr->events, list) { -		call = file->event_call; -		if (strcmp(call->class->system, system->name) != 0) +		if (file->system != dir)  			continue;  		/*  		 * Try to see if the filter can be applied  		 *  (filter arg is ignored on dry_run)  		 */ -		err = replace_preds(call, NULL, ps, filter_string, true); +		err = replace_preds(file->event_call, NULL, ps, true);  		if (err)  			event_set_no_set_filter_flag(file);  		else @@ -1787,9 +1748,7 @@ static int replace_system_preds(struct event_subsystem *system,  	list_for_each_entry(file, &tr->events, list) {  		struct event_filter *filter; -		call = file->event_call; - -		if (strcmp(call->class->system, system->name) != 0) +		if (file->system != dir)  			continue;  		if (event_no_set_filter_flag(file)) @@ -1811,7 +1770,7 @@ static int replace_system_preds(struct event_subsystem *system,  		if (err)  			goto fail_mem; -		err = replace_preds(call, filter, ps, filter_string, false); +		err = replace_preds(file->event_call, filter, ps, false);  		if (err) {  			filter_disable(file);  			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); @@ -1933,7 +1892,7 @@ static int create_filter(struct ftrace_event_call *call,  	err = create_filter_start(filter_str, 
set_str, &ps, &filter);  	if (!err) { -		err = replace_preds(call, filter, ps, filter_str, false); +		err = replace_preds(call, filter, ps, false);  		if (err && set_str)  			append_filter_err(ps, filter);  	} @@ -1959,7 +1918,7 @@ int create_event_filter(struct ftrace_event_call *call,   * Identical to create_filter() except that it creates a subsystem filter   * and always remembers @filter_str.   */ -static int create_system_filter(struct event_subsystem *system, +static int create_system_filter(struct ftrace_subsystem_dir *dir,  				struct trace_array *tr,  				char *filter_str, struct event_filter **filterp)  { @@ -1969,7 +1928,7 @@ static int create_system_filter(struct event_subsystem *system,  	err = create_filter_start(filter_str, true, &ps, &filter);  	if (!err) { -		err = replace_system_preds(system, tr, ps, filter_str); +		err = replace_system_preds(dir, tr, ps, filter_str);  		if (!err) {  			/* System filters just show a default message */  			kfree(filter->filter_string); @@ -2053,18 +2012,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,  	}  	if (!strcmp(strstrip(filter_string), "0")) { -		filter_free_subsystem_preds(system, tr); +		filter_free_subsystem_preds(dir, tr);  		remove_filter_string(system->filter);  		filter = system->filter;  		system->filter = NULL;  		/* Ensure all filters are no longer used */  		synchronize_sched(); -		filter_free_subsystem_filters(system, tr); +		filter_free_subsystem_filters(dir, tr);  		__free_filter(filter);  		goto out_unlock;  	} -	err = create_system_filter(system, tr, filter_string, &filter); +	err = create_system_filter(dir, tr, filter_string, &filter);  	if (filter) {  		/*  		 * No event actually uses the system filter diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4de3e57f723c..f0a0c982cde3 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -15,6 +15,33 @@  #include "trace.h"  #include "trace_output.h" +static bool kill_ftrace_graph; + +/** + * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called + * + * ftrace_graph_stop() is called when a severe error is detected in + * the function graph tracing. This function is called by the critical + * paths of function graph to keep those paths from doing any more harm. + */ +bool ftrace_graph_is_dead(void) +{ +	return kill_ftrace_graph; +} + +/** + * ftrace_graph_stop - set to permanently disable function graph tracincg + * + * In case of an error int function graph tracing, this is called + * to try to keep function graph tracing from causing any more harm. + * Usually this is pretty severe and this is called to try to at least + * get a warning out to the user. 
+ */ +void ftrace_graph_stop(void) +{ +	kill_ftrace_graph = true; +} +  /* When set, irq functions will be ignored */  static int ftrace_graph_skip_irqs; @@ -92,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,  	unsigned long long calltime;  	int index; +	if (unlikely(ftrace_graph_is_dead())) +		return -EBUSY; +  	if (!current->ret_stack)  		return -EBUSY; @@ -323,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)  	return ret;  } -int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)  {  	if (tracing_thresh)  		return 1; @@ -412,7 +442,7 @@ void set_graph_array(struct trace_array *tr)  	smp_mb();  } -void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)  {  	if (tracing_thresh &&  	    (trace->rettime - trace->calltime < tracing_thresh)) @@ -445,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr)  	unregister_ftrace_graph();  } +static int graph_trace_update_thresh(struct trace_array *tr) +{ +	graph_trace_reset(tr); +	return graph_trace_init(tr); +} +  static int max_bytes_for_cpu;  static enum print_line_t @@ -1399,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)  	seq_printf(s, "               |   |   |   |\n");  } -void print_graph_headers(struct seq_file *s) +static void print_graph_headers(struct seq_file *s)  {  	print_graph_headers_flags(s, tracer_flags.val);  } @@ -1495,6 +1531,7 @@ static struct trace_event graph_trace_ret_event = {  static struct tracer graph_trace __tracer_data = {  	.name		= "function_graph", +	.update_thresh	= graph_trace_update_thresh,  	.open		= graph_trace_open,  	.pipe_open	= graph_trace_open,  	.close		= graph_trace_close, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f3dad80c20b2..c6977d5a9b12 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;  static int next_event_type = __TRACE_LAST_TYPE + 1; -int trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ -	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; -	int ret; - -	ret = seq_write(m, s->buffer, len); - -	/* -	 * Only reset this buffer if we successfully wrote to the -	 * seq_file buffer. -	 */ -	if (!ret) -		trace_seq_init(s); - -	return ret; -} -  enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)  {  	struct trace_seq *s = &iter->seq; @@ -85,257 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)  	return TRACE_TYPE_HANDLED;  } -/** - * trace_seq_printf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * It returns 0 if the trace oversizes the buffer's free - * space, 1 otherwise. - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
-{ -	int len = (PAGE_SIZE - 1) - s->len; -	va_list ap; -	int ret; - -	if (s->full || !len) -		return 0; - -	va_start(ap, fmt); -	ret = vsnprintf(s->buffer + s->len, len, fmt, ap); -	va_end(ap); - -	/* If we can't write it all, don't bother writing anything */ -	if (ret >= len) { -		s->full = 1; -		return 0; -	} - -	s->len += ret; - -	return 1; -} -EXPORT_SYMBOL_GPL(trace_seq_printf); - -/** - * trace_seq_bitmask - put a list of longs as a bitmask print output - * @s:		trace sequence descriptor - * @maskp:	points to an array of unsigned longs that represent a bitmask - * @nmaskbits:	The number of bits that are valid in @maskp - * - * It returns 0 if the trace oversizes the buffer's free - * space, 1 otherwise. - * - * Writes a ASCII representation of a bitmask string into @s. - */ -int -trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, -		  int nmaskbits) -{ -	int len = (PAGE_SIZE - 1) - s->len; -	int ret; - -	if (s->full || !len) -		return 0; - -	ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); -	s->len += ret; - -	return 1; -} -EXPORT_SYMBOL_GPL(trace_seq_bitmask); - -/** - * trace_seq_vprintf - sequence printing of trace information - * @s: trace sequence descriptor - * @fmt: printf format string - * - * The tracer may use either sequence operations or its own - * copy to user routines. To simplify formating of a trace - * trace_seq_printf is used to store strings into a special - * buffer (@s). Then the output may be either used by - * the sequencer or pulled into another buffer. - */ -int -trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) -{ -	int len = (PAGE_SIZE - 1) - s->len; -	int ret; - -	if (s->full || !len) -		return 0; - -	ret = vsnprintf(s->buffer + s->len, len, fmt, args); - -	/* If we can't write it all, don't bother writing anything */ -	if (ret >= len) { -		s->full = 1; -		return 0; -	} - -	s->len += ret; - -	return len; -} -EXPORT_SYMBOL_GPL(trace_seq_vprintf); - -int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) -{ -	int len = (PAGE_SIZE - 1) - s->len; -	int ret; - -	if (s->full || !len) -		return 0; - -	ret = bstr_printf(s->buffer + s->len, len, fmt, binary); - -	/* If we can't write it all, don't bother writing anything */ -	if (ret >= len) { -		s->full = 1; -		return 0; -	} - -	s->len += ret; - -	return len; -} - -/** - * trace_seq_puts - trace sequence printing of simple string - * @s: trace sequence descriptor - * @str: simple string to record - * - * The tracer may use either the sequence operations or its own - * copy to user routines. This function records a simple string - * into a special buffer (@s) for later retrieval by a sequencer - * or other mechanism. 
- */ -int trace_seq_puts(struct trace_seq *s, const char *str) -{ -	int len = strlen(str); - -	if (s->full) -		return 0; - -	if (len > ((PAGE_SIZE - 1) - s->len)) { -		s->full = 1; -		return 0; -	} - -	memcpy(s->buffer + s->len, str, len); -	s->len += len; - -	return len; -} - -int trace_seq_putc(struct trace_seq *s, unsigned char c) -{ -	if (s->full) -		return 0; - -	if (s->len >= (PAGE_SIZE - 1)) { -		s->full = 1; -		return 0; -	} - -	s->buffer[s->len++] = c; - -	return 1; -} -EXPORT_SYMBOL(trace_seq_putc); - -int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) -{ -	if (s->full) -		return 0; - -	if (len > ((PAGE_SIZE - 1) - s->len)) { -		s->full = 1; -		return 0; -	} - -	memcpy(s->buffer + s->len, mem, len); -	s->len += len; - -	return len; -} - -int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) -{ -	unsigned char hex[HEX_CHARS]; -	const unsigned char *data = mem; -	int i, j; - -	if (s->full) -		return 0; - -#ifdef __BIG_ENDIAN -	for (i = 0, j = 0; i < len; i++) { -#else -	for (i = len-1, j = 0; i >= 0; i--) { -#endif -		hex[j++] = hex_asc_hi(data[i]); -		hex[j++] = hex_asc_lo(data[i]); -	} -	hex[j++] = ' '; - -	return trace_seq_putmem(s, hex, j); -} - -void *trace_seq_reserve(struct trace_seq *s, size_t len) -{ -	void *ret; - -	if (s->full) -		return NULL; - -	if (len > ((PAGE_SIZE - 1) - s->len)) { -		s->full = 1; -		return NULL; -	} - -	ret = s->buffer + s->len; -	s->len += len; - -	return ret; -} - -int trace_seq_path(struct trace_seq *s, const struct path *path) -{ -	unsigned char *p; - -	if (s->full) -		return 0; - -	if (s->len >= (PAGE_SIZE - 1)) { -		s->full = 1; -		return 0; -	} - -	p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); -	if (!IS_ERR(p)) { -		p = mangle_path(s->buffer + s->len, p, "\n"); -		if (p) { -			s->len = p - s->buffer; -			return 1; -		} -	} else { -		s->buffer[s->len++] = '?'; -		return 1; -	} - -	s->full = 1; -	return 0; -} -  const char *  ftrace_print_flags_seq(struct trace_seq *p, const char *delim,  		       unsigned long flags, @@ -343,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,  {  	unsigned long mask;  	const char *str; -	const char *ret = p->buffer + p->len; +	const char *ret = trace_seq_buffer_ptr(p);  	int i, first = 1;  	for (i = 0;  flag_array[i].name && flags; i++) { @@ -379,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,  			 const struct trace_print_flags *symbol_array)  {  	int i; -	const char *ret = p->buffer + p->len; +	const char *ret = trace_seq_buffer_ptr(p);  	for (i = 0;  symbol_array[i].name; i++) { @@ -390,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,  		break;  	} -	if (ret == (const char *)(p->buffer + p->len)) +	if (ret == (const char *)(trace_seq_buffer_ptr(p)))  		trace_seq_printf(p, "0x%lx", val);  	trace_seq_putc(p, 0); @@ -405,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,  			 const struct trace_print_flags_u64 *symbol_array)  {  	int i; -	const char *ret = p->buffer + p->len; +	const char *ret = trace_seq_buffer_ptr(p);  	for (i = 0;  symbol_array[i].name; i++) { @@ -416,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,  		break;  	} -	if (ret == (const char *)(p->buffer + p->len)) +	if (ret == (const char *)(trace_seq_buffer_ptr(p)))  		trace_seq_printf(p, "0x%llx", val);  	trace_seq_putc(p, 0); @@ -430,7 +162,7 @@ const char *  ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,  			 unsigned 
int bitmask_size)  { -	const char *ret = p->buffer + p->len; +	const char *ret = trace_seq_buffer_ptr(p);  	trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);  	trace_seq_putc(p, 0); @@ -443,7 +175,7 @@ const char *  ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)  {  	int i; -	const char *ret = p->buffer + p->len; +	const char *ret = trace_seq_buffer_ptr(p);  	for (i = 0; i < buf_len; i++)  		trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 127a9d8c8357..80b25b585a70 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);  extern int __unregister_ftrace_event(struct trace_event *event);  extern struct rw_semaphore trace_event_sem; -#define MAX_MEMHEX_BYTES	8 -#define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1) -  #define SEQ_PUT_FIELD_RET(s, x)				\  do {							\  	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\ @@ -46,7 +43,6 @@ do {							\  #define SEQ_PUT_HEX_FIELD_RET(s, x)			\  do {							\ -	BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES);	\  	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\  		return TRACE_TYPE_PARTIAL_LINE;		\  } while (0) diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c new file mode 100644 index 000000000000..1f24ed99dca2 --- /dev/null +++ b/kernel/trace/trace_seq.c @@ -0,0 +1,428 @@ +/* + * trace_seq.c + * + * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com> + * + * The trace_seq is a handy tool that allows you to pass a descriptor around + * to a buffer that other functions can write to. It is similar to the + * seq_file functionality but has some differences. + * + * To use it, the trace_seq must be initialized with trace_seq_init(). + * This will set up the counters within the descriptor. You can call + * trace_seq_init() more than once to reset the trace_seq to start + * from scratch. + *  + * The buffer size is currently PAGE_SIZE, although it may become dynamic + * in the future. + * + * A write to the buffer will either succed or fail. That is, unlike + * sprintf() there will not be a partial write (well it may write into + * the buffer but it wont update the pointers). This allows users to + * try to write something into the trace_seq buffer and if it fails + * they can flush it and try again. + * + */ +#include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/trace_seq.h> + +/* How much buffer is left on the trace_seq? */ +#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len) + +/* How much buffer is written? */ +#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1)) + +/** + * trace_print_seq - move the contents of trace_seq into a seq_file + * @m: the seq_file descriptor that is the destination + * @s: the trace_seq descriptor that is the source. + * + * Returns 0 on success and non zero on error. If it succeeds to + * write to the seq_file it will reset the trace_seq, otherwise + * it does not modify the trace_seq to let the caller try again. + */ +int trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ +	unsigned int len = TRACE_SEQ_BUF_USED(s); +	int ret; + +	ret = seq_write(m, s->buffer, len); + +	/* +	 * Only reset this buffer if we successfully wrote to the +	 * seq_file buffer. This lets the caller try again or +	 * do something else with the contents. 
+	 */ +	if (!ret) +		trace_seq_init(s); + +	return ret; +} + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf() is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + * + * Returns 1 if we successfully written all the contents to + *   the buffer. +  * Returns 0 if we the length to write is bigger than the + *   reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) +{ +	unsigned int len = TRACE_SEQ_BUF_LEFT(s); +	va_list ap; +	int ret; + +	if (s->full || !len) +		return 0; + +	va_start(ap, fmt); +	ret = vsnprintf(s->buffer + s->len, len, fmt, ap); +	va_end(ap); + +	/* If we can't write it all, don't bother writing anything */ +	if (ret >= len) { +		s->full = 1; +		return 0; +	} + +	s->len += ret; + +	return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_printf); + +/** + * trace_seq_bitmask - write a bitmask array in its ASCII representation + * @s:		trace sequence descriptor + * @maskp:	points to an array of unsigned longs that represent a bitmask + * @nmaskbits:	The number of bits that are valid in @maskp + * + * Writes a ASCII representation of a bitmask string into @s. + * + * Returns 1 if we successfully written all the contents to + *   the buffer. + * Returns 0 if we the length to write is bigger than the + *   reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp, +		      int nmaskbits) +{ +	unsigned int len = TRACE_SEQ_BUF_LEFT(s); +	int ret; + +	if (s->full || !len) +		return 0; + +	ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits); +	s->len += ret; + +	return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_bitmask); + +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ +	unsigned int len = TRACE_SEQ_BUF_LEFT(s); +	int ret; + +	if (s->full || !len) +		return 0; + +	ret = vsnprintf(s->buffer + s->len, len, fmt, args); + +	/* If we can't write it all, don't bother writing anything */ +	if (ret >= len) { +		s->full = 1; +		return 0; +	} + +	s->len += ret; + +	return len; +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + +/** + * trace_seq_bprintf - Write the printf string from binary arguments + * @s: trace sequence descriptor + * @fmt: The format string for the @binary arguments + * @binary: The binary arguments for @fmt. + * + * When recording in a fast path, a printf may be recorded with just + * saving the format and the arguments as they were passed to the + * function, instead of wasting cycles converting the arguments into + * ASCII characters. Instead, the arguments are saved in a 32 bit + * word array that is defined by the format string constraints. 
+ * + * This function will take the format and the binary array and finish + * the conversion into the ASCII string within the buffer. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ +	unsigned int len = TRACE_SEQ_BUF_LEFT(s); +	int ret; + +	if (s->full || !len) +		return 0; + +	ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + +	/* If we can't write it all, don't bother writing anything */ +	if (ret >= len) { +		s->full = 1; +		return 0; +	} + +	s->len += ret; + +	return len; +} +EXPORT_SYMBOL_GPL(trace_seq_bprintf); + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_puts(struct trace_seq *s, const char *str) +{ +	unsigned int len = strlen(str); + +	if (s->full) +		return 0; + +	if (len > TRACE_SEQ_BUF_LEFT(s)) { +		s->full = 1; +		return 0; +	} + +	memcpy(s->buffer + s->len, str, len); +	s->len += len; + +	return len; +} +EXPORT_SYMBOL_GPL(trace_seq_puts); + +/** + * trace_seq_putc - trace sequence printing of simple character + * @s: trace sequence descriptor + * @c: simple character to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple charater + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_putc(struct trace_seq *s, unsigned char c) +{ +	if (s->full) +		return 0; + +	if (TRACE_SEQ_BUF_LEFT(s) < 1) { +		s->full = 1; +		return 0; +	} + +	s->buffer[s->len++] = c; + +	return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_putc); + +/** + * trace_seq_putmem - write raw data into the trace_seq buffer + * @s: trace sequence descriptor + * @mem: The raw memory to copy into the buffer + * @len: The length of the raw memory to copy (in bytes) + * + * There may be cases where raw memory needs to be written into the + * buffer and a strcpy() would not work. Using this function allows + * for such cases. + * + * Returns how much it wrote to the buffer. + */ +int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len) +{ +	if (s->full) +		return 0; + +	if (len > TRACE_SEQ_BUF_LEFT(s)) { +		s->full = 1; +		return 0; +	} + +	memcpy(s->buffer + s->len, mem, len); +	s->len += len; + +	return len; +} +EXPORT_SYMBOL_GPL(trace_seq_putmem); + +#define MAX_MEMHEX_BYTES	8U +#define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1) + +/** + * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex + * @s: trace sequence descriptor + * @mem: The raw memory to write its hex ASCII representation of + * @len: The length of the raw memory to copy (in bytes) + * + * This is similar to trace_seq_putmem() except instead of just copying the + * raw memory into the buffer it writes its ASCII representation of it + * in hex characters. + * + * Returns how much it wrote to the buffer. 
+ */ +int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, +			 unsigned int len) +{ +	unsigned char hex[HEX_CHARS]; +	const unsigned char *data = mem; +	unsigned int start_len; +	int i, j; +	int cnt = 0; + +	if (s->full) +		return 0; + +	while (len) { +		start_len = min(len, HEX_CHARS - 1); +#ifdef __BIG_ENDIAN +		for (i = 0, j = 0; i < start_len; i++) { +#else +		for (i = start_len-1, j = 0; i >= 0; i--) { +#endif +			hex[j++] = hex_asc_hi(data[i]); +			hex[j++] = hex_asc_lo(data[i]); +		} +		if (WARN_ON_ONCE(j == 0 || j/2 > len)) +			break; + +		/* j increments twice per loop */ +		len -= j / 2; +		hex[j++] = ' '; + +		cnt += trace_seq_putmem(s, hex, j); +	} +	return cnt; +} +EXPORT_SYMBOL_GPL(trace_seq_putmem_hex); + +/** + * trace_seq_path - copy a path into the sequence buffer + * @s: trace sequence descriptor + * @path: path to write into the sequence buffer. + * + * Write a path name into the sequence buffer. + * + * Returns 1 if we successfully written all the contents to + *   the buffer. + * Returns 0 if we the length to write is bigger than the + *   reserved buffer space. In this case, nothing gets written. + */ +int trace_seq_path(struct trace_seq *s, const struct path *path) +{ +	unsigned char *p; + +	if (s->full) +		return 0; + +	if (TRACE_SEQ_BUF_LEFT(s) < 1) { +		s->full = 1; +		return 0; +	} + +	p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); +	if (!IS_ERR(p)) { +		p = mangle_path(s->buffer + s->len, p, "\n"); +		if (p) { +			s->len = p - s->buffer; +			return 1; +		} +	} else { +		s->buffer[s->len++] = '?'; +		return 1; +	} + +	s->full = 1; +	return 0; +} +EXPORT_SYMBOL_GPL(trace_seq_path); + +/** + * trace_seq_to_user - copy the squence buffer to user space + * @s: trace sequence descriptor + * @ubuf: The userspace memory location to copy to + * @cnt: The amount to copy + * + * Copies the sequence buffer into the userspace memory pointed to + * by @ubuf. It starts from the last read position (@s->readpos) + * and writes up to @cnt characters or till it reaches the end of + * the content in the buffer (@s->len), which ever comes first. + * + * On success, it returns a positive number of the number of bytes + * it copied. + * + * On failure it returns -EBUSY if all of the content in the + * sequence has been already read, which includes nothing in the + * sequenc (@s->len == @s->readpos). + * + * Returns -EFAULT if the copy to userspace fails. 
+ */ +int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt) +{ +	int len; +	int ret; + +	if (!cnt) +		return 0; + +	if (s->len <= s->readpos) +		return -EBUSY; + +	len = s->len - s->readpos; +	if (cnt > len) +		cnt = len; +	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); +	if (ret == cnt) +		return -EFAULT; + +	cnt -= ret; + +	s->readpos += cnt; +	return cnt; +} +EXPORT_SYMBOL_GPL(trace_seq_to_user); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 3c9b97e6b1f4..33ff6a24b802 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)  	if (is_ret)  		tu->consumer.ret_handler = uretprobe_dispatcher;  	init_trace_uprobe_filter(&tu->filter); -	tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;  	return tu;  error: @@ -1292,7 +1291,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)  		kfree(call->print_fmt);  		return -ENODEV;  	} -	call->flags = 0; +  	call->class->reg = trace_uprobe_register;  	call->data = tu;  	ret = trace_add_event_call(call); diff --git a/kernel/tsacct.c b/kernel/tsacct.c index a1dd9a1b1327..975cb49e32bf 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -31,20 +31,19 @@ void bacct_add_tsk(struct user_namespace *user_ns,  		   struct taskstats *stats, struct task_struct *tsk)  {  	const struct cred *tcred; -	struct timespec uptime, ts;  	cputime_t utime, stime, utimescaled, stimescaled; -	u64 ac_etime; +	u64 delta;  	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); -	/* calculate task elapsed time in timespec */ -	do_posix_clock_monotonic_gettime(&uptime); -	ts = timespec_sub(uptime, tsk->start_time); -	/* rebase elapsed time to usec (should never be negative) */ -	ac_etime = timespec_to_ns(&ts); -	do_div(ac_etime, NSEC_PER_USEC); -	stats->ac_etime = ac_etime; -	stats->ac_btime = get_seconds() - ts.tv_sec; +	/* calculate task elapsed time in nsec */ +	delta = ktime_get_ns() - tsk->start_time; +	/* Convert to micro seconds */ +	do_div(delta, NSEC_PER_USEC); +	stats->ac_etime = delta; +	/* Convert to seconds for btime */ +	do_div(delta, USEC_PER_SEC); +	stats->ac_btime = get_seconds() - delta;  	if (thread_group_leader(tsk)) {  		stats->ac_exitcode = tsk->exit_code;  		if (tsk->flags & PF_FORKNOEXEC) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc02560fd6b..aa312b0dc3ec 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v)  	return;  } -struct seq_operations proc_uid_seq_operations = { +const struct seq_operations proc_uid_seq_operations = {  	.start = uid_m_start,  	.stop = m_stop,  	.next = m_next,  	.show = uid_m_show,  }; -struct seq_operations proc_gid_seq_operations = { +const struct seq_operations proc_gid_seq_operations = {  	.start = gid_m_start,  	.stop = m_stop,  	.next = m_next,  	.show = gid_m_show,  }; -struct seq_operations proc_projid_seq_operations = { +const struct seq_operations proc_projid_seq_operations = {  	.start = projid_m_start,  	.stop = m_stop,  	.next = m_next, diff --git a/kernel/utsname.c b/kernel/utsname.c index fd393124e507..883aaaa7de8a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)  	struct uts_namespace *ns = NULL;  	struct nsproxy *nsproxy; -	rcu_read_lock(); -	nsproxy = task_nsproxy(task); +	task_lock(task); +	nsproxy = task->nsproxy;  	if (nsproxy) {  		ns = nsproxy->uts_ns;  		
get_uts_ns(ns);  	} -	rcu_read_unlock(); +	task_unlock(task);  	return ns;  } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd1b040..a8d6914030fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event,  			return;  		if (hardlockup_panic) -			panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); +			panic("Watchdog detected hard LOCKUP on cpu %d", +			      this_cpu);  		else -			WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); +			WARN(1, "Watchdog detected hard LOCKUP on cpu %d", +			     this_cpu);  		__this_cpu_write(hard_watchdog_warn, true);  		return; @@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)  			}  		} -		printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", +		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",  			smp_processor_id(), duration,  			current->comm, task_pid_nr(current));  		print_modules(); @@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)  			smp_mb__after_atomic();  		} +		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);  		if (softlockup_panic)  			panic("softlockup: hung tasks");  		__this_cpu_write(soft_watchdog_warn, true); @@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu)  	if (PTR_ERR(event) == -EOPNOTSUPP)  		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);  	else if (PTR_ERR(event) == -ENOENT) -		pr_warning("disabled (cpu%i): hardware events not enabled\n", +		pr_warn("disabled (cpu%i): hardware events not enabled\n",  			 cpu);  	else  		pr_err("disabled (cpu%i): unable to create perf event: %ld\n", diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 35974ac69600..5dbe22aa3efd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -265,7 +265,6 @@ struct workqueue_struct {  static struct kmem_cache *pwq_cache; -static int wq_numa_tbl_len;		/* highest possible NUMA node id + 1 */  static cpumask_var_t *wq_numa_possible_cpumask;  					/* possible CPUs of each node */ @@ -758,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool)  	int nr_idle = pool->nr_idle + managing; /* manager is considered idle */  	int nr_busy = pool->nr_workers - nr_idle; -	/* -	 * nr_idle and idle_list may disagree if idle rebinding is in -	 * progress.  Never return %true if idle_list is empty. -	 */ -	if (list_empty(&pool->idle_list)) -		return false; -  	return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;  } @@ -850,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)  	pool = worker->pool;  	/* this can only happen on the local cpu */ -	if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) +	if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))  		return NULL;  	/* @@ -874,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)   * worker_set_flags - set worker flags and adjust nr_running accordingly   * @worker: self   * @flags: flags to set - * @wakeup: wakeup an idle worker if necessary   * - * Set @flags in @worker->flags and adjust nr_running accordingly.  If - * nr_running becomes zero and @wakeup is %true, an idle worker is - * woken up. + * Set @flags in @worker->flags and adjust nr_running accordingly.   
*   * CONTEXT:   * spin_lock_irq(pool->lock)   */ -static inline void worker_set_flags(struct worker *worker, unsigned int flags, -				    bool wakeup) +static inline void worker_set_flags(struct worker *worker, unsigned int flags)  {  	struct worker_pool *pool = worker->pool;  	WARN_ON_ONCE(worker->task != current); -	/* -	 * If transitioning into NOT_RUNNING, adjust nr_running and -	 * wake up an idle worker as necessary if requested by -	 * @wakeup. -	 */ +	/* If transitioning into NOT_RUNNING, adjust nr_running. */  	if ((flags & WORKER_NOT_RUNNING) &&  	    !(worker->flags & WORKER_NOT_RUNNING)) { -		if (wakeup) { -			if (atomic_dec_and_test(&pool->nr_running) && -			    !list_empty(&pool->worklist)) -				wake_up_worker(pool); -		} else -			atomic_dec(&pool->nr_running); +		atomic_dec(&pool->nr_running);  	}  	worker->flags |= flags; @@ -1232,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,  			pwq_activate_delayed_work(work);  		list_del_init(&work->entry); -		pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work)); +		pwq_dec_nr_in_flight(pwq, get_work_color(work));  		/* work->data points to pwq iff queued, point to pool */  		set_work_pool_and_keep_pending(work, pool->id); @@ -1560,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker)  			 (worker->hentry.next || worker->hentry.pprev)))  		return; -	/* can't use worker_set_flags(), also called from start_worker() */ +	/* can't use worker_set_flags(), also called from create_worker() */  	worker->flags |= WORKER_IDLE;  	pool->nr_idle++;  	worker->last_active = jiffies; @@ -1602,11 +1581,11 @@ static void worker_leave_idle(struct worker *worker)  	list_del_init(&worker->entry);  } -static struct worker *alloc_worker(void) +static struct worker *alloc_worker(int node)  {  	struct worker *worker; -	worker = kzalloc(sizeof(*worker), GFP_KERNEL); +	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);  	if (worker) {  		INIT_LIST_HEAD(&worker->entry);  		INIT_LIST_HEAD(&worker->scheduled); @@ -1670,6 +1649,9 @@ static void worker_detach_from_pool(struct worker *worker,  		detach_completion = pool->detach_completion;  	mutex_unlock(&pool->attach_mutex); +	/* clear leftover flags without pool->lock after it is detached */ +	worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND); +  	if (detach_completion)  		complete(detach_completion);  } @@ -1678,8 +1660,7 @@ static void worker_detach_from_pool(struct worker *worker,   * create_worker - create a new workqueue worker   * @pool: pool the new worker will belong to   * - * Create a new worker which is attached to @pool.  The new worker must be - * started by start_worker(). + * Create and start a new worker which is attached to @pool.   *   * CONTEXT:   * Might sleep.  Does GFP_KERNEL allocations. 
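
The worker_set_flags() hunk above drops the old wakeup argument and keeps only the nr_running accounting: a worker that picks up any WORKER_NOT_RUNNING flag stops being counted as running, and it is counted again only once the last such flag is cleared. Below is a small standalone userspace model of that bookkeeping, not kernel code; the flag values and the model_clr_flags() counterpart are illustrative assumptions, only the transition test mirrors the hunk above.

#include <stdio.h>

/* Illustrative flag values; the kernel's real WORKER_* bits differ. */
#define WORKER_PREP		(1 << 0)
#define WORKER_CPU_INTENSIVE	(1 << 1)
#define WORKER_UNBOUND		(1 << 2)
#define WORKER_NOT_RUNNING	(WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND)

struct pool   { int nr_running; };
struct worker { unsigned int flags; struct pool *pool; };

/* Mirrors worker_set_flags(): only the first transition into
 * NOT_RUNNING decrements nr_running; further flags leave it alone. */
static void model_set_flags(struct worker *w, unsigned int flags)
{
	if ((flags & WORKER_NOT_RUNNING) && !(w->flags & WORKER_NOT_RUNNING))
		w->pool->nr_running--;
	w->flags |= flags;
}

/* Assumed counterpart: re-increment only when the last NOT_RUNNING
 * flag goes away. */
static void model_clr_flags(struct worker *w, unsigned int flags)
{
	unsigned int oflags = w->flags;

	w->flags &= ~flags;
	if ((oflags & WORKER_NOT_RUNNING) && !(w->flags & WORKER_NOT_RUNNING))
		w->pool->nr_running++;
}

int main(void)
{
	struct pool p = { .nr_running = 1 };
	struct worker w = { .flags = 0, .pool = &p };

	model_set_flags(&w, WORKER_CPU_INTENSIVE);	/* counted out: 1 -> 0 */
	model_set_flags(&w, WORKER_PREP);		/* already out: stays 0 */
	model_clr_flags(&w, WORKER_CPU_INTENSIVE);	/* PREP still set: stays 0 */
	model_clr_flags(&w, WORKER_PREP);		/* counted back in: 0 -> 1 */
	printf("nr_running = %d\n", p.nr_running);	/* prints 1 */
	return 0;
}

This is why the cpu_intensive path further below can simply call worker_set_flags(worker, WORKER_CPU_INTENSIVE) and rely on the following need_more_worker() check to chain execution of the pending work items.
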
@@ -1698,7 +1679,7 @@ static struct worker *create_worker(struct worker_pool *pool)  	if (id < 0)  		goto fail; -	worker = alloc_worker(); +	worker = alloc_worker(pool->node);  	if (!worker)  		goto fail; @@ -1724,6 +1705,13 @@ static struct worker *create_worker(struct worker_pool *pool)  	/* successful, attach the worker to the pool */  	worker_attach_to_pool(worker, pool); +	/* start the newly created worker */ +	spin_lock_irq(&pool->lock); +	worker->pool->nr_workers++; +	worker_enter_idle(worker); +	wake_up_process(worker->task); +	spin_unlock_irq(&pool->lock); +  	return worker;  fail: @@ -1734,44 +1722,6 @@ fail:  }  /** - * start_worker - start a newly created worker - * @worker: worker to start - * - * Make the pool aware of @worker and start it. - * - * CONTEXT: - * spin_lock_irq(pool->lock). - */ -static void start_worker(struct worker *worker) -{ -	worker->pool->nr_workers++; -	worker_enter_idle(worker); -	wake_up_process(worker->task); -} - -/** - * create_and_start_worker - create and start a worker for a pool - * @pool: the target pool - * - * Grab the managership of @pool and create and start a new worker for it. - * - * Return: 0 on success. A negative error code otherwise. - */ -static int create_and_start_worker(struct worker_pool *pool) -{ -	struct worker *worker; - -	worker = create_worker(pool); -	if (worker) { -		spin_lock_irq(&pool->lock); -		start_worker(worker); -		spin_unlock_irq(&pool->lock); -	} - -	return worker ? 0 : -ENOMEM; -} - -/**   * destroy_worker - destroy a workqueue worker   * @worker: worker to be destroyed   * @@ -1909,23 +1859,10 @@ restart:  	mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);  	while (true) { -		struct worker *worker; - -		worker = create_worker(pool); -		if (worker) { -			del_timer_sync(&pool->mayday_timer); -			spin_lock_irq(&pool->lock); -			start_worker(worker); -			if (WARN_ON_ONCE(need_to_create_worker(pool))) -				goto restart; -			return true; -		} - -		if (!need_to_create_worker(pool)) +		if (create_worker(pool) || !need_to_create_worker(pool))  			break; -		__set_current_state(TASK_INTERRUPTIBLE); -		schedule_timeout(CREATE_COOLDOWN); +		schedule_timeout_interruptible(CREATE_COOLDOWN);  		if (!need_to_create_worker(pool))  			break; @@ -1933,6 +1870,11 @@ restart:  	del_timer_sync(&pool->mayday_timer);  	spin_lock_irq(&pool->lock); +	/* +	 * This is necessary even after a new worker was just successfully +	 * created as @pool->lock was dropped and the new worker might have +	 * already become busy. +	 */  	if (need_to_create_worker(pool))  		goto restart;  	return true; @@ -2020,13 +1962,8 @@ __acquires(&pool->lock)  	lockdep_copy_map(&lockdep_map, &work->lockdep_map);  #endif -	/* -	 * Ensure we're on the correct CPU.  DISASSOCIATED test is -	 * necessary to avoid spurious warnings from rescuers servicing the -	 * unbound or a disassociated pool. -	 */ -	WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && -		     !(pool->flags & POOL_DISASSOCIATED) && +	/* ensure we're on the correct CPU */ +	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&  		     raw_smp_processor_id() != pool->cpu);  	/* @@ -2052,17 +1989,22 @@ __acquires(&pool->lock)  	list_del_init(&work->entry);  	/* -	 * CPU intensive works don't participate in concurrency -	 * management.  They're the scheduler's responsibility. +	 * CPU intensive works don't participate in concurrency management. +	 * They're the scheduler's responsibility.  
This takes @worker out +	 * of concurrency management and the next code block will chain +	 * execution of the pending work items.  	 */  	if (unlikely(cpu_intensive)) -		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true); +		worker_set_flags(worker, WORKER_CPU_INTENSIVE);  	/* -	 * Unbound pool isn't concurrency managed and work items should be -	 * executed ASAP.  Wake up another worker if necessary. +	 * Wake up another worker if necessary.  The condition is always +	 * false for normal per-cpu workers since nr_running would always +	 * be >= 1 at this point.  This is used to chain execution of the +	 * pending work items for WORKER_NOT_RUNNING workers such as the +	 * UNBOUND and CPU_INTENSIVE ones.  	 */ -	if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) +	if (need_more_worker(pool))  		wake_up_worker(pool);  	/* @@ -2218,7 +2160,7 @@ recheck:  		}  	} while (keep_working(pool)); -	worker_set_flags(worker, WORKER_PREP, false); +	worker_set_flags(worker, WORKER_PREP);  sleep:  	/*  	 * pool->lock is held and there's no work to process and no need to @@ -2311,29 +2253,27 @@ repeat:  				move_linked_works(work, scheduled, &n);  		process_scheduled_works(rescuer); -		spin_unlock_irq(&pool->lock); - -		worker_detach_from_pool(rescuer, pool); - -		spin_lock_irq(&pool->lock);  		/*  		 * Put the reference grabbed by send_mayday().  @pool won't -		 * go away while we're holding its lock. +		 * go away while we're still attached to it.  		 */  		put_pwq(pwq);  		/* -		 * Leave this pool.  If keep_working() is %true, notify a +		 * Leave this pool.  If need_more_worker() is %true, notify a  		 * regular worker; otherwise, we end up with 0 concurrency  		 * and stalling the execution.  		 */ -		if (keep_working(pool)) +		if (need_more_worker(pool))  			wake_up_worker(pool);  		rescuer->pool = NULL; -		spin_unlock(&pool->lock); -		spin_lock(&wq_mayday_lock); +		spin_unlock_irq(&pool->lock); + +		worker_detach_from_pool(rescuer, pool); + +		spin_lock_irq(&wq_mayday_lock);  	}  	spin_unlock_irq(&wq_mayday_lock); @@ -3458,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool)  		return;  	/* sanity checks */ -	if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || +	if (WARN_ON(!(pool->cpu < 0)) ||  	    WARN_ON(!list_empty(&pool->worklist)))  		return; @@ -3524,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)  	hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {  		if (wqattrs_equal(pool->attrs, attrs)) {  			pool->refcnt++; -			goto out_unlock; +			return pool;  		}  	} @@ -3557,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)  		goto fail;  	/* create and start the initial worker */ -	if (create_and_start_worker(pool) < 0) +	if (!create_worker(pool))  		goto fail;  	/* install */  	hash_add(unbound_pool_hash, &pool->hash_node, hash); -out_unlock: +  	return pool;  fail:  	if (pool) @@ -3591,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work)  	if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))  		return; -	/* -	 * Unlink @pwq.  Synchronization against wq->mutex isn't strictly -	 * necessary on release but do it anyway.  It's easier to verify -	 * and consistent with the linking path. -	 */  	mutex_lock(&wq->mutex);  	list_del_rcu(&pwq->pwqs_node);  	is_last = list_empty(&wq->pwqs); @@ -3692,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq)  	if (!list_empty(&pwq->pwqs_node))  		return; -	/* -	 * Set the matching work_color.  
This is synchronized with -	 * wq->mutex to avoid confusing flush_workqueue(). -	 */ +	/* set the matching work_color */  	pwq->work_color = wq->work_color;  	/* sync max_active to the current setting */ @@ -3832,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,  	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))  		return -EINVAL; -	pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); +	pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);  	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);  	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);  	if (!pwq_tbl || !new_attrs || !tmp_attrs) @@ -4080,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  	/* allocate wq and format name */  	if (flags & WQ_UNBOUND) -		tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); +		tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);  	wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);  	if (!wq) @@ -4122,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,  	if (flags & WQ_MEM_RECLAIM) {  		struct worker *rescuer; -		rescuer = alloc_worker(); +		rescuer = alloc_worker(NUMA_NO_NODE);  		if (!rescuer)  			goto err_destroy; @@ -4470,8 +4402,6 @@ static void wq_unbind_fn(struct work_struct *work)  	struct worker *worker;  	for_each_cpu_worker_pool(pool, cpu) { -		WARN_ON_ONCE(cpu != smp_processor_id()); -  		mutex_lock(&pool->attach_mutex);  		spin_lock_irq(&pool->lock); @@ -4543,6 +4473,7 @@ static void rebind_workers(struct worker_pool *pool)  						  pool->attrs->cpumask) < 0);  	spin_lock_irq(&pool->lock); +	pool->flags &= ~POOL_DISASSOCIATED;  	for_each_pool_worker(worker, pool) {  		unsigned int worker_flags = worker->flags; @@ -4632,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,  		for_each_cpu_worker_pool(pool, cpu) {  			if (pool->nr_workers)  				continue; -			if (create_and_start_worker(pool) < 0) +			if (!create_worker(pool))  				return NOTIFY_BAD;  		}  		break; @@ -4644,15 +4575,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,  		for_each_pool(pool, pi) {  			mutex_lock(&pool->attach_mutex); -			if (pool->cpu == cpu) { -				spin_lock_irq(&pool->lock); -				pool->flags &= ~POOL_DISASSOCIATED; -				spin_unlock_irq(&pool->lock); - +			if (pool->cpu == cpu)  				rebind_workers(pool); -			} else if (pool->cpu < 0) { +			else if (pool->cpu < 0)  				restore_unbound_workers_cpumask(pool, cpu); -			}  			mutex_unlock(&pool->attach_mutex);  		} @@ -4856,10 +4782,6 @@ static void __init wq_numa_init(void)  	cpumask_var_t *tbl;  	int node, cpu; -	/* determine NUMA pwq table len - highest node id + 1 */ -	for_each_node(node) -		wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); -  	if (num_possible_nodes() <= 1)  		return; @@ -4876,7 +4798,7 @@ static void __init wq_numa_init(void)  	 * available.  Build one from cpu_to_node() which should have been  	 * fully initialized by now.  	 */ -	tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); +	tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);  	BUG_ON(!tbl);  	for_each_node(node) @@ -4936,7 +4858,7 @@ static int __init init_workqueues(void)  		for_each_cpu_worker_pool(pool, cpu) {  			pool->flags &= ~POOL_DISASSOCIATED; -			BUG_ON(create_and_start_worker(pool) < 0); +			BUG_ON(!create_worker(pool));  		}  	}  | 
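The allocations above switch from the locally computed wq_numa_tbl_len to the generic nr_node_ids, which already equals the highest possible node id plus one, so the per-node pwq table keeps one slot per possible node id. A minimal user-space sketch of that sizing rule, with hypothetical demo_* names (not kernel code):

/*
 * Sketch: a per-node pointer table indexed by node id needs exactly
 * "highest node id + 1" slots, even when the node ids are sparse.
 */
#include <stdio.h>
#include <stdlib.h>

struct demo_pwq { int node; };

int main(void)
{
	int demo_node_ids[] = { 0, 1, 3 };	/* sparse ids: node 2 absent */
	int demo_nr_node_ids = 4;		/* highest id (3) + 1 */
	struct demo_pwq **tbl;
	size_t i;

	/* one slot per possible node id, as in the nr_node_ids-sized pwq_tbl */
	tbl = calloc(demo_nr_node_ids, sizeof(tbl[0]));
	if (!tbl)
		return 1;

	for (i = 0; i < sizeof(demo_node_ids) / sizeof(demo_node_ids[0]); i++) {
		int node = demo_node_ids[i];

		tbl[node] = malloc(sizeof(*tbl[node]));
		if (!tbl[node])
			return 1;
		tbl[node]->node = node;
	}

	printf("slot for node 3: %p\n", (void *)tbl[3]);
	return 0;
}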
