summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/bpf/Makefile3
-rw-r--r--kernel/bpf/arraymap.c40
-rw-r--r--kernel/bpf/bloom_filter.c41
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c24
-rw-r--r--kernel/bpf/bpf_inode_storage.c61
-rw-r--r--kernel/bpf/bpf_iter.c70
-rw-r--r--kernel/bpf/bpf_local_storage.c371
-rw-r--r--kernel/bpf/bpf_struct_ops.c276
-rw-r--r--kernel/bpf/bpf_task_storage.c28
-rw-r--r--kernel/bpf/btf.c467
-rw-r--r--kernel/bpf/cgroup.c100
-rw-r--r--kernel/bpf/cgroup_iter.c4
-rw-r--r--kernel/bpf/core.c13
-rw-r--r--kernel/bpf/cpumap.c18
-rw-r--r--kernel/bpf/cpumask.c87
-rw-r--r--kernel/bpf/devmap.c50
-rw-r--r--kernel/bpf/hashtab.c140
-rw-r--r--kernel/bpf/helpers.c509
-rw-r--r--kernel/bpf/local_storage.c17
-rw-r--r--kernel/bpf/log.c330
-rw-r--r--kernel/bpf/lpm_trie.c17
-rw-r--r--kernel/bpf/map_in_map.c15
-rw-r--r--kernel/bpf/memalloc.c59
-rw-r--r--kernel/bpf/offload.c6
-rw-r--r--kernel/bpf/queue_stack_maps.c32
-rw-r--r--kernel/bpf/reuseport_array.c10
-rw-r--r--kernel/bpf/ringbuf.c26
-rw-r--r--kernel/bpf/stackmap.c20
-rw-r--r--kernel/bpf/syscall.c170
-rw-r--r--kernel/bpf/trampoline.c40
-rw-r--r--kernel/bpf/verifier.c2381
-rw-r--r--kernel/cgroup/cgroup-v1.c16
-rw-r--r--kernel/cgroup/cgroup.c86
-rw-r--r--kernel/cgroup/cpuset.c208
-rw-r--r--kernel/cgroup/legacy_freezer.c7
-rw-r--r--kernel/cgroup/rstat.c8
-rw-r--r--kernel/compat.c2
-rw-r--r--kernel/configs/android-base.config159
-rw-r--r--kernel/configs/android-recommended.config127
-rw-r--r--kernel/configs/tiny.config1
-rw-r--r--kernel/cpu.c25
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/delayacct.c14
-rw-r--r--kernel/dma/map_benchmark.c1
-rw-r--r--kernel/dma/pool.c6
-rw-r--r--kernel/dma/swiotlb.c51
-rw-r--r--kernel/entry/common.c5
-rw-r--r--kernel/entry/syscall_user_dispatch.c74
-rw-r--r--kernel/events/core.c24
-rw-r--r--kernel/events/hw_breakpoint_test.c1
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/exit.c4
-rw-r--r--kernel/fork.c306
-rw-r--r--kernel/hung_task.c10
-rw-r--r--kernel/irq/manage.c5
-rw-r--r--kernel/irq_work.c12
-rw-r--r--kernel/kallsyms.c5
-rw-r--r--kernel/kallsyms_selftest.c6
-rw-r--r--kernel/kcsan/Makefile2
-rw-r--r--kernel/kcsan/core.c17
-rw-r--r--kernel/kcsan/kcsan_test.c20
-rw-r--r--kernel/kexec_file.c6
-rw-r--r--kernel/kheaders.c10
-rw-r--r--kernel/ksysfs.c22
-rw-r--r--kernel/kthread.c55
-rw-r--r--kernel/livepatch/core.c10
-rw-r--r--kernel/livepatch/transition.c122
-rw-r--r--kernel/locking/lockdep.c64
-rw-r--r--kernel/locking/locktorture.c188
-rw-r--r--kernel/locking/test-ww_mutex.c2
-rw-r--r--kernel/module/Kconfig100
-rw-r--r--kernel/module/Makefile6
-rw-r--r--kernel/module/decompress.c6
-rw-r--r--kernel/module/dups.c246
-rw-r--r--kernel/module/internal.h141
-rw-r--r--kernel/module/kallsyms.c94
-rw-r--r--kernel/module/kdb.c17
-rw-r--r--kernel/module/kmod.c (renamed from kernel/kmod.c)49
-rw-r--r--kernel/module/livepatch.c10
-rw-r--r--kernel/module/main.c1091
-rw-r--r--kernel/module/procfs.c16
-rw-r--r--kernel/module/stats.c430
-rw-r--r--kernel/module/strict_rwx.c99
-rw-r--r--kernel/module/tracking.c7
-rw-r--r--kernel/module/tree_lookup.c39
-rw-r--r--kernel/notifier.c6
-rw-r--r--kernel/nsproxy.c17
-rw-r--r--kernel/padata.c4
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/params.c2
-rw-r--r--kernel/pid.c19
-rw-r--r--kernel/power/main.c59
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/printk/printk.c17
-rw-r--r--kernel/ptrace.c9
-rw-r--r--kernel/rcu/Kconfig3
-rw-r--r--kernel/rcu/rcu.h43
-rw-r--r--kernel/rcu/rcuscale.c9
-rw-r--r--kernel/rcu/rcutorture.c234
-rw-r--r--kernel/rcu/refscale.c2
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/srcutree.c438
-rw-r--r--kernel/rcu/tasks.h33
-rw-r--r--kernel/rcu/tree.c45
-rw-r--r--kernel/rcu/tree_exp.h16
-rw-r--r--kernel/rcu/tree_nocb.h4
-rw-r--r--kernel/sched/clock.c3
-rw-r--r--kernel/sched/core.c714
-rw-r--r--kernel/sched/deadline.c11
-rw-r--r--kernel/sched/debug.c52
-rw-r--r--kernel/sched/fair.c144
-rw-r--r--kernel/sched/idle.c2
-rw-r--r--kernel/sched/psi.c473
-rw-r--r--kernel/sched/rt.c23
-rw-r--r--kernel/sched/sched.h243
-rw-r--r--kernel/sched/smp.h2
-rw-r--r--kernel/sched/topology.c4
-rw-r--r--kernel/seccomp.c15
-rw-r--r--kernel/signal.c21
-rw-r--r--kernel/smp.c313
-rw-r--r--kernel/softirq.c9
-rw-r--r--kernel/sys.c111
-rw-r--r--kernel/sysctl.c122
-rw-r--r--kernel/time/alarmtimer.c3
-rw-r--r--kernel/time/posix-cpu-timers.c81
-rw-r--r--kernel/time/posix-timers.c4
-rw-r--r--kernel/time/tick-common.c12
-rw-r--r--kernel/time/tick-sched.c151
-rw-r--r--kernel/time/tick-sched.h67
-rw-r--r--kernel/time/timekeeping.c4
-rw-r--r--kernel/trace/Kconfig8
-rw-r--r--kernel/trace/bpf_trace.c21
-rw-r--r--kernel/trace/fprobe.c32
-rw-r--r--kernel/trace/ftrace.c495
-rw-r--r--kernel/trace/kprobe_event_gen_test.c4
-rw-r--r--kernel/trace/ring_buffer.c119
-rw-r--r--kernel/trace/rv/reactor_panic.c1
-rw-r--r--kernel/trace/rv/reactor_printk.c1
-rw-r--r--kernel/trace/rv/rv.c2
-rw-r--r--kernel/trace/trace.c37
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events_hist.c12
-rw-r--r--kernel/trace/trace_events_synth.c19
-rw-r--r--kernel/trace/trace_events_user.c1034
-rw-r--r--kernel/trace/trace_hwlat.c11
-rw-r--r--kernel/trace/trace_osnoise.c16
-rw-r--r--kernel/trace/trace_output.c175
-rw-r--r--kernel/trace/trace_output.h2
-rw-r--r--kernel/trace/trace_probe.c2
-rw-r--r--kernel/trace/trace_selftest.c19
-rw-r--r--kernel/utsname_sysctl.c11
-rw-r--r--kernel/vhost_task.c117
-rw-r--r--kernel/watch_queue.c1
-rw-r--r--kernel/workqueue.c142
155 files changed, 10592 insertions, 4706 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 10ef068f598d..b69c95315480 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -13,8 +13,8 @@ obj-y = fork.o exec_domain.o panic.o \
async.o range.o smpboot.o ucount.o regset.o
obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
-obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace internal ftrace files
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 02242614dcc7..1d3892168d32 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -6,7 +6,8 @@ cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
endif
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o log.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_iter.o map_iter.o task_iter.o prog_iter.o link_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 484706959556..2058e89b5ddd 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -307,8 +307,8 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
}
/* Called from syscall or from eBPF program */
-static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long array_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u32 index = *(u32 *)key;
@@ -386,7 +386,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
}
/* Called from syscall or from eBPF program */
-static int array_map_delete_elem(struct bpf_map *map, void *key)
+static long array_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -686,8 +686,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
-static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
u32 i, key, num_elems = 0;
struct bpf_array *array;
@@ -721,6 +721,28 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_
return num_elems;
}
+static u64 array_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ u32 elem_size = array->elem_size;
+ u64 entries = map->max_entries;
+ u64 usage = sizeof(*array);
+
+ if (percpu) {
+ usage += entries * sizeof(void *);
+ usage += entries * elem_size * num_possible_cpus();
+ } else {
+ if (map->map_flags & BPF_F_MMAPABLE) {
+ usage = PAGE_ALIGN(usage);
+ usage += PAGE_ALIGN(entries * elem_size);
+ } else {
+ usage += entries * elem_size;
+ }
+ }
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(array_map_btf_ids, struct, bpf_array)
const struct bpf_map_ops array_map_ops = {
.map_meta_equal = array_map_meta_equal,
@@ -742,6 +764,7 @@ const struct bpf_map_ops array_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -762,6 +785,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_array_elem,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
};
@@ -847,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
return 0;
}
-static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
+static long fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
void *old_ptr;
@@ -1156,6 +1180,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
};
@@ -1257,6 +1282,7 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
};
@@ -1291,6 +1317,7 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
};
#endif
@@ -1379,5 +1406,6 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = array_map_mem_usage,
.map_btf_id = &array_map_btf_ids[0],
};
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 48ee750849f2..540331b610a9 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -16,13 +16,6 @@ struct bpf_bloom_filter {
struct bpf_map map;
u32 bitset_mask;
u32 hash_seed;
- /* If the size of the values in the bloom filter is u32 aligned,
- * then it is more performant to use jhash2 as the underlying hash
- * function, else we use jhash. This tracks the number of u32s
- * in an u32-aligned value size. If the value size is not u32 aligned,
- * this will be 0.
- */
- u32 aligned_u32_count;
u32 nr_hash_funcs;
unsigned long bitset[];
};
@@ -32,16 +25,15 @@ static u32 hash(struct bpf_bloom_filter *bloom, void *value,
{
u32 h;
- if (bloom->aligned_u32_count)
- h = jhash2(value, bloom->aligned_u32_count,
- bloom->hash_seed + index);
+ if (likely(value_size % 4 == 0))
+ h = jhash2(value, value_size / 4, bloom->hash_seed + index);
else
h = jhash(value, value_size, bloom->hash_seed + index);
return h & bloom->bitset_mask;
}
-static int bloom_map_peek_elem(struct bpf_map *map, void *value)
+static long bloom_map_peek_elem(struct bpf_map *map, void *value)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
@@ -56,7 +48,7 @@ static int bloom_map_peek_elem(struct bpf_map *map, void *value)
return 0;
}
-static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+static long bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
{
struct bpf_bloom_filter *bloom =
container_of(map, struct bpf_bloom_filter, map);
@@ -73,12 +65,12 @@ static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
return 0;
}
-static int bloom_map_pop_elem(struct bpf_map *map, void *value)
+static long bloom_map_pop_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
-static int bloom_map_delete_elem(struct bpf_map *map, void *value)
+static long bloom_map_delete_elem(struct bpf_map *map, void *value)
{
return -EOPNOTSUPP;
}
@@ -152,11 +144,6 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
bloom->nr_hash_funcs = nr_hash_funcs;
bloom->bitset_mask = bitset_mask;
- /* Check whether the value size is u32-aligned */
- if ((attr->value_size & (sizeof(u32) - 1)) == 0)
- bloom->aligned_u32_count =
- attr->value_size / sizeof(u32);
-
if (!(attr->map_flags & BPF_F_ZERO_SEED))
bloom->hash_seed = get_random_u32();
@@ -177,8 +164,8 @@ static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-EINVAL);
}
-static int bloom_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long bloom_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
/* The eBPF program should use map_push_elem instead */
return -EINVAL;
@@ -193,6 +180,17 @@ static int bloom_map_check_btf(const struct bpf_map *map,
return btf_type_is_void(key_type) ? 0 : -EINVAL;
}
+static u64 bloom_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom;
+ u64 bitset_bytes;
+
+ bloom = container_of(map, struct bpf_bloom_filter, map);
+ bitset_bytes = BITS_TO_BYTES((u64)bloom->bitset_mask + 1);
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ return sizeof(*bloom) + bitset_bytes;
+}
+
BTF_ID_LIST_SINGLE(bpf_bloom_map_btf_ids, struct, bpf_bloom_filter)
const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -206,5 +204,6 @@ const struct bpf_map_ops bloom_filter_map_ops = {
.map_update_elem = bloom_map_update_elem,
.map_delete_elem = bloom_map_delete_elem,
.map_check_btf = bloom_map_check_btf,
+ .map_mem_usage = bloom_map_mem_usage,
.map_btf_id = &bpf_bloom_map_btf_ids[0],
};
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 6cdf6d9ed91d..d44fe8dd9732 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -46,8 +46,6 @@ static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
- bool free_cgroup_storage = false;
- unsigned long flags;
rcu_read_lock();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
@@ -57,14 +55,9 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
}
bpf_cgrp_storage_lock();
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
rcu_read_unlock();
-
- if (free_cgroup_storage)
- kfree_rcu(local_storage, rcu);
}
static struct bpf_local_storage_data *
@@ -100,8 +93,8 @@ static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
return sdata ? sdata->data : NULL;
}
-static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct cgroup *cgroup;
@@ -128,11 +121,11 @@ static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
{
struct cgroup *cgroup;
int err, fd;
@@ -156,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &cgroup_cache);
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
}
static void cgroup_storage_map_free(struct bpf_map *map)
@@ -221,6 +214,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = {
.map_update_elem = bpf_cgrp_storage_update_elem,
.map_delete_elem = bpf_cgrp_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
.map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = cgroup_storage_ptr,
};
@@ -230,7 +224,7 @@ const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_cgroup_btf_id[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -241,6 +235,6 @@ const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_cgroup_btf_id[0],
};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 05f4c66c9089..b0ef45db207c 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -57,7 +57,6 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
void bpf_inode_storage_free(struct inode *inode)
{
struct bpf_local_storage *local_storage;
- bool free_inode_storage = false;
struct bpf_storage_blob *bsb;
bsb = bpf_inode(inode);
@@ -72,51 +71,40 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- raw_spin_lock_bh(&local_storage->lock);
- free_inode_storage = bpf_local_storage_unlink_nolock(local_storage);
- raw_spin_unlock_bh(&local_storage->lock);
+ bpf_local_storage_destroy(local_storage);
rcu_read_unlock();
-
- if (free_inode_storage)
- kfree_rcu(local_storage, rcu);
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ struct fd f = fdget_raw(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return ERR_PTR(-EBADF);
- sdata = inode_storage_lookup(f->f_inode, map, true);
- fput(f);
+ sdata = inode_storage_lookup(file_inode(f.file), map, true);
+ fdput(f);
return sdata ? sdata->data : NULL;
}
-static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
- struct file *f;
- int fd;
+ struct fd f = fdget_raw(*(int *)key);
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return -EBADF;
- if (!inode_storage_ptr(f->f_inode)) {
- fput(f);
+ if (!inode_storage_ptr(file_inode(f.file))) {
+ fdput(f);
return -EBADF;
}
- sdata = bpf_local_storage_update(f->f_inode,
+ sdata = bpf_local_storage_update(file_inode(f.file),
(struct bpf_local_storage_map *)map,
value, map_flags, GFP_ATOMIC);
- fput(f);
+ fdput(f);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -128,23 +116,21 @@ static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
if (!sdata)
return -ENOENT;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
{
- struct file *f;
- int fd, err;
+ struct fd f = fdget_raw(*(int *)key);
+ int err;
- fd = *(int *)key;
- f = fget_raw(fd);
- if (!f)
+ if (!f.file)
return -EBADF;
- err = inode_storage_delete(f->f_inode, map);
- fput(f);
+ err = inode_storage_delete(file_inode(f.file), map);
+ fdput(f);
return err;
}
@@ -205,7 +191,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &inode_cache);
+ return bpf_local_storage_map_alloc(attr, &inode_cache, false);
}
static void inode_storage_map_free(struct bpf_map *map)
@@ -223,6 +209,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
.map_update_elem = bpf_fd_inode_storage_update_elem,
.map_delete_elem = bpf_fd_inode_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
.map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = inode_storage_ptr,
};
@@ -234,7 +221,7 @@ const struct bpf_func_proto bpf_inode_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -245,6 +232,6 @@ const struct bpf_func_proto bpf_inode_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &bpf_inode_storage_btf_ids[0],
};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 5dc307bdeaeb..96856f130cbf 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -776,3 +776,73 @@ const struct bpf_func_proto bpf_loop_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+
+struct bpf_iter_num_kern {
+ int cur; /* current value, inclusive */
+ int end; /* final value, exclusive */
+} __aligned(8);
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+__bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num));
+
+ BTF_TYPE_EMIT(struct btf_iter_num);
+
+ /* start == end is legit, it's an empty range and we'll just get NULL
+ * on first (and any subsequent) bpf_iter_num_next() call
+ */
+ if (start > end) {
+ s->cur = s->end = 0;
+ return -EINVAL;
+ }
+
+ /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */
+ if ((s64)end - (s64)start > BPF_MAX_LOOPS) {
+ s->cur = s->end = 0;
+ return -E2BIG;
+ }
+
+ /* user will call bpf_iter_num_next() first,
+ * which will set s->cur to exactly start value;
+ * underflow shouldn't matter
+ */
+ s->cur = start - 1;
+ s->end = end;
+
+ return 0;
+}
+
+__bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ /* check failed initialization or if we are done (same behavior);
+ * need to be careful about overflow, so convert to s64 for checks,
+ * e.g., if s->cur == s->end == INT_MAX, we can't just do
+ * s->cur + 1 >= s->end
+ */
+ if ((s64)(s->cur + 1) >= s->end) {
+ s->cur = s->end = 0;
+ return NULL;
+ }
+
+ s->cur++;
+
+ return &s->cur;
+}
+
+__bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it)
+{
+ struct bpf_iter_num_kern *s = (void *)it;
+
+ s->cur = s->end = 0;
+}
+
+__diag_pop();
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 35f4138a54dc..47d9948d768f 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -51,11 +51,21 @@ owner_storage(struct bpf_local_storage_map *smap, void *owner)
return map->ops->map_owner_storage_ptr(owner);
}
+static bool selem_linked_to_storage_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->snode);
+}
+
static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->snode);
}
+static bool selem_linked_to_map_lockless(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed_lockless(&selem->map_node);
+}
+
static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
{
return !hlist_unhashed(&selem->map_node);
@@ -70,11 +80,28 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (charge_mem && mem_charge(smap, owner, smap->elem_size))
return NULL;
- selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- gfp_flags | __GFP_NOWARN);
+ if (smap->bpf_ma) {
+ migrate_disable();
+ selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
+ migrate_enable();
+ if (selem)
+ /* Keep the original bpf_map_kzalloc behavior
+ * before started using the bpf_mem_cache_alloc.
+ *
+ * No need to use zero_map_value. The bpf_selem_free()
+ * only does bpf_mem_cache_free when there is
+ * no other bpf prog is using the selem.
+ */
+ memset(SDATA(selem)->data, 0, smap->map.value_size);
+ } else {
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+ gfp_flags | __GFP_NOWARN);
+ }
+
if (selem) {
if (value)
copy_map_value(&smap->map, SDATA(selem)->data, value);
+ /* No need to call check_and_init_map_value as memory is zero init */
return selem;
}
@@ -84,7 +111,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
-void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
@@ -98,7 +126,66 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu)
kfree_rcu(local_storage, rcu);
}
-static void bpf_selem_free_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ bpf_mem_cache_raw_free(local_storage);
+}
+
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_local_storage_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_local_storage_free_rcu);
+}
+
+/* Handle bpf_ma == false */
+static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(local_storage, rcu);
+ else
+ call_rcu_tasks_trace(&local_storage->rcu,
+ __bpf_local_storage_free_trace_rcu);
+}
+
+static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ bool bpf_ma, bool reuse_now)
+{
+ if (!local_storage)
+ return;
+
+ if (!bpf_ma) {
+ __bpf_local_storage_free(local_storage, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_trace_rcu);
+ return;
+ }
+
+ if (smap) {
+ migrate_disable();
+ bpf_mem_cache_free(&smap->storage_ma, local_storage);
+ migrate_enable();
+ } else {
+ /* smap could be NULL if the selem that triggered
+ * this 'local_storage' creation had been long gone.
+ * In this case, directly do call_rcu().
+ */
+ call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ }
+}
+
+/* rcu tasks trace callback for bpf_ma == false */
+static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
@@ -109,13 +196,63 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
kfree_rcu(selem, rcu);
}
+/* Handle bpf_ma == false */
+static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
+ bool vanilla_rcu)
+{
+ if (vanilla_rcu)
+ kfree_rcu(selem, rcu);
+ else
+ call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ bpf_mem_cache_raw_free(selem);
+}
+
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
+{
+ if (rcu_trace_implies_rcu_gp())
+ bpf_selem_free_rcu(rcu);
+ else
+ call_rcu(rcu, bpf_selem_free_rcu);
+}
+
+void bpf_selem_free(struct bpf_local_storage_elem *selem,
+ struct bpf_local_storage_map *smap,
+ bool reuse_now)
+{
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+
+ if (!smap->bpf_ma) {
+ __bpf_selem_free(selem, reuse_now);
+ return;
+ }
+
+ if (!reuse_now) {
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
+ } else {
+ /* Instead of using the vanilla call_rcu(),
+ * bpf_mem_cache_free will be able to reuse selem
+ * immediately.
+ */
+ migrate_disable();
+ bpf_mem_cache_free(&smap->selem_ma, selem);
+ migrate_enable();
+ }
+}
+
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
- bool uncharge_mem, bool use_trace_rcu)
+ bool uncharge_mem, bool reuse_now)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -159,40 +296,75 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- if (use_trace_rcu)
- call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
- else
- kfree_rcu(selem, rcu);
+ bpf_selem_free(selem, smap, reuse_now);
+
+ if (rcu_access_pointer(local_storage->smap) == smap)
+ RCU_INIT_POINTER(local_storage->smap, NULL);
return free_local_storage;
}
-static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
- bool use_trace_rcu)
+static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *storage_smap,
+ struct bpf_local_storage_elem *selem)
+{
+
+ struct bpf_local_storage_map *selem_smap;
+
+ /* local_storage->smap may be NULL. If it is, get the bpf_ma
+ * from any selem in the local_storage->list. The bpf_ma of all
+ * local_storage and selem should have the same value
+ * for the same map type.
+ *
+ * If the local_storage->list is already empty, the caller will not
+ * care about the bpf_ma value also because the caller is not
+ * responsibile to free the local_storage.
+ */
+
+ if (storage_smap)
+ return storage_smap->bpf_ma;
+
+ if (!selem) {
+ struct hlist_node *n;
+
+ n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
+ bpf_rcu_lock_held());
+ if (!n)
+ return false;
+
+ selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
+ }
+ selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+
+ return selem_smap->bpf_ma;
+}
+
+static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
+ bool reuse_now)
{
+ struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
- bool free_local_storage = false;
+ bool bpf_ma, free_local_storage = false;
unsigned long flags;
- if (unlikely(!selem_linked_to_storage(selem)))
+ if (unlikely(!selem_linked_to_storage_lockless(selem)))
/* selem has already been unlinked from sk */
return;
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
+ storage_smap = rcu_dereference_check(local_storage->smap,
+ bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, true, use_trace_rcu);
+ local_storage, selem, true, reuse_now);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- if (free_local_storage) {
- if (use_trace_rcu)
- call_rcu_tasks_trace(&local_storage->rcu,
- bpf_local_storage_free_rcu);
- else
- kfree_rcu(local_storage, rcu);
- }
+ if (free_local_storage)
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -202,13 +374,13 @@ void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
hlist_add_head_rcu(&selem->snode, &local_storage->list);
}
-void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+static void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *smap;
struct bpf_local_storage_map_bucket *b;
unsigned long flags;
- if (unlikely(!selem_linked_to_map(selem)))
+ if (unlikely(!selem_linked_to_map_lockless(selem)))
/* selem has already be unlinked from smap */
return;
@@ -232,14 +404,14 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap,
raw_spin_unlock_irqrestore(&b->lock, flags);
}
-void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool reuse_now)
{
/* Always unlink from map before unlinking from local_storage
* because selem will be freed after successfully unlinked from
* the local_storage.
*/
bpf_selem_unlink_map(selem);
- __bpf_selem_unlink_storage(selem, use_trace_rcu);
+ bpf_selem_unlink_storage(selem, reuse_now);
}
/* If cacheit_lockit is false, this lookup function is lockless */
@@ -312,13 +484,21 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- gfp_flags | __GFP_NOWARN);
+ if (smap->bpf_ma) {
+ migrate_disable();
+ storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
+ migrate_enable();
+ } else {
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ gfp_flags | __GFP_NOWARN);
+ }
+
if (!storage) {
err = -ENOMEM;
goto uncharge;
}
+ RCU_INIT_POINTER(storage->smap, smap);
INIT_HLIST_HEAD(&storage->list);
raw_spin_lock_init(&storage->lock);
storage->owner = owner;
@@ -358,7 +538,7 @@ int bpf_local_storage_alloc(void *owner,
return 0;
uncharge:
- kfree(storage);
+ bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
mem_uncharge(smap, owner, sizeof(*storage));
return err;
}
@@ -402,7 +582,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
- kfree(selem);
+ bpf_selem_free(selem, smap, true);
mem_uncharge(smap, owner, smap->elem_size);
return ERR_PTR(err);
}
@@ -420,7 +600,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
err = check_flags(old_sdata, map_flags);
if (err)
return ERR_PTR(err);
- if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
+ if (old_sdata && selem_linked_to_storage_lockless(SELEM(old_sdata))) {
copy_map_value_locked(&smap->map, old_sdata->data,
value, false);
return old_sdata;
@@ -485,7 +665,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
- false, true);
+ false, false);
}
unlock:
@@ -496,7 +676,7 @@ unlock_err:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (selem) {
mem_uncharge(smap, owner, smap->elem_size);
- kfree(selem);
+ bpf_selem_free(selem, smap, true);
}
return ERR_PTR(err);
}
@@ -552,40 +732,6 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
-static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr)
-{
- struct bpf_local_storage_map *smap;
- unsigned int i;
- u32 nbuckets;
-
- smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- bpf_map_init_from_attr(&smap->map, attr);
-
- nbuckets = roundup_pow_of_two(num_possible_cpus());
- /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
- nbuckets = max_t(u32, 2, nbuckets);
- smap->bucket_log = ilog2(nbuckets);
-
- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
- nbuckets, GFP_USER | __GFP_NOWARN);
- if (!smap->buckets) {
- bpf_map_area_free(smap);
- return ERR_PTR(-ENOMEM);
- }
-
- for (i = 0; i < nbuckets; i++) {
- INIT_HLIST_HEAD(&smap->buckets[i].list);
- raw_spin_lock_init(&smap->buckets[i].lock);
- }
-
- smap->elem_size = offsetof(struct bpf_local_storage_elem,
- sdata.data[attr->value_size]);
-
- return smap;
-}
-
int bpf_local_storage_map_check_btf(const struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
@@ -603,11 +749,16 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
return 0;
}
-bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
+void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{
+ struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
- bool free_storage = false;
+ bool bpf_ma, free_storage = false;
struct hlist_node *n;
+ unsigned long flags;
+
+ storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
+ bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
/* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now.
@@ -618,6 +769,7 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
* when unlinking elem from the local_storage->list and
* the map's bucket->list.
*/
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
/* Always unlink from map before unlinking from
* local_storage.
@@ -630,24 +782,89 @@ bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, false);
+ local_storage, selem, false, true);
}
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
- return free_storage;
+ if (free_storage)
+ bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
}
+u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_local_storage_map *smap = (struct bpf_local_storage_map *)map;
+ u64 usage = sizeof(*smap);
+
+ /* The dynamically callocated selems are not counted currently. */
+ usage += sizeof(*smap->buckets) * (1ULL << smap->bucket_log);
+ return usage;
+}
+
+/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
+ * A deadlock free allocator is useful for storage that the bpf prog can easily
+ * get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
+ * The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
+ * memory immediately. To be reuse-immediate safe, the owner destruction
+ * code path needs to go through a rcu grace period before calling
+ * bpf_local_storage_destroy().
+ *
+ * When bpf_ma == false, the kmalloc and kfree are used.
+ */
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
- struct bpf_local_storage_cache *cache)
+ struct bpf_local_storage_cache *cache,
+ bool bpf_ma)
{
struct bpf_local_storage_map *smap;
+ unsigned int i;
+ u32 nbuckets;
+ int err;
+
+ smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ bpf_map_init_from_attr(&smap->map, attr);
+
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
+ /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
+
+ smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
+ nbuckets, GFP_USER | __GFP_NOWARN);
+ if (!smap->buckets) {
+ err = -ENOMEM;
+ goto free_smap;
+ }
+
+ for (i = 0; i < nbuckets; i++) {
+ INIT_HLIST_HEAD(&smap->buckets[i].list);
+ raw_spin_lock_init(&smap->buckets[i].lock);
+ }
+
+ smap->elem_size = offsetof(struct bpf_local_storage_elem,
+ sdata.data[attr->value_size]);
- smap = __bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
+ smap->bpf_ma = bpf_ma;
+ if (bpf_ma) {
+ err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
+ if (err)
+ goto free_smap;
+
+ err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
+ if (err) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ goto free_smap;
+ }
+ }
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
+
+free_smap:
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+ return ERR_PTR(err);
}
void bpf_local_storage_map_free(struct bpf_map *map,
@@ -689,7 +906,7 @@ void bpf_local_storage_map_free(struct bpf_map *map,
migrate_disable();
this_cpu_inc(*busy_counter);
}
- bpf_selem_unlink(selem, false);
+ bpf_selem_unlink(selem, true);
if (busy_counter) {
this_cpu_dec(*busy_counter);
migrate_enable();
@@ -713,6 +930,10 @@ void bpf_local_storage_map_free(struct bpf_map *map,
*/
synchronize_rcu();
+ if (smap->bpf_ma) {
+ bpf_mem_alloc_destroy(&smap->selem_ma);
+ bpf_mem_alloc_destroy(&smap->storage_ma);
+ }
kvfree(smap->buckets);
bpf_map_area_free(smap);
}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index ece9870cab68..d3f0a4825fa6 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -11,11 +11,13 @@
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/btf_ids.h>
+#include <linux/rcupdate_wait.h>
enum bpf_struct_ops_state {
BPF_STRUCT_OPS_STATE_INIT,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE,
+ BPF_STRUCT_OPS_STATE_READY,
};
#define BPF_STRUCT_OPS_COMMON_VALUE \
@@ -58,6 +60,13 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_value kvalue;
};
+struct bpf_struct_ops_link {
+ struct bpf_link link;
+ struct bpf_map __rcu *map;
+};
+
+static DEFINE_MUTEX(update_mutex);
+
#define VALUE_PREFIX "bpf_struct_ops_"
#define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
@@ -249,6 +258,7 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
struct bpf_struct_ops_value *uvalue, *kvalue;
enum bpf_struct_ops_state state;
+ s64 refcnt;
if (unlikely(*(u32 *)key != 0))
return -ENOENT;
@@ -267,7 +277,14 @@ int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
uvalue = value;
memcpy(uvalue, st_map->uvalue, map->value_size);
uvalue->state = state;
- refcount_set(&uvalue->refcnt, refcount_read(&kvalue->refcnt));
+
+ /* This value offers the user space a general estimate of how
+ * many sockets are still utilizing this struct_ops for TCP
+ * congestion control. The number might not be exact, but it
+ * should sufficiently meet our present goals.
+ */
+ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt);
+ refcount_set(&uvalue->refcnt, max_t(s64, refcnt, 0));
return 0;
}
@@ -349,8 +366,8 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
model, flags, tlinks, NULL);
}
-static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
const struct bpf_struct_ops *st_ops = st_map->st_ops;
@@ -491,12 +508,29 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*(unsigned long *)(udata + moff) = prog->aux->id;
}
- refcount_set(&kvalue->refcnt, 1);
- bpf_map_inc(map);
+ if (st_map->map.map_flags & BPF_F_LINK) {
+ err = st_ops->validate(kdata);
+ if (err)
+ goto reset_unlock;
+ set_memory_rox((long)st_map->image, 1);
+ /* Let bpf_link handle registration & unregistration.
+ *
+ * Pair with smp_load_acquire() during lookup_elem().
+ */
+ smp_store_release(&kvalue->state, BPF_STRUCT_OPS_STATE_READY);
+ goto unlock;
+ }
set_memory_rox((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
+ /* This refcnt increment on the map here after
+ * 'st_ops->reg()' is secure since the state of the
+ * map must be set to INIT at this moment, and thus
+ * bpf_struct_ops_map_delete_elem() can't unregister
+ * or transition it to TOBEFREE concurrently.
+ */
+ bpf_map_inc(map);
/* Pair with smp_load_acquire() during lookup_elem().
* It ensures the above udata updates (e.g. prog->aux->id)
* can be seen once BPF_STRUCT_OPS_STATE_INUSE is set.
@@ -512,7 +546,6 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*/
set_memory_nx((long)st_map->image, 1);
set_memory_rw((long)st_map->image, 1);
- bpf_map_put(map);
reset_unlock:
bpf_struct_ops_map_put_progs(st_map);
@@ -524,20 +557,22 @@ unlock:
return err;
}
-static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
+static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
{
enum bpf_struct_ops_state prev_state;
struct bpf_struct_ops_map *st_map;
st_map = (struct bpf_struct_ops_map *)map;
+ if (st_map->map.map_flags & BPF_F_LINK)
+ return -EOPNOTSUPP;
+
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
- if (refcount_dec_and_test(&st_map->kvalue.refcnt))
- bpf_map_put(map);
+ bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
return -EINPROGRESS;
@@ -570,7 +605,7 @@ static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
kfree(value);
}
-static void bpf_struct_ops_map_free(struct bpf_map *map)
+static void __bpf_struct_ops_map_free(struct bpf_map *map)
{
struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
@@ -582,10 +617,32 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
bpf_map_area_free(st_map);
}
+static void bpf_struct_ops_map_free(struct bpf_map *map)
+{
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its refcount may reach 0 which then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * A vanilla rcu gp is to wait for all bpf-tcp-cc prog
+ * to finish. bpf-tcp-cc prog is non sleepable.
+ * A rcu_tasks gp is to wait for the last few insn
+ * in the tramopline image to finish before releasing
+ * the trampoline image.
+ */
+ synchronize_rcu_mult(call_rcu, call_rcu_tasks);
+
+ __bpf_struct_ops_map_free(map);
+}
+
static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
{
if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 ||
- attr->map_flags || !attr->btf_vmlinux_value_type_id)
+ (attr->map_flags & ~BPF_F_LINK) || !attr->btf_vmlinux_value_type_id)
return -EINVAL;
return 0;
}
@@ -609,6 +666,9 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
if (attr->value_size != vt->size)
return ERR_PTR(-EINVAL);
+ if (attr->map_flags & BPF_F_LINK && (!st_ops->validate || !st_ops->update))
+ return ERR_PTR(-EOPNOTSUPP);
+
t = st_ops->type;
st_map_size = sizeof(*st_map) +
@@ -630,7 +690,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
NUMA_NO_NODE);
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!st_map->uvalue || !st_map->links || !st_map->image) {
- bpf_struct_ops_map_free(map);
+ __bpf_struct_ops_map_free(map);
return ERR_PTR(-ENOMEM);
}
@@ -641,6 +701,21 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
return map;
}
+static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
+ const struct bpf_struct_ops *st_ops = st_map->st_ops;
+ const struct btf_type *vt = st_ops->value_type;
+ u64 usage;
+
+ usage = sizeof(*st_map) +
+ vt->size - sizeof(struct bpf_struct_ops_value);
+ usage += vt->size;
+ usage += btf_type_vlen(vt) * sizeof(struct bpf_links *);
+ usage += PAGE_SIZE;
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map)
const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
@@ -651,6 +726,7 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_delete_elem = bpf_struct_ops_map_delete_elem,
.map_update_elem = bpf_struct_ops_map_update_elem,
.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+ .map_mem_usage = bpf_struct_ops_map_mem_usage,
.map_btf_id = &bpf_struct_ops_map_btf_ids[0],
};
@@ -660,41 +736,175 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
bool bpf_struct_ops_get(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
- return refcount_inc_not_zero(&kvalue->refcnt);
+ map = __bpf_map_inc_not_zero(&st_map->map, false);
+ return !IS_ERR(map);
}
-static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+void bpf_struct_ops_put(const void *kdata)
{
+ struct bpf_struct_ops_value *kvalue;
struct bpf_struct_ops_map *st_map;
- st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+ kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+ st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
bpf_map_put(&st_map->map);
}
-void bpf_struct_ops_put(const void *kdata)
+static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
{
- struct bpf_struct_ops_value *kvalue;
+ struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
- kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
- if (refcount_dec_and_test(&kvalue->refcnt)) {
- struct bpf_struct_ops_map *st_map;
+ return map->map_type == BPF_MAP_TYPE_STRUCT_OPS &&
+ map->map_flags & BPF_F_LINK &&
+ /* Pair with smp_store_release() during map_update */
+ smp_load_acquire(&st_map->kvalue.state) == BPF_STRUCT_OPS_STATE_READY;
+}
- st_map = container_of(kvalue, struct bpf_struct_ops_map,
- kvalue);
- /* The struct_ops's function may switch to another struct_ops.
- *
- * For example, bpf_tcp_cc_x->init() may switch to
- * another tcp_cc_y by calling
- * setsockopt(TCP_CONGESTION, "tcp_cc_y").
- * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
- * and its map->refcnt may reach 0 which then free its
- * trampoline image while tcp_cc_x is still running.
- *
- * Thus, a rcu grace period is needed here.
+static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_struct_ops_map *st_map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = (struct bpf_struct_ops_map *)
+ rcu_dereference_protected(st_link->map, true);
+ if (st_map) {
+ /* st_link->map can be NULL if
+ * bpf_struct_ops_link_create() fails to register.
*/
- call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
+ st_map->st_ops->unreg(&st_map->kvalue.data);
+ bpf_map_put(&st_map->map);
}
+ kfree(st_link);
+}
+
+static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ seq_printf(seq, "map_id:\t%d\n", map->id);
+ rcu_read_unlock();
}
+
+static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_struct_ops_link *st_link;
+ struct bpf_map *map;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ rcu_read_lock();
+ map = rcu_dereference(st_link->map);
+ info->struct_ops.map_id = map->id;
+ rcu_read_unlock();
+ return 0;
+}
+
+static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map,
+ struct bpf_map *expected_old_map)
+{
+ struct bpf_struct_ops_map *st_map, *old_st_map;
+ struct bpf_map *old_map;
+ struct bpf_struct_ops_link *st_link;
+ int err = 0;
+
+ st_link = container_of(link, struct bpf_struct_ops_link, link);
+ st_map = container_of(new_map, struct bpf_struct_ops_map, map);
+
+ if (!bpf_struct_ops_valid_to_reg(new_map))
+ return -EINVAL;
+
+ mutex_lock(&update_mutex);
+
+ old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (expected_old_map && old_map != expected_old_map) {
+ err = -EPERM;
+ goto err_out;
+ }
+
+ old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
+ /* The new and old struct_ops must be the same type. */
+ if (st_map->st_ops != old_st_map->st_ops) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ if (err)
+ goto err_out;
+
+ bpf_map_inc(new_map);
+ rcu_assign_pointer(st_link->map, new_map);
+ bpf_map_put(old_map);
+
+err_out:
+ mutex_unlock(&update_mutex);
+
+ return err;
+}
+
+static const struct bpf_link_ops bpf_struct_ops_map_lops = {
+ .dealloc = bpf_struct_ops_map_link_dealloc,
+ .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
+ .fill_link_info = bpf_struct_ops_map_link_fill_link_info,
+ .update_map = bpf_struct_ops_map_link_update,
+};
+
+int bpf_struct_ops_link_create(union bpf_attr *attr)
+{
+ struct bpf_struct_ops_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+ int err;
+
+ map = bpf_map_get(attr->link_create.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ st_map = (struct bpf_struct_ops_map *)map;
+
+ if (!bpf_struct_ops_valid_to_reg(map)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto err_out;
+
+ err = st_map->st_ops->reg(st_map->kvalue.data);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ link = NULL;
+ goto err_out;
+ }
+ RCU_INIT_POINTER(link->map, map);
+
+ return bpf_link_settle(&link_primer);
+
+err_out:
+ bpf_map_put(map);
+ kfree(link);
+ return err;
+}
+
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 1e486055a523..adf6dfe0ba68 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -72,8 +72,6 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
- bool free_task_storage = false;
- unsigned long flags;
rcu_read_lock();
@@ -84,14 +82,9 @@ void bpf_task_storage_free(struct task_struct *task)
}
bpf_task_storage_lock();
- raw_spin_lock_irqsave(&local_storage->lock, flags);
- free_task_storage = bpf_local_storage_unlink_nolock(local_storage);
- raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
rcu_read_unlock();
-
- if (free_task_storage)
- kfree_rcu(local_storage, rcu);
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
@@ -127,8 +120,8 @@ out:
return ERR_PTR(err);
}
-static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
struct bpf_local_storage_data *sdata;
struct task_struct *task;
@@ -175,12 +168,12 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
if (!nobusy)
return -EBUSY;
- bpf_selem_unlink(SELEM(sdata), true);
+ bpf_selem_unlink(SELEM(sdata), false);
return 0;
}
-static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
{
struct task_struct *task;
unsigned int f_flags;
@@ -316,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &task_cache);
+ return bpf_local_storage_map_alloc(attr, &task_cache, true);
}
static void task_storage_map_free(struct bpf_map *map)
@@ -335,6 +328,7 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_update_elem = bpf_pid_task_storage_update_elem,
.map_delete_elem = bpf_pid_task_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
.map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = task_storage_ptr,
};
@@ -344,7 +338,7 @@ const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -355,7 +349,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.gpl_only = false,
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
@@ -366,7 +360,7 @@ const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
@@ -375,6 +369,6 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 73780748404c..6b682b8e4b50 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -25,6 +25,9 @@
#include <linux/bsearch.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+
+#include <net/netfilter/nf_bpf_link.h>
+
#include <net/sock.h>
#include "../tools/lib/bpf/relo_core.h"
@@ -207,6 +210,12 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
BTF_KFUNC_HOOK_FMODRET,
+ BTF_KFUNC_HOOK_CGROUP_SKB,
+ BTF_KFUNC_HOOK_SCHED_ACT,
+ BTF_KFUNC_HOOK_SK_SKB,
+ BTF_KFUNC_HOOK_SOCKET_FILTER,
+ BTF_KFUNC_HOOK_LWT,
+ BTF_KFUNC_HOOK_NETFILTER,
BTF_KFUNC_HOOK_MAX,
};
@@ -572,8 +581,8 @@ static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
*btf_p = btf;
return ret;
}
- spin_lock_bh(&btf_idr_lock);
btf_put(btf);
+ spin_lock_bh(&btf_idr_lock);
}
spin_unlock_bh(&btf_idr_lock);
return ret;
@@ -1661,10 +1670,8 @@ static void btf_struct_metas_free(struct btf_struct_metas *tab)
if (!tab)
return;
- for (i = 0; i < tab->cnt; i++) {
+ for (i = 0; i < tab->cnt; i++)
btf_record_free(tab->types[i].record);
- kfree(tab->types[i].field_offs);
- }
kfree(tab);
}
@@ -3226,12 +3233,6 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-enum btf_field_info_type {
- BTF_FIELD_SPIN_LOCK,
- BTF_FIELD_TIMER,
- BTF_FIELD_KPTR,
-};
-
enum {
BTF_FIELD_IGNORE = 0,
BTF_FIELD_FOUND = 1,
@@ -3283,9 +3284,9 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
/* Reject extra tags */
if (btf_type_is_type_tag(btf_type_by_id(btf, t->type)))
return -EINVAL;
- if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
+ if (!strcmp("kptr_untrusted", __btf_name_by_offset(btf, t->name_off)))
type = BPF_KPTR_UNREF;
- else if (!strcmp("kptr_ref", __btf_name_by_offset(btf, t->name_off)))
+ else if (!strcmp("kptr", __btf_name_by_offset(btf, t->name_off)))
type = BPF_KPTR_REF;
else
return -EINVAL;
@@ -3394,6 +3395,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
field_mask_test_name(BPF_LIST_NODE, "bpf_list_node");
field_mask_test_name(BPF_RB_ROOT, "bpf_rb_root");
field_mask_test_name(BPF_RB_NODE, "bpf_rb_node");
+ field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
if (field_mask & BPF_KPTR) {
@@ -3442,6 +3444,7 @@ static int btf_find_struct_field(const struct btf *btf,
case BPF_TIMER:
case BPF_LIST_NODE:
case BPF_RB_NODE:
+ case BPF_REFCOUNT:
ret = btf_find_struct(btf, member_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3507,6 +3510,7 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
case BPF_TIMER:
case BPF_LIST_NODE:
case BPF_RB_NODE:
+ case BPF_REFCOUNT:
ret = btf_find_struct(btf, var_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
@@ -3557,7 +3561,10 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
{
struct module *mod = NULL;
const struct btf_type *t;
- struct btf *kernel_btf;
+ /* If a matching btf type is found in kernel or module BTFs, kptr_ref
+ * is that BTF, otherwise it's program BTF
+ */
+ struct btf *kptr_btf;
int ret;
s32 id;
@@ -3566,7 +3573,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
*/
t = btf_type_by_id(btf, info->kptr.type_id);
id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
- &kernel_btf);
+ &kptr_btf);
+ if (id == -ENOENT) {
+ /* btf_parse_kptr should only be called w/ btf = program BTF */
+ WARN_ON_ONCE(btf_is_kernel(btf));
+
+ /* Type exists only in program BTF. Assume that it's a MEM_ALLOC
+ * kptr allocated via bpf_obj_new
+ */
+ field->kptr.dtor = NULL;
+ id = info->kptr.type_id;
+ kptr_btf = (struct btf *)btf;
+ btf_get(kptr_btf);
+ goto found_dtor;
+ }
if (id < 0)
return id;
@@ -3583,20 +3603,20 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
* can be used as a referenced pointer and be stored in a map at
* the same time.
*/
- dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+ dtor_btf_id = btf_find_dtor_kfunc(kptr_btf, id);
if (dtor_btf_id < 0) {
ret = dtor_btf_id;
goto end_btf;
}
- dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+ dtor_func = btf_type_by_id(kptr_btf, dtor_btf_id);
if (!dtor_func) {
ret = -ENOENT;
goto end_btf;
}
- if (btf_is_module(kernel_btf)) {
- mod = btf_try_get_module(kernel_btf);
+ if (btf_is_module(kptr_btf)) {
+ mod = btf_try_get_module(kptr_btf);
if (!mod) {
ret = -ENXIO;
goto end_btf;
@@ -3606,7 +3626,7 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
/* We already verified dtor_func to be btf_type_is_func
* in register_btf_id_dtor_kfuncs.
*/
- dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+ dtor_func_name = __btf_name_by_offset(kptr_btf, dtor_func->name_off);
addr = kallsyms_lookup_name(dtor_func_name);
if (!addr) {
ret = -EINVAL;
@@ -3615,14 +3635,15 @@ static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
field->kptr.dtor = (void *)addr;
}
+found_dtor:
field->kptr.btf_id = id;
- field->kptr.btf = kernel_btf;
+ field->kptr.btf = kptr_btf;
field->kptr.module = mod;
return 0;
end_mod:
module_put(mod);
end_btf:
- btf_put(kernel_btf);
+ btf_put(kptr_btf);
return ret;
}
@@ -3684,12 +3705,24 @@ static int btf_parse_rb_root(const struct btf *btf, struct btf_field *field,
__alignof__(struct bpf_rb_node));
}
+static int btf_field_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const struct btf_field *a = (const struct btf_field *)_a;
+ const struct btf_field *b = (const struct btf_field *)_b;
+
+ if (a->offset < b->offset)
+ return -1;
+ else if (a->offset > b->offset)
+ return 1;
+ return 0;
+}
+
struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
u32 field_mask, u32 value_size)
{
struct btf_field_info info_arr[BTF_FIELDS_MAX];
+ u32 next_off = 0, field_type_size;
struct btf_record *rec;
- u32 next_off = 0;
int ret, i, cnt;
ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
@@ -3708,8 +3741,10 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
rec->spin_lock_off = -EINVAL;
rec->timer_off = -EINVAL;
+ rec->refcount_off = -EINVAL;
for (i = 0; i < cnt; i++) {
- if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) {
+ field_type_size = btf_field_type_size(info_arr[i].type);
+ if (info_arr[i].off + field_type_size > value_size) {
WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
ret = -EFAULT;
goto end;
@@ -3718,11 +3753,12 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
ret = -EEXIST;
goto end;
}
- next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
+ next_off = info_arr[i].off + field_type_size;
rec->field_mask |= info_arr[i].type;
rec->fields[i].offset = info_arr[i].off;
rec->fields[i].type = info_arr[i].type;
+ rec->fields[i].size = field_type_size;
switch (info_arr[i].type) {
case BPF_SPIN_LOCK:
@@ -3735,6 +3771,11 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
/* Cache offset for faster lookup at runtime */
rec->timer_off = rec->fields[i].offset;
break;
+ case BPF_REFCOUNT:
+ WARN_ON_ONCE(rec->refcount_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->refcount_off = rec->fields[i].offset;
+ break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
@@ -3768,30 +3809,16 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
goto end;
}
- /* need collection identity for non-owning refs before allowing this
- *
- * Consider a node type w/ both list and rb_node fields:
- * struct node {
- * struct bpf_list_node l;
- * struct bpf_rb_node r;
- * }
- *
- * Used like so:
- * struct node *n = bpf_obj_new(....);
- * bpf_list_push_front(&list_head, &n->l);
- * bpf_rbtree_remove(&rb_root, &n->r);
- *
- * It should not be possible to rbtree_remove the node since it hasn't
- * been added to a tree. But push_front converts n to a non-owning
- * reference, and rbtree_remove accepts the non-owning reference to
- * a type w/ bpf_rb_node field.
- */
- if (btf_record_has_field(rec, BPF_LIST_NODE) &&
+ if (rec->refcount_off < 0 &&
+ btf_record_has_field(rec, BPF_LIST_NODE) &&
btf_record_has_field(rec, BPF_RB_NODE)) {
ret = -EINVAL;
goto end;
}
+ sort_r(rec->fields, rec->cnt, sizeof(struct btf_field), btf_field_cmp,
+ NULL, rec);
+
return rec;
end:
btf_record_free(rec);
@@ -3873,61 +3900,6 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
return 0;
}
-static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv)
-{
- const u32 a = *(const u32 *)_a;
- const u32 b = *(const u32 *)_b;
-
- if (a < b)
- return -1;
- else if (a > b)
- return 1;
- return 0;
-}
-
-static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv)
-{
- struct btf_field_offs *foffs = (void *)priv;
- u32 *off_base = foffs->field_off;
- u32 *a = _a, *b = _b;
- u8 *sz_a, *sz_b;
-
- sz_a = foffs->field_sz + (a - off_base);
- sz_b = foffs->field_sz + (b - off_base);
-
- swap(*a, *b);
- swap(*sz_a, *sz_b);
-}
-
-struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec)
-{
- struct btf_field_offs *foffs;
- u32 i, *off;
- u8 *sz;
-
- BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz));
- if (IS_ERR_OR_NULL(rec))
- return NULL;
-
- foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN);
- if (!foffs)
- return ERR_PTR(-ENOMEM);
-
- off = foffs->field_off;
- sz = foffs->field_sz;
- for (i = 0; i < rec->cnt; i++) {
- off[i] = rec->fields[i].offset;
- sz[i] = btf_field_type_size(rec->fields[i].type);
- }
- foffs->cnt = rec->cnt;
-
- if (foffs->cnt == 1)
- return foffs;
- sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]),
- btf_field_offs_cmp, btf_field_offs_swap, foffs);
- return foffs;
-}
-
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct btf_show *show)
@@ -5332,6 +5304,7 @@ static const char *alloc_obj_fields[] = {
"bpf_list_node",
"bpf_rb_root",
"bpf_rb_node",
+ "bpf_refcount",
};
static struct btf_struct_metas *
@@ -5370,7 +5343,6 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
for (i = 1; i < n; i++) {
struct btf_struct_metas *new_tab;
const struct btf_member *member;
- struct btf_field_offs *foffs;
struct btf_struct_meta *type;
struct btf_record *record;
const struct btf_type *t;
@@ -5406,23 +5378,13 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
type = &tab->types[tab->cnt];
type->btf_id = i;
record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE |
- BPF_RB_ROOT | BPF_RB_NODE, t->size);
+ BPF_RB_ROOT | BPF_RB_NODE | BPF_REFCOUNT, t->size);
/* The record cannot be unset, treat it as an error if so */
if (IS_ERR_OR_NULL(record)) {
ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
goto free;
}
- foffs = btf_parse_field_offs(record);
- /* We need the field_offs to be valid for a valid record,
- * either both should be set or both should be unset.
- */
- if (IS_ERR_OR_NULL(foffs)) {
- btf_record_free(record);
- ret = -EFAULT;
- goto free;
- }
type->record = record;
- type->field_offs = foffs;
tab->cnt++;
}
return tab;
@@ -5489,38 +5451,45 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
return 0;
}
-static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
- u32 log_level, char __user *log_ubuf, u32 log_size)
+static int finalize_log(struct bpf_verifier_log *log, bpfptr_t uattr, u32 uattr_size)
+{
+ u32 log_true_size;
+ int err;
+
+ err = bpf_vlog_finalize(log, &log_true_size);
+
+ if (uattr_size >= offsetofend(union bpf_attr, btf_log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, btf_log_true_size),
+ &log_true_size, sizeof(log_true_size)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static struct btf *btf_parse(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
+ bpfptr_t btf_data = make_bpfptr(attr->btf, uattr.is_kernel);
+ char __user *log_ubuf = u64_to_user_ptr(attr->btf_log_buf);
struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
u8 *data;
- int err;
+ int err, ret;
- if (btf_data_size > BTF_MAX_SIZE)
+ if (attr->btf_size > BTF_MAX_SIZE)
return ERR_PTR(-E2BIG);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
return ERR_PTR(-ENOMEM);
- log = &env->log;
- if (log_level || log_ubuf || log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = log_level;
- log->ubuf = log_ubuf;
- log->len_total = log_size;
-
- /* log attributes have to be sane */
- if (!bpf_verifier_log_attr_valid(log)) {
- err = -EINVAL;
- goto errout;
- }
- }
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ err = bpf_vlog_init(&env->log, attr->btf_log_level,
+ log_ubuf, attr->btf_log_size);
+ if (err)
+ goto errout_free;
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
@@ -5529,16 +5498,16 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
}
env->btf = btf;
- data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+ data = kvmalloc(attr->btf_size, GFP_KERNEL | __GFP_NOWARN);
if (!data) {
err = -ENOMEM;
goto errout;
}
btf->data = data;
- btf->data_size = btf_data_size;
+ btf->data_size = attr->btf_size;
- if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
+ if (copy_from_bpfptr(data, btf_data, attr->btf_size)) {
err = -EFAULT;
goto errout;
}
@@ -5561,7 +5530,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
if (err)
goto errout;
- struct_meta_tab = btf_parse_struct_metas(log, btf);
+ struct_meta_tab = btf_parse_struct_metas(&env->log, btf);
if (IS_ERR(struct_meta_tab)) {
err = PTR_ERR(struct_meta_tab);
goto errout;
@@ -5578,10 +5547,9 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
}
}
- if (log->level && bpf_verifier_log_full(log)) {
- err = -ENOSPC;
- goto errout_meta;
- }
+ err = finalize_log(&env->log, uattr, uattr_size);
+ if (err)
+ goto errout_free;
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
@@ -5590,6 +5558,11 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
errout_meta:
btf_free_struct_meta_tab(btf);
errout:
+ /* overwrite err with -ENOSPC or -EFAULT */
+ ret = finalize_log(&env->log, uattr, uattr_size);
+ if (ret)
+ err = ret;
+errout_free:
btf_verifier_env_free(env);
if (btf)
btf_free(btf);
@@ -5684,6 +5657,10 @@ again:
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
+ if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return ctx_type;
+ if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return ctx_type;
if (strcmp(ctx_tname, tname)) {
/* bpf_user_pt_regs_t is a typedef, so resolve it to
* underlying struct and check name again
@@ -5891,12 +5868,8 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
- /* t comes in already as a pointer */
- t = btf_type_by_id(btf, t->type);
-
- /* allow const */
- if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
- t = btf_type_by_id(btf, t->type);
+ /* skip modifiers */
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
return btf_type_is_int(t);
}
@@ -6147,7 +6120,8 @@ enum bpf_struct_walk_result {
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
@@ -6155,6 +6129,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const char *tname, *mname, *tag_value;
u32 vlen, elem_id, mid;
+ *flag = 0;
again:
tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
@@ -6186,11 +6161,13 @@ again:
if (off < moff)
goto error;
- /* Only allow structure for now, can be relaxed for
- * other types later.
- */
+ /* allow structure and integer */
t = btf_type_skip_modifiers(btf, array_elem->type,
NULL);
+
+ if (btf_type_is_int(t))
+ return WALK_SCALAR;
+
if (!btf_type_is_struct(t))
goto error;
@@ -6321,6 +6298,15 @@ error:
* of this field or inside of this struct
*/
if (btf_type_is_struct(mtype)) {
+ if (BTF_INFO_KIND(mtype->info) == BTF_KIND_UNION &&
+ btf_type_vlen(mtype) != 1)
+ /*
+ * walking unions yields untrusted pointers
+ * with exception of __bpf_md_ptr and other
+ * unions with a single member
+ */
+ *flag |= PTR_UNTRUSTED;
+
/* our field must be inside that union or struct */
t = mtype;
@@ -6365,7 +6351,9 @@ error:
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
- *flag = tmp_flag;
+ *flag |= tmp_flag;
+ if (field_name)
+ *field_name = mname;
return WALK_PTR;
}
}
@@ -6392,7 +6380,8 @@ error:
int btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
int off, int size, enum bpf_access_type atype __maybe_unused,
- u32 *next_btf_id, enum bpf_type_flag *flag)
+ u32 *next_btf_id, enum bpf_type_flag *flag,
+ const char **field_name)
{
const struct btf *btf = reg->btf;
enum bpf_type_flag tmp_flag = 0;
@@ -6424,7 +6413,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
t = btf_type_by_id(btf, id);
do {
- err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag, field_name);
switch (err) {
case WALK_PTR:
@@ -6499,7 +6488,7 @@ again:
type = btf_type_by_id(btf, id);
if (!type)
return false;
- err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag, NULL);
if (err != WALK_STRUCT)
return false;
@@ -7180,15 +7169,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
struct btf *btf;
int ret;
- btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
- attr->btf_size, attr->btf_log_level,
- u64_to_user_ptr(attr->btf_log_buf),
- attr->btf_log_size);
+ btf = btf_parse(attr, uattr, uattr_size);
if (IS_ERR(btf))
return PTR_ERR(btf);
@@ -7578,6 +7564,108 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
+static int btf_check_iter_kfuncs(struct btf *btf, const char *func_name,
+ const struct btf_type *func, u32 func_flags)
+{
+ u32 flags = func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+ const char *name, *sfx, *iter_name;
+ const struct btf_param *arg;
+ const struct btf_type *t;
+ char exp_name[128];
+ u32 nr_args;
+
+ /* exactly one of KF_ITER_{NEW,NEXT,DESTROY} can be set */
+ if (!flags || (flags & (flags - 1)))
+ return -EINVAL;
+
+ /* any BPF iter kfunc should have `struct bpf_iter_<type> *` first arg */
+ nr_args = btf_type_vlen(func);
+ if (nr_args < 1)
+ return -EINVAL;
+
+ arg = &btf_params(func)[0];
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (!t || !__btf_type_is_struct(t))
+ return -EINVAL;
+
+ name = btf_name_by_offset(btf, t->name_off);
+ if (!name || strncmp(name, ITER_PREFIX, sizeof(ITER_PREFIX) - 1))
+ return -EINVAL;
+
+ /* sizeof(struct bpf_iter_<type>) should be a multiple of 8 to
+ * fit nicely in stack slots
+ */
+ if (t->size == 0 || (t->size % 8))
+ return -EINVAL;
+
+ /* validate bpf_iter_<type>_{new,next,destroy}(struct bpf_iter_<type> *)
+ * naming pattern
+ */
+ iter_name = name + sizeof(ITER_PREFIX) - 1;
+ if (flags & KF_ITER_NEW)
+ sfx = "new";
+ else if (flags & KF_ITER_NEXT)
+ sfx = "next";
+ else /* (flags & KF_ITER_DESTROY) */
+ sfx = "destroy";
+
+ snprintf(exp_name, sizeof(exp_name), "bpf_iter_%s_%s", iter_name, sfx);
+ if (strcmp(func_name, exp_name))
+ return -EINVAL;
+
+ /* only iter constructor should have extra arguments */
+ if (!(flags & KF_ITER_NEW) && nr_args != 1)
+ return -EINVAL;
+
+ if (flags & KF_ITER_NEXT) {
+ /* bpf_iter_<type>_next() should return pointer */
+ t = btf_type_skip_modifiers(btf, func->type, NULL);
+ if (!t || !btf_type_is_ptr(t))
+ return -EINVAL;
+ }
+
+ if (flags & KF_ITER_DESTROY) {
+ /* bpf_iter_<type>_destroy() should return void */
+ t = btf_type_by_id(btf, func->type);
+ if (!t || !btf_type_is_void(t))
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int btf_check_kfunc_protos(struct btf *btf, u32 func_id, u32 func_flags)
+{
+ const struct btf_type *func;
+ const char *func_name;
+ int err;
+
+ /* any kfunc should be FUNC -> FUNC_PROTO */
+ func = btf_type_by_id(btf, func_id);
+ if (!func || !btf_type_is_func(func))
+ return -EINVAL;
+
+ /* sanity check kfunc name */
+ func_name = btf_name_by_offset(btf, func->name_off);
+ if (!func_name || !func_name[0])
+ return -EINVAL;
+
+ func = btf_type_by_id(btf, func->type);
+ if (!func || !btf_type_is_func_proto(func))
+ return -EINVAL;
+
+ if (func_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY)) {
+ err = btf_check_iter_kfuncs(btf, func_name, func, func_flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
/* Kernel Function (kfunc) BTF ID set registration API */
static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
@@ -7705,6 +7793,21 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_TRACING;
case BPF_PROG_TYPE_SYSCALL:
return BTF_KFUNC_HOOK_SYSCALL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ return BTF_KFUNC_HOOK_CGROUP_SKB;
+ case BPF_PROG_TYPE_SCHED_ACT:
+ return BTF_KFUNC_HOOK_SCHED_ACT;
+ case BPF_PROG_TYPE_SK_SKB:
+ return BTF_KFUNC_HOOK_SK_SKB;
+ case BPF_PROG_TYPE_SOCKET_FILTER:
+ return BTF_KFUNC_HOOK_SOCKET_FILTER;
+ case BPF_PROG_TYPE_LWT_OUT:
+ case BPF_PROG_TYPE_LWT_IN:
+ case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_LWT_SEG6LOCAL:
+ return BTF_KFUNC_HOOK_LWT;
+ case BPF_PROG_TYPE_NETFILTER:
+ return BTF_KFUNC_HOOK_NETFILTER;
default:
return BTF_KFUNC_HOOK_MAX;
}
@@ -7741,7 +7844,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
const struct btf_kfunc_id_set *kset)
{
struct btf *btf;
- int ret;
+ int ret, i;
btf = btf_get_module_btf(kset->owner);
if (!btf) {
@@ -7758,7 +7861,15 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
if (IS_ERR(btf))
return PTR_ERR(btf);
+ for (i = 0; i < kset->set->cnt; i++) {
+ ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+ kset->set->pairs[i].flags);
+ if (ret)
+ goto err_out;
+ }
+
ret = btf_populate_kfunc_set(btf, hook, kset->set);
+err_out:
btf_put(btf);
return ret;
}
@@ -8249,12 +8360,10 @@ check_modules:
btf_get(mod_btf);
spin_unlock_bh(&btf_idr_lock);
cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
- if (IS_ERR(cands)) {
- btf_put(mod_btf);
+ btf_put(mod_btf);
+ if (IS_ERR(cands))
return ERR_CAST(cands);
- }
spin_lock_bh(&btf_idr_lock);
- btf_put(mod_btf);
}
spin_unlock_bh(&btf_idr_lock);
/* cands is a pointer to kmalloced memory here if cands->cnt > 0
@@ -8336,16 +8445,15 @@ out:
bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg,
- int off)
+ const char *field_name, u32 btf_id, const char *suffix)
{
struct btf *btf = reg->btf;
const struct btf_type *walk_type, *safe_type;
const char *tname;
char safe_tname[64];
long ret, safe_id;
- const struct btf_member *member, *m_walk = NULL;
+ const struct btf_member *member;
u32 i;
- const char *walk_name;
walk_type = btf_type_by_id(btf, reg->btf_id);
if (!walk_type)
@@ -8353,7 +8461,7 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
tname = btf_name_by_offset(btf, walk_type->name_off);
- ret = snprintf(safe_tname, sizeof(safe_tname), "%s__safe_fields", tname);
+ ret = snprintf(safe_tname, sizeof(safe_tname), "%s%s", tname, suffix);
if (ret < 0)
return false;
@@ -8365,30 +8473,17 @@ bool btf_nested_type_is_trusted(struct bpf_verifier_log *log,
if (!safe_type)
return false;
- for_each_member(i, walk_type, member) {
- u32 moff;
-
- /* We're looking for the PTR_TO_BTF_ID member in the struct
- * type we're walking which matches the specified offset.
- * Below, we'll iterate over the fields in the safe variant of
- * the struct and see if any of them has a matching type /
- * name.
- */
- moff = __btf_member_bit_offset(walk_type, member) / 8;
- if (off == moff) {
- m_walk = member;
- break;
- }
- }
- if (m_walk == NULL)
- return false;
-
- walk_name = __btf_name_by_offset(btf, m_walk->name_off);
for_each_member(i, safe_type, member) {
const char *m_name = __btf_name_by_offset(btf, member->name_off);
+ const struct btf_type *mtype = btf_type_by_id(btf, member->type);
+ u32 id;
+
+ if (!btf_type_is_ptr(mtype))
+ continue;
+ btf_type_skip_modifiers(btf, mtype->type, &id);
/* If we match on both type and name, the field is considered trusted. */
- if (m_walk->type == member->type && !strcmp(walk_name, m_name))
+ if (btf_id == id && !strcmp(field_name, m_name))
return true;
}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index bf2fdb33fb31..517b6a5928cc 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -173,11 +173,11 @@ void bpf_cgroup_atype_put(int cgroup_atype)
{
int i = cgroup_atype - CGROUP_LSM_START;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (--cgroup_lsm_atype[i].refcnt <= 0)
cgroup_lsm_atype[i].attach_btf_id = 0;
WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
#else
static enum cgroup_bpf_attach_type
@@ -282,7 +282,7 @@ static void cgroup_bpf_release(struct work_struct *work)
unsigned int atype;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
struct hlist_head *progs = &cgrp->bpf.progs[atype];
@@ -315,7 +315,7 @@ static void cgroup_bpf_release(struct work_struct *work)
bpf_cgroup_storage_free(storage);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
cgroup_bpf_put(p);
@@ -729,9 +729,9 @@ static int cgroup_bpf_attach(struct cgroup *cgrp,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -831,7 +831,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
cg_link = container_of(link, struct bpf_cgroup_link, link);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/* link might have been auto-released by dying cgroup, so fail */
if (!cg_link->cgroup) {
ret = -ENOLINK;
@@ -843,7 +843,7 @@ static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
}
ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
out_unlock:
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -1009,9 +1009,9 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -1120,9 +1120,9 @@ static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -1189,11 +1189,11 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
if (!cg_link->cgroup)
return;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/* re-check cgroup under lock again */
if (!cg_link->cgroup) {
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return;
}
@@ -1205,7 +1205,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
cg = cg_link->cgroup;
cg_link->cgroup = NULL;
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
cgroup_put(cg);
}
@@ -1232,10 +1232,10 @@ static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
container_of(link, struct bpf_cgroup_link, link);
u64 cg_id = 0;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (cg_link->cgroup)
cg_id = cgroup_id(cg_link->cgroup);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
seq_printf(seq,
"cgroup_id:\t%llu\n"
@@ -1251,10 +1251,10 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
container_of(link, struct bpf_cgroup_link, link);
u64 cg_id = 0;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (cg_link->cgroup)
cg_id = cgroup_id(cg_link->cgroup);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
info->cgroup.cgroup_id = cg_id;
info->cgroup.attach_type = cg_link->type;
@@ -1921,14 +1921,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
if (ret < 0)
goto out;
- if (ctx.optlen > max_optlen || ctx.optlen < 0) {
+ if (optval && (ctx.optlen > max_optlen || ctx.optlen < 0)) {
ret = -EFAULT;
goto out;
}
if (ctx.optlen != 0) {
- if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
- put_user(ctx.optlen, optlen)) {
+ if (optval && copy_to_user(optval, ctx.optval, ctx.optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ if (put_user(ctx.optlen, optlen)) {
ret = -EFAULT;
goto out;
}
@@ -2223,10 +2226,12 @@ static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, ppos));
- *insn++ = BPF_STX_MEM(
- BPF_SIZEOF(u32), treg, si->src_reg,
+ *insn++ = BPF_RAW_INSN(
+ BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32),
+ treg, si->src_reg,
bpf_ctx_narrow_access_offset(
- 0, sizeof(u32), sizeof(loff_t)));
+ 0, sizeof(u32), sizeof(loff_t)),
+ si->imm);
*insn++ = BPF_LDX_MEM(
BPF_DW, treg, si->dst_reg,
offsetof(struct bpf_sysctl_kern, tmp_reg));
@@ -2376,10 +2381,17 @@ static bool cg_sockopt_is_valid_access(int off, int size,
return true;
}
-#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
- T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
- si->dst_reg, si->src_reg, \
- offsetof(struct bpf_sockopt_kern, F))
+#define CG_SOCKOPT_READ_FIELD(F) \
+ BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+#define CG_SOCKOPT_WRITE_FIELD(F) \
+ BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \
+ BPF_MEM | BPF_CLASS(si->code)), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F), \
+ si->imm)
static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
@@ -2391,25 +2403,25 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct bpf_sockopt, sk):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ *insn++ = CG_SOCKOPT_READ_FIELD(sk);
break;
case offsetof(struct bpf_sockopt, level):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(level);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ *insn++ = CG_SOCKOPT_READ_FIELD(level);
break;
case offsetof(struct bpf_sockopt, optname):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optname);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optname);
break;
case offsetof(struct bpf_sockopt, optlen):
if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen);
else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optlen);
break;
case offsetof(struct bpf_sockopt, retval):
BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
@@ -2429,9 +2441,11 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
treg, treg,
offsetof(struct task_struct, bpf_ctx));
- *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
- treg, si->src_reg,
- offsetof(struct bpf_cg_run_ctx, retval));
+ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM |
+ BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval),
+ si->imm);
*insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
offsetof(struct bpf_sockopt_kern, tmp_reg));
} else {
@@ -2447,10 +2461,10 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
}
break;
case offsetof(struct bpf_sockopt, optval):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval);
break;
case offsetof(struct bpf_sockopt, optval_end):
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ *insn++ = CG_SOCKOPT_READ_FIELD(optval_end);
break;
}
@@ -2529,10 +2543,6 @@ cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_pid_tgid_proto;
case BPF_FUNC_get_current_comm:
return &bpf_get_current_comm_proto;
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_get_current_ancestor_cgroup_id:
- return &bpf_get_current_ancestor_cgroup_id_proto;
#ifdef CONFIG_CGROUP_NET_CLASSID
case BPF_FUNC_get_cgroup_classid:
return &bpf_get_cgroup_classid_curr_proto;
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 06989d278846..810378f04fbc 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -58,7 +58,7 @@ static void *cgroup_iter_seq_start(struct seq_file *seq, loff_t *pos)
{
struct cgroup_iter_priv *p = seq->private;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/* cgroup_iter doesn't support read across multiple sessions. */
if (*pos > 0) {
@@ -89,7 +89,7 @@ static void cgroup_iter_seq_stop(struct seq_file *seq, void *v)
{
struct cgroup_iter_priv *p = seq->private;
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
/* pass NULL to the prog for post-processing */
if (!v) {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b297e9f60ca1..7421487422d4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -972,7 +972,7 @@ static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1,
PAGE_SIZE), LONG_MAX);
return 0;
}
@@ -1187,6 +1187,7 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
s16 off = insn->off;
s32 imm = insn->imm;
u8 *addr;
+ int err;
*func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL;
if (!*func_addr_fixed) {
@@ -1201,6 +1202,11 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
addr = (u8 *)prog->aux->func[off]->bpf_func;
else
return -EINVAL;
+ } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ bpf_jit_supports_far_kfunc_call()) {
+ err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr);
+ if (err)
+ return err;
} else {
/* Address of a BPF helper call. Since part of the core
* kernel, it's always at a fixed location. __bpf_call_base
@@ -2732,6 +2738,11 @@ bool __weak bpf_jit_supports_kfunc_call(void)
return false;
}
+bool __weak bpf_jit_supports_far_kfunc_call(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index d2110c1f6fa6..8ec18faa74ac 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -540,7 +540,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
}
}
-static int cpu_map_delete_elem(struct bpf_map *map, void *key)
+static long cpu_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
u32 key_cpu = *(u32 *)key;
@@ -553,8 +553,8 @@ static int cpu_map_delete_elem(struct bpf_map *map, void *key)
return 0;
}
-static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpumap_val cpumap_value = {};
@@ -667,12 +667,21 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
+static long cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
return __bpf_xdp_redirect_map(map, index, flags, 0,
__cpu_map_lookup_elem);
}
+static u64 cpu_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_cpu_map);
+
+ /* Currently the dynamically allocated elements are not counted */
+ usage += (u64)map->max_entries * sizeof(struct bpf_cpu_map_entry *);
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(cpu_map_btf_ids, struct, bpf_cpu_map)
const struct bpf_map_ops cpu_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -683,6 +692,7 @@ const struct bpf_map_ops cpu_map_ops = {
.map_lookup_elem = cpu_map_lookup_elem,
.map_get_next_key = cpu_map_get_next_key,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = cpu_map_mem_usage,
.map_btf_id = &cpu_map_btf_ids[0],
.map_redirect = cpu_map_redirect,
};
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 52b981512a35..7efdf5d770ca 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -9,6 +9,7 @@
/**
* struct bpf_cpumask - refcounted BPF cpumask wrapper structure
* @cpumask: The actual cpumask embedded in the struct.
+ * @rcu: The RCU head used to free the cpumask with RCU safety.
* @usage: Object reference counter. When the refcount goes to 0, the
* memory is released back to the BPF allocator, which provides
* RCU safety.
@@ -24,6 +25,7 @@
*/
struct bpf_cpumask {
cpumask_t cpumask;
+ struct rcu_head rcu;
refcount_t usage;
};
@@ -55,7 +57,7 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void)
/* cpumask must be the first element so struct bpf_cpumask be cast to struct cpumask. */
BUILD_BUG_ON(offsetof(struct bpf_cpumask, cpumask) != 0);
- cpumask = bpf_mem_alloc(&bpf_cpumask_ma, sizeof(*cpumask));
+ cpumask = bpf_mem_cache_alloc(&bpf_cpumask_ma);
if (!cpumask)
return NULL;
@@ -80,32 +82,14 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask)
return cpumask;
}
-/**
- * bpf_cpumask_kptr_get() - Attempt to acquire a reference to a BPF cpumask
- * stored in a map.
- * @cpumaskp: A pointer to a BPF cpumask map value.
- *
- * Attempts to acquire a reference to a BPF cpumask stored in a map value. The
- * cpumask returned by this function must either be embedded in a map as a
- * kptr, or freed with bpf_cpumask_release(). This function may return NULL if
- * no BPF cpumask was found in the specified map value.
- */
-__bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumaskp)
+static void cpumask_free_cb(struct rcu_head *head)
{
struct bpf_cpumask *cpumask;
- /* The BPF memory allocator frees memory backing its caches in an RCU
- * callback. Thus, we can safely use RCU to ensure that the cpumask is
- * safe to read.
- */
- rcu_read_lock();
-
- cpumask = READ_ONCE(*cpumaskp);
- if (cpumask && !refcount_inc_not_zero(&cpumask->usage))
- cpumask = NULL;
-
- rcu_read_unlock();
- return cpumask;
+ cpumask = container_of(head, struct bpf_cpumask, rcu);
+ migrate_disable();
+ bpf_mem_cache_free(&bpf_cpumask_ma, cpumask);
+ migrate_enable();
}
/**
@@ -118,14 +102,8 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_kptr_get(struct bpf_cpumask **cpumas
*/
__bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
{
- if (!cpumask)
- return;
-
- if (refcount_dec_and_test(&cpumask->usage)) {
- migrate_disable();
- bpf_mem_free(&bpf_cpumask_ma, cpumask);
- migrate_enable();
- }
+ if (refcount_dec_and_test(&cpumask->usage))
+ call_rcu(&cpumask->rcu, cpumask_free_cb);
}
/**
@@ -424,29 +402,28 @@ __diag_pop();
BTF_SET8_START(cpumask_kfunc_btf_ids)
BTF_ID_FLAGS(func, bpf_cpumask_create, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cpumask_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_cpumask_first, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_and, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_or, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_full, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_any, KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cpumask_first, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_first_zero, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_set_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_test_and_clear_cpu, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_setall, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_clear, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_and, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_or, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_xor, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_equal, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_intersects, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_subset, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_empty, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_full, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any, KF_RCU)
+BTF_ID_FLAGS(func, bpf_cpumask_any_and, KF_RCU)
BTF_SET8_END(cpumask_kfunc_btf_ids)
static const struct btf_kfunc_id_set cpumask_kfunc_set = {
@@ -468,7 +445,7 @@ static int __init cpumask_kfunc_init(void)
},
};
- ret = bpf_mem_alloc_init(&bpf_cpumask_ma, 0, false);
+ ret = bpf_mem_alloc_init(&bpf_cpumask_ma, sizeof(struct bpf_cpumask), false);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &cpumask_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &cpumask_kfunc_set);
return ret ?: register_btf_id_dtor_kfuncs(cpumask_dtors,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2675fefc6cb6..802692fa3905 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -809,7 +809,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
kfree(dev);
}
-static int dev_map_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -819,12 +819,14 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
return -EINVAL;
old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
- if (old_dev)
+ if (old_dev) {
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
return 0;
}
-static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
@@ -895,8 +897,8 @@ err_out:
return ERR_PTR(-EINVAL);
}
-static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -931,19 +933,21 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
+ else
+ atomic_inc((atomic_t *)&dtab->items);
return 0;
}
-static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
- void *key, void *value, u64 map_flags)
+static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+ void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
@@ -995,27 +999,41 @@ out_err:
return err;
}
-static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return __dev_map_hash_update_elem(current->nsproxy->net_ns,
map, key, value, map_flags);
}
-static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_lookup_elem);
}
-static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
+static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_hash_lookup_elem);
}
+static u64 dev_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ u64 usage = sizeof(struct bpf_dtab);
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+ usage += (u64)dtab->n_buckets * sizeof(struct hlist_head);
+ else
+ usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);
+ usage += atomic_read((atomic_t *)&dtab->items) *
+ (u64)sizeof(struct bpf_dtab_netdev);
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -1026,6 +1044,7 @@ const struct bpf_map_ops dev_map_ops = {
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = dev_map_mem_usage,
.map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_map_redirect,
};
@@ -1039,6 +1058,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_update_elem = dev_map_hash_update_elem,
.map_delete_elem = dev_map_hash_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = dev_map_mem_usage,
.map_btf_id = &dev_map_btf_ids[0],
.map_redirect = dev_hash_map_redirect,
};
@@ -1109,9 +1129,11 @@ static int dev_map_notification(struct notifier_block *notifier,
if (!dev || netdev != dev->dev)
continue;
odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
- if (dev == odev)
+ if (dev == odev) {
call_rcu(&dev->rcu,
__dev_map_entry_free);
+ atomic_dec((atomic_t *)&dtab->items);
+ }
}
}
rcu_read_unlock();
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 5dfcb5ad0d06..00c253b84bf5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -249,7 +249,18 @@ static void htab_free_prealloced_fields(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ cond_resched();
+ }
+ } else {
+ bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
+ cond_resched();
+ }
cond_resched();
}
}
@@ -596,6 +607,8 @@ free_htab:
static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd)
{
+ if (likely(key_len % 4 == 0))
+ return jhash2(key, key_len / 4, hashrnd);
return jhash(key, key_len, hashrnd);
}
@@ -759,9 +772,17 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
static void check_and_free_fields(struct bpf_htab *htab,
struct htab_elem *elem)
{
- void *map_value = elem->key + round_up(htab->map.key_size, 8);
+ if (htab_is_percpu(htab)) {
+ void __percpu *pptr = htab_elem_get_ptr(elem, htab->map.key_size);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ bpf_obj_free_fields(htab->map.record, per_cpu_ptr(pptr, cpu));
+ } else {
+ void *map_value = elem->key + round_up(htab->map.key_size, 8);
- bpf_obj_free_fields(htab->map.record, map_value);
+ bpf_obj_free_fields(htab->map.record, map_value);
+ }
}
/* It is called from the bpf_lru_list when the LRU needs to delete
@@ -858,9 +879,9 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
+ check_and_free_fields(htab, l);
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
- check_and_free_fields(htab, l);
bpf_mem_cache_free(&htab->ma, l);
}
@@ -918,14 +939,13 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
{
if (!onallcpus) {
/* copy true value_size bytes */
- memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+ copy_map_value(&htab->map, this_cpu_ptr(pptr), value);
} else {
u32 size = round_up(htab->map.value_size, 8);
int off = 0, cpu;
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
- value + off, size);
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value + off);
off += size;
}
}
@@ -940,16 +960,14 @@ static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
* (onallcpus=false always when coming from bpf prog).
*/
if (!onallcpus) {
- u32 size = round_up(htab->map.value_size, 8);
int current_cpu = raw_smp_processor_id();
int cpu;
for_each_possible_cpu(cpu) {
if (cpu == current_cpu)
- bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
- size);
- else
- memset(per_cpu_ptr(pptr, cpu), 0, size);
+ copy_map_value_long(&htab->map, per_cpu_ptr(pptr, cpu), value);
+ else /* Since elem is preallocated, we cannot touch special fields */
+ zero_map_value(&htab->map, per_cpu_ptr(pptr, cpu));
}
} else {
pcpu_copy_value(htab, pptr, value, onallcpus);
@@ -1057,8 +1075,8 @@ static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
}
/* Called from syscall or from eBPF program */
-static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1159,8 +1177,8 @@ static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)
bpf_lru_push_free(&htab->lru, &elem->lru_node);
}
-static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old = NULL;
@@ -1226,9 +1244,9 @@ err:
return ret;
}
-static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1281,9 +1299,9 @@ err:
return ret;
}
-static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags,
- bool onallcpus)
+static long __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags,
+ bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
@@ -1348,21 +1366,21 @@ err:
return ret;
}
-static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}
-static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 map_flags)
+static long htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
{
return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
false);
}
/* Called from syscall or from eBPF program */
-static int htab_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1398,7 +1416,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
}
-static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
+static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -1575,9 +1593,8 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
pptr = htab_elem_get_ptr(l, key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(pptr, cpu),
- roundup_value_size);
+ copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, value + off);
off += roundup_value_size;
}
} else {
@@ -1772,8 +1789,8 @@ again_nocopy:
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(dst_val + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(&htab->map, dst_val + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, dst_val + off);
off += size;
}
} else {
@@ -2046,9 +2063,9 @@ static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem)
roundup_value_size = round_up(map->value_size, 8);
pptr = htab_elem_get_ptr(elem, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(info->percpu_value_buf + off,
- per_cpu_ptr(pptr, cpu),
- roundup_value_size);
+ copy_map_value_long(map, info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, info->percpu_value_buf + off);
off += roundup_value_size;
}
ctx.value = info->percpu_value_buf;
@@ -2119,8 +2136,8 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
-static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
- void *callback_ctx, u64 flags)
+static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_nulls_head *head;
@@ -2175,6 +2192,44 @@ out:
return num_elems;
}
+static u64 htab_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ u32 value_size = round_up(htab->map.value_size, 8);
+ bool prealloc = htab_is_prealloc(htab);
+ bool percpu = htab_is_percpu(htab);
+ bool lru = htab_is_lru(htab);
+ u64 num_entries;
+ u64 usage = sizeof(struct bpf_htab);
+
+ usage += sizeof(struct bucket) * htab->n_buckets;
+ usage += sizeof(int) * num_possible_cpus() * HASHTAB_MAP_LOCK_COUNT;
+ if (prealloc) {
+ num_entries = map->max_entries;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ usage += htab->elem_size * num_entries;
+
+ if (percpu)
+ usage += value_size * num_possible_cpus() * num_entries;
+ else if (!lru)
+ usage += sizeof(struct htab_elem *) * num_possible_cpus();
+ } else {
+#define LLIST_NODE_SZ sizeof(struct llist_node)
+
+ num_entries = htab->use_percpu_counter ?
+ percpu_counter_sum(&htab->pcount) :
+ atomic_read(&htab->count);
+ usage += (htab->elem_size + LLIST_NODE_SZ) * num_entries;
+ if (percpu) {
+ usage += (LLIST_NODE_SZ + sizeof(void *)) * num_entries;
+ usage += value_size * num_possible_cpus() * num_entries;
+ }
+ }
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(htab_map_btf_ids, struct, bpf_htab)
const struct bpf_map_ops htab_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -2191,6 +2246,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
.map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
@@ -2212,6 +2268,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru),
.map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
@@ -2292,8 +2349,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
*/
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
- bpf_long_memcpy(value + off,
- per_cpu_ptr(pptr, cpu), size);
+ copy_map_value_long(map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(map, value + off);
off += size;
}
ret = 0;
@@ -2363,6 +2420,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_percpu),
.map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
@@ -2382,6 +2440,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
.map_seq_show_elem = htab_percpu_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
.map_for_each_callback = bpf_for_each_hash_elem,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab_lru_percpu),
.map_btf_id = &htab_map_btf_ids[0],
.iter_seq_info = &iter_seq_info,
@@ -2519,6 +2578,7 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = htab_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = htab_map_mem_usage,
BATCH_OPS(htab),
.map_btf_id = &htab_map_btf_ids[0],
};
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5b278a38ae58..8d368fa353f9 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -18,6 +18,7 @@
#include <linux/pid_namespace.h>
#include <linux/poison.h>
#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
#include <linux/bpf_mem_alloc.h>
@@ -257,7 +258,7 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
goto err_clear;
/* Verifier guarantees that size > 0 */
- strscpy(buf, task->comm, size);
+ strscpy_pad(buf, task->comm, size);
return 0;
err_clear:
memset(buf, 0, size);
@@ -571,7 +572,7 @@ static const struct bpf_func_proto bpf_strncmp_proto = {
.func = bpf_strncmp,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_CONST_STR,
};
@@ -1264,10 +1265,11 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
{
struct bpf_hrtimer *t;
int ret = 0;
+ enum hrtimer_mode mode;
if (in_nmi())
return -EOPNOTSUPP;
- if (flags)
+ if (flags > BPF_F_TIMER_ABS)
return -EINVAL;
__bpf_spin_lock_irqsave(&timer->lock);
t = timer->timer;
@@ -1275,7 +1277,13 @@ BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, fla
ret = -EINVAL;
goto out;
}
- hrtimer_start(&t->timer, ns_to_ktime(nsecs), HRTIMER_MODE_REL_SOFT);
+
+ if (flags & BPF_F_TIMER_ABS)
+ mode = HRTIMER_MODE_ABS_SOFT;
+ else
+ mode = HRTIMER_MODE_REL_SOFT;
+
+ hrtimer_start(&t->timer, ns_to_ktime(nsecs), mode);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
return ret;
@@ -1420,11 +1428,21 @@ static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
return ptr->size & DYNPTR_RDONLY_BIT;
}
+void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr)
+{
+ ptr->size |= DYNPTR_RDONLY_BIT;
+}
+
static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_type type)
{
ptr->size |= type << DYNPTR_TYPE_SHIFT;
}
+static enum bpf_dynptr_type bpf_dynptr_get_type(const struct bpf_dynptr_kern *ptr)
+{
+ return (ptr->size & ~(DYNPTR_RDONLY_BIT)) >> DYNPTR_TYPE_SHIFT;
+}
+
u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
@@ -1497,6 +1515,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
u32, offset, u64, flags)
{
+ enum bpf_dynptr_type type;
int err;
if (!src->data || flags)
@@ -1506,13 +1525,25 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern
if (err)
return err;
- /* Source and destination may possibly overlap, hence use memmove to
- * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
- * pointing to overlapping PTR_TO_MAP_VALUE regions.
- */
- memmove(dst, src->data + src->offset + offset, len);
+ type = bpf_dynptr_get_type(src);
- return 0;
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_load_bytes(src->data, src->offset + offset, dst, len);
+ case BPF_DYNPTR_TYPE_XDP:
+ return __bpf_xdp_load_bytes(src->data, src->offset + offset, dst, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_read: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
}
static const struct bpf_func_proto bpf_dynptr_read_proto = {
@@ -1529,22 +1560,40 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
u32, len, u64, flags)
{
+ enum bpf_dynptr_type type;
int err;
- if (!dst->data || flags || bpf_dynptr_is_rdonly(dst))
+ if (!dst->data || bpf_dynptr_is_rdonly(dst))
return -EINVAL;
err = bpf_dynptr_check_off_len(dst, offset, len);
if (err)
return err;
- /* Source and destination may possibly overlap, hence use memmove to
- * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
- * pointing to overlapping PTR_TO_MAP_VALUE regions.
- */
- memmove(dst->data + dst->offset + offset, src, len);
+ type = bpf_dynptr_get_type(dst);
- return 0;
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ if (flags)
+ return -EINVAL;
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
+ return 0;
+ case BPF_DYNPTR_TYPE_SKB:
+ return __bpf_skb_store_bytes(dst->data, dst->offset + offset, src, len,
+ flags);
+ case BPF_DYNPTR_TYPE_XDP:
+ if (flags)
+ return -EINVAL;
+ return __bpf_xdp_store_bytes(dst->data, dst->offset + offset, src, len);
+ default:
+ WARN_ONCE(true, "bpf_dynptr_write: unknown dynptr type %d\n", type);
+ return -EFAULT;
+ }
}
static const struct bpf_func_proto bpf_dynptr_write_proto = {
@@ -1560,6 +1609,7 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
{
+ enum bpf_dynptr_type type;
int err;
if (!ptr->data)
@@ -1572,7 +1622,20 @@ BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u3
if (bpf_dynptr_is_rdonly(ptr))
return 0;
- return (unsigned long)(ptr->data + ptr->offset + offset);
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return (unsigned long)(ptr->data + ptr->offset + offset);
+ case BPF_DYNPTR_TYPE_SKB:
+ case BPF_DYNPTR_TYPE_XDP:
+ /* skb and xdp dynptrs should use bpf_dynptr_slice / bpf_dynptr_slice_rdwr */
+ return 0;
+ default:
+ WARN_ONCE(true, "bpf_dynptr_data: unknown dynptr type %d\n", type);
+ return 0;
+ }
}
static const struct bpf_func_proto bpf_dynptr_data_proto = {
@@ -1693,6 +1756,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_cgrp_storage_get_proto;
case BPF_FUNC_cgrp_storage_delete:
return &bpf_cgrp_storage_delete_proto;
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
#endif
default:
break;
@@ -1731,6 +1798,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
}
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
void bpf_list_head_free(const struct btf_field *field, void *list_head,
struct bpf_spin_lock *spin_lock)
{
@@ -1761,13 +1830,8 @@ unlock:
/* The contained type can also have resources, including a
* bpf_list_head which needs to be freed.
*/
- bpf_obj_free_fields(field->graph_root.value_rec, obj);
- /* bpf_mem_free requires migrate_disable(), since we can be
- * called from map free path as well apart from BPF program (as
- * part of map ops doing bpf_obj_free_fields).
- */
migrate_disable();
- bpf_mem_free(&bpf_global_ma, obj);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
migrate_enable();
}
}
@@ -1804,10 +1868,9 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
obj = pos;
obj -= field->graph_root.node_offset;
- bpf_obj_free_fields(field->graph_root.value_rec, obj);
migrate_disable();
- bpf_mem_free(&bpf_global_ma, obj);
+ __bpf_obj_drop_impl(obj, field->graph_root.value_rec);
migrate_enable();
}
}
@@ -1826,45 +1889,96 @@ __bpf_kfunc void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
if (!p)
return NULL;
if (meta)
- bpf_obj_init(meta->field_offs, p);
+ bpf_obj_init(meta->record, p);
return p;
}
+/* Must be called under migrate_disable(), as required by bpf_mem_free */
+void __bpf_obj_drop_impl(void *p, const struct btf_record *rec)
+{
+ if (rec && rec->refcount_off >= 0 &&
+ !refcount_dec_and_test((refcount_t *)(p + rec->refcount_off))) {
+ /* Object is refcounted and refcount_dec didn't result in 0
+ * refcount. Return without freeing the object
+ */
+ return;
+ }
+
+ if (rec)
+ bpf_obj_free_fields(rec, p);
+ bpf_mem_free(&bpf_global_ma, p);
+}
+
__bpf_kfunc void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
{
struct btf_struct_meta *meta = meta__ign;
void *p = p__alloc;
- if (meta)
- bpf_obj_free_fields(meta->record, p);
- bpf_mem_free(&bpf_global_ma, p);
+ __bpf_obj_drop_impl(p, meta ? meta->record : NULL);
}
-static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
+__bpf_kfunc void *bpf_refcount_acquire_impl(void *p__refcounted_kptr, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ struct bpf_refcount *ref;
+
+ /* Could just cast directly to refcount_t *, but need some code using
+ * bpf_refcount type so that it is emitted in vmlinux BTF
+ */
+ ref = (struct bpf_refcount *)(p__refcounted_kptr + meta->record->refcount_off);
+
+ refcount_inc((refcount_t *)ref);
+ return (void *)p__refcounted_kptr;
+}
+
+static int __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head,
+ bool tail, struct btf_record *rec, u64 off)
{
struct list_head *n = (void *)node, *h = (void *)head;
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
if (unlikely(!h->next))
INIT_LIST_HEAD(h);
- if (unlikely(!n->next))
- INIT_LIST_HEAD(n);
+ if (!list_empty(n)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl(n - off, rec);
+ return -EINVAL;
+ }
+
tail ? list_add_tail(n, h) : list_add(n, h);
+
+ return 0;
}
-__bpf_kfunc void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc int bpf_list_push_front_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
{
- return __bpf_list_add(node, head, false);
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(node, head, false,
+ meta ? meta->record : NULL, off);
}
-__bpf_kfunc void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
+__bpf_kfunc int bpf_list_push_back_impl(struct bpf_list_head *head,
+ struct bpf_list_node *node,
+ void *meta__ign, u64 off)
{
- return __bpf_list_add(node, head, true);
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_list_add(node, head, true,
+ meta ? meta->record : NULL, off);
}
static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
{
struct list_head *n, *h = (void *)head;
+ /* If list_head was 0-initialized by map, bpf_obj_init_field wasn't
+ * called on its fields, so init here
+ */
if (unlikely(!h->next))
INIT_LIST_HEAD(h);
if (list_empty(h))
@@ -1890,6 +2004,9 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
struct rb_root_cached *r = (struct rb_root_cached *)root;
struct rb_node *n = (struct rb_node *)node;
+ if (RB_EMPTY_NODE(n))
+ return NULL;
+
rb_erase_cached(n, r);
RB_CLEAR_NODE(n);
return (struct bpf_rb_node *)n;
@@ -1898,14 +2015,20 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
/* Need to copy rbtree_add_cached's logic here because our 'less' is a BPF
* program
*/
-static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
- void *less)
+static int __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ void *less, struct btf_record *rec, u64 off)
{
struct rb_node **link = &((struct rb_root_cached *)root)->rb_root.rb_node;
+ struct rb_node *parent = NULL, *n = (struct rb_node *)node;
bpf_callback_t cb = (bpf_callback_t)less;
- struct rb_node *parent = NULL;
bool leftmost = true;
+ if (!RB_EMPTY_NODE(n)) {
+ /* Only called from BPF prog, no need to migrate_disable */
+ __bpf_obj_drop_impl(n - off, rec);
+ return -EINVAL;
+ }
+
while (*link) {
parent = *link;
if (cb((uintptr_t)node, (uintptr_t)parent, 0, 0, 0)) {
@@ -1916,15 +2039,18 @@ static void __bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
}
}
- rb_link_node((struct rb_node *)node, parent, link);
- rb_insert_color_cached((struct rb_node *)node,
- (struct rb_root_cached *)root, leftmost);
+ rb_link_node(n, parent, link);
+ rb_insert_color_cached(n, (struct rb_root_cached *)root, leftmost);
+ return 0;
}
-__bpf_kfunc void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
- bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b))
+__bpf_kfunc int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ void *meta__ign, u64 off)
{
- __bpf_rbtree_add(root, node, (void *)less);
+ struct btf_struct_meta *meta = meta__ign;
+
+ return __bpf_rbtree_add(root, node, (void *)less, meta ? meta->record : NULL, off);
}
__bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
@@ -1942,73 +2068,8 @@ __bpf_kfunc struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root)
*/
__bpf_kfunc struct task_struct *bpf_task_acquire(struct task_struct *p)
{
- return get_task_struct(p);
-}
-
-/**
- * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task
- * acquired by this kfunc which is not stored in a map as a kptr, must be
- * released by calling bpf_task_release().
- * @p: The task on which a reference is being acquired.
- */
-__bpf_kfunc struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
-{
- /* For the time being this function returns NULL, as it's not currently
- * possible to safely acquire a reference to a task with RCU protection
- * using get_task_struct() and put_task_struct(). This is due to the
- * slightly odd mechanics of p->rcu_users, and how task RCU protection
- * works.
- *
- * A struct task_struct is refcounted by two different refcount_t
- * fields:
- *
- * 1. p->usage: The "true" refcount field which tracks a task's
- * lifetime. The task is freed as soon as this
- * refcount drops to 0.
- *
- * 2. p->rcu_users: An "RCU users" refcount field which is statically
- * initialized to 2, and is co-located in a union with
- * a struct rcu_head field (p->rcu). p->rcu_users
- * essentially encapsulates a single p->usage
- * refcount, and when p->rcu_users goes to 0, an RCU
- * callback is scheduled on the struct rcu_head which
- * decrements the p->usage refcount.
- *
- * There are two important implications to this task refcounting logic
- * described above. The first is that
- * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as
- * after the refcount goes to 0, the RCU callback being scheduled will
- * cause the memory backing the refcount to again be nonzero due to the
- * fields sharing a union. The other is that we can't rely on RCU to
- * guarantee that a task is valid in a BPF program. This is because a
- * task could have already transitioned to being in the TASK_DEAD
- * state, had its rcu_users refcount go to 0, and its rcu callback
- * invoked in which it drops its single p->usage reference. At this
- * point the task will be freed as soon as the last p->usage reference
- * goes to 0, without waiting for another RCU gp to elapse. The only
- * way that a BPF program can guarantee that a task is valid is in this
- * scenario is to hold a p->usage refcount itself.
- *
- * Until we're able to resolve this issue, either by pulling
- * p->rcu_users and p->rcu out of the union, or by getting rid of
- * p->usage and just using p->rcu_users for refcounting, we'll just
- * return NULL here.
- */
- return NULL;
-}
-
-/**
- * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task
- * kptr acquired by this kfunc which is not subsequently stored in a map, must
- * be released by calling bpf_task_release().
- * @pp: A pointer to a task kptr on which a reference is being acquired.
- */
-__bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
-{
- /* We must return NULL here until we have clarity on how to properly
- * leverage RCU for ensuring a task's lifetime. See the comment above
- * in bpf_task_acquire_not_zero() for more details.
- */
+ if (refcount_inc_not_zero(&p->rcu_users))
+ return p;
return NULL;
}
@@ -2018,10 +2079,7 @@ __bpf_kfunc struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
*/
__bpf_kfunc void bpf_task_release(struct task_struct *p)
{
- if (!p)
- return;
-
- put_task_struct(p);
+ put_task_struct_rcu_user(p);
}
#ifdef CONFIG_CGROUPS
@@ -2033,39 +2091,7 @@ __bpf_kfunc void bpf_task_release(struct task_struct *p)
*/
__bpf_kfunc struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
{
- cgroup_get(cgrp);
- return cgrp;
-}
-
-/**
- * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup
- * kptr acquired by this kfunc which is not subsequently stored in a map, must
- * be released by calling bpf_cgroup_release().
- * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
- */
-__bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
-{
- struct cgroup *cgrp;
-
- rcu_read_lock();
- /* Another context could remove the cgroup from the map and release it
- * at any time, including after we've done the lookup above. This is
- * safe because we're in an RCU read region, so the cgroup is
- * guaranteed to remain valid until at least the rcu_read_unlock()
- * below.
- */
- cgrp = READ_ONCE(*cgrpp);
-
- if (cgrp && !cgroup_tryget(cgrp))
- /* If the cgroup had been removed from the map and freed as
- * described above, cgroup_tryget() will return false. The
- * cgroup will be freed at some point after the current RCU gp
- * has ended, so just return NULL to the user.
- */
- cgrp = NULL;
- rcu_read_unlock();
-
- return cgrp;
+ return cgroup_tryget(cgrp) ? cgrp : NULL;
}
/**
@@ -2077,9 +2103,6 @@ __bpf_kfunc struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
*/
__bpf_kfunc void bpf_cgroup_release(struct cgroup *cgrp)
{
- if (!cgrp)
- return;
-
cgroup_put(cgrp);
}
@@ -2097,10 +2120,28 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
if (level > cgrp->level || level < 0)
return NULL;
+ /* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
ancestor = cgrp->ancestors[level];
- cgroup_get(ancestor);
+ if (!cgroup_tryget(ancestor))
+ return NULL;
return ancestor;
}
+
+/**
+ * bpf_cgroup_from_id - Find a cgroup from its ID. A cgroup returned by this
+ * kfunc which is not subsequently stored in a map, must be released by calling
+ * bpf_cgroup_release().
+ * @cgid: cgroup id.
+ */
+__bpf_kfunc struct cgroup *bpf_cgroup_from_id(u64 cgid)
+{
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_get_from_id(cgid);
+ if (IS_ERR(cgrp))
+ return NULL;
+ return cgrp;
+}
#endif /* CONFIG_CGROUPS */
/**
@@ -2116,12 +2157,146 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
rcu_read_lock();
p = find_task_by_pid_ns(pid, &init_pid_ns);
if (p)
- bpf_task_acquire(p);
+ p = bpf_task_acquire(p);
rcu_read_unlock();
return p;
}
+/**
+ * bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * If the intention is to write to the data slice, please use
+ * bpf_dynptr_slice_rdwr.
+ *
+ * The user must check that the returned pointer is not null before using it.
+ *
+ * Please note that in the case of skb and xdp dynptrs, bpf_dynptr_slice
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a read-only
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer, u32 buffer__szk)
+{
+ enum bpf_dynptr_type type;
+ u32 len = buffer__szk;
+ int err;
+
+ if (!ptr->data)
+ return NULL;
+
+ err = bpf_dynptr_check_off_len(ptr, offset, len);
+ if (err)
+ return NULL;
+
+ type = bpf_dynptr_get_type(ptr);
+
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return ptr->data + ptr->offset + offset;
+ case BPF_DYNPTR_TYPE_SKB:
+ return skb_header_pointer(ptr->data, ptr->offset + offset, len, buffer);
+ case BPF_DYNPTR_TYPE_XDP:
+ {
+ void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset + offset, len);
+ if (xdp_ptr)
+ return xdp_ptr;
+
+ bpf_xdp_copy_buf(ptr->data, ptr->offset + offset, buffer, len, false);
+ return buffer;
+ }
+ default:
+ WARN_ONCE(true, "unknown dynptr type %d\n", type);
+ return NULL;
+ }
+}
+
+/**
+ * bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
+ * @ptr: The dynptr whose data slice to retrieve
+ * @offset: Offset into the dynptr
+ * @buffer: User-provided buffer to copy contents into
+ * @buffer__szk: Size (in bytes) of the buffer. This is the length of the
+ * requested slice. This must be a constant.
+ *
+ * For non-skb and non-xdp type dynptrs, there is no difference between
+ * bpf_dynptr_slice and bpf_dynptr_data.
+ *
+ * The returned pointer is writable and may point to either directly the dynptr
+ * data at the requested offset or to the buffer if unable to obtain a direct
+ * data pointer to (example: the requested slice is to the paged area of an skb
+ * packet). In the case where the returned pointer is to the buffer, the user
+ * is responsible for persisting writes through calling bpf_dynptr_write(). This
+ * usually looks something like this pattern:
+ *
+ * struct eth_hdr *eth = bpf_dynptr_slice_rdwr(&dynptr, 0, buffer, sizeof(buffer));
+ * if (!eth)
+ * return TC_ACT_SHOT;
+ *
+ * // mutate eth header //
+ *
+ * if (eth == buffer)
+ * bpf_dynptr_write(&ptr, 0, buffer, sizeof(buffer), 0);
+ *
+ * Please note that, as in the example above, the user must check that the
+ * returned pointer is not null before using it.
+ *
+ * Please also note that in the case of skb and xdp dynptrs, bpf_dynptr_slice_rdwr
+ * does not change the underlying packet data pointers, so a call to
+ * bpf_dynptr_slice_rdwr will not invalidate any ctx->data/data_end pointers in
+ * the bpf program.
+ *
+ * Return: NULL if the call failed (eg invalid dynptr), pointer to a
+ * data slice (can be either direct pointer to the data or a pointer to the user
+ * provided buffer, with its contents containing the data, if unable to obtain
+ * direct pointer)
+ */
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+ void *buffer, u32 buffer__szk)
+{
+ if (!ptr->data || bpf_dynptr_is_rdonly(ptr))
+ return NULL;
+
+ /* bpf_dynptr_slice_rdwr is the same logic as bpf_dynptr_slice.
+ *
+ * For skb-type dynptrs, it is safe to write into the returned pointer
+ * if the bpf program allows skb data writes. There are two possiblities
+ * that may occur when calling bpf_dynptr_slice_rdwr:
+ *
+ * 1) The requested slice is in the head of the skb. In this case, the
+ * returned pointer is directly to skb data, and if the skb is cloned, the
+ * verifier will have uncloned it (see bpf_unclone_prologue()) already.
+ * The pointer can be directly written into.
+ *
+ * 2) Some portion of the requested slice is in the paged buffer area.
+ * In this case, the requested data will be copied out into the buffer
+ * and the returned pointer will be a pointer to the buffer. The skb
+ * will not be pulled. To persist the write, the user will need to call
+ * bpf_dynptr_write(), which will pull the skb and commit the write.
+ *
+ * Similarly for xdp programs, if the requested slice is not across xdp
+ * fragments, then a direct pointer will be returned, otherwise the data
+ * will be copied out into the buffer and the user will need to call
+ * bpf_dynptr_write() to commit changes.
+ */
+ return bpf_dynptr_slice(ptr, offset, buffer, buffer__szk);
+}
+
__bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj)
{
return obj;
@@ -2150,23 +2325,22 @@ BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_list_push_front)
-BTF_ID_FLAGS(func, bpf_list_push_back)
+BTF_ID_FLAGS(func, bpf_refcount_acquire_impl, KF_ACQUIRE)
+BTF_ID_FLAGS(func, bpf_list_push_front_impl)
+BTF_ID_FLAGS(func, bpf_list_push_back_impl)
BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE)
-BTF_ID_FLAGS(func, bpf_rbtree_add)
+BTF_ID_FLAGS(func, bpf_rbtree_remove, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_rbtree_add_impl)
BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
#ifdef CONFIG_CGROUPS
-BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
-BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
-BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_SET8_END(generic_btf_ids)
@@ -2190,6 +2364,11 @@ BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
BTF_ID_FLAGS(func, bpf_rdonly_cast)
BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_dynptr_slice_rdwr, KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_num_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_num_destroy, KF_ITER_DESTROY)
BTF_SET8_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index e90d9f63edc5..a04f505aefe9 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -141,8 +141,8 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key)
return &READ_ONCE(storage->buf)->data[0];
}
-static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long cgroup_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
struct bpf_cgroup_storage *storage;
struct bpf_storage_buffer *new;
@@ -333,14 +333,14 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
struct list_head *storages = &map->list;
struct bpf_cgroup_storage *storage, *stmp;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
list_for_each_entry_safe(storage, stmp, storages, list_map) {
bpf_cgroup_storage_unlink(storage);
bpf_cgroup_storage_free(storage);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
WARN_ON(!RB_EMPTY_ROOT(&map->root));
WARN_ON(!list_empty(&map->list));
@@ -348,7 +348,7 @@ static void cgroup_storage_map_free(struct bpf_map *_map)
bpf_map_area_free(map);
}
-static int cgroup_storage_delete_elem(struct bpf_map *map, void *key)
+static long cgroup_storage_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -446,6 +446,12 @@ static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key,
rcu_read_unlock();
}
+static u64 cgroup_storage_map_usage(const struct bpf_map *map)
+{
+ /* Currently the dynamically allocated elements are not counted. */
+ return sizeof(struct bpf_cgroup_storage_map);
+}
+
BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct,
bpf_cgroup_storage_map)
const struct bpf_map_ops cgroup_storage_map_ops = {
@@ -457,6 +463,7 @@ const struct bpf_map_ops cgroup_storage_map_ops = {
.map_delete_elem = cgroup_storage_delete_elem,
.map_check_btf = cgroup_storage_check_btf,
.map_seq_show_elem = cgroup_storage_seq_show_elem,
+ .map_mem_usage = cgroup_storage_map_usage,
.map_btf_id = &cgroup_storage_map_btf_ids[0],
};
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
new file mode 100644
index 000000000000..046ddff37a76
--- /dev/null
+++ b/kernel/bpf/log.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+#include <uapi/linux/btf.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/math64.h>
+
+static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log)
+{
+ /* ubuf and len_total should both be specified (or not) together */
+ if (!!log->ubuf != !!log->len_total)
+ return false;
+ /* log buf without log_level is meaningless */
+ if (log->ubuf && log->level == 0)
+ return false;
+ if (log->level & ~BPF_LOG_MASK)
+ return false;
+ if (log->len_total > UINT_MAX >> 2)
+ return false;
+ return true;
+}
+
+int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level,
+ char __user *log_buf, u32 log_size)
+{
+ log->level = log_level;
+ log->ubuf = log_buf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (!bpf_verifier_log_attr_valid(log))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len)
+{
+ /* add_len includes terminal \0, so no need for +1. */
+ u64 len = log->end_pos + add_len;
+
+ /* log->len_max could be larger than our current len due to
+ * bpf_vlog_reset() calls, so we maintain the max of any length at any
+ * previous point
+ */
+ if (len > UINT_MAX)
+ log->len_max = UINT_MAX;
+ else if (len > log->len_max)
+ log->len_max = len;
+}
+
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+ va_list args)
+{
+ u64 cur_pos;
+ u32 new_n, n;
+
+ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
+
+ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+ "verifier log line truncated - local buffer too short\n");
+
+ if (log->level == BPF_LOG_KERNEL) {
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
+ return;
+ }
+
+ n += 1; /* include terminating zero */
+ bpf_vlog_update_len_max(log, n);
+
+ if (log->level & BPF_LOG_FIXED) {
+ /* check if we have at least something to put into user buf */
+ new_n = 0;
+ if (log->end_pos < log->len_total) {
+ new_n = min_t(u32, log->len_total - log->end_pos, n);
+ log->kbuf[new_n - 1] = '\0';
+ }
+
+ cur_pos = log->end_pos;
+ log->end_pos += n - 1; /* don't count terminating '\0' */
+
+ if (log->ubuf && new_n &&
+ copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n))
+ goto fail;
+ } else {
+ u64 new_end, new_start;
+ u32 buf_start, buf_end, new_n;
+
+ new_end = log->end_pos + n;
+ if (new_end - log->start_pos >= log->len_total)
+ new_start = new_end - log->len_total;
+ else
+ new_start = log->start_pos;
+
+ log->start_pos = new_start;
+ log->end_pos = new_end - 1; /* don't count terminating '\0' */
+
+ if (!log->ubuf)
+ return;
+
+ new_n = min(n, log->len_total);
+ cur_pos = new_end - new_n;
+ div_u64_rem(cur_pos, log->len_total, &buf_start);
+ div_u64_rem(new_end, log->len_total, &buf_end);
+ /* new_end and buf_end are exclusive indices, so if buf_end is
+ * exactly zero, then it actually points right to the end of
+ * ubuf and there is no wrap around
+ */
+ if (buf_end == 0)
+ buf_end = log->len_total;
+
+ /* if buf_start > buf_end, we wrapped around;
+ * if buf_start == buf_end, then we fill ubuf completely; we
+ * can't have buf_start == buf_end to mean that there is
+ * nothing to write, because we always write at least
+ * something, even if terminal '\0'
+ */
+ if (buf_start < buf_end) {
+ /* message fits within contiguous chunk of ubuf */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ buf_end - buf_start))
+ goto fail;
+ } else {
+ /* message wraps around the end of ubuf, copy in two chunks */
+ if (copy_to_user(log->ubuf + buf_start,
+ log->kbuf + n - new_n,
+ log->len_total - buf_start))
+ goto fail;
+ if (copy_to_user(log->ubuf,
+ log->kbuf + n - buf_end,
+ buf_end))
+ goto fail;
+ }
+ }
+
+ return;
+fail:
+ log->ubuf = NULL;
+}
+
+void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos)
+{
+ char zero = 0;
+ u32 pos;
+
+ if (WARN_ON_ONCE(new_pos > log->end_pos))
+ return;
+
+ if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL)
+ return;
+
+ /* if position to which we reset is beyond current log window,
+ * then we didn't preserve any useful content and should adjust
+ * start_pos to end up with an empty log (start_pos == end_pos)
+ */
+ log->end_pos = new_pos;
+ if (log->end_pos < log->start_pos)
+ log->start_pos = log->end_pos;
+
+ if (!log->ubuf)
+ return;
+
+ if (log->level & BPF_LOG_FIXED)
+ pos = log->end_pos + 1;
+ else
+ div_u64_rem(new_pos, log->len_total, &pos);
+
+ if (pos < log->len_total && put_user(zero, log->ubuf + pos))
+ log->ubuf = NULL;
+}
+
+static void bpf_vlog_reverse_kbuf(char *buf, int len)
+{
+ int i, j;
+
+ for (i = 0, j = len - 1; i < j; i++, j--)
+ swap(buf[i], buf[j]);
+}
+
+static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end)
+{
+ /* we split log->kbuf into two equal parts for both ends of array */
+ int n = sizeof(log->kbuf) / 2, nn;
+ char *lbuf = log->kbuf, *rbuf = log->kbuf + n;
+
+ /* Read ubuf's section [start, end) two chunks at a time, from left
+ * and right side; within each chunk, swap all the bytes; after that
+ * reverse the order of lbuf and rbuf and write result back to ubuf.
+ * This way we'll end up with swapped contents of specified
+ * [start, end) ubuf segment.
+ */
+ while (end - start > 1) {
+ nn = min(n, (end - start ) / 2);
+
+ if (copy_from_user(lbuf, log->ubuf + start, nn))
+ return -EFAULT;
+ if (copy_from_user(rbuf, log->ubuf + end - nn, nn))
+ return -EFAULT;
+
+ bpf_vlog_reverse_kbuf(lbuf, nn);
+ bpf_vlog_reverse_kbuf(rbuf, nn);
+
+ /* we write lbuf to the right end of ubuf, while rbuf to the
+ * left one to end up with properly reversed overall ubuf
+ */
+ if (copy_to_user(log->ubuf + start, rbuf, nn))
+ return -EFAULT;
+ if (copy_to_user(log->ubuf + end - nn, lbuf, nn))
+ return -EFAULT;
+
+ start += nn;
+ end -= nn;
+ }
+
+ return 0;
+}
+
+int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual)
+{
+ u32 sublen;
+ int err;
+
+ *log_size_actual = 0;
+ if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL)
+ return 0;
+
+ if (!log->ubuf)
+ goto skip_log_rotate;
+ /* If we never truncated log, there is nothing to move around. */
+ if (log->start_pos == 0)
+ goto skip_log_rotate;
+
+ /* Otherwise we need to rotate log contents to make it start from the
+ * buffer beginning and be a continuous zero-terminated string. Note
+ * that if log->start_pos != 0 then we definitely filled up entire log
+ * buffer with no gaps, and we just need to shift buffer contents to
+ * the left by (log->start_pos % log->len_total) bytes.
+ *
+ * Unfortunately, user buffer could be huge and we don't want to
+ * allocate temporary kernel memory of the same size just to shift
+ * contents in a straightforward fashion. Instead, we'll be clever and
+ * do in-place array rotation. This is a leetcode-style problem, which
+ * could be solved by three rotations.
+ *
+ * Let's say we have log buffer that has to be shifted left by 7 bytes
+ * (spaces and vertical bar is just for demonstrative purposes):
+ * E F G H I J K | A B C D
+ *
+ * First, we reverse entire array:
+ * D C B A | K J I H G F E
+ *
+ * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes
+ * (KJIHGFE), resulting in a properly rotated array:
+ * A B C D | E F G H I J K
+ *
+ * We'll utilize log->kbuf to read user memory chunk by chunk, swap
+ * bytes, and write them back. Doing it byte-by-byte would be
+ * unnecessarily inefficient. Altogether we are going to read and
+ * write each byte twice, for total 4 memory copies between kernel and
+ * user space.
+ */
+
+ /* length of the chopped off part that will be the beginning;
+ * len(ABCD) in the example above
+ */
+ div_u64_rem(log->start_pos, log->len_total, &sublen);
+ sublen = log->len_total - sublen;
+
+ err = bpf_vlog_reverse_ubuf(log, 0, log->len_total);
+ err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen);
+ err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total);
+ if (err)
+ log->ubuf = NULL;
+
+skip_log_rotate:
+ *log_size_actual = log->len_max;
+
+ /* properly initialized log has either both ubuf!=NULL and len_total>0
+ * or ubuf==NULL and len_total==0, so if this condition doesn't hold,
+ * we got a fault somewhere along the way, so report it back
+ */
+ if (!!log->ubuf != !!log->len_total)
+ return -EFAULT;
+
+ /* did truncation actually happen? */
+ if (log->ubuf && log->len_max > log->len_total)
+ return -ENOSPC;
+
+ return 0;
+}
+
+/* log_level controls verbosity level of eBPF verifier.
+ * bpf_verifier_log_write() is used to dump the verification trace to the log,
+ * so the user can figure out what's wrong with the program
+ */
+__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(&env->log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(&env->log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
+
+__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+EXPORT_SYMBOL_GPL(bpf_log);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index d833496e9e42..e0d3ddf2037a 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -300,8 +300,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
}
/* Called from syscall or from eBPF program */
-static int trie_update_elem(struct bpf_map *map,
- void *_key, void *value, u64 flags)
+static long trie_update_elem(struct bpf_map *map,
+ void *_key, void *value, u64 flags)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
@@ -431,7 +431,7 @@ out:
}
/* Called from syscall or from eBPF program */
-static int trie_delete_elem(struct bpf_map *map, void *_key)
+static long trie_delete_elem(struct bpf_map *map, void *_key)
{
struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
struct bpf_lpm_trie_key *key = _key;
@@ -720,6 +720,16 @@ static int trie_check_btf(const struct bpf_map *map,
-EINVAL : 0;
}
+static u64 trie_mem_usage(const struct bpf_map *map)
+{
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ u64 elem_size;
+
+ elem_size = sizeof(struct lpm_trie_node) + trie->data_size +
+ trie->map.value_size;
+ return elem_size * READ_ONCE(trie->n_entries);
+}
+
BTF_ID_LIST_SINGLE(trie_map_btf_ids, struct, lpm_trie)
const struct bpf_map_ops trie_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -733,5 +743,6 @@ const struct bpf_map_ops trie_map_ops = {
.map_update_batch = generic_map_update_batch,
.map_delete_batch = generic_map_delete_batch,
.map_check_btf = trie_check_btf,
+ .map_mem_usage = trie_mem_usage,
.map_btf_id = &trie_map_btf_ids[0],
};
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 38136ec4e095..2c5c64c2a53b 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -56,18 +56,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
ret = PTR_ERR(inner_map_meta->record);
goto free;
}
- if (inner_map_meta->record) {
- struct btf_field_offs *field_offs;
- /* If btf_record is !IS_ERR_OR_NULL, then field_offs is always
- * valid.
- */
- field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN);
- if (!field_offs) {
- ret = -ENOMEM;
- goto free_rec;
- }
- inner_map_meta->field_offs = field_offs;
- }
/* Note: We must use the same BTF, as we also used btf_record_dup above
* which relies on BTF being same for both maps, as some members like
* record->fields.list_head have pointers like value_rec pointing into
@@ -88,8 +76,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
fdput(f);
return inner_map_meta;
-free_rec:
- btf_record_free(inner_map_meta->record);
free:
kfree(inner_map_meta);
put:
@@ -99,7 +85,6 @@ put:
void bpf_map_meta_free(struct bpf_map *map_meta)
{
- kfree(map_meta->field_offs);
bpf_map_free_record(map_meta);
btf_put(map_meta->btf);
kfree(map_meta);
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 5fcdacbb8439..410637c225fb 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
return entry;
}
-static void *__alloc(struct bpf_mem_cache *c, int node)
+static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
- /* Allocate, but don't deplete atomic reserves that typical
- * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
- * will allocate from the current numa node which is what we
- * want here.
- */
- gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
-
if (c->percpu_size) {
void **obj = kmalloc_node(c->percpu_size, flags, node);
void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
@@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
*/
obj = __llist_del_first(&c->free_by_rcu);
if (!obj) {
- obj = __alloc(c, node);
+ /* Allocate, but don't deplete atomic reserves that typical
+ * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
+ * will allocate from the current numa node which is what we
+ * want here.
+ */
+ obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
if (!obj)
break;
}
@@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
unit_free(this_cpu_ptr(ma->cache), ptr);
}
+
+/* Directly does a kfree() without putting 'ptr' back to the free_llist
+ * for reuse and without waiting for a rcu_tasks_trace gp.
+ * The caller must first go through the rcu_tasks_trace gp for 'ptr'
+ * before calling bpf_mem_cache_raw_free().
+ * It could be used when the rcu_tasks_trace callback does not have
+ * a hold on the original bpf_mem_alloc object that allocated the
+ * 'ptr'. This should only be used in the uncommon code path.
+ * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
+ * and may affect performance.
+ */
+void bpf_mem_cache_raw_free(void *ptr)
+{
+ if (!ptr)
+ return;
+
+ kfree(ptr - LLIST_NODE_SZ);
+}
+
+/* When flags == GFP_KERNEL, it signals that the caller will not cause
+ * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
+ * kmalloc if the free_llist is empty.
+ */
+void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
+{
+ struct bpf_mem_cache *c;
+ void *ret;
+
+ c = this_cpu_ptr(ma->cache);
+
+ ret = unit_alloc(c);
+ if (!ret && flags == GFP_KERNEL) {
+ struct mem_cgroup *memcg, *old_memcg;
+
+ memcg = get_memcg(c);
+ old_memcg = set_active_memcg(memcg);
+ ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
+ set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
+ }
+
+ return !ret ? NULL : ret + LLIST_NODE_SZ;
+}
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 0c85e06f7ea7..d9c9f45e3529 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -563,6 +563,12 @@ void bpf_map_offload_map_free(struct bpf_map *map)
bpf_map_area_free(offmap);
}
+u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map)
+{
+ /* The memory dynamically allocated in netdev dev_ops is not counted */
+ return sizeof(struct bpf_offloaded_map);
+}
+
int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
{
struct bpf_offloaded_map *offmap = map_to_offmap(map);
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index 8a5e060de63b..601609164ef3 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -95,7 +95,7 @@ static void queue_stack_map_free(struct bpf_map *map)
bpf_map_area_free(qs);
}
-static int __queue_map_get(struct bpf_map *map, void *value, bool delete)
+static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -124,7 +124,7 @@ out:
}
-static int __stack_map_get(struct bpf_map *map, void *value, bool delete)
+static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long flags;
@@ -156,32 +156,32 @@ out:
}
/* Called from syscall or from eBPF program */
-static int queue_map_peek_elem(struct bpf_map *map, void *value)
+static long queue_map_peek_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int stack_map_peek_elem(struct bpf_map *map, void *value)
+static long stack_map_peek_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, false);
}
/* Called from syscall or from eBPF program */
-static int queue_map_pop_elem(struct bpf_map *map, void *value)
+static long queue_map_pop_elem(struct bpf_map *map, void *value)
{
return __queue_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int stack_map_pop_elem(struct bpf_map *map, void *value)
+static long stack_map_pop_elem(struct bpf_map *map, void *value)
{
return __stack_map_get(map, value, true);
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_push_elem(struct bpf_map *map, void *value,
- u64 flags)
+static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
+ u64 flags)
{
struct bpf_queue_stack *qs = bpf_queue_stack(map);
unsigned long irq_flags;
@@ -227,14 +227,14 @@ static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_update_elem(struct bpf_map *map, void *key,
- void *value, u64 flags)
+static long queue_stack_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int queue_stack_map_delete_elem(struct bpf_map *map, void *key)
+static long queue_stack_map_delete_elem(struct bpf_map *map, void *key)
{
return -EINVAL;
}
@@ -246,6 +246,14 @@ static int queue_stack_map_get_next_key(struct bpf_map *map, void *key,
return -EINVAL;
}
+static u64 queue_stack_map_mem_usage(const struct bpf_map *map)
+{
+ u64 usage = sizeof(struct bpf_queue_stack);
+
+ usage += ((u64)map->max_entries + 1) * map->value_size;
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(queue_map_btf_ids, struct, bpf_queue_stack)
const struct bpf_map_ops queue_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -259,6 +267,7 @@ const struct bpf_map_ops queue_map_ops = {
.map_pop_elem = queue_map_pop_elem,
.map_peek_elem = queue_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
+ .map_mem_usage = queue_stack_map_mem_usage,
.map_btf_id = &queue_map_btf_ids[0],
};
@@ -274,5 +283,6 @@ const struct bpf_map_ops stack_map_ops = {
.map_pop_elem = stack_map_pop_elem,
.map_peek_elem = stack_map_peek_elem,
.map_get_next_key = queue_stack_map_get_next_key,
+ .map_mem_usage = queue_stack_map_mem_usage,
.map_btf_id = &queue_map_btf_ids[0],
};
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 82c61612f382..cbf2d8d784b8 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -59,7 +59,7 @@ static void *reuseport_array_lookup_elem(struct bpf_map *map, void *key)
}
/* Called from syscall only */
-static int reuseport_array_delete_elem(struct bpf_map *map, void *key)
+static long reuseport_array_delete_elem(struct bpf_map *map, void *key)
{
struct reuseport_array *array = reuseport_array(map);
u32 index = *(u32 *)key;
@@ -335,6 +335,13 @@ static int reuseport_array_get_next_key(struct bpf_map *map, void *key,
return 0;
}
+static u64 reuseport_array_mem_usage(const struct bpf_map *map)
+{
+ struct reuseport_array *array;
+
+ return struct_size(array, ptrs, map->max_entries);
+}
+
BTF_ID_LIST_SINGLE(reuseport_array_map_btf_ids, struct, reuseport_array)
const struct bpf_map_ops reuseport_array_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -344,5 +351,6 @@ const struct bpf_map_ops reuseport_array_ops = {
.map_lookup_elem = reuseport_array_lookup_elem,
.map_get_next_key = reuseport_array_get_next_key,
.map_delete_elem = reuseport_array_delete_elem,
+ .map_mem_usage = reuseport_array_mem_usage,
.map_btf_id = &reuseport_array_map_btf_ids[0],
};
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 8732e0aadf36..875ac9b698d9 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -19,6 +19,7 @@
(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2
+#define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES)
#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
@@ -96,7 +97,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
__GFP_NOWARN | __GFP_ZERO;
- int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
+ int nr_meta_pages = RINGBUF_NR_META_PAGES;
int nr_data_pages = data_sz >> PAGE_SHIFT;
int nr_pages = nr_meta_pages + nr_data_pages;
struct page **pages, *page;
@@ -241,13 +242,13 @@ static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
return ERR_PTR(-ENOTSUPP);
}
-static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 flags)
+static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 flags)
{
return -ENOTSUPP;
}
-static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+static long ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
return -ENOTSUPP;
}
@@ -336,6 +337,21 @@ static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
return 0;
}
+static u64 ringbuf_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_ringbuf *rb;
+ int nr_data_pages;
+ int nr_meta_pages;
+ u64 usage = sizeof(struct bpf_ringbuf_map);
+
+ rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+ usage += (u64)rb->nr_pages << PAGE_SHIFT;
+ nr_meta_pages = RINGBUF_NR_META_PAGES;
+ nr_data_pages = map->max_entries >> PAGE_SHIFT;
+ usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *);
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -347,6 +363,7 @@ const struct bpf_map_ops ringbuf_map_ops = {
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
.map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
.map_btf_id = &ringbuf_map_btf_ids[0],
};
@@ -361,6 +378,7 @@ const struct bpf_map_ops user_ringbuf_map_ops = {
.map_update_elem = ringbuf_map_update_elem,
.map_delete_elem = ringbuf_map_delete_elem,
.map_get_next_key = ringbuf_map_get_next_key,
+ .map_mem_usage = ringbuf_map_mem_usage,
.map_btf_id = &user_ringbuf_map_btf_ids[0],
};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index aecea7451b61..b25fce425b2c 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -618,14 +618,14 @@ static int stack_map_get_next_key(struct bpf_map *map, void *key,
return 0;
}
-static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
- u64 map_flags)
+static long stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+ u64 map_flags)
{
return -EINVAL;
}
/* Called from syscall or from eBPF program */
-static int stack_map_delete_elem(struct bpf_map *map, void *key)
+static long stack_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *old_bucket;
@@ -654,6 +654,19 @@ static void stack_map_free(struct bpf_map *map)
put_callchain_buffers();
}
+static u64 stack_map_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+ u64 value_size = map->value_size;
+ u64 n_buckets = smap->n_buckets;
+ u64 enties = map->max_entries;
+ u64 usage = sizeof(*smap);
+
+ usage += n_buckets * sizeof(struct stack_map_bucket *);
+ usage += enties * (sizeof(struct stack_map_bucket) + value_size);
+ return usage;
+}
+
BTF_ID_LIST_SINGLE(stack_trace_map_btf_ids, struct, bpf_stack_map)
const struct bpf_map_ops stack_trace_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
@@ -664,5 +677,6 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_update_elem = stack_map_update_elem,
.map_delete_elem = stack_map_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = stack_map_mem_usage,
.map_btf_id = &stack_trace_map_btf_ids[0],
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index adc83cb82f37..14f39c1e573e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -35,6 +35,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
#include <linux/trace_events.h>
+#include <net/netfilter/nf_bpf_link.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -105,6 +106,7 @@ const struct bpf_map_ops bpf_map_offload_ops = {
.map_alloc = bpf_map_offload_map_alloc,
.map_free = bpf_map_offload_map_free,
.map_check_btf = map_check_no_btf,
+ .map_mem_usage = bpf_map_offload_map_mem_usage,
};
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -128,6 +130,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
}
if (attr->map_ifindex)
ops = &bpf_map_offload_ops;
+ if (!ops->map_mem_usage)
+ return ERR_PTR(-EINVAL);
map = ops->map_alloc(attr);
if (IS_ERR(map))
return map;
@@ -517,14 +521,14 @@ static int btf_field_cmp(const void *a, const void *b)
}
struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
- enum btf_field_type type)
+ u32 field_mask)
{
struct btf_field *field;
- if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & field_mask))
return NULL;
field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
- if (!field || !(field->type & type))
+ if (!field || !(field->type & field_mask))
return NULL;
return field;
}
@@ -549,6 +553,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_RB_NODE:
case BPF_SPIN_LOCK:
case BPF_TIMER:
+ case BPF_REFCOUNT:
/* Nothing to release */
break;
default:
@@ -596,6 +601,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_RB_NODE:
case BPF_SPIN_LOCK:
case BPF_TIMER:
+ case BPF_REFCOUNT:
/* Nothing to acquire */
break;
default:
@@ -647,6 +653,8 @@ void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
bpf_timer_cancel_and_free(obj + rec->timer_off);
}
+extern void __bpf_obj_drop_impl(void *p, const struct btf_record *rec);
+
void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
{
const struct btf_field *fields;
@@ -656,8 +664,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
return;
fields = rec->fields;
for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *pointee_struct_meta;
const struct btf_field *field = &fields[i];
void *field_ptr = obj + field->offset;
+ void *xchgd_field;
switch (fields[i].type) {
case BPF_SPIN_LOCK:
@@ -669,7 +679,22 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
WRITE_ONCE(*(u64 *)field_ptr, 0);
break;
case BPF_KPTR_REF:
- field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
+ xchgd_field = (void *)xchg((unsigned long *)field_ptr, 0);
+ if (!xchgd_field)
+ break;
+
+ if (!btf_is_kernel(field->kptr.btf)) {
+ pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
+ field->kptr.btf_id);
+ WARN_ON_ONCE(!pointee_struct_meta);
+ migrate_disable();
+ __bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
+ pointee_struct_meta->record :
+ NULL);
+ migrate_enable();
+ } else {
+ field->kptr.dtor(xchgd_field);
+ }
break;
case BPF_LIST_HEAD:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
@@ -683,6 +708,7 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
break;
case BPF_LIST_NODE:
case BPF_RB_NODE:
+ case BPF_REFCOUNT:
break;
default:
WARN_ON_ONCE(1);
@@ -695,14 +721,13 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
- struct btf_field_offs *foffs = map->field_offs;
struct btf_record *rec = map->record;
security_bpf_map_free(map);
bpf_map_release_memcg(map);
/* implementation dependent freeing */
map->ops->map_free(map);
- /* Delay freeing of field_offs and btf_record for maps, as map_free
+ /* Delay freeing of btf_record for maps, as map_free
* callback usually needs access to them. It is better to do it here
* than require each callback to do the free itself manually.
*
@@ -711,7 +736,6 @@ static void bpf_map_free_deferred(struct work_struct *work)
* eventually calls bpf_map_free_meta, since inner_map_meta is only a
* template bpf_map struct used during verification.
*/
- kfree(foffs);
btf_record_free(rec);
}
@@ -771,17 +795,10 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
}
#ifdef CONFIG_PROC_FS
-/* Provides an approximation of the map's memory footprint.
- * Used only to provide a backward compatibility and display
- * a reasonable "memlock" info.
- */
-static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
+/* Show the memory usage of a bpf map */
+static u64 bpf_map_memory_usage(const struct bpf_map *map)
{
- unsigned long size;
-
- size = round_up(map->key_size + bpf_map_value_size(map), 8);
-
- return round_up(map->max_entries * size, PAGE_SIZE);
+ return map->ops->map_mem_usage(map);
}
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
@@ -803,7 +820,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
"map_extra:\t%#llx\n"
- "memlock:\t%lu\n"
+ "memlock:\t%llu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
map->map_type,
@@ -812,7 +829,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->max_entries,
map->map_flags,
(unsigned long long)map->map_extra,
- bpf_map_memory_footprint(map),
+ bpf_map_memory_usage(map),
map->id,
READ_ONCE(map->frozen));
if (type) {
@@ -1019,7 +1036,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
- BPF_RB_ROOT,
+ BPF_RB_ROOT | BPF_REFCOUNT,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1058,10 +1075,17 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
break;
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
+ case BPF_REFCOUNT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
@@ -1104,7 +1128,6 @@ free_map_tab:
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
- struct btf_field_offs *foffs;
struct bpf_map *map;
int f_flags;
int err;
@@ -1184,17 +1207,9 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
-
- foffs = btf_parse_field_offs(map->record);
- if (IS_ERR(foffs)) {
- err = PTR_ERR(foffs);
- goto free_map;
- }
- map->field_offs = foffs;
-
err = security_bpf_map_alloc(map);
if (err)
- goto free_map_field_offs;
+ goto free_map;
err = bpf_map_alloc_id(map);
if (err)
@@ -1218,8 +1233,6 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
-free_map_field_offs:
- kfree(map->field_offs);
free_map:
btf_put(map->btf);
map->ops->map_free(map);
@@ -1285,8 +1298,10 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
-/* map_idr_lock should have been held */
-static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
+/* map_idr_lock should have been held or the map should have been
+ * protected by rcu read lock.
+ */
+struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref)
{
int refold;
@@ -2049,6 +2064,7 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred)
{
bpf_prog_kallsyms_del_all(prog);
btf_put(prog->aux->btf);
+ module_put(prog->aux->mod);
kvfree(prog->aux->jited_linfo);
kvfree(prog->aux->linfo);
kfree(prog->aux->kfunc_tab);
@@ -2439,7 +2455,6 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_LWT_SEG6LOCAL:
case BPF_PROG_TYPE_SK_SKB:
case BPF_PROG_TYPE_SK_MSG:
- case BPF_PROG_TYPE_LIRC_MODE2:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_DEVICE:
case BPF_PROG_TYPE_CGROUP_SOCK:
@@ -2448,6 +2463,7 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_EXT: /* extends any prog */
+ case BPF_PROG_TYPE_NETFILTER:
return true;
case BPF_PROG_TYPE_CGROUP_SKB:
/* always unpriv */
@@ -2477,9 +2493,9 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
+#define BPF_PROG_LOAD_LAST_FIELD log_true_size
-static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
enum bpf_prog_type type = attr->prog_type;
struct bpf_prog *prog, *dst_prog = NULL;
@@ -2629,7 +2645,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
goto free_prog_sec;
/* run eBPF verifier */
- err = bpf_check(&prog, attr, uattr);
+ err = bpf_check(&prog, attr, uattr, uattr_size);
if (err < 0)
goto free_used_maps;
@@ -2804,16 +2820,19 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_prog *prog = link->prog;
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
- bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"link_type:\t%s\n"
- "link_id:\t%u\n"
- "prog_tag:\t%s\n"
- "prog_id:\t%u\n",
+ "link_id:\t%u\n",
bpf_link_type_strs[link->type],
- link->id,
- prog_tag,
- prog->aux->id);
+ link->id);
+ if (prog) {
+ bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
+ seq_printf(m,
+ "prog_tag:\t%s\n"
+ "prog_id:\t%u\n",
+ prog_tag,
+ prog->aux->id);
+ }
if (link->ops->show_fdinfo)
link->ops->show_fdinfo(link, m);
}
@@ -3095,6 +3114,11 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
+ if (tgt_info.tgt_mod) {
+ module_put(prog->aux->mod);
+ prog->aux->mod = tgt_info.tgt_mod;
+ }
+
tr = bpf_trampoline_get(key, &tgt_info);
if (!tr) {
err = -ENOMEM;
@@ -4288,7 +4312,8 @@ static int bpf_link_get_info_by_fd(struct file *file,
info.type = link->type;
info.id = link->id;
- info.prog_id = link->prog->aux->id;
+ if (link->prog)
+ info.prog_id = link->prog->aux->id;
if (link->ops->fill_link_info) {
err = link->ops->fill_link_info(link, &info);
@@ -4338,9 +4363,9 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
-#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_true_size
-static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
+static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
if (CHECK_ATTR(BPF_BTF_LOAD))
return -EINVAL;
@@ -4348,7 +4373,7 @@ static int bpf_btf_load(const union bpf_attr *attr, bpfptr_t uattr)
if (!bpf_capable())
return -EPERM;
- return btf_new_fd(attr, uattr);
+ return btf_new_fd(attr, uattr, uattr_size);
}
#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id
@@ -4551,6 +4576,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
if (CHECK_ATTR(BPF_LINK_CREATE))
return -EINVAL;
+ if (attr->link_create.attach_type == BPF_STRUCT_OPS)
+ return bpf_struct_ops_link_create(attr);
+
prog = bpf_prog_get(attr->link_create.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -4562,6 +4590,7 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
switch (prog->type) {
case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_NETFILTER:
break;
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
@@ -4628,6 +4657,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
case BPF_PROG_TYPE_XDP:
ret = bpf_xdp_link_attach(attr, prog);
break;
+ case BPF_PROG_TYPE_NETFILTER:
+ ret = bpf_nf_link_attach(attr, prog);
+ break;
#endif
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
@@ -4649,6 +4681,35 @@ out:
return ret;
}
+static int link_update_map(struct bpf_link *link, union bpf_attr *attr)
+{
+ struct bpf_map *new_map, *old_map = NULL;
+ int ret;
+
+ new_map = bpf_map_get(attr->link_update.new_map_fd);
+ if (IS_ERR(new_map))
+ return PTR_ERR(new_map);
+
+ if (attr->link_update.flags & BPF_F_REPLACE) {
+ old_map = bpf_map_get(attr->link_update.old_map_fd);
+ if (IS_ERR(old_map)) {
+ ret = PTR_ERR(old_map);
+ goto out_put;
+ }
+ } else if (attr->link_update.old_map_fd) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ ret = link->ops->update_map(link, new_map, old_map);
+
+ if (old_map)
+ bpf_map_put(old_map);
+out_put:
+ bpf_map_put(new_map);
+ return ret;
+}
+
#define BPF_LINK_UPDATE_LAST_FIELD link_update.old_prog_fd
static int link_update(union bpf_attr *attr)
@@ -4669,6 +4730,11 @@ static int link_update(union bpf_attr *attr)
if (IS_ERR(link))
return PTR_ERR(link);
+ if (link->ops->update_map) {
+ ret = link_update_map(link, attr);
+ goto out_put_link;
+ }
+
new_prog = bpf_prog_get(attr->link_update.new_prog_fd);
if (IS_ERR(new_prog)) {
ret = PTR_ERR(new_prog);
@@ -4989,7 +5055,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = map_freeze(&attr);
break;
case BPF_PROG_LOAD:
- err = bpf_prog_load(&attr, uattr);
+ err = bpf_prog_load(&attr, uattr, size);
break;
case BPF_OBJ_PIN:
err = bpf_obj_pin(&attr);
@@ -5034,7 +5100,7 @@ static int __sys_bpf(int cmd, bpfptr_t uattr, unsigned int size)
err = bpf_raw_tracepoint_open(&attr);
break;
case BPF_BTF_LOAD:
- err = bpf_btf_load(&attr, uattr);
+ err = bpf_btf_load(&attr, uattr, size);
break;
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index d0ed7d6f5eec..ac021bc43a66 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -9,7 +9,6 @@
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
-#include <linux/module.h>
#include <linux/static_call.h>
#include <linux/bpf_verifier.h>
#include <linux/bpf_lsm.h>
@@ -45,8 +44,8 @@ static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd
lockdep_assert_held_once(&tr->mutex);
/* Instead of updating the trampoline here, we propagate
- * -EAGAIN to register_ftrace_direct_multi(). Then we can
- * retry register_ftrace_direct_multi() after updating the
+ * -EAGAIN to register_ftrace_direct(). Then we can
+ * retry register_ftrace_direct() after updating the
* trampoline.
*/
if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
@@ -172,38 +171,16 @@ out:
return tr;
}
-static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
-{
- struct module *mod;
- int err = 0;
-
- preempt_disable();
- mod = __module_text_address((unsigned long) tr->func.addr);
- if (mod && !try_module_get(mod))
- err = -ENOENT;
- preempt_enable();
- tr->mod = mod;
- return err;
-}
-
-static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
-{
- module_put(tr->mod);
- tr->mod = NULL;
-}
-
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
void *ip = tr->func.addr;
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr);
+ ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
- if (!ret)
- bpf_trampoline_module_put(tr);
return ret;
}
@@ -215,9 +192,9 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
if (tr->func.ftrace_managed) {
if (lock_direct_mutex)
- ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr);
+ ret = modify_ftrace_direct(tr->fops, (long)new_addr);
else
- ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr);
+ ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
}
@@ -238,18 +215,13 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
tr->func.ftrace_managed = true;
}
- if (bpf_trampoline_module_get(tr))
- return -ENOENT;
-
if (tr->func.ftrace_managed) {
ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
- ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
+ ret = register_ftrace_direct(tr->fops, (long)new_addr);
} else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
}
- if (ret)
- bpf_trampoline_module_put(tr);
return ret;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 272563a0b770..fbcf5a4e2fcd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -24,6 +24,7 @@
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/poison.h>
+#include <linux/module.h>
#include "disasm.h"
@@ -194,6 +195,8 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
static int ref_set_non_owning(struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
+static void specialize_kfunc(struct bpf_verifier_env *env,
+ u32 func_id, u16 offset, unsigned long *addr);
static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
{
@@ -268,7 +271,50 @@ struct bpf_call_arg_meta {
u32 ret_btf_id;
u32 subprogno;
struct btf_field *kptr_field;
- u8 uninit_dynptr_regno;
+};
+
+struct btf_and_id {
+ struct btf *btf;
+ u32 btf_id;
+};
+
+struct bpf_kfunc_call_arg_meta {
+ /* In parameters */
+ struct btf *btf;
+ u32 func_id;
+ u32 kfunc_flags;
+ const struct btf_type *func_proto;
+ const char *func_name;
+ /* Out parameters */
+ u32 ref_obj_id;
+ u8 release_regno;
+ bool r0_rdonly;
+ u32 ret_btf_id;
+ u64 r0_size;
+ u32 subprogno;
+ struct {
+ u64 value;
+ bool found;
+ } arg_constant;
+ union {
+ struct btf_and_id arg_obj_drop;
+ struct btf_and_id arg_refcount_acquire;
+ };
+ struct {
+ struct btf_field *field;
+ } arg_list_head;
+ struct {
+ struct btf_field *field;
+ } arg_rbtree_root;
+ struct {
+ enum bpf_dynptr_type type;
+ u32 id;
+ } initialized_dynptr;
+ struct {
+ u8 spi;
+ u8 frameno;
+ } iter;
+ u64 mem_size;
};
struct btf *btf_vmlinux;
@@ -296,61 +342,6 @@ find_linfo(const struct bpf_verifier_env *env, u32 insn_off)
return &linfo[i - 1];
}
-void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
- va_list args)
-{
- unsigned int n;
-
- n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
-
- WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
- "verifier log line truncated - local buffer too short\n");
-
- if (log->level == BPF_LOG_KERNEL) {
- bool newline = n > 0 && log->kbuf[n - 1] == '\n';
-
- pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
- return;
- }
-
- n = min(log->len_total - log->len_used - 1, n);
- log->kbuf[n] = '\0';
- if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
- log->len_used += n;
- else
- log->ubuf = NULL;
-}
-
-static void bpf_vlog_reset(struct bpf_verifier_log *log, u32 new_pos)
-{
- char zero = 0;
-
- if (!bpf_verifier_log_needed(log))
- return;
-
- log->len_used = new_pos;
- if (put_user(zero, log->ubuf + new_pos))
- log->ubuf = NULL;
-}
-
-/* log_level controls verbosity level of eBPF verifier.
- * bpf_verifier_log_write() is used to dump the verification trace to the log,
- * so the user can figure out what's wrong with the program
- */
-__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
- const char *fmt, ...)
-{
- va_list args;
-
- if (!bpf_verifier_log_needed(&env->log))
- return;
-
- va_start(args, fmt);
- bpf_verifier_vlog(&env->log, fmt, args);
- va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_verifier_log_write);
-
__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
{
struct bpf_verifier_env *env = private_data;
@@ -364,20 +355,6 @@ __printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
va_end(args);
}
-__printf(2, 3) void bpf_log(struct bpf_verifier_log *log,
- const char *fmt, ...)
-{
- va_list args;
-
- if (!bpf_verifier_log_needed(log))
- return;
-
- va_start(args, fmt);
- bpf_verifier_vlog(log, fmt, args);
- va_end(args);
-}
-EXPORT_SYMBOL_GPL(bpf_log);
-
static const char *ltrim(const char *s)
{
while (isspace(*s))
@@ -447,13 +424,23 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
type == PTR_TO_XDP_SOCK;
}
+static bool type_may_be_null(u32 type)
+{
+ return type & PTR_MAYBE_NULL;
+}
+
static bool reg_type_not_null(enum bpf_reg_type type)
{
+ if (type_may_be_null(type))
+ return false;
+
+ type = base_type(type);
return type == PTR_TO_SOCKET ||
type == PTR_TO_TCP_SOCK ||
type == PTR_TO_MAP_VALUE ||
type == PTR_TO_MAP_KEY ||
- type == PTR_TO_SOCK_COMMON;
+ type == PTR_TO_SOCK_COMMON ||
+ type == PTR_TO_MEM;
}
static bool type_is_ptr_alloc_obj(u32 type)
@@ -491,11 +478,6 @@ static bool type_is_rdonly_mem(u32 type)
return type & MEM_RDONLY;
}
-static bool type_may_be_null(u32 type)
-{
- return type & PTR_MAYBE_NULL;
-}
-
static bool is_acquire_function(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -633,6 +615,7 @@ static char slot_type_char[] = {
[STACK_MISC] = 'm',
[STACK_ZERO] = '0',
[STACK_DYNPTR] = 'd',
+ [STACK_ITER] = 'i',
};
static void print_liveness(struct bpf_verifier_env *env,
@@ -675,37 +658,91 @@ static bool is_spi_bounds_valid(struct bpf_func_state *state, int spi, int nr_sl
return spi - nr_slots + 1 >= 0 && spi < allocated_slots;
}
-static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static int stack_slot_obj_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ const char *obj_kind, int nr_slots)
{
int off, spi;
if (!tnum_is_const(reg->var_off)) {
- verbose(env, "dynptr has to be at a constant offset\n");
+ verbose(env, "%s has to be at a constant offset\n", obj_kind);
return -EINVAL;
}
off = reg->off + reg->var_off.value;
if (off % BPF_REG_SIZE) {
- verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
return -EINVAL;
}
spi = __get_spi(off);
- if (spi < 1) {
- verbose(env, "cannot pass in dynptr at an offset=%d\n", off);
+ if (spi + 1 < nr_slots) {
+ verbose(env, "cannot pass in %s at an offset=%d\n", obj_kind, off);
return -EINVAL;
}
- if (!is_spi_bounds_valid(func(env, reg), spi, BPF_DYNPTR_NR_SLOTS))
+ if (!is_spi_bounds_valid(func(env, reg), spi, nr_slots))
return -ERANGE;
return spi;
}
-static const char *kernel_type_name(const struct btf* btf, u32 id)
+static int dynptr_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ return stack_slot_obj_get_spi(env, reg, "dynptr", BPF_DYNPTR_NR_SLOTS);
+}
+
+static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int nr_slots)
+{
+ return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
+}
+
+static const char *btf_type_name(const struct btf *btf, u32 id)
{
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
+static const char *dynptr_type_str(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return "local";
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return "ringbuf";
+ case BPF_DYNPTR_TYPE_SKB:
+ return "skb";
+ case BPF_DYNPTR_TYPE_XDP:
+ return "xdp";
+ case BPF_DYNPTR_TYPE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown dynptr type %d\n", type);
+ return "<unknown>";
+ }
+}
+
+static const char *iter_type_str(const struct btf *btf, u32 btf_id)
+{
+ if (!btf || btf_id == 0)
+ return "<invalid>";
+
+ /* we already validated that type is valid and has conforming name */
+ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1;
+}
+
+static const char *iter_state_str(enum bpf_iter_state state)
+{
+ switch (state) {
+ case BPF_ITER_STATE_ACTIVE:
+ return "active";
+ case BPF_ITER_STATE_DRAINED:
+ return "drained";
+ case BPF_ITER_STATE_INVALID:
+ return "<invalid>";
+ default:
+ WARN_ONCE(1, "unknown iter state %d\n", state);
+ return "<unknown>";
+ }
+}
+
static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
{
env->scratched_regs |= 1U << regno;
@@ -751,11 +788,31 @@ static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
return BPF_DYNPTR_TYPE_LOCAL;
case DYNPTR_TYPE_RINGBUF:
return BPF_DYNPTR_TYPE_RINGBUF;
+ case DYNPTR_TYPE_SKB:
+ return BPF_DYNPTR_TYPE_SKB;
+ case DYNPTR_TYPE_XDP:
+ return BPF_DYNPTR_TYPE_XDP;
default:
return BPF_DYNPTR_TYPE_INVALID;
}
}
+static enum bpf_type_flag get_dynptr_type_flag(enum bpf_dynptr_type type)
+{
+ switch (type) {
+ case BPF_DYNPTR_TYPE_LOCAL:
+ return DYNPTR_TYPE_LOCAL;
+ case BPF_DYNPTR_TYPE_RINGBUF:
+ return DYNPTR_TYPE_RINGBUF;
+ case BPF_DYNPTR_TYPE_SKB:
+ return DYNPTR_TYPE_SKB;
+ case BPF_DYNPTR_TYPE_XDP:
+ return DYNPTR_TYPE_XDP;
+ default:
+ return 0;
+ }
+}
+
static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
{
return type == BPF_DYNPTR_TYPE_RINGBUF;
@@ -895,6 +952,14 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
static void __mark_reg_unknown(const struct bpf_verifier_env *env,
struct bpf_reg_state *reg);
+static void mark_reg_invalid(const struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, reg);
+ else
+ __mark_reg_unknown(env, reg);
+}
+
static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
struct bpf_func_state *state, int spi)
{
@@ -934,12 +999,8 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
/* Dynptr slices are only PTR_TO_MEM_OR_NULL and PTR_TO_MEM */
if (dreg->type != (PTR_TO_MEM | PTR_MAYBE_NULL) && dreg->type != PTR_TO_MEM)
continue;
- if (dreg->dynptr_id == dynptr_id) {
- if (!env->allow_ptr_leaks)
- __mark_reg_not_init(env, dreg);
- else
- __mark_reg_unknown(env, dreg);
- }
+ if (dreg->dynptr_id == dynptr_id)
+ mark_reg_invalid(env, dreg);
}));
/* Do not release reference state, we are destroying dynptr on stack,
@@ -955,39 +1016,49 @@ static int destroy_if_dynptr_stack_slot(struct bpf_verifier_env *env,
return 0;
}
-static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- int spi)
+static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
+ int spi;
+
if (reg->type == CONST_PTR_TO_DYNPTR)
return false;
- /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
- * will do check_mem_access to check and update stack bounds later, so
- * return true for that case.
+ spi = dynptr_get_spi(env, reg);
+
+ /* -ERANGE (i.e. spi not falling into allocated stack slots) isn't an
+ * error because this just means the stack state hasn't been updated yet.
+ * We will do check_mem_access to check and update stack bounds later.
*/
- if (spi < 0)
- return spi == -ERANGE;
- /* We allow overwriting existing unreferenced STACK_DYNPTR slots, see
- * mark_stack_slots_dynptr which calls destroy_if_dynptr_stack_slot to
- * ensure dynptr objects at the slots we are touching are completely
- * destructed before we reinitialize them for a new one. For referenced
- * ones, destroy_if_dynptr_stack_slot returns an error early instead of
- * delaying it until the end where the user will get "Unreleased
+ if (spi < 0 && spi != -ERANGE)
+ return false;
+
+ /* We don't need to check if the stack slots are marked by previous
+ * dynptr initializations because we allow overwriting existing unreferenced
+ * STACK_DYNPTR slots, see mark_stack_slots_dynptr which calls
+ * destroy_if_dynptr_stack_slot to ensure dynptr objects at the slots we are
+ * touching are completely destructed before we reinitialize them for a new
+ * one. For referenced ones, destroy_if_dynptr_stack_slot returns an error early
+ * instead of delaying it until the end where the user will get "Unreleased
* reference" error.
*/
return true;
}
-static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- int spi)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int i;
+ int i, spi;
- /* This already represents first slot of initialized bpf_dynptr */
+ /* This already represents first slot of initialized bpf_dynptr.
+ *
+ * CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic, so we don't need to check its
+ * offset and alignment.
+ */
if (reg->type == CONST_PTR_TO_DYNPTR)
return true;
+ spi = dynptr_get_spi(env, reg);
if (spi < 0)
return false;
if (!state->stack[spi].spilled_ptr.dynptr.first_slot)
@@ -1024,6 +1095,157 @@ static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg
}
}
+static void __mark_reg_known_zero(struct bpf_reg_state *reg);
+
+static int mark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int insn_idx,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j, id;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_reference_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->live |= REG_LIVE_WRITTEN;
+ st->ref_obj_id = i == 0 ? id : 0;
+ st->iter.btf = btf;
+ st->iter.btf_id = btf_id;
+ st->iter.state = BPF_ITER_STATE_ACTIVE;
+ st->iter.depth = 0;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_ITER;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static int unmark_stack_slots_iter(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ if (i == 0)
+ WARN_ON_ONCE(release_reference(env, st->ref_obj_id));
+
+ __mark_reg_not_init(env, st);
+
+ /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+ st->live |= REG_LIVE_WRITTEN;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ slot->slot_type[j] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
+static bool is_iter_reg_valid_uninit(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] == STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+static bool is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ struct btf *btf, u32 btf_id, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi, i, j;
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return false;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_stack_state *slot = &state->stack[spi - i];
+ struct bpf_reg_state *st = &slot->spilled_ptr;
+
+ /* only main (first) slot has ref_obj_id set */
+ if (i == 0 && !st->ref_obj_id)
+ return false;
+ if (i != 0 && st->ref_obj_id)
+ return false;
+ if (st->iter.btf != btf || st->iter.btf_id != btf_id)
+ return false;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ if (slot->slot_type[j] != STACK_ITER)
+ return false;
+ }
+
+ return true;
+}
+
+/* Check if given stack slot is "special":
+ * - spilled register state (STACK_SPILL);
+ * - dynptr state (STACK_DYNPTR);
+ * - iter state (STACK_ITER).
+ */
+static bool is_stack_slot_special(const struct bpf_stack_state *stack)
+{
+ enum bpf_stack_slot_type type = stack->slot_type[BPF_REG_SIZE - 1];
+
+ switch (type) {
+ case STACK_SPILL:
+ case STACK_DYNPTR:
+ case STACK_ITER:
+ return true;
+ case STACK_INVALID:
+ case STACK_MISC:
+ case STACK_ZERO:
+ return false;
+ default:
+ WARN_ONCE(1, "unknown stack slot type %d\n", type);
+ return true;
+ }
+}
+
/* The reg state of a pointer or a bounded scalar was saved when
* it was spilled to the stack.
*/
@@ -1070,7 +1292,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose(env, "%s", reg_type_str(env, t));
if (base_type(t) == PTR_TO_BTF_ID)
- verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
+ verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
verbose(env, "(");
/*
* _a stands for append, was shortened to avoid multiline statements below.
@@ -1143,26 +1365,62 @@ static void print_verifier_state(struct bpf_verifier_env *env,
for (j = 0; j < BPF_REG_SIZE; j++) {
if (state->stack[i].slot_type[j] != STACK_INVALID)
valid = true;
- types_buf[j] = slot_type_char[
- state->stack[i].slot_type[j]];
+ types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
}
types_buf[BPF_REG_SIZE] = 0;
if (!valid)
continue;
if (!print_all && !stack_slot_scratched(env, i))
continue;
- verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
- print_liveness(env, state->stack[i].spilled_ptr.live);
- if (is_spilled_reg(&state->stack[i])) {
+ switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) {
+ case STACK_SPILL:
reg = &state->stack[i].spilled_ptr;
t = reg->type;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
verbose(env, "%lld", reg->var_off.value + reg->off);
- } else {
+ break;
+ case STACK_DYNPTR:
+ i += BPF_DYNPTR_NR_SLOTS - 1;
+ reg = &state->stack[i].spilled_ptr;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=dynptr_%s", dynptr_type_str(reg->dynptr.type));
+ if (reg->ref_obj_id)
+ verbose(env, "(ref_id=%d)", reg->ref_obj_id);
+ break;
+ case STACK_ITER:
+ /* only main slot has ref_obj_id set; skip others */
+ reg = &state->stack[i].spilled_ptr;
+ if (!reg->ref_obj_id)
+ continue;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
+ verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)",
+ iter_type_str(reg->iter.btf, reg->iter.btf_id),
+ reg->ref_obj_id, iter_state_str(reg->iter.state),
+ reg->iter.depth);
+ break;
+ case STACK_MISC:
+ case STACK_ZERO:
+ default:
+ reg = &state->stack[i].spilled_ptr;
+
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ types_buf[j] = slot_type_char[state->stack[i].slot_type[j]];
+ types_buf[BPF_REG_SIZE] = 0;
+
+ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
+ print_liveness(env, reg->live);
verbose(env, "=%s", types_buf);
+ break;
}
}
if (state->acquired_refs && state->refs[0].id) {
@@ -1188,10 +1446,10 @@ static inline u32 vlog_alignment(u32 pos)
static void print_insn_state(struct bpf_verifier_env *env,
const struct bpf_func_state *state)
{
- if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+ if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
/* remove new line character */
- bpf_vlog_reset(&env->log, env->prev_log_len - 1);
- verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+ bpf_vlog_reset(&env->log, env->prev_log_pos - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' ');
} else {
verbose(env, "%d:", env->insn_idx);
}
@@ -1499,7 +1757,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
err = copy_verifier_state(&elem->st, cur);
@@ -1664,6 +1922,12 @@ static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
reg->type == PTR_TO_PACKET_END;
}
+static bool reg_is_dynptr_slice_pkt(const struct bpf_reg_state *reg)
+{
+ return base_type(reg->type) == PTR_TO_MEM &&
+ (reg->type & DYNPTR_TYPE_SKB || reg->type & DYNPTR_TYPE_XDP);
+}
+
/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
enum bpf_reg_type which)
@@ -1823,9 +2087,9 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
struct tnum var64_off = tnum_intersect(reg->var_off,
tnum_range(reg->umin_value,
reg->umax_value));
- struct tnum var32_off = tnum_intersect(tnum_subreg(reg->var_off),
- tnum_range(reg->u32_min_value,
- reg->u32_max_value));
+ struct tnum var32_off = tnum_intersect(tnum_subreg(var64_off),
+ tnum_range(reg->u32_min_value,
+ reg->u32_max_value));
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
@@ -2029,7 +2293,7 @@ static struct bpf_verifier_state *push_async_cb(struct bpf_verifier_env *env,
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
- elem->log_pos = env->log.len_used;
+ elem->log_pos = env->log.end_pos;
env->head = elem;
env->stack_size++;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
@@ -2117,6 +2381,7 @@ struct bpf_kfunc_desc {
u32 func_id;
s32 imm;
u16 offset;
+ unsigned long addr;
};
struct bpf_kfunc_btf {
@@ -2126,6 +2391,11 @@ struct bpf_kfunc_btf {
};
struct bpf_kfunc_desc_tab {
+ /* Sorted by func_id (BTF ID) and offset (fd_array offset) during
+ * verification. JITs do lookups by bpf_insn, where func_id may not be
+ * available, therefore at the end of verification do_misc_fixups()
+ * sorts this by imm and offset.
+ */
struct bpf_kfunc_desc descs[MAX_KFUNC_DESCS];
u32 nr_descs;
};
@@ -2166,6 +2436,19 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
sizeof(tab->descs[0]), kfunc_desc_cmp_by_id_off);
}
+int bpf_get_kfunc_addr(const struct bpf_prog *prog, u32 func_id,
+ u16 btf_fd_idx, u8 **func_addr)
+{
+ const struct bpf_kfunc_desc *desc;
+
+ desc = find_kfunc_desc(prog, func_id, btf_fd_idx);
+ if (!desc)
+ return -EFAULT;
+
+ *func_addr = (u8 *)desc->addr;
+ return 0;
+}
+
static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
s16 offset)
{
@@ -2345,13 +2628,18 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
func_name);
return -EINVAL;
}
+ specialize_kfunc(env, func_id, offset, &addr);
- call_imm = BPF_CALL_IMM(addr);
- /* Check whether or not the relative offset overflows desc->imm */
- if ((unsigned long)(s32)call_imm != call_imm) {
- verbose(env, "address of kernel function %s is out of range\n",
- func_name);
- return -EINVAL;
+ if (bpf_jit_supports_far_kfunc_call()) {
+ call_imm = func_id;
+ } else {
+ call_imm = BPF_CALL_IMM(addr);
+ /* Check whether the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel function %s is out of range\n",
+ func_name);
+ return -EINVAL;
+ }
}
if (bpf_dev_bound_kfunc_id(func_id)) {
@@ -2364,6 +2652,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
desc->func_id = func_id;
desc->imm = call_imm;
desc->offset = offset;
+ desc->addr = addr;
err = btf_distill_func_proto(&env->log, desc_btf,
func_proto, func_name,
&desc->func_model);
@@ -2373,19 +2662,19 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return err;
}
-static int kfunc_desc_cmp_by_imm(const void *a, const void *b)
+static int kfunc_desc_cmp_by_imm_off(const void *a, const void *b)
{
const struct bpf_kfunc_desc *d0 = a;
const struct bpf_kfunc_desc *d1 = b;
- if (d0->imm > d1->imm)
- return 1;
- else if (d0->imm < d1->imm)
- return -1;
+ if (d0->imm != d1->imm)
+ return d0->imm < d1->imm ? -1 : 1;
+ if (d0->offset != d1->offset)
+ return d0->offset < d1->offset ? -1 : 1;
return 0;
}
-static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
+static void sort_kfunc_descs_by_imm_off(struct bpf_prog *prog)
{
struct bpf_kfunc_desc_tab *tab;
@@ -2394,7 +2683,7 @@ static void sort_kfunc_descs_by_imm(struct bpf_prog *prog)
return;
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
- kfunc_desc_cmp_by_imm, NULL);
+ kfunc_desc_cmp_by_imm_off, NULL);
}
bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog)
@@ -2408,13 +2697,14 @@ bpf_jit_find_kfunc_model(const struct bpf_prog *prog,
{
const struct bpf_kfunc_desc desc = {
.imm = insn->imm,
+ .offset = insn->off,
};
const struct bpf_kfunc_desc *res;
struct bpf_kfunc_desc_tab *tab;
tab = prog->aux->kfunc_tab;
res = bsearch(&desc, tab->descs, tab->nr_descs,
- sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm);
+ sizeof(tab->descs[0]), kfunc_desc_cmp_by_imm_off);
return res ? &res->func_model : NULL;
}
@@ -2475,8 +2765,8 @@ static int check_subprogs(struct bpf_verifier_env *env)
u8 code = insn[i].code;
if (code == (BPF_JMP | BPF_CALL) &&
- insn[i].imm == BPF_FUNC_tail_call &&
- insn[i].src_reg != BPF_PSEUDO_CALL)
+ insn[i].src_reg == 0 &&
+ insn[i].imm == BPF_FUNC_tail_call)
subprog[cur_subprog].has_tail_call = true;
if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
@@ -2587,6 +2877,25 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
}
+static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+ err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+
+ return 0;
+}
+
/* This function is supposed to be used by the following 32-bit optimization
* code only. It returns TRUE if the source or destination register operates
* on 64-bit, otherwise return FALSE.
@@ -2967,6 +3276,21 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
}
} else if (opcode == BPF_EXIT) {
return -ENOTSUPP;
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ if (!(*reg_mask & (dreg | sreg)))
+ return 0;
+ /* dreg <cond> sreg
+ * Both dreg and sreg need precision before
+ * this insn. If only sreg was marked precise
+ * before it would be equally necessary to
+ * propagate it to dreg.
+ */
+ *reg_mask |= (sreg | dreg);
+ /* else dreg <cond> K
+ * Only dreg still needs precision before
+ * this insn, so for the K-based conditional
+ * there is nothing new to be marked.
+ */
}
} else if (class == BPF_LD) {
if (!(*reg_mask & dreg))
@@ -3568,8 +3892,8 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
/* regular write of data into stack destroys any spilled ptr */
state->stack[spi].spilled_ptr.type = NOT_INIT;
- /* Mark slots as STACK_MISC if they belonged to spilled ptr. */
- if (is_spilled_reg(&state->stack[spi]))
+ /* Mark slots as STACK_MISC if they belonged to spilled ptr/dynptr/iter. */
+ if (is_stack_slot_special(&state->stack[spi]))
for (i = 0; i < BPF_REG_SIZE; i++)
scrub_spilled_slot(&state->stack[spi].slot_type[i]);
@@ -3826,6 +4150,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
continue;
if (type == STACK_MISC)
continue;
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
@@ -3863,6 +4189,8 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
continue;
if (type == STACK_ZERO)
continue;
+ if (type == STACK_INVALID && env->allow_uninit_stack)
+ continue;
verbose(env, "invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
@@ -3958,17 +4286,13 @@ static int check_stack_read(struct bpf_verifier_env *env,
}
/* Variable offset is prohibited for unprivileged mode for simplicity
* since it requires corresponding support in Spectre masking for stack
- * ALU. See also retrieve_ptr_limit().
+ * ALU. See also retrieve_ptr_limit(). The check in
+ * check_stack_access_for_ptr_arithmetic() called by
+ * adjust_ptr_min_max_vals() prevents users from creating stack pointers
+ * with variable offsets, therefore no check is required here. Further,
+ * just checking it here would be insufficient as speculative stack
+ * writes could still lead to unsafe speculative behaviour.
*/
- if (!env->bypass_spec_v1 && var_off) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
- ptr_regno, tn_buf);
- return -EACCES;
- }
-
if (!var_off) {
off += reg->var_off.value;
err = check_stack_read_fixed_off(env, state, off, size,
@@ -4174,8 +4498,8 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
struct btf_field *kptr_field,
struct bpf_reg_state *reg, u32 regno)
{
- const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
- int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
+ const char *targ_name = btf_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
const char *reg_name = "";
/* Only unreferenced case accepts untrusted pointers */
@@ -4190,7 +4514,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
return -EINVAL;
}
/* We need to verify reg->type and reg->btf, before accessing reg->btf */
- reg_name = kernel_type_name(reg->btf, reg->btf_id);
+ reg_name = btf_type_name(reg->btf, reg->btf_id);
/* For ref_ptr case, release function check should ensure we get one
* referenced PTR_TO_BTF_ID, and that its fixed offset is 0. For the
@@ -4242,6 +4566,36 @@ bad_type:
return -EINVAL;
}
+/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
+ * can dereference RCU protected pointers and result is PTR_TRUSTED.
+ */
+static bool in_rcu_cs(struct bpf_verifier_env *env)
+{
+ return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
+}
+
+/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
+BTF_SET_START(rcu_protected_types)
+BTF_ID(struct, prog_test_ref_kfunc)
+BTF_ID(struct, cgroup)
+BTF_ID(struct, bpf_cpumask)
+BTF_ID(struct, task_struct)
+BTF_SET_END(rcu_protected_types)
+
+static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
+{
+ if (!btf_is_kernel(btf))
+ return false;
+ return btf_id_set_contains(&rcu_protected_types, btf_id);
+}
+
+static bool rcu_safe_kptr(const struct btf_field *field)
+{
+ const struct btf_field_kptr *kptr = &field->kptr;
+
+ return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
+}
+
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
int value_regno, int insn_idx,
struct btf_field *kptr_field)
@@ -4276,7 +4630,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
- kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+ kptr_field->kptr.btf_id,
+ rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
+ PTR_MAYBE_NULL | MEM_RCU :
+ PTR_MAYBE_NULL | PTR_UNTRUSTED);
/* For mark_ptr_or_null_reg */
val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
@@ -4596,6 +4953,11 @@ static bool is_rcu_reg(const struct bpf_reg_state *reg)
return reg->type & MEM_RCU;
}
+static void clear_trusted_flags(enum bpf_type_flag *flag)
+{
+ *flag &= ~(BPF_REG_TRUSTED_MODIFIERS | MEM_RCU);
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -4999,23 +5361,110 @@ static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
return 0;
}
-#define BTF_TYPE_SAFE_NESTED(__type) __PASTE(__type, __safe_fields)
+#define BTF_TYPE_SAFE_RCU(__type) __PASTE(__type, __safe_rcu)
+#define BTF_TYPE_SAFE_RCU_OR_NULL(__type) __PASTE(__type, __safe_rcu_or_null)
+#define BTF_TYPE_SAFE_TRUSTED(__type) __PASTE(__type, __safe_trusted)
+
+/*
+ * Allow list few fields as RCU trusted or full trusted.
+ * This logic doesn't allow mix tagging and will be removed once GCC supports
+ * btf_type_tag.
+ */
-BTF_TYPE_SAFE_NESTED(struct task_struct) {
+/* RCU trusted: these fields are trusted in RCU CS and never NULL */
+BTF_TYPE_SAFE_RCU(struct task_struct) {
const cpumask_t *cpus_ptr;
+ struct css_set __rcu *cgroups;
+ struct task_struct __rcu *real_parent;
+ struct task_struct *group_leader;
};
-static bool nested_ptr_is_trusted(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- int off)
+BTF_TYPE_SAFE_RCU(struct cgroup) {
+ /* cgrp->kn is always accessible as documented in kernel/cgroup/cgroup.c */
+ struct kernfs_node *kn;
+};
+
+BTF_TYPE_SAFE_RCU(struct css_set) {
+ struct cgroup *dfl_cgrp;
+};
+
+/* RCU trusted: these fields are trusted in RCU CS and can be NULL */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct) {
+ struct file __rcu *exe_file;
+};
+
+/* skb->sk, req->sk are not RCU protected, but we mark them as such
+ * because bpf prog accessible sockets are SOCK_RCU_FREE.
+ */
+BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff) {
+ struct sock *sk;
+};
+
+BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock) {
+ struct sock *sk;
+};
+
+/* full trusted: these fields are trusted even outside of RCU CS and never NULL */
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta) {
+ struct seq_file *seq;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task) {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct linux_binprm) {
+ struct file *file;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct file) {
+ struct inode *f_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct dentry) {
+ /* no negative dentry-s in places where bpf can see it */
+ struct inode *d_inode;
+};
+
+BTF_TYPE_SAFE_TRUSTED(struct socket) {
+ struct sock *sk;
+};
+
+static bool type_is_rcu(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
{
- /* If its parent is not trusted, it can't regain its trusted status. */
- if (!is_trusted_reg(reg))
- return false;
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct task_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct cgroup));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU(struct css_set));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu");
+}
+
+static bool type_is_rcu_or_null(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct mm_struct));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct sk_buff));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_RCU_OR_NULL(struct request_sock));
+
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_rcu_or_null");
+}
- BTF_TYPE_EMIT(BTF_TYPE_SAFE_NESTED(struct task_struct));
+static bool type_is_trusted(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const char *field_name, u32 btf_id)
+{
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter_meta));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct bpf_iter__task));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct linux_binprm));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct file));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct dentry));
+ BTF_TYPE_EMIT(BTF_TYPE_SAFE_TRUSTED(struct socket));
- return btf_nested_type_is_trusted(&env->log, reg, off);
+ return btf_nested_type_is_trusted(&env->log, reg, field_name, btf_id, "__safe_trusted");
}
static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
@@ -5027,8 +5476,9 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+ const char *field_name = NULL;
enum bpf_type_flag flag = 0;
- u32 btf_id;
+ u32 btf_id = 0;
int ret;
if (!env->allow_ptr_leaks) {
@@ -5073,12 +5523,12 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) {
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type) && atype == BPF_WRITE) {
if (!btf_is_kernel(reg->btf)) {
verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
return -EFAULT;
}
- ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size);
} else {
/* Writes are permitted with default btf_struct_access for
* program allocated objects (which always have ref_obj_id > 0),
@@ -5095,47 +5545,63 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EFAULT;
}
- ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag, &field_name);
}
if (ret < 0)
return ret;
- /* If this is an untrusted pointer, all pointers formed by walking it
- * also inherit the untrusted flag.
- */
- if (type_flag(reg->type) & PTR_UNTRUSTED)
- flag |= PTR_UNTRUSTED;
+ if (ret != PTR_TO_BTF_ID) {
+ /* just mark; */
- /* By default any pointer obtained from walking a trusted pointer is no
- * longer trusted, unless the field being accessed has explicitly been
- * marked as inheriting its parent's state of trust.
- *
- * An RCU-protected pointer can also be deemed trusted if we are in an
- * RCU read region. This case is handled below.
- */
- if (nested_ptr_is_trusted(env, reg, off))
- flag |= PTR_TRUSTED;
- else
- flag &= ~PTR_TRUSTED;
-
- if (flag & MEM_RCU) {
- /* Mark value register as MEM_RCU only if it is protected by
- * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU
- * itself can already indicate trustedness inside the rcu
- * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
- * it could be null in some cases.
+ } else if (type_flag(reg->type) & PTR_UNTRUSTED) {
+ /* If this is an untrusted pointer, all pointers formed by walking it
+ * also inherit the untrusted flag.
*/
- if (!env->cur_state->active_rcu_lock ||
- !(is_trusted_reg(reg) || is_rcu_reg(reg)))
- flag &= ~MEM_RCU;
- else
- flag |= PTR_MAYBE_NULL;
- } else if (reg->type & MEM_RCU) {
- /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
- * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
+ flag = PTR_UNTRUSTED;
+
+ } else if (is_trusted_reg(reg) || is_rcu_reg(reg)) {
+ /* By default any pointer obtained from walking a trusted pointer is no
+ * longer trusted, unless the field being accessed has explicitly been
+ * marked as inheriting its parent's state of trust (either full or RCU).
+ * For example:
+ * 'cgroups' pointer is untrusted if task->cgroups dereference
+ * happened in a sleepable program outside of bpf_rcu_read_lock()
+ * section. In a non-sleepable program it's trusted while in RCU CS (aka MEM_RCU).
+ * Note bpf_rcu_read_unlock() converts MEM_RCU pointers to PTR_UNTRUSTED.
+ *
+ * A regular RCU-protected pointer with __rcu tag can also be deemed
+ * trusted if we are in an RCU CS. Such pointer can be NULL.
*/
- flag |= PTR_UNTRUSTED;
+ if (type_is_trusted(env, reg, field_name, btf_id)) {
+ flag |= PTR_TRUSTED;
+ } else if (in_rcu_cs(env) && !type_may_be_null(reg->type)) {
+ if (type_is_rcu(env, reg, field_name, btf_id)) {
+ /* ignore __rcu tag and mark it MEM_RCU */
+ flag |= MEM_RCU;
+ } else if (flag & MEM_RCU ||
+ type_is_rcu_or_null(env, reg, field_name, btf_id)) {
+ /* __rcu tagged pointers can be NULL */
+ flag |= MEM_RCU | PTR_MAYBE_NULL;
+ } else if (flag & (MEM_PERCPU | MEM_USER)) {
+ /* keep as-is */
+ } else {
+ /* walking unknown pointers yields old deprecated PTR_TO_BTF_ID */
+ clear_trusted_flags(&flag);
+ }
+ } else {
+ /*
+ * If not in RCU CS or MEM_RCU pointer can be NULL then
+ * aggressively mark as untrusted otherwise such
+ * pointers will be plain PTR_TO_BTF_ID without flags
+ * and will be allowed to be passed into helpers for
+ * compat reasons.
+ */
+ flag = PTR_UNTRUSTED;
+ }
+ } else {
+ /* Old compat. Deprecated */
+ clear_trusted_flags(&flag);
}
if (atype == BPF_READ && value_regno >= 0)
@@ -5194,7 +5660,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
/* Simulate access to a PTR_TO_BTF_ID */
memset(&map_reg, 0, sizeof(map_reg));
mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
- ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag, NULL);
if (ret < 0)
return ret;
@@ -5754,7 +6220,8 @@ static int check_stack_range_initialized(
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
if (*stype == STACK_MISC)
goto mark;
- if (*stype == STACK_ZERO) {
+ if ((*stype == STACK_ZERO) ||
+ (*stype == STACK_INVALID && env->allow_uninit_stack)) {
if (clobber) {
/* helper can write anything into the stack */
*stype = STACK_MISC;
@@ -5859,6 +6326,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
env,
regno, reg->off, access_size,
zero_size_allowed, ACCESS_HELPER, meta);
+ case PTR_TO_BTF_ID:
+ return check_ptr_to_btf_access(env, regs, regno, reg->off,
+ access_size, BPF_READ, -1);
case PTR_TO_CTX:
/* in case the function doesn't know how to access the context,
* (because we are in a program of type SYSCALL for example), we
@@ -6206,11 +6676,11 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
* Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
* type, and declare it as 'const struct bpf_dynptr *' in their prototype.
*/
-int process_dynptr_func(struct bpf_verifier_env *env, int regno,
- enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
+static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn_idx,
+ enum bpf_arg_type arg_type)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- int spi = 0;
+ int err;
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
@@ -6219,15 +6689,6 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
return -EFAULT;
}
- /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
- * check_func_arg_reg_off's logic. We only need to check offset
- * and its alignment for PTR_TO_STACK.
- */
- if (reg->type == PTR_TO_STACK) {
- spi = dynptr_get_spi(env, reg);
- if (spi < 0 && spi != -ERANGE)
- return spi;
- }
/* MEM_UNINIT - Points to memory that is an appropriate candidate for
* constructing a mutable bpf_dynptr object.
@@ -6245,30 +6706,30 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
* to.
*/
if (arg_type & MEM_UNINIT) {
- if (!is_dynptr_reg_valid_uninit(env, reg, spi)) {
+ int i;
+
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
verbose(env, "Dynptr has to be an uninitialized dynptr\n");
return -EINVAL;
}
- /* We only support one dynptr being uninitialized at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- if (meta->uninit_dynptr_regno) {
- verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
- return -EFAULT;
+ /* we write BPF_DW bits (8 bytes) at a time */
+ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
}
- meta->uninit_dynptr_regno = regno;
+ err = mark_stack_slots_dynptr(env, reg, arg_type, insn_idx);
} else /* MEM_RDONLY and None case from above */ {
- int err;
-
/* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
return -EINVAL;
}
- if (!is_dynptr_reg_valid_init(env, reg, spi)) {
+ if (!is_dynptr_reg_valid_init(env, reg)) {
verbose(env,
"Expected an initialized dynptr as arg #%d\n",
regno);
@@ -6277,29 +6738,211 @@ int process_dynptr_func(struct bpf_verifier_env *env, int regno,
/* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
- const char *err_extra = "";
-
- switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
- case DYNPTR_TYPE_LOCAL:
- err_extra = "local";
- break;
- case DYNPTR_TYPE_RINGBUF:
- err_extra = "ringbuf";
- break;
- default:
- err_extra = "<unknown>";
- break;
- }
verbose(env,
"Expected a dynptr of type %s as arg #%d\n",
- err_extra, regno);
+ dynptr_type_str(arg_to_dynptr_type(arg_type)), regno);
return -EINVAL;
}
err = mark_dynptr_read(env, reg);
+ }
+ return err;
+}
+
+static u32 iter_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg, int spi)
+{
+ struct bpf_func_state *state = func(env, reg);
+
+ return state->stack[spi].spilled_ptr.ref_obj_id;
+}
+
+static bool is_iter_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & (KF_ITER_NEW | KF_ITER_NEXT | KF_ITER_DESTROY);
+}
+
+static bool is_iter_new_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEW;
+}
+
+static bool is_iter_next_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_NEXT;
+}
+
+static bool is_iter_destroy_kfunc(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ITER_DESTROY;
+}
+
+static bool is_kfunc_arg_iter(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+ /* btf_check_iter_kfuncs() guarantees that first argument of any iter
+ * kfunc is iter state pointer
+ */
+ return arg == 0 && is_iter_kfunc(meta);
+}
+
+static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ const struct btf_type *t;
+ const struct btf_param *arg;
+ int spi, err, i, nr_slots;
+ u32 btf_id;
+
+ /* btf_check_iter_kfuncs() ensures we don't need to validate anything here */
+ arg = &btf_params(meta->func_proto)[0];
+ t = btf_type_skip_modifiers(meta->btf, arg->type, NULL); /* PTR */
+ t = btf_type_skip_modifiers(meta->btf, t->type, &btf_id); /* STRUCT */
+ nr_slots = t->size / BPF_REG_SIZE;
+
+ if (is_iter_new_kfunc(meta)) {
+ /* bpf_iter_<type>_new() expects pointer to uninit iter state */
+ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) {
+ verbose(env, "expected uninitialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < nr_slots * 8; i += BPF_REG_SIZE) {
+ err = check_mem_access(env, insn_idx, regno,
+ i, BPF_DW, BPF_WRITE, -1, false);
+ if (err)
+ return err;
+ }
+
+ err = mark_stack_slots_iter(env, reg, insn_idx, meta->btf, btf_id, nr_slots);
+ if (err)
+ return err;
+ } else {
+ /* iter_next() or iter_destroy() expect initialized iter state*/
+ if (!is_iter_reg_valid_init(env, reg, meta->btf, btf_id, nr_slots)) {
+ verbose(env, "expected an initialized iter_%s as arg #%d\n",
+ iter_type_str(meta->btf, btf_id), regno);
+ return -EINVAL;
+ }
+
+ spi = iter_get_spi(env, reg, nr_slots);
+ if (spi < 0)
+ return spi;
+
+ err = mark_iter_read(env, reg, spi, nr_slots);
if (err)
return err;
+
+ /* remember meta->iter info for process_iter_next_call() */
+ meta->iter.spi = spi;
+ meta->iter.frameno = reg->frameno;
+ meta->ref_obj_id = iter_ref_obj_id(env, reg, spi);
+
+ if (is_iter_destroy_kfunc(meta)) {
+ err = unmark_stack_slots_iter(env, reg, nr_slots);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* process_iter_next_call() is called when verifier gets to iterator's next
+ * "method" (e.g., bpf_iter_num_next() for numbers iterator) call. We'll refer
+ * to it as just "iter_next()" in comments below.
+ *
+ * BPF verifier relies on a crucial contract for any iter_next()
+ * implementation: it should *eventually* return NULL, and once that happens
+ * it should keep returning NULL. That is, once iterator exhausts elements to
+ * iterate, it should never reset or spuriously return new elements.
+ *
+ * With the assumption of such contract, process_iter_next_call() simulates
+ * a fork in the verifier state to validate loop logic correctness and safety
+ * without having to simulate infinite amount of iterations.
+ *
+ * In current state, we first assume that iter_next() returned NULL and
+ * iterator state is set to DRAINED (BPF_ITER_STATE_DRAINED). In such
+ * conditions we should not form an infinite loop and should eventually reach
+ * exit.
+ *
+ * Besides that, we also fork current state and enqueue it for later
+ * verification. In a forked state we keep iterator state as ACTIVE
+ * (BPF_ITER_STATE_ACTIVE) and assume non-NULL return from iter_next(). We
+ * also bump iteration depth to prevent erroneous infinite loop detection
+ * later on (see iter_active_depths_differ() comment for details). In this
+ * state we assume that we'll eventually loop back to another iter_next()
+ * calls (it could be in exactly same location or in some other instruction,
+ * it doesn't matter, we don't make any unnecessary assumptions about this,
+ * everything revolves around iterator state in a stack slot, not which
+ * instruction is calling iter_next()). When that happens, we either will come
+ * to iter_next() with equivalent state and can conclude that next iteration
+ * will proceed in exactly the same way as we just verified, so it's safe to
+ * assume that loop converges. If not, we'll go on another iteration
+ * simulation with a different input state, until all possible starting states
+ * are validated or we reach maximum number of instructions limit.
+ *
+ * This way, we will either exhaustively discover all possible input states
+ * that iterator loop can start with and eventually will converge, or we'll
+ * effectively regress into bounded loop simulation logic and either reach
+ * maximum number of instructions if loop is not provably convergent, or there
+ * is some statically known limit on number of iterations (e.g., if there is
+ * an explicit `if n > 100 then break;` statement somewhere in the loop).
+ *
+ * One very subtle but very important aspect is that we *always* simulate NULL
+ * condition first (as the current state) before we simulate non-NULL case.
+ * This has to do with intricacies of scalar precision tracking. By simulating
+ * "exit condition" of iter_next() returning NULL first, we make sure all the
+ * relevant precision marks *that will be set **after** we exit iterator loop*
+ * are propagated backwards to common parent state of NULL and non-NULL
+ * branches. Thanks to that, state equivalence checks done later in forked
+ * state, when reaching iter_next() for ACTIVE iterator, can assume that
+ * precision marks are finalized and won't change. Because simulating another
+ * ACTIVE iterator iteration won't change them (because given same input
+ * states we'll end up with exactly same output states which we are currently
+ * comparing; and verification after the loop already propagated back what
+ * needs to be **additionally** tracked as precise). It's subtle, grok
+ * precision tracking for more intuitive understanding.
+ */
+static int process_iter_next_call(struct bpf_verifier_env *env, int insn_idx,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_verifier_state *cur_st = env->cur_state, *queued_st;
+ struct bpf_func_state *cur_fr = cur_st->frame[cur_st->curframe], *queued_fr;
+ struct bpf_reg_state *cur_iter, *queued_iter;
+ int iter_frameno = meta->iter.frameno;
+ int iter_spi = meta->iter.spi;
+
+ BTF_TYPE_EMIT(struct bpf_iter);
+
+ cur_iter = &env->cur_state->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+
+ if (cur_iter->iter.state != BPF_ITER_STATE_ACTIVE &&
+ cur_iter->iter.state != BPF_ITER_STATE_DRAINED) {
+ verbose(env, "verifier internal error: unexpected iterator state %d (%s)\n",
+ cur_iter->iter.state, iter_state_str(cur_iter->iter.state));
+ return -EFAULT;
+ }
+
+ if (cur_iter->iter.state == BPF_ITER_STATE_ACTIVE) {
+ /* branch out active iter state */
+ queued_st = push_stack(env, insn_idx + 1, insn_idx, false);
+ if (!queued_st)
+ return -ENOMEM;
+
+ queued_iter = &queued_st->frame[iter_frameno]->stack[iter_spi].spilled_ptr;
+ queued_iter->iter.state = BPF_ITER_STATE_ACTIVE;
+ queued_iter->iter.depth++;
+
+ queued_fr = queued_st->frame[queued_st->curframe];
+ mark_ptr_not_null_reg(&queued_fr->regs[BPF_REG_0]);
}
+
+ /* switch to DRAINED state, but keep the depth unchanged */
+ /* mark current iter state as drained and assume returned NULL */
+ cur_iter->iter.state = BPF_ITER_STATE_DRAINED;
+ __mark_reg_const_zero(&cur_fr->regs[BPF_REG_0]);
+
return 0;
}
@@ -6397,6 +7040,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MEM,
PTR_TO_MEM | MEM_RINGBUF,
PTR_TO_BUF,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
};
@@ -6506,6 +7150,9 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
if (arg_type & PTR_MAYBE_NULL)
type &= ~PTR_MAYBE_NULL;
+ if (meta->func_id == BPF_FUNC_kptr_xchg && type & MEM_ALLOC)
+ type &= ~MEM_ALLOC;
+
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
if (expected == NOT_INIT)
@@ -6522,7 +7169,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
found:
- if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) {
+ if (base_type(reg->type) != PTR_TO_BTF_ID)
+ return 0;
+
+ if (compatible == &mem_types) {
+ if (!(arg_type & MEM_RDONLY)) {
+ verbose(env,
+ "%s() may write into memory pointed by R%d type=%s\n",
+ func_id_name(meta->func_id),
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ return 0;
+ }
+
+ switch ((int)reg->type) {
+ case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL:
+ case PTR_TO_BTF_ID | PTR_MAYBE_NULL | MEM_RCU:
+ {
/* For bpf_sk_release, it needs to match against first member
* 'struct sock_common', hence make an exception for it. This
* allows bpf_sk_release to work for multiple socket types.
@@ -6530,6 +7197,12 @@ found:
bool strict_type_match = arg_type_is_release(arg_type) &&
meta->func_id != BPF_FUNC_sk_release;
+ if (type_may_be_null(reg->type) &&
+ (!type_may_be_null(arg_type) || arg_type_is_release(arg_type))) {
+ verbose(env, "Possibly NULL pointer passed to helper arg%d\n", regno);
+ return -EACCES;
+ }
+
if (!arg_btf_id) {
if (!compatible->btf_id) {
verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -6553,18 +7226,29 @@ found:
btf_vmlinux, *arg_btf_id,
strict_type_match)) {
verbose(env, "R%d is of type %s but %s is expected\n",
- regno, kernel_type_name(reg->btf, reg->btf_id),
- kernel_type_name(btf_vmlinux, *arg_btf_id));
+ regno, btf_type_name(reg->btf, reg->btf_id),
+ btf_type_name(btf_vmlinux, *arg_btf_id));
return -EACCES;
}
}
- } else if (type_is_alloc(reg->type)) {
- if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) {
+ break;
+ }
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock &&
+ meta->func_id != BPF_FUNC_kptr_xchg) {
verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
return -EFAULT;
}
+ /* Handled by helper specific checks */
+ break;
+ case PTR_TO_BTF_ID | MEM_PERCPU:
+ case PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED:
+ /* Handled by helper specific checks */
+ break;
+ default:
+ verbose(env, "verifier internal error: invalid PTR_TO_BTF_ID register for type match\n");
+ return -EFAULT;
}
-
return 0;
}
@@ -6614,7 +7298,7 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
verbose(env, "R%d must have zero offset when passed to release func\n",
regno);
verbose(env, "No graph node or root found at R%d type:%s off:%d\n", regno,
- kernel_type_name(reg->btf, reg->btf_id), reg->off);
+ btf_type_name(reg->btf, reg->btf_id), reg->off);
return -EINVAL;
}
@@ -6651,7 +7335,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
case PTR_TO_BTF_ID | MEM_ALLOC:
case PTR_TO_BTF_ID | PTR_TRUSTED:
case PTR_TO_BTF_ID | MEM_RCU:
- case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
case PTR_TO_BTF_ID | MEM_ALLOC | NON_OWN_REF:
/* When referenced PTR_TO_BTF_ID is passed to release function,
* its fixed offset must be 0. In the other cases, fixed offset
@@ -6666,6 +7349,28 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
}
}
+static struct bpf_reg_state *get_dynptr_arg_reg(struct bpf_verifier_env *env,
+ const struct bpf_func_proto *fn,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_reg_state *state = NULL;
+ int i;
+
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
+ if (arg_type_is_dynptr(fn->arg_type[i])) {
+ if (state) {
+ verbose(env, "verifier internal error: multiple dynptr args\n");
+ return NULL;
+ }
+ state = &regs[BPF_REG_1 + i];
+ }
+
+ if (!state)
+ verbose(env, "verifier internal error: no dynptr arg found\n");
+
+ return state;
+}
+
static int dynptr_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
@@ -6692,9 +7397,28 @@ static int dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state
return state->stack[spi].spilled_ptr.ref_obj_id;
}
+static enum bpf_dynptr_type dynptr_get_type(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->dynptr.type;
+
+ spi = __get_spi(reg->off);
+ if (spi < 0) {
+ verbose(env, "verifier internal error: invalid spi when querying dynptr type\n");
+ return BPF_DYNPTR_TYPE_INVALID;
+ }
+
+ return state->stack[spi].spilled_ptr.dynptr.type;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
- const struct bpf_func_proto *fn)
+ const struct bpf_func_proto *fn,
+ int insn_idx)
{
u32 regno = BPF_REG_1 + arg;
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
@@ -6907,7 +7631,7 @@ skip_type_check:
err = check_mem_size_reg(env, reg, regno, true, meta);
break;
case ARG_PTR_TO_DYNPTR:
- err = process_dynptr_func(env, regno, arg_type, meta);
+ err = process_dynptr_func(env, regno, insn_idx, arg_type);
if (err)
return err;
break;
@@ -7126,22 +7850,26 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
break;
case BPF_MAP_TYPE_SK_STORAGE:
if (func_id != BPF_FUNC_sk_storage_get &&
- func_id != BPF_FUNC_sk_storage_delete)
+ func_id != BPF_FUNC_sk_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_INODE_STORAGE:
if (func_id != BPF_FUNC_inode_storage_get &&
- func_id != BPF_FUNC_inode_storage_delete)
+ func_id != BPF_FUNC_inode_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_TASK_STORAGE:
if (func_id != BPF_FUNC_task_storage_get &&
- func_id != BPF_FUNC_task_storage_delete)
+ func_id != BPF_FUNC_task_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_CGRP_STORAGE:
if (func_id != BPF_FUNC_cgrp_storage_get &&
- func_id != BPF_FUNC_cgrp_storage_delete)
+ func_id != BPF_FUNC_cgrp_storage_delete &&
+ func_id != BPF_FUNC_kptr_xchg)
goto error;
break;
case BPF_MAP_TYPE_BLOOM_FILTER:
@@ -7355,6 +8083,9 @@ static int check_func_proto(const struct bpf_func_proto *fn, int func_id)
/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
* are now invalid, so turn them into unknown SCALAR_VALUE.
+ *
+ * This also applies to dynptr slices belonging to skb and xdp dynptrs,
+ * since these slices point to packet data.
*/
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
@@ -7362,8 +8093,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
struct bpf_reg_state *reg;
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
- if (reg_is_pkt_pointer_any(reg))
- __mark_reg_unknown(env, reg);
+ if (reg_is_pkt_pointer_any(reg) || reg_is_dynptr_slice_pkt(reg))
+ mark_reg_invalid(env, reg);
}));
}
@@ -7408,12 +8139,8 @@ static int release_reference(struct bpf_verifier_env *env,
return err;
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
- if (reg->ref_obj_id == ref_obj_id) {
- if (!env->allow_ptr_leaks)
- __mark_reg_not_init(env, reg);
- else
- __mark_reg_unknown(env, reg);
- }
+ if (reg->ref_obj_id == ref_obj_id)
+ mark_reg_invalid(env, reg);
}));
return 0;
@@ -7426,7 +8153,7 @@ static void invalidate_non_owning_refs(struct bpf_verifier_env *env)
bpf_for_each_reg_in_vstate(env->cur_state, unused, reg, ({
if (type_is_non_owning_ref(reg->type))
- __mark_reg_unknown(env, reg);
+ mark_reg_invalid(env, reg);
}));
}
@@ -7788,10 +8515,10 @@ static int set_rbtree_add_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx)
{
- /* void bpf_rbtree_add(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ /* void bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
* bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b));
*
- * 'struct bpf_rb_node *node' arg to bpf_rbtree_add is the same PTR_TO_BTF_ID w/ offset
+ * 'struct bpf_rb_node *node' arg to bpf_rbtree_add_impl is the same PTR_TO_BTF_ID w/ offset
* that 'less' callback args will be receiving. However, 'node' arg was release_reference'd
* by this point, so look at 'root'
*/
@@ -8197,7 +8924,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- err = check_func_arg(env, i, &meta, fn);
+ err = check_func_arg(env, i, &meta, fn, insn_idx);
if (err)
return err;
}
@@ -8222,30 +8949,6 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs = cur_regs(env);
- /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
- * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr
- * is safe to do directly.
- */
- if (meta.uninit_dynptr_regno) {
- if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) {
- verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n");
- return -EFAULT;
- }
- /* we write BPF_DW bits (8 bytes) at a time */
- for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
- err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
- i, BPF_DW, BPF_WRITE, -1, false);
- if (err)
- return err;
- }
-
- err = mark_stack_slots_dynptr(env, &regs[meta.uninit_dynptr_regno],
- fn->arg_type[meta.uninit_dynptr_regno - BPF_REG_1],
- insn_idx);
- if (err)
- return err;
- }
-
if (meta.release_regno) {
err = -EINVAL;
/* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
@@ -8330,43 +9033,62 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
break;
case BPF_FUNC_dynptr_data:
- for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
- if (arg_type_is_dynptr(fn->arg_type[i])) {
- struct bpf_reg_state *reg = &regs[BPF_REG_1 + i];
- int id, ref_obj_id;
+ {
+ struct bpf_reg_state *reg;
+ int id, ref_obj_id;
- if (meta.dynptr_id) {
- verbose(env, "verifier internal error: meta.dynptr_id already set\n");
- return -EFAULT;
- }
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
+ return -EFAULT;
- if (meta.ref_obj_id) {
- verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
- return -EFAULT;
- }
- id = dynptr_id(env, reg);
- if (id < 0) {
- verbose(env, "verifier internal error: failed to obtain dynptr id\n");
- return id;
- }
+ if (meta.dynptr_id) {
+ verbose(env, "verifier internal error: meta.dynptr_id already set\n");
+ return -EFAULT;
+ }
+ if (meta.ref_obj_id) {
+ verbose(env, "verifier internal error: meta.ref_obj_id already set\n");
+ return -EFAULT;
+ }
- ref_obj_id = dynptr_ref_obj_id(env, reg);
- if (ref_obj_id < 0) {
- verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
- return ref_obj_id;
- }
+ id = dynptr_id(env, reg);
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
- meta.dynptr_id = id;
- meta.ref_obj_id = ref_obj_id;
- break;
- }
+ ref_obj_id = dynptr_ref_obj_id(env, reg);
+ if (ref_obj_id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr ref_obj_id\n");
+ return ref_obj_id;
}
- if (i == MAX_BPF_FUNC_REG_ARGS) {
- verbose(env, "verifier internal error: no dynptr in bpf_dynptr_data()\n");
+
+ meta.dynptr_id = id;
+ meta.ref_obj_id = ref_obj_id;
+
+ break;
+ }
+ case BPF_FUNC_dynptr_write:
+ {
+ enum bpf_dynptr_type dynptr_type;
+ struct bpf_reg_state *reg;
+
+ reg = get_dynptr_arg_reg(env, fn, regs);
+ if (!reg)
return -EFAULT;
- }
+
+ dynptr_type = dynptr_get_type(env, reg);
+ if (dynptr_type == BPF_DYNPTR_TYPE_INVALID)
+ return -EFAULT;
+
+ if (dynptr_type == BPF_DYNPTR_TYPE_SKB)
+ /* this will trigger clear_all_pkt_pointers(), which will
+ * invalidate all dynptr slices associated with the skb
+ */
+ changes_data = true;
+
break;
+ }
case BPF_FUNC_user_ringbuf_drain:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_user_ringbuf_callback_state);
@@ -8479,6 +9201,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (func_id == BPF_FUNC_kptr_xchg) {
ret_btf = meta.kptr_field->kptr.btf;
ret_btf_id = meta.kptr_field->kptr.btf_id;
+ if (!btf_is_kernel(ret_btf))
+ regs[BPF_REG_0].type |= MEM_ALLOC;
} else {
if (fn->ret_btf_id == BPF_PTR_POISON) {
verbose(env, "verifier internal error:");
@@ -8595,36 +9319,6 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
-struct bpf_kfunc_call_arg_meta {
- /* In parameters */
- struct btf *btf;
- u32 func_id;
- u32 kfunc_flags;
- const struct btf_type *func_proto;
- const char *func_name;
- /* Out parameters */
- u32 ref_obj_id;
- u8 release_regno;
- bool r0_rdonly;
- u32 ret_btf_id;
- u64 r0_size;
- u32 subprogno;
- struct {
- u64 value;
- bool found;
- } arg_constant;
- struct {
- struct btf *btf;
- u32 btf_id;
- } arg_obj_drop;
- struct {
- struct btf_field *field;
- } arg_list_head;
- struct {
- struct btf_field *field;
- } arg_rbtree_root;
-};
-
static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
{
return meta->kfunc_flags & KF_ACQUIRE;
@@ -8642,7 +9336,7 @@ static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
{
- return meta->kfunc_flags & KF_TRUSTED_ARGS;
+ return (meta->kfunc_flags & KF_TRUSTED_ARGS) || is_kfunc_release(meta);
}
static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
@@ -8660,11 +9354,6 @@ static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
return meta->kfunc_flags & KF_RCU;
}
-static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg)
-{
- return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET);
-}
-
static bool __kfunc_param_match_suffix(const struct btf *btf,
const struct btf_param *arg,
const char *suffix)
@@ -8696,6 +9385,19 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
return __kfunc_param_match_suffix(btf, arg, "__sz");
}
+static bool is_kfunc_arg_const_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return __kfunc_param_match_suffix(btf, arg, "__szk");
+}
+
static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
{
return __kfunc_param_match_suffix(btf, arg, "__k");
@@ -8711,6 +9413,16 @@ static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param
return __kfunc_param_match_suffix(btf, arg, "__alloc");
}
+static bool is_kfunc_arg_uninit(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__uninit");
+}
+
+static bool is_kfunc_arg_refcounted_kptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__refcounted_kptr");
+}
+
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -8850,14 +9562,15 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_CTX,
- KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
- KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */
+ KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
+ KF_ARG_PTR_TO_REFCOUNTED_KPTR, /* Refcounted local kptr */
KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_ITER,
KF_ARG_PTR_TO_LIST_HEAD,
KF_ARG_PTR_TO_LIST_NODE,
- KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
+ KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
KF_ARG_PTR_TO_MEM,
- KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+ KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
KF_ARG_PTR_TO_CALLBACK,
KF_ARG_PTR_TO_RB_ROOT,
KF_ARG_PTR_TO_RB_NODE,
@@ -8866,8 +9579,9 @@ enum kfunc_ptr_arg_type {
enum special_kfunc_type {
KF_bpf_obj_new_impl,
KF_bpf_obj_drop_impl,
- KF_bpf_list_push_front,
- KF_bpf_list_push_back,
+ KF_bpf_refcount_acquire_impl,
+ KF_bpf_list_push_front_impl,
+ KF_bpf_list_push_back_impl,
KF_bpf_list_pop_front,
KF_bpf_list_pop_back,
KF_bpf_cast_to_kern_ctx,
@@ -8875,29 +9589,39 @@ enum special_kfunc_type {
KF_bpf_rcu_read_lock,
KF_bpf_rcu_read_unlock,
KF_bpf_rbtree_remove,
- KF_bpf_rbtree_add,
+ KF_bpf_rbtree_add_impl,
KF_bpf_rbtree_first,
+ KF_bpf_dynptr_from_skb,
+ KF_bpf_dynptr_from_xdp,
+ KF_bpf_dynptr_slice,
+ KF_bpf_dynptr_slice_rdwr,
};
BTF_SET_START(special_kfunc_set)
BTF_ID(func, bpf_obj_new_impl)
BTF_ID(func, bpf_obj_drop_impl)
-BTF_ID(func, bpf_list_push_front)
-BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_SET_END(special_kfunc_set)
BTF_ID_LIST(special_kfunc_list)
BTF_ID(func, bpf_obj_new_impl)
BTF_ID(func, bpf_obj_drop_impl)
-BTF_ID(func, bpf_list_push_front)
-BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_refcount_acquire_impl)
+BTF_ID(func, bpf_list_push_front_impl)
+BTF_ID(func, bpf_list_push_back_impl)
BTF_ID(func, bpf_list_pop_front)
BTF_ID(func, bpf_list_pop_back)
BTF_ID(func, bpf_cast_to_kern_ctx)
@@ -8905,8 +9629,12 @@ BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rcu_read_lock)
BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
-BTF_ID(func, bpf_rbtree_add)
+BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+BTF_ID(func, bpf_dynptr_from_skb)
+BTF_ID(func, bpf_dynptr_from_xdp)
+BTF_ID(func, bpf_dynptr_slice)
+BTF_ID(func, bpf_dynptr_slice_rdwr)
static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -8944,24 +9672,15 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_ALLOC_BTF_ID;
- if (is_kfunc_arg_kptr_get(meta, argno)) {
- if (!btf_type_is_ptr(ref_t)) {
- verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n");
- return -EINVAL;
- }
- ref_t = btf_type_by_id(meta->btf, ref_t->type);
- ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off);
- if (!btf_type_is_struct(ref_t)) {
- verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n",
- meta->func_name, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- return KF_ARG_PTR_TO_KPTR;
- }
+ if (is_kfunc_arg_refcounted_kptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_REFCOUNTED_KPTR;
if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_DYNPTR;
+ if (is_kfunc_arg_iter(meta, argno))
+ return KF_ARG_PTR_TO_ITER;
+
if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_LIST_HEAD;
@@ -8986,7 +9705,10 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
- if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+
+ if (argno + 1 < nargs &&
+ (is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
+ is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
arg_mem_size = true;
/* This is the catch all argument type of register types supported by
@@ -9066,40 +9788,6 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
return 0;
}
-static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- const struct btf_type *ref_t,
- const char *ref_tname,
- struct bpf_kfunc_call_arg_meta *meta,
- int argno)
-{
- struct btf_field *kptr_field;
-
- /* check_func_arg_reg_off allows var_off for
- * PTR_TO_MAP_VALUE, but we need fixed offset to find
- * off_desc.
- */
- if (!tnum_is_const(reg->var_off)) {
- verbose(env, "arg#0 must have constant offset\n");
- return -EINVAL;
- }
-
- kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
- if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
- verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
- reg->off + reg->var_off.value);
- return -EINVAL;
- }
-
- if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf,
- kptr_field->kptr.btf_id, true)) {
- verbose(env, "kernel function %s args#%d expected pointer to %s %s\n",
- meta->func_name, argno, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- return 0;
-}
-
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -9206,7 +9894,6 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
ptr = reg->map_ptr;
break;
case PTR_TO_BTF_ID | MEM_ALLOC:
- case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
ptr = reg->btf;
break;
default:
@@ -9227,27 +9914,28 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
static bool is_bpf_list_api_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
- btf_id == special_kfunc_list[KF_bpf_list_push_back] ||
+ return btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ btf_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
btf_id == special_kfunc_list[KF_bpf_list_pop_back];
}
static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_rbtree_add] ||
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
btf_id == special_kfunc_list[KF_bpf_rbtree_first];
}
static bool is_bpf_graph_api_kfunc(u32 btf_id)
{
- return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id);
+ return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
+ btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}
static bool is_callback_calling_kfunc(u32 btf_id)
{
- return btf_id == special_kfunc_list[KF_bpf_rbtree_add];
+ return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
static bool is_rbtree_lock_required_kfunc(u32 btf_id)
@@ -9288,12 +9976,12 @@ static bool check_kfunc_is_graph_node_api(struct bpf_verifier_env *env,
switch (node_field_type) {
case BPF_LIST_NODE:
- ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back]);
+ ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ kfunc_btf_id == special_kfunc_list[KF_bpf_list_push_back_impl]);
break;
case BPF_RB_NODE:
ret = (kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_remove] ||
- kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add]);
+ kfunc_btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl]);
break;
default:
verbose(env, "verifier internal error: unexpected graph node argument type %s\n",
@@ -9455,11 +10143,13 @@ static int process_kf_arg_ptr_to_rbtree_node(struct bpf_verifier_env *env,
&meta->arg_rbtree_root.field);
}
-static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta)
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta,
+ int insn_idx)
{
const char *func_name = meta->func_name, *ref_tname;
const struct btf *btf = meta->btf;
const struct btf_param *args;
+ struct btf_record *rec;
u32 i, nargs;
int ret;
@@ -9538,7 +10228,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
- if (is_kfunc_trusted_args(meta) &&
+ if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
(register_is_null(reg) || type_may_be_null(reg->type))) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
return -EACCES;
@@ -9585,8 +10275,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
/* Trusted arguments have the same offset checks as release arguments */
arg_type |= OBJ_RELEASE;
break;
- case KF_ARG_PTR_TO_KPTR:
case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
case KF_ARG_PTR_TO_LIST_NODE:
case KF_ARG_PTR_TO_RB_ROOT:
@@ -9594,6 +10284,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_MEM:
case KF_ARG_PTR_TO_MEM_SIZE:
case KF_ARG_PTR_TO_CALLBACK:
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
/* Trusted by default */
break;
default:
@@ -9636,23 +10327,46 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
meta->arg_obj_drop.btf_id = reg->btf_id;
}
break;
- case KF_ARG_PTR_TO_KPTR:
- if (reg->type != PTR_TO_MAP_VALUE) {
- verbose(env, "arg#0 expected pointer to map value\n");
- return -EINVAL;
- }
- ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i);
- if (ret < 0)
- return ret;
- break;
case KF_ARG_PTR_TO_DYNPTR:
+ {
+ enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
+
if (reg->type != PTR_TO_STACK &&
reg->type != CONST_PTR_TO_DYNPTR) {
verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
return -EINVAL;
}
- ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL);
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ dynptr_arg_type |= MEM_RDONLY;
+
+ if (is_kfunc_arg_uninit(btf, &args[i]))
+ dynptr_arg_type |= MEM_UNINIT;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_skb])
+ dynptr_arg_type |= DYNPTR_TYPE_SKB;
+ else if (meta->func_id == special_kfunc_list[KF_bpf_dynptr_from_xdp])
+ dynptr_arg_type |= DYNPTR_TYPE_XDP;
+
+ ret = process_dynptr_func(env, regno, insn_idx, dynptr_arg_type);
+ if (ret < 0)
+ return ret;
+
+ if (!(dynptr_arg_type & MEM_UNINIT)) {
+ int id = dynptr_id(env, reg);
+
+ if (id < 0) {
+ verbose(env, "verifier internal error: failed to obtain dynptr id\n");
+ return id;
+ }
+ meta->initialized_dynptr.id = id;
+ meta->initialized_dynptr.type = dynptr_get_type(env, reg);
+ }
+
+ break;
+ }
+ case KF_ARG_PTR_TO_ITER:
+ ret = process_iter_arg(env, regno, insn_idx, meta);
if (ret < 0)
return ret;
break;
@@ -9749,17 +10463,59 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return ret;
break;
case KF_ARG_PTR_TO_MEM_SIZE:
- ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+ {
+ struct bpf_reg_state *size_reg = &regs[regno + 1];
+ const struct btf_param *size_arg = &args[i + 1];
+
+ ret = check_kfunc_mem_size_reg(env, size_reg, regno + 1);
if (ret < 0) {
verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
return ret;
}
- /* Skip next '__sz' argument */
+
+ if (is_kfunc_arg_const_mem_size(meta->btf, size_arg, size_reg)) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(size_reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno + 1);
+ return -EINVAL;
+ }
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = size_reg->var_off.value;
+ }
+
+ /* Skip next '__sz' or '__szk' argument */
i++;
break;
+ }
case KF_ARG_PTR_TO_CALLBACK:
meta->subprogno = reg->subprogno;
break;
+ case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
+ if (!type_is_ptr_alloc_obj(reg->type) && !type_is_non_owning_ref(reg->type)) {
+ verbose(env, "arg#%d is neither owning or non-owning ref\n", i);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!rec) {
+ verbose(env, "verifier internal error: Couldn't find btf_record\n");
+ return -EFAULT;
+ }
+
+ if (rec->refcount_off < 0) {
+ verbose(env, "arg#%d doesn't point to a type with bpf_refcount field\n", i);
+ return -EINVAL;
+ }
+ if (rec->refcount_off >= 0) {
+ verbose(env, "bpf_refcount_acquire calls are disabled for now\n");
+ return -EINVAL;
+ }
+ meta->arg_refcount_acquire.btf = reg->btf;
+ meta->arg_refcount_acquire.btf_id = reg->btf_id;
+ break;
}
}
@@ -9772,24 +10528,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return 0;
}
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx_p)
+static int fetch_kfunc_meta(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const char **kfunc_name)
{
- const struct btf_type *t, *func, *func_proto, *ptr_type;
- u32 i, nargs, func_id, ptr_type_id, release_ref_obj_id;
- struct bpf_reg_state *regs = cur_regs(env);
- const char *func_name, *ptr_type_name;
- bool sleepable, rcu_lock, rcu_unlock;
- struct bpf_kfunc_call_arg_meta meta;
- int err, insn_idx = *insn_idx_p;
- const struct btf_param *args;
- const struct btf_type *ret_t;
+ const struct btf_type *func, *func_proto;
+ u32 func_id, *kfunc_flags;
+ const char *func_name;
struct btf *desc_btf;
- u32 *kfunc_flags;
- /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (kfunc_name)
+ *kfunc_name = NULL;
+
if (!insn->imm)
- return 0;
+ return -EINVAL;
desc_btf = find_kfunc_desc_btf(env, insn->off);
if (IS_ERR(desc_btf))
@@ -9798,22 +10551,53 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_id = insn->imm;
func = btf_type_by_id(desc_btf, func_id);
func_name = btf_name_by_offset(desc_btf, func->name_off);
+ if (kfunc_name)
+ *kfunc_name = func_name;
func_proto = btf_type_by_id(desc_btf, func->type);
kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
if (!kfunc_flags) {
- verbose(env, "calling kernel function %s is not allowed\n",
- func_name);
return -EACCES;
}
- /* Prepare kfunc call metadata */
- memset(&meta, 0, sizeof(meta));
- meta.btf = desc_btf;
- meta.func_id = func_id;
- meta.kfunc_flags = *kfunc_flags;
- meta.func_proto = func_proto;
- meta.func_name = func_name;
+ memset(meta, 0, sizeof(*meta));
+ meta->btf = desc_btf;
+ meta->func_id = func_id;
+ meta->kfunc_flags = *kfunc_flags;
+ meta->func_proto = func_proto;
+ meta->func_name = func_name;
+
+ return 0;
+}
+
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
+{
+ const struct btf_type *t, *ptr_type;
+ u32 i, nargs, ptr_type_id, release_ref_obj_id;
+ struct bpf_reg_state *regs = cur_regs(env);
+ const char *func_name, *ptr_type_name;
+ bool sleepable, rcu_lock, rcu_unlock;
+ struct bpf_kfunc_call_arg_meta meta;
+ struct bpf_insn_aux_data *insn_aux;
+ int err, insn_idx = *insn_idx_p;
+ const struct btf_param *args;
+ const struct btf_type *ret_t;
+ struct btf *desc_btf;
+
+ /* skip for now, but return error when we find this in fixup_kfunc_call */
+ if (!insn->imm)
+ return 0;
+
+ err = fetch_kfunc_meta(env, insn, &meta, &func_name);
+ if (err == -EACCES && func_name)
+ verbose(env, "calling kernel function %s is not allowed\n", func_name);
+ if (err)
+ return err;
+ desc_btf = meta.btf;
+ insn_aux = &env->insn_aux_data[insn_idx];
+
+ insn_aux->is_iter_next = is_iter_next_kfunc(&meta);
if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
@@ -9828,10 +10612,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
- if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) {
- verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name);
- return -EACCES;
- }
if (env->cur_state->active_rcu_lock) {
struct bpf_func_state *state;
@@ -9860,7 +10640,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
/* Check the arguments */
- err = check_kfunc_args(env, &meta);
+ err = check_kfunc_args(env, &meta, insn_idx);
if (err < 0)
return err;
/* In case of release function, we get register number of refcounted
@@ -9870,36 +10650,37 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
}
- if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front] ||
- meta.func_id == special_kfunc_list[KF_bpf_list_push_back] ||
- meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
release_ref_obj_id = regs[BPF_REG_2].ref_obj_id;
+ insn_aux->insert_off = regs[BPF_REG_2].off;
err = ref_convert_owning_non_owning(env, release_ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d conversion of owning ref to non-owning failed\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
err = release_reference(env, release_ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
}
- if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add]) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_rbtree_add_callback_state);
if (err) {
verbose(env, "kfunc %s#%d failed callback verification\n",
- func_name, func_id);
+ func_name, meta.func_id);
return err;
}
}
@@ -9908,11 +10689,13 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
- t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+ t = btf_type_skip_modifiers(desc_btf, meta.func_proto->type, NULL);
if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
/* Only exception is bpf_obj_new_impl */
- if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) {
+ if (meta.btf != btf_vmlinux ||
+ (meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl] &&
+ meta.func_id != special_kfunc_list[KF_bpf_refcount_acquire_impl])) {
verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
return -EINVAL;
}
@@ -9957,13 +10740,18 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].btf = ret_btf;
regs[BPF_REG_0].btf_id = ret_btf_id;
- env->insn_aux_data[insn_idx].obj_new_size = ret_t->size;
- env->insn_aux_data[insn_idx].kptr_struct_meta =
+ insn_aux->obj_new_size = ret_t->size;
+ insn_aux->kptr_struct_meta =
btf_find_struct_meta(ret_btf, ret_btf_id);
- } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
- env->insn_aux_data[insn_idx].kptr_struct_meta =
- btf_find_struct_meta(meta.arg_obj_drop.btf,
- meta.arg_obj_drop.btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = meta.arg_refcount_acquire.btf;
+ regs[BPF_REG_0].btf_id = meta.arg_refcount_acquire.btf_id;
+
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_refcount_acquire.btf,
+ meta.arg_refcount_acquire.btf_id);
} else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
struct btf_field *field = meta.arg_list_head.field;
@@ -9991,6 +10779,42 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice] ||
+ meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) {
+ enum bpf_type_flag type_flag = get_dynptr_type_flag(meta.initialized_dynptr.type);
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+
+ if (!meta.arg_constant.found) {
+ verbose(env, "verifier internal error: bpf_dynptr_slice(_rdwr) no constant size\n");
+ return -EFAULT;
+ }
+
+ regs[BPF_REG_0].mem_size = meta.arg_constant.value;
+
+ /* PTR_MAYBE_NULL will be added when is_kfunc_ret_null is checked */
+ regs[BPF_REG_0].type = PTR_TO_MEM | type_flag;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice]) {
+ regs[BPF_REG_0].type |= MEM_RDONLY;
+ } else {
+ /* this will set env->seen_direct_write to true */
+ if (!may_access_direct_pkt_data(env, NULL, BPF_WRITE)) {
+ verbose(env, "the prog does not allow writes to packet data\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!meta.initialized_dynptr.id) {
+ verbose(env, "verifier internal error: no dynptr id\n");
+ return -EFAULT;
+ }
+ regs[BPF_REG_0].dynptr_id = meta.initialized_dynptr.id;
+
+ /* we don't need to set BPF_REG_0's ref obj id
+ * because packet slices are not refcounted (see
+ * dynptr_type_refcounted)
+ */
} else {
verbose(env, "kernel function %s unhandled dynamic return type\n",
meta.func_name);
@@ -9998,6 +10822,14 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
} else if (!__btf_type_is_struct(ptr_type)) {
if (!meta.r0_size) {
+ __u32 sz;
+
+ if (!IS_ERR(btf_resolve_size(desc_btf, ptr_type, &sz))) {
+ meta.r0_size = sz;
+ meta.r0_rdonly = true;
+ }
+ }
+ if (!meta.r0_size) {
ptr_type_name = btf_name_by_offset(desc_btf,
ptr_type->name_off);
verbose(env,
@@ -10043,15 +10875,20 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
ref_set_non_owning(env, &regs[BPF_REG_0]);
}
- if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_remove])
- invalidate_non_owning_refs(env);
-
if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
regs[BPF_REG_0].id = ++env->id_gen;
- } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
+ } else if (btf_type_is_void(t)) {
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ insn_aux->kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_obj_drop.btf,
+ meta.arg_obj_drop.btf_id);
+ }
+ }
+ }
- nargs = btf_type_vlen(func_proto);
- args = (const struct btf_param *)(func_proto + 1);
+ nargs = btf_type_vlen(meta.func_proto);
+ args = (const struct btf_param *)(meta.func_proto + 1);
for (i = 0; i < nargs; i++) {
u32 regno = i + 1;
@@ -10063,6 +10900,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
mark_btf_func_reg_size(env, regno, t->size);
}
+ if (is_iter_next_kfunc(&meta)) {
+ err = process_iter_next_call(env, insn_idx, &meta);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -11596,12 +12439,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
+ bool is_src_reg_u32 = src_reg->umax_value <= U32_MAX;
+
+ if (is_src_reg_u32 && !src_reg->id)
+ src_reg->id = ++env->id_gen;
copy_register_state(dst_reg, src_reg);
- /* Make sure ID is cleared otherwise
+ /* Make sure ID is cleared if src_reg is not in u32 range otherwise
* dst_reg min/max could be incorrectly
* propagated into src_reg by find_equal_scalars()
*/
- dst_reg->id = 0;
+ if (!is_src_reg_u32)
+ dst_reg->id = 0;
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
} else {
@@ -11769,10 +12617,14 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode)
case BPF_JEQ:
if (tnum_is_const(subreg))
return !!tnum_equals_const(subreg, val);
+ else if (val < reg->u32_min_value || val > reg->u32_max_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(subreg))
return !tnum_equals_const(subreg, val);
+ else if (val < reg->u32_min_value || val > reg->u32_max_value)
+ return 1;
break;
case BPF_JSET:
if ((~subreg.mask & subreg.value) & val)
@@ -11842,10 +12694,14 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode)
case BPF_JEQ:
if (tnum_is_const(reg->var_off))
return !!tnum_equals_const(reg->var_off, val);
+ else if (val < reg->umin_value || val > reg->umax_value)
+ return 0;
break;
case BPF_JNE:
if (tnum_is_const(reg->var_off))
return !tnum_equals_const(reg->var_off, val);
+ else if (val < reg->umin_value || val > reg->umax_value)
+ return 1;
break;
case BPF_JSET:
if ((~reg->var_off.mask & reg->var_off.value) & val)
@@ -12466,6 +13322,18 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
src_reg->var_off.value,
opcode,
is_jmp32);
+ } else if (dst_reg->type == SCALAR_VALUE &&
+ is_jmp32 && tnum_is_const(tnum_subreg(dst_reg->var_off))) {
+ pred = is_branch_taken(src_reg,
+ tnum_subreg(dst_reg->var_off).value,
+ flip_opcode(opcode),
+ is_jmp32);
+ } else if (dst_reg->type == SCALAR_VALUE &&
+ !is_jmp32 && tnum_is_const(dst_reg->var_off)) {
+ pred = is_branch_taken(src_reg,
+ dst_reg->var_off.value,
+ flip_opcode(opcode),
+ is_jmp32);
} else if (reg_is_pkt_pointer_any(dst_reg) &&
reg_is_pkt_pointer_any(src_reg) &&
!is_jmp32) {
@@ -12966,6 +13834,9 @@ static int check_return_code(struct bpf_verifier_env *env)
}
break;
+ case BPF_PROG_TYPE_NETFILTER:
+ range = tnum_range(NF_DROP, NF_ACCEPT);
+ break;
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
@@ -13060,6 +13931,17 @@ static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
return env->insn_aux_data[insn_idx].prune_point;
}
+static void mark_force_checkpoint(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].force_checkpoint = true;
+}
+
+static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].force_checkpoint;
+}
+
+
enum {
DONE_EXPLORING = 0,
KEEP_EXPLORING = 1,
@@ -13152,44 +14034,63 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
*/
static int visit_insn(int t, struct bpf_verifier_env *env)
{
- struct bpf_insn *insns = env->prog->insnsi;
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
int ret;
- if (bpf_pseudo_func(insns + t))
+ if (bpf_pseudo_func(insn))
return visit_func_call_insn(t, insns, env, true);
/* All non-branch instructions have a single fall-through edge. */
- if (BPF_CLASS(insns[t].code) != BPF_JMP &&
- BPF_CLASS(insns[t].code) != BPF_JMP32)
+ if (BPF_CLASS(insn->code) != BPF_JMP &&
+ BPF_CLASS(insn->code) != BPF_JMP32)
return push_insn(t, t + 1, FALLTHROUGH, env, false);
- switch (BPF_OP(insns[t].code)) {
+ switch (BPF_OP(insn->code)) {
case BPF_EXIT:
return DONE_EXPLORING;
case BPF_CALL:
- if (insns[t].imm == BPF_FUNC_timer_set_callback)
+ if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback)
/* Mark this call insn as a prune point to trigger
* is_state_visited() check before call itself is
* processed by __check_func_call(). Otherwise new
* async state will be pushed for further exploration.
*/
mark_prune_point(env, t);
- return visit_func_call_insn(t, insns, env,
- insns[t].src_reg == BPF_PSEUDO_CALL);
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ struct bpf_kfunc_call_arg_meta meta;
+
+ ret = fetch_kfunc_meta(env, insn, &meta, NULL);
+ if (ret == 0 && is_iter_next_kfunc(&meta)) {
+ mark_prune_point(env, t);
+ /* Checking and saving state checkpoints at iter_next() call
+ * is crucial for fast convergence of open-coded iterator loop
+ * logic, so we need to force it. If we don't do that,
+ * is_state_visited() might skip saving a checkpoint, causing
+ * unnecessarily long sequence of not checkpointed
+ * instructions and jumps, leading to exhaustion of jump
+ * history buffer, and potentially other undesired outcomes.
+ * It is expected that with correct open-coded iterators
+ * convergence will happen quickly, so we don't run a risk of
+ * exhausting memory.
+ */
+ mark_force_checkpoint(env, t);
+ }
+ }
+ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
- if (BPF_SRC(insns[t].code) != BPF_K)
+ if (BPF_SRC(insn->code) != BPF_K)
return -EINVAL;
/* unconditional jump with single edge */
- ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env,
+ ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env,
true);
if (ret)
return ret;
- mark_prune_point(env, t + insns[t].off + 1);
- mark_jmp_point(env, t + insns[t].off + 1);
+ mark_prune_point(env, t + insn->off + 1);
+ mark_jmp_point(env, t + insn->off + 1);
return ret;
@@ -13201,7 +14102,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
if (ret)
return ret;
- return push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
+ return push_insn(t, t + insn->off + 1, BRANCH, env, true);
}
}
@@ -13822,7 +14723,7 @@ static bool regs_exact(const struct bpf_reg_state *rold,
const struct bpf_reg_state *rcur,
struct bpf_id_pair *idmap)
{
- return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
check_ids(rold->id, rcur->id, idmap) &&
check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
}
@@ -13877,13 +14778,17 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
tnum_in(rold->var_off, rcur->var_off);
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_BUF:
+ case PTR_TO_TP_BUFFER:
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
*/
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, var_off)) == 0 &&
range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off) &&
- check_ids(rold->id, rcur->id, idmap);
+ check_ids(rold->id, rcur->id, idmap) &&
+ check_ids(rold->ref_obj_id, rcur->ref_obj_id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
/* We must have at least as much range as the old ptr
@@ -13925,6 +14830,8 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
* didn't use them
*/
for (i = 0; i < old->allocated_stack; i++) {
+ struct bpf_reg_state *old_reg, *cur_reg;
+
spi = i / BPF_REG_SIZE;
if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) {
@@ -13936,6 +14843,10 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
continue;
+ if (env->allow_uninit_stack &&
+ old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC)
+ continue;
+
/* explored stack has more populated slots than current stack
* and these slots were used
*/
@@ -13977,9 +14888,6 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return false;
break;
case STACK_DYNPTR:
- {
- const struct bpf_reg_state *old_reg, *cur_reg;
-
old_reg = &old->stack[spi].spilled_ptr;
cur_reg = &cur->stack[spi].spilled_ptr;
if (old_reg->dynptr.type != cur_reg->dynptr.type ||
@@ -13987,7 +14895,22 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
return false;
break;
- }
+ case STACK_ITER:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ /* iter.depth is not compared between states as it
+ * doesn't matter for correctness and would otherwise
+ * prevent convergence; we maintain it only to prevent
+ * infinite loop check triggering, see
+ * iter_active_depths_differ()
+ */
+ if (old_reg->iter.btf != cur_reg->iter.btf ||
+ old_reg->iter.btf_id != cur_reg->iter.btf_id ||
+ old_reg->iter.state != cur_reg->iter.state ||
+ /* ignore {old_reg,cur_reg}->iter.depth, see above */
+ !check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
case STACK_MISC:
case STACK_ZERO:
case STACK_INVALID:
@@ -14201,10 +15124,11 @@ static int propagate_precision(struct bpf_verifier_env *env,
state_reg = state->regs;
for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
continue;
if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "frame %d: propagating r%d\n", i, fr);
+ verbose(env, "frame %d: propagating r%d\n", fr, i);
err = mark_chain_precision_frame(env, fr, i);
if (err < 0)
return err;
@@ -14215,11 +15139,12 @@ static int propagate_precision(struct bpf_verifier_env *env,
continue;
state_reg = &state->stack[i].spilled_ptr;
if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
+ !state_reg->precise ||
+ !(state_reg->live & REG_LIVE_READ))
continue;
if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "frame %d: propagating fp%d\n",
- (-i - 1) * BPF_REG_SIZE, fr);
+ fr, (-i - 1) * BPF_REG_SIZE);
err = mark_chain_precision_stack_frame(env, fr, i);
if (err < 0)
return err;
@@ -14246,6 +15171,92 @@ static bool states_maybe_looping(struct bpf_verifier_state *old,
return true;
}
+static bool is_iter_next_insn(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].is_iter_next;
+}
+
+/* is_state_visited() handles iter_next() (see process_iter_next_call() for
+ * terminology) calls specially: as opposed to bounded BPF loops, it *expects*
+ * states to match, which otherwise would look like an infinite loop. So while
+ * iter_next() calls are taken care of, we still need to be careful and
+ * prevent erroneous and too eager declaration of "ininite loop", when
+ * iterators are involved.
+ *
+ * Here's a situation in pseudo-BPF assembly form:
+ *
+ * 0: again: ; set up iter_next() call args
+ * 1: r1 = &it ; <CHECKPOINT HERE>
+ * 2: call bpf_iter_num_next ; this is iter_next() call
+ * 3: if r0 == 0 goto done
+ * 4: ... something useful here ...
+ * 5: goto again ; another iteration
+ * 6: done:
+ * 7: r1 = &it
+ * 8: call bpf_iter_num_destroy ; clean up iter state
+ * 9: exit
+ *
+ * This is a typical loop. Let's assume that we have a prune point at 1:,
+ * before we get to `call bpf_iter_num_next` (e.g., because of that `goto
+ * again`, assuming other heuristics don't get in a way).
+ *
+ * When we first time come to 1:, let's say we have some state X. We proceed
+ * to 2:, fork states, enqueue ACTIVE, validate NULL case successfully, exit.
+ * Now we come back to validate that forked ACTIVE state. We proceed through
+ * 3-5, come to goto, jump to 1:. Let's assume our state didn't change, so we
+ * are converging. But the problem is that we don't know that yet, as this
+ * convergence has to happen at iter_next() call site only. So if nothing is
+ * done, at 1: verifier will use bounded loop logic and declare infinite
+ * looping (and would be *technically* correct, if not for iterator's
+ * "eventual sticky NULL" contract, see process_iter_next_call()). But we
+ * don't want that. So what we do in process_iter_next_call() when we go on
+ * another ACTIVE iteration, we bump slot->iter.depth, to mark that it's
+ * a different iteration. So when we suspect an infinite loop, we additionally
+ * check if any of the *ACTIVE* iterator states depths differ. If yes, we
+ * pretend we are not looping and wait for next iter_next() call.
+ *
+ * This only applies to ACTIVE state. In DRAINED state we don't expect to
+ * loop, because that would actually mean infinite loop, as DRAINED state is
+ * "sticky", and so we'll keep returning into the same instruction with the
+ * same state (at least in one of possible code paths).
+ *
+ * This approach allows to keep infinite loop heuristic even in the face of
+ * active iterator. E.g., C snippet below is and will be detected as
+ * inifintely looping:
+ *
+ * struct bpf_iter_num it;
+ * int *p, x;
+ *
+ * bpf_iter_num_new(&it, 0, 10);
+ * while ((p = bpf_iter_num_next(&t))) {
+ * x = p;
+ * while (x--) {} // <<-- infinite loop here
+ * }
+ *
+ */
+static bool iter_active_depths_differ(struct bpf_verifier_state *old, struct bpf_verifier_state *cur)
+{
+ struct bpf_reg_state *slot, *cur_slot;
+ struct bpf_func_state *state;
+ int i, fr;
+
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_ITER)
+ continue;
+
+ slot = &state->stack[i].spilled_ptr;
+ if (slot->iter.state != BPF_ITER_STATE_ACTIVE)
+ continue;
+
+ cur_slot = &cur->frame[fr]->stack[i].spilled_ptr;
+ if (cur_slot->iter.depth != slot->iter.depth)
+ return true;
+ }
+ }
+ return false;
+}
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
@@ -14253,7 +15264,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
- bool add_new_state = env->test_state_freq ? true : false;
+ bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx);
+ bool add_new_state = force_new_state;
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
@@ -14293,8 +15305,46 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* Since the verifier still needs to catch infinite loops
* inside async callbacks.
*/
- } else if (states_maybe_looping(&sl->state, cur) &&
- states_equal(env, &sl->state, cur)) {
+ goto skip_inf_loop_check;
+ }
+ /* BPF open-coded iterators loop detection is special.
+ * states_maybe_looping() logic is too simplistic in detecting
+ * states that *might* be equivalent, because it doesn't know
+ * about ID remapping, so don't even perform it.
+ * See process_iter_next_call() and iter_active_depths_differ()
+ * for overview of the logic. When current and one of parent
+ * states are detected as equivalent, it's a good thing: we prove
+ * convergence and can stop simulating further iterations.
+ * It's safe to assume that iterator loop will finish, taking into
+ * account iter_next() contract of eventually returning
+ * sticky NULL result.
+ */
+ if (is_iter_next_insn(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur)) {
+ struct bpf_func_state *cur_frame;
+ struct bpf_reg_state *iter_state, *iter_reg;
+ int spi;
+
+ cur_frame = cur->frame[cur->curframe];
+ /* btf_check_iter_kfuncs() enforces that
+ * iter state pointer is always the first arg
+ */
+ iter_reg = &cur_frame->regs[BPF_REG_1];
+ /* current state is valid due to states_equal(),
+ * so we can assume valid iter and reg state,
+ * no need for extra (re-)validations
+ */
+ spi = __get_spi(iter_reg->off + iter_reg->var_off.value);
+ iter_state = &func(env, iter_reg)->stack[spi].spilled_ptr;
+ if (iter_state->iter.state == BPF_ITER_STATE_ACTIVE)
+ goto hit;
+ }
+ goto skip_inf_loop_check;
+ }
+ /* attempt to detect infinite loop to avoid unnecessary doomed work */
+ if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur) &&
+ !iter_active_depths_differ(&sl->state, cur)) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
return -EINVAL;
@@ -14311,12 +15361,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* This threshold shouldn't be too high either, since states
* at the end of the loop are likely to be useful in pruning.
*/
- if (env->jmps_processed - env->prev_jmps_processed < 20 &&
+skip_inf_loop_check:
+ if (!force_new_state &&
+ env->jmps_processed - env->prev_jmps_processed < 20 &&
env->insn_processed - env->prev_insn_processed < 100)
add_new_state = false;
goto miss;
}
if (states_equal(env, &sl->state, cur)) {
+hit:
sl->hit_cnt++;
/* reached equivalent register/stack state,
* prune the search.
@@ -14500,6 +15553,44 @@ static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev)
!reg_type_mismatch_ok(prev));
}
+static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type,
+ bool allow_trust_missmatch)
+{
+ enum bpf_reg_type *prev_type = &env->insn_aux_data[env->insn_idx].ptr_type;
+
+ if (*prev_type == NOT_INIT) {
+ /* Saw a valid insn
+ * dst_reg = *(u32 *)(src_reg + off)
+ * save type to validate intersecting paths
+ */
+ *prev_type = type;
+ } else if (reg_type_mismatch(type, *prev_type)) {
+ /* Abuser program is trying to use the same insn
+ * dst_reg = *(u32*) (src_reg + off)
+ * with different pointer types:
+ * src_reg == ctx in one branch and
+ * src_reg == stack|map in some other branch.
+ * Reject it.
+ */
+ if (allow_trust_missmatch &&
+ base_type(type) == PTR_TO_BTF_ID &&
+ base_type(*prev_type) == PTR_TO_BTF_ID) {
+ /*
+ * Have to support a use case when one path through
+ * the program yields TRUSTED pointer while another
+ * is UNTRUSTED. Fallback to UNTRUSTED to generate
+ * BPF_PROBE_MEM.
+ */
+ *prev_type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ } else {
+ verbose(env, "same insn cannot be used with different pointers\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int do_check(struct bpf_verifier_env *env)
{
bool pop_log = !(env->log.level & BPF_LOG_LEVEL2);
@@ -14585,11 +15676,11 @@ static int do_check(struct bpf_verifier_env *env)
print_insn_state(env, state->frame[state->curframe]);
verbose_linfo(env, env->insn_idx, "; ");
- env->prev_log_len = env->log.len_used;
+ env->prev_log_pos = env->log.end_pos;
verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
- env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
- env->prev_log_len = env->log.len_used;
+ env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos;
+ env->prev_log_pos = env->log.end_pos;
}
if (bpf_prog_is_offloaded(env->prog->aux)) {
@@ -14609,7 +15700,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
} else if (class == BPF_LDX) {
- enum bpf_reg_type *prev_src_type, src_reg_type;
+ enum bpf_reg_type src_reg_type;
/* check for reserved fields is already done */
@@ -14633,29 +15724,11 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_src_type == NOT_INIT) {
- /* saw a valid insn
- * dst_reg = *(u32 *)(src_reg + off)
- * save type to validate intersecting paths
- */
- *prev_src_type = src_reg_type;
-
- } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) {
- /* ABuser program is trying to use the same insn
- * dst_reg = *(u32*) (src_reg + off)
- * with different pointer types:
- * src_reg == ctx in one branch and
- * src_reg == stack|map in some other branch.
- * Reject it.
- */
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
+ err = save_aux_ptr_type(env, src_reg_type, true);
+ if (err)
+ return err;
} else if (class == BPF_STX) {
- enum bpf_reg_type *prev_dst_type, dst_reg_type;
+ enum bpf_reg_type dst_reg_type;
if (BPF_MODE(insn->code) == BPF_ATOMIC) {
err = check_atomic(env, env->insn_idx, insn);
@@ -14688,16 +15761,12 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;
-
- if (*prev_dst_type == NOT_INIT) {
- *prev_dst_type = dst_reg_type;
- } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) {
- verbose(env, "same insn cannot be used with different pointers\n");
- return -EINVAL;
- }
-
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
} else if (class == BPF_ST) {
+ enum bpf_reg_type dst_reg_type;
+
if (BPF_MODE(insn->code) != BPF_MEM ||
insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_ST uses reserved fields\n");
@@ -14708,12 +15777,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
- if (is_ctx_reg(env, insn->dst_reg)) {
- verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
- insn->dst_reg,
- reg_type_str(env, reg_state(env, insn->dst_reg)->type));
- return -EACCES;
- }
+ dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
err = check_mem_access(env, env->insn_idx, insn->dst_reg,
@@ -14722,6 +15786,9 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ err = save_aux_ptr_type(env, dst_reg_type, false);
+ if (err)
+ return err;
} else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);
@@ -14756,6 +15823,8 @@ static int do_check(struct bpf_verifier_env *env)
err = check_helper_call(env, insn, &env->insn_idx);
if (err)
return err;
+
+ mark_reg_scratched(env, BPF_REG_0);
} else if (opcode == BPF_JA) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->imm != 0 ||
@@ -14930,8 +15999,8 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
goto err_put;
}
- if (!btf_type_is_var(t)) {
- verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id);
+ if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
+ verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
err = -EINVAL;
goto err_put;
}
@@ -14944,6 +16013,14 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
err = -ENOENT;
goto err_put;
}
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
+
+ if (btf_type_is_func(t)) {
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+ aux->btf_var.mem_size = 0;
+ goto check_btf;
+ }
datasec_id = find_btf_percpu_datasec(btf);
if (datasec_id > 0) {
@@ -14956,9 +16033,6 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
}
}
- insn[0].imm = (u32)addr;
- insn[1].imm = addr >> 32;
-
type = t->type;
t = btf_type_skip_modifiers(btf, type, NULL);
if (percpu) {
@@ -14986,7 +16060,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
}
-
+check_btf:
/* check whether we recorded this BTF (and maybe module) already */
for (i = 0; i < env->used_btf_cnt; i++) {
if (env->used_btfs[i].btf == btf) {
@@ -15830,14 +16904,12 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
for (i = 0; i < insn_cnt; i++, insn++) {
bpf_convert_ctx_access_t convert_ctx_access;
- bool ctx_access;
if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) {
type = BPF_READ;
- ctx_access = true;
} else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
@@ -15847,7 +16919,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->code == (BPF_ST | BPF_MEM | BPF_W) ||
insn->code == (BPF_ST | BPF_MEM | BPF_DW)) {
type = BPF_WRITE;
- ctx_access = BPF_CLASS(insn->code) == BPF_STX;
} else {
continue;
}
@@ -15870,9 +16941,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
continue;
}
- if (!ctx_access)
- continue;
-
switch ((int)env->insn_aux_data[i + delta].ptr_type) {
case PTR_TO_CTX:
if (!ops->convert_ctx_access)
@@ -16263,11 +17331,62 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
+/* replace a generic kfunc with a specialized version if necessary */
+static void specialize_kfunc(struct bpf_verifier_env *env,
+ u32 func_id, u16 offset, unsigned long *addr)
+{
+ struct bpf_prog *prog = env->prog;
+ bool seen_direct_write;
+ void *xdp_kfunc;
+ bool is_rdonly;
+
+ if (bpf_dev_bound_kfunc_id(func_id)) {
+ xdp_kfunc = bpf_dev_bound_resolve_kfunc(prog, func_id);
+ if (xdp_kfunc) {
+ *addr = (unsigned long)xdp_kfunc;
+ return;
+ }
+ /* fallback to default kfunc when not supported by netdev */
+ }
+
+ if (offset)
+ return;
+
+ if (func_id == special_kfunc_list[KF_bpf_dynptr_from_skb]) {
+ seen_direct_write = env->seen_direct_write;
+ is_rdonly = !may_access_direct_pkt_data(env, NULL, BPF_WRITE);
+
+ if (is_rdonly)
+ *addr = (unsigned long)bpf_dynptr_from_skb_rdonly;
+
+ /* restore env->seen_direct_write to its original value, since
+ * may_access_direct_pkt_data mutates it
+ */
+ env->seen_direct_write = seen_direct_write;
+ }
+}
+
+static void __fixup_collection_insert_kfunc(struct bpf_insn_aux_data *insn_aux,
+ u16 struct_meta_reg,
+ u16 node_offset_reg,
+ struct bpf_insn *insn,
+ struct bpf_insn *insn_buf,
+ int *cnt)
+{
+ struct btf_struct_meta *kptr_struct_meta = insn_aux->kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(struct_meta_reg, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = BPF_MOV64_IMM(node_offset_reg, insn_aux->insert_off);
+ insn_buf[3] = *insn;
+ *cnt = 4;
+}
+
static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
- void *xdp_kfunc;
if (!insn->imm) {
verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
@@ -16276,18 +17395,9 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
*cnt = 0;
- if (bpf_dev_bound_kfunc_id(insn->imm)) {
- xdp_kfunc = bpf_dev_bound_resolve_kfunc(env->prog, insn->imm);
- if (xdp_kfunc) {
- insn->imm = BPF_CALL_IMM(xdp_kfunc);
- return 0;
- }
-
- /* fallback to default kfunc when not supported by netdev */
- }
-
- /* insn->imm has the btf func_id. Replace it with
- * an address (relative to __bpf_call_base).
+ /* insn->imm has the btf func_id. Replace it with an offset relative to
+ * __bpf_call_base, unless the JIT needs to call functions that are
+ * further than 32 bits away (bpf_jit_supports_far_kfunc_call()).
*/
desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
if (!desc) {
@@ -16296,7 +17406,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EFAULT;
}
- insn->imm = desc->imm;
+ if (!bpf_jit_supports_far_kfunc_call())
+ insn->imm = BPF_CALL_IMM(desc->addr);
if (insn->off)
return 0;
if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
@@ -16309,7 +17420,8 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_buf[2] = addr[1];
insn_buf[3] = *insn;
*cnt = 4;
- } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_refcount_acquire_impl]) {
struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
@@ -16317,6 +17429,20 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
insn_buf[1] = addr[1];
insn_buf[2] = *insn;
*cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_list_push_back_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_list_push_front_impl] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ int struct_meta_reg = BPF_REG_3;
+ int node_offset_reg = BPF_REG_4;
+
+ /* rbtree_add has extra 'less' arg, so args-to-fixup are in diff regs */
+ if (desc->func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ struct_meta_reg = BPF_REG_4;
+ node_offset_reg = BPF_REG_5;
+ }
+
+ __fixup_collection_insert_kfunc(&env->insn_aux_data[insn_idx], struct_meta_reg,
+ node_offset_reg, insn, insn_buf, cnt);
} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
@@ -16656,21 +17782,21 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BUILD_BUG_ON(!__same_type(ops->map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_delete_elem,
- (int (*)(struct bpf_map *map, void *key))NULL));
+ (long (*)(struct bpf_map *map, void *key))NULL));
BUILD_BUG_ON(!__same_type(ops->map_update_elem,
- (int (*)(struct bpf_map *map, void *key, void *value,
+ (long (*)(struct bpf_map *map, void *key, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_push_elem,
- (int (*)(struct bpf_map *map, void *value,
+ (long (*)(struct bpf_map *map, void *value,
u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_pop_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
- (int (*)(struct bpf_map *map, void *value))NULL));
+ (long (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
+ (long (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
- (int (*)(struct bpf_map *map,
+ (long (*)(struct bpf_map *map,
bpf_callback_t callback_fn,
void *callback_ctx,
u64 flags))NULL));
@@ -16850,7 +17976,7 @@ patch_call_imm:
}
}
- sort_kfunc_descs_by_imm(env->prog);
+ sort_kfunc_descs_by_imm_off(env->prog);
return 0;
}
@@ -17278,6 +18404,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
const char *tname;
struct btf *btf;
long addr = 0;
+ struct module *mod = NULL;
if (!btf_id) {
bpf_log(log, "Tracing programs must provide btf_id\n");
@@ -17451,8 +18578,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
else
addr = (long) tgt_prog->aux->func[subprog]->bpf_func;
} else {
- addr = kallsyms_lookup_name(tname);
+ if (btf_is_module(btf)) {
+ mod = btf_try_get_module(btf);
+ if (mod)
+ addr = find_kallsyms_symbol_value(mod, tname);
+ else
+ addr = 0;
+ } else {
+ addr = kallsyms_lookup_name(tname);
+ }
if (!addr) {
+ module_put(mod);
bpf_log(log,
"The address of function %s cannot be found\n",
tname);
@@ -17492,11 +18628,13 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
break;
}
if (ret) {
+ module_put(mod);
bpf_log(log, "%s is not sleepable\n", tname);
return ret;
}
} else if (prog->expected_attach_type == BPF_MODIFY_RETURN) {
if (tgt_prog) {
+ module_put(mod);
bpf_log(log, "can't modify return codes of BPF programs\n");
return -EINVAL;
}
@@ -17505,6 +18643,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
!check_attach_modify_return(addr, tname))
ret = 0;
if (ret) {
+ module_put(mod);
bpf_log(log, "%s() is not modifiable\n", tname);
return ret;
}
@@ -17515,6 +18654,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
tgt_info->tgt_addr = addr;
tgt_info->tgt_name = tname;
tgt_info->tgt_type = t;
+ tgt_info->tgt_mod = mod;
return 0;
}
@@ -17527,6 +18667,14 @@ BTF_ID(func, migrate_enable)
#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU
BTF_ID(func, rcu_read_unlock_strict)
#endif
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
+BTF_ID(func, preempt_count_add)
+BTF_ID(func, preempt_count_sub)
+#endif
+#ifdef CONFIG_PREEMPT_RCU
+BTF_ID(func, __rcu_read_lock)
+BTF_ID(func, __rcu_read_unlock)
+#endif
BTF_SET_END(btf_id_deny)
static bool can_be_sleepable(struct bpf_prog *prog)
@@ -17594,6 +18742,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
/* store info about the attachment target that will be used later */
prog->aux->attach_func_proto = tgt_info.tgt_type;
prog->aux->attach_func_name = tgt_info.tgt_name;
+ prog->aux->mod = tgt_info.tgt_mod;
if (tgt_prog) {
prog->aux->saved_dst_prog_type = tgt_prog->type;
@@ -17638,12 +18787,12 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
+int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
- struct bpf_verifier_log *log;
- int i, len, ret = -EINVAL;
+ int i, len, ret = -EINVAL, err;
+ u32 log_true_size;
bool is_priv;
/* no program is valid */
@@ -17656,7 +18805,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
if (!env)
return -ENOMEM;
- log = &env->log;
len = (*prog)->len;
env->insn_aux_data =
@@ -17677,20 +18825,14 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
if (!is_priv)
mutex_lock(&bpf_verifier_lock);
- if (attr->log_level || attr->log_buf || attr->log_size) {
- /* user requested verbose verifier output
- * and supplied buffer to store the verification trace
- */
- log->level = attr->log_level;
- log->ubuf = (char __user *) (unsigned long) attr->log_buf;
- log->len_total = attr->log_size;
-
- /* log attributes have to be sane */
- if (!bpf_verifier_log_attr_valid(log)) {
- ret = -EINVAL;
- goto err_unlock;
- }
- }
+ /* user could have requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ ret = bpf_vlog_init(&env->log, attr->log_level,
+ (char __user *) (unsigned long) attr->log_buf,
+ attr->log_size);
+ if (ret)
+ goto err_unlock;
mark_verifier_state_clean(env);
@@ -17712,8 +18854,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env->bypass_spec_v1 = bpf_bypass_spec_v1();
env->bypass_spec_v4 = bpf_bypass_spec_v4();
env->bpf_capable = bpf_capable();
- env->rcu_tag_supported = btf_vmlinux &&
- btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0;
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
@@ -17806,9 +18946,14 @@ skip_full_check:
print_verification_stats(env);
env->prog->aux->verified_insns = env->insn_processed;
- if (log->level && bpf_verifier_log_full(log))
- ret = -ENOSPC;
- if (log->level && !log->ubuf) {
+ /* preserve original error even if log finalization is successful */
+ err = bpf_vlog_finalize(&env->log, &log_true_size);
+ if (err)
+ ret = err;
+
+ if (uattr_size >= offsetofend(union bpf_attr, log_true_size) &&
+ copy_to_bpfptr_offset(uattr, offsetof(union bpf_attr, log_true_size),
+ &log_true_size, sizeof(log_true_size))) {
ret = -EFAULT;
goto err_release_maps;
}
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 52bb5a74a23b..aeef06c465ef 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -58,7 +58,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
struct cgroup_root *root;
int retval = 0;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
cgroup_attach_lock(true);
for_each_root(root) {
struct cgroup *from_cgrp;
@@ -72,7 +72,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
break;
}
cgroup_attach_unlock(true);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return retval;
}
@@ -106,7 +106,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
if (ret)
return ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
percpu_down_write(&cgroup_threadgroup_rwsem);
@@ -145,7 +145,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
out_err:
cgroup_migrate_finish(&mgctx);
percpu_up_write(&cgroup_threadgroup_rwsem);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -847,13 +847,13 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
kernfs_break_active_protection(new_parent);
kernfs_break_active_protection(kn);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
ret = kernfs_rename(kn, new_parent, new_name_str);
if (!ret)
TRACE_CGROUP_PATH(rename, cgrp);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kernfs_unbreak_active_protection(kn);
kernfs_unbreak_active_protection(new_parent);
@@ -1119,7 +1119,7 @@ int cgroup1_reconfigure(struct fs_context *fc)
trace_cgroup_remount(root);
out_unlock:
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -1246,7 +1246,7 @@ int cgroup1_get_tree(struct fs_context *fc)
if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt))
ret = 1; /* restart */
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
if (!ret)
ret = cgroup_do_get_tree(fc);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 935e8121b21e..625d7483951c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1391,7 +1391,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
@@ -1465,8 +1465,18 @@ static struct cgroup *current_cgns_cgroup_dfl(void)
{
struct css_set *cset;
- cset = current->nsproxy->cgroup_ns->root_cset;
- return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+ if (current->nsproxy) {
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+ } else {
+ /*
+ * NOTE: This function may be called from bpf_cgroup_from_id()
+ * on a task which has already passed exit_task_namespaces() and
+ * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
+ * cgroups visible for lookups.
+ */
+ return &cgrp_dfl_root.cgrp;
+ }
}
/* look up cgroup associated with given css_set on the specified hierarchy */
@@ -1625,7 +1635,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn)
else
cgrp = kn->parent->priv;
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kernfs_unbreak_active_protection(kn);
cgroup_put(cgrp);
@@ -1670,7 +1680,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
if (drain_offline)
cgroup_lock_and_drain_offline(cgrp);
else
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
if (!cgroup_is_dead(cgrp))
return cgrp;
@@ -2167,13 +2177,13 @@ int cgroup_do_get_tree(struct fs_context *fc)
struct super_block *sb = fc->root->d_sb;
struct cgroup *cgrp;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(fc->root);
@@ -2356,13 +2366,13 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
{
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -2388,7 +2398,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
int hierarchy_id = 1;
int ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
@@ -2402,7 +2412,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
}
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
@@ -3111,7 +3121,7 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
int ssid;
restart:
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
for_each_subsys(ss, ssid) {
@@ -3125,7 +3135,7 @@ restart:
prepare_to_wait(&dsct->offline_waitq, &wait,
TASK_UNINTERRUPTIBLE);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
schedule();
finish_wait(&dsct->offline_waitq, &wait);
@@ -3761,7 +3771,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
}
psi = cgroup_psi(cgrp);
- new = psi_trigger_create(psi, buf, res);
+ new = psi_trigger_create(psi, buf, res, of->file);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
@@ -4374,9 +4384,9 @@ int cgroup_rm_cftypes(struct cftype *cfts)
if (!(cfts[0].flags & __CFTYPE_ADDED))
return -ENOENT;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
ret = cgroup_rm_cftypes_locked(cfts);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -4408,14 +4418,14 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (ret)
return ret;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
list_add_tail(&cfts->node, &ss->cfts);
ret = cgroup_apply_cftypes(cfts, true);
if (ret)
cgroup_rm_cftypes_locked(cfts);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
return ret;
}
@@ -5385,7 +5395,7 @@ static void css_release_work_fn(struct work_struct *work)
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
css->flags |= CSS_RELEASED;
list_del_rcu(&css->sibling);
@@ -5426,7 +5436,7 @@ static void css_release_work_fn(struct work_struct *work)
NULL);
}
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
@@ -5774,7 +5784,7 @@ static void css_killed_work_fn(struct work_struct *work)
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
do {
offline_css(css);
@@ -5783,7 +5793,7 @@ static void css_killed_work_fn(struct work_struct *work)
css = css->parent;
} while (css && atomic_dec_and_test(&css->online_cnt));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/* css kill confirmation processing requires process context, bounce */
@@ -5967,7 +5977,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
pr_debug("Initializing cgroup subsys %s\n", ss->name);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
idr_init(&ss->css_idr);
INIT_LIST_HEAD(&ss->cfts);
@@ -6011,7 +6021,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
BUG_ON(online_css(css));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/**
@@ -6071,7 +6081,7 @@ int __init cgroup_init(void)
get_user_ns(init_cgroup_ns.user_ns);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
/*
* Add init_css_set to the hash table so that dfl_root can link to
@@ -6082,7 +6092,7 @@ int __init cgroup_init(void)
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
for_each_subsys(ss, ssid) {
if (ss->early_init) {
@@ -6134,9 +6144,9 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
css_populate_dir(init_css_set.subsys[ssid]);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
}
/* init_css_set.subsys[] has been updated, re-hash */
@@ -6241,7 +6251,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
spin_lock_irq(&css_set_lock);
for_each_root(root) {
@@ -6296,7 +6306,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
retval = 0;
out_unlock:
spin_unlock_irq(&css_set_lock);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
kfree(buf);
out:
return retval;
@@ -6380,7 +6390,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
struct file *f;
if (kargs->flags & CLONE_INTO_CGROUP)
- mutex_lock(&cgroup_mutex);
+ cgroup_lock();
cgroup_threadgroup_change_begin(current);
@@ -6455,7 +6465,7 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
err:
cgroup_threadgroup_change_end(current);
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
if (f)
fput(f);
if (dst_cgrp)
@@ -6482,7 +6492,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
struct cgroup *cgrp = kargs->cgrp;
struct css_set *cset = kargs->cset;
- mutex_unlock(&cgroup_mutex);
+ cgroup_unlock();
if (cset) {
put_css_set(cset);
@@ -6856,14 +6866,12 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
struct cgroup *cgrp;
- struct file *f;
-
- f = fget_raw(fd);
- if (!f)
+ struct fd f = fdget_raw(fd);
+ if (!f.file)
return ERR_PTR(-EBADF);
- cgrp = cgroup_v1v2_get_from_file(f);
- fput(f);
+ cgrp = cgroup_v1v2_get_from_file(f.file);
+ fdput(f);
return cgrp;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 636f1c682ac0..e4ca2dd2b764 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1209,7 +1209,9 @@ void rebuild_sched_domains(void)
*
* Iterate through each task of @cs updating its cpus_allowed to the
* effective cpuset's. As this function is called with cpuset_rwsem held,
- * cpuset membership stays stable.
+ * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
+ * is used instead of effective_cpus to make sure all offline CPUs are also
+ * included as hotplug code won't update cpumasks for tasks in top_cpuset.
*/
static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
@@ -1219,15 +1221,18 @@ static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- /*
- * Percpu kthreads in top_cpuset are ignored
- */
- if (top_cs && (task->flags & PF_KTHREAD) &&
- kthread_is_per_cpu(task))
- continue;
+ const struct cpumask *possible_mask = task_cpu_possible_mask(task);
- cpumask_and(new_cpus, cs->effective_cpus,
- task_cpu_possible_mask(task));
+ if (top_cs) {
+ /*
+ * Percpu kthreads in top_cpuset are ignored
+ */
+ if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task))
+ continue;
+ cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus);
+ } else {
+ cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
+ }
set_cpus_allowed_ptr(task, new_cpus);
}
css_task_iter_end(&it);
@@ -1513,7 +1518,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
spin_unlock_irq(&callback_lock);
if (adding || deleting)
- update_tasks_cpumask(parent, tmp->new_cpus);
+ update_tasks_cpumask(parent, tmp->addmask);
/*
* Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
@@ -1770,10 +1775,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
/*
* Use the cpumasks in trialcs for tmpmasks when they are pointers
* to allocated cpumasks.
+ *
+ * Note that update_parent_subparts_cpumask() uses only addmask &
+ * delmask, but not new_cpus.
*/
tmp.addmask = trialcs->subparts_cpus;
tmp.delmask = trialcs->effective_cpus;
- tmp.new_cpus = trialcs->cpus_allowed;
+ tmp.new_cpus = NULL;
#endif
retval = validate_change(cs, trialcs);
@@ -1838,6 +1846,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}
spin_unlock_irq(&callback_lock);
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ /* Now trialcs->cpus_allowed is available */
+ tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
/* effective_cpus will be updated here */
update_cpumasks_hier(cs, &tmp, false);
@@ -2445,6 +2458,20 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;
+/*
+ * Check to see if a cpuset can accept a new task
+ * For v1, cpus_allowed and mems_allowed can't be empty.
+ * For v2, effective_cpus can't be empty.
+ * Note that in v1, effective_cpus = cpus_allowed.
+ */
+static int cpuset_can_attach_check(struct cpuset *cs)
+{
+ if (cpumask_empty(cs->effective_cpus) ||
+ (!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
+ return -ENOSPC;
+ return 0;
+}
+
/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
@@ -2459,16 +2486,9 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
percpu_down_write(&cpuset_rwsem);
- /* allow moving tasks into an empty cpuset if on default hierarchy */
- ret = -ENOSPC;
- if (!is_in_v2_mode() &&
- (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
- goto out_unlock;
-
- /*
- * Task cannot be moved to a cpuset with empty effective cpus.
- */
- if (cpumask_empty(cs->effective_cpus))
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
goto out_unlock;
cgroup_taskset_for_each(task, css, tset) {
@@ -2485,7 +2505,6 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
- ret = 0;
out_unlock:
percpu_up_write(&cpuset_rwsem);
return ret;
@@ -2494,25 +2513,47 @@ out_unlock:
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
+ struct cpuset *cs;
cgroup_taskset_first(tset, &css);
+ cs = css_cs(css);
percpu_down_write(&cpuset_rwsem);
- css_cs(css)->attach_in_progress--;
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
}
/*
- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_to;
+
+static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
+{
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(task, cpus_attach);
+ else
+ cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
+ cs->subparts_cpus);
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flags(cs, task);
+}
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_rwsem */
- static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
struct cgroup_subsys_state *css;
@@ -2543,20 +2584,8 @@ static void cpuset_attach(struct cgroup_taskset *tset)
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
- cgroup_taskset_for_each(task, css, tset) {
- if (cs != &top_cpuset)
- guarantee_online_cpus(task, cpus_attach);
- else
- cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
- /*
- * can_attach beforehand should guarantee that this doesn't
- * fail. TODO: have a better way to handle failure here
- */
- WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
-
- cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flags(cs, task);
- }
+ cgroup_taskset_for_each(task, css, tset)
+ cpuset_attach_task(cs, task);
/*
* Change mm for all threadgroup leaders. This is expensive and may
@@ -3248,17 +3277,101 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
/*
+ * In case the child is cloned into a cpuset different from its parent,
+ * additional checks are done to see if the move is allowed.
+ */
+static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+ int ret;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return 0;
+
+ lockdep_assert_held(&cgroup_mutex);
+ percpu_down_write(&cpuset_rwsem);
+
+ /* Check to see if task is allowed in the cpuset */
+ ret = cpuset_can_attach_check(cs);
+ if (ret)
+ goto out_unlock;
+
+ ret = task_can_attach(task, cs->effective_cpus);
+ if (ret)
+ goto out_unlock;
+
+ ret = security_task_setscheduler(task);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Mark attach is in progress. This makes validate_change() fail
+ * changes which zero cpus/mems_allowed.
+ */
+ cs->attach_in_progress++;
+out_unlock:
+ percpu_up_write(&cpuset_rwsem);
+ return ret;
+}
+
+static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
+{
+ struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
+ bool same_cs;
+
+ rcu_read_lock();
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs)
+ return;
+
+ percpu_down_write(&cpuset_rwsem);
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+ percpu_up_write(&cpuset_rwsem);
+}
+
+/*
* Make sure the new task conform to the current state of its parent,
* which could have been changed by cpuset just after it inherits the
* state from the parent and before it sits on the cgroup's task list.
*/
static void cpuset_fork(struct task_struct *task)
{
- if (task_css_is_root(task, cpuset_cgrp_id))
+ struct cpuset *cs;
+ bool same_cs;
+
+ rcu_read_lock();
+ cs = task_cs(task);
+ same_cs = (cs == task_cs(current));
+ rcu_read_unlock();
+
+ if (same_cs) {
+ if (cs == &top_cpuset)
+ return;
+
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
+ task->mems_allowed = current->mems_allowed;
return;
+ }
- set_cpus_allowed_ptr(task, current->cpus_ptr);
- task->mems_allowed = current->mems_allowed;
+ /* CLONE_INTO_CGROUP */
+ percpu_down_write(&cpuset_rwsem);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+ cpuset_attach_task(cs, task);
+
+ cs->attach_in_progress--;
+ if (!cs->attach_in_progress)
+ wake_up(&cpuset_attach_wq);
+
+ percpu_up_write(&cpuset_rwsem);
}
struct cgroup_subsys cpuset_cgrp_subsys = {
@@ -3271,6 +3384,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
+ .can_fork = cpuset_can_fork,
+ .cancel_fork = cpuset_cancel_fork,
.fork = cpuset_fork,
.legacy_cftypes = legacy_files,
.dfl_cftypes = dfl_files,
@@ -3508,6 +3623,8 @@ retry:
update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+ if (!cpus_updated && !mems_updated)
+ goto unlock; /* Hotplug doesn't affect this cpuset */
if (mems_updated)
check_insane_mems_config(&new_mems);
@@ -3519,6 +3636,7 @@ update_tasks:
hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
+unlock:
percpu_up_write(&cpuset_rwsem);
}
@@ -3831,7 +3949,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
}
/*
- * __cpuset_node_allowed - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -3870,7 +3988,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
-bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
bool allowed; /* is allocation in zone z allowed? */
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 1b6b21851e9d..936473203a6b 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -22,6 +22,7 @@
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
+#include <linux/cpu.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -350,7 +351,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- static_branch_inc(&freezer_active);
+ static_branch_inc_cpuslocked(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -361,7 +362,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (!(freezer->state & CGROUP_FREEZING)) {
freezer->state &= ~CGROUP_FROZEN;
if (was_freezing)
- static_branch_dec(&freezer_active);
+ static_branch_dec_cpuslocked(&freezer_active);
unfreeze_cgroup(freezer);
}
}
@@ -379,6 +380,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
{
struct cgroup_subsys_state *pos;
+ cpus_read_lock();
/*
* Update all its descendants in pre-order traversal. Each
* descendant will try to inherit its parent's FREEZING state as
@@ -407,6 +409,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
+ cpus_read_unlock();
}
static ssize_t freezer_write(struct kernfs_open_file *of,
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 831f1f472bb8..9c4c55228567 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -241,12 +241,12 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
}
/**
- * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush()
+ * cgroup_rstat_flush_atomic- atomic version of cgroup_rstat_flush()
* @cgrp: target cgroup
*
* This function can be called from any context.
*/
-void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
+void cgroup_rstat_flush_atomic(struct cgroup *cgrp)
{
unsigned long flags;
@@ -457,9 +457,7 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
struct task_cputime *cputime = &bstat->cputime;
int i;
- cputime->stime = 0;
- cputime->utime = 0;
- cputime->sum_exec_runtime = 0;
+ memset(bstat, 0, sizeof(*bstat));
for_each_possible_cpu(i) {
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
diff --git a/kernel/compat.c b/kernel/compat.c
index 55551989d9da..fb50f29d9b36 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -152,7 +152,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len,
if (len & (sizeof(compat_ulong_t)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
deleted file mode 100644
index 44b0f0146a3f..000000000000
--- a/kernel/configs/android-base.config
+++ /dev/null
@@ -1,159 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_DEVMEM is not set
-# CONFIG_FHANDLE is not set
-# CONFIG_INET_LRO is not set
-# CONFIG_NFSD is not set
-# CONFIG_NFS_FS is not set
-# CONFIG_OABI_COMPAT is not set
-# CONFIG_SYSVIPC is not set
-# CONFIG_USELIB is not set
-CONFIG_ANDROID_BINDER_IPC=y
-CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder
-CONFIG_ANDROID_LOW_MEMORY_KILLER=y
-CONFIG_ARMV8_DEPRECATED=y
-CONFIG_ASHMEM=y
-CONFIG_AUDIT=y
-CONFIG_BLK_DEV_INITRD=y
-CONFIG_CGROUPS=y
-CONFIG_CGROUP_BPF=y
-CONFIG_CGROUP_CPUACCT=y
-CONFIG_CGROUP_DEBUG=y
-CONFIG_CGROUP_FREEZER=y
-CONFIG_CGROUP_SCHED=y
-CONFIG_CP15_BARRIER_EMULATION=y
-CONFIG_DEFAULT_SECURITY_SELINUX=y
-CONFIG_EMBEDDED=y
-CONFIG_FB=y
-CONFIG_HARDENED_USERCOPY=y
-CONFIG_HIGH_RES_TIMERS=y
-CONFIG_IKCONFIG=y
-CONFIG_IKCONFIG_PROC=y
-CONFIG_INET6_AH=y
-CONFIG_INET6_ESP=y
-CONFIG_INET6_IPCOMP=y
-CONFIG_INET=y
-CONFIG_INET_DIAG_DESTROY=y
-CONFIG_INET_ESP=y
-CONFIG_INET_XFRM_MODE_TUNNEL=y
-CONFIG_IP6_NF_FILTER=y
-CONFIG_IP6_NF_IPTABLES=y
-CONFIG_IP6_NF_MANGLE=y
-CONFIG_IP6_NF_RAW=y
-CONFIG_IP6_NF_TARGET_REJECT=y
-CONFIG_IPV6=y
-CONFIG_IPV6_MIP6=y
-CONFIG_IPV6_MULTIPLE_TABLES=y
-CONFIG_IPV6_OPTIMISTIC_DAD=y
-CONFIG_IPV6_ROUTER_PREF=y
-CONFIG_IPV6_ROUTE_INFO=y
-CONFIG_IP_ADVANCED_ROUTER=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_MULTIPLE_TABLES=y
-CONFIG_IP_NF_ARPFILTER=y
-CONFIG_IP_NF_ARPTABLES=y
-CONFIG_IP_NF_ARP_MANGLE=y
-CONFIG_IP_NF_FILTER=y
-CONFIG_IP_NF_IPTABLES=y
-CONFIG_IP_NF_MANGLE=y
-CONFIG_IP_NF_MATCH_AH=y
-CONFIG_IP_NF_MATCH_ECN=y
-CONFIG_IP_NF_MATCH_TTL=y
-CONFIG_IP_NF_NAT=y
-CONFIG_IP_NF_RAW=y
-CONFIG_IP_NF_SECURITY=y
-CONFIG_IP_NF_TARGET_MASQUERADE=y
-CONFIG_IP_NF_TARGET_NETMAP=y
-CONFIG_IP_NF_TARGET_REDIRECT=y
-CONFIG_IP_NF_TARGET_REJECT=y
-CONFIG_MODULES=y
-CONFIG_MODULE_UNLOAD=y
-CONFIG_MODVERSIONS=y
-CONFIG_NET=y
-CONFIG_NETDEVICES=y
-CONFIG_NETFILTER=y
-CONFIG_NETFILTER_TPROXY=y
-CONFIG_NETFILTER_XT_MATCH_COMMENT=y
-CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
-CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
-CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
-CONFIG_NETFILTER_XT_MATCH_HELPER=y
-CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
-CONFIG_NETFILTER_XT_MATCH_LENGTH=y
-CONFIG_NETFILTER_XT_MATCH_LIMIT=y
-CONFIG_NETFILTER_XT_MATCH_MAC=y
-CONFIG_NETFILTER_XT_MATCH_MARK=y
-CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
-CONFIG_NETFILTER_XT_MATCH_POLICY=y
-CONFIG_NETFILTER_XT_MATCH_QUOTA=y
-CONFIG_NETFILTER_XT_MATCH_SOCKET=y
-CONFIG_NETFILTER_XT_MATCH_STATE=y
-CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
-CONFIG_NETFILTER_XT_MATCH_STRING=y
-CONFIG_NETFILTER_XT_MATCH_TIME=y
-CONFIG_NETFILTER_XT_MATCH_U32=y
-CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
-CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
-CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
-CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
-CONFIG_NETFILTER_XT_TARGET_MARK=y
-CONFIG_NETFILTER_XT_TARGET_NFLOG=y
-CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
-CONFIG_NETFILTER_XT_TARGET_SECMARK=y
-CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
-CONFIG_NETFILTER_XT_TARGET_TPROXY=y
-CONFIG_NETFILTER_XT_TARGET_TRACE=y
-CONFIG_NET_CLS_ACT=y
-CONFIG_NET_CLS_U32=y
-CONFIG_NET_EMATCH=y
-CONFIG_NET_EMATCH_U32=y
-CONFIG_NET_KEY=y
-CONFIG_NET_SCHED=y
-CONFIG_NET_SCH_HTB=y
-CONFIG_NF_CONNTRACK=y
-CONFIG_NF_CONNTRACK_AMANDA=y
-CONFIG_NF_CONNTRACK_EVENTS=y
-CONFIG_NF_CONNTRACK_FTP=y
-CONFIG_NF_CONNTRACK_H323=y
-CONFIG_NF_CONNTRACK_IPV4=y
-CONFIG_NF_CONNTRACK_IPV6=y
-CONFIG_NF_CONNTRACK_IRC=y
-CONFIG_NF_CONNTRACK_NETBIOS_NS=y
-CONFIG_NF_CONNTRACK_PPTP=y
-CONFIG_NF_CONNTRACK_SANE=y
-CONFIG_NF_CONNTRACK_SECMARK=y
-CONFIG_NF_CONNTRACK_TFTP=y
-CONFIG_NF_CT_NETLINK=y
-CONFIG_NF_CT_PROTO_DCCP=y
-CONFIG_NF_CT_PROTO_SCTP=y
-CONFIG_NF_CT_PROTO_UDPLITE=y
-CONFIG_NF_NAT=y
-CONFIG_NO_HZ=y
-CONFIG_PACKET=y
-CONFIG_PM_AUTOSLEEP=y
-CONFIG_PM_WAKELOCKS=y
-CONFIG_PPP=y
-CONFIG_PPP_BSDCOMP=y
-CONFIG_PPP_DEFLATE=y
-CONFIG_PPP_MPPE=y
-CONFIG_PREEMPT=y
-CONFIG_QUOTA=y
-CONFIG_RANDOMIZE_BASE=y
-CONFIG_RTC_CLASS=y
-CONFIG_RT_GROUP_SCHED=y
-CONFIG_SECCOMP=y
-CONFIG_SECURITY=y
-CONFIG_SECURITY_NETWORK=y
-CONFIG_SECURITY_SELINUX=y
-CONFIG_SETEND_EMULATION=y
-CONFIG_STAGING=y
-CONFIG_SWP_EMULATION=y
-CONFIG_SYNC=y
-CONFIG_TUN=y
-CONFIG_UNIX=y
-CONFIG_USB_GADGET=y
-CONFIG_USB_CONFIGFS=y
-CONFIG_USB_CONFIGFS_F_FS=y
-CONFIG_USB_CONFIGFS_F_MIDI=y
-CONFIG_USB_OTG_WAKELOCK=y
-CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
deleted file mode 100644
index e400fbbc8aba..000000000000
--- a/kernel/configs/android-recommended.config
+++ /dev/null
@@ -1,127 +0,0 @@
-# KEEP ALPHABETICALLY SORTED
-# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
-# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
-# CONFIG_INPUT_MOUSE is not set
-# CONFIG_LEGACY_PTYS is not set
-# CONFIG_NF_CONNTRACK_SIP is not set
-# CONFIG_PM_WAKELOCKS_GC is not set
-# CONFIG_VT is not set
-CONFIG_ARM64_SW_TTBR0_PAN=y
-CONFIG_BACKLIGHT_LCD_SUPPORT=y
-CONFIG_BLK_DEV_DM=y
-CONFIG_BLK_DEV_LOOP=y
-CONFIG_BLK_DEV_RAM=y
-CONFIG_BLK_DEV_RAM_SIZE=8192
-CONFIG_STACKPROTECTOR_STRONG=y
-CONFIG_COMPACTION=y
-CONFIG_CPU_SW_DOMAIN_PAN=y
-CONFIG_DM_CRYPT=y
-CONFIG_DM_UEVENT=y
-CONFIG_DM_VERITY=y
-CONFIG_DM_VERITY_FEC=y
-CONFIG_DRAGONRISE_FF=y
-CONFIG_ENABLE_DEFAULT_TRACERS=y
-CONFIG_EXT4_FS=y
-CONFIG_EXT4_FS_SECURITY=y
-CONFIG_FUSE_FS=y
-CONFIG_GREENASIA_FF=y
-CONFIG_HIDRAW=y
-CONFIG_HID_A4TECH=y
-CONFIG_HID_ACRUX=y
-CONFIG_HID_ACRUX_FF=y
-CONFIG_HID_APPLE=y
-CONFIG_HID_BELKIN=y
-CONFIG_HID_CHERRY=y
-CONFIG_HID_CHICONY=y
-CONFIG_HID_CYPRESS=y
-CONFIG_HID_DRAGONRISE=y
-CONFIG_HID_ELECOM=y
-CONFIG_HID_EMS_FF=y
-CONFIG_HID_EZKEY=y
-CONFIG_HID_GREENASIA=y
-CONFIG_HID_GYRATION=y
-CONFIG_HID_HOLTEK=y
-CONFIG_HID_KENSINGTON=y
-CONFIG_HID_KEYTOUCH=y
-CONFIG_HID_KYE=y
-CONFIG_HID_LCPOWER=y
-CONFIG_HID_LOGITECH=y
-CONFIG_HID_LOGITECH_DJ=y
-CONFIG_HID_MAGICMOUSE=y
-CONFIG_HID_MICROSOFT=y
-CONFIG_HID_MONTEREY=y
-CONFIG_HID_MULTITOUCH=y
-CONFIG_HID_NTRIG=y
-CONFIG_HID_ORTEK=y
-CONFIG_HID_PANTHERLORD=y
-CONFIG_HID_PETALYNX=y
-CONFIG_HID_PICOLCD=y
-CONFIG_HID_PRIMAX=y
-CONFIG_HID_PRODIKEYS=y
-CONFIG_HID_ROCCAT=y
-CONFIG_HID_SAITEK=y
-CONFIG_HID_SAMSUNG=y
-CONFIG_HID_SMARTJOYPLUS=y
-CONFIG_HID_SONY=y
-CONFIG_HID_SPEEDLINK=y
-CONFIG_HID_SUNPLUS=y
-CONFIG_HID_THRUSTMASTER=y
-CONFIG_HID_TIVO=y
-CONFIG_HID_TOPSEED=y
-CONFIG_HID_TWINHAN=y
-CONFIG_HID_UCLOGIC=y
-CONFIG_HID_WACOM=y
-CONFIG_HID_WALTOP=y
-CONFIG_HID_WIIMOTE=y
-CONFIG_HID_ZEROPLUS=y
-CONFIG_HID_ZYDACRON=y
-CONFIG_INPUT_EVDEV=y
-CONFIG_INPUT_GPIO=y
-CONFIG_INPUT_JOYSTICK=y
-CONFIG_INPUT_MISC=y
-CONFIG_INPUT_TABLET=y
-CONFIG_INPUT_UINPUT=y
-CONFIG_JOYSTICK_XPAD=y
-CONFIG_JOYSTICK_XPAD_FF=y
-CONFIG_JOYSTICK_XPAD_LEDS=y
-CONFIG_KALLSYMS_ALL=y
-CONFIG_KSM=y
-CONFIG_LOGIG940_FF=y
-CONFIG_LOGIRUMBLEPAD2_FF=y
-CONFIG_LOGITECH_FF=y
-CONFIG_MD=y
-CONFIG_MEDIA_SUPPORT=y
-CONFIG_MSDOS_FS=y
-CONFIG_PANIC_TIMEOUT=5
-CONFIG_PANTHERLORD_FF=y
-CONFIG_PERF_EVENTS=y
-CONFIG_PM_DEBUG=y
-CONFIG_PM_RUNTIME=y
-CONFIG_PM_WAKELOCKS_LIMIT=0
-CONFIG_POWER_SUPPLY=y
-CONFIG_PSTORE=y
-CONFIG_PSTORE_CONSOLE=y
-CONFIG_PSTORE_RAM=y
-CONFIG_SCHEDSTATS=y
-CONFIG_SMARTJOYPLUS_FF=y
-CONFIG_SND=y
-CONFIG_SOUND=y
-CONFIG_STRICT_KERNEL_RWX=y
-CONFIG_SUSPEND_TIME=y
-CONFIG_TABLET_USB_ACECAD=y
-CONFIG_TABLET_USB_AIPTEK=y
-CONFIG_TABLET_USB_HANWANG=y
-CONFIG_TABLET_USB_KBTAB=y
-CONFIG_TASKSTATS=y
-CONFIG_TASK_DELAY_ACCT=y
-CONFIG_TASK_IO_ACCOUNTING=y
-CONFIG_TASK_XACCT=y
-CONFIG_TIMER_STATS=y
-CONFIG_TMPFS=y
-CONFIG_TMPFS_POSIX_ACL=y
-CONFIG_UHID=y
-CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
-CONFIG_USB_EHCI_HCD=y
-CONFIG_USB_HIDDEV=y
-CONFIG_USB_USBNET=y
-CONFIG_VFAT_FS=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index c2f9c912df1c..144b2bd86b14 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -7,6 +7,5 @@ CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
# CONFIG_SLAB is not set
-# CONFIG_SLOB_DEPRECATED is not set
CONFIG_SLUB=y
CONFIG_SLUB_TINY=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6c0a92ca6bb5..f4a2c5845bcb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -623,7 +623,7 @@ static int finish_cpu(unsigned int cpu)
*/
if (mm != &init_mm)
idle->active_mm = &init_mm;
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
return 0;
}
@@ -2569,22 +2569,33 @@ static const struct attribute_group cpuhp_smt_attr_group = {
static int __init cpu_smt_sysfs_init(void)
{
- return sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpuhp_smt_attr_group);
+ struct device *dev_root;
+ int ret = -ENODEV;
+
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_smt_attr_group);
+ put_device(dev_root);
+ }
+ return ret;
}
static int __init cpuhp_sysfs_init(void)
{
+ struct device *dev_root;
int cpu, ret;
ret = cpu_smt_sysfs_init();
if (ret)
return ret;
- ret = sysfs_create_group(&cpu_subsys.dev_root->kobj,
- &cpuhp_cpu_root_attr_group);
- if (ret)
- return ret;
+ dev_root = bus_get_dev_root(&cpu_subsys);
+ if (dev_root) {
+ ret = sysfs_create_group(&dev_root->kobj, &cpuhp_cpu_root_attr_group);
+ put_device(dev_root);
+ if (ret)
+ return ret;
+ }
for_each_possible_cpu(cpu) {
struct device *dev = get_cpu_device(cpu);
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 755f5f08ab38..90ce1dfd591c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -474,7 +474,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vmap_area, va_start);
VMCOREINFO_OFFSET(vmap_area, list);
- VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1);
log_buf_vmcoreinfo_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index e39cb696cfbd..6f0c358e73d8 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -179,12 +179,15 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay;
d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp;
+ tmp = d->irq_delay_total + tsk->delays->irq_delay;
+ d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
d->thrashing_count += tsk->delays->thrashing_count;
d->compact_count += tsk->delays->compact_count;
d->wpcopy_count += tsk->delays->wpcopy_count;
+ d->irq_count += tsk->delays->irq_count;
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -274,3 +277,14 @@ void __delayacct_wpcopy_end(void)
&current->delays->wpcopy_delay,
&current->delays->wpcopy_count);
}
+
+void __delayacct_irq(struct task_struct *task, u32 delta)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&task->delays->lock, flags);
+ task->delays->irq_delay += delta;
+ task->delays->irq_count++;
+ raw_spin_unlock_irqrestore(&task->delays->lock, flags);
+}
+
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 0520a8f4fb1d..02205ab53b7e 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -356,4 +356,3 @@ module_exit(map_benchmark_cleanup);
MODULE_AUTHOR("Barry Song <song.bao.hua@hisilicon.com>");
MODULE_DESCRIPTION("dma_map benchmark driver");
-MODULE_LICENSE("GPL");
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 4d40dcce7604..1acec2e22827 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -84,8 +84,8 @@ static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
void *addr;
int ret = -ENOMEM;
- /* Cannot allocate larger than MAX_ORDER-1 */
- order = min(get_order(pool_size), MAX_ORDER-1);
+ /* Cannot allocate larger than MAX_ORDER */
+ order = min(get_order(pool_size), MAX_ORDER);
do {
pool_size = 1 << (PAGE_SHIFT + order);
@@ -190,7 +190,7 @@ static int __init dma_atomic_pool_init(void)
/*
* If coherent_pool was not used on the command line, default the pool
- * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER-1.
+ * sizes to 128KB per 1GB of memory, min 128KB, max MAX_ORDER.
*/
if (!atomic_pool_size) {
unsigned long pages = totalram_pages() / (SZ_1G / SZ_128K);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index e67dafc255f2..af2e304c672c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -73,8 +73,6 @@ static bool swiotlb_force_disable;
struct io_tlb_mem io_tlb_default_mem;
-phys_addr_t swiotlb_unencrypted_base;
-
static unsigned long default_nslabs = IO_TLB_DEFAULT_SIZE >> IO_TLB_SHIFT;
static unsigned long default_nareas;
@@ -202,34 +200,6 @@ static inline unsigned long nr_slots(u64 val)
}
/*
- * Remap swioltb memory in the unencrypted physical address space
- * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP
- * Isolation VMs).
- */
-#ifdef CONFIG_HAS_IOMEM
-static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
-{
- void *vaddr = NULL;
-
- if (swiotlb_unencrypted_base) {
- phys_addr_t paddr = mem->start + swiotlb_unencrypted_base;
-
- vaddr = memremap(paddr, bytes, MEMREMAP_WB);
- if (!vaddr)
- pr_err("Failed to map the unencrypted memory %pa size %lx.\n",
- &paddr, bytes);
- }
-
- return vaddr;
-}
-#else
-static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
-{
- return NULL;
-}
-#endif
-
-/*
* Early SWIOTLB allocation may be too early to allow an architecture to
* perform the desired operations. This function allows the architecture to
* call SWIOTLB when the operations are possible. It needs to be called
@@ -238,18 +208,12 @@ static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
void __init swiotlb_update_mem_attributes(void)
{
struct io_tlb_mem *mem = &io_tlb_default_mem;
- void *vaddr;
unsigned long bytes;
if (!mem->nslabs || mem->late_alloc)
return;
- vaddr = phys_to_virt(mem->start);
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
- set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
-
- mem->vaddr = swiotlb_mem_remap(mem, bytes);
- if (!mem->vaddr)
- mem->vaddr = vaddr;
+ set_memory_decrypted((unsigned long)mem->vaddr, bytes >> PAGE_SHIFT);
}
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
@@ -280,13 +244,6 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->slots[i].alloc_size = 0;
}
- /*
- * If swiotlb_unencrypted_base is set, the bounce buffer memory will
- * be remapped and cleared in swiotlb_update_mem_attributes.
- */
- if (swiotlb_unencrypted_base)
- return;
-
memset(vaddr, 0, bytes);
mem->vaddr = vaddr;
return;
@@ -657,7 +614,7 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
unsigned long max_slots = get_max_slots(boundary_mask);
unsigned int iotlb_align_mask =
- dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1);
+ dma_get_min_align_mask(dev) | alloc_align_mask;
unsigned int nslots = nr_slots(alloc_size), stride;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned int index, slots_checked, count = 0, i;
@@ -673,8 +630,8 @@ static int swiotlb_do_find_slots(struct device *dev, int area_index,
* allocations.
*/
if (alloc_size >= PAGE_SIZE)
- iotlb_align_mask &= PAGE_MASK;
- iotlb_align_mask &= alloc_align_mask;
+ iotlb_align_mask |= ~PAGE_MASK;
+ iotlb_align_mask &= ~(IO_TLB_SIZE - 1);
/*
* For mappings with an alignment requirement don't bother looping to
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 846add8394c4..be61332c66b5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -21,7 +21,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
arch_enter_from_user_mode(regs);
lockdep_hardirqs_off(CALLER_ADDR0);
- CT_WARN_ON(ct_state() != CONTEXT_USER);
+ CT_WARN_ON(__ct_state() != CONTEXT_USER);
user_exit_irqoff();
instrumentation_begin();
@@ -192,13 +192,14 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = read_thread_flags();
+ unsigned long ti_work;
lockdep_assert_irqs_disabled();
/* Flush pending rcuog wakeup before the last need_resched() check */
tick_nohz_user_enter_prepare();
+ ti_work = read_thread_flags();
if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
ti_work = exit_to_user_mode_loop(regs, ti_work);
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 0b6379adff6b..5340c5aa89e7 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/prctl.h>
+#include <linux/ptrace.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <linux/signal.h>
@@ -68,8 +69,9 @@ bool syscall_user_dispatch(struct pt_regs *regs)
return true;
}
-int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
- unsigned long len, char __user *selector)
+static int task_set_syscall_user_dispatch(struct task_struct *task, unsigned long mode,
+ unsigned long offset, unsigned long len,
+ char __user *selector)
{
switch (mode) {
case PR_SYS_DISPATCH_OFF:
@@ -86,7 +88,16 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
if (offset && offset + len <= offset)
return -EINVAL;
- if (selector && !access_ok(selector, sizeof(*selector)))
+ /*
+ * access_ok() will clear memory tags for tagged addresses
+ * if current has memory tagging enabled.
+
+ * To enable a tracer to set a tracees selector the
+ * selector address must be untagged for access_ok(),
+ * otherwise an untagged tracer will always fail to set a
+ * tagged tracees selector.
+ */
+ if (selector && !access_ok(untagged_addr(selector), sizeof(*selector)))
return -EFAULT;
break;
@@ -94,15 +105,60 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
return -EINVAL;
}
- current->syscall_dispatch.selector = selector;
- current->syscall_dispatch.offset = offset;
- current->syscall_dispatch.len = len;
- current->syscall_dispatch.on_dispatch = false;
+ task->syscall_dispatch.selector = selector;
+ task->syscall_dispatch.offset = offset;
+ task->syscall_dispatch.len = len;
+ task->syscall_dispatch.on_dispatch = false;
if (mode == PR_SYS_DISPATCH_ON)
- set_syscall_work(SYSCALL_USER_DISPATCH);
+ set_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+ else
+ clear_task_syscall_work(task, SYSCALL_USER_DISPATCH);
+
+ return 0;
+}
+
+int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
+ unsigned long len, char __user *selector)
+{
+ return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
+}
+
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH))
+ cfg.mode = PR_SYS_DISPATCH_ON;
else
- clear_syscall_work(SYSCALL_USER_DISPATCH);
+ cfg.mode = PR_SYS_DISPATCH_OFF;
+
+ cfg.offset = sd->offset;
+ cfg.len = sd->len;
+ cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+ if (copy_to_user(data, &cfg, sizeof(cfg)))
+ return -EFAULT;
return 0;
}
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+ void __user *data)
+{
+ struct ptrace_sud_config cfg;
+
+ if (size != sizeof(cfg))
+ return -EINVAL;
+
+ if (copy_from_user(&cfg, data, sizeof(cfg)))
+ return -EFAULT;
+
+ return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+ (char __user *)(uintptr_t)cfg.selector);
+}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f79fd8b87f75..68baa8194d9f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2163,7 +2163,7 @@ static void perf_group_detach(struct perf_event *event)
/* Inherit group flags from the previous leader */
sibling->group_caps = event->group_caps;
- if (!RB_EMPTY_NODE(&event->group_node)) {
+ if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
add_event_to_groups(sibling, event->ctx);
if (sibling->state == PERF_EVENT_STATE_ACTIVE)
@@ -3872,7 +3872,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
if (likely(!ctx->nr_events))
return;
- if (is_active ^ EVENT_TIME) {
+ if (!(is_active & EVENT_TIME)) {
/* start ctx time */
__update_context_time(ctx, false);
perf_cgroup_set_timestamp(cpuctx);
@@ -9187,7 +9187,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data)
perf_event_header__init_id(&bpf_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, data, event,
+ ret = perf_output_begin(&handle, &sample, event,
bpf_event->event_id.header.size);
if (ret)
return;
@@ -9433,8 +9433,8 @@ __perf_event_account_interrupt(struct perf_event *event, int throttle)
hwc->interrupts = 1;
} else {
hwc->interrupts++;
- if (unlikely(throttle
- && hwc->interrupts >= max_samples_per_tick)) {
+ if (unlikely(throttle &&
+ hwc->interrupts > max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
hwc->interrupts = MAX_INTERRUPTS;
@@ -12173,7 +12173,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
/*
* If its not a per-cpu rb, it must be the same task.
*/
- if (output_event->cpu == -1 && output_event->ctx != event->ctx)
+ if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
goto out;
/*
@@ -12893,12 +12893,14 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
__perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
- /*
- * Wait for the events to quiesce before re-instating them.
- */
- synchronize_rcu();
+ if (!list_empty(&events)) {
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
- __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+ }
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
index c57610f52bb4..2cfeeecf8de9 100644
--- a/kernel/events/hw_breakpoint_test.c
+++ b/kernel/events/hw_breakpoint_test.c
@@ -329,5 +329,4 @@ static struct kunit_suite hw_breakpoint_test_suite = {
kunit_test_suites(&hw_breakpoint_test_suite);
-MODULE_LICENSE("GPL");
MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 273a0fe7910a..a0433f37b024 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -814,7 +814,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct perf_buffer);
size += nr_pages * sizeof(void *);
- if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
+ if (order_base_2(size) > PAGE_SHIFT+MAX_ORDER)
goto fail;
node = (cpu == -1) ? cpu : cpu_to_node(cpu);
diff --git a/kernel/exit.c b/kernel/exit.c
index f2afdb0add7c..34b90e2e7cf7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,6 +68,7 @@
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>
+#include <linux/user_events.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -537,7 +538,7 @@ static void exit_mm(void)
return;
sync_mm_rss(mm);
mmap_read_lock(mm);
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
@@ -818,6 +819,7 @@ void __noreturn do_exit(long code)
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
+ user_events_exit(tsk);
validate_creds_for_do_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index d8cda4c6de6c..735d9f4f5acf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
+#include <linux/user_events.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -451,13 +452,49 @@ static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+#ifdef CONFIG_PER_VMA_LOCK
+
+/* SLAB cache for vm_area_struct.lock */
+static struct kmem_cache *vma_lock_cachep;
+
+static bool vma_lock_alloc(struct vm_area_struct *vma)
+{
+ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
+ if (!vma->vm_lock)
+ return false;
+
+ init_rwsem(&vma->vm_lock->lock);
+ vma->vm_lock_seq = -1;
+
+ return true;
+}
+
+static inline void vma_lock_free(struct vm_area_struct *vma)
+{
+ kmem_cache_free(vma_lock_cachep, vma->vm_lock);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
+static inline void vma_lock_free(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (vma)
- vma_init(vma, mm);
+ if (!vma)
+ return NULL;
+
+ vma_init(vma, mm);
+ if (!vma_lock_alloc(vma)) {
+ kmem_cache_free(vm_area_cachep, vma);
+ return NULL;
+ }
+
return vma;
}
@@ -465,26 +502,56 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
- if (new) {
- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
- ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
- /*
- * orig->shared.rb may be modified concurrently, but the clone
- * will be reinitialized.
- */
- data_race(memcpy(new, orig, sizeof(*new)));
- INIT_LIST_HEAD(&new->anon_vma_chain);
- dup_anon_vma_name(orig, new);
+ if (!new)
+ return NULL;
+
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+ ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+ /*
+ * orig->shared.rb may be modified concurrently, but the clone
+ * will be reinitialized.
+ */
+ data_race(memcpy(new, orig, sizeof(*new)));
+ if (!vma_lock_alloc(new)) {
+ kmem_cache_free(vm_area_cachep, new);
+ return NULL;
}
+ INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_numab_state_init(new);
+ dup_anon_vma_name(orig, new);
+
return new;
}
-void vm_area_free(struct vm_area_struct *vma)
+void __vm_area_free(struct vm_area_struct *vma)
{
+ vma_numab_state_free(vma);
free_anon_vma_name(vma);
+ vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma);
}
+#ifdef CONFIG_PER_VMA_LOCK
+static void vm_area_free_rcu_cb(struct rcu_head *head)
+{
+ struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
+ vm_rcu);
+
+ /* The vma should not be locked while being destroyed. */
+ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+ __vm_area_free(vma);
+}
+#endif
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
+#else
+ __vm_area_free(vma);
+#endif
+}
+
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -617,6 +684,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
if (retval)
goto out;
+ mt_clear_in_rcu(vmi.mas.tree);
for_each_vma(old_vmi, mpnt) {
struct file *file;
@@ -700,6 +768,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
retval = arch_dup_mmap(oldmm, mm);
loop_out:
vma_iter_free(&vmi);
+ if (!retval)
+ mt_set_in_rcu(vmi.mas.tree);
out:
mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
@@ -755,11 +825,6 @@ static void check_mm(struct mm_struct *mm)
for (i = 0; i < NR_MM_COUNTERS; i++) {
long x = percpu_counter_sum(&mm->rss_stat[i]);
- if (likely(!x))
- continue;
-
- /* Making sure this is not due to race with CPU offlining. */
- x = percpu_counter_sum_all(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
@@ -777,6 +842,67 @@ static void check_mm(struct mm_struct *mm)
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm)))
+static void do_check_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ WARN_ON_ONCE(current->active_mm == mm);
+}
+
+static void do_shoot_lazy_tlb(void *arg)
+{
+ struct mm_struct *mm = arg;
+
+ if (current->active_mm == mm) {
+ WARN_ON_ONCE(current->mm);
+ current->active_mm = &init_mm;
+ switch_mm(mm, &init_mm, current);
+ }
+}
+
+static void cleanup_lazy_tlbs(struct mm_struct *mm)
+{
+ if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
+ /*
+ * In this case, lazy tlb mms are refounted and would not reach
+ * __mmdrop until all CPUs have switched away and mmdrop()ed.
+ */
+ return;
+ }
+
+ /*
+ * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
+ * requires lazy mm users to switch to another mm when the refcount
+ * drops to zero, before the mm is freed. This requires IPIs here to
+ * switch kernel threads to init_mm.
+ *
+ * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
+ * switch with the final userspace teardown TLB flush which leaves the
+ * mm lazy on this CPU but no others, reducing the need for additional
+ * IPIs here. There are cases where a final IPI is still required here,
+ * such as the final mmdrop being performed on a different CPU than the
+ * one exiting, or kernel threads using the mm when userspace exits.
+ *
+ * IPI overheads have not found to be expensive, but they could be
+ * reduced in a number of possible ways, for example (roughly
+ * increasing order of complexity):
+ * - The last lazy reference created by exit_mm() could instead switch
+ * to init_mm, however it's probable this will run on the same CPU
+ * immediately afterwards, so this may not reduce IPIs much.
+ * - A batch of mms requiring IPIs could be gathered and freed at once.
+ * - CPUs store active_mm where it can be remotely checked without a
+ * lock, to filter out false-positives in the cpumask.
+ * - After mm_users or mm_count reaches zero, switching away from the
+ * mm could clear mm_cpumask to reduce some IPIs, perhaps together
+ * with some batching or delaying of the final IPIs.
+ * - A delayed freeing and RCU-like quiescing sequence based on mm
+ * switching to avoid IPIs completely.
+ */
+ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
+ if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
+ on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
+}
+
/*
* Called when the last reference to the mm
* is dropped: either by a lazy thread or by
@@ -788,6 +914,10 @@ void __mmdrop(struct mm_struct *mm)
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
+
+ /* Ensure no CPUs are using this as their lazy tlb mm */
+ cleanup_lazy_tlbs(mm);
+
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
@@ -795,6 +925,7 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+ mm_destroy_cid(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
percpu_counter_destroy(&mm->rss_stat[i]);
@@ -1059,7 +1190,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
+ tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
+ tsk->migrate_from_cpu = -1;
#endif
return tsk;
@@ -1130,6 +1263,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+ mm->mm_lock_seq = 0;
+#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@@ -1164,18 +1300,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ if (mm_alloc_cid(mm))
+ goto fail_cid;
+
for (i = 0; i < NR_MM_COUNTERS; i++)
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
- mm_init_cid(mm);
return mm;
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
+ mm_destroy_cid(mm);
+fail_cid:
+ destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
@@ -1627,7 +1768,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
int error = 0;
@@ -1639,6 +1781,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
if (!oldf)
goto out;
+ if (no_files) {
+ tsk->files = NULL;
+ goto out;
+ }
+
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
@@ -1956,6 +2103,91 @@ const struct file_operations pidfd_fops = {
#endif
};
+/**
+ * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+
+ * The helper doesn't perform checks on @pid which makes it useful for pidfds
+ * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
+ * pidfd file are prepared.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ int pidfd;
+ struct file *pidfd_file;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
+ pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (pidfd < 0)
+ return pidfd;
+
+ pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ flags | O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfd_file)) {
+ put_unused_fd(pidfd);
+ return PTR_ERR(pidfd_file);
+ }
+ get_pid(pid); /* held by pidfd_file now */
+ *ret = pidfd_file;
+ return pidfd;
+}
+
+/**
+ * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
+ * @pid: the struct pid for which to create a pidfd
+ * @flags: flags of the new @pidfd
+ * @pidfd: the pidfd to return
+ *
+ * Allocate a new file that stashes @pid and reserve a new pidfd number in the
+ * caller's file descriptor table. The pidfd is reserved but not installed yet.
+ *
+ * The helper verifies that @pid is used as a thread group leader.
+ *
+ * If this function returns successfully the caller is responsible to either
+ * call fd_install() passing the returned pidfd and pidfd file as arguments in
+ * order to install the pidfd into its file descriptor table or they must use
+ * put_unused_fd() and fput() on the returned pidfd and pidfd file
+ * respectively.
+ *
+ * This function is useful when a pidfd must already be reserved but there
+ * might still be points of failure afterwards and the caller wants to ensure
+ * that no pidfd is leaked into its file descriptor table.
+ *
+ * Return: On success, a reserved pidfd is returned from the function and a new
+ * pidfd file is returned in the last argument to the function. On
+ * error, a negative error code is returned from the function and the
+ * last argument remains unchanged.
+ */
+int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
+{
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ return __pidfd_prepare(pid, flags, ret);
+}
+
static void __delayed_free_task(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
@@ -2010,7 +2242,7 @@ static void rv_task_fork(struct task_struct *p)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
@@ -2103,6 +2335,8 @@ static __latent_entropy struct task_struct *copy_process(
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
+ if (args->user_worker)
+ p->flags |= PF_USER_WORKER;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
@@ -2112,6 +2346,9 @@ static __latent_entropy struct task_struct *copy_process(
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@@ -2254,7 +2491,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
- retval = copy_files(clone_flags, p);
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
@@ -2279,6 +2516,9 @@ static __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_io;
+ if (args->ignore_signals)
+ ignore_signals(p);
+
stackleak_task_init(p);
if (pid != &init_struct_pid) {
@@ -2296,21 +2536,12 @@ static __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ /* Note that no task has been attached to @pid yet. */
+ retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
-
pidfd = retval;
- pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
- O_RDWR | O_CLOEXEC);
- if (IS_ERR(pidfile)) {
- put_unused_fd(pidfd);
- retval = PTR_ERR(pidfile);
- goto bad_fork_free_pid;
- }
- get_pid(pid); /* held by pidfile now */
-
retval = put_user(pidfd, args->pidfd);
if (retval)
goto bad_fork_put_pidfd;
@@ -2505,6 +2736,7 @@ static __latent_entropy struct task_struct *copy_process(
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
+ user_events_fork(p, clone_flags);
copy_oom_score_adj(clone_flags, p);
@@ -2627,6 +2859,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
+ .user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
@@ -2730,7 +2963,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
/*
* Create a kernel thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
@@ -2738,6 +2972,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
+ .name = name,
.kthread = 1,
};
@@ -3067,6 +3302,9 @@ void __init proc_caches_init(void)
NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
+#ifdef CONFIG_PER_VMA_LOCK
+ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
+#endif
mmap_init();
nsproxy_cache_init();
}
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 322813366c6c..9a24574988d2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -28,7 +28,7 @@
/*
* The number of tasks checked:
*/
-int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Limit number of tasks checked in a batch.
@@ -47,9 +47,9 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_
/*
* Zero (default value) means use sysctl_hung_task_timeout_secs:
*/
-unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
+static unsigned long __read_mostly sysctl_hung_task_check_interval_secs;
-int __read_mostly sysctl_hung_task_warnings = 10;
+static int __read_mostly sysctl_hung_task_warnings = 10;
static int __read_mostly did_panic;
static bool hung_task_show_lock;
@@ -72,8 +72,8 @@ static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
* Should we panic (and reboot, if panic_timeout= is set) when a
* hung task is detected:
*/
-unsigned int __read_mostly sysctl_hung_task_panic =
- IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
+static unsigned int __read_mostly sysctl_hung_task_panic =
+ IS_ENABLED(CONFIG_BOOTPARAM_HUNG_TASK_PANIC);
static int
hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8ce75495e04f..d2742af0f0fd 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -189,9 +189,12 @@ void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action;
- for_each_action_of_desc(desc, action)
+ for_each_action_of_desc(desc, action) {
if (action->thread)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
+ if (action->secondary && action->secondary->thread)
+ set_bit(IRQTF_AFFINITY, &action->secondary->thread_flags);
+ }
}
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 7afa40fe5cc4..2f4fb336dda1 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -22,6 +22,8 @@
#include <asm/processor.h>
#include <linux/kasan.h>
+#include <trace/events/ipi.h>
+
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
static DEFINE_PER_CPU(struct task_struct *, irq_workd);
@@ -74,6 +76,14 @@ void __weak arch_irq_work_raise(void)
*/
}
+static __always_inline void irq_work_raise(struct irq_work *work)
+{
+ if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt())
+ trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func);
+
+ arch_irq_work_raise();
+}
+
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
@@ -99,7 +109,7 @@ static void __irq_work_queue_local(struct irq_work *work)
/* If the work is "lazy", handle it from next tick if any */
if (!lazy_work || tick_nohz_tick_stopped())
- arch_irq_work_raise();
+ irq_work_raise(work);
}
/* Enqueue the irq work @work on the current CPU */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 83f499182c9a..77747391f49b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -288,8 +288,7 @@ unsigned long kallsyms_lookup_name(const char *name)
* Iterate over all symbols in vmlinux. For symbols from modules use
* module_kallsyms_on_each_symbol instead.
*/
-int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
- unsigned long),
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
@@ -299,7 +298,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
- ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
+ ret = fn(data, namebuf, kallsyms_sym_address(i));
if (ret != 0)
return ret;
cond_resched();
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index bfbc12da3326..a2e3745d15c4 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -95,7 +95,7 @@ static struct test_item test_items[] = {
static char stub_name[KSYM_NAME_LEN];
-static int stat_symbol_len(void *data, const char *name, struct module *mod, unsigned long addr)
+static int stat_symbol_len(void *data, const char *name, unsigned long addr)
{
*(u32 *)data += strlen(name);
@@ -154,7 +154,7 @@ static void test_kallsyms_compression_ratio(void)
pr_info(" ---------------------------------------------------------\n");
}
-static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr)
+static int lookup_name(void *data, const char *name, unsigned long addr)
{
u64 t0, t1, t;
struct test_stat *stat = (struct test_stat *)data;
@@ -207,7 +207,7 @@ static bool match_cleanup_name(const char *s, const char *name)
return !strncmp(s, name, len);
}
-static int find_symbol(void *data, const char *name, struct module *mod, unsigned long addr)
+static int find_symbol(void *data, const char *name, unsigned long addr)
{
struct test_stat *stat = (struct test_stat *)data;
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 8cf70f068d92..a45f3dfc8d14 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -16,6 +16,6 @@ obj-y := core.o debugfs.o report.o
KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
-CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -fno-omit-frame-pointer
CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 54d077e1a2dc..5a60cc52adc0 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -337,11 +337,20 @@ static void delay_access(int type)
*/
static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
{
+ /*
+ * In the below we don't necessarily need the read of the location to
+ * be atomic, and we don't use READ_ONCE(), since all we need for race
+ * detection is to observe 2 different values.
+ *
+ * Furthermore, on certain architectures (such as arm64), READ_ONCE()
+ * may turn into more complex instructions than a plain load that cannot
+ * do unaligned accesses.
+ */
switch (size) {
- case 1: return READ_ONCE(*(const u8 *)ptr);
- case 2: return READ_ONCE(*(const u16 *)ptr);
- case 4: return READ_ONCE(*(const u32 *)ptr);
- case 8: return READ_ONCE(*(const u64 *)ptr);
+ case 1: return *(const volatile u8 *)ptr;
+ case 2: return *(const volatile u16 *)ptr;
+ case 4: return *(const volatile u32 *)ptr;
+ case 8: return *(const volatile u64 *)ptr;
default: return 0; /* Ignore; we do not diff the values. */
}
}
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index a60c561724be..0ddbdab5903d 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -1572,34 +1572,26 @@ static void test_exit(struct kunit *test)
}
__no_kcsan
-static void register_tracepoints(struct tracepoint *tp, void *ignore)
+static void register_tracepoints(void)
{
- check_trace_callback_type_console(probe_console);
- if (!strcmp(tp->name, "console"))
- WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+ register_trace_console(probe_console, NULL);
}
__no_kcsan
-static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+static void unregister_tracepoints(void)
{
- if (!strcmp(tp->name, "console"))
- tracepoint_probe_unregister(tp, probe_console, NULL);
+ unregister_trace_console(probe_console, NULL);
}
static int kcsan_suite_init(struct kunit_suite *suite)
{
- /*
- * Because we want to be able to build the test as a module, we need to
- * iterate through all known tracepoints, since the static registration
- * won't work here.
- */
- for_each_kernel_tracepoint(register_tracepoints, NULL);
+ register_tracepoints();
return 0;
}
static void kcsan_suite_exit(struct kunit_suite *suite)
{
- for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+ unregister_tracepoints();
tracepoint_synchronize_unregister();
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f1a0e4e3fb5c..f989f5f1933b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -65,7 +65,7 @@ int kexec_image_probe_default(struct kimage *image, void *buf,
return ret;
}
-void *kexec_image_load_default(struct kimage *image)
+static void *kexec_image_load_default(struct kimage *image)
{
if (!image->fops || !image->fops->load)
return ERR_PTR(-ENOEXEC);
@@ -249,8 +249,8 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
/* IMA needs to pass the measurement list to the next kernel. */
ima_add_kexec_buffer(image);
- /* Call arch image load handlers */
- ldata = arch_kexec_kernel_image_load(image);
+ /* Call image load handler */
+ ldata = kexec_image_load_default(image);
if (IS_ERR(ldata)) {
ret = PTR_ERR(ldata);
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 8f69772af77b..42163c9e94e5 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -26,15 +26,15 @@ asm (
" .popsection \n"
);
-extern char kernel_headers_data;
-extern char kernel_headers_data_end;
+extern char kernel_headers_data[];
+extern char kernel_headers_data_end[];
static ssize_t
ikheaders_read(struct file *file, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t len)
{
- memcpy(buf, &kernel_headers_data + off, len);
+ memcpy(buf, &kernel_headers_data[off], len);
return len;
}
@@ -48,8 +48,8 @@ static struct bin_attribute kheaders_attr __ro_after_init = {
static int __init ikheaders_init(void)
{
- kheaders_attr.size = (&kernel_headers_data_end -
- &kernel_headers_data);
+ kheaders_attr.size = (kernel_headers_data_end -
+ kernel_headers_data);
return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0408aab80941..aad7a3bfd846 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -39,7 +39,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
static ssize_t uevent_seqnum_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)uevent_seqnum);
}
KERNEL_ATTR_RO(uevent_seqnum);
@@ -64,7 +64,7 @@ KERNEL_ATTR_RO(address_bits);
static ssize_t uevent_helper_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%s\n", uevent_helper);
+ return sysfs_emit(buf, "%s\n", uevent_helper);
}
static ssize_t uevent_helper_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -85,7 +85,7 @@ KERNEL_ATTR_RW(uevent_helper);
static ssize_t profiling_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", prof_on);
+ return sysfs_emit(buf, "%d\n", prof_on);
}
static ssize_t profiling_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -116,14 +116,14 @@ KERNEL_ATTR_RW(profiling);
static ssize_t kexec_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", !!kexec_image);
+ return sysfs_emit(buf, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);
static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", kexec_crash_loaded());
+ return sysfs_emit(buf, "%d\n", kexec_crash_loaded());
}
KERNEL_ATTR_RO(kexec_crash_loaded);
@@ -135,7 +135,7 @@ static ssize_t kexec_crash_size_show(struct kobject *kobj,
if (size < 0)
return size;
- return sprintf(buf, "%zd\n", size);
+ return sysfs_emit(buf, "%zd\n", size);
}
static ssize_t kexec_crash_size_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -160,8 +160,8 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
- return sprintf(buf, "%pa %x\n", &vmcore_base,
- (unsigned int)VMCOREINFO_NOTE_SIZE);
+ return sysfs_emit(buf, "%pa %x\n", &vmcore_base,
+ (unsigned int)VMCOREINFO_NOTE_SIZE);
}
KERNEL_ATTR_RO(vmcoreinfo);
@@ -171,7 +171,7 @@ KERNEL_ATTR_RO(vmcoreinfo);
static ssize_t fscaps_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", file_caps_enabled);
+ return sysfs_emit(buf, "%d\n", file_caps_enabled);
}
KERNEL_ATTR_RO(fscaps);
@@ -180,7 +180,7 @@ int rcu_expedited;
static ssize_t rcu_expedited_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited));
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_expedited));
}
static ssize_t rcu_expedited_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -197,7 +197,7 @@ int rcu_normal;
static ssize_t rcu_normal_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", READ_ONCE(rcu_normal));
+ return sysfs_emit(buf, "%d\n", READ_ONCE(rcu_normal));
}
static ssize_t rcu_normal_store(struct kobject *kobj,
struct kobj_attribute *attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7e6751b29101..490792b1066e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -38,6 +38,7 @@ struct task_struct *kthreadd_task;
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
+ char *full_name;
int (*threadfn)(void *data);
void *data;
int node;
@@ -343,10 +344,12 @@ static int kthread(void *_create)
/* Release the structure when caller killed by a fatal signal. */
done = xchg(&create->done, NULL);
if (!done) {
+ kfree(create->full_name);
kfree(create);
kthread_exit(-EINTR);
}
+ self->full_name = create->full_name;
self->threadfn = threadfn;
self->data = data;
@@ -396,11 +399,13 @@ static void create_kthread(struct kthread_create_info *create)
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ pid = kernel_thread(kthread, create, create->full_name,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
/* Release the structure when caller killed by a fatal signal. */
struct completion *done = xchg(&create->done, NULL);
+ kfree(create->full_name);
if (!done) {
kfree(create);
return;
@@ -427,6 +432,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
create->data = data;
create->node = node;
create->done = &done;
+ create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+ if (!create->full_name) {
+ task = ERR_PTR(-ENOMEM);
+ goto free_create;
+ }
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
@@ -453,26 +463,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
wait_for_completion(&done);
}
task = create->result;
- if (!IS_ERR(task)) {
- char name[TASK_COMM_LEN];
- va_list aq;
- int len;
-
- /*
- * task is already visible to other tasks, so updating
- * COMM must be protected.
- */
- va_copy(aq, args);
- len = vsnprintf(name, sizeof(name), namefmt, aq);
- va_end(aq);
- if (len >= TASK_COMM_LEN) {
- struct kthread *kthread = to_kthread(task);
-
- /* leave it truncated when out of memory. */
- kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
- }
- set_task_comm(task, name);
- }
+free_create:
kfree(create);
return task;
}
@@ -1415,14 +1406,18 @@ void kthread_use_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
+ /*
+ * It is possible for mm to be the same as tsk->active_mm, but
+ * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
+ * because these references are not equivalent.
+ */
+ mmgrab(mm);
+
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
- tsk->active_mm = mm;
- }
+ tsk->active_mm = mm;
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
@@ -1439,12 +1434,9 @@ void kthread_use_mm(struct mm_struct *mm)
* memory barrier after storing to tsk->mm, before accessing
* user-space memory. A full memory barrier for membarrier
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
- * mmdrop(), or explicitly with smp_mb().
+ * mmdrop_lazy_tlb().
*/
- if (active_mm != mm)
- mmdrop(active_mm);
- else
- smp_mb();
+ mmdrop_lazy_tlb(active_mm);
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1472,10 +1464,13 @@ void kthread_unuse_mm(struct mm_struct *mm)
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
+ mmgrab_lazy_tlb(mm);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
task_unlock(tsk);
+
+ mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 4bd2d5e10f20..61328328c474 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -33,6 +33,7 @@
*
* - klp_ftrace_handler()
* - klp_update_patch_state()
+ * - __klp_sched_try_switch()
*/
DEFINE_MUTEX(klp_mutex);
@@ -142,8 +143,7 @@ static int klp_match_callback(void *data, unsigned long addr)
return 0;
}
-static int klp_find_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int klp_find_callback(void *data, const char *name, unsigned long addr)
{
struct klp_find_arg *args = data;
@@ -596,7 +596,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)
complete(&patch->finish);
}
-static struct kobj_type klp_ktype_patch = {
+static const struct kobj_type klp_ktype_patch = {
.release = klp_kobj_release_patch,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_patch_groups,
@@ -612,7 +612,7 @@ static void klp_kobj_release_object(struct kobject *kobj)
klp_free_object_dynamic(obj);
}
-static struct kobj_type klp_ktype_object = {
+static const struct kobj_type klp_ktype_object = {
.release = klp_kobj_release_object,
.sysfs_ops = &kobj_sysfs_ops,
.default_groups = klp_object_groups,
@@ -628,7 +628,7 @@ static void klp_kobj_release_func(struct kobject *kobj)
klp_free_func_nop(func);
}
-static struct kobj_type klp_ktype_func = {
+static const struct kobj_type klp_ktype_func = {
.release = klp_kobj_release_func,
.sysfs_ops = &kobj_sysfs_ops,
};
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index f1b25ec581e0..e9fd83a02228 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,11 +9,14 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
+#include <linux/static_call.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
#define MAX_STACK_ENTRIES 100
+DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+
#define STACK_ERR_BUF_SIZE 128
#define SIGNALS_TIMEOUT 15
@@ -25,6 +28,25 @@ static int klp_target_state = KLP_UNDEFINED;
static unsigned int klp_signals_cnt;
/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * cond_resched(). This helps CPU-bound kthreads get patched.
+ */
+#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+#define klp_cond_resched_enable() sched_dynamic_klp_enable()
+#define klp_cond_resched_disable() sched_dynamic_klp_disable()
+
+#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+EXPORT_SYMBOL(klp_sched_try_switch_key);
+
+#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+/*
* This work can be performed periodically to finish patching or unpatching any
* "straggler" tasks which failed to transition in the first attempt.
*/
@@ -172,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task)
* barrier (smp_rmb) for two cases:
*
* 1) Enforce the order of the TIF_PATCH_PENDING read and the
- * klp_target_state read. The corresponding write barrier is in
- * klp_init_transition().
+ * klp_target_state read. The corresponding write barriers are in
+ * klp_init_transition() and klp_reverse_transition().
*
* 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
* of func->transition, if klp_ftrace_handler() is called later on
@@ -240,12 +262,15 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
*/
static int klp_check_stack(struct task_struct *task, const char **oldname)
{
- static unsigned long entries[MAX_STACK_ENTRIES];
+ unsigned long *entries = this_cpu_ptr(klp_stack_entries);
struct klp_object *obj;
struct klp_func *func;
int ret, nr_entries;
- ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
+ /* Protect 'klp_stack_entries' */
+ lockdep_assert_preemption_disabled();
+
+ ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
if (ret < 0)
return -EINVAL;
nr_entries = ret;
@@ -307,7 +332,11 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
- ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+ if (task == current)
+ ret = klp_check_and_switch_task(current, &old_name);
+ else
+ ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+
switch (ret) {
case 0: /* success */
break;
@@ -334,6 +363,44 @@ static bool klp_try_switch_task(struct task_struct *task)
return !ret;
}
+void __klp_sched_try_switch(void)
+{
+ if (likely(!klp_patch_pending(current)))
+ return;
+
+ /*
+ * This function is called from cond_resched() which is called in many
+ * places throughout the kernel. Using the klp_mutex here might
+ * deadlock.
+ *
+ * Instead, disable preemption to prevent racing with other callers of
+ * klp_try_switch_task(). Thanks to task_call_func() they won't be
+ * able to switch this task while it's running.
+ */
+ preempt_disable();
+
+ /*
+ * Make sure current didn't get patched between the above check and
+ * preempt_disable().
+ */
+ if (unlikely(!klp_patch_pending(current)))
+ goto out;
+
+ /*
+ * Enforce the order of the TIF_PATCH_PENDING read above and the
+ * klp_target_state read in klp_try_switch_task(). The corresponding
+ * write barriers are in klp_init_transition() and
+ * klp_reverse_transition().
+ */
+ smp_rmb();
+
+ klp_try_switch_task(current);
+
+out:
+ preempt_enable();
+}
+EXPORT_SYMBOL(__klp_sched_try_switch);
+
/*
* Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
* Kthreads with TIF_PATCH_PENDING set are woken up.
@@ -440,7 +507,8 @@ void klp_try_complete_transition(void)
return;
}
- /* we're done, now cleanup the data structures */
+ /* Done! Now cleanup the data structures. */
+ klp_cond_resched_disable();
patch = klp_transition_patch;
klp_complete_transition();
@@ -492,6 +560,8 @@ void klp_start_transition(void)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+ klp_cond_resched_enable();
+
klp_signals_cnt = 0;
}
@@ -547,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
* see a func in transition with a task->patch_state of KLP_UNDEFINED.
*
* Also enforce the order of the klp_target_state write and future
- * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
- * set a task->patch_state to KLP_UNDEFINED.
+ * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+ * __klp_sched_try_switch() don't set a task->patch_state to
+ * KLP_UNDEFINED.
*/
smp_wmb();
@@ -584,14 +655,10 @@ void klp_reverse_transition(void)
klp_target_state == KLP_PATCHED ? "patching to unpatching" :
"unpatching to patching");
- klp_transition_patch->enabled = !klp_transition_patch->enabled;
-
- klp_target_state = !klp_target_state;
-
/*
* Clear all TIF_PATCH_PENDING flags to prevent races caused by
- * klp_update_patch_state() running in parallel with
- * klp_start_transition().
+ * klp_update_patch_state() or __klp_sched_try_switch() running in
+ * parallel with the reverse transition.
*/
read_lock(&tasklist_lock);
for_each_process_thread(g, task)
@@ -601,9 +668,28 @@ void klp_reverse_transition(void)
for_each_possible_cpu(cpu)
clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
- /* Let any remaining calls to klp_update_patch_state() complete */
+ /*
+ * Make sure all existing invocations of klp_update_patch_state() and
+ * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+ * starting the reverse transition.
+ */
klp_synchronize_transition();
+ /*
+ * All patching has stopped, now re-initialize the global variables to
+ * prepare for the reverse transition.
+ */
+ klp_transition_patch->enabled = !klp_transition_patch->enabled;
+ klp_target_state = !klp_target_state;
+
+ /*
+ * Enforce the order of the klp_target_state write and the
+ * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+ * klp_update_patch_state() and __klp_sched_try_switch() don't set
+ * task->patch_state to the wrong value.
+ */
+ smp_wmb();
+
klp_start_transition();
}
@@ -617,9 +703,9 @@ void klp_copy_process(struct task_struct *child)
* the task flag up to date with the parent here.
*
* The operation is serialized against all klp_*_transition()
- * operations by the tasklist_lock. The only exception is
- * klp_update_patch_state(current), but we cannot race with
- * that because we are current.
+ * operations by the tasklist_lock. The only exceptions are
+ * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+ * cannot race with them because we are current.
*/
if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
set_tsk_thread_flag(child, TIF_PATCH_PENDING);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 50d4863974e7..dcd1d5bfc1e0 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1881,6 +1881,8 @@ print_circular_lock_scenario(struct held_lock *src,
struct lock_class *source = hlock_class(src);
struct lock_class *target = hlock_class(tgt);
struct lock_class *parent = prt->class;
+ int src_read = src->read;
+ int tgt_read = tgt->read;
/*
* A direct locking problem where unsafe_class lock is taken
@@ -1908,7 +1910,10 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" Possible unsafe locking scenario:\n\n");
printk(" CPU0 CPU1\n");
printk(" ---- ----\n");
- printk(" lock(");
+ if (tgt_read != 0)
+ printk(" rlock(");
+ else
+ printk(" lock(");
__print_lock_name(target);
printk(KERN_CONT ");\n");
printk(" lock(");
@@ -1917,7 +1922,12 @@ print_circular_lock_scenario(struct held_lock *src,
printk(" lock(");
__print_lock_name(target);
printk(KERN_CONT ");\n");
- printk(" lock(");
+ if (src_read != 0)
+ printk(" rlock(");
+ else if (src->sync)
+ printk(" sync(");
+ else
+ printk(" lock(");
__print_lock_name(source);
printk(KERN_CONT ");\n");
printk("\n *** DEADLOCK ***\n\n");
@@ -4531,7 +4541,13 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
return 0;
}
}
- if (!hlock->hardirqs_off) {
+
+ /*
+ * For lock_sync(), don't mark the ENABLED usage, since lock_sync()
+ * creates no critical section and no extra dependency can be introduced
+ * by interrupts
+ */
+ if (!hlock->hardirqs_off && !hlock->sync) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQ_READ))
@@ -4910,7 +4926,7 @@ static int __lock_is_held(const struct lockdep_map *lock, int read);
static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check, int hardirqs_off,
struct lockdep_map *nest_lock, unsigned long ip,
- int references, int pin_count)
+ int references, int pin_count, int sync)
{
struct task_struct *curr = current;
struct lock_class *class = NULL;
@@ -4961,7 +4977,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
class_idx = class - lock_classes;
- if (depth) { /* we're holding locks */
+ if (depth && !sync) {
+ /* we're holding locks and the new held lock is not a sync */
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
if (!references)
@@ -4995,6 +5012,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->trylock = trylock;
hlock->read = read;
hlock->check = check;
+ hlock->sync = !!sync;
hlock->hardirqs_off = !!hardirqs_off;
hlock->references = references;
#ifdef CONFIG_LOCK_STAT
@@ -5056,6 +5074,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
+ /* For lock_sync(), we are done here since no actual critical section */
+ if (hlock->sync)
+ return 1;
+
curr->curr_chain_key = chain_key;
curr->lockdep_depth++;
check_chain_key(curr);
@@ -5197,7 +5219,7 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count)) {
+ hlock->references, hlock->pin_count, 0)) {
case 0:
return 1;
case 1:
@@ -5667,7 +5689,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
- irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 0);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
}
@@ -5693,6 +5715,34 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_release);
+/*
+ * lock_sync() - A special annotation for synchronize_{s,}rcu()-like API.
+ *
+ * No actual critical section is created by the APIs annotated with this: these
+ * APIs are used to wait for one or multiple critical sections (on other CPUs
+ * or threads), and it means that calling these APIs inside these critical
+ * sections is potential deadlock.
+ */
+void lock_sync(struct lockdep_map *lock, unsigned subclass, int read,
+ int check, struct lockdep_map *nest_lock, unsigned long ip)
+{
+ unsigned long flags;
+
+ if (unlikely(!lockdep_enabled()))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ lockdep_recursion_inc();
+ __lock_acquire(lock, subclass, 0, read, check,
+ irqs_disabled_flags(flags), nest_lock, ip, 0, 0, 1);
+ check_chain_key(current);
+ lockdep_recursion_finish();
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_sync);
+
noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
{
unsigned long flags;
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f04b1978899d..153ddc4c47ef 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -51,8 +51,11 @@ torture_param(int, rt_boost, 2,
torture_param(int, rt_boost_factor, 50, "A factor determining how often rt-boost happens.");
torture_param(int, verbose, 1,
"Enable verbose debugging printk()s");
+torture_param(int, nested_locks, 0, "Number of nested locks (max = 8)");
+/* Going much higher trips "BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!" errors */
+#define MAX_NESTED_LOCKS 8
-static char *torture_type = "spin_lock";
+static char *torture_type = IS_ENABLED(CONFIG_PREEMPT_RT) ? "raw_spin_lock" : "spin_lock";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type,
"Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
@@ -79,10 +82,12 @@ static void lock_torture_cleanup(void);
struct lock_torture_ops {
void (*init)(void);
void (*exit)(void);
+ int (*nested_lock)(int tid, u32 lockset);
int (*writelock)(int tid);
void (*write_delay)(struct torture_random_state *trsp);
void (*task_boost)(struct torture_random_state *trsp);
void (*writeunlock)(int tid);
+ void (*nested_unlock)(int tid, u32 lockset);
int (*readlock)(int tid);
void (*read_delay)(struct torture_random_state *trsp);
void (*readunlock)(int tid);
@@ -252,6 +257,59 @@ static struct lock_torture_ops spin_lock_irq_ops = {
.name = "spin_lock_irq"
};
+static DEFINE_RAW_SPINLOCK(torture_raw_spinlock);
+
+static int torture_raw_spin_lock_write_lock(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ raw_spin_lock(&torture_raw_spinlock);
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock(&torture_raw_spinlock);
+}
+
+static struct lock_torture_ops raw_spin_lock_ops = {
+ .writelock = torture_raw_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock"
+};
+
+static int torture_raw_spin_lock_write_lock_irq(int tid __maybe_unused)
+__acquires(torture_raw_spinlock)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&torture_raw_spinlock, flags);
+ cxt.cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_raw_spin_lock_write_unlock_irq(int tid __maybe_unused)
+__releases(torture_raw_spinlock)
+{
+ raw_spin_unlock_irqrestore(&torture_raw_spinlock, cxt.cur_ops->flags);
+}
+
+static struct lock_torture_ops raw_spin_lock_irq_ops = {
+ .writelock = torture_raw_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .task_boost = torture_rt_boost,
+ .writeunlock = torture_raw_spin_lock_write_unlock_irq,
+ .readlock = NULL,
+ .read_delay = NULL,
+ .readunlock = NULL,
+ .name = "raw_spin_lock_irq"
+};
+
static DEFINE_RWLOCK(torture_rwlock);
static int torture_rwlock_write_lock(int tid __maybe_unused)
@@ -365,6 +423,28 @@ static struct lock_torture_ops rw_lock_irq_ops = {
};
static DEFINE_MUTEX(torture_mutex);
+static struct mutex torture_nested_mutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_mutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_mutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __mutex_init(&torture_nested_mutexes[i], __func__,
+ &nested_mutex_keys[i]);
+}
+
+static int torture_mutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ mutex_lock(&torture_nested_mutexes[i]);
+ return 0;
+}
static int torture_mutex_lock(int tid __maybe_unused)
__acquires(torture_mutex)
@@ -393,11 +473,24 @@ __releases(torture_mutex)
mutex_unlock(&torture_mutex);
}
+static void torture_mutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ mutex_unlock(&torture_nested_mutexes[i]);
+}
+
static struct lock_torture_ops mutex_lock_ops = {
+ .init = torture_mutex_init,
+ .nested_lock = torture_mutex_nested_lock,
.writelock = torture_mutex_lock,
.write_delay = torture_mutex_delay,
.task_boost = torture_rt_boost,
.writeunlock = torture_mutex_unlock,
+ .nested_unlock = torture_mutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -504,6 +597,28 @@ static struct lock_torture_ops ww_mutex_lock_ops = {
#ifdef CONFIG_RT_MUTEXES
static DEFINE_RT_MUTEX(torture_rtmutex);
+static struct rt_mutex torture_nested_rtmutexes[MAX_NESTED_LOCKS];
+static struct lock_class_key nested_rtmutex_keys[MAX_NESTED_LOCKS];
+
+static void torture_rtmutex_init(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_NESTED_LOCKS; i++)
+ __rt_mutex_init(&torture_nested_rtmutexes[i], __func__,
+ &nested_rtmutex_keys[i]);
+}
+
+static int torture_rtmutex_nested_lock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = 0; i < nested_locks; i++)
+ if (lockset & (1 << i))
+ rt_mutex_lock(&torture_nested_rtmutexes[i]);
+ return 0;
+}
static int torture_rtmutex_lock(int tid __maybe_unused)
__acquires(torture_rtmutex)
@@ -545,11 +660,24 @@ static void torture_rt_boost_rtmutex(struct torture_random_state *trsp)
__torture_rt_boost(trsp);
}
+static void torture_rtmutex_nested_unlock(int tid __maybe_unused,
+ u32 lockset)
+{
+ int i;
+
+ for (i = nested_locks - 1; i >= 0; i--)
+ if (lockset & (1 << i))
+ rt_mutex_unlock(&torture_nested_rtmutexes[i]);
+}
+
static struct lock_torture_ops rtmutex_lock_ops = {
+ .init = torture_rtmutex_init,
+ .nested_lock = torture_rtmutex_nested_lock,
.writelock = torture_rtmutex_lock,
.write_delay = torture_rtmutex_delay,
.task_boost = torture_rt_boost_rtmutex,
.writeunlock = torture_rtmutex_unlock,
+ .nested_unlock = torture_rtmutex_nested_unlock,
.readlock = NULL,
.read_delay = NULL,
.readunlock = NULL,
@@ -684,6 +812,8 @@ static int lock_torture_writer(void *arg)
struct lock_stress_stats *lwsp = arg;
int tid = lwsp - cxt.lwsa;
DEFINE_TORTURE_RANDOM(rand);
+ u32 lockset_mask;
+ bool skip_main_lock;
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
set_user_nice(current, MAX_NICE);
@@ -692,19 +822,40 @@ static int lock_torture_writer(void *arg)
if ((torture_random(&rand) & 0xfffff) == 0)
schedule_timeout_uninterruptible(1);
- cxt.cur_ops->task_boost(&rand);
- cxt.cur_ops->writelock(tid);
- if (WARN_ON_ONCE(lock_is_write_held))
- lwsp->n_lock_fail++;
- lock_is_write_held = true;
- if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
- lwsp->n_lock_fail++; /* rare, but... */
+ lockset_mask = torture_random(&rand);
+ /*
+ * When using nested_locks, we want to occasionally
+ * skip the main lock so we can avoid always serializing
+ * the lock chains on that central lock. By skipping the
+ * main lock occasionally, we can create different
+ * contention patterns (allowing for multiple disjoint
+ * blocked trees)
+ */
+ skip_main_lock = (nested_locks &&
+ !(torture_random(&rand) % 100));
- lwsp->n_lock_acquired++;
+ cxt.cur_ops->task_boost(&rand);
+ if (cxt.cur_ops->nested_lock)
+ cxt.cur_ops->nested_lock(tid, lockset_mask);
+
+ if (!skip_main_lock) {
+ cxt.cur_ops->writelock(tid);
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_lock_fail++;
+ lock_is_write_held = true;
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
+ lwsp->n_lock_fail++; /* rare, but... */
+
+ lwsp->n_lock_acquired++;
+ }
cxt.cur_ops->write_delay(&rand);
- lock_is_write_held = false;
- WRITE_ONCE(last_lock_release, jiffies);
- cxt.cur_ops->writeunlock(tid);
+ if (!skip_main_lock) {
+ lock_is_write_held = false;
+ WRITE_ONCE(last_lock_release, jiffies);
+ cxt.cur_ops->writeunlock(tid);
+ }
+ if (cxt.cur_ops->nested_unlock)
+ cxt.cur_ops->nested_unlock(tid, lockset_mask);
stutter_wait("lock_torture_writer");
} while (!torture_must_stop());
@@ -845,11 +996,11 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
const char *tag)
{
pr_alert("%s" TORTURE_FLAG
- "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ "--- %s%s: nwriters_stress=%d nreaders_stress=%d nested_locks=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
torture_type, tag, cxt.debug_lock ? " [debug]": "",
- cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
- verbose, shuffle_interval, stutter, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ cxt.nrealwriters_stress, cxt.nrealreaders_stress,
+ nested_locks, stat_interval, verbose, shuffle_interval,
+ stutter, shutdown_secs, onoff_interval, onoff_holdoff);
}
static void lock_torture_cleanup(void)
@@ -919,6 +1070,7 @@ static int __init lock_torture_init(void)
static struct lock_torture_ops *torture_ops[] = {
&lock_busted_ops,
&spin_lock_ops, &spin_lock_irq_ops,
+ &raw_spin_lock_ops, &raw_spin_lock_irq_ops,
&rw_lock_ops, &rw_lock_irq_ops,
&mutex_lock_ops,
&ww_mutex_lock_ops,
@@ -1068,6 +1220,10 @@ static int __init lock_torture_init(void)
}
}
+ /* cap nested_locks to MAX_NESTED_LOCKS */
+ if (nested_locks > MAX_NESTED_LOCKS)
+ nested_locks = MAX_NESTED_LOCKS;
+
if (cxt.cur_ops->readlock) {
reader_tasks = kcalloc(cxt.nrealreaders_stress,
sizeof(reader_tasks[0]),
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 29dc253d03af..93cca6e69860 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -659,7 +659,7 @@ static int __init test_ww_mutex_init(void)
if (ret)
return ret;
- ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
+ ret = stress(2047, hweight32(STRESS_ALL)*ncpus, STRESS_ALL);
if (ret)
return ret;
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 424b3bc58f3f..33a2e991f608 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -22,6 +22,104 @@ menuconfig MODULES
if MODULES
+config MODULE_DEBUGFS
+ bool
+
+config MODULE_DEBUG
+ bool "Module debugging"
+ depends on DEBUG_FS
+ help
+ Allows you to enable / disable features which can help you debug
+ modules. You don't need these options on production systems.
+
+if MODULE_DEBUG
+
+config MODULE_STATS
+ bool "Module statistics"
+ depends on DEBUG_FS
+ select MODULE_DEBUGFS
+ help
+ This option allows you to maintain a record of module statistics.
+ For example, size of all modules, average size, text size, a list
+ of failed modules and the size for each of those. For failed
+ modules we keep track of modules which failed due to either the
+ existing module taking too long to load or that module was already
+ loaded.
+
+ You should enable this if you are debugging production loads
+ and want to see if userspace or the kernel is doing stupid things
+ with loading modules when it shouldn't or if you want to help
+ optimize userspace / kernel space module autoloading schemes.
+ You might want to do this because failed modules tend to use
+ up significant amount of memory, and so you'd be doing everyone a
+ favor in avoiding these failures proactively.
+
+ This functionality is also useful for those experimenting with
+ module .text ELF section optimization.
+
+ If unsure, say N.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS
+ bool "Debug duplicate modules with auto-loading"
+ help
+ Module autoloading allows in-kernel code to request modules through
+ the *request_module*() API calls. This in turn just calls userspace
+ modprobe. Although modprobe checks to see if a module is already
+ loaded before trying to load a module there is a small time window in
+ which multiple duplicate requests can end up in userspace and multiple
+ modprobe calls race calling finit_module() around the same time for
+ duplicate modules. The finit_module() system call can consume in the
+ worst case more than twice the respective module size in virtual
+ memory for each duplicate module requests. Although duplicate module
+ requests are non-fatal virtual memory is a limited resource and each
+ duplicate module request ends up just unnecessarily straining virtual
+ memory.
+
+ This debugging facility will create pr_warn() splats for duplicate
+ module requests to help identify if module auto-loading may be the
+ culprit to your early boot virtual memory pressure. Since virtual
+ memory abuse caused by duplicate module requests could render a
+ system unusable this functionality will also converge races in
+ requests for the same module to a single request. You can boot with
+ the module.enable_dups_trace=1 kernel parameter to use WARN_ON()
+ instead of the pr_warn().
+
+ If the first module request used request_module_nowait() we cannot
+ use that as the anchor to wait for duplicate module requests, since
+ users of request_module() do want a proper return value. If a call
+ for the same module happened earlier with request_module() though,
+ then a duplicate request_module_nowait() would be detected. The
+ non-wait request_module() call is synchronous and waits until modprobe
+ completes. Subsequent auto-loading requests for the same module do
+ not trigger a new finit_module() calls and do not strain virtual
+ memory, and so as soon as modprobe successfully completes we remove
+ tracking for duplicates for that module.
+
+ Enable this functionality to try to debug virtual memory abuse during
+ boot on systems which are failing to boot or if you suspect you may be
+ straining virtual memory during boot, and you want to identify if the
+ abuse was due to module auto-loading. These issues are currently only
+ known to occur on systems with many CPUs (over 400) and is likely the
+ result of udev issuing duplicate module requests for each CPU, and so
+ module auto-loading is not the culprit. There may very well still be
+ many duplicate module auto-loading requests which could be optimized
+ for and this debugging facility can be used to help identify them.
+
+ Only enable this for debugging system functionality, never have it
+ enabled on real systems.
+
+config MODULE_DEBUG_AUTOLOAD_DUPS_TRACE
+ bool "Force full stack trace when duplicates are found"
+ depends on MODULE_DEBUG_AUTOLOAD_DUPS
+ help
+ Enabling this will force a full stack trace for duplicate module
+ auto-loading requests using WARN_ON() instead of pr_warn(). You
+ should keep this disabled at all times unless you are a developer
+ and are doing a manual inspection and want to debug exactly why
+ these duplicates occur.
+
+endif # MODULE_DEBUG
+
config MODULE_FORCE_LOAD
bool "Forced module loading"
default n
@@ -51,7 +149,7 @@ config MODULE_FORCE_UNLOAD
config MODULE_UNLOAD_TAINT_TRACKING
bool "Tainted module unload tracking"
depends on MODULE_UNLOAD
- default n
+ select MODULE_DEBUGFS
help
This option allows you to maintain a record of each unloaded
module that tainted the kernel. In addition to displaying a
diff --git a/kernel/module/Makefile b/kernel/module/Makefile
index 948efea81e85..a10b2b9a6fdf 100644
--- a/kernel/module/Makefile
+++ b/kernel/module/Makefile
@@ -7,7 +7,10 @@
# and produce insane amounts of uninteresting coverage.
KCOV_INSTRUMENT_module.o := n
-obj-y += main.o strict_rwx.o
+obj-y += main.o
+obj-y += strict_rwx.o
+obj-y += kmod.o
+obj-$(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS) += dups.o
obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o
obj-$(CONFIG_MODULE_SIG) += signing.o
obj-$(CONFIG_LIVEPATCH) += livepatch.o
@@ -19,3 +22,4 @@ obj-$(CONFIG_SYSFS) += sysfs.o
obj-$(CONFIG_KGDB_KDB) += kdb.o
obj-$(CONFIG_MODVERSIONS) += version.o
obj-$(CONFIG_MODULE_UNLOAD_TAINT_TRACKING) += tracking.o
+obj-$(CONFIG_MODULE_STATS) += stats.o
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index bb79ac1a6d8f..e97232b125eb 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -267,7 +267,7 @@ static ssize_t module_zstd_decompress(struct load_info *info,
zstd_dec.size = PAGE_SIZE;
ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
- kunmap(page);
+ kunmap_local(zstd_dec.dst);
retval = zstd_get_error_code(ret);
if (retval)
break;
@@ -297,6 +297,10 @@ int module_decompress(struct load_info *info, const void *buf, size_t size)
ssize_t data_size;
int error;
+#if defined(CONFIG_MODULE_STATS)
+ info->compressed_len = size;
+#endif
+
/*
* Start with number of pages twice as big as needed for
* compressed data.
diff --git a/kernel/module/dups.c b/kernel/module/dups.c
new file mode 100644
index 000000000000..aa8e1361fdb5
--- /dev/null
+++ b/kernel/module/dups.c
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * kmod dups - the kernel module autoloader duplicate suppressor
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#define pr_fmt(fmt) "module: " fmt
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/binfmts.h>
+#include <linux/syscalls.h>
+#include <linux/unistd.h>
+#include <linux/kmod.h>
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/cred.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/workqueue.h>
+#include <linux/security.h>
+#include <linux/mount.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/resource.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+#include <linux/rwsem.h>
+#include <linux/ptrace.h>
+#include <linux/async.h>
+#include <linux/uaccess.h>
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "module."
+static bool enable_dups_trace = IS_ENABLED(CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS_TRACE);
+module_param(enable_dups_trace, bool_enable_only, 0644);
+
+/*
+ * Protects dup_kmod_reqs list, adds / removals with RCU.
+ */
+static DEFINE_MUTEX(kmod_dup_mutex);
+static LIST_HEAD(dup_kmod_reqs);
+
+struct kmod_dup_req {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ struct completion first_req_done;
+ struct work_struct complete_work;
+ struct delayed_work delete_work;
+ int dup_ret;
+};
+
+static struct kmod_dup_req *kmod_dup_request_lookup(char *module_name)
+{
+ struct kmod_dup_req *kmod_req;
+
+ list_for_each_entry_rcu(kmod_req, &dup_kmod_reqs, list,
+ lockdep_is_held(&kmod_dup_mutex)) {
+ if (strlen(kmod_req->name) == strlen(module_name) &&
+ !memcmp(kmod_req->name, module_name, strlen(module_name))) {
+ return kmod_req;
+ }
+ }
+
+ return NULL;
+}
+
+static void kmod_dup_request_delete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+ kmod_req = container_of(to_delayed_work(work), struct kmod_dup_req, delete_work);
+
+ /*
+ * The typical situation is a module successully loaded. In that
+ * situation the module will be present already in userspace. If
+ * new requests come in after that, userspace will already know the
+ * module is loaded so will just return 0 right away. There is still
+ * a small chance right after we delete this entry new request_module()
+ * calls may happen after that, they can happen. These heuristics
+ * are to protect finit_module() abuse for auto-loading, if modules
+ * are still tryign to auto-load even if a module is already loaded,
+ * that's on them, and those inneficiencies should not be fixed by
+ * kmod. The inneficies there are a call to modprobe and modprobe
+ * just returning 0.
+ */
+ mutex_lock(&kmod_dup_mutex);
+ list_del_rcu(&kmod_req->list);
+ synchronize_rcu();
+ mutex_unlock(&kmod_dup_mutex);
+ kfree(kmod_req);
+}
+
+static void kmod_dup_request_complete(struct work_struct *work)
+{
+ struct kmod_dup_req *kmod_req;
+
+ kmod_req = container_of(work, struct kmod_dup_req, complete_work);
+
+ /*
+ * This will ensure that the kernel will let all the waiters get
+ * informed its time to check the return value. It's time to
+ * go home.
+ */
+ complete_all(&kmod_req->first_req_done);
+
+ /*
+ * Now that we have allowed prior request_module() calls to go on
+ * with life, let's schedule deleting this entry. We don't have
+ * to do it right away, but we *eventually* want to do it so to not
+ * let this linger forever as this is just a boot optimization for
+ * possible abuses of vmalloc() incurred by finit_module() thrashing.
+ */
+ queue_delayed_work(system_wq, &kmod_req->delete_work, 60 * HZ);
+}
+
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ struct kmod_dup_req *kmod_req, *new_kmod_req;
+ int ret;
+
+ /*
+ * Pre-allocate the entry in case we have to use it later
+ * to avoid contention with the mutex.
+ */
+ new_kmod_req = kzalloc(sizeof(*new_kmod_req), GFP_KERNEL);
+ if (!new_kmod_req)
+ return false;
+
+ memcpy(new_kmod_req->name, module_name, strlen(module_name));
+ INIT_WORK(&new_kmod_req->complete_work, kmod_dup_request_complete);
+ INIT_DELAYED_WORK(&new_kmod_req->delete_work, kmod_dup_request_delete);
+ init_completion(&new_kmod_req->first_req_done);
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req) {
+ /*
+ * If the first request that came through for a module
+ * was with request_module_nowait() we cannot wait for it
+ * and share its return value with other users which may
+ * have used request_module() and need a proper return value
+ * so just skip using them as an anchor.
+ *
+ * If a prior request to this one came through with
+ * request_module() though, then a request_module_nowait()
+ * would benefit from duplicate detection.
+ */
+ if (!wait) {
+ kfree(new_kmod_req);
+ pr_debug("New request_module_nowait() for %s -- cannot track duplicates for this request\n", module_name);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+
+ /*
+ * There was no duplicate, just add the request so we can
+ * keep tab on duplicates later.
+ */
+ pr_debug("New request_module() for %s\n", module_name);
+ list_add_rcu(&new_kmod_req->list, &dup_kmod_reqs);
+ mutex_unlock(&kmod_dup_mutex);
+ return false;
+ }
+ mutex_unlock(&kmod_dup_mutex);
+
+ /* We are dealing with a duplicate request now */
+ kfree(new_kmod_req);
+
+ /*
+ * To fix these try to use try_then_request_module() instead as that
+ * will check if the component you are looking for is present or not.
+ * You could also just queue a single request to load the module once,
+ * instead of having each and everything you need try to request for
+ * the module.
+ *
+ * Duplicate request_module() calls can cause quite a bit of wasted
+ * vmalloc() space when racing with userspace.
+ */
+ if (enable_dups_trace)
+ WARN(1, "module-autoload: duplicate request for module %s\n", module_name);
+ else
+ pr_warn("module-autoload: duplicate request for module %s\n", module_name);
+
+ if (!wait) {
+ /*
+ * If request_module_nowait() was used then the user just
+ * wanted to issue the request and if another module request
+ * was already its way with the same name we don't care for
+ * the return value either. Let duplicate request_module_nowait()
+ * calls bail out right away.
+ */
+ *dup_ret = 0;
+ return true;
+ }
+
+ /*
+ * If a duplicate request_module() was used they *may* care for
+ * the return value, so we have no other option but to wait for
+ * the first caller to complete. If the first caller used
+ * the request_module_nowait() call, subsquent callers will
+ * deal with the comprmise of getting a successful call with this
+ * optimization enabled ...
+ */
+ ret = wait_for_completion_state(&kmod_req->first_req_done,
+ TASK_UNINTERRUPTIBLE | TASK_KILLABLE);
+ if (ret) {
+ *dup_ret = ret;
+ return true;
+ }
+
+ /* Now the duplicate request has the same exact return value as the first request */
+ *dup_ret = kmod_req->dup_ret;
+
+ return true;
+}
+
+void kmod_dup_request_announce(char *module_name, int ret)
+{
+ struct kmod_dup_req *kmod_req;
+
+ mutex_lock(&kmod_dup_mutex);
+
+ kmod_req = kmod_dup_request_lookup(module_name);
+ if (!kmod_req)
+ goto out;
+
+ kmod_req->dup_ret = ret;
+
+ /*
+ * If we complete() here we may allow duplicate threads
+ * to continue before the first one that submitted the
+ * request. We're in no rush also, given that each and
+ * every bounce back to userspace is slow we avoid that
+ * with a slight delay here. So queueue up the completion
+ * and let duplicates suffer, just wait a tad bit longer.
+ * There is no rush. But we also don't want to hold the
+ * caller up forever or introduce any boot delays.
+ */
+ queue_work(system_wq, &kmod_req->complete_work);
+
+out:
+ mutex_unlock(&kmod_dup_mutex);
+}
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 2e2bf236f558..dc7b0160c480 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -3,6 +3,7 @@
*
* Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
*/
#include <linux/elf.h>
@@ -17,27 +18,19 @@
#define ARCH_SHF_SMALL 0
#endif
-/* If this is set, the section belongs in the init part of the module */
-#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG - 1))
-/* Maximum number of characters written by module_flags() */
-#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
-
-#ifndef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-#define data_layout core_layout
-#endif
-
/*
- * Modules' sections will be aligned on page boundaries
- * to ensure complete separation of code and data, but
- * only when CONFIG_STRICT_MODULE_RWX=y
+ * Use highest 4 bits of sh_entsize to store the mod_mem_type of this
+ * section. This leaves 28 bits for offset on 32-bit systems, which is
+ * about 256 MiB (WARN_ON_ONCE if we exceed that).
*/
-static inline unsigned int strict_align(unsigned int size)
-{
- if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
- return PAGE_ALIGN(size);
- else
- return size;
-}
+
+#define SH_ENTSIZE_TYPE_BITS 4
+#define SH_ENTSIZE_TYPE_SHIFT (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)
+#define SH_ENTSIZE_TYPE_MASK ((1UL << SH_ENTSIZE_TYPE_BITS) - 1)
+#define SH_ENTSIZE_OFFSET_MASK ((1UL << (BITS_PER_LONG - SH_ENTSIZE_TYPE_BITS)) - 1)
+
+/* Maximum number of characters written by module_flags() */
+#define MODULE_FLAGS_BUF_SIZE (TAINT_FLAGS_COUNT + 4)
extern struct mutex module_mutex;
extern struct list_head modules;
@@ -53,7 +46,6 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[];
extern const s32 __start___kcrctab[];
extern const s32 __start___kcrctab_gpl[];
-#include <linux/dynamic_debug.h>
struct load_info {
const char *name;
/* pointer to module in temporary copy, freed at end of load_module() */
@@ -63,12 +55,14 @@ struct load_info {
Elf_Shdr *sechdrs;
char *secstrings, *strtab;
unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
- struct _ddebug_info dyndbg;
bool sig_ok;
#ifdef CONFIG_KALLSYMS
unsigned long mod_kallsyms_init_off;
#endif
#ifdef CONFIG_MODULE_DECOMPRESS
+#ifdef CONFIG_MODULE_STATS
+ unsigned long compressed_len;
+#endif
struct page **pages;
unsigned int max_pages;
unsigned int used_pages;
@@ -101,11 +95,16 @@ int try_to_force_load(struct module *mod, const char *reason);
bool find_symbol(struct find_symbol_arg *fsa);
struct module *find_module_all(const char *name, size_t len, bool even_unformed);
int cmp_name(const void *name, const void *sym);
-long module_get_offset(struct module *mod, unsigned int *size, Elf_Shdr *sechdr,
- unsigned int section);
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section);
char *module_flags(struct module *mod, char *buf, bool show_state);
size_t module_flags_taint(unsigned long taints, char *buf);
+char *module_next_tag_pair(char *string, unsigned long *secsize);
+
+#define for_each_modinfo_entry(entry, info, name) \
+ for (entry = get_modinfo(info, name); entry; entry = get_next_modinfo(info, name, entry))
+
static inline void module_assert_mutex_or_preempt(void)
{
#ifdef CONFIG_LOCKDEP
@@ -148,6 +147,95 @@ static inline bool set_livepatch_module(struct module *mod)
#endif
}
+/**
+ * enum fail_dup_mod_reason - state at which a duplicate module was detected
+ *
+ * @FAIL_DUP_MOD_BECOMING: the module is read properly, passes all checks but
+ * we've determined that another module with the same name is already loaded
+ * or being processed on our &modules list. This happens on early_mod_check()
+ * right before layout_and_allocate(). The kernel would have already
+ * vmalloc()'d space for the entire module through finit_module(). If
+ * decompression was used two vmap() spaces were used. These failures can
+ * happen when userspace has not seen the module present on the kernel and
+ * tries to load the module multiple times at same time.
+ * @FAIL_DUP_MOD_LOAD: the module has been read properly, passes all validation
+ * checks and the kernel determines that the module was unique and because
+ * of this allocated yet another private kernel copy of the module space in
+ * layout_and_allocate() but after this determined in add_unformed_module()
+ * that another module with the same name is already loaded or being processed.
+ * These failures should be mitigated as much as possible and are indicative
+ * of really fast races in loading modules. Without module decompression
+ * they waste twice as much vmap space. With module decompression three
+ * times the module's size vmap space is wasted.
+ */
+enum fail_dup_mod_reason {
+ FAIL_DUP_MOD_BECOMING = 0,
+ FAIL_DUP_MOD_LOAD,
+};
+
+#ifdef CONFIG_MODULE_DEBUGFS
+extern struct dentry *mod_debugfs_root;
+#endif
+
+#ifdef CONFIG_MODULE_STATS
+
+#define mod_stat_add_long(count, var) atomic_long_add(count, var)
+#define mod_stat_inc(name) atomic_inc(name)
+
+extern atomic_long_t total_mod_size;
+extern atomic_long_t total_text_size;
+extern atomic_long_t invalid_kread_bytes;
+extern atomic_long_t invalid_decompress_bytes;
+
+extern atomic_t modcount;
+extern atomic_t failed_kreads;
+extern atomic_t failed_decompress;
+struct mod_fail_load {
+ struct list_head list;
+ char name[MODULE_NAME_LEN];
+ atomic_long_t count;
+ unsigned long dup_fail_mask;
+};
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason);
+void mod_stat_bump_invalid(struct load_info *info, int flags);
+void mod_stat_bump_becoming(struct load_info *info, int flags);
+
+#else
+
+#define mod_stat_add_long(name, var)
+#define mod_stat_inc(name)
+
+static inline int try_add_failed_module(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ return 0;
+}
+
+static inline void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+}
+
+static inline void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+}
+
+#endif /* CONFIG_MODULE_STATS */
+
+#ifdef CONFIG_MODULE_DEBUG_AUTOLOAD_DUPS
+bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret);
+void kmod_dup_request_announce(char *module_name, int ret);
+#else
+static inline bool kmod_dup_request_exists_wait(char *module_name, bool wait, int *dup_ret)
+{
+ return false;
+}
+
+static inline void kmod_dup_request_announce(char *module_name, int ret)
+{
+}
+#endif
+
#ifdef CONFIG_MODULE_UNLOAD_TAINT_TRACKING
struct mod_unload_taint {
struct list_head list;
@@ -190,10 +278,13 @@ struct mod_tree_root {
#endif
unsigned long addr_min;
unsigned long addr_max;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ unsigned long data_addr_min;
+ unsigned long data_addr_max;
+#endif
};
extern struct mod_tree_root mod_tree;
-extern struct mod_tree_root mod_data_tree;
#ifdef CONFIG_MODULES_TREE_LOOKUP
void mod_tree_insert(struct module *mod);
@@ -224,7 +315,6 @@ void module_enable_nx(const struct module *mod);
void module_enable_x(const struct module *mod);
int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
char *secstrings, struct module *mod);
-bool module_check_misalignment(const struct module *mod);
#ifdef CONFIG_MODULE_SIG
int module_sig_check(struct load_info *info, int flags);
@@ -246,7 +336,6 @@ static inline void kmemleak_load_module(const struct module *mod,
void init_build_id(struct module *mod, const struct load_info *info);
void layout_symtab(struct module *mod, struct load_info *info);
void add_kallsyms(struct module *mod, const struct load_info *info);
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
static inline bool sect_empty(const Elf_Shdr *sect)
{
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index ab2376a1be88..c550d7d45f2f 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -6,6 +6,7 @@
*/
#include <linux/module.h>
+#include <linux/module_symbol.h>
#include <linux/kallsyms.h>
#include <linux/buildid.h>
#include <linux/bsearch.h>
@@ -78,6 +79,7 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
unsigned int shnum, unsigned int pcpundx)
{
const Elf_Shdr *sec;
+ enum mod_mem_type type;
if (src->st_shndx == SHN_UNDEF ||
src->st_shndx >= shnum ||
@@ -90,11 +92,12 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
#endif
sec = sechdrs + src->st_shndx;
+ type = sec->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
if (!(sec->sh_flags & SHF_ALLOC)
#ifndef CONFIG_KALLSYMS_ALL
|| !(sec->sh_flags & SHF_EXECINSTR)
#endif
- || (sec->sh_entsize & INIT_OFFSET_MASK))
+ || mod_mem_type_is_init(type))
return false;
return true;
@@ -113,11 +116,13 @@ void layout_symtab(struct module *mod, struct load_info *info)
Elf_Shdr *strsect = info->sechdrs + info->index.str;
const Elf_Sym *src;
unsigned int i, nsrc, ndst, strtab_size = 0;
+ struct module_memory *mod_mem_data = &mod->mem[MOD_DATA];
+ struct module_memory *mod_mem_init_data = &mod->mem[MOD_INIT_DATA];
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
- symsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, symsect,
- info->index.sym) | INIT_OFFSET_MASK;
+ symsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ symsect, info->index.sym);
pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
src = (void *)info->hdr + symsect->sh_offset;
@@ -134,28 +139,27 @@ void layout_symtab(struct module *mod, struct load_info *info)
}
/* Append room for core symbols at end of core part. */
- info->symoffs = ALIGN(mod->data_layout.size, symsect->sh_addralign ?: 1);
- info->stroffs = mod->data_layout.size = info->symoffs + ndst * sizeof(Elf_Sym);
- mod->data_layout.size += strtab_size;
+ info->symoffs = ALIGN(mod_mem_data->size, symsect->sh_addralign ?: 1);
+ info->stroffs = mod_mem_data->size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod_mem_data->size += strtab_size;
/* Note add_kallsyms() computes strtab_size as core_typeoffs - stroffs */
- info->core_typeoffs = mod->data_layout.size;
- mod->data_layout.size += ndst * sizeof(char);
- mod->data_layout.size = strict_align(mod->data_layout.size);
+ info->core_typeoffs = mod_mem_data->size;
+ mod_mem_data->size += ndst * sizeof(char);
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
- strsect->sh_entsize = module_get_offset(mod, &mod->init_layout.size, strsect,
- info->index.str) | INIT_OFFSET_MASK;
+ strsect->sh_entsize = module_get_offset_and_type(mod, MOD_INIT_DATA,
+ strsect, info->index.str);
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
/* We'll tack temporary mod_kallsyms on the end. */
- mod->init_layout.size = ALIGN(mod->init_layout.size,
- __alignof__(struct mod_kallsyms));
- info->mod_kallsyms_init_off = mod->init_layout.size;
- mod->init_layout.size += sizeof(struct mod_kallsyms);
- info->init_typeoffs = mod->init_layout.size;
- mod->init_layout.size += nsrc * sizeof(char);
- mod->init_layout.size = strict_align(mod->init_layout.size);
+ mod_mem_init_data->size = ALIGN(mod_mem_init_data->size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod_mem_init_data->size;
+
+ mod_mem_init_data->size += sizeof(struct mod_kallsyms);
+ info->init_typeoffs = mod_mem_init_data->size;
+ mod_mem_init_data->size += nsrc * sizeof(char);
}
/*
@@ -171,9 +175,11 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
char *s;
Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
unsigned long strtab_size;
+ void *data_base = mod->mem[MOD_DATA].base;
+ void *init_data_base = mod->mem[MOD_INIT_DATA].base;
/* Set up to point into init section. */
- mod->kallsyms = (void __rcu *)mod->init_layout.base +
+ mod->kallsyms = (void __rcu *)init_data_base +
info->mod_kallsyms_init_off;
rcu_read_lock();
@@ -183,15 +189,15 @@ void add_kallsyms(struct module *mod, const struct load_info *info)
/* Make sure we get permanent strtab: don't use info->strtab. */
rcu_dereference(mod->kallsyms)->strtab =
(void *)info->sechdrs[info->index.str].sh_addr;
- rcu_dereference(mod->kallsyms)->typetab = mod->init_layout.base + info->init_typeoffs;
+ rcu_dereference(mod->kallsyms)->typetab = init_data_base + info->init_typeoffs;
/*
* Now populate the cut down core kallsyms for after init
* and set types up while we still have access to sections.
*/
- mod->core_kallsyms.symtab = dst = mod->data_layout.base + info->symoffs;
- mod->core_kallsyms.strtab = s = mod->data_layout.base + info->stroffs;
- mod->core_kallsyms.typetab = mod->data_layout.base + info->core_typeoffs;
+ mod->core_kallsyms.symtab = dst = data_base + info->symoffs;
+ mod->core_kallsyms.strtab = s = data_base + info->stroffs;
+ mod->core_kallsyms.typetab = data_base + info->core_typeoffs;
strtab_size = info->core_typeoffs - info->stroffs;
src = rcu_dereference(mod->kallsyms)->symtab;
for (ndst = i = 0; i < rcu_dereference(mod->kallsyms)->num_symtab; i++) {
@@ -238,18 +244,6 @@ void init_build_id(struct module *mod, const struct load_info *info)
}
#endif
-/*
- * This ignores the intensely annoying "mapping symbols" found
- * in ARM ELF files: $a, $t and $d.
- */
-static inline int is_arm_mapping_symbol(const char *str)
-{
- if (str[0] == '.' && str[1] == 'L')
- return true;
- return str[0] == '$' && strchr("axtd", str[1]) &&
- (str[2] == '\0' || str[2] == '.');
-}
-
static const char *kallsyms_symbol_name(struct mod_kallsyms *kallsyms, unsigned int symnum)
{
return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
@@ -267,12 +261,15 @@ static const char *find_kallsyms_symbol(struct module *mod,
unsigned int i, best = 0;
unsigned long nextval, bestval;
struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
+ struct module_memory *mod_mem;
/* At worse, next value is at end of module */
if (within_module_init(addr, mod))
- nextval = (unsigned long)mod->init_layout.base + mod->init_layout.text_size;
+ mod_mem = &mod->mem[MOD_INIT_TEXT];
else
- nextval = (unsigned long)mod->core_layout.base + mod->core_layout.text_size;
+ mod_mem = &mod->mem[MOD_TEXT];
+
+ nextval = (unsigned long)mod_mem->base + mod_mem->size;
bestval = kallsyms_symbol_value(&kallsyms->symtab[best]);
@@ -292,7 +289,7 @@ static const char *find_kallsyms_symbol(struct module *mod,
* and inserted at a whim.
*/
if (*kallsyms_symbol_name(kallsyms, i) == '\0' ||
- is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
+ is_mapping_symbol(kallsyms_symbol_name(kallsyms, i)))
continue;
if (thisval <= addr && thisval > bestval) {
@@ -442,7 +439,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
}
/* Given a module and name of symbol, find and return the symbol's value */
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+static unsigned long __find_kallsyms_symbol_value(struct module *mod, const char *name)
{
unsigned int i;
struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
@@ -466,7 +463,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
if (colon) {
mod = find_module_all(name, colon - name, false);
if (mod)
- return find_kallsyms_symbol_value(mod, colon + 1);
+ return __find_kallsyms_symbol_value(mod, colon + 1);
return 0;
}
@@ -475,7 +472,7 @@ static unsigned long __module_kallsyms_lookup_name(const char *name)
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- ret = find_kallsyms_symbol_value(mod, name);
+ ret = __find_kallsyms_symbol_value(mod, name);
if (ret)
return ret;
}
@@ -494,9 +491,18 @@ unsigned long module_kallsyms_lookup_name(const char *name)
return ret;
}
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name)
+{
+ unsigned long ret;
+
+ preempt_disable();
+ ret = __find_kallsyms_symbol_value(mod, name);
+ preempt_enable();
+ return ret;
+}
+
int module_kallsyms_on_each_symbol(const char *modname,
- int (*fn)(void *, const char *,
- struct module *, unsigned long),
+ int (*fn)(void *, const char *, unsigned long),
void *data)
{
struct module *mod;
@@ -525,7 +531,7 @@ int module_kallsyms_on_each_symbol(const char *modname,
continue;
ret = fn(data, kallsyms_symbol_name(kallsyms, i),
- mod, kallsyms_symbol_value(sym));
+ kallsyms_symbol_value(sym));
if (ret != 0)
goto out;
}
diff --git a/kernel/module/kdb.c b/kernel/module/kdb.c
index f4317f92e189..995c32d3698f 100644
--- a/kernel/module/kdb.c
+++ b/kernel/module/kdb.c
@@ -26,10 +26,11 @@ int kdb_lsmod(int argc, const char **argv)
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- kdb_printf("%-20s%8u", mod->name, mod->core_layout.size);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- kdb_printf("/%8u", mod->data_layout.size);
-#endif
+ kdb_printf("%-20s%8u", mod->name, mod->mem[MOD_TEXT].size);
+ kdb_printf("/%8u", mod->mem[MOD_RODATA].size);
+ kdb_printf("/%8u", mod->mem[MOD_RO_AFTER_INIT].size);
+ kdb_printf("/%8u", mod->mem[MOD_DATA].size);
+
kdb_printf(" 0x%px ", (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
kdb_printf("%4d ", module_refcount(mod));
@@ -40,10 +41,10 @@ int kdb_lsmod(int argc, const char **argv)
kdb_printf(" (Loading)");
else
kdb_printf(" (Live)");
- kdb_printf(" 0x%px", mod->core_layout.base);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- kdb_printf("/0x%px", mod->data_layout.base);
-#endif
+ kdb_printf(" 0x%px", mod->mem[MOD_TEXT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RODATA].base);
+ kdb_printf("/0x%px", mod->mem[MOD_RO_AFTER_INIT].base);
+ kdb_printf("/0x%px", mod->mem[MOD_DATA].base);
#ifdef CONFIG_MODULE_UNLOAD
{
diff --git a/kernel/kmod.c b/kernel/module/kmod.c
index b717134ebe17..0800d9891692 100644
--- a/kernel/kmod.c
+++ b/kernel/module/kmod.c
@@ -1,6 +1,9 @@
/*
* kmod - the kernel module loader
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
*/
+
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -27,6 +30,7 @@
#include <linux/uaccess.h>
#include <trace/events/module.h>
+#include "internal.h"
/*
* Assuming:
@@ -40,8 +44,7 @@
* effect. Systems like these are very unlikely if modules are enabled.
*/
#define MAX_KMOD_CONCURRENT 50
-static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
-static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
+static DEFINE_SEMAPHORE(kmod_concurrent_max, MAX_KMOD_CONCURRENT);
/*
* This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
@@ -66,7 +69,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
kfree(info->argv);
}
-static int call_modprobe(char *module_name, int wait)
+static int call_modprobe(char *orig_module_name, int wait)
{
struct subprocess_info *info;
static char *envp[] = {
@@ -75,12 +78,14 @@ static int call_modprobe(char *module_name, int wait)
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL
};
+ char *module_name;
+ int ret;
char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
if (!argv)
goto out;
- module_name = kstrdup(module_name, GFP_KERNEL);
+ module_name = kstrdup(orig_module_name, GFP_KERNEL);
if (!module_name)
goto free_argv;
@@ -95,13 +100,16 @@ static int call_modprobe(char *module_name, int wait)
if (!info)
goto free_module_name;
- return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+ ret = call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+ kmod_dup_request_announce(orig_module_name, ret);
+ return ret;
free_module_name:
kfree(module_name);
free_argv:
kfree(argv);
out:
+ kmod_dup_request_announce(orig_module_name, -ENOMEM);
return -ENOMEM;
}
@@ -125,7 +133,7 @@ int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
- int ret;
+ int ret, dup_ret;
/*
* We don't allow synchronous module loading from async. Module
@@ -148,29 +156,24 @@ int __request_module(bool wait, const char *fmt, ...)
if (ret)
return ret;
- if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
- pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
- atomic_read(&kmod_concurrent_max),
- MAX_KMOD_CONCURRENT, module_name);
- ret = wait_event_killable_timeout(kmod_wq,
- atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
- MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
- if (!ret) {
- pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
- module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
- return -ETIME;
- } else if (ret == -ERESTARTSYS) {
- pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
- return ret;
- }
+ ret = down_timeout(&kmod_concurrent_max, MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return ret;
}
trace_module_request(module_name, wait, _RET_IP_);
+ if (kmod_dup_request_exists_wait(module_name, wait, &dup_ret)) {
+ ret = dup_ret;
+ goto out;
+ }
+
ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
- atomic_inc(&kmod_concurrent_max);
- wake_up(&kmod_wq);
+out:
+ up(&kmod_concurrent_max);
return ret;
}
diff --git a/kernel/module/livepatch.c b/kernel/module/livepatch.c
index 486d4ff92719..a89f01e1d6b7 100644
--- a/kernel/module/livepatch.c
+++ b/kernel/module/livepatch.c
@@ -11,7 +11,7 @@
#include "internal.h"
/*
- * Persist Elf information about a module. Copy the Elf header,
+ * Persist ELF information about a module. Copy the ELF header,
* section header table, section string table, and symtab section
* index from info to mod->klp_info.
*/
@@ -25,11 +25,11 @@ int copy_module_elf(struct module *mod, struct load_info *info)
if (!mod->klp_info)
return -ENOMEM;
- /* Elf header */
+ /* ELF header */
size = sizeof(mod->klp_info->hdr);
memcpy(&mod->klp_info->hdr, info->hdr, size);
- /* Elf section header table */
+ /* ELF section header table */
size = sizeof(*info->sechdrs) * info->hdr->e_shnum;
mod->klp_info->sechdrs = kmemdup(info->sechdrs, size, GFP_KERNEL);
if (!mod->klp_info->sechdrs) {
@@ -37,7 +37,7 @@ int copy_module_elf(struct module *mod, struct load_info *info)
goto free_info;
}
- /* Elf section name string table */
+ /* ELF section name string table */
size = info->sechdrs[info->hdr->e_shstrndx].sh_size;
mod->klp_info->secstrings = kmemdup(info->secstrings, size, GFP_KERNEL);
if (!mod->klp_info->secstrings) {
@@ -45,7 +45,7 @@ int copy_module_elf(struct module *mod, struct load_info *info)
goto free_sechdrs;
}
- /* Elf symbol section index */
+ /* ELF symbol section index */
symndx = info->index.sym;
mod->klp_info->symndx = symndx;
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d3be89de706d..044aa2c9e3cb 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2,6 +2,7 @@
/*
* Copyright (C) 2002 Richard Henderson
* Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
*/
#define INCLUDE_VERMAGIC
@@ -55,6 +56,7 @@
#include <linux/dynamic_debug.h>
#include <linux/audit.h>
#include <linux/cfi.h>
+#include <linux/debugfs.h>
#include <uapi/linux/module.h>
#include "internal.h"
@@ -80,12 +82,6 @@ struct mod_tree_root mod_tree __cacheline_aligned = {
.addr_min = -1UL,
};
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-struct mod_tree_root mod_data_tree __cacheline_aligned = {
- .addr_min = -1UL,
-};
-#endif
-
struct symsearch {
const struct kernel_symbol *start, *stop;
const s32 *crcs;
@@ -93,14 +89,24 @@ struct symsearch {
};
/*
- * Bounds of module text, for speeding up __module_address.
+ * Bounds of module memory, for speeding up __module_address.
* Protected by module_mutex.
*/
-static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_root *tree)
+static void __mod_update_bounds(enum mod_mem_type type __maybe_unused, void *base,
+ unsigned int size, struct mod_tree_root *tree)
{
unsigned long min = (unsigned long)base;
unsigned long max = min + size;
+#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
+ if (mod_mem_type_is_core_data(type)) {
+ if (min < tree->data_addr_min)
+ tree->data_addr_min = min;
+ if (max > tree->data_addr_max)
+ tree->data_addr_max = max;
+ return;
+ }
+#endif
if (min < tree->addr_min)
tree->addr_min = min;
if (max > tree->addr_max)
@@ -109,12 +115,12 @@ static void __mod_update_bounds(void *base, unsigned int size, struct mod_tree_r
static void mod_update_bounds(struct module *mod)
{
- __mod_update_bounds(mod->core_layout.base, mod->core_layout.size, &mod_tree);
- if (mod->init_layout.size)
- __mod_update_bounds(mod->init_layout.base, mod->init_layout.size, &mod_tree);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- __mod_update_bounds(mod->data_layout.base, mod->data_layout.size, &mod_data_tree);
-#endif
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
+
+ if (mod_mem->size)
+ __mod_update_bounds(type, mod_mem->base, mod_mem->size, &mod_tree);
+ }
}
/* Block module loading/unloading? */
@@ -559,10 +565,8 @@ static int already_uses(struct module *a, struct module *b)
struct module_use *use;
list_for_each_entry(use, &b->source_list, source_list) {
- if (use->source == a) {
- pr_debug("%s uses %s!\n", a->name, b->name);
+ if (use->source == a)
return 1;
- }
}
pr_debug("%s does not use %s!\n", a->name, b->name);
return 0;
@@ -926,7 +930,13 @@ struct module_attribute module_uevent =
static ssize_t show_coresize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->core_layout.size);
+ unsigned int size = mk->mod->mem[MOD_TEXT].size;
+
+ if (!IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC)) {
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ }
+ return sprintf(buffer, "%u\n", size);
}
static struct module_attribute modinfo_coresize =
@@ -936,7 +946,11 @@ static struct module_attribute modinfo_coresize =
static ssize_t show_datasize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->data_layout.size);
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, core_data)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
}
static struct module_attribute modinfo_datasize =
@@ -946,7 +960,11 @@ static struct module_attribute modinfo_datasize =
static ssize_t show_initsize(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", mk->mod->init_layout.size);
+ unsigned int size = 0;
+
+ for_class_mod_mem_type(type, init)
+ size += mk->mod->mem[type].size;
+ return sprintf(buffer, "%u\n", size);
}
static struct module_attribute modinfo_initsize =
@@ -998,9 +1016,55 @@ int try_to_force_load(struct module *mod, const char *reason)
#endif
}
-static char *get_modinfo(const struct load_info *info, const char *tag);
+/* Parse tag=value strings from .modinfo section */
+char *module_next_tag_pair(char *string, unsigned long *secsize)
+{
+ /* Skip non-zero chars */
+ while (string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+
+ /* Skip any zero padding. */
+ while (!string[0]) {
+ string++;
+ if ((*secsize)-- <= 1)
+ return NULL;
+ }
+ return string;
+}
+
static char *get_next_modinfo(const struct load_info *info, const char *tag,
- char *prev);
+ char *prev)
+{
+ char *p;
+ unsigned int taglen = strlen(tag);
+ Elf_Shdr *infosec = &info->sechdrs[info->index.info];
+ unsigned long size = infosec->sh_size;
+
+ /*
+ * get_modinfo() calls made before rewrite_section_headers()
+ * must use sh_offset, as sh_addr isn't set!
+ */
+ char *modinfo = (char *)info->hdr + infosec->sh_offset;
+
+ if (prev) {
+ size -= prev - modinfo;
+ modinfo = module_next_tag_pair(prev, &size);
+ }
+
+ for (p = modinfo; p; p = module_next_tag_pair(p, &size)) {
+ if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
+ return p + taglen + 1;
+ }
+ return NULL;
+}
+
+static char *get_modinfo(const struct load_info *info, const char *tag)
+{
+ return get_next_modinfo(info, tag, NULL);
+}
static int verify_namespace_is_imported(const struct load_info *info,
const struct kernel_symbol *sym,
@@ -1011,12 +1075,9 @@ static int verify_namespace_is_imported(const struct load_info *info,
namespace = kernel_symbol_namespace(sym);
if (namespace && namespace[0]) {
- imported_namespace = get_modinfo(info, "import_ns");
- while (imported_namespace) {
+ for_each_modinfo_entry(imported_namespace, info, "import_ns") {
if (strcmp(namespace, imported_namespace) == 0)
return 0;
- imported_namespace = get_next_modinfo(
- info, "import_ns", imported_namespace);
}
#ifdef CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
pr_warn(
@@ -1143,6 +1204,46 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
+static bool mod_mem_use_vmalloc(enum mod_mem_type type)
+{
+ return IS_ENABLED(CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC) &&
+ mod_mem_type_is_core_data(type);
+}
+
+static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
+{
+ if (mod_mem_use_vmalloc(type))
+ return vzalloc(size);
+ return module_alloc(size);
+}
+
+static void module_memory_free(void *ptr, enum mod_mem_type type)
+{
+ if (mod_mem_use_vmalloc(type))
+ vfree(ptr);
+ else
+ module_memfree(ptr);
+}
+
+static void free_mod_mem(struct module *mod)
+{
+ for_each_mod_mem_type(type) {
+ struct module_memory *mod_mem = &mod->mem[type];
+
+ if (type == MOD_DATA)
+ continue;
+
+ /* Free lock-classes; relies on the preceding sync_rcu(). */
+ lockdep_free_key_range(mod_mem->base, mod_mem->size);
+ if (mod_mem->size)
+ module_memory_free(mod_mem->base, type);
+ }
+
+ /* MOD_DATA hosts mod, so free it at last */
+ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size);
+ module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA);
+}
+
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
@@ -1158,9 +1259,6 @@ static void free_module(struct module *mod)
mod->state = MODULE_STATE_UNFORMED;
mutex_unlock(&module_mutex);
- /* Remove dynamic debug info */
- ddebug_remove_module(mod->name);
-
/* Arch-specific cleanup. */
module_arch_cleanup(mod);
@@ -1189,18 +1287,10 @@ static void free_module(struct module *mod)
/* This may be empty, but that's OK */
module_arch_freeing_init(mod);
- module_memfree(mod->init_layout.base);
kfree(mod->args);
percpu_modfree(mod);
- /* Free lock-classes; relies on the preceding sync_rcu(). */
- lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
-
- /* Finally, free the core (containing the module structure) */
- module_memfree(mod->core_layout.base);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- vfree(mod->data_layout.base);
-#endif
+ free_mod_mem(mod);
}
void *__symbol_get(const char *symbol)
@@ -1303,8 +1393,8 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
case SHN_ABS:
/* Don't need to do anything */
- pr_debug("Absolute symbol: 0x%08lx\n",
- (long)sym[i].st_value);
+ pr_debug("Absolute symbol: 0x%08lx %s\n",
+ (long)sym[i].st_value, name);
break;
case SHN_LIVEPATCH:
@@ -1387,16 +1477,18 @@ unsigned int __weak arch_mod_section_prepend(struct module *mod,
return 0;
}
-/* Update size with this section: return offset. */
-long module_get_offset(struct module *mod, unsigned int *size,
- Elf_Shdr *sechdr, unsigned int section)
+long module_get_offset_and_type(struct module *mod, enum mod_mem_type type,
+ Elf_Shdr *sechdr, unsigned int section)
{
- long ret;
+ long offset;
+ long mask = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) << SH_ENTSIZE_TYPE_SHIFT;
- *size += arch_mod_section_prepend(mod, section);
- ret = ALIGN(*size, sechdr->sh_addralign ?: 1);
- *size = ret + sechdr->sh_size;
- return ret;
+ mod->mem[type].size += arch_mod_section_prepend(mod, section);
+ offset = ALIGN(mod->mem[type].size, sechdr->sh_addralign ?: 1);
+ mod->mem[type].size = offset + sechdr->sh_size;
+
+ WARN_ON_ONCE(offset & mask);
+ return offset | mask;
}
static bool module_init_layout_section(const char *sname)
@@ -1408,15 +1500,11 @@ static bool module_init_layout_section(const char *sname)
return module_init_section(sname);
}
-/*
- * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
- * might -- code, read-only data, read-write data, small data. Tally
- * sizes, and place the offsets into sh_entsize fields: high bit means it
- * belongs in init.
- */
-static void layout_sections(struct module *mod, struct load_info *info)
+static void __layout_sections(struct module *mod, struct load_info *info, bool is_init)
{
- static unsigned long const masks[][2] = {
+ unsigned int m, i;
+
+ static const unsigned long masks[][2] = {
/*
* NOTE: all executable code must be the first section
* in this array; otherwise modify the text_size
@@ -1428,85 +1516,64 @@ static void layout_sections(struct module *mod, struct load_info *info)
{ SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL },
{ ARCH_SHF_SMALL | SHF_ALLOC, 0 }
};
- unsigned int m, i;
-
- for (i = 0; i < info->hdr->e_shnum; i++)
- info->sechdrs[i].sh_entsize = ~0UL;
+ static const int core_m_to_mem_type[] = {
+ MOD_TEXT,
+ MOD_RODATA,
+ MOD_RO_AFTER_INIT,
+ MOD_DATA,
+ MOD_INVALID, /* This is needed to match the masks array */
+ };
+ static const int init_m_to_mem_type[] = {
+ MOD_INIT_TEXT,
+ MOD_INIT_RODATA,
+ MOD_INVALID,
+ MOD_INIT_DATA,
+ MOD_INVALID, /* This is needed to match the masks array */
+ };
- pr_debug("Core section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
+ enum mod_mem_type type = is_init ? init_m_to_mem_type[m] : core_m_to_mem_type[m];
+
for (i = 0; i < info->hdr->e_shnum; ++i) {
Elf_Shdr *s = &info->sechdrs[i];
const char *sname = info->secstrings + s->sh_name;
- unsigned int *sizep;
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
- || module_init_layout_section(sname))
+ || is_init != module_init_layout_section(sname))
continue;
- sizep = m ? &mod->data_layout.size : &mod->core_layout.size;
- s->sh_entsize = module_get_offset(mod, sizep, s, i);
- pr_debug("\t%s\n", sname);
- }
- switch (m) {
- case 0: /* executable */
- mod->core_layout.size = strict_align(mod->core_layout.size);
- mod->core_layout.text_size = mod->core_layout.size;
- break;
- case 1: /* RO: text and ro-data */
- mod->data_layout.size = strict_align(mod->data_layout.size);
- mod->data_layout.ro_size = mod->data_layout.size;
- break;
- case 2: /* RO after init */
- mod->data_layout.size = strict_align(mod->data_layout.size);
- mod->data_layout.ro_after_init_size = mod->data_layout.size;
- break;
- case 4: /* whole core */
- mod->data_layout.size = strict_align(mod->data_layout.size);
- break;
- }
- }
-
- pr_debug("Init section allocation order:\n");
- for (m = 0; m < ARRAY_SIZE(masks); ++m) {
- for (i = 0; i < info->hdr->e_shnum; ++i) {
- Elf_Shdr *s = &info->sechdrs[i];
- const char *sname = info->secstrings + s->sh_name;
- if ((s->sh_flags & masks[m][0]) != masks[m][0]
- || (s->sh_flags & masks[m][1])
- || s->sh_entsize != ~0UL
- || !module_init_layout_section(sname))
+ if (WARN_ON_ONCE(type == MOD_INVALID))
continue;
- s->sh_entsize = (module_get_offset(mod, &mod->init_layout.size, s, i)
- | INIT_OFFSET_MASK);
+
+ s->sh_entsize = module_get_offset_and_type(mod, type, s, i);
pr_debug("\t%s\n", sname);
}
- switch (m) {
- case 0: /* executable */
- mod->init_layout.size = strict_align(mod->init_layout.size);
- mod->init_layout.text_size = mod->init_layout.size;
- break;
- case 1: /* RO: text and ro-data */
- mod->init_layout.size = strict_align(mod->init_layout.size);
- mod->init_layout.ro_size = mod->init_layout.size;
- break;
- case 2:
- /*
- * RO after init doesn't apply to init_layout (only
- * core_layout), so it just takes the value of ro_size.
- */
- mod->init_layout.ro_after_init_size = mod->init_layout.ro_size;
- break;
- case 4: /* whole init */
- mod->init_layout.size = strict_align(mod->init_layout.size);
- break;
- }
}
}
-static void set_license(struct module *mod, const char *license)
+/*
+ * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld
+ * might -- code, read-only data, read-write data, small data. Tally
+ * sizes, and place the offsets into sh_entsize fields: high bit means it
+ * belongs in init.
+ */
+static void layout_sections(struct module *mod, struct load_info *info)
+{
+ unsigned int i;
+
+ for (i = 0; i < info->hdr->e_shnum; i++)
+ info->sechdrs[i].sh_entsize = ~0UL;
+
+ pr_debug("Core section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, false);
+
+ pr_debug("Init section allocation order for %s:\n", mod->name);
+ __layout_sections(mod, info, true);
+}
+
+static void module_license_taint_check(struct module *mod, const char *license)
{
if (!license)
license = "unspecified";
@@ -1520,56 +1587,6 @@ static void set_license(struct module *mod, const char *license)
}
}
-/* Parse tag=value strings from .modinfo section */
-static char *next_string(char *string, unsigned long *secsize)
-{
- /* Skip non-zero chars */
- while (string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
-
- /* Skip any zero padding. */
- while (!string[0]) {
- string++;
- if ((*secsize)-- <= 1)
- return NULL;
- }
- return string;
-}
-
-static char *get_next_modinfo(const struct load_info *info, const char *tag,
- char *prev)
-{
- char *p;
- unsigned int taglen = strlen(tag);
- Elf_Shdr *infosec = &info->sechdrs[info->index.info];
- unsigned long size = infosec->sh_size;
-
- /*
- * get_modinfo() calls made before rewrite_section_headers()
- * must use sh_offset, as sh_addr isn't set!
- */
- char *modinfo = (char *)info->hdr + infosec->sh_offset;
-
- if (prev) {
- size -= prev - modinfo;
- modinfo = next_string(prev, &size);
- }
-
- for (p = modinfo; p; p = next_string(p, &size)) {
- if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=')
- return p + taglen + 1;
- }
- return NULL;
-}
-
-static char *get_modinfo(const struct load_info *info, const char *tag)
-{
- return get_next_modinfo(info, tag, NULL);
-}
-
static void setup_modinfo(struct module *mod, struct load_info *info)
{
struct module_attribute *attr;
@@ -1592,19 +1609,6 @@ static void free_modinfo(struct module *mod)
}
}
-static void dynamic_debug_setup(struct module *mod, struct _ddebug_info *dyndbg)
-{
- if (!dyndbg->num_descs)
- return;
- ddebug_add_module(dyndbg, mod->name);
-}
-
-static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg)
-{
- if (dyndbg->num_descs)
- ddebug_remove_module(mod->name);
-}
-
void * __weak module_alloc(unsigned long size)
{
return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
@@ -1642,16 +1646,33 @@ static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
}
/*
- * Sanity checks against invalid binaries, wrong arch, weird elf version.
+ * Check userspace passed ELF module against our expectations, and cache
+ * useful variables for further processing as we go.
*
- * Also do basic validity checks against section offsets and sizes, the
+ * This does basic validity checks against section offsets and sizes, the
* section name string table, and the indices used for it (sh_name).
+ *
+ * As a last step, since we're already checking the ELF sections we cache
+ * useful variables which will be used later for our convenience:
+ *
+ * o pointers to section headers
+ * o cache the modinfo symbol section
+ * o cache the string symbol section
+ * o cache the module section
+ *
+ * As a last step we set info->mod to the temporary copy of the module in
+ * info->hdr. The final one will be allocated in move_module(). Any
+ * modifications we make to our copy of the module will be carried over
+ * to the final minted module.
*/
-static int elf_validity_check(struct load_info *info)
+static int elf_validity_cache_copy(struct load_info *info, int flags)
{
unsigned int i;
Elf_Shdr *shdr, *strhdr;
int err;
+ unsigned int num_mod_secs = 0, mod_idx;
+ unsigned int num_info_secs = 0, info_idx;
+ unsigned int num_sym_secs = 0, sym_idx;
if (info->len < sizeof(*(info->hdr))) {
pr_err("Invalid ELF header len %lu\n", info->len);
@@ -1755,6 +1776,8 @@ static int elf_validity_check(struct load_info *info)
info->hdr->e_shnum);
goto no_exec;
}
+ num_sym_secs++;
+ sym_idx = i;
fallthrough;
default:
err = validate_section_offset(info, shdr);
@@ -1763,6 +1786,15 @@ static int elf_validity_check(struct load_info *info)
i, shdr->sh_type);
return err;
}
+ if (strcmp(info->secstrings + shdr->sh_name,
+ ".gnu.linkonce.this_module") == 0) {
+ num_mod_secs++;
+ mod_idx = i;
+ } else if (strcmp(info->secstrings + shdr->sh_name,
+ ".modinfo") == 0) {
+ num_info_secs++;
+ info_idx = i;
+ }
if (shdr->sh_flags & SHF_ALLOC) {
if (shdr->sh_name >= strhdr->sh_size) {
@@ -1775,6 +1807,91 @@ static int elf_validity_check(struct load_info *info)
}
}
+ if (num_info_secs > 1) {
+ pr_err("Only one .modinfo section must exist.\n");
+ goto no_exec;
+ } else if (num_info_secs == 1) {
+ /* Try to find a name early so we can log errors with a module name */
+ info->index.info = info_idx;
+ info->name = get_modinfo(info, "name");
+ }
+
+ if (num_sym_secs != 1) {
+ pr_warn("%s: module has no symbols (stripped?)\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ /* Sets internal symbols and strings. */
+ info->index.sym = sym_idx;
+ shdr = &info->sechdrs[sym_idx];
+ info->index.str = shdr->sh_link;
+ info->strtab = (char *)info->hdr + info->sechdrs[info->index.str].sh_offset;
+
+ /*
+ * The ".gnu.linkonce.this_module" ELF section is special. It is
+ * what modpost uses to refer to __this_module and let's use rely
+ * on THIS_MODULE to point to &__this_module properly. The kernel's
+ * modpost declares it on each modules's *.mod.c file. If the struct
+ * module of the kernel changes a full kernel rebuild is required.
+ *
+ * We have a few expectaions for this special section, the following
+ * code validates all this for us:
+ *
+ * o Only one section must exist
+ * o We expect the kernel to always have to allocate it: SHF_ALLOC
+ * o The section size must match the kernel's run time's struct module
+ * size
+ */
+ if (num_mod_secs != 1) {
+ pr_err("module %s: Only one .gnu.linkonce.this_module section must exist.\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ shdr = &info->sechdrs[mod_idx];
+
+ /*
+ * This is already implied on the switch above, however let's be
+ * pedantic about it.
+ */
+ if (shdr->sh_type == SHT_NOBITS) {
+ pr_err("module %s: .gnu.linkonce.this_module section must have a size set\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ if (!(shdr->sh_flags & SHF_ALLOC)) {
+ pr_err("module %s: .gnu.linkonce.this_module must occupy memory during process execution\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ if (shdr->sh_size != sizeof(struct module)) {
+ pr_err("module %s: .gnu.linkonce.this_module section size must match the kernel's built struct module size at run time\n",
+ info->name ?: "(missing .modinfo section or name field)");
+ goto no_exec;
+ }
+
+ info->index.mod = mod_idx;
+
+ /* This is temporary: point mod into copy of data. */
+ info->mod = (void *)info->hdr + shdr->sh_offset;
+
+ /*
+ * If we didn't load the .modinfo 'name' field earlier, fall back to
+ * on-disk struct mod 'name' field.
+ */
+ if (!info->name)
+ info->name = info->mod->name;
+
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+ info->index.vers = 0; /* Pretend no __versions section! */
+ else
+ info->index.vers = find_sec(info, "__versions");
+
+ info->index.pcpu = find_pcpusec(info);
+
return 0;
no_exec:
@@ -1804,12 +1921,8 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info)
/* Nothing more to do */
return 0;
- if (set_livepatch_module(mod)) {
- add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
- pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
- mod->name);
+ if (set_livepatch_module(mod))
return 0;
- }
pr_err("%s: module is marked as livepatch module, but livepatch support is disabled",
mod->name);
@@ -1892,63 +2005,71 @@ static int rewrite_section_headers(struct load_info *info, int flags)
}
/*
- * Set up our basic convenience variables (pointers to section headers,
- * search for module section index etc), and do some basic section
- * verification.
- *
- * Set info->mod to the temporary copy of the module in info->hdr. The final one
- * will be allocated in move_module().
- */
-static int setup_load_info(struct load_info *info, int flags)
+ * These calls taint the kernel depending certain module circumstances */
+static void module_augment_kernel_taints(struct module *mod, struct load_info *info)
{
- unsigned int i;
+ int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
- /* Try to find a name early so we can log errors with a module name */
- info->index.info = find_sec(info, ".modinfo");
- if (info->index.info)
- info->name = get_modinfo(info, "name");
+ if (!get_modinfo(info, "intree")) {
+ if (!test_taint(TAINT_OOT_MODULE))
+ pr_warn("%s: loading out-of-tree module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
+ }
- /* Find internal symbols and strings. */
- for (i = 1; i < info->hdr->e_shnum; i++) {
- if (info->sechdrs[i].sh_type == SHT_SYMTAB) {
- info->index.sym = i;
- info->index.str = info->sechdrs[i].sh_link;
- info->strtab = (char *)info->hdr
- + info->sechdrs[info->index.str].sh_offset;
- break;
- }
+ check_modinfo_retpoline(mod, info);
+
+ if (get_modinfo(info, "staging")) {
+ add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
+ pr_warn("%s: module is from the staging directory, the quality "
+ "is unknown, you have been warned.\n", mod->name);
}
- if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n",
- info->name ?: "(missing .modinfo section or name field)");
- return -ENOEXEC;
+ if (is_livepatch_module(mod)) {
+ add_taint_module(mod, TAINT_LIVEPATCH, LOCKDEP_STILL_OK);
+ pr_notice_once("%s: tainting kernel with TAINT_LIVEPATCH\n",
+ mod->name);
}
- info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
- if (!info->index.mod) {
- pr_warn("%s: No module found in object\n",
- info->name ?: "(missing .modinfo section or name field)");
- return -ENOEXEC;
+ module_license_taint_check(mod, get_modinfo(info, "license"));
+
+ if (get_modinfo(info, "test")) {
+ if (!test_taint(TAINT_TEST))
+ pr_warn("%s: loading test module taints kernel.\n",
+ mod->name);
+ add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK);
}
- /* This is temporary: point mod into copy of data. */
- info->mod = (void *)info->hdr + info->sechdrs[info->index.mod].sh_offset;
+#ifdef CONFIG_MODULE_SIG
+ mod->sig_ok = info->sig_ok;
+ if (!mod->sig_ok) {
+ pr_notice_once("%s: module verification failed: signature "
+ "and/or required key missing - tainting "
+ "kernel\n", mod->name);
+ add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
+ }
+#endif
/*
- * If we didn't load the .modinfo 'name' field earlier, fall back to
- * on-disk struct mod 'name' field.
+ * ndiswrapper is under GPL by itself, but loads proprietary modules.
+ * Don't use add_taint_module(), as it would prevent ndiswrapper from
+ * using GPL-only symbols it needs.
*/
- if (!info->name)
- info->name = info->mod->name;
+ if (strcmp(mod->name, "ndiswrapper") == 0)
+ add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
- if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
- info->index.vers = 0; /* Pretend no __versions section! */
- else
- info->index.vers = find_sec(info, "__versions");
+ /* driverloader was caught wrongly pretending to be under GPL */
+ if (strcmp(mod->name, "driverloader") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
- info->index.pcpu = find_pcpusec(info);
+ /* lve claims to be GPL but upstream won't provide source */
+ if (strcmp(mod->name, "lve") == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+
+ if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
+ pr_warn("%s: module license taints kernel.\n", mod->name);
- return 0;
}
static int check_modinfo(struct module *mod, struct load_info *info, int flags)
@@ -1970,35 +2091,10 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
return -ENOEXEC;
}
- if (!get_modinfo(info, "intree")) {
- if (!test_taint(TAINT_OOT_MODULE))
- pr_warn("%s: loading out-of-tree module taints kernel.\n",
- mod->name);
- add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK);
- }
-
- check_modinfo_retpoline(mod, info);
-
- if (get_modinfo(info, "staging")) {
- add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK);
- pr_warn("%s: module is from the staging directory, the quality "
- "is unknown, you have been warned.\n", mod->name);
- }
-
err = check_modinfo_livepatch(mod, info);
if (err)
return err;
- /* Set up license info based on the info section */
- set_license(mod, get_modinfo(info, "license"));
-
- if (get_modinfo(info, "test")) {
- if (!test_taint(TAINT_TEST))
- pr_warn("%s: loading test module taints kernel.\n",
- mod->name);
- add_taint_module(mod, TAINT_TEST, LOCKDEP_STILL_OK);
- }
-
return 0;
}
@@ -2110,10 +2206,14 @@ static int find_module_sections(struct module *mod, struct load_info *info)
if (section_addr(info, "__obsparm"))
pr_warn("%s: Ignoring obsolete parameters\n", mod->name);
- info->dyndbg.descs = section_objs(info, "__dyndbg",
- sizeof(*info->dyndbg.descs), &info->dyndbg.num_descs);
- info->dyndbg.classes = section_objs(info, "__dyndbg_classes",
- sizeof(*info->dyndbg.classes), &info->dyndbg.num_classes);
+#ifdef CONFIG_DYNAMIC_DEBUG_CORE
+ mod->dyndbg_info.descs = section_objs(info, "__dyndbg",
+ sizeof(*mod->dyndbg_info.descs),
+ &mod->dyndbg_info.num_descs);
+ mod->dyndbg_info.classes = section_objs(info, "__dyndbg_classes",
+ sizeof(*mod->dyndbg_info.classes),
+ &mod->dyndbg_info.num_classes);
+#endif
return 0;
}
@@ -2122,109 +2222,82 @@ static int move_module(struct module *mod, struct load_info *info)
{
int i;
void *ptr;
+ enum mod_mem_type t = 0;
+ int ret = -ENOMEM;
- /* Do the allocs. */
- ptr = module_alloc(mod->core_layout.size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. Just mark it as not being a
- * leak.
- */
- kmemleak_not_leak(ptr);
- if (!ptr)
- return -ENOMEM;
-
- memset(ptr, 0, mod->core_layout.size);
- mod->core_layout.base = ptr;
-
- if (mod->init_layout.size) {
- ptr = module_alloc(mod->init_layout.size);
+ for_each_mod_mem_type(type) {
+ if (!mod->mem[type].size) {
+ mod->mem[type].base = NULL;
+ continue;
+ }
+ mod->mem[type].size = PAGE_ALIGN(mod->mem[type].size);
+ ptr = module_memory_alloc(mod->mem[type].size, type);
/*
- * The pointer to this block is stored in the module structure
- * which is inside the block. This block doesn't need to be
- * scanned as it contains data and code that will be freed
- * after the module is initialized.
+ * The pointer to these blocks of memory are stored on the module
+ * structure and we keep that around so long as the module is
+ * around. We only free that memory when we unload the module.
+ * Just mark them as not being a leak then. The .init* ELF
+ * sections *do* get freed after boot so we *could* treat them
+ * slightly differently with kmemleak_ignore() and only grey
+ * them out as they work as typical memory allocations which
+ * *do* eventually get freed, but let's just keep things simple
+ * and avoid *any* false positives.
*/
- kmemleak_ignore(ptr);
+ kmemleak_not_leak(ptr);
if (!ptr) {
- module_memfree(mod->core_layout.base);
- return -ENOMEM;
+ t = type;
+ goto out_enomem;
}
- memset(ptr, 0, mod->init_layout.size);
- mod->init_layout.base = ptr;
- } else
- mod->init_layout.base = NULL;
-
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- /* Do the allocs. */
- ptr = vzalloc(mod->data_layout.size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. Just mark it as not being a
- * leak.
- */
- kmemleak_not_leak(ptr);
- if (!ptr) {
- module_memfree(mod->core_layout.base);
- module_memfree(mod->init_layout.base);
- return -ENOMEM;
+ memset(ptr, 0, mod->mem[type].size);
+ mod->mem[type].base = ptr;
}
- mod->data_layout.base = ptr;
-#endif
/* Transfer each section which specifies SHF_ALLOC */
- pr_debug("final section addresses:\n");
+ pr_debug("Final section addresses for %s:\n", mod->name);
for (i = 0; i < info->hdr->e_shnum; i++) {
void *dest;
Elf_Shdr *shdr = &info->sechdrs[i];
+ enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
if (!(shdr->sh_flags & SHF_ALLOC))
continue;
- if (shdr->sh_entsize & INIT_OFFSET_MASK)
- dest = mod->init_layout.base
- + (shdr->sh_entsize & ~INIT_OFFSET_MASK);
- else if (!(shdr->sh_flags & SHF_EXECINSTR))
- dest = mod->data_layout.base + shdr->sh_entsize;
- else
- dest = mod->core_layout.base + shdr->sh_entsize;
+ dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK);
- if (shdr->sh_type != SHT_NOBITS)
+ if (shdr->sh_type != SHT_NOBITS) {
+ /*
+ * Our ELF checker already validated this, but let's
+ * be pedantic and make the goal clearer. We actually
+ * end up copying over all modifications made to the
+ * userspace copy of the entire struct module.
+ */
+ if (i == info->index.mod &&
+ (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) {
+ ret = -ENOEXEC;
+ goto out_enomem;
+ }
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
- /* Update sh_addr to point to copy in image. */
+ }
+ /*
+ * Update the userspace copy's ELF section address to point to
+ * our newly allocated memory as a pure convenience so that
+ * users of info can keep taking advantage and using the newly
+ * minted official memory area.
+ */
shdr->sh_addr = (unsigned long)dest;
- pr_debug("\t0x%lx %s\n",
- (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
+ pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
+ (long)shdr->sh_size, info->secstrings + shdr->sh_name);
}
return 0;
+out_enomem:
+ for (t--; t >= 0; t--)
+ module_memory_free(mod->mem[t].base, t);
+ return ret;
}
-static int check_module_license_and_versions(struct module *mod)
+static int check_export_symbol_versions(struct module *mod)
{
- int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
-
- /*
- * ndiswrapper is under GPL by itself, but loads proprietary modules.
- * Don't use add_taint_module(), as it would prevent ndiswrapper from
- * using GPL-only symbols it needs.
- */
- if (strcmp(mod->name, "ndiswrapper") == 0)
- add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
-
- /* driverloader was caught wrongly pretending to be under GPL */
- if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- /* lve claims to be GPL but upstream won't provide source */
- if (strcmp(mod->name, "lve") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
- pr_warn("%s: module license taints kernel.\n", mod->name);
-
#ifdef CONFIG_MODVERSIONS
if ((mod->num_syms && !mod->crcs) ||
(mod->num_gpl_syms && !mod->gpl_crcs)) {
@@ -2242,12 +2315,14 @@ static void flush_module_icache(const struct module *mod)
* Do it before processing of module parameters, so the module
* can provide parameter accessor functions of its own.
*/
- if (mod->init_layout.base)
- flush_icache_range((unsigned long)mod->init_layout.base,
- (unsigned long)mod->init_layout.base
- + mod->init_layout.size);
- flush_icache_range((unsigned long)mod->core_layout.base,
- (unsigned long)mod->core_layout.base + mod->core_layout.size);
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ if (mod_mem->size) {
+ flush_icache_range((unsigned long)mod_mem->base,
+ (unsigned long)mod_mem->base + mod_mem->size);
+ }
+ }
}
bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
@@ -2290,10 +2365,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
unsigned int ndx;
int err;
- err = check_modinfo(info->mod, info, flags);
- if (err)
- return ERR_PTR(err);
-
/* Allow arches to frob section contents and sizes. */
err = module_frob_arch_sections(info->hdr, info->sechdrs,
info->secstrings, info->mod);
@@ -2350,11 +2421,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
{
percpu_modfree(mod);
module_arch_freeing_init(mod);
- module_memfree(mod->init_layout.base);
- module_memfree(mod->core_layout.base);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- vfree(mod->data_layout.base);
-#endif
+
+ free_mod_mem(mod);
}
int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -2380,27 +2448,6 @@ static int post_relocation(struct module *mod, const struct load_info *info)
return module_finalize(info->hdr, info->sechdrs, mod);
}
-/* Is this module of this name done loading? No locks held. */
-static bool finished_loading(const char *name)
-{
- struct module *mod;
- bool ret;
-
- /*
- * The module_mutex should not be a heavily contended lock;
- * if we get the occasional sleep here, we'll go an extra iteration
- * in the wait_event_interruptible(), which is harmless.
- */
- sched_annotate_sleep();
- mutex_lock(&module_mutex);
- mod = find_module_all(name, strlen(name), true);
- ret = !mod || mod->state == MODULE_STATE_LIVE
- || mod->state == MODULE_STATE_GOING;
- mutex_unlock(&module_mutex);
-
- return ret;
-}
-
/* Call module constructors. */
static void do_mod_ctors(struct module *mod)
{
@@ -2415,7 +2462,9 @@ static void do_mod_ctors(struct module *mod)
/* For freeing module_init on success, in case kallsyms traversing */
struct mod_initfree {
struct llist_node node;
- void *module_init;
+ void *init_text;
+ void *init_data;
+ void *init_rodata;
};
static void do_free_init(struct work_struct *w)
@@ -2429,7 +2478,9 @@ static void do_free_init(struct work_struct *w)
llist_for_each_safe(pos, n, list) {
initfree = container_of(pos, struct mod_initfree, node);
- module_memfree(initfree->module_init);
+ module_memfree(initfree->init_text);
+ module_memfree(initfree->init_data);
+ module_memfree(initfree->init_rodata);
kfree(initfree);
}
}
@@ -2450,13 +2501,27 @@ static noinline int do_init_module(struct module *mod)
{
int ret = 0;
struct mod_initfree *freeinit;
+#if defined(CONFIG_MODULE_STATS)
+ unsigned int text_size = 0, total_size = 0;
+
+ for_each_mod_mem_type(type) {
+ const struct module_memory *mod_mem = &mod->mem[type];
+ if (mod_mem->size) {
+ total_size += mod_mem->size;
+ if (type == MOD_TEXT || type == MOD_INIT_TEXT)
+ text_size += mod_mem->size;
+ }
+ }
+#endif
freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL);
if (!freeinit) {
ret = -ENOMEM;
goto fail;
}
- freeinit->module_init = mod->init_layout.base;
+ freeinit->init_text = mod->mem[MOD_INIT_TEXT].base;
+ freeinit->init_data = mod->mem[MOD_INIT_DATA].base;
+ freeinit->init_rodata = mod->mem[MOD_INIT_RODATA].base;
do_mod_ctors(mod);
/* Start the module */
@@ -2492,8 +2557,8 @@ static noinline int do_init_module(struct module *mod)
if (!mod->async_probe_requested)
async_synchronize_full();
- ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
- mod->init_layout.size);
+ ftrace_free_mem(mod, mod->mem[MOD_INIT_TEXT].base,
+ mod->mem[MOD_INIT_TEXT].base + mod->mem[MOD_INIT_TEXT].size);
mutex_lock(&module_mutex);
/* Drop initial reference. */
module_put(mod);
@@ -2505,11 +2570,11 @@ static noinline int do_init_module(struct module *mod)
module_enable_ro(mod, true);
mod_tree_remove_init(mod);
module_arch_freeing_init(mod);
- mod->init_layout.base = NULL;
- mod->init_layout.size = 0;
- mod->init_layout.ro_size = 0;
- mod->init_layout.ro_after_init_size = 0;
- mod->init_layout.text_size = 0;
+ for_class_mod_mem_type(type, init) {
+ mod->mem[type].base = NULL;
+ mod->mem[type].size = 0;
+ }
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
/* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */
mod->btf_data = NULL;
@@ -2533,6 +2598,11 @@ static noinline int do_init_module(struct module *mod)
mutex_unlock(&module_mutex);
wake_up_all(&module_wq);
+ mod_stat_add_long(text_size, &total_text_size);
+ mod_stat_add_long(total_size, &total_mod_size);
+
+ mod_stat_inc(&modcount);
+
return 0;
fail_free_freeinit:
@@ -2548,6 +2618,7 @@ fail:
ftrace_release_mod(mod);
free_module(mod);
wake_up_all(&module_wq);
+
return ret;
}
@@ -2559,6 +2630,67 @@ static int may_init_module(void)
return 0;
}
+/* Is this module of this name done loading? No locks held. */
+static bool finished_loading(const char *name)
+{
+ struct module *mod;
+ bool ret;
+
+ /*
+ * The module_mutex should not be a heavily contended lock;
+ * if we get the occasional sleep here, we'll go an extra iteration
+ * in the wait_event_interruptible(), which is harmless.
+ */
+ sched_annotate_sleep();
+ mutex_lock(&module_mutex);
+ mod = find_module_all(name, strlen(name), true);
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
+ mutex_unlock(&module_mutex);
+
+ return ret;
+}
+
+/* Must be called with module_mutex held */
+static int module_patient_check_exists(const char *name,
+ enum fail_dup_mod_reason reason)
+{
+ struct module *old;
+ int err = 0;
+
+ old = find_module_all(name, strlen(name), true);
+ if (old == NULL)
+ return 0;
+
+ if (old->state == MODULE_STATE_COMING ||
+ old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(name));
+ mutex_lock(&module_mutex);
+ if (err)
+ return err;
+
+ /* The module might have gone in the meantime. */
+ old = find_module_all(name, strlen(name), true);
+ }
+
+ if (try_add_failed_module(name, reason))
+ pr_warn("Could not add fail-tracking for module: %s\n", name);
+
+ /*
+ * We are here only when the same module was being loaded. Do
+ * not try to load it again right now. It prevents long delays
+ * caused by serialized module load failures. It might happen
+ * when more devices of the same type trigger load of
+ * a particular module.
+ */
+ if (old && old->state == MODULE_STATE_LIVE)
+ return -EEXIST;
+ return -EBUSY;
+}
+
/*
* We try to place it in the list now to make sure it's unique before
* we dedicate too many resources. In particular, temporary percpu
@@ -2567,41 +2699,14 @@ static int may_init_module(void)
static int add_unformed_module(struct module *mod)
{
int err;
- struct module *old;
mod->state = MODULE_STATE_UNFORMED;
mutex_lock(&module_mutex);
- old = find_module_all(mod->name, strlen(mod->name), true);
- if (old != NULL) {
- if (old->state == MODULE_STATE_COMING
- || old->state == MODULE_STATE_UNFORMED) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto out_unlocked;
-
- /* The module might have gone in the meantime. */
- mutex_lock(&module_mutex);
- old = find_module_all(mod->name, strlen(mod->name),
- true);
- }
-
- /*
- * We are here only when the same module was being loaded. Do
- * not try to load it again right now. It prevents long delays
- * caused by serialized module load failures. It might happen
- * when more devices of the same type trigger load of
- * a particular module.
- */
- if (old && old->state == MODULE_STATE_LIVE)
- err = -EEXIST;
- else
- err = -EBUSY;
+ err = module_patient_check_exists(mod->name, FAIL_DUP_MOD_LOAD);
+ if (err)
goto out;
- }
+
mod_update_bounds(mod);
list_add_rcu(&mod->list, &modules);
mod_tree_insert(mod);
@@ -2609,7 +2714,6 @@ static int add_unformed_module(struct module *mod)
out:
mutex_unlock(&module_mutex);
-out_unlocked:
return err;
}
@@ -2628,9 +2732,6 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_bug_finalize(info->hdr, info->sechdrs, mod);
module_cfi_finalize(info->hdr, info->sechdrs, mod);
- if (module_check_misalignment(mod))
- goto out_misaligned;
-
module_enable_ro(mod, false);
module_enable_nx(mod);
module_enable_x(mod);
@@ -2644,8 +2745,6 @@ static int complete_formation(struct module *mod, struct load_info *info)
return 0;
-out_misaligned:
- err = -EINVAL;
out:
mutex_unlock(&module_mutex);
return err;
@@ -2688,6 +2787,39 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname,
return 0;
}
+/* Module within temporary copy, this doesn't do any allocation */
+static int early_mod_check(struct load_info *info, int flags)
+{
+ int err;
+
+ /*
+ * Now that we know we have the correct module name, check
+ * if it's blacklisted.
+ */
+ if (blacklisted(info->name)) {
+ pr_err("Module %s is blacklisted\n", info->name);
+ return -EPERM;
+ }
+
+ err = rewrite_section_headers(info, flags);
+ if (err)
+ return err;
+
+ /* Check module struct version now, before we try to use module. */
+ if (!check_modstruct_version(info, info->mod))
+ return -ENOEXEC;
+
+ err = check_modinfo(info->mod, info, flags);
+ if (err)
+ return err;
+
+ mutex_lock(&module_mutex);
+ err = module_patient_check_exists(info->mod->name, FAIL_DUP_MOD_BECOMING);
+ mutex_unlock(&module_mutex);
+
+ return err;
+}
+
/*
* Allocate and load the module: note that size of section 0 is always
* zero, and we rely on this for optional sections.
@@ -2696,6 +2828,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
int flags)
{
struct module *mod;
+ bool module_allocated = false;
long err = 0;
char *after_dashes;
@@ -2717,40 +2850,17 @@ static int load_module(struct load_info *info, const char __user *uargs,
/*
* Do basic sanity checks against the ELF header and
- * sections.
+ * sections. Cache useful sections and set the
+ * info->mod to the userspace passed struct module.
*/
- err = elf_validity_check(info);
+ err = elf_validity_cache_copy(info, flags);
if (err)
goto free_copy;
- /*
- * Everything checks out, so set up the section info
- * in the info structure.
- */
- err = setup_load_info(info, flags);
+ err = early_mod_check(info, flags);
if (err)
goto free_copy;
- /*
- * Now that we know we have the correct module name, check
- * if it's blacklisted.
- */
- if (blacklisted(info->name)) {
- err = -EPERM;
- pr_err("Module %s is blacklisted\n", info->name);
- goto free_copy;
- }
-
- err = rewrite_section_headers(info, flags);
- if (err)
- goto free_copy;
-
- /* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(info, info->mod)) {
- err = -ENOEXEC;
- goto free_copy;
- }
-
/* Figure out module layout, and allocate all the memory. */
mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
@@ -2758,6 +2868,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
goto free_copy;
}
+ module_allocated = true;
+
audit_log_kern_module(mod->name);
/* Reserve our place in the list. */
@@ -2765,15 +2877,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err)
goto free_module;
-#ifdef CONFIG_MODULE_SIG
- mod->sig_ok = info->sig_ok;
- if (!mod->sig_ok) {
- pr_notice_once("%s: module verification failed: signature "
- "and/or required key missing - tainting "
- "kernel\n", mod->name);
- add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK);
- }
-#endif
+ /*
+ * We are tainting your kernel if your module gets into
+ * the modules linked list somehow.
+ */
+ module_augment_kernel_taints(mod, info);
/* To avoid stressing percpu allocator, do this once we're unique. */
err = percpu_modalloc(mod, info);
@@ -2795,7 +2903,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
if (err)
goto free_unload;
- err = check_module_license_and_versions(mod);
+ err = check_export_symbol_versions(mod);
if (err)
goto free_unload;
@@ -2825,7 +2933,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
}
init_build_id(mod, info);
- dynamic_debug_setup(mod, &info->dyndbg);
/* Ftrace init must be called in the MODULE_STATE_UNFORMED state */
ftrace_module_init(mod);
@@ -2889,7 +2996,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
ddebug_cleanup:
ftrace_release_mod(mod);
- dynamic_debug_remove(mod, &info->dyndbg);
synchronize_rcu();
kfree(mod->args);
free_arch_cleanup:
@@ -2908,11 +3014,22 @@ static int load_module(struct load_info *info, const char __user *uargs,
synchronize_rcu();
mutex_unlock(&module_mutex);
free_module:
+ mod_stat_bump_invalid(info, flags);
/* Free lock-classes; relies on the preceding sync_rcu() */
- lockdep_free_key_range(mod->data_layout.base, mod->data_layout.size);
+ for_class_mod_mem_type(type, core_data) {
+ lockdep_free_key_range(mod->mem[type].base,
+ mod->mem[type].size);
+ }
module_deallocate(mod, info);
free_copy:
+ /*
+ * The info->len is always set. We distinguish between
+ * failures once the proper module was allocated and
+ * before that.
+ */
+ if (!module_allocated)
+ mod_stat_bump_becoming(info, flags);
free_copy(info, flags);
return err;
}
@@ -2931,8 +3048,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
umod, len, uargs);
err = copy_module_from_user(umod, len, &info);
- if (err)
+ if (err) {
+ mod_stat_inc(&failed_kreads);
+ mod_stat_add_long(len, &invalid_kread_bytes);
return err;
+ }
return load_module(&info, uargs, 0);
}
@@ -2957,14 +3077,20 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
len = kernel_read_file_from_fd(fd, 0, &buf, INT_MAX, NULL,
READING_MODULE);
- if (len < 0)
+ if (len < 0) {
+ mod_stat_inc(&failed_kreads);
+ mod_stat_add_long(len, &invalid_kread_bytes);
return len;
+ }
if (flags & MODULE_INIT_COMPRESSED_FILE) {
err = module_decompress(&info, buf, len);
vfree(buf); /* compressed data is no longer needed */
- if (err)
+ if (err) {
+ mod_stat_inc(&failed_decompress);
+ mod_stat_add_long(len, &invalid_decompress_bytes);
return err;
+ }
} else {
info.hdr = buf;
info.len = len;
@@ -2973,11 +3099,6 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
return load_module(&info, uargs, flags);
}
-static inline int within(unsigned long addr, void *start, unsigned long size)
-{
- return ((void *)addr >= start && (void *)addr < start + size);
-}
-
/* Keep in sync with MODULE_FLAGS_BUF_SIZE !!! */
char *module_flags(struct module *mod, char *buf, bool show_state)
{
@@ -3060,20 +3181,21 @@ bool is_module_address(unsigned long addr)
struct module *__module_address(unsigned long addr)
{
struct module *mod;
- struct mod_tree_root *tree;
if (addr >= mod_tree.addr_min && addr <= mod_tree.addr_max)
- tree = &mod_tree;
+ goto lookup;
+
#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- else if (addr >= mod_data_tree.addr_min && addr <= mod_data_tree.addr_max)
- tree = &mod_data_tree;
+ if (addr >= mod_tree.data_addr_min && addr <= mod_tree.data_addr_max)
+ goto lookup;
#endif
- else
- return NULL;
+ return NULL;
+
+lookup:
module_assert_mutex_or_preempt();
- mod = mod_find(addr, tree);
+ mod = mod_find(addr, &mod_tree);
if (mod) {
BUG_ON(!within_module(addr, mod));
if (mod->state == MODULE_STATE_UNFORMED)
@@ -3113,8 +3235,8 @@ struct module *__module_text_address(unsigned long addr)
struct module *mod = __module_address(addr);
if (mod) {
/* Make sure it's within the text section. */
- if (!within(addr, mod->init_layout.base, mod->init_layout.text_size)
- && !within(addr, mod->core_layout.base, mod->core_layout.text_size))
+ if (!within_module_mem_type(addr, mod, MOD_TEXT) &&
+ !within_module_mem_type(addr, mod, MOD_INIT_TEXT))
mod = NULL;
}
return mod;
@@ -3142,3 +3264,14 @@ void print_modules(void)
last_unloaded_module.taints);
pr_cont("\n");
}
+
+#ifdef CONFIG_MODULE_DEBUGFS
+struct dentry *mod_debugfs_root;
+
+static int module_debugfs_init(void)
+{
+ mod_debugfs_root = debugfs_create_dir("modules", NULL);
+ return 0;
+}
+module_init(module_debugfs_init);
+#endif
diff --git a/kernel/module/procfs.c b/kernel/module/procfs.c
index cf5b9f1e6ec4..0a4841e88adb 100644
--- a/kernel/module/procfs.c
+++ b/kernel/module/procfs.c
@@ -62,6 +62,15 @@ static void m_stop(struct seq_file *m, void *p)
mutex_unlock(&module_mutex);
}
+static unsigned int module_total_size(struct module *mod)
+{
+ int size = 0;
+
+ for_each_mod_mem_type(type)
+ size += mod->mem[type].size;
+ return size;
+}
+
static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
@@ -73,10 +82,7 @@ static int m_show(struct seq_file *m, void *p)
if (mod->state == MODULE_STATE_UNFORMED)
return 0;
- size = mod->init_layout.size + mod->core_layout.size;
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- size += mod->data_layout.size;
-#endif
+ size = module_total_size(mod);
seq_printf(m, "%s %u", mod->name, size);
print_unload_info(m, mod);
@@ -86,7 +92,7 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- value = m->private ? NULL : mod->core_layout.base;
+ value = m->private ? NULL : mod->mem[MOD_TEXT].base;
seq_printf(m, " 0x%px", value);
/* Taints info */
diff --git a/kernel/module/stats.c b/kernel/module/stats.c
new file mode 100644
index 000000000000..ad7b6ada29f2
--- /dev/null
+++ b/kernel/module/stats.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Debugging module statistics.
+ *
+ * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
+ */
+
+#include <linux/module.h>
+#include <uapi/linux/module.h>
+#include <linux/string.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/debugfs.h>
+#include <linux/rculist.h>
+#include <linux/math.h>
+
+#include "internal.h"
+
+/**
+ * DOC: module debugging statistics overview
+ *
+ * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
+ * are useful to monitor and root cause memory pressure issues with module
+ * loading. These statistics are useful to allow us to improve production
+ * workloads.
+ *
+ * The current module debugging statistics supported help keep track of module
+ * loading failures to enable improvements either for kernel module auto-loading
+ * usage (request_module()) or interactions with userspace. Statistics are
+ * provided to track all possible failures in the finit_module() path and memory
+ * wasted in this process space. Each of the failure counters are associated
+ * to a type of module loading failure which is known to incur a certain amount
+ * of memory allocation loss. In the worst case loading a module will fail after
+ * a 3 step memory allocation process:
+ *
+ * a) memory allocated with kernel_read_file_from_fd()
+ * b) module decompression processes the file read from
+ * kernel_read_file_from_fd(), and vmap() is used to map
+ * the decompressed module to a new local buffer which represents
+ * a copy of the decompressed module passed from userspace. The buffer
+ * from kernel_read_file_from_fd() is freed right away.
+ * c) layout_and_allocate() allocates space for the final resting
+ * place where we would keep the module if it were to be processed
+ * successfully.
+ *
+ * If a failure occurs after these three different allocations only one
+ * counter will be incremented with the summation of the allocated bytes freed
+ * incurred during this failure. Likewise, if module loading failed only after
+ * step b) a separate counter is used and incremented for the bytes freed and
+ * not used during both of those allocations.
+ *
+ * Virtual memory space can be limited, for example on x86 virtual memory size
+ * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
+ * memory allocations when possible. These module debugging statistics help
+ * to evaluate how much memory is being wasted on bootup due to module loading
+ * failures.
+ *
+ * All counters are designed to be incremental. Atomic counters are used so to
+ * remain simple and avoid delays and deadlocks.
+ */
+
+/**
+ * DOC: dup_failed_modules - tracks duplicate failed modules
+ *
+ * Linked list of modules which failed to be loaded because an already existing
+ * module with the same name was already being processed or already loaded.
+ * The finit_module() system call incurs heavy virtual memory allocations. In
+ * the worst case an finit_module() system call can end up allocating virtual
+ * memory 3 times:
+ *
+ * 1) kernel_read_file_from_fd() call uses vmalloc()
+ * 2) optional module decompression uses vmap()
+ * 3) layout_and allocate() can use vzalloc() or an arch specific variation of
+ * vmalloc to deal with ELF sections requiring special permissions
+ *
+ * In practice on a typical boot today most finit_module() calls fail due to
+ * the module with the same name already being loaded or about to be processed.
+ * All virtual memory allocated to these failed modules will be freed with
+ * no functional use.
+ *
+ * To help with this the dup_failed_modules allows us to track modules which
+ * failed to load due to the fact that a module was already loaded or being
+ * processed. There are only two points at which we can fail such calls,
+ * we list them below along with the number of virtual memory allocation
+ * calls:
+ *
+ * a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
+ * layout_and_allocate().
+ * - with module decompression: 2 virtual memory allocation calls
+ * - without module decompression: 1 virtual memory allocation calls
+ * b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
+ * - with module decompression 3 virtual memory allocation calls
+ * - without module decompression 2 virtual memory allocation calls
+ *
+ * We should strive to get this list to be as small as possible. If this list
+ * is not empty it is a reflection of possible work or optimizations possible
+ * either in-kernel or in userspace.
+ */
+static LIST_HEAD(dup_failed_modules);
+
+/**
+ * DOC: module statistics debugfs counters
+ *
+ * The total amount of wasted virtual memory allocation space during module
+ * loading can be computed by adding the total from the summation:
+ *
+ * * @invalid_kread_bytes +
+ * @invalid_decompress_bytes +
+ * @invalid_becoming_bytes +
+ * @invalid_mod_bytes
+ *
+ * The following debugfs counters are available to inspect module loading
+ * failures:
+ *
+ * * total_mod_size: total bytes ever used by all modules we've dealt with on
+ * this system
+ * * total_text_size: total bytes of the .text and .init.text ELF section
+ * sizes we've dealt with on this system
+ * * invalid_kread_bytes: bytes allocated and then freed on failures which
+ * happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
+ * uses vmalloc(). These should typically not happen unless your system is
+ * under memory pressure.
+ * * invalid_decompress_bytes: number of bytes allocated and freed due to
+ * memory allocations in the module decompression path that use vmap().
+ * These typically should not happen unless your system is under memory
+ * pressure.
+ * * invalid_becoming_bytes: total number of bytes allocated and freed used
+ * used to read the kernel module userspace wants us to read before we
+ * promote it to be processed to be added to our @modules linked list. These
+ * failures can happen if we had a check in between a successful kernel_read_file_from_fd()
+ * call and right before we allocate the our private memory for the module
+ * which would be kept if the module is successfully loaded. The most common
+ * reason for this failure is when userspace is racing to load a module
+ * which it does not yet see loaded. The first module to succeed in
+ * add_unformed_module() will add a module to our &modules list and
+ * subsequent loads of modules with the same name will error out at the
+ * end of early_mod_check(). The check for module_patient_check_exists()
+ * at the end of early_mod_check() prevents duplicate allocations
+ * on layout_and_allocate() for modules already being processed. These
+ * duplicate failed modules are non-fatal, however they typically are
+ * indicative of userspace not seeing a module in userspace loaded yet and
+ * unnecessarily trying to load a module before the kernel even has a chance
+ * to begin to process prior requests. Although duplicate failures can be
+ * non-fatal, we should try to reduce vmalloc() pressure proactively, so
+ * ideally after boot this will be close to as 0 as possible. If module
+ * decompression was used we also add to this counter the cost of the
+ * initial kernel_read_file_from_fd() of the compressed module. If module
+ * decompression was not used the value represents the total allocated and
+ * freed bytes in kernel_read_file_from_fd() calls for these type of
+ * failures. These failures can occur because:
+ *
+ * * module_sig_check() - module signature checks
+ * * elf_validity_cache_copy() - some ELF validation issue
+ * * early_mod_check():
+ *
+ * * blacklisting
+ * * failed to rewrite section headers
+ * * version magic
+ * * live patch requirements didn't check out
+ * * the module was detected as being already present
+ *
+ * * invalid_mod_bytes: these are the total number of bytes allocated and
+ * freed due to failures after we did all the sanity checks of the module
+ * which userspace passed to us and after our first check that the module
+ * is unique. A module can still fail to load if we detect the module is
+ * loaded after we allocate space for it with layout_and_allocate(), we do
+ * this check right before processing the module as live and run its
+ * initialization routines. Note that you have a failure of this type it
+ * also means the respective kernel_read_file_from_fd() memory space was
+ * also freed and not used, and so we increment this counter with twice
+ * the size of the module. Additionally if you used module decompression
+ * the size of the compressed module is also added to this counter.
+ *
+ * * modcount: how many modules we've loaded in our kernel life time
+ * * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
+ * * failed_decompress: how many failed module decompression attempts we've had.
+ * These really should not happen unless your compression / decompression
+ * might be broken.
+ * * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
+ * it and before we allocate memory for it with layout_and_allocate(). This
+ * counter is never incremented if you manage to validate the module and
+ * call layout_and_allocate() for it.
+ * * failed_load_modules: how many modules failed once we've allocated our
+ * private space for our module using layout_and_allocate(). These failures
+ * should hopefully mostly be dealt with already. Races in theory could
+ * still exist here, but it would just mean the kernel had started processing
+ * two threads concurrently up to early_mod_check() and one thread won.
+ * These failures are good signs the kernel or userspace is doing something
+ * seriously stupid or that could be improved. We should strive to fix these,
+ * but it is perhaps not easy to fix them. A recent example are the modules
+ * requests incurred for frequency modules, a separate module request was
+ * being issued for each CPU on a system.
+ */
+
+atomic_long_t total_mod_size;
+atomic_long_t total_text_size;
+atomic_long_t invalid_kread_bytes;
+atomic_long_t invalid_decompress_bytes;
+static atomic_long_t invalid_becoming_bytes;
+static atomic_long_t invalid_mod_bytes;
+atomic_t modcount;
+atomic_t failed_kreads;
+atomic_t failed_decompress;
+static atomic_t failed_becoming;
+static atomic_t failed_load_modules;
+
+static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
+{
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask) &&
+ test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Becoming & Load";
+ if (test_bit(FAIL_DUP_MOD_BECOMING, &mod_fail->dup_fail_mask))
+ return "Becoming";
+ if (test_bit(FAIL_DUP_MOD_LOAD, &mod_fail->dup_fail_mask))
+ return "Load";
+ return "Bug-on-stats";
+}
+
+void mod_stat_bump_invalid(struct load_info *info, int flags)
+{
+ atomic_long_add(info->len * 2, &invalid_mod_bytes);
+ atomic_inc(&failed_load_modules);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_mod_bytes);
+#endif
+}
+
+void mod_stat_bump_becoming(struct load_info *info, int flags)
+{
+ atomic_inc(&failed_becoming);
+ atomic_long_add(info->len, &invalid_becoming_bytes);
+#if defined(CONFIG_MODULE_DECOMPRESS)
+ if (flags & MODULE_INIT_COMPRESSED_FILE)
+ atomic_long_add(info->compressed_len, &invalid_becoming_bytes);
+#endif
+}
+
+int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
+{
+ struct mod_fail_load *mod_fail;
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list,
+ lockdep_is_held(&module_mutex)) {
+ if (!strcmp(mod_fail->name, name)) {
+ atomic_long_inc(&mod_fail->count);
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ goto out;
+ }
+ }
+
+ mod_fail = kzalloc(sizeof(*mod_fail), GFP_KERNEL);
+ if (!mod_fail)
+ return -ENOMEM;
+ memcpy(mod_fail->name, name, strlen(name));
+ __set_bit(reason, &mod_fail->dup_fail_mask);
+ atomic_long_inc(&mod_fail->count);
+ list_add_rcu(&mod_fail->list, &dup_failed_modules);
+out:
+ return 0;
+}
+
+/*
+ * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
+ * 112 module prints within 8k.
+ *
+ * 1024 + (64*112) = 8k
+ */
+#define MAX_PREAMBLE 1024
+#define MAX_FAILED_MOD_PRINT 112
+#define MAX_BYTES_PER_MOD 64
+static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct mod_fail_load *mod_fail;
+ unsigned int len, size, count_failed = 0;
+ char *buf;
+ u32 live_mod_count, fkreads, fdecompress, fbecoming, floads;
+ unsigned long total_size, text_size, ikread_bytes, ibecoming_bytes,
+ idecompress_bytes, imod_bytes, total_virtual_lost;
+
+ live_mod_count = atomic_read(&modcount);
+ fkreads = atomic_read(&failed_kreads);
+ fdecompress = atomic_read(&failed_decompress);
+ fbecoming = atomic_read(&failed_becoming);
+ floads = atomic_read(&failed_load_modules);
+
+ total_size = atomic_long_read(&total_mod_size);
+ text_size = atomic_long_read(&total_text_size);
+ ikread_bytes = atomic_long_read(&invalid_kread_bytes);
+ idecompress_bytes = atomic_long_read(&invalid_decompress_bytes);
+ ibecoming_bytes = atomic_long_read(&invalid_becoming_bytes);
+ imod_bytes = atomic_long_read(&invalid_mod_bytes);
+
+ total_virtual_lost = ikread_bytes + idecompress_bytes + ibecoming_bytes + imod_bytes;
+
+ size = MAX_PREAMBLE + min((unsigned int)(floads + fbecoming),
+ (unsigned int)MAX_FAILED_MOD_PRINT) * MAX_BYTES_PER_MOD;
+ buf = kzalloc(size, GFP_KERNEL);
+ if (buf == NULL)
+ return -ENOMEM;
+
+ /* The beginning of our debug preamble */
+ len = scnprintf(buf, size, "%25s\t%u\n", "Mods ever loaded", live_mod_count);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on kread", fkreads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on decompress",
+ fdecompress);
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on becoming", fbecoming);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%u\n", "Mods failed on load", floads);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total module size", total_size);
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Total mod text size", text_size);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kread bytes", ikread_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed decompress bytes",
+ idecompress_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed becoming bytes", ibecoming_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Failed kmod bytes", imod_bytes);
+
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Virtual mem wasted bytes", total_virtual_lost);
+
+ if (live_mod_count && total_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod size",
+ DIV_ROUND_UP(total_size, live_mod_count));
+ }
+
+ if (live_mod_count && text_size) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average mod text size",
+ DIV_ROUND_UP(text_size, live_mod_count));
+ }
+
+ /*
+ * We use WARN_ON_ONCE() for the counters to ensure we always have parity
+ * for keeping tabs on a type of failure with one type of byte counter.
+ * The counters for imod_bytes does not increase for fkreads failures
+ * for example, and so on.
+ */
+
+ WARN_ON_ONCE(ikread_bytes && !fkreads);
+ if (fkreads && ikread_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail kread bytes",
+ DIV_ROUND_UP(ikread_bytes, fkreads));
+ }
+
+ WARN_ON_ONCE(ibecoming_bytes && !fbecoming);
+ if (fbecoming && ibecoming_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail becoming bytes",
+ DIV_ROUND_UP(ibecoming_bytes, fbecoming));
+ }
+
+ WARN_ON_ONCE(idecompress_bytes && !fdecompress);
+ if (fdecompress && idecompress_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Avg fail decomp bytes",
+ DIV_ROUND_UP(idecompress_bytes, fdecompress));
+ }
+
+ WARN_ON_ONCE(imod_bytes && !floads);
+ if (floads && imod_bytes) {
+ len += scnprintf(buf + len, size - len, "%25s\t%lu\n", "Average fail load bytes",
+ DIV_ROUND_UP(imod_bytes, floads));
+ }
+
+ /* End of our debug preamble header. */
+
+ /* Catch when we've gone beyond our expected preamble */
+ WARN_ON_ONCE(len >= MAX_PREAMBLE);
+
+ if (list_empty(&dup_failed_modules))
+ goto out;
+
+ len += scnprintf(buf + len, size - len, "Duplicate failed modules:\n");
+ len += scnprintf(buf + len, size - len, "%25s\t%15s\t%25s\n",
+ "Module-name", "How-many-times", "Reason");
+ mutex_lock(&module_mutex);
+
+
+ list_for_each_entry_rcu(mod_fail, &dup_failed_modules, list) {
+ if (WARN_ON_ONCE(++count_failed >= MAX_FAILED_MOD_PRINT))
+ goto out_unlock;
+ len += scnprintf(buf + len, size - len, "%25s\t%15lu\t%25s\n", mod_fail->name,
+ atomic_long_read(&mod_fail->count), mod_fail_to_str(mod_fail));
+ }
+out_unlock:
+ mutex_unlock(&module_mutex);
+out:
+ kfree(buf);
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+#undef MAX_PREAMBLE
+#undef MAX_FAILED_MOD_PRINT
+#undef MAX_BYTES_PER_MOD
+
+static const struct file_operations fops_mod_stats = {
+ .read = read_file_mod_stats,
+ .open = simple_open,
+ .owner = THIS_MODULE,
+ .llseek = default_llseek,
+};
+
+#define mod_debug_add_ulong(name) debugfs_create_ulong(#name, 0400, mod_debugfs_root, (unsigned long *) &name.counter)
+#define mod_debug_add_atomic(name) debugfs_create_atomic_t(#name, 0400, mod_debugfs_root, &name)
+static int __init module_stats_init(void)
+{
+ mod_debug_add_ulong(total_mod_size);
+ mod_debug_add_ulong(total_text_size);
+ mod_debug_add_ulong(invalid_kread_bytes);
+ mod_debug_add_ulong(invalid_decompress_bytes);
+ mod_debug_add_ulong(invalid_becoming_bytes);
+ mod_debug_add_ulong(invalid_mod_bytes);
+
+ mod_debug_add_atomic(modcount);
+ mod_debug_add_atomic(failed_kreads);
+ mod_debug_add_atomic(failed_decompress);
+ mod_debug_add_atomic(failed_becoming);
+ mod_debug_add_atomic(failed_load_modules);
+
+ debugfs_create_file("stats", 0400, mod_debugfs_root, mod_debugfs_root, &fops_mod_stats);
+
+ return 0;
+}
+#undef mod_debug_add_ulong
+#undef mod_debug_add_atomic
+module_init(module_stats_init);
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 14fbea66f12f..a2b656b4e3d2 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -11,82 +11,25 @@
#include <linux/set_memory.h>
#include "internal.h"
-/*
- * LKM RO/NX protection: protect module's text/ro-data
- * from modification and any data from execution.
- *
- * General layout of module is:
- * [text] [read-only-data] [ro-after-init] [writable data]
- * text_size -----^ ^ ^ ^
- * ro_size ------------------------| | |
- * ro_after_init_size -----------------------------| |
- * size -----------------------------------------------------------|
- *
- * These values are always page-aligned (as is base) when
- * CONFIG_STRICT_MODULE_RWX is set.
- */
+static void module_set_memory(const struct module *mod, enum mod_mem_type type,
+ int (*set_memory)(unsigned long start, int num_pages))
+{
+ const struct module_memory *mod_mem = &mod->mem[type];
+
+ set_vm_flush_reset_perms(mod_mem->base);
+ set_memory((unsigned long)mod_mem->base, mod_mem->size >> PAGE_SHIFT);
+}
/*
* Since some arches are moving towards PAGE_KERNEL module allocations instead
- * of PAGE_KERNEL_EXEC, keep frob_text() and module_enable_x() independent of
+ * of PAGE_KERNEL_EXEC, keep module_enable_x() independent of
* CONFIG_STRICT_MODULE_RWX because they are needed regardless of whether we
* are strict.
*/
-static void frob_text(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- set_memory((unsigned long)layout->base,
- PAGE_ALIGN(layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_rodata(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- set_memory((unsigned long)layout->base + layout->text_size,
- (layout->ro_size - layout->text_size) >> PAGE_SHIFT);
-}
-
-static void frob_ro_after_init(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- set_memory((unsigned long)layout->base + layout->ro_size,
- (layout->ro_after_init_size - layout->ro_size) >> PAGE_SHIFT);
-}
-
-static void frob_writable_data(const struct module_layout *layout,
- int (*set_memory)(unsigned long start, int num_pages))
-{
- set_memory((unsigned long)layout->base + layout->ro_after_init_size,
- (layout->size - layout->ro_after_init_size) >> PAGE_SHIFT);
-}
-
-static bool layout_check_misalignment(const struct module_layout *layout)
-{
- return WARN_ON(!PAGE_ALIGNED(layout->base)) ||
- WARN_ON(!PAGE_ALIGNED(layout->text_size)) ||
- WARN_ON(!PAGE_ALIGNED(layout->ro_size)) ||
- WARN_ON(!PAGE_ALIGNED(layout->ro_after_init_size)) ||
- WARN_ON(!PAGE_ALIGNED(layout->size));
-}
-
-bool module_check_misalignment(const struct module *mod)
-{
- if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
- return false;
-
- return layout_check_misalignment(&mod->core_layout) ||
- layout_check_misalignment(&mod->data_layout) ||
- layout_check_misalignment(&mod->init_layout);
-}
-
void module_enable_x(const struct module *mod)
{
- if (!PAGE_ALIGNED(mod->core_layout.base) ||
- !PAGE_ALIGNED(mod->init_layout.base))
- return;
-
- frob_text(&mod->core_layout, set_memory_x);
- frob_text(&mod->init_layout, set_memory_x);
+ for_class_mod_mem_type(type, text)
+ module_set_memory(mod, type, set_memory_x);
}
void module_enable_ro(const struct module *mod, bool after_init)
@@ -98,16 +41,13 @@ void module_enable_ro(const struct module *mod, bool after_init)
return;
#endif
- set_vm_flush_reset_perms(mod->core_layout.base);
- set_vm_flush_reset_perms(mod->init_layout.base);
- frob_text(&mod->core_layout, set_memory_ro);
-
- frob_rodata(&mod->data_layout, set_memory_ro);
- frob_text(&mod->init_layout, set_memory_ro);
- frob_rodata(&mod->init_layout, set_memory_ro);
+ module_set_memory(mod, MOD_TEXT, set_memory_ro);
+ module_set_memory(mod, MOD_INIT_TEXT, set_memory_ro);
+ module_set_memory(mod, MOD_RODATA, set_memory_ro);
+ module_set_memory(mod, MOD_INIT_RODATA, set_memory_ro);
if (after_init)
- frob_ro_after_init(&mod->data_layout, set_memory_ro);
+ module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
}
void module_enable_nx(const struct module *mod)
@@ -115,11 +55,8 @@ void module_enable_nx(const struct module *mod)
if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
return;
- frob_rodata(&mod->data_layout, set_memory_nx);
- frob_ro_after_init(&mod->data_layout, set_memory_nx);
- frob_writable_data(&mod->data_layout, set_memory_nx);
- frob_rodata(&mod->init_layout, set_memory_nx);
- frob_writable_data(&mod->init_layout, set_memory_nx);
+ for_class_mod_mem_type(type, data)
+ module_set_memory(mod, type, set_memory_nx);
}
int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c
index 26d812e07615..16742d1c630c 100644
--- a/kernel/module/tracking.c
+++ b/kernel/module/tracking.c
@@ -15,6 +15,7 @@
#include "internal.h"
static LIST_HEAD(unloaded_tainted_modules);
+extern struct dentry *mod_debugfs_root;
int try_add_tainted_module(struct module *mod)
{
@@ -120,12 +121,8 @@ static const struct file_operations unloaded_tainted_modules_fops = {
static int __init unloaded_tainted_modules_init(void)
{
- struct dentry *dir;
-
- dir = debugfs_create_dir("modules", NULL);
- debugfs_create_file("unloaded_tainted", 0444, dir, NULL,
+ debugfs_create_file("unloaded_tainted", 0444, mod_debugfs_root, NULL,
&unloaded_tainted_modules_fops);
-
return 0;
}
module_init(unloaded_tainted_modules_init);
diff --git a/kernel/module/tree_lookup.c b/kernel/module/tree_lookup.c
index 8ec5cfd60496..277197977d43 100644
--- a/kernel/module/tree_lookup.c
+++ b/kernel/module/tree_lookup.c
@@ -21,16 +21,16 @@
static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n)
{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
- return (unsigned long)layout->base;
+ return (unsigned long)mod_mem->base;
}
static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n)
{
- struct module_layout *layout = container_of(n, struct module_layout, mtn.node);
+ struct module_memory *mod_mem = container_of(n, struct module_memory, mtn.node);
- return (unsigned long)layout->size;
+ return (unsigned long)mod_mem->size;
}
static __always_inline bool
@@ -77,32 +77,27 @@ static void __mod_tree_remove(struct mod_tree_node *node, struct mod_tree_root *
*/
void mod_tree_insert(struct module *mod)
{
- mod->core_layout.mtn.mod = mod;
- mod->init_layout.mtn.mod = mod;
-
- __mod_tree_insert(&mod->core_layout.mtn, &mod_tree);
- if (mod->init_layout.size)
- __mod_tree_insert(&mod->init_layout.mtn, &mod_tree);
-
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- mod->data_layout.mtn.mod = mod;
- __mod_tree_insert(&mod->data_layout.mtn, &mod_data_tree);
-#endif
+ for_each_mod_mem_type(type) {
+ mod->mem[type].mtn.mod = mod;
+ if (mod->mem[type].size)
+ __mod_tree_insert(&mod->mem[type].mtn, &mod_tree);
+ }
}
void mod_tree_remove_init(struct module *mod)
{
- if (mod->init_layout.size)
- __mod_tree_remove(&mod->init_layout.mtn, &mod_tree);
+ for_class_mod_mem_type(type, init) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
}
void mod_tree_remove(struct module *mod)
{
- __mod_tree_remove(&mod->core_layout.mtn, &mod_tree);
- mod_tree_remove_init(mod);
-#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
- __mod_tree_remove(&mod->data_layout.mtn, &mod_data_tree);
-#endif
+ for_each_mod_mem_type(type) {
+ if (mod->mem[type].size)
+ __mod_tree_remove(&mod->mem[type].mtn, &mod_tree);
+ }
}
struct module *mod_find(unsigned long addr, struct mod_tree_root *tree)
diff --git a/kernel/notifier.c b/kernel/notifier.c
index d353e4b5402d..b3ce28f39eb6 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -7,6 +7,9 @@
#include <linux/vmalloc.h>
#include <linux/reboot.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/notifier.h>
+
/*
* Notifier list for kernel code which wants to be called
* at shutdown. This is used to stop any idling DMA operations
@@ -37,6 +40,7 @@ static int notifier_chain_register(struct notifier_block **nl,
}
n->next = *nl;
rcu_assign_pointer(*nl, n);
+ trace_notifier_register((void *)n->notifier_call);
return 0;
}
@@ -46,6 +50,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
while ((*nl) != NULL) {
if ((*nl) == n) {
rcu_assign_pointer(*nl, n->next);
+ trace_notifier_unregister((void *)n->notifier_call);
return 0;
}
nl = &((*nl)->next);
@@ -84,6 +89,7 @@ static int notifier_call_chain(struct notifier_block **nl,
continue;
}
#endif
+ trace_notifier_run((void *)nb->notifier_call);
ret = nb->notifier_call(nb, val, v);
if (nr_calls)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a487ff24129b..80d9c6d77a45 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -545,21 +545,20 @@ static void commit_nsset(struct nsset *nsset)
SYSCALL_DEFINE2(setns, int, fd, int, flags)
{
- struct file *file;
+ struct fd f = fdget(fd);
struct ns_common *ns = NULL;
struct nsset nsset = {};
int err = 0;
- file = fget(fd);
- if (!file)
+ if (!f.file)
return -EBADF;
- if (proc_ns_file(file)) {
- ns = get_proc_ns(file_inode(file));
+ if (proc_ns_file(f.file)) {
+ ns = get_proc_ns(file_inode(f.file));
if (flags && (ns->ops->type != flags))
err = -EINVAL;
flags = ns->ops->type;
- } else if (!IS_ERR(pidfd_pid(file))) {
+ } else if (!IS_ERR(pidfd_pid(f.file))) {
err = check_setns_flags(flags);
} else {
err = -EINVAL;
@@ -571,17 +570,17 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
if (err)
goto out;
- if (proc_ns_file(file))
+ if (proc_ns_file(f.file))
err = validate_ns(&nsset, ns);
else
- err = validate_nsset(&nsset, file->private_data);
+ err = validate_nsset(&nsset, f.file->private_data);
if (!err) {
commit_nsset(&nsset);
perf_event_namespaces(current);
}
put_nsset(&nsset);
out:
- fput(file);
+ fdput(f);
return err;
}
diff --git a/kernel/padata.c b/kernel/padata.c
index e007b8a4b738..222d60195de6 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -491,7 +491,7 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
return;
/* Ensure at least one thread when size < min_chunk. */
- nworks = max(job->size / job->min_chunk, 1ul);
+ nworks = max(job->size / max(job->min_chunk, job->align), 1ul);
nworks = min(nworks, job->max_threads);
if (nworks == 1) {
@@ -967,7 +967,7 @@ static const struct sysfs_ops padata_sysfs_ops = {
.store = padata_sysfs_store,
};
-static struct kobj_type padata_attr_type = {
+static const struct kobj_type padata_attr_type = {
.sysfs_ops = &padata_sysfs_ops,
.default_groups = padata_default_groups,
.release = padata_sysfs_release,
diff --git a/kernel/panic.c b/kernel/panic.c
index 5cfea8302d23..886d2ebd0a0d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -141,7 +141,7 @@ EXPORT_SYMBOL(panic_blink);
/*
* Stop ourself in panic -- architecture code may override this
*/
-void __weak panic_smp_self_stop(void)
+void __weak __noreturn panic_smp_self_stop(void)
{
while (1)
cpu_relax();
@@ -151,7 +151,7 @@ void __weak panic_smp_self_stop(void)
* Stop ourselves in NMI context if another CPU has already panicked. Arch code
* may override this to prepare for crash dumping, e.g. save regs info.
*/
-void __weak nmi_panic_self_stop(struct pt_regs *regs)
+void __weak __noreturn nmi_panic_self_stop(struct pt_regs *regs)
{
panic_smp_self_stop();
}
diff --git a/kernel/params.c b/kernel/params.c
index 6e34ca89ebae..6a7548979aa9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -948,7 +948,7 @@ static void module_kobj_release(struct kobject *kobj)
complete(mk->kobj_completion);
}
-struct kobj_type module_ktype = {
+const struct kobj_type module_ktype = {
.release = module_kobj_release,
.sysfs_ops = &module_sysfs_ops,
};
diff --git a/kernel/pid.c b/kernel/pid.c
index 3fbc5e46b721..f93954a0384d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
*/
int pidfd_create(struct pid *pid, unsigned int flags)
{
- int fd;
-
- if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
- return -EINVAL;
+ int pidfd;
+ struct file *pidfd_file;
- if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
- return -EINVAL;
-
- fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
- flags | O_RDWR | O_CLOEXEC);
- if (fd < 0)
- put_pid(pid);
+ pidfd = pidfd_prepare(pid, flags, &pidfd_file);
+ if (pidfd < 0)
+ return pidfd;
- return fd;
+ fd_install(pidfd, pidfd_file);
+ return pidfd;
}
/**
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 31ec4a9b9d70..3113ec2f1db4 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -6,6 +6,7 @@
* Copyright (c) 2003 Open Source Development Lab
*/
+#include <linux/acpi.h>
#include <linux/export.h>
#include <linux/kobject.h>
#include <linux/string.h>
@@ -83,6 +84,19 @@ int unregister_pm_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
+void pm_report_hw_sleep_time(u64 t)
+{
+ suspend_stats.last_hw_sleep = t;
+ suspend_stats.total_hw_sleep += t;
+}
+EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time);
+
+void pm_report_max_hw_sleep(u64 t)
+{
+ suspend_stats.max_hw_sleep = t;
+}
+EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep);
+
int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down)
{
int ret;
@@ -314,24 +328,27 @@ static char *suspend_step_name(enum suspend_stat_step step)
}
}
-#define suspend_attr(_name) \
+#define suspend_attr(_name, format_str) \
static ssize_t _name##_show(struct kobject *kobj, \
struct kobj_attribute *attr, char *buf) \
{ \
- return sprintf(buf, "%d\n", suspend_stats._name); \
+ return sprintf(buf, format_str, suspend_stats._name); \
} \
static struct kobj_attribute _name = __ATTR_RO(_name)
-suspend_attr(success);
-suspend_attr(fail);
-suspend_attr(failed_freeze);
-suspend_attr(failed_prepare);
-suspend_attr(failed_suspend);
-suspend_attr(failed_suspend_late);
-suspend_attr(failed_suspend_noirq);
-suspend_attr(failed_resume);
-suspend_attr(failed_resume_early);
-suspend_attr(failed_resume_noirq);
+suspend_attr(success, "%d\n");
+suspend_attr(fail, "%d\n");
+suspend_attr(failed_freeze, "%d\n");
+suspend_attr(failed_prepare, "%d\n");
+suspend_attr(failed_suspend, "%d\n");
+suspend_attr(failed_suspend_late, "%d\n");
+suspend_attr(failed_suspend_noirq, "%d\n");
+suspend_attr(failed_resume, "%d\n");
+suspend_attr(failed_resume_early, "%d\n");
+suspend_attr(failed_resume_noirq, "%d\n");
+suspend_attr(last_hw_sleep, "%llu\n");
+suspend_attr(total_hw_sleep, "%llu\n");
+suspend_attr(max_hw_sleep, "%llu\n");
static ssize_t last_failed_dev_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -391,12 +408,30 @@ static struct attribute *suspend_attrs[] = {
&last_failed_dev.attr,
&last_failed_errno.attr,
&last_failed_step.attr,
+ &last_hw_sleep.attr,
+ &total_hw_sleep.attr,
+ &max_hw_sleep.attr,
NULL,
};
+static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx)
+{
+ if (attr != &last_hw_sleep.attr &&
+ attr != &total_hw_sleep.attr &&
+ attr != &max_hw_sleep.attr)
+ return 0444;
+
+#ifdef CONFIG_ACPI
+ if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)
+ return 0444;
+#endif
+ return 0;
+}
+
static const struct attribute_group suspend_attr_group = {
.name = "suspend_stats",
.attrs = suspend_attrs,
+ .is_visible = suspend_attr_is_visible,
};
#ifdef CONFIG_DEBUG_FS
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 6c1c7e566d35..cae81a87cc91 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -93,7 +93,7 @@ static int try_to_freeze_tasks(bool user_only)
todo - wq_busy, wq_busy);
if (wq_busy)
- show_all_workqueues();
+ show_freezable_workqueues();
if (!wakeup || pm_debug_messages_on) {
read_lock(&tasklist_lock);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index fd0c9f913940..6a333adce3b3 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -71,6 +71,8 @@ EXPORT_SYMBOL_GPL(console_printk);
atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);
+EXPORT_TRACEPOINT_SYMBOL_GPL(console);
+
/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
@@ -89,7 +91,7 @@ static DEFINE_MUTEX(console_mutex);
* console_sem protects updates to console->seq and console_suspended,
* and also provides serialization for console printing.
*/
-static DEFINE_SEMAPHORE(console_sem);
+static DEFINE_SEMAPHORE(console_sem, 1);
HLIST_HEAD(console_list);
EXPORT_SYMBOL_GPL(console_list);
DEFINE_STATIC_SRCU(console_srcu);
@@ -730,7 +732,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
size_t len = iov_iter_count(from);
ssize_t ret = len;
- if (!user || len > PRINTKRB_RECORD_MAX)
+ if (len > PRINTKRB_RECORD_MAX)
return -EINVAL;
/* Ignore when user logging is disabled. */
@@ -792,9 +794,6 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
};
ssize_t ret;
- if (!user)
- return -EBADF;
-
ret = mutex_lock_interruptible(&user->lock);
if (ret)
return ret;
@@ -859,8 +858,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
struct devkmsg_user *user = file->private_data;
loff_t ret = 0;
- if (!user)
- return -EBADF;
if (offset)
return -ESPIPE;
@@ -893,9 +890,6 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
struct printk_info info;
__poll_t ret = 0;
- if (!user)
- return EPOLLERR|EPOLLNVAL;
-
poll_wait(file, &log_wait, wait);
if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) {
@@ -944,9 +938,6 @@ static int devkmsg_release(struct inode *inode, struct file *file)
{
struct devkmsg_user *user = file->private_data;
- if (!user)
- return 0;
-
ratelimit_state_exit(&user->rs);
mutex_destroy(&user->lock);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0786450074c1..443057bee87c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,7 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/minmax.h>
+#include <linux/syscall_user_dispatch.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -1259,6 +1260,14 @@ int ptrace_request(struct task_struct *child, long request,
break;
#endif
+ case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_set_config(child, addr, datavp);
+ break;
+
+ case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_get_config(child, addr, datavp);
+ break;
+
default:
break;
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index ab62074174c3..9071182b1284 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -53,9 +53,6 @@ config RCU_EXPERT
Say N if you are unsure.
-config SRCU
- def_bool y
-
config TINY_SRCU
bool
default y if TINY_RCU
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 115616ac3bfa..4a1b9622598b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -14,6 +14,43 @@
/*
* Grace-period counter management.
+ *
+ * The two least significant bits contain the control flags.
+ * The most significant bits contain the grace-period sequence counter.
+ *
+ * When both control flags are zero, no grace period is in progress.
+ * When either bit is non-zero, a grace period has started and is in
+ * progress. When the grace period completes, the control flags are reset
+ * to 0 and the grace-period sequence counter is incremented.
+ *
+ * However some specific RCU usages make use of custom values.
+ *
+ * SRCU special control values:
+ *
+ * SRCU_SNP_INIT_SEQ : Invalid/init value set when SRCU node
+ * is initialized.
+ *
+ * SRCU_STATE_IDLE : No SRCU gp is in progress
+ *
+ * SRCU_STATE_SCAN1 : State set by rcu_seq_start(). Indicates
+ * we are scanning the readers on the slot
+ * defined as inactive (there might well
+ * be pending readers that will use that
+ * index, but their number is bounded).
+ *
+ * SRCU_STATE_SCAN2 : State set manually via rcu_seq_set_state()
+ * Indicates we are flipping the readers
+ * index and then scanning the readers on the
+ * slot newly designated as inactive (again,
+ * the number of pending readers that will use
+ * this inactive index is bounded).
+ *
+ * RCU polled GP special control value:
+ *
+ * RCU_GET_STATE_COMPLETED : State value indicating an already-completed
+ * polled GP has completed. This value covers
+ * both the state and the counter of the
+ * grace-period sequence number.
*/
#define RCU_SEQ_CTR_SHIFT 2
@@ -341,11 +378,13 @@ extern void rcu_init_geometry(void);
* specified state structure (for SRCU) or the only rcu_state structure
* (for RCU).
*/
-#define srcu_for_each_node_breadth_first(sp, rnp) \
+#define _rcu_for_each_node_breadth_first(sp, rnp) \
for ((rnp) = &(sp)->node[0]; \
(rnp) < &(sp)->node[rcu_num_nodes]; (rnp)++)
#define rcu_for_each_node_breadth_first(rnp) \
- srcu_for_each_node_breadth_first(&rcu_state, rnp)
+ _rcu_for_each_node_breadth_first(&rcu_state, rnp)
+#define srcu_for_each_node_breadth_first(ssp, rnp) \
+ _rcu_for_each_node_breadth_first(ssp->srcu_sup, rnp)
/*
* Scan the leaves of the rcu_node hierarchy for the rcu_state structure.
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 91fb5905a008..e82ec9f9a5d8 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -631,8 +631,7 @@ static int compute_real(int n)
static int
rcu_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq,
- atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
+ wait_event_idle(shutdown_wq, atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters);
smp_mb(); /* Wake before output. */
rcu_scale_cleanup();
kernel_power_off();
@@ -716,7 +715,7 @@ kfree_scale_thread(void *arg)
// is tested.
if ((kfree_rcu_test_single && !kfree_rcu_test_double) ||
(kfree_rcu_test_both && torture_random(&tr) & 0x800))
- kfree_rcu(alloc_ptr);
+ kfree_rcu_mightsleep(alloc_ptr);
else
kfree_rcu(alloc_ptr, rh);
}
@@ -771,8 +770,8 @@ kfree_scale_cleanup(void)
static int
kfree_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq,
- atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
+ wait_event_idle(shutdown_wq,
+ atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads);
smp_mb(); /* Wake before output. */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 8e6c023212cb..147551c23baf 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -119,7 +119,9 @@ torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
+torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
+torture_param(int, test_srcu_lockdep, 0, "Test specified SRCU deadlock scenario.");
torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
static char *torture_type = "rcu";
@@ -179,7 +181,6 @@ static atomic_t n_rcu_torture_mbchk_tries;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_barrier_error;
static long n_rcu_torture_boost_ktrerror;
-static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static atomic_long_t n_rcu_torture_timers;
@@ -2194,12 +2195,11 @@ rcu_torture_stats_print(void)
atomic_read(&n_rcu_torture_alloc),
atomic_read(&n_rcu_torture_alloc_fail),
atomic_read(&n_rcu_torture_free));
- pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ",
+ pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld ",
atomic_read(&n_rcu_torture_mberror),
atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries),
n_rcu_torture_barrier_error,
- n_rcu_torture_boost_ktrerror,
- n_rcu_torture_boost_rterror);
+ n_rcu_torture_boost_ktrerror);
pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
@@ -2217,15 +2217,13 @@ rcu_torture_stats_print(void)
if (atomic_read(&n_rcu_torture_mberror) ||
atomic_read(&n_rcu_torture_mbchk_fail) ||
n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror ||
- n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure ||
- i > 1) {
+ n_rcu_torture_boost_failure || i > 1) {
pr_cont("%s", "!!! ");
atomic_inc(&n_rcu_torture_error);
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror));
WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail));
WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier()
WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread
- WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio
WARN_ON_ONCE(n_rcu_torture_boost_failure); // boost failed (TIMER_SOFTIRQ RT prio?)
WARN_ON_ONCE(i > 1); // Too-short grace period
}
@@ -2358,7 +2356,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d "
"read_exit_delay=%d read_exit_burst=%d "
- "nocbs_nthreads=%d nocbs_toggle=%d\n",
+ "nocbs_nthreads=%d nocbs_toggle=%d "
+ "test_nmis=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -2369,7 +2368,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
n_barrier_cbs,
onoff_interval, onoff_holdoff,
read_exit_delay, read_exit_burst,
- nocbs_nthreads, nocbs_toggle);
+ nocbs_nthreads, nocbs_toggle,
+ test_nmis);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -3273,6 +3273,29 @@ static void rcu_torture_read_exit_cleanup(void)
torture_stop_kthread(rcutorture_read_exit, read_exit_task);
}
+static void rcutorture_test_nmis(int n)
+{
+#if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ int cpu;
+ int dumpcpu;
+ int i;
+
+ for (i = 0; i < n; i++) {
+ preempt_disable();
+ cpu = smp_processor_id();
+ dumpcpu = cpu + 1;
+ if (dumpcpu >= nr_cpu_ids)
+ dumpcpu = 0;
+ pr_alert("%s: CPU %d invoking dump_cpu_task(%d)\n", __func__, cpu, dumpcpu);
+ dump_cpu_task(dumpcpu);
+ preempt_enable();
+ schedule_timeout_uninterruptible(15 * HZ);
+ }
+#else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+ WARN_ONCE(n, "Non-zero rcutorture.test_nmis=%d permitted only when rcutorture is built in.\n", test_nmis);
+#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
+}
+
static enum cpuhp_state rcutor_hp;
static void
@@ -3297,6 +3320,8 @@ rcu_torture_cleanup(void)
return;
}
+ rcutorture_test_nmis(test_nmis);
+
if (cur_ops->gp_kthread_dbg)
cur_ops->gp_kthread_dbg();
rcu_torture_read_exit_cleanup();
@@ -3463,6 +3488,188 @@ static void rcutorture_sync(void)
cur_ops->sync();
}
+static DEFINE_MUTEX(mut0);
+static DEFINE_MUTEX(mut1);
+static DEFINE_MUTEX(mut2);
+static DEFINE_MUTEX(mut3);
+static DEFINE_MUTEX(mut4);
+static DEFINE_MUTEX(mut5);
+static DEFINE_MUTEX(mut6);
+static DEFINE_MUTEX(mut7);
+static DEFINE_MUTEX(mut8);
+static DEFINE_MUTEX(mut9);
+
+static DECLARE_RWSEM(rwsem0);
+static DECLARE_RWSEM(rwsem1);
+static DECLARE_RWSEM(rwsem2);
+static DECLARE_RWSEM(rwsem3);
+static DECLARE_RWSEM(rwsem4);
+static DECLARE_RWSEM(rwsem5);
+static DECLARE_RWSEM(rwsem6);
+static DECLARE_RWSEM(rwsem7);
+static DECLARE_RWSEM(rwsem8);
+static DECLARE_RWSEM(rwsem9);
+
+DEFINE_STATIC_SRCU(srcu0);
+DEFINE_STATIC_SRCU(srcu1);
+DEFINE_STATIC_SRCU(srcu2);
+DEFINE_STATIC_SRCU(srcu3);
+DEFINE_STATIC_SRCU(srcu4);
+DEFINE_STATIC_SRCU(srcu5);
+DEFINE_STATIC_SRCU(srcu6);
+DEFINE_STATIC_SRCU(srcu7);
+DEFINE_STATIC_SRCU(srcu8);
+DEFINE_STATIC_SRCU(srcu9);
+
+static int srcu_lockdep_next(const char *f, const char *fl, const char *fs, const char *fu, int i,
+ int cyclelen, int deadlock)
+{
+ int j = i + 1;
+
+ if (j >= cyclelen)
+ j = deadlock ? 0 : -1;
+ if (j >= 0)
+ pr_info("%s: %s(%d), %s(%d), %s(%d)\n", f, fl, i, fs, j, fu, i);
+ else
+ pr_info("%s: %s(%d), %s(%d)\n", f, fl, i, fu, i);
+ return j;
+}
+
+// Test lockdep on SRCU-based deadlock scenarios.
+static void rcu_torture_init_srcu_lockdep(void)
+{
+ int cyclelen;
+ int deadlock;
+ bool err = false;
+ int i;
+ int j;
+ int idx;
+ struct mutex *muts[] = { &mut0, &mut1, &mut2, &mut3, &mut4,
+ &mut5, &mut6, &mut7, &mut8, &mut9 };
+ struct rw_semaphore *rwsems[] = { &rwsem0, &rwsem1, &rwsem2, &rwsem3, &rwsem4,
+ &rwsem5, &rwsem6, &rwsem7, &rwsem8, &rwsem9 };
+ struct srcu_struct *srcus[] = { &srcu0, &srcu1, &srcu2, &srcu3, &srcu4,
+ &srcu5, &srcu6, &srcu7, &srcu8, &srcu9 };
+ int testtype;
+
+ if (!test_srcu_lockdep)
+ return;
+
+ deadlock = test_srcu_lockdep / 1000;
+ testtype = (test_srcu_lockdep / 10) % 100;
+ cyclelen = test_srcu_lockdep % 10;
+ WARN_ON_ONCE(ARRAY_SIZE(muts) != ARRAY_SIZE(srcus));
+ if (WARN_ONCE(deadlock != !!deadlock,
+ "%s: test_srcu_lockdep=%d and deadlock digit %d must be zero or one.\n",
+ __func__, test_srcu_lockdep, deadlock))
+ err = true;
+ if (WARN_ONCE(cyclelen <= 0,
+ "%s: test_srcu_lockdep=%d and cycle-length digit %d must be greater than zero.\n",
+ __func__, test_srcu_lockdep, cyclelen))
+ err = true;
+ if (err)
+ goto err_out;
+
+ if (testtype == 0) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ j = srcu_lockdep_next(__func__, "srcu_read_lock", "synchronize_srcu",
+ "srcu_read_unlock", i, cyclelen, deadlock);
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+
+ if (testtype == 1) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/mutex %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), mutex_lock(%d), mutex_unlock(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ mutex_lock(muts[i]);
+ mutex_unlock(muts[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "mutex_lock", "synchronize_srcu",
+ "mutex_unlock", i, cyclelen, deadlock);
+ mutex_lock(muts[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ mutex_unlock(muts[i]);
+ }
+ return;
+ }
+
+ if (testtype == 2) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU/rwsem %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ for (i = 0; i < cyclelen; i++) {
+ pr_info("%s: srcu_read_lock(%d), down_read(%d), up_read(%d), srcu_read_unlock(%d)\n",
+ __func__, i, i, i, i);
+ idx = srcu_read_lock(srcus[i]);
+ down_read(rwsems[i]);
+ up_read(rwsems[i]);
+ srcu_read_unlock(srcus[i], idx);
+
+ j = srcu_lockdep_next(__func__, "down_write", "synchronize_srcu",
+ "up_write", i, cyclelen, deadlock);
+ down_write(rwsems[i]);
+ if (j >= 0)
+ synchronize_srcu(srcus[j]);
+ up_write(rwsems[i]);
+ }
+ return;
+ }
+
+#ifdef CONFIG_TASKS_TRACE_RCU
+ if (testtype == 3) {
+ pr_info("%s: test_srcu_lockdep = %05d: SRCU and Tasks Trace RCU %d-way %sdeadlock.\n",
+ __func__, test_srcu_lockdep, cyclelen, deadlock ? "" : "non-");
+ if (deadlock && cyclelen == 1)
+ pr_info("%s: Expect hang.\n", __func__);
+ for (i = 0; i < cyclelen; i++) {
+ char *fl = i == 0 ? "rcu_read_lock_trace" : "srcu_read_lock";
+ char *fs = i == cyclelen - 1 ? "synchronize_rcu_tasks_trace"
+ : "synchronize_srcu";
+ char *fu = i == 0 ? "rcu_read_unlock_trace" : "srcu_read_unlock";
+
+ j = srcu_lockdep_next(__func__, fl, fs, fu, i, cyclelen, deadlock);
+ if (i == 0)
+ rcu_read_lock_trace();
+ else
+ idx = srcu_read_lock(srcus[i]);
+ if (j >= 0) {
+ if (i == cyclelen - 1)
+ synchronize_rcu_tasks_trace();
+ else
+ synchronize_srcu(srcus[j]);
+ }
+ if (i == 0)
+ rcu_read_unlock_trace();
+ else
+ srcu_read_unlock(srcus[i], idx);
+ }
+ return;
+ }
+#endif // #ifdef CONFIG_TASKS_TRACE_RCU
+
+err_out:
+ pr_info("%s: test_srcu_lockdep = %05d does nothing.\n", __func__, test_srcu_lockdep);
+ pr_info("%s: test_srcu_lockdep = DNNL.\n", __func__);
+ pr_info("%s: D: Deadlock if nonzero.\n", __func__);
+ pr_info("%s: NN: Test number, 0=SRCU, 1=SRCU/mutex, 2=SRCU/rwsem, 3=SRCU/Tasks Trace RCU.\n", __func__);
+ pr_info("%s: L: Cycle length.\n", __func__);
+ if (!IS_ENABLED(CONFIG_TASKS_TRACE_RCU))
+ pr_info("%s: NN=3 disallowed because kernel is built with CONFIG_TASKS_TRACE_RCU=n\n", __func__);
+}
+
static int __init
rcu_torture_init(void)
{
@@ -3501,9 +3708,17 @@ rcu_torture_init(void)
pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
+ if (nocbs_nthreads != 0 && (cur_ops != &rcu_ops ||
+ !IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ pr_alert("rcu-torture types: %s and CONFIG_RCU_NOCB_CPU=%d, nocb toggle disabled.\n",
+ cur_ops->name, IS_ENABLED(CONFIG_RCU_NOCB_CPU));
+ nocbs_nthreads = 0;
+ }
if (cur_ops->init)
cur_ops->init();
+ rcu_torture_init_srcu_lockdep();
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -3540,7 +3755,6 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_barrier_error = 0;
n_rcu_torture_boost_ktrerror = 0;
- n_rcu_torture_boost_rterror = 0;
n_rcu_torture_boost_failure = 0;
n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index afa3e1a2f690..1970ce5f22d4 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -1031,7 +1031,7 @@ ref_scale_cleanup(void)
static int
ref_scale_shutdown(void *arg)
{
- wait_event(shutdown_wq, shutdown_start);
+ wait_event_idle(shutdown_wq, shutdown_start);
smp_mb(); // Wake before output.
ref_scale_cleanup();
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index b12fb0cec44d..336af24e0fe3 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -197,6 +197,8 @@ void synchronize_srcu(struct srcu_struct *ssp)
{
struct rcu_synchronize rs;
+ srcu_lock_sync(&ssp->dep_map);
+
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ab4ee58af84b..20d7a238d675 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -103,7 +103,7 @@ do { \
#define spin_trylock_irqsave_rcu_node(p, flags) \
({ \
- bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
+ bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
\
if (___locked) \
smp_mb__after_unlock_lock(); \
@@ -135,8 +135,8 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
rcu_segcblist_init(&sdp->srcu_cblist);
sdp->srcu_cblist_invoking = false;
- sdp->srcu_gp_seq_needed = ssp->srcu_gp_seq;
- sdp->srcu_gp_seq_needed_exp = ssp->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
+ sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
sdp->mynode = NULL;
sdp->cpu = cpu;
INIT_WORK(&sdp->work, srcu_invoke_callbacks);
@@ -173,14 +173,14 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
/* Initialize geometry if it has not already been initialized. */
rcu_init_geometry();
- ssp->node = kcalloc(rcu_num_nodes, sizeof(*ssp->node), gfp_flags);
- if (!ssp->node)
+ ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
+ if (!ssp->srcu_sup->node)
return false;
/* Work out the overall tree geometry. */
- ssp->level[0] = &ssp->node[0];
+ ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0];
for (i = 1; i < rcu_num_lvls; i++)
- ssp->level[i] = ssp->level[i - 1] + num_rcu_lvl[i - 1];
+ ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1];
rcu_init_levelspread(levelspread, num_rcu_lvl);
/* Each pass through this loop initializes one srcu_node structure. */
@@ -195,17 +195,17 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
snp->grplo = -1;
snp->grphi = -1;
- if (snp == &ssp->node[0]) {
+ if (snp == &ssp->srcu_sup->node[0]) {
/* Root node, special case. */
snp->srcu_parent = NULL;
continue;
}
/* Non-root node. */
- if (snp == ssp->level[level + 1])
+ if (snp == ssp->srcu_sup->level[level + 1])
level++;
- snp->srcu_parent = ssp->level[level - 1] +
- (snp - ssp->level[level]) /
+ snp->srcu_parent = ssp->srcu_sup->level[level - 1] +
+ (snp - ssp->srcu_sup->level[level]) /
levelspread[level - 1];
}
@@ -214,7 +214,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
* leaves of the srcu_node tree.
*/
level = rcu_num_lvls - 1;
- snp_first = ssp->level[level];
+ snp_first = ssp->srcu_sup->level[level];
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
sdp->mynode = &snp_first[cpu / levelspread[level]];
@@ -225,7 +225,7 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
}
sdp->grpmask = 1 << (cpu - sdp->mynode->grplo);
}
- smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
return true;
}
@@ -236,36 +236,47 @@ static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
*/
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
- ssp->srcu_size_state = SRCU_SIZE_SMALL;
- ssp->node = NULL;
- mutex_init(&ssp->srcu_cb_mutex);
- mutex_init(&ssp->srcu_gp_mutex);
+ if (!is_static)
+ ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
+ if (!ssp->srcu_sup)
+ return -ENOMEM;
+ if (!is_static)
+ spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
+ ssp->srcu_sup->node = NULL;
+ mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
+ mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
ssp->srcu_idx = 0;
- ssp->srcu_gp_seq = 0;
- ssp->srcu_barrier_seq = 0;
- mutex_init(&ssp->srcu_barrier_mutex);
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 0);
- INIT_DELAYED_WORK(&ssp->work, process_srcu);
- ssp->sda_is_static = is_static;
+ ssp->srcu_sup->srcu_gp_seq = 0;
+ ssp->srcu_sup->srcu_barrier_seq = 0;
+ mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
+ INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
+ ssp->srcu_sup->sda_is_static = is_static;
if (!is_static)
ssp->sda = alloc_percpu(struct srcu_data);
- if (!ssp->sda)
+ if (!ssp->sda) {
+ if (!is_static)
+ kfree(ssp->srcu_sup);
return -ENOMEM;
+ }
init_srcu_struct_data(ssp);
- ssp->srcu_gp_seq_needed_exp = 0;
- ssp->srcu_last_gp_end = ktime_get_mono_fast_ns();
- if (READ_ONCE(ssp->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
+ ssp->srcu_sup->srcu_gp_seq_needed_exp = 0;
+ ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
+ if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC)) {
- if (!ssp->sda_is_static) {
+ if (!ssp->srcu_sup->sda_is_static) {
free_percpu(ssp->sda);
ssp->sda = NULL;
+ kfree(ssp->srcu_sup);
return -ENOMEM;
}
} else {
- WRITE_ONCE(ssp->srcu_size_state, SRCU_SIZE_BIG);
+ WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
}
}
- smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */
+ ssp->srcu_sup->srcu_ssp = ssp;
+ smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed, 0); /* Init done. */
return 0;
}
@@ -277,7 +288,6 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
/* Don't re-initialize a lock while it is held. */
debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
lockdep_init_map(&ssp->dep_map, name, key, 0);
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);
@@ -294,7 +304,6 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct);
*/
int init_srcu_struct(struct srcu_struct *ssp)
{
- spin_lock_init(&ACCESS_PRIVATE(ssp, lock));
return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);
@@ -306,8 +315,8 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
*/
static void __srcu_transition_to_big(struct srcu_struct *ssp)
{
- lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
- smp_store_release(&ssp->srcu_size_state, SRCU_SIZE_ALLOC);
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
}
/*
@@ -318,15 +327,15 @@ static void srcu_transition_to_big(struct srcu_struct *ssp)
unsigned long flags;
/* Double-checked locking on ->srcu_size-state. */
- if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL)
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
return;
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (smp_load_acquire(&ssp->srcu_size_state) != SRCU_SIZE_SMALL) {
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
__srcu_transition_to_big(ssp);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -337,14 +346,14 @@ static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
{
unsigned long j;
- if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_size_state)
+ if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
return;
j = jiffies;
- if (ssp->srcu_size_jiffies != j) {
- ssp->srcu_size_jiffies = j;
- ssp->srcu_n_lock_retries = 0;
+ if (ssp->srcu_sup->srcu_size_jiffies != j) {
+ ssp->srcu_sup->srcu_size_jiffies = j;
+ ssp->srcu_sup->srcu_n_lock_retries = 0;
}
- if (++ssp->srcu_n_lock_retries <= small_contention_lim)
+ if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
return;
__srcu_transition_to_big(ssp);
}
@@ -361,9 +370,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
if (spin_trylock_irqsave_rcu_node(sdp, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
spin_lock_irqsave_check_contention(ssp);
- spin_unlock_irqrestore_rcu_node(ssp, *flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
spin_lock_irqsave_rcu_node(sdp, *flags);
}
@@ -375,9 +384,9 @@ static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned lon
*/
static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
{
- if (spin_trylock_irqsave_rcu_node(ssp, *flags))
+ if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
return;
- spin_lock_irqsave_rcu_node(ssp, *flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
spin_lock_irqsave_check_contention(ssp);
}
@@ -394,15 +403,15 @@ static void check_init_srcu_struct(struct srcu_struct *ssp)
unsigned long flags;
/* The smp_load_acquire() pairs with the smp_store_release(). */
- if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq_needed))) /*^^^*/
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
return; /* Already initialized. */
- spin_lock_irqsave_rcu_node(ssp, flags);
- if (!rcu_seq_state(ssp->srcu_gp_seq_needed)) {
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
+ if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
return;
}
init_srcu_struct_fields(ssp, true);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -607,17 +616,18 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
unsigned long gpstart;
unsigned long j;
unsigned long jbase = SRCU_INTERVAL;
+ struct srcu_usage *sup = ssp->srcu_sup;
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
jbase = 0;
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq))) {
+ if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
j = jiffies - 1;
- gpstart = READ_ONCE(ssp->srcu_gp_start);
+ gpstart = READ_ONCE(sup->srcu_gp_start);
if (time_after(j, gpstart))
jbase += j - gpstart;
if (!jbase) {
- WRITE_ONCE(ssp->srcu_n_exp_nodelay, READ_ONCE(ssp->srcu_n_exp_nodelay) + 1);
- if (READ_ONCE(ssp->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
+ WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
+ if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
jbase = 1;
}
}
@@ -634,12 +644,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ struct srcu_usage *sup = ssp->srcu_sup;
if (WARN_ON(!srcu_get_delay(ssp)))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
- flush_delayed_work(&ssp->work);
+ flush_delayed_work(&sup->work);
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
@@ -648,21 +659,23 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
return; /* Forgot srcu_barrier(), so just leak it! */
}
- if (WARN_ON(rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
- WARN_ON(rcu_seq_current(&ssp->srcu_gp_seq) != ssp->srcu_gp_seq_needed) ||
+ if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
+ WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) ||
WARN_ON(srcu_readers_active(ssp))) {
pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
- __func__, ssp, rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)),
- rcu_seq_current(&ssp->srcu_gp_seq), ssp->srcu_gp_seq_needed);
+ __func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
+ rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
return; /* Caller forgot to stop doing call_srcu()? */
}
- if (!ssp->sda_is_static) {
+ kfree(sup->node);
+ sup->node = NULL;
+ sup->srcu_size_state = SRCU_SIZE_SMALL;
+ if (!sup->sda_is_static) {
free_percpu(ssp->sda);
ssp->sda = NULL;
+ kfree(sup);
+ ssp->srcu_sup = NULL;
}
- kfree(ssp->node);
- ssp->node = NULL;
- ssp->srcu_size_state = SRCU_SIZE_SMALL;
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
@@ -760,23 +773,23 @@ static void srcu_gp_start(struct srcu_struct *ssp)
struct srcu_data *sdp;
int state;
- if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
sdp = this_cpu_ptr(ssp->sda);
- lockdep_assert_held(&ACCESS_PRIVATE(ssp, lock));
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
+ WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
spin_lock_rcu_node(sdp); /* Interrupts already disabled. */
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
- WRITE_ONCE(ssp->srcu_gp_start, jiffies);
- WRITE_ONCE(ssp->srcu_n_exp_nodelay, 0);
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
+ WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
- rcu_seq_start(&ssp->srcu_gp_seq);
- state = rcu_seq_state(ssp->srcu_gp_seq);
+ rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
+ state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
@@ -849,28 +862,29 @@ static void srcu_gp_end(struct srcu_struct *ssp)
unsigned long sgsne;
struct srcu_node *snp;
int ss_state;
+ struct srcu_usage *sup = ssp->srcu_sup;
/* Prevent more than one additional grace period. */
- mutex_lock(&ssp->srcu_cb_mutex);
+ mutex_lock(&sup->srcu_cb_mutex);
/* End the current grace period. */
- spin_lock_irq_rcu_node(ssp);
- idx = rcu_seq_state(ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ idx = rcu_seq_state(sup->srcu_gp_seq);
WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
- if (ULONG_CMP_LT(READ_ONCE(ssp->srcu_gp_seq), READ_ONCE(ssp->srcu_gp_seq_needed_exp)))
+ if (ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp)))
cbdelay = 0;
- WRITE_ONCE(ssp->srcu_last_gp_end, ktime_get_mono_fast_ns());
- rcu_seq_end(&ssp->srcu_gp_seq);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
+ rcu_seq_end(&sup->srcu_gp_seq);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
+ spin_unlock_irq_rcu_node(sup);
+ mutex_unlock(&sup->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
/* Initiate callback invocation as needed. */
- ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ ss_state = smp_load_acquire(&sup->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
cbdelay);
@@ -879,7 +893,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
srcu_for_each_node_breadth_first(ssp, snp) {
spin_lock_irq_rcu_node(snp);
cbs = false;
- last_lvl = snp >= ssp->level[rcu_num_lvls - 1];
+ last_lvl = snp >= sup->level[rcu_num_lvls - 1];
if (last_lvl)
cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
snp->srcu_have_cbs[idx] = gpseq;
@@ -911,18 +925,18 @@ static void srcu_gp_end(struct srcu_struct *ssp)
}
/* Callback initiation done, allow grace periods after next. */
- mutex_unlock(&ssp->srcu_cb_mutex);
+ mutex_unlock(&sup->srcu_cb_mutex);
/* Start a new grace period if needed. */
- spin_lock_irq_rcu_node(ssp);
- gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ spin_lock_irq_rcu_node(sup);
+ gpseq = rcu_seq_current(&sup->srcu_gp_seq);
if (!rcu_seq_state(gpseq) &&
- ULONG_CMP_LT(gpseq, ssp->srcu_gp_seq_needed)) {
+ ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
srcu_reschedule(ssp, 0);
} else {
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(sup);
}
/* Transition to big if needed. */
@@ -930,7 +944,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
if (ss_state == SRCU_SIZE_ALLOC)
init_srcu_struct_nodes(ssp, GFP_KERNEL);
else
- smp_store_release(&ssp->srcu_size_state, ss_state + 1);
+ smp_store_release(&sup->srcu_size_state, ss_state + 1);
}
}
@@ -950,7 +964,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
if (snp)
for (; snp != NULL; snp = snp->srcu_parent) {
sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
- if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) ||
+ if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
(!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
return;
spin_lock_irqsave_rcu_node(snp, flags);
@@ -963,9 +977,9 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
spin_unlock_irqrestore_rcu_node(snp, flags);
}
spin_lock_irqsave_ssp_contention(ssp, &flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
+ spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}
/*
@@ -990,9 +1004,10 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
struct srcu_node *snp;
struct srcu_node *snp_leaf;
unsigned long snp_seq;
+ struct srcu_usage *sup = ssp->srcu_sup;
/* Ensure that snp node tree is fully initialized before traversing it */
- if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
snp_leaf = NULL;
else
snp_leaf = sdp->mynode;
@@ -1000,7 +1015,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
if (snp_leaf)
/* Each pass through the loop does one level of the srcu_node tree. */
for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
- if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) && snp != snp_leaf)
+ if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
return; /* GP already done and CBs recorded. */
spin_lock_irqsave_rcu_node(snp, flags);
snp_seq = snp->srcu_have_cbs[idx];
@@ -1027,20 +1042,20 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
/* Top of tree, must ensure the grace period will be started. */
spin_lock_irqsave_ssp_contention(ssp, &flags);
- if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed, s)) {
+ if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
/*
* Record need for grace period s. Pair with load
* acquire setting up for initialization.
*/
- smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
+ smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/
}
- if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
+ if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s))
+ WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s);
/* If grace period not already in progress, start it. */
- if (!WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_gp_seq, s)) &&
- rcu_seq_state(ssp->srcu_gp_seq) == SRCU_STATE_IDLE) {
- WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed));
+ if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
+ rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
+ WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
srcu_gp_start(ssp);
// And how can that list_add() in the "else" clause
@@ -1049,12 +1064,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
// can only be executed during early boot when there is only
// the one boot CPU running with interrupts still disabled.
if (likely(srcu_init_done))
- queue_delayed_work(rcu_gp_wq, &ssp->work,
+ queue_delayed_work(rcu_gp_wq, &sup->work,
!!srcu_get_delay(ssp));
- else if (list_empty(&ssp->work.work.entry))
- list_add(&ssp->work.work.entry, &srcu_boot_list);
+ else if (list_empty(&sup->work.work.entry))
+ list_add(&sup->work.work.entry, &srcu_boot_list);
}
- spin_unlock_irqrestore_rcu_node(ssp, flags);
+ spin_unlock_irqrestore_rcu_node(sup, flags);
}
/*
@@ -1085,16 +1100,36 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Ensure that if this updater saw a given reader's increment
- * from __srcu_read_lock(), that reader was using an old value
- * of ->srcu_idx. Also ensure that if a given reader sees the
- * new value of ->srcu_idx, this updater's earlier scans cannot
- * have seen that reader's increments (which is OK, because this
- * grace period need not wait on that reader).
+ * Because the flip of ->srcu_idx is executed only if the
+ * preceding call to srcu_readers_active_idx_check() found that
+ * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
+ * and because that summing uses atomic_long_read(), there is
+ * ordering due to a control dependency between that summing and
+ * the WRITE_ONCE() in this call to srcu_flip(). This ordering
+ * ensures that if this updater saw a given reader's increment from
+ * __srcu_read_lock(), that reader was using a value of ->srcu_idx
+ * from before the previous call to srcu_flip(), which should be
+ * quite rare. This ordering thus helps forward progress because
+ * the grace period could otherwise be delayed by additional
+ * calls to __srcu_read_lock() using that old (soon to be new)
+ * value of ->srcu_idx.
+ *
+ * This sum-equality check and ordering also ensures that if
+ * a given call to __srcu_read_lock() uses the new value of
+ * ->srcu_idx, this updater's earlier scans cannot have seen
+ * that reader's increments, which is all to the good, because
+ * this grace period need not wait on that reader. After all,
+ * if those earlier scans had seen that reader, there would have
+ * been a sum mismatch and this code would not be reached.
+ *
+ * This means that the following smp_mb() is redundant, but
+ * it stays until either (1) Compilers learn about this sort of
+ * control dependency or (2) Some production workload running on
+ * a production system is unduly delayed by this slowpath smp_mb().
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
/*
* Ensure that if the updater misses an __srcu_read_unlock()
@@ -1154,18 +1189,18 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
/* First, see if enough time has passed since the last GP. */
t = ktime_get_mono_fast_ns();
- tlast = READ_ONCE(ssp->srcu_last_gp_end);
+ tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
if (exp_holdoff == 0 ||
time_in_range_open(t, tlast, tlast + exp_holdoff))
return false; /* Too soon after last GP. */
/* Next, check for probable idleness. */
- curseq = rcu_seq_current(&ssp->srcu_gp_seq);
+ curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
- if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_gp_seq_needed)))
+ if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
return false; /* Grace period in progress, so not idle. */
smp_mb(); /* Order ->srcu_gp_seq with prior access. */
- if (curseq != rcu_seq_current(&ssp->srcu_gp_seq))
+ if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq))
return false; /* GP # changed, so not idle. */
return true; /* With reasonable probability, idle! */
}
@@ -1199,7 +1234,7 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
* sequence number cannot wrap around in the meantime.
*/
idx = __srcu_read_lock_nmisafe(ssp);
- ss_state = smp_load_acquire(&ssp->srcu_size_state);
+ ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
if (ss_state < SRCU_SIZE_WAIT_CALL)
sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
else
@@ -1208,8 +1243,8 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
if (rhp)
rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
+ s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
sdp->srcu_gp_seq_needed = s;
@@ -1307,6 +1342,8 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
struct rcu_synchronize rcu;
+ srcu_lock_sync(&ssp->dep_map);
+
RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
lock_is_held(&rcu_bh_lock_map) ||
lock_is_held(&rcu_lock_map) ||
@@ -1420,7 +1457,7 @@ unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
// Any prior manipulation of SRCU-protected data must happen
// before the load from ->srcu_gp_seq.
smp_mb();
- return rcu_seq_snap(&ssp->srcu_gp_seq);
+ return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
@@ -1467,7 +1504,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
+ if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
@@ -1486,8 +1523,8 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
ssp = sdp->ssp;
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
}
/*
@@ -1501,13 +1538,13 @@ static void srcu_barrier_cb(struct rcu_head *rhp)
static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
{
spin_lock_irq_rcu_node(sdp);
- atomic_inc(&ssp->srcu_barrier_cpu_cnt);
+ atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
sdp->srcu_barrier_head.func = srcu_barrier_cb;
debug_rcu_head_queue(&sdp->srcu_barrier_head);
if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
&sdp->srcu_barrier_head)) {
debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
- atomic_dec(&ssp->srcu_barrier_cpu_cnt);
+ atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
}
spin_unlock_irq_rcu_node(sdp);
}
@@ -1520,23 +1557,23 @@ void srcu_barrier(struct srcu_struct *ssp)
{
int cpu;
int idx;
- unsigned long s = rcu_seq_snap(&ssp->srcu_barrier_seq);
+ unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq);
check_init_srcu_struct(ssp);
- mutex_lock(&ssp->srcu_barrier_mutex);
- if (rcu_seq_done(&ssp->srcu_barrier_seq, s)) {
+ mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
+ if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) {
smp_mb(); /* Force ordering following return. */
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
return; /* Someone else did our work for us. */
}
- rcu_seq_start(&ssp->srcu_barrier_seq);
- init_completion(&ssp->srcu_barrier_completion);
+ rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq);
+ init_completion(&ssp->srcu_sup->srcu_barrier_completion);
/* Initial count prevents reaching zero until all CBs are posted. */
- atomic_set(&ssp->srcu_barrier_cpu_cnt, 1);
+ atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1);
idx = __srcu_read_lock_nmisafe(ssp);
- if (smp_load_acquire(&ssp->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
+ if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
else
for_each_possible_cpu(cpu)
@@ -1544,12 +1581,12 @@ void srcu_barrier(struct srcu_struct *ssp)
__srcu_read_unlock_nmisafe(ssp, idx);
/* Remove the initial count, at which point reaching zero can happen. */
- if (atomic_dec_and_test(&ssp->srcu_barrier_cpu_cnt))
- complete(&ssp->srcu_barrier_completion);
- wait_for_completion(&ssp->srcu_barrier_completion);
+ if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
+ complete(&ssp->srcu_sup->srcu_barrier_completion);
+ wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);
- rcu_seq_end(&ssp->srcu_barrier_seq);
- mutex_unlock(&ssp->srcu_barrier_mutex);
+ rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq);
+ mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);
@@ -1575,7 +1612,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
{
int idx;
- mutex_lock(&ssp->srcu_gp_mutex);
+ mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);
/*
* Because readers might be delayed for an extended period after
@@ -1587,39 +1624,39 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* The load-acquire ensures that we see the accesses performed
* by the prior grace period.
*/
- idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_gp_seq)); /* ^^^ */
+ idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
if (idx == SRCU_STATE_IDLE) {
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq));
- spin_unlock_irq_rcu_node(ssp);
- mutex_unlock(&ssp->srcu_gp_mutex);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return;
}
- idx = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
+ idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
if (idx == SRCU_STATE_IDLE)
srcu_gp_start(ssp);
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (idx != SRCU_STATE_IDLE) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* Someone else started the grace period. */
}
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 1)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
srcu_flip(ssp);
- spin_lock_irq_rcu_node(ssp);
- rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
- ssp->srcu_n_exp_nodelay = 0;
- spin_unlock_irq_rcu_node(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
}
- if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+ if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
/*
* SRCU read-side critical sections are normally short,
@@ -1627,10 +1664,10 @@ static void srcu_advance_state(struct srcu_struct *ssp)
*/
idx = 1 ^ (ssp->srcu_idx & 1);
if (!try_check_zero(ssp, idx, 2)) {
- mutex_unlock(&ssp->srcu_gp_mutex);
+ mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
}
- ssp->srcu_n_exp_nodelay = 0;
+ ssp->srcu_sup->srcu_n_exp_nodelay = 0;
srcu_gp_end(ssp); /* Releases ->srcu_gp_mutex. */
}
}
@@ -1656,7 +1693,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
rcu_cblist_init(&ready_cbs);
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_advance(&sdp->srcu_cblist,
- rcu_seq_current(&ssp->srcu_gp_seq));
+ rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
if (sdp->srcu_cblist_invoking ||
!rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
spin_unlock_irq_rcu_node(sdp);
@@ -1684,7 +1721,7 @@ static void srcu_invoke_callbacks(struct work_struct *work)
spin_lock_irq_rcu_node(sdp);
rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
(void)rcu_segcblist_accelerate(&sdp->srcu_cblist,
- rcu_seq_snap(&ssp->srcu_gp_seq));
+ rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq));
sdp->srcu_cblist_invoking = false;
more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
spin_unlock_irq_rcu_node(sdp);
@@ -1700,20 +1737,20 @@ static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
bool pushgp = true;
- spin_lock_irq_rcu_node(ssp);
- if (ULONG_CMP_GE(ssp->srcu_gp_seq, ssp->srcu_gp_seq_needed)) {
- if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_gp_seq))) {
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
+ if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
/* All requests fulfilled, time to go idle. */
pushgp = false;
}
- } else if (!rcu_seq_state(ssp->srcu_gp_seq)) {
+ } else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) {
/* Outstanding request and no GP. Start one. */
srcu_gp_start(ssp);
}
- spin_unlock_irq_rcu_node(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (pushgp)
- queue_delayed_work(rcu_gp_wq, &ssp->work, delay);
+ queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
}
/*
@@ -1724,22 +1761,24 @@ static void process_srcu(struct work_struct *work)
unsigned long curdelay;
unsigned long j;
struct srcu_struct *ssp;
+ struct srcu_usage *sup;
- ssp = container_of(work, struct srcu_struct, work.work);
+ sup = container_of(work, struct srcu_usage, work.work);
+ ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
curdelay = srcu_get_delay(ssp);
if (curdelay) {
- WRITE_ONCE(ssp->reschedule_count, 0);
+ WRITE_ONCE(sup->reschedule_count, 0);
} else {
j = jiffies;
- if (READ_ONCE(ssp->reschedule_jiffies) == j) {
- WRITE_ONCE(ssp->reschedule_count, READ_ONCE(ssp->reschedule_count) + 1);
- if (READ_ONCE(ssp->reschedule_count) > srcu_max_nodelay)
+ if (READ_ONCE(sup->reschedule_jiffies) == j) {
+ WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
+ if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
curdelay = 1;
} else {
- WRITE_ONCE(ssp->reschedule_count, 1);
- WRITE_ONCE(ssp->reschedule_jiffies, j);
+ WRITE_ONCE(sup->reschedule_count, 1);
+ WRITE_ONCE(sup->reschedule_jiffies, j);
}
}
srcu_reschedule(ssp, curdelay);
@@ -1752,7 +1791,7 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
if (test_type != SRCU_FLAVOR)
return;
*flags = 0;
- *gp_seq = rcu_seq_current(&ssp->srcu_gp_seq);
+ *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
@@ -1774,14 +1813,14 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
int cpu;
int idx;
unsigned long s0 = 0, s1 = 0;
- int ss_state = READ_ONCE(ssp->srcu_size_state);
+ int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
int ss_state_idx = ss_state;
idx = ssp->srcu_idx & 0x1;
if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
- tt, tf, rcu_seq_current(&ssp->srcu_gp_seq), ss_state,
+ tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
srcu_size_state_name[ss_state_idx]);
if (!ssp->sda) {
// Called after cleanup_srcu_struct(), perhaps.
@@ -1838,7 +1877,7 @@ early_initcall(srcu_bootup_announce);
void __init srcu_init(void)
{
- struct srcu_struct *ssp;
+ struct srcu_usage *sup;
/* Decide on srcu_struct-size strategy. */
if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
@@ -1858,12 +1897,13 @@ void __init srcu_init(void)
*/
srcu_init_done = true;
while (!list_empty(&srcu_boot_list)) {
- ssp = list_first_entry(&srcu_boot_list, struct srcu_struct,
+ sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
work.work.entry);
- list_del_init(&ssp->work.work.entry);
- if (SRCU_SIZING_IS(SRCU_SIZING_INIT) && ssp->srcu_size_state == SRCU_SIZE_SMALL)
- ssp->srcu_size_state = SRCU_SIZE_ALLOC;
- queue_work(rcu_gp_wq, &ssp->work.work);
+ list_del_init(&sup->work.work.entry);
+ if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
+ sup->srcu_size_state == SRCU_SIZE_SMALL)
+ sup->srcu_size_state = SRCU_SIZE_ALLOC;
+ queue_work(rcu_gp_wq, &sup->work.work);
}
}
@@ -1873,13 +1913,14 @@ void __init srcu_init(void)
static int srcu_module_coming(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- int ret;
for (i = 0; i < mod->num_srcu_structs; i++) {
- ret = init_srcu_struct(*(sspp++));
- if (WARN_ON_ONCE(ret))
- return ret;
+ ssp = *(sspp++);
+ ssp->sda = alloc_percpu(struct srcu_data);
+ if (WARN_ON_ONCE(!ssp->sda))
+ return -ENOMEM;
}
return 0;
}
@@ -1888,10 +1929,17 @@ static int srcu_module_coming(struct module *mod)
static void srcu_module_going(struct module *mod)
{
int i;
+ struct srcu_struct *ssp;
struct srcu_struct **sspp = mod->srcu_struct_ptrs;
- for (i = 0; i < mod->num_srcu_structs; i++)
- cleanup_srcu_struct(*(sspp++));
+ for (i = 0; i < mod->num_srcu_structs; i++) {
+ ssp = *(sspp++);
+ if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
+ !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
+ cleanup_srcu_struct(ssp);
+ if (!WARN_ON(srcu_readers_active(ssp)))
+ free_percpu(ssp->sda);
+ }
}
/* Handle one module, either coming or going. */
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index bfb5e1549f2b..5f4fc8184dd0 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -136,8 +136,16 @@ static struct rcu_tasks rt_name = \
.kname = #rt_name, \
}
+#ifdef CONFIG_TASKS_RCU
/* Track exiting tasks in order to allow them to be waited for. */
DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
+#endif
+
+#ifdef CONFIG_TASKS_RCU
+/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
+static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
+#endif
/* Avoid IPIing CPUs early in the grace period. */
#define RCU_TASK_IPI_DELAY (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) ? HZ / 2 : 0)
@@ -830,6 +838,13 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
/* Processing between scanning taskslist and draining the holdout list. */
static void rcu_tasks_postscan(struct list_head *hop)
{
+ int rtsi = READ_ONCE(rcu_task_stall_info);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU)) {
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+ }
+
/*
* Exiting tasks may escape the tasklist scan. Those are vulnerable
* until their final schedule() with TASK_DEAD state. To cope with
@@ -848,6 +863,9 @@ static void rcu_tasks_postscan(struct list_head *hop)
* call to synchronize_rcu().
*/
synchronize_srcu(&tasks_rcu_exit_srcu);
+
+ if (!IS_ENABLED(CONFIG_TINY_RCU))
+ del_timer_sync(&tasks_rcu_exit_srcu_stall_timer);
}
/* See if tasks are still holding out, complain if so. */
@@ -923,6 +941,21 @@ static void rcu_tasks_postgp(struct rcu_tasks *rtp)
void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func);
DEFINE_RCU_TASKS(rcu_tasks, rcu_tasks_wait_gp, call_rcu_tasks, "RCU Tasks");
+static void tasks_rcu_exit_srcu_stall(struct timer_list *unused)
+{
+#ifndef CONFIG_TINY_RCU
+ int rtsi;
+
+ rtsi = READ_ONCE(rcu_task_stall_info);
+ pr_info("%s: %s grace period number %lu (since boot) gp_state: %s is %lu jiffies old.\n",
+ __func__, rcu_tasks.kname, rcu_tasks.tasks_gp_seq,
+ tasks_gp_state_getname(&rcu_tasks), jiffies - rcu_tasks.gp_jiffies);
+ pr_info("Please check any exiting tasks stuck between calls to exit_tasks_rcu_start() and exit_tasks_rcu_finish()\n");
+ tasks_rcu_exit_srcu_stall_timer.expires = jiffies + rtsi;
+ add_timer(&tasks_rcu_exit_srcu_stall_timer);
+#endif // #ifndef CONFIG_TINY_RCU
+}
+
/**
* call_rcu_tasks() - Queue an RCU for invocation task-based grace period
* @rhp: structure to be used for queueing the RCU updates.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e880c09ab59..f52ff7241041 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -640,6 +640,7 @@ void __rcu_irq_enter_check_tick(void)
}
raw_spin_unlock_rcu_node(rdp->mynode);
}
+NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick);
#endif /* CONFIG_NO_HZ_FULL */
/*
@@ -1955,7 +1956,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
- bool needwake = false;
bool needacc = false;
struct rcu_node *rnp;
@@ -1987,7 +1987,12 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
* NOCB kthreads have their own way to deal with that...
*/
if (!rcu_rdp_is_offloaded(rdp)) {
- needwake = rcu_accelerate_cbs(rnp, rdp);
+ /*
+ * The current GP has not yet ended, so it
+ * should not be possible for rcu_accelerate_cbs()
+ * to return true. So complain, but don't awaken.
+ */
+ WARN_ON_ONCE(rcu_accelerate_cbs(rnp, rdp));
} else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
/*
* ...but NOCB kthreads may miss or delay callbacks acceleration
@@ -1999,8 +2004,6 @@ rcu_report_qs_rdp(struct rcu_data *rdp)
rcu_disable_urgency_upon_qs(rdp);
rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
/* ^^^ Released rnp->lock */
- if (needwake)
- rcu_gp_kthread_wake();
if (needacc) {
rcu_nocb_lock_irqsave(rdp, flags);
@@ -2131,6 +2134,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
break;
}
} else {
+ // In rcuoc context, so no worries about depriving
+ // other softirq vectors of CPU cycles.
local_bh_enable();
lockdep_assert_irqs_enabled();
cond_resched_tasks_rcu_qs();
@@ -3024,6 +3029,18 @@ need_offload_krc(struct kfree_rcu_cpu *krcp)
return !!READ_ONCE(krcp->head);
}
+static bool
+need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
+{
+ int i;
+
+ for (i = 0; i < FREE_N_CHANNELS; i++)
+ if (!list_empty(&krwp->bulk_head_free[i]))
+ return true;
+
+ return !!krwp->head_free;
+}
+
static int krc_count(struct kfree_rcu_cpu *krcp)
{
int sum = atomic_read(&krcp->head_count);
@@ -3107,15 +3124,14 @@ static void kfree_rcu_monitor(struct work_struct *work)
for (i = 0; i < KFREE_N_BATCHES; i++) {
struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
- // Try to detach bulk_head or head and attach it over any
- // available corresponding free channel. It can be that
- // a previous RCU batch is in progress, it means that
- // immediately to queue another one is not possible so
- // in that case the monitor work is rearmed.
- if ((!list_empty(&krcp->bulk_head[0]) && list_empty(&krwp->bulk_head_free[0])) ||
- (!list_empty(&krcp->bulk_head[1]) && list_empty(&krwp->bulk_head_free[1])) ||
- (READ_ONCE(krcp->head) && !krwp->head_free)) {
+ // Try to detach bulk_head or head and attach it, only when
+ // all channels are free. Any channel is not free means at krwp
+ // there is on-going rcu work to handle krwp's free business.
+ if (need_wait_for_krwp_work(krwp))
+ continue;
+ // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
+ if (need_offload_krc(krcp)) {
// Channel 1 corresponds to the SLAB-pointer bulk path.
// Channel 2 corresponds to vmalloc-pointer bulk path.
for (j = 0; j < FREE_N_CHANNELS; j++) {
@@ -4940,9 +4956,8 @@ void __init rcu_init(void)
else
qovld_calc = qovld;
- // Kick-start any polled grace periods that started early.
- if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
- (void)start_poll_synchronize_rcu_expedited();
+ // Kick-start in case any polled grace periods started early.
+ (void)start_poll_synchronize_rcu_expedited();
rcu_test_sync_prims();
}
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 249c2967d9e6..3b7abb58157d 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -594,6 +594,7 @@ static void synchronize_rcu_expedited_wait(void)
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_node *rnp_root = rcu_get_root();
+ unsigned long flags;
trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("startwait"));
jiffies_stall = rcu_exp_jiffies_till_stall_check();
@@ -602,17 +603,17 @@ static void synchronize_rcu_expedited_wait(void)
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
mask = READ_ONCE(rnp->expmask);
for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->rcu_forced_tick_exp)
continue;
rdp->rcu_forced_tick_exp = true;
- preempt_disable();
if (cpu_online(cpu))
tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP);
- preempt_enable();
}
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
j = READ_ONCE(jiffies_till_first_fqs);
if (synchronize_rcu_expedited_wait_once(j + HZ))
@@ -802,9 +803,11 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
int ndetected = 0;
struct task_struct *t;
- if (!READ_ONCE(rnp->exp_tasks))
- return 0;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ if (!rnp->exp_tasks) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ return 0;
+ }
t = list_entry(rnp->exp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
@@ -1065,9 +1068,10 @@ unsigned long start_poll_synchronize_rcu_expedited(void)
if (rcu_init_invoked())
raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags);
if (!poll_state_synchronize_rcu(s)) {
- rnp->exp_seq_poll_rq = s;
- if (rcu_init_invoked())
+ if (rcu_init_invoked()) {
+ rnp->exp_seq_poll_rq = s;
queue_work(rcu_gp_wq, &rnp->exp_poll_wq);
+ }
}
if (rcu_init_invoked())
raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 9e1c8caec5ce..f2280616f9d5 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1312,6 +1312,7 @@ int rcu_nocb_cpu_offload(int cpu)
}
EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+#ifdef CONFIG_RCU_LAZY
static unsigned long
lazy_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
@@ -1360,6 +1361,7 @@ static struct shrinker lazy_rcu_shrinker = {
.batch = 0,
.seeks = DEFAULT_SEEKS,
};
+#endif // #ifdef CONFIG_RCU_LAZY
void __init rcu_init_nohz(void)
{
@@ -1391,8 +1393,10 @@ void __init rcu_init_nohz(void)
if (!rcu_state.nocb_is_setup)
return;
+#ifdef CONFIG_RCU_LAZY
if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy"))
pr_err("Failed to register lazy_rcu shrinker!\n");
+#endif // #ifdef CONFIG_RCU_LAZY
if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 5732fa75ebab..b5cc2b53464d 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -300,6 +300,9 @@ noinstr u64 local_clock(void)
if (static_branch_likely(&__sched_clock_stable))
return sched_clock() + __sched_clock_offset;
+ if (!static_branch_likely(&sched_clock_running))
+ return sched_clock();
+
preempt_disable_notrace();
clock = sched_clock_local(this_scd());
preempt_enable_notrace();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index af017e038b48..944c3ae39861 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -80,6 +80,7 @@
#define CREATE_TRACE_POINTS
#include <linux/sched/rseq_api.h>
#include <trace/events/sched.h>
+#include <trace/events/ipi.h>
#undef CREATE_TRACE_POINTS
#include "sched.h"
@@ -95,6 +96,9 @@
#include "../../io_uring/io-wq.h"
#include "../smpboot.h"
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
+EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);
+
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
* associated with them) to allow external modules to probe them.
@@ -261,36 +265,51 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
resched_curr(rq);
}
-/*
- * Find left-most (aka, highest priority) task matching @cookie.
- */
-static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
- struct rb_node *node;
-
- node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
- /*
- * The idle task always matches any cookie!
- */
- if (!node)
- return idle_sched_class.pick_task(rq);
+ if (p->sched_class->task_is_throttled)
+ return p->sched_class->task_is_throttled(p, cpu);
- return __node_2_sc(node);
+ return 0;
}
static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
struct rb_node *node = &p->core_node;
+ int cpu = task_cpu(p);
+
+ do {
+ node = rb_next(node);
+ if (!node)
+ return NULL;
+
+ p = __node_2_sc(node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ } while (sched_task_is_throttled(p, cpu));
- node = rb_next(node);
+ return p;
+}
+
+/*
+ * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
+ * If no suitable task is found, NULL will be returned.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct task_struct *p;
+ struct rb_node *node;
+
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
if (!node)
return NULL;
- p = container_of(node, struct task_struct, core_node);
- if (p->core_cookie != cookie)
- return NULL;
+ p = __node_2_sc(node);
+ if (!sched_task_is_throttled(p, rq->cpu))
+ return p;
- return p;
+ return sched_core_next(p, cookie);
}
/*
@@ -704,6 +723,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
psi_account_irqtime(rq->curr, irq_delta);
+ delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
@@ -2084,6 +2104,11 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
+ if (task_on_rq_migrating(p))
+ flags |= ENQUEUE_MIGRATED;
+ if (flags & ENQUEUE_MIGRATED)
+ sched_mm_cid_migrate_to(rq, p);
+
enqueue_task(rq, p, flags);
p->on_rq = TASK_ON_RQ_QUEUED;
@@ -3192,6 +3217,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
+ sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -3826,14 +3852,20 @@ void sched_ttwu_pending(void *arg)
rq_unlock_irqrestore(rq, &rf);
}
-void send_call_function_single_ipi(int cpu)
+/*
+ * Prepare the scene for sending an IPI for a remote smp_call
+ *
+ * Returns true if the caller can proceed with sending the IPI.
+ * Returns false otherwise.
+ */
+bool call_function_single_prep_ipi(int cpu)
{
- struct rq *rq = cpu_rq(cpu);
-
- if (!set_nr_if_polling(rq->idle))
- arch_send_call_function_single_ipi(cpu);
- else
+ if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
+ return false;
+ }
+
+ return true;
}
/*
@@ -4465,6 +4497,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
+ init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -5111,7 +5144,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
- switch_mm_cid(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5200,13 +5232,14 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
- * provided by mmdrop(),
+ * provided by mmdrop_lazy_tlb(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop_sched(mm);
+ mmdrop_lazy_tlb_sched(mm);
}
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -5263,17 +5296,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
+ *
+ * switch_mm_cid() needs to be updated if the barriers provided
+ * by context_switch() are modified.
*/
if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
@@ -5290,12 +5326,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
lru_gen_use_mm(next->mm);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
+ /* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
+ /* switch_mm_cid() requires the memory barriers above. */
+ switch_mm_cid(rq, prev, next);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
@@ -5584,6 +5623,7 @@ void scheduler_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
+ task_tick_mm_cid(rq, curr);
rq_unlock(rq, &rf);
@@ -6236,7 +6276,7 @@ static bool try_steal_cookie(int this, int that)
goto unlock;
p = sched_core_find(src, cookie);
- if (p == src->idle)
+ if (!p)
goto unlock;
do {
@@ -6248,6 +6288,13 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
+ /*
+ * sched_core_find() and sched_core_next() will ensure that task @p
+ * is not throttled now, we also need to check whether the runqueue
+ * of the destination CPU is being throttled.
+ */
+ if (sched_task_is_throttled(p, this))
+ goto next;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
@@ -8414,14 +8461,14 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
if (len & (sizeof(unsigned long)-1))
return -EINVAL;
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
unsigned int retlen = min(len, cpumask_size());
- if (copy_to_user(user_mask_ptr, mask, retlen))
+ if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
ret = -EFAULT;
else
ret = retlen;
@@ -8503,6 +8550,7 @@ EXPORT_STATIC_CALL_TRAMP(might_resched);
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
+ klp_sched_try_switch();
if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
return __cond_resched();
@@ -8651,13 +8699,17 @@ int sched_dynamic_mode(const char *str)
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
-void sched_dynamic_update(int mode)
+static DEFINE_MUTEX(sched_dynamic_mutex);
+static bool klp_override;
+
+static void __sched_dynamic_update(int mode)
{
/*
* Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
* the ZERO state, which is invalid.
*/
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -8665,36 +8717,79 @@ void sched_dynamic_update(int mode)
switch (mode) {
case preempt_dynamic_none:
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: none\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: none\n");
break;
case preempt_dynamic_voluntary:
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: voluntary\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: voluntary\n");
break;
case preempt_dynamic_full:
- preempt_dynamic_disable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: full\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: full\n");
break;
}
preempt_dynamic_mode = mode;
}
+void sched_dynamic_update(int mode)
+{
+ mutex_lock(&sched_dynamic_mutex);
+ __sched_dynamic_update(mode);
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+
+static int klp_cond_resched(void)
+{
+ __klp_sched_try_switch();
+ return __cond_resched();
+}
+
+void sched_dynamic_klp_enable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = true;
+ static_call_update(cond_resched, klp_cond_resched);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+void sched_dynamic_klp_disable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = false;
+ __sched_dynamic_update(preempt_dynamic_mode);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
static int __init setup_preempt_mode(char *str)
{
int mode = sched_dynamic_mode(str);
@@ -9932,7 +10027,7 @@ void __init sched_init(void)
/*
* The boot idle thread does lazy MMU switching as well:
*/
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*
@@ -10329,7 +10424,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static void sched_change_group(struct task_struct *tsk)
+static struct task_group *sched_get_task_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -10341,7 +10436,13 @@ static void sched_change_group(struct task_struct *tsk)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
- tsk->sched_task_group = tg;
+
+ return tg;
+}
+
+static void sched_change_group(struct task_struct *tsk, struct task_group *group)
+{
+ tsk->sched_task_group = group;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
@@ -10362,10 +10463,19 @@ void sched_move_task(struct task_struct *tsk)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ struct task_group *group;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
+ /*
+ * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
+ * group changes.
+ */
+ group = sched_get_task_group(tsk);
+ if (group == tsk->sched_task_group)
+ goto unlock;
+
update_rq_clock(rq);
running = task_current(rq, tsk);
@@ -10376,7 +10486,7 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk);
+ sched_change_group(tsk, group);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -10390,6 +10500,7 @@ void sched_move_task(struct task_struct *tsk)
resched_curr(rq);
}
+unlock:
task_rq_unlock(rq, tsk, &rf);
}
@@ -11380,45 +11491,524 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
}
#ifdef CONFIG_SCHED_MM_CID
-void sched_mm_cid_exit_signals(struct task_struct *t)
+
+/**
+ * @cid_lock: Guarantee forward-progress of cid allocation.
+ *
+ * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
+ * is only used when contention is detected by the lock-free allocation so
+ * forward progress can be guaranteed.
+ */
+DEFINE_RAW_SPINLOCK(cid_lock);
+
+/**
+ * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
+ *
+ * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
+ * detected, it is set to 1 to ensure that all newly coming allocations are
+ * serialized by @cid_lock until the allocation which detected contention
+ * completes and sets @use_cid_lock back to 0. This guarantees forward progress
+ * of a cid allocation.
+ */
+int use_cid_lock;
+
+/*
+ * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
+ * concurrently with respect to the execution of the source runqueue context
+ * switch.
+ *
+ * There is one basic properties we want to guarantee here:
+ *
+ * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
+ * used by a task. That would lead to concurrent allocation of the cid and
+ * userspace corruption.
+ *
+ * Provide this guarantee by introducing a Dekker memory ordering to guarantee
+ * that a pair of loads observe at least one of a pair of stores, which can be
+ * shown as:
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible. But rather than using
+ * values 0 and 1, this algorithm cares about specific state transitions of the
+ * runqueue current task (as updated by the scheduler context switch), and the
+ * per-mm/cpu cid value.
+ *
+ * Let's introduce task (Y) which has task->mm == mm and task (N) which has
+ * task->mm != mm for the rest of the discussion. There are two scheduler state
+ * transitions on context switch we care about:
+ *
+ * (TSA) Store to rq->curr with transition from (N) to (Y)
+ *
+ * (TSB) Store to rq->curr with transition from (Y) to (N)
+ *
+ * On the remote-clear side, there is one transition we care about:
+ *
+ * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ *
+ * There is also a transition to UNSET state which can be performed from all
+ * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
+ * guarantees that only a single thread will succeed:
+ *
+ * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ *
+ * Just to be clear, what we do _not_ want to happen is a transition to UNSET
+ * when a thread is actively using the cid (property (1)).
+ *
+ * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
+ *
+ * Scenario A) (TSA)+(TMA) (from next task perspective)
+ *
+ * CPU0 CPU1
+ *
+ * Context switch CS-1 Remote-clear
+ * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA)
+ * (implied barrier after cmpxchg)
+ * - switch_mm_cid()
+ * - memory barrier (see switch_mm_cid()
+ * comment explaining how this barrier
+ * is combined with other scheduler
+ * barriers)
+ * - mm_cid_get (next)
+ * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
+ *
+ * This Dekker ensures that either task (Y) is observed by the
+ * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
+ * observed.
+ *
+ * If task (Y) store is observed by rcu_dereference(), it means that there is
+ * still an active task on the cpu. Remote-clear will therefore not transition
+ * to UNSET, which fulfills property (1).
+ *
+ * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
+ * it will move its state to UNSET, which clears the percpu cid perhaps
+ * uselessly (which is not an issue for correctness). Because task (Y) is not
+ * observed, CPU1 can move ahead to set the state to UNSET. Because moving
+ * state to UNSET is done with a cmpxchg expecting that the old state has the
+ * LAZY flag set, only one thread will successfully UNSET.
+ *
+ * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
+ * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
+ * CPU1 will observe task (Y) and do nothing more, which is fine.
+ *
+ * What we are effectively preventing with this Dekker is a scenario where
+ * neither LAZY flag nor store (Y) are observed, which would fail property (1)
+ * because this would UNSET a cid which is actively used.
+ */
+
+void sched_mm_cid_migrate_from(struct task_struct *t)
+{
+ t->migrate_from_cpu = task_cpu(t);
+}
+
+static
+int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid)
{
struct mm_struct *mm = t->mm;
- unsigned long flags;
+ struct task_struct *src_task;
+ int src_cid, last_mm_cid;
if (!mm)
+ return -1;
+
+ last_mm_cid = t->last_mm_cid;
+ /*
+ * If the migrated task has no last cid, or if the current
+ * task on src rq uses the cid, it means the source cid does not need
+ * to be moved to the destination cpu.
+ */
+ if (last_mm_cid == -1)
+ return -1;
+ src_cid = READ_ONCE(src_pcpu_cid->cid);
+ if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
+ return -1;
+
+ /*
+ * If we observe an active task using the mm on this rq, it means we
+ * are not the last task to be migrated from this cpu for this mm, so
+ * there is no need to move src_cid to the destination cpu.
+ */
+ rcu_read_lock();
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ rcu_read_unlock();
+ t->last_mm_cid = -1;
+ return -1;
+ }
+ rcu_read_unlock();
+
+ return src_cid;
+}
+
+static
+int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid,
+ int src_cid)
+{
+ struct task_struct *src_task;
+ struct mm_struct *mm = t->mm;
+ int lazy_cid;
+
+ if (src_cid == -1)
+ return -1;
+
+ /*
+ * Attempt to clear the source cpu cid to move it to the destination
+ * cpu.
+ */
+ lazy_cid = mm_cid_set_lazy_put(src_cid);
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
+ return -1;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, this task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ rcu_read_lock();
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ rcu_read_unlock();
+ /*
+ * We observed an active task for this mm, there is therefore
+ * no point in moving this cid to the destination cpu.
+ */
+ t->last_mm_cid = -1;
+ return -1;
+ }
+ rcu_read_unlock();
+
+ /*
+ * The src_cid is unused, so it can be unset.
+ */
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ return -1;
+ return src_cid;
+}
+
+/*
+ * Migration to dst cpu. Called with dst_rq lock held.
+ * Interrupts are disabled, which keeps the window of cid ownership without the
+ * source rq lock held small.
+ */
+void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+{
+ struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
+ struct mm_struct *mm = t->mm;
+ int src_cid, dst_cid, src_cpu;
+ struct rq *src_rq;
+
+ lockdep_assert_rq_held(dst_rq);
+
+ if (!mm)
+ return;
+ src_cpu = t->migrate_from_cpu;
+ if (src_cpu == -1) {
+ t->last_mm_cid = -1;
+ return;
+ }
+ /*
+ * Move the src cid if the dst cid is unset. This keeps id
+ * allocation closest to 0 in cases where few threads migrate around
+ * many cpus.
+ *
+ * If destination cid is already set, we may have to just clear
+ * the src cid to ensure compactness in frequent migrations
+ * scenarios.
+ *
+ * It is not useful to clear the src cid when the number of threads is
+ * greater or equal to the number of allowed cpus, because user-space
+ * can expect that the number of allowed cids can reach the number of
+ * allowed cpus.
+ */
+ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
+ dst_cid = READ_ONCE(dst_pcpu_cid->cid);
+ if (!mm_cid_is_unset(dst_cid) &&
+ atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
+ return;
+ src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
+ src_rq = cpu_rq(src_cpu);
+ src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
+ if (src_cid == -1)
+ return;
+ src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
+ src_cid);
+ if (src_cid == -1)
return;
+ if (!mm_cid_is_unset(dst_cid)) {
+ __mm_cid_put(mm, src_cid);
+ return;
+ }
+ /* Move src_cid to dst cpu. */
+ mm_cid_snapshot_time(dst_rq, mm);
+ WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+}
+
+static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
+ int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *t;
+ unsigned long flags;
+ int cid, lazy_cid;
+
+ cid = READ_ONCE(pcpu_cid->cid);
+ if (!mm_cid_is_valid(cid))
+ return;
+
+ /*
+ * Clear the cpu cid if it is set to keep cid allocation compact. If
+ * there happens to be other tasks left on the source cpu using this
+ * mm, the next task using this mm will reallocate its cid on context
+ * switch.
+ */
+ lazy_cid = mm_cid_set_lazy_put(cid);
+ if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
+ return;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, that task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ rcu_read_lock();
+ t = rcu_dereference(rq->curr);
+ if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ /*
+ * The cid is unused, so it can be unset.
+ * Disable interrupts to keep the window of cid ownership without rq
+ * lock small.
+ */
local_irq_save(flags);
- mm_cid_put(mm, t->mm_cid);
- t->mm_cid = -1;
- t->mm_cid_active = 0;
+ if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ __mm_cid_put(mm, cid);
local_irq_restore(flags);
}
+static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct mm_cid *pcpu_cid;
+ struct task_struct *curr;
+ u64 rq_clock;
+
+ /*
+ * rq->clock load is racy on 32-bit but one spurious clear once in a
+ * while is irrelevant.
+ */
+ rq_clock = READ_ONCE(rq->clock);
+ pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+
+ /*
+ * In order to take care of infrequently scheduled tasks, bump the time
+ * snapshot associated with this cid if an active task using the mm is
+ * observed on this rq.
+ */
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
+ WRITE_ONCE(pcpu_cid->time, rq_clock);
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+ return;
+ sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
+ int weight)
+{
+ struct mm_cid *pcpu_cid;
+ int cid;
+
+ pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+ cid = READ_ONCE(pcpu_cid->cid);
+ if (!mm_cid_is_valid(cid) || cid < weight)
+ return;
+ sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+}
+
+static void task_mm_cid_work(struct callback_head *work)
+{
+ unsigned long now = jiffies, old_scan, next_scan;
+ struct task_struct *t = current;
+ struct cpumask *cidmask;
+ struct mm_struct *mm;
+ int weight, cpu;
+
+ SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
+
+ work->next = work; /* Prevent double-add */
+ if (t->flags & PF_EXITING)
+ return;
+ mm = t->mm;
+ if (!mm)
+ return;
+ old_scan = READ_ONCE(mm->mm_cid_next_scan);
+ next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ if (!old_scan) {
+ unsigned long res;
+
+ res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
+ if (res != old_scan)
+ old_scan = res;
+ else
+ old_scan = next_scan;
+ }
+ if (time_before(now, old_scan))
+ return;
+ if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
+ return;
+ cidmask = mm_cidmask(mm);
+ /* Clear cids that were not recently used. */
+ for_each_possible_cpu(cpu)
+ sched_mm_cid_remote_clear_old(mm, cpu);
+ weight = cpumask_weight(cidmask);
+ /*
+ * Clear cids that are greater or equal to the cidmask weight to
+ * recompact it.
+ */
+ for_each_possible_cpu(cpu)
+ sched_mm_cid_remote_clear_weight(mm, cpu, weight);
+}
+
+void init_sched_mm_cid(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ int mm_users = 0;
+
+ if (mm) {
+ mm_users = atomic_read(&mm->mm_users);
+ if (mm_users == 1)
+ mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ }
+ t->cid_work.next = &t->cid_work; /* Protect against double add */
+ init_task_work(&t->cid_work, task_mm_cid_work);
+}
+
+void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->cid_work;
+ unsigned long now = jiffies;
+
+ if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
+ work->next != work)
+ return;
+ if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
+ return;
+ task_work_add(curr, work, TWA_RESUME);
+}
+
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ if (!mm)
+ return;
+
+ preempt_disable();
+ rq = this_rq();
+ rq_lock_irqsave(rq, &rf);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 0);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ mm_cid_put(mm);
+ t->last_mm_cid = t->mm_cid = -1;
+ rq_unlock_irqrestore(rq, &rf);
+}
+
void sched_mm_cid_before_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- unsigned long flags;
+ struct rq_flags rf;
+ struct rq *rq;
if (!mm)
return;
- local_irq_save(flags);
- mm_cid_put(mm, t->mm_cid);
- t->mm_cid = -1;
- t->mm_cid_active = 0;
- local_irq_restore(flags);
+
+ preempt_disable();
+ rq = this_rq();
+ rq_lock_irqsave(rq, &rf);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 0);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ mm_cid_put(mm);
+ t->last_mm_cid = t->mm_cid = -1;
+ rq_unlock_irqrestore(rq, &rf);
}
void sched_mm_cid_after_execve(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- unsigned long flags;
+ struct rq_flags rf;
+ struct rq *rq;
if (!mm)
return;
- local_irq_save(flags);
- t->mm_cid = mm_cid_get(mm);
- t->mm_cid_active = 1;
- local_irq_restore(flags);
+
+ preempt_disable();
+ rq = this_rq();
+ rq_lock_irqsave(rq, &rf);
+ preempt_enable_no_resched(); /* holding spinlock */
+ WRITE_ONCE(t->mm_cid_active, 1);
+ /*
+ * Store t->mm_cid_active before loading per-mm/cpu cid.
+ * Matches barrier in sched_mm_cid_remote_clear_old().
+ */
+ smp_mb();
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ rq_unlock_irqrestore(rq, &rf);
rseq_set_notify_resume(t);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 71b24371a6f7..5a9a4b81c972 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2246,6 +2246,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
task_on_cpu(rq, task) ||
!dl_task(task) ||
+ is_migration_disabled(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
@@ -2704,6 +2705,13 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
#endif
}
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_dl(struct task_struct *p, int cpu)
+{
+ return p->dl.dl_throttled;
+}
+#endif
+
DEFINE_SCHED_CLASS(dl) = {
.enqueue_task = enqueue_task_dl,
@@ -2736,6 +2744,9 @@ DEFINE_SCHED_CLASS(dl) = {
.switched_to = switched_to_dl,
.update_curr = update_curr_dl,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_dl,
+#endif
};
/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..0b2340a79b65 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -280,6 +280,45 @@ static const struct file_operations sched_dynamic_fops = {
__read_mostly bool sched_debug_verbose;
+#ifdef CONFIG_SMP
+static struct dentry *sd_dentry;
+
+
+static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ ssize_t result;
+ bool orig;
+
+ cpus_read_lock();
+ mutex_lock(&sched_domains_mutex);
+
+ orig = sched_debug_verbose;
+ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
+
+ if (sched_debug_verbose && !orig)
+ update_sched_domain_debugfs();
+ else if (!sched_debug_verbose && orig) {
+ debugfs_remove(sd_dentry);
+ sd_dentry = NULL;
+ }
+
+ mutex_unlock(&sched_domains_mutex);
+ cpus_read_unlock();
+
+ return result;
+}
+#else
+#define sched_verbose_write debugfs_write_file_bool
+#endif
+
+static const struct file_operations sched_verbose_fops = {
+ .read = debugfs_read_file_bool,
+ .write = sched_verbose_write,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
static const struct seq_operations sched_debug_sops;
static int sched_debug_open(struct inode *inode, struct file *filp)
@@ -303,7 +342,7 @@ static __init int sched_init_debug(void)
debugfs_sched = debugfs_create_dir("sched", NULL);
debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
- debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
+ debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops);
#ifdef CONFIG_PREEMPT_DYNAMIC
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif
@@ -345,7 +384,6 @@ late_initcall(sched_init_debug);
#ifdef CONFIG_SMP
static cpumask_var_t sd_sysctl_cpus;
-static struct dentry *sd_dentry;
static int sd_flags_show(struct seq_file *m, void *v)
{
@@ -402,15 +440,23 @@ void update_sched_domain_debugfs(void)
if (!debugfs_sched)
return;
+ if (!sched_debug_verbose)
+ return;
+
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
- if (!sd_dentry)
+ if (!sd_dentry) {
sd_dentry = debugfs_create_dir("domains", debugfs_sched);
+ /* rebuild sd_sysctl_cpus if empty since it gets cleared below */
+ if (cpumask_empty(sd_sysctl_cpus))
+ cpumask_copy(sd_sysctl_cpus, cpu_online_mask);
+ }
+
for_each_cpu(cpu, sd_sysctl_cpus) {
struct sched_domain *sd;
struct dentry *d_cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a1b1f855b96..373ff5f55884 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2928,6 +2928,24 @@ static void reset_ptenuma_scan(struct task_struct *p)
p->mm->numa_scan_offset = 0;
}
+static bool vma_is_accessed(struct vm_area_struct *vma)
+{
+ unsigned long pids;
+ /*
+ * Allow unconditional access first two times, so that all the (pages)
+ * of VMAs get prot_none fault introduced irrespective of accesses.
+ * This is also done to avoid any side effect of task scanning
+ * amplifying the unfairness of disjoint set of VMAs' access.
+ */
+ if (READ_ONCE(current->mm->numa_scan_seq) < 2)
+ return true;
+
+ pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1];
+ return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids);
+}
+
+#define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay)
+
/*
* The expensive part of numa migration is done from task_work context.
* Triggered from task_tick_numa().
@@ -3027,6 +3045,45 @@ static void task_numa_work(struct callback_head *work)
if (!vma_is_accessible(vma))
continue;
+ /* Initialise new per-VMA NUMAB state. */
+ if (!vma->numab_state) {
+ vma->numab_state = kzalloc(sizeof(struct vma_numab_state),
+ GFP_KERNEL);
+ if (!vma->numab_state)
+ continue;
+
+ vma->numab_state->next_scan = now +
+ msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+
+ /* Reset happens after 4 times scan delay of scan start */
+ vma->numab_state->next_pid_reset = vma->numab_state->next_scan +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ }
+
+ /*
+ * Scanning the VMA's of short lived tasks add more overhead. So
+ * delay the scan for new VMAs.
+ */
+ if (mm->numa_scan_seq && time_before(jiffies,
+ vma->numab_state->next_scan))
+ continue;
+
+ /* Do not scan the VMA if task has not accessed */
+ if (!vma_is_accessed(vma))
+ continue;
+
+ /*
+ * RESET access PIDs regularly for old VMAs. Resetting after checking
+ * vma for recent access to avoid clearing PID info before access..
+ */
+ if (mm->numa_scan_seq &&
+ time_after(jiffies, vma->numab_state->next_pid_reset)) {
+ vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset +
+ msecs_to_jiffies(VMA_PID_RESET_PERIOD);
+ vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]);
+ vma->numab_state->access_pids[1] = 0;
+ }
+
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -4648,11 +4705,33 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
#endif
}
+static inline bool entity_is_long_sleeper(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq;
+ u64 sleep_time;
+
+ if (se->exec_start == 0)
+ return false;
+
+ cfs_rq = cfs_rq_of(se);
+
+ sleep_time = rq_clock_task(rq_of(cfs_rq));
+
+ /* Happen while migrating because of clock task divergence */
+ if (sleep_time <= se->exec_start)
+ return false;
+
+ sleep_time -= se->exec_start;
+ if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD)))
+ return true;
+
+ return false;
+}
+
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
- u64 sleep_time;
/*
* The 'current' period is already promised to the current tasks,
@@ -4684,13 +4763,24 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
/*
* Pull vruntime of the entity being placed to the base level of
- * cfs_rq, to prevent boosting it if placed backwards. If the entity
- * slept for a long time, don't even try to compare its vruntime with
- * the base as it may be too far off and the comparison may get
- * inversed due to s64 overflow.
- */
- sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
- if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+ * cfs_rq, to prevent boosting it if placed backwards.
+ * However, min_vruntime can advance much faster than real time, with
+ * the extreme being when an entity with the minimal weight always runs
+ * on the cfs_rq. If the waking entity slept for a long time, its
+ * vruntime difference from min_vruntime may overflow s64 and their
+ * comparison may get inversed, so ignore the entity's original
+ * vruntime in that case.
+ * The maximal vruntime speedup is given by the ratio of normal to
+ * minimal weight: scale_load_down(NICE_0_LOAD) / MIN_SHARES.
+ * When placing a migrated waking entity, its exec_start has been set
+ * from a different rq. In order to take into account a possible
+ * divergence between new and prev rq's clocks task because of irq and
+ * stolen time, we take an additional margin.
+ * So, cutting off on the sleep time of
+ * 2^63 / scale_load_down(NICE_0_LOAD) ~ 104 days
+ * should be safe.
+ */
+ if (entity_is_long_sleeper(se))
se->vruntime = vruntime;
else
se->vruntime = max_vruntime(se->vruntime, vruntime);
@@ -4770,6 +4860,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
+ /* Entity has migrated, no longer consider this task hot */
+ if (flags & ENQUEUE_MIGRATED)
+ se->exec_start = 0;
check_schedstat_required();
update_stats_enqueue_fair(cfs_rq, se, flags);
@@ -5923,6 +6016,10 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
cfs_b->period_timer.function = sched_cfs_period_timer;
+
+ /* Add a random offset so that timers interleave */
+ hrtimer_set_expires(&cfs_b->period_timer,
+ get_random_u32_below(cfs_b->period));
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
cfs_b->slack_started = false;
@@ -6578,7 +6675,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->stats.nr_wakeups_affine_attempts);
- if (target == nr_cpumask_bits)
+ if (target != this_cpu)
return prev_cpu;
schedstat_inc(sd->ttwu_move_affine);
@@ -7657,9 +7754,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
/* Tell new CPU we are migrated */
se->avg.last_update_time = 0;
- /* We have migrated, no longer consider this task hot */
- se->exec_start = 0;
-
update_scan_period(p, new_cpu);
}
@@ -10205,6 +10299,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
sds->total_capacity;
+
+ /*
+ * If the local group is more loaded than the average system
+ * load, don't try to pull any tasks.
+ */
+ if (local->avg_load >= sds->avg_load) {
+ env->imbalance = 0;
+ return;
+ }
+
}
/*
@@ -11933,6 +12037,18 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
return delta > 0;
}
+
+static int task_is_throttled_fair(struct task_struct *p, int cpu)
+{
+ struct cfs_rq *cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ cfs_rq = task_group(p)->cfs_rq[cpu];
+#else
+ cfs_rq = &cpu_rq(cpu)->cfs;
+#endif
+ return throttled_hierarchy(cfs_rq);
+}
#else
static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
#endif
@@ -12559,6 +12675,10 @@ DEFINE_SCHED_CLASS(fair) = {
.task_change_group = task_change_group_fair,
#endif
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_fair,
+#endif
+
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index e9ef66be2870..342f58a329f5 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -75,7 +75,7 @@ static noinline int __cpuidle cpu_idle_poll(void)
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
-void __weak arch_cpu_idle_dead(void) { }
+void __weak __noreturn arch_cpu_idle_dead(void) { while (1); }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 02e011cabe91..e072f6b31bf3 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -186,17 +186,22 @@ static void group_init(struct psi_group *group)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
group->avg_next_update = group->avg_last_update + psi_period;
- INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
mutex_init(&group->avgs_lock);
- /* Init trigger-related members */
- atomic_set(&group->poll_scheduled, 0);
- mutex_init(&group->trigger_lock);
- INIT_LIST_HEAD(&group->triggers);
- group->poll_min_period = U32_MAX;
- group->polling_next_update = ULLONG_MAX;
- init_waitqueue_head(&group->poll_wait);
- timer_setup(&group->poll_timer, poll_timer_fn, 0);
- rcu_assign_pointer(group->poll_task, NULL);
+
+ /* Init avg trigger-related members */
+ INIT_LIST_HEAD(&group->avg_triggers);
+ memset(group->avg_nr_triggers, 0, sizeof(group->avg_nr_triggers));
+ INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
+
+ /* Init rtpoll trigger-related members */
+ atomic_set(&group->rtpoll_scheduled, 0);
+ mutex_init(&group->rtpoll_trigger_lock);
+ INIT_LIST_HEAD(&group->rtpoll_triggers);
+ group->rtpoll_min_period = U32_MAX;
+ group->rtpoll_next_update = ULLONG_MAX;
+ init_waitqueue_head(&group->rtpoll_wait);
+ timer_setup(&group->rtpoll_timer, poll_timer_fn, 0);
+ rcu_assign_pointer(group->rtpoll_task, NULL);
}
void __init psi_init(void)
@@ -384,92 +389,6 @@ static void collect_percpu_times(struct psi_group *group,
*pchanged_states = changed_states;
}
-static u64 update_averages(struct psi_group *group, u64 now)
-{
- unsigned long missed_periods = 0;
- u64 expires, period;
- u64 avg_next_update;
- int s;
-
- /* avgX= */
- expires = group->avg_next_update;
- if (now - expires >= psi_period)
- missed_periods = div_u64(now - expires, psi_period);
-
- /*
- * The periodic clock tick can get delayed for various
- * reasons, especially on loaded systems. To avoid clock
- * drift, we schedule the clock in fixed psi_period intervals.
- * But the deltas we sample out of the per-cpu buckets above
- * are based on the actual time elapsing between clock ticks.
- */
- avg_next_update = expires + ((1 + missed_periods) * psi_period);
- period = now - (group->avg_last_update + (missed_periods * psi_period));
- group->avg_last_update = now;
-
- for (s = 0; s < NR_PSI_STATES - 1; s++) {
- u32 sample;
-
- sample = group->total[PSI_AVGS][s] - group->avg_total[s];
- /*
- * Due to the lockless sampling of the time buckets,
- * recorded time deltas can slip into the next period,
- * which under full pressure can result in samples in
- * excess of the period length.
- *
- * We don't want to report non-sensical pressures in
- * excess of 100%, nor do we want to drop such events
- * on the floor. Instead we punt any overage into the
- * future until pressure subsides. By doing this we
- * don't underreport the occurring pressure curve, we
- * just report it delayed by one period length.
- *
- * The error isn't cumulative. As soon as another
- * delta slips from a period P to P+1, by definition
- * it frees up its time T in P.
- */
- if (sample > period)
- sample = period;
- group->avg_total[s] += sample;
- calc_avgs(group->avg[s], missed_periods, sample, period);
- }
-
- return avg_next_update;
-}
-
-static void psi_avgs_work(struct work_struct *work)
-{
- struct delayed_work *dwork;
- struct psi_group *group;
- u32 changed_states;
- u64 now;
-
- dwork = to_delayed_work(work);
- group = container_of(dwork, struct psi_group, avgs_work);
-
- mutex_lock(&group->avgs_lock);
-
- now = sched_clock();
-
- collect_percpu_times(group, PSI_AVGS, &changed_states);
- /*
- * If there is task activity, periodically fold the per-cpu
- * times and feed samples into the running averages. If things
- * are idle and there is no data to process, stop the clock.
- * Once restarted, we'll catch up the running averages in one
- * go - see calc_avgs() and missed_periods.
- */
- if (now >= group->avg_next_update)
- group->avg_next_update = update_averages(group, now);
-
- if (changed_states & PSI_STATE_RESCHEDULE) {
- schedule_delayed_work(dwork, nsecs_to_jiffies(
- group->avg_next_update - now) + 1);
- }
-
- mutex_unlock(&group->avgs_lock);
-}
-
/* Trigger tracking window manipulations */
static void window_reset(struct psi_window *win, u64 now, u64 value,
u64 prev_growth)
@@ -516,33 +435,32 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
return growth;
}
-static void init_triggers(struct psi_group *group, u64 now)
-{
- struct psi_trigger *t;
-
- list_for_each_entry(t, &group->triggers, node)
- window_reset(&t->win, now,
- group->total[PSI_POLL][t->state], 0);
- memcpy(group->polling_total, group->total[PSI_POLL],
- sizeof(group->polling_total));
- group->polling_next_update = now + group->poll_min_period;
-}
-
-static u64 update_triggers(struct psi_group *group, u64 now)
+static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
+ enum psi_aggregators aggregator)
{
struct psi_trigger *t;
- bool update_total = false;
- u64 *total = group->total[PSI_POLL];
+ u64 *total = group->total[aggregator];
+ struct list_head *triggers;
+ u64 *aggregator_total;
+ *update_total = false;
+
+ if (aggregator == PSI_AVGS) {
+ triggers = &group->avg_triggers;
+ aggregator_total = group->avg_total;
+ } else {
+ triggers = &group->rtpoll_triggers;
+ aggregator_total = group->rtpoll_total;
+ }
/*
* On subsequent updates, calculate growth deltas and let
* watchers know when their specified thresholds are exceeded.
*/
- list_for_each_entry(t, &group->triggers, node) {
+ list_for_each_entry(t, triggers, node) {
u64 growth;
bool new_stall;
- new_stall = group->polling_total[t->state] != total[t->state];
+ new_stall = aggregator_total[t->state] != total[t->state];
/* Check for stall activity or a previous threshold breach */
if (!new_stall && !t->pending_event)
@@ -560,7 +478,7 @@ static u64 update_triggers(struct psi_group *group, u64 now)
* been through all of them. Also remember to extend the
* polling time if we see new stall activity.
*/
- update_total = true;
+ *update_total = true;
/* Calculate growth since last update */
growth = window_update(&t->win, now, total[t->state]);
@@ -583,52 +501,150 @@ static u64 update_triggers(struct psi_group *group, u64 now)
t->pending_event = false;
}
- if (update_total)
- memcpy(group->polling_total, total,
- sizeof(group->polling_total));
+ return now + group->rtpoll_min_period;
+}
+
+static u64 update_averages(struct psi_group *group, u64 now)
+{
+ unsigned long missed_periods = 0;
+ u64 expires, period;
+ u64 avg_next_update;
+ int s;
+
+ /* avgX= */
+ expires = group->avg_next_update;
+ if (now - expires >= psi_period)
+ missed_periods = div_u64(now - expires, psi_period);
+
+ /*
+ * The periodic clock tick can get delayed for various
+ * reasons, especially on loaded systems. To avoid clock
+ * drift, we schedule the clock in fixed psi_period intervals.
+ * But the deltas we sample out of the per-cpu buckets above
+ * are based on the actual time elapsing between clock ticks.
+ */
+ avg_next_update = expires + ((1 + missed_periods) * psi_period);
+ period = now - (group->avg_last_update + (missed_periods * psi_period));
+ group->avg_last_update = now;
+
+ for (s = 0; s < NR_PSI_STATES - 1; s++) {
+ u32 sample;
+
+ sample = group->total[PSI_AVGS][s] - group->avg_total[s];
+ /*
+ * Due to the lockless sampling of the time buckets,
+ * recorded time deltas can slip into the next period,
+ * which under full pressure can result in samples in
+ * excess of the period length.
+ *
+ * We don't want to report non-sensical pressures in
+ * excess of 100%, nor do we want to drop such events
+ * on the floor. Instead we punt any overage into the
+ * future until pressure subsides. By doing this we
+ * don't underreport the occurring pressure curve, we
+ * just report it delayed by one period length.
+ *
+ * The error isn't cumulative. As soon as another
+ * delta slips from a period P to P+1, by definition
+ * it frees up its time T in P.
+ */
+ if (sample > period)
+ sample = period;
+ group->avg_total[s] += sample;
+ calc_avgs(group->avg[s], missed_periods, sample, period);
+ }
+
+ return avg_next_update;
+}
+
+static void psi_avgs_work(struct work_struct *work)
+{
+ struct delayed_work *dwork;
+ struct psi_group *group;
+ u32 changed_states;
+ bool update_total;
+ u64 now;
+
+ dwork = to_delayed_work(work);
+ group = container_of(dwork, struct psi_group, avgs_work);
+
+ mutex_lock(&group->avgs_lock);
+
+ now = sched_clock();
+
+ collect_percpu_times(group, PSI_AVGS, &changed_states);
+ /*
+ * If there is task activity, periodically fold the per-cpu
+ * times and feed samples into the running averages. If things
+ * are idle and there is no data to process, stop the clock.
+ * Once restarted, we'll catch up the running averages in one
+ * go - see calc_avgs() and missed_periods.
+ */
+ if (now >= group->avg_next_update) {
+ update_triggers(group, now, &update_total, PSI_AVGS);
+ group->avg_next_update = update_averages(group, now);
+ }
+
+ if (changed_states & PSI_STATE_RESCHEDULE) {
+ schedule_delayed_work(dwork, nsecs_to_jiffies(
+ group->avg_next_update - now) + 1);
+ }
- return now + group->poll_min_period;
+ mutex_unlock(&group->avgs_lock);
+}
+
+static void init_rtpoll_triggers(struct psi_group *group, u64 now)
+{
+ struct psi_trigger *t;
+
+ list_for_each_entry(t, &group->rtpoll_triggers, node)
+ window_reset(&t->win, now,
+ group->total[PSI_POLL][t->state], 0);
+ memcpy(group->rtpoll_total, group->total[PSI_POLL],
+ sizeof(group->rtpoll_total));
+ group->rtpoll_next_update = now + group->rtpoll_min_period;
}
/* Schedule polling if it's not already scheduled or forced. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
+static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
bool force)
{
struct task_struct *task;
/*
* atomic_xchg should be called even when !force to provide a
- * full memory barrier (see the comment inside psi_poll_work).
+ * full memory barrier (see the comment inside psi_rtpoll_work).
*/
- if (atomic_xchg(&group->poll_scheduled, 1) && !force)
+ if (atomic_xchg(&group->rtpoll_scheduled, 1) && !force)
return;
rcu_read_lock();
- task = rcu_dereference(group->poll_task);
+ task = rcu_dereference(group->rtpoll_task);
/*
* kworker might be NULL in case psi_trigger_destroy races with
* psi_task_change (hotpath) which can't use locks
*/
if (likely(task))
- mod_timer(&group->poll_timer, jiffies + delay);
+ mod_timer(&group->rtpoll_timer, jiffies + delay);
else
- atomic_set(&group->poll_scheduled, 0);
+ atomic_set(&group->rtpoll_scheduled, 0);
rcu_read_unlock();
}
-static void psi_poll_work(struct psi_group *group)
+static void psi_rtpoll_work(struct psi_group *group)
{
bool force_reschedule = false;
u32 changed_states;
+ bool update_total;
u64 now;
- mutex_lock(&group->trigger_lock);
+ mutex_lock(&group->rtpoll_trigger_lock);
now = sched_clock();
- if (now > group->polling_until) {
+ if (now > group->rtpoll_until) {
/*
* We are either about to start or might stop polling if no
* state change was recorded. Resetting poll_scheduled leaves
@@ -638,7 +654,7 @@ static void psi_poll_work(struct psi_group *group)
* should be negligible and polling_next_update still keeps
* updates correctly on schedule.
*/
- atomic_set(&group->poll_scheduled, 0);
+ atomic_set(&group->rtpoll_scheduled, 0);
/*
* A task change can race with the poll worker that is supposed to
* report on it. To avoid missing events, ensure ordering between
@@ -667,60 +683,64 @@ static void psi_poll_work(struct psi_group *group)
collect_percpu_times(group, PSI_POLL, &changed_states);
- if (changed_states & group->poll_states) {
+ if (changed_states & group->rtpoll_states) {
/* Initialize trigger windows when entering polling mode */
- if (now > group->polling_until)
- init_triggers(group, now);
+ if (now > group->rtpoll_until)
+ init_rtpoll_triggers(group, now);
/*
* Keep the monitor active for at least the duration of the
* minimum tracking window as long as monitor states are
* changing.
*/
- group->polling_until = now +
- group->poll_min_period * UPDATES_PER_WINDOW;
+ group->rtpoll_until = now +
+ group->rtpoll_min_period * UPDATES_PER_WINDOW;
}
- if (now > group->polling_until) {
- group->polling_next_update = ULLONG_MAX;
+ if (now > group->rtpoll_until) {
+ group->rtpoll_next_update = ULLONG_MAX;
goto out;
}
- if (now >= group->polling_next_update)
- group->polling_next_update = update_triggers(group, now);
+ if (now >= group->rtpoll_next_update) {
+ group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL);
+ if (update_total)
+ memcpy(group->rtpoll_total, group->total[PSI_POLL],
+ sizeof(group->rtpoll_total));
+ }
- psi_schedule_poll_work(group,
- nsecs_to_jiffies(group->polling_next_update - now) + 1,
+ psi_schedule_rtpoll_work(group,
+ nsecs_to_jiffies(group->rtpoll_next_update - now) + 1,
force_reschedule);
out:
- mutex_unlock(&group->trigger_lock);
+ mutex_unlock(&group->rtpoll_trigger_lock);
}
-static int psi_poll_worker(void *data)
+static int psi_rtpoll_worker(void *data)
{
struct psi_group *group = (struct psi_group *)data;
sched_set_fifo_low(current);
while (true) {
- wait_event_interruptible(group->poll_wait,
- atomic_cmpxchg(&group->poll_wakeup, 1, 0) ||
+ wait_event_interruptible(group->rtpoll_wait,
+ atomic_cmpxchg(&group->rtpoll_wakeup, 1, 0) ||
kthread_should_stop());
if (kthread_should_stop())
break;
- psi_poll_work(group);
+ psi_rtpoll_work(group);
}
return 0;
}
static void poll_timer_fn(struct timer_list *t)
{
- struct psi_group *group = from_timer(group, t, poll_timer);
+ struct psi_group *group = from_timer(group, t, rtpoll_timer);
- atomic_set(&group->poll_wakeup, 1);
- wake_up_interruptible(&group->poll_wait);
+ atomic_set(&group->rtpoll_wakeup, 1);
+ wake_up_interruptible(&group->rtpoll_wait);
}
static void record_times(struct psi_group_cpu *groupc, u64 now)
@@ -851,8 +871,8 @@ static void psi_group_change(struct psi_group *group, int cpu,
write_seqcount_end(&groupc->seq);
- if (state_mask & group->poll_states)
- psi_schedule_poll_work(group, 1, false);
+ if (state_mask & group->rtpoll_states)
+ psi_schedule_rtpoll_work(group, 1, false);
if (wake_clock && !delayed_work_pending(&group->avgs_work))
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
@@ -1005,8 +1025,8 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
write_seqcount_end(&groupc->seq);
- if (group->poll_states & (1 << PSI_IRQ_FULL))
- psi_schedule_poll_work(group, 1, false);
+ if (group->rtpoll_states & (1 << PSI_IRQ_FULL))
+ psi_schedule_rtpoll_work(group, 1, false);
} while ((group = group->parent));
}
#endif
@@ -1101,7 +1121,7 @@ void psi_cgroup_free(struct cgroup *cgroup)
cancel_delayed_work_sync(&cgroup->psi->avgs_work);
free_percpu(cgroup->psi->pcpu);
/* All triggers must be removed by now */
- WARN_ONCE(cgroup->psi->poll_states, "psi: trigger leak\n");
+ WARN_ONCE(cgroup->psi->rtpoll_states, "psi: trigger leak\n");
kfree(cgroup->psi);
}
@@ -1253,16 +1273,23 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
}
struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, enum psi_res res)
+ char *buf, enum psi_res res, struct file *file)
{
struct psi_trigger *t;
enum psi_states state;
u32 threshold_us;
+ bool privileged;
u32 window_us;
if (static_branch_likely(&psi_disabled))
return ERR_PTR(-EOPNOTSUPP);
+ /*
+ * Checking the privilege here on file->f_cred implies that a privileged user
+ * could open the file and delegate the write to an unprivileged one.
+ */
+ privileged = cap_raised(file->f_cred->cap_effective, CAP_SYS_RESOURCE);
+
if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2)
state = PSI_IO_SOME + res * 2;
else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2)
@@ -1282,6 +1309,13 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
window_us > WINDOW_MAX_US)
return ERR_PTR(-EINVAL);
+ /*
+ * Unprivileged users can only use 2s windows so that averages aggregation
+ * work is used, and no RT threads need to be spawned.
+ */
+ if (!privileged && window_us % 2000000)
+ return ERR_PTR(-EINVAL);
+
/* Check threshold */
if (threshold_us == 0 || threshold_us > window_us)
return ERR_PTR(-EINVAL);
@@ -1301,31 +1335,40 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->last_event_time = 0;
init_waitqueue_head(&t->event_wait);
t->pending_event = false;
+ t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
- mutex_lock(&group->trigger_lock);
+ if (privileged) {
+ mutex_lock(&group->rtpoll_trigger_lock);
- if (!rcu_access_pointer(group->poll_task)) {
- struct task_struct *task;
+ if (!rcu_access_pointer(group->rtpoll_task)) {
+ struct task_struct *task;
- task = kthread_create(psi_poll_worker, group, "psimon");
- if (IS_ERR(task)) {
- kfree(t);
- mutex_unlock(&group->trigger_lock);
- return ERR_CAST(task);
+ task = kthread_create(psi_rtpoll_worker, group, "psimon");
+ if (IS_ERR(task)) {
+ kfree(t);
+ mutex_unlock(&group->rtpoll_trigger_lock);
+ return ERR_CAST(task);
+ }
+ atomic_set(&group->rtpoll_wakeup, 0);
+ wake_up_process(task);
+ rcu_assign_pointer(group->rtpoll_task, task);
}
- atomic_set(&group->poll_wakeup, 0);
- wake_up_process(task);
- rcu_assign_pointer(group->poll_task, task);
- }
- list_add(&t->node, &group->triggers);
- group->poll_min_period = min(group->poll_min_period,
- div_u64(t->win.size, UPDATES_PER_WINDOW));
- group->nr_triggers[t->state]++;
- group->poll_states |= (1 << t->state);
+ list_add(&t->node, &group->rtpoll_triggers);
+ group->rtpoll_min_period = min(group->rtpoll_min_period,
+ div_u64(t->win.size, UPDATES_PER_WINDOW));
+ group->rtpoll_nr_triggers[t->state]++;
+ group->rtpoll_states |= (1 << t->state);
+
+ mutex_unlock(&group->rtpoll_trigger_lock);
+ } else {
+ mutex_lock(&group->avgs_lock);
- mutex_unlock(&group->trigger_lock);
+ list_add(&t->node, &group->avg_triggers);
+ group->avg_nr_triggers[t->state]++;
+ mutex_unlock(&group->avgs_lock);
+ }
return t;
}
@@ -1349,51 +1392,59 @@ void psi_trigger_destroy(struct psi_trigger *t)
*/
wake_up_pollfree(&t->event_wait);
- mutex_lock(&group->trigger_lock);
-
- if (!list_empty(&t->node)) {
- struct psi_trigger *tmp;
- u64 period = ULLONG_MAX;
-
- list_del(&t->node);
- group->nr_triggers[t->state]--;
- if (!group->nr_triggers[t->state])
- group->poll_states &= ~(1 << t->state);
- /* reset min update period for the remaining triggers */
- list_for_each_entry(tmp, &group->triggers, node)
- period = min(period, div_u64(tmp->win.size,
- UPDATES_PER_WINDOW));
- group->poll_min_period = period;
- /* Destroy poll_task when the last trigger is destroyed */
- if (group->poll_states == 0) {
- group->polling_until = 0;
- task_to_destroy = rcu_dereference_protected(
- group->poll_task,
- lockdep_is_held(&group->trigger_lock));
- rcu_assign_pointer(group->poll_task, NULL);
- del_timer(&group->poll_timer);
+ if (t->aggregator == PSI_AVGS) {
+ mutex_lock(&group->avgs_lock);
+ if (!list_empty(&t->node)) {
+ list_del(&t->node);
+ group->avg_nr_triggers[t->state]--;
}
+ mutex_unlock(&group->avgs_lock);
+ } else {
+ mutex_lock(&group->rtpoll_trigger_lock);
+ if (!list_empty(&t->node)) {
+ struct psi_trigger *tmp;
+ u64 period = ULLONG_MAX;
+
+ list_del(&t->node);
+ group->rtpoll_nr_triggers[t->state]--;
+ if (!group->rtpoll_nr_triggers[t->state])
+ group->rtpoll_states &= ~(1 << t->state);
+ /* reset min update period for the remaining triggers */
+ list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->rtpoll_min_period = period;
+ /* Destroy rtpoll_task when the last trigger is destroyed */
+ if (group->rtpoll_states == 0) {
+ group->rtpoll_until = 0;
+ task_to_destroy = rcu_dereference_protected(
+ group->rtpoll_task,
+ lockdep_is_held(&group->rtpoll_trigger_lock));
+ rcu_assign_pointer(group->rtpoll_task, NULL);
+ del_timer(&group->rtpoll_timer);
+ }
+ }
+ mutex_unlock(&group->rtpoll_trigger_lock);
}
- mutex_unlock(&group->trigger_lock);
-
/*
- * Wait for psi_schedule_poll_work RCU to complete its read-side
+ * Wait for psi_schedule_rtpoll_work RCU to complete its read-side
* critical section before destroying the trigger and optionally the
- * poll_task.
+ * rtpoll_task.
*/
synchronize_rcu();
/*
- * Stop kthread 'psimon' after releasing trigger_lock to prevent a
- * deadlock while waiting for psi_poll_work to acquire trigger_lock
+ * Stop kthread 'psimon' after releasing rtpoll_trigger_lock to prevent
+ * a deadlock while waiting for psi_rtpoll_work to acquire
+ * rtpoll_trigger_lock
*/
if (task_to_destroy) {
/*
* After the RCU grace period has expired, the worker
- * can no longer be found through group->poll_task.
+ * can no longer be found through group->rtpoll_task.
*/
kthread_stop(task_to_destroy);
- atomic_set(&group->poll_scheduled, 0);
+ atomic_set(&group->rtpoll_scheduled, 0);
}
kfree(t);
}
@@ -1435,27 +1486,19 @@ static int psi_cpu_show(struct seq_file *m, void *v)
return psi_show(m, &psi_system, PSI_CPU);
}
-static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
-{
- if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- return single_open(file, psi_show, NULL);
-}
-
static int psi_io_open(struct inode *inode, struct file *file)
{
- return psi_open(file, psi_io_show);
+ return single_open(file, psi_io_show, NULL);
}
static int psi_memory_open(struct inode *inode, struct file *file)
{
- return psi_open(file, psi_memory_show);
+ return single_open(file, psi_memory_show, NULL);
}
static int psi_cpu_open(struct inode *inode, struct file *file)
{
- return psi_open(file, psi_cpu_show);
+ return single_open(file, psi_cpu_show, NULL);
}
static ssize_t psi_write(struct file *file, const char __user *user_buf,
@@ -1489,7 +1532,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
return -EBUSY;
}
- new = psi_trigger_create(&psi_system, buf, res);
+ new = psi_trigger_create(&psi_system, buf, res, file);
if (IS_ERR(new)) {
mutex_unlock(&seq->lock);
return PTR_ERR(new);
@@ -1569,7 +1612,7 @@ static int psi_irq_show(struct seq_file *m, void *v)
static int psi_irq_open(struct inode *inode, struct file *file)
{
- return psi_open(file, psi_irq_show);
+ return single_open(file, psi_irq_show, NULL);
}
static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0a11f44adee5..00e0e5074115 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2000,11 +2000,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
* the mean time, task could have
* migrated already or had its affinity changed.
* Also make sure that it wasn't scheduled on its rq.
+ * It is possible the task was scheduled, set
+ * "migrate_disabled" and then got preempted, so we must
+ * check the task migration disable flag here too.
*/
if (unlikely(task_rq(task) != rq ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
task_on_cpu(rq, task) ||
!rt_task(task) ||
+ is_migration_disabled(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
@@ -2677,6 +2681,21 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
+#ifdef CONFIG_SCHED_CORE
+static int task_is_throttled_rt(struct task_struct *p, int cpu)
+{
+ struct rt_rq *rt_rq;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq = task_group(p)->rt_rq[cpu];
+#else
+ rt_rq = &cpu_rq(cpu)->rt;
+#endif
+
+ return rt_rq_throttled(rt_rq);
+}
+#endif
+
DEFINE_SCHED_CLASS(rt) = {
.enqueue_task = enqueue_task_rt,
@@ -2710,6 +2729,10 @@ DEFINE_SCHED_CLASS(rt) = {
.update_curr = update_curr_rt,
+#ifdef CONFIG_SCHED_CORE
+ .task_is_throttled = task_is_throttled_rt,
+#endif
+
#ifdef CONFIG_UCLAMP_TASK
.uclamp_enabled = 1,
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3e8df6d31c1e..ec7b3e0a2b20 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2224,6 +2224,10 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p);
#endif
+
+#ifdef CONFIG_SCHED_CORE
+ int (*task_is_throttled)(struct task_struct *p, int cpu);
+#endif
};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
@@ -3249,61 +3253,238 @@ static inline void update_current_exec_runtime(struct task_struct *curr,
}
#ifdef CONFIG_SCHED_MM_CID
-static inline int __mm_cid_get(struct mm_struct *mm)
+
+#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
+#define MM_CID_SCAN_DELAY 100 /* 100ms */
+
+extern raw_spinlock_t cid_lock;
+extern int use_cid_lock;
+
+extern void sched_mm_cid_migrate_from(struct task_struct *t);
+extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
+extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
+extern void init_sched_mm_cid(struct task_struct *t);
+
+static inline void __mm_cid_put(struct mm_struct *mm, int cid)
+{
+ if (cid < 0)
+ return;
+ cpumask_clear_cpu(cid, mm_cidmask(mm));
+}
+
+/*
+ * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
+ * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
+ * be held to transition to other states.
+ *
+ * State transitions synchronized with cmpxchg or try_cmpxchg need to be
+ * consistent across cpus, which prevents use of this_cpu_cmpxchg.
+ */
+static inline void mm_cid_put_lazy(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid;
+
+ lockdep_assert_irqs_disabled();
+ cid = __this_cpu_read(pcpu_cid->cid);
+ if (!mm_cid_is_lazy_put(cid) ||
+ !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+ return;
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
+{
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid, res;
+
+ lockdep_assert_irqs_disabled();
+ cid = __this_cpu_read(pcpu_cid->cid);
+ for (;;) {
+ if (mm_cid_is_unset(cid))
+ return MM_CID_UNSET;
+ /*
+ * Attempt transition from valid or lazy-put to unset.
+ */
+ res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
+ if (res == cid)
+ break;
+ cid = res;
+ }
+ return cid;
+}
+
+static inline void mm_cid_put(struct mm_struct *mm)
+{
+ int cid;
+
+ lockdep_assert_irqs_disabled();
+ cid = mm_cid_pcpu_unset(mm);
+ if (cid == MM_CID_UNSET)
+ return;
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+}
+
+static inline int __mm_cid_try_get(struct mm_struct *mm)
{
struct cpumask *cpumask;
int cid;
cpumask = mm_cidmask(mm);
- cid = cpumask_first_zero(cpumask);
- if (cid >= nr_cpu_ids)
+ /*
+ * Retry finding first zero bit if the mask is temporarily
+ * filled. This only happens during concurrent remote-clear
+ * which owns a cid without holding a rq lock.
+ */
+ for (;;) {
+ cid = cpumask_first_zero(cpumask);
+ if (cid < nr_cpu_ids)
+ break;
+ cpu_relax();
+ }
+ if (cpumask_test_and_set_cpu(cid, cpumask))
return -1;
- __cpumask_set_cpu(cid, cpumask);
return cid;
}
-static inline void mm_cid_put(struct mm_struct *mm, int cid)
+/*
+ * Save a snapshot of the current runqueue time of this cpu
+ * with the per-cpu cid value, allowing to estimate how recently it was used.
+ */
+static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
+{
+ struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+
+ lockdep_assert_rq_held(rq);
+ WRITE_ONCE(pcpu_cid->time, rq->clock);
+}
+
+static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
{
- lockdep_assert_irqs_disabled();
- if (cid < 0)
- return;
- raw_spin_lock(&mm->cid_lock);
- __cpumask_clear_cpu(cid, mm_cidmask(mm));
- raw_spin_unlock(&mm->cid_lock);
+ int cid;
+
+ /*
+ * All allocations (even those using the cid_lock) are lock-free. If
+ * use_cid_lock is set, hold the cid_lock to perform cid allocation to
+ * guarantee forward progress.
+ */
+ if (!READ_ONCE(use_cid_lock)) {
+ cid = __mm_cid_try_get(mm);
+ if (cid >= 0)
+ goto end;
+ raw_spin_lock(&cid_lock);
+ } else {
+ raw_spin_lock(&cid_lock);
+ cid = __mm_cid_try_get(mm);
+ if (cid >= 0)
+ goto unlock;
+ }
+
+ /*
+ * cid concurrently allocated. Retry while forcing following
+ * allocations to use the cid_lock to ensure forward progress.
+ */
+ WRITE_ONCE(use_cid_lock, 1);
+ /*
+ * Set use_cid_lock before allocation. Only care about program order
+ * because this is only required for forward progress.
+ */
+ barrier();
+ /*
+ * Retry until it succeeds. It is guaranteed to eventually succeed once
+ * all newcoming allocations observe the use_cid_lock flag set.
+ */
+ do {
+ cid = __mm_cid_try_get(mm);
+ cpu_relax();
+ } while (cid < 0);
+ /*
+ * Allocate before clearing use_cid_lock. Only care about
+ * program order because this is for forward progress.
+ */
+ barrier();
+ WRITE_ONCE(use_cid_lock, 0);
+unlock:
+ raw_spin_unlock(&cid_lock);
+end:
+ mm_cid_snapshot_time(rq, mm);
+ return cid;
}
-static inline int mm_cid_get(struct mm_struct *mm)
+static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
{
- int ret;
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ struct cpumask *cpumask;
+ int cid;
- lockdep_assert_irqs_disabled();
- raw_spin_lock(&mm->cid_lock);
- ret = __mm_cid_get(mm);
- raw_spin_unlock(&mm->cid_lock);
- return ret;
+ lockdep_assert_rq_held(rq);
+ cpumask = mm_cidmask(mm);
+ cid = __this_cpu_read(pcpu_cid->cid);
+ if (mm_cid_is_valid(cid)) {
+ mm_cid_snapshot_time(rq, mm);
+ return cid;
+ }
+ if (mm_cid_is_lazy_put(cid)) {
+ if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
+ __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ }
+ cid = __mm_cid_get(rq, mm);
+ __this_cpu_write(pcpu_cid->cid, cid);
+ return cid;
}
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next)
+static inline void switch_mm_cid(struct rq *rq,
+ struct task_struct *prev,
+ struct task_struct *next)
{
+ /*
+ * Provide a memory barrier between rq->curr store and load of
+ * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
+ *
+ * Should be adapted if context_switch() is modified.
+ */
+ if (!next->mm) { // to kernel
+ /*
+ * user -> kernel transition does not guarantee a barrier, but
+ * we can use the fact that it performs an atomic operation in
+ * mmgrab().
+ */
+ if (prev->mm) // from user
+ smp_mb__after_mmgrab();
+ /*
+ * kernel -> kernel transition does not change rq->curr->mm
+ * state. It stays NULL.
+ */
+ } else { // to user
+ /*
+ * kernel -> user transition does not provide a barrier
+ * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
+ * Provide it here.
+ */
+ if (!prev->mm) // from kernel
+ smp_mb();
+ /*
+ * user -> user transition guarantees a memory barrier through
+ * switch_mm() when current->mm changes. If current->mm is
+ * unchanged, no barrier is needed.
+ */
+ }
if (prev->mm_cid_active) {
- if (next->mm_cid_active && next->mm == prev->mm) {
- /*
- * Context switch between threads in same mm, hand over
- * the mm_cid from prev to next.
- */
- next->mm_cid = prev->mm_cid;
- prev->mm_cid = -1;
- return;
- }
- mm_cid_put(prev->mm, prev->mm_cid);
+ mm_cid_snapshot_time(rq, prev->mm);
+ mm_cid_put_lazy(prev);
prev->mm_cid = -1;
}
if (next->mm_cid_active)
- next->mm_cid = mm_cid_get(next->mm);
+ next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
}
#else
-static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { }
+static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
+static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
+static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
+static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
+static inline void init_sched_mm_cid(struct task_struct *t) { }
#endif
#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 2eb23dd0f285..21ac44428bb0 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -6,7 +6,7 @@
extern void sched_ttwu_pending(void *arg);
-extern void send_call_function_single_ipi(int cpu);
+extern bool call_function_single_prep_ipi(int cpu);
#ifdef CONFIG_SMP
extern void flush_smp_call_function_queue(void);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 051aaf65c749..6682535e37c8 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -209,8 +209,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
DEFINE_STATIC_KEY_FALSE(sched_energy_present);
static unsigned int sysctl_sched_energy_aware = 1;
-DEFINE_MUTEX(sched_energy_mutex);
-bool sched_energy_update;
+static DEFINE_MUTEX(sched_energy_mutex);
+static bool sched_energy_update;
void rebuild_sched_domains_energy(void)
{
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index cebf26445f9e..d3e584065c7f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -2368,12 +2368,6 @@ static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
return ret;
}
-static struct ctl_path seccomp_sysctl_path[] = {
- { .procname = "kernel", },
- { .procname = "seccomp", },
- { }
-};
-
static struct ctl_table seccomp_sysctl_table[] = {
{
.procname = "actions_avail",
@@ -2392,14 +2386,7 @@ static struct ctl_table seccomp_sysctl_table[] = {
static int __init seccomp_sysctl_init(void)
{
- struct ctl_table_header *hdr;
-
- hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
- if (!hdr)
- pr_warn("sysctl registration failed\n");
- else
- kmemleak_not_leak(hdr);
-
+ register_sysctl_init("kernel/seccomp", seccomp_sysctl_table);
return 0;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 8cb28f1df294..8f6330f0e9ca 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1003,8 +1003,7 @@ static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
/*
* Now find a thread we can wake up to take the signal off the queue.
*
- * If the main thread wants the signal, it gets first crack.
- * Probably the least surprising to the average bear.
+ * Try the suggested task first (may or may not be the main thread).
*/
if (wants_signal(sig, p))
t = p;
@@ -1970,8 +1969,24 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
ret = -1;
rcu_read_lock();
+
+ /*
+ * This function is used by POSIX timers to deliver a timer signal.
+ * Where type is PIDTYPE_PID (such as for timers with SIGEV_THREAD_ID
+ * set), the signal must be delivered to the specific thread (queues
+ * into t->pending).
+ *
+ * Where type is not PIDTYPE_PID, signals must be delivered to the
+ * process. In this case, prefer to deliver to current if it is in
+ * the same thread group as the target process, which avoids
+ * unnecessarily waking up a potentially idle task.
+ */
t = pid_task(pid, type);
- if (!t || !likely(lock_task_sighand(t, &flags)))
+ if (!t)
+ goto ret;
+ if (type != PIDTYPE_PID && same_thread_group(t, current))
+ t = current;
+ if (!likely(lock_task_sighand(t, &flags)))
goto ret;
ret = 1; /* the signal is ignored */
diff --git a/kernel/smp.c b/kernel/smp.c
index 06a413987a14..ab3e5dad6cfe 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -26,68 +26,15 @@
#include <linux/sched/debug.h>
#include <linux/jump_label.h>
+#include <trace/events/ipi.h>
+
#include "smpboot.h"
#include "sched/smp.h"
#define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)
-#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
-union cfd_seq_cnt {
- u64 val;
- struct {
- u64 src:16;
- u64 dst:16;
-#define CFD_SEQ_NOCPU 0xffff
- u64 type:4;
-#define CFD_SEQ_QUEUE 0
-#define CFD_SEQ_IPI 1
-#define CFD_SEQ_NOIPI 2
-#define CFD_SEQ_PING 3
-#define CFD_SEQ_PINGED 4
-#define CFD_SEQ_HANDLE 5
-#define CFD_SEQ_DEQUEUE 6
-#define CFD_SEQ_IDLE 7
-#define CFD_SEQ_GOTIPI 8
-#define CFD_SEQ_HDLEND 9
- u64 cnt:28;
- } u;
-};
-
-static char *seq_type[] = {
- [CFD_SEQ_QUEUE] = "queue",
- [CFD_SEQ_IPI] = "ipi",
- [CFD_SEQ_NOIPI] = "noipi",
- [CFD_SEQ_PING] = "ping",
- [CFD_SEQ_PINGED] = "pinged",
- [CFD_SEQ_HANDLE] = "handle",
- [CFD_SEQ_DEQUEUE] = "dequeue (src CPU 0 == empty)",
- [CFD_SEQ_IDLE] = "idle",
- [CFD_SEQ_GOTIPI] = "gotipi",
- [CFD_SEQ_HDLEND] = "hdlend (src CPU 0 == early)",
-};
-
-struct cfd_seq_local {
- u64 ping;
- u64 pinged;
- u64 handle;
- u64 dequeue;
- u64 idle;
- u64 gotipi;
- u64 hdlend;
-};
-#endif
-
-struct cfd_percpu {
- call_single_data_t csd;
-#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
- u64 seq_queue;
- u64 seq_ipi;
- u64 seq_noipi;
-#endif
-};
-
struct call_function_data {
- struct cfd_percpu __percpu *pcpu;
+ call_single_data_t __percpu *csd;
cpumask_var_t cpumask;
cpumask_var_t cpumask_ipi;
};
@@ -110,8 +57,8 @@ int smpcfd_prepare_cpu(unsigned int cpu)
free_cpumask_var(cfd->cpumask);
return -ENOMEM;
}
- cfd->pcpu = alloc_percpu(struct cfd_percpu);
- if (!cfd->pcpu) {
+ cfd->csd = alloc_percpu(call_single_data_t);
+ if (!cfd->csd) {
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
return -ENOMEM;
@@ -126,7 +73,7 @@ int smpcfd_dead_cpu(unsigned int cpu)
free_cpumask_var(cfd->cpumask);
free_cpumask_var(cfd->cpumask_ipi);
- free_percpu(cfd->pcpu);
+ free_percpu(cfd->csd);
return 0;
}
@@ -156,23 +103,49 @@ void __init call_function_init(void)
smpcfd_prepare_cpu(smp_processor_id());
}
+static __always_inline void
+send_call_function_single_ipi(int cpu)
+{
+ if (call_function_single_prep_ipi(cpu)) {
+ trace_ipi_send_cpu(cpu, _RET_IP_,
+ generic_smp_call_function_single_interrupt);
+ arch_send_call_function_single_ipi(cpu);
+ }
+}
+
+static __always_inline void
+send_call_function_ipi_mask(struct cpumask *mask)
+{
+ trace_ipi_send_cpumask(mask, _RET_IP_,
+ generic_smp_call_function_single_interrupt);
+ arch_send_call_function_ipi_mask(mask);
+}
+
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
-static DEFINE_STATIC_KEY_FALSE(csdlock_debug_enabled);
-static DEFINE_STATIC_KEY_FALSE(csdlock_debug_extended);
+static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);
+/*
+ * Parse the csdlock_debug= kernel boot parameter.
+ *
+ * If you need to restore the old "ext" value that once provided
+ * additional debugging information, reapply the following commits:
+ *
+ * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
+ * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
+ */
static int __init csdlock_debug(char *str)
{
+ int ret;
unsigned int val = 0;
- if (str && !strcmp(str, "ext")) {
- val = 1;
- static_branch_enable(&csdlock_debug_extended);
- } else
- get_option(&str, &val);
-
- if (val)
- static_branch_enable(&csdlock_debug_enabled);
+ ret = get_option(&str, &val);
+ if (ret) {
+ if (val)
+ static_branch_enable(&csdlock_debug_enabled);
+ else
+ static_branch_disable(&csdlock_debug_enabled);
+ }
return 1;
}
@@ -181,36 +154,11 @@ __setup("csdlock_debug=", csdlock_debug);
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);
-static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local);
static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */
module_param(csd_lock_timeout, ulong, 0444);
static atomic_t csd_bug_count = ATOMIC_INIT(0);
-static u64 cfd_seq;
-
-#define CFD_SEQ(s, d, t, c) \
- (union cfd_seq_cnt){ .u.src = s, .u.dst = d, .u.type = t, .u.cnt = c }
-
-static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type)
-{
- union cfd_seq_cnt new, old;
-
- new = CFD_SEQ(src, dst, type, 0);
-
- do {
- old.val = READ_ONCE(cfd_seq);
- new.u.cnt = old.u.cnt + 1;
- } while (cmpxchg(&cfd_seq, old.val, new.val) != old.val);
-
- return old.val;
-}
-
-#define cfd_seq_store(var, src, dst, type) \
- do { \
- if (static_branch_unlikely(&csdlock_debug_extended)) \
- var = cfd_seq_inc(src, dst, type); \
- } while (0)
/* Record current CSD work for current CPU, NULL to erase. */
static void __csd_lock_record(struct __call_single_data *csd)
@@ -244,80 +192,6 @@ static int csd_lock_wait_getcpu(struct __call_single_data *csd)
return -1;
}
-static void cfd_seq_data_add(u64 val, unsigned int src, unsigned int dst,
- unsigned int type, union cfd_seq_cnt *data,
- unsigned int *n_data, unsigned int now)
-{
- union cfd_seq_cnt new[2];
- unsigned int i, j, k;
-
- new[0].val = val;
- new[1] = CFD_SEQ(src, dst, type, new[0].u.cnt + 1);
-
- for (i = 0; i < 2; i++) {
- if (new[i].u.cnt <= now)
- new[i].u.cnt |= 0x80000000U;
- for (j = 0; j < *n_data; j++) {
- if (new[i].u.cnt == data[j].u.cnt) {
- /* Direct read value trumps generated one. */
- if (i == 0)
- data[j].val = new[i].val;
- break;
- }
- if (new[i].u.cnt < data[j].u.cnt) {
- for (k = *n_data; k > j; k--)
- data[k].val = data[k - 1].val;
- data[j].val = new[i].val;
- (*n_data)++;
- break;
- }
- }
- if (j == *n_data) {
- data[j].val = new[i].val;
- (*n_data)++;
- }
- }
-}
-
-static const char *csd_lock_get_type(unsigned int type)
-{
- return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type];
-}
-
-static void csd_lock_print_extended(struct __call_single_data *csd, int cpu)
-{
- struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu);
- unsigned int srccpu = csd->node.src;
- struct call_function_data *cfd = per_cpu_ptr(&cfd_data, srccpu);
- struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
- unsigned int now;
- union cfd_seq_cnt data[2 * ARRAY_SIZE(seq_type)];
- unsigned int n_data = 0, i;
-
- data[0].val = READ_ONCE(cfd_seq);
- now = data[0].u.cnt;
-
- cfd_seq_data_add(pcpu->seq_queue, srccpu, cpu, CFD_SEQ_QUEUE, data, &n_data, now);
- cfd_seq_data_add(pcpu->seq_ipi, srccpu, cpu, CFD_SEQ_IPI, data, &n_data, now);
- cfd_seq_data_add(pcpu->seq_noipi, srccpu, cpu, CFD_SEQ_NOIPI, data, &n_data, now);
-
- cfd_seq_data_add(per_cpu(cfd_seq_local.ping, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PING, data, &n_data, now);
- cfd_seq_data_add(per_cpu(cfd_seq_local.pinged, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED, data, &n_data, now);
-
- cfd_seq_data_add(seq->idle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_IDLE, data, &n_data, now);
- cfd_seq_data_add(seq->gotipi, CFD_SEQ_NOCPU, cpu, CFD_SEQ_GOTIPI, data, &n_data, now);
- cfd_seq_data_add(seq->handle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HANDLE, data, &n_data, now);
- cfd_seq_data_add(seq->dequeue, CFD_SEQ_NOCPU, cpu, CFD_SEQ_DEQUEUE, data, &n_data, now);
- cfd_seq_data_add(seq->hdlend, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HDLEND, data, &n_data, now);
-
- for (i = 0; i < n_data; i++) {
- pr_alert("\tcsd: cnt(%07x): %04x->%04x %s\n",
- data[i].u.cnt & ~0x80000000U, data[i].u.src,
- data[i].u.dst, csd_lock_get_type(data[i].u.type));
- }
- pr_alert("\tcsd: cnt now: %07x\n", now);
-}
-
/*
* Complain if too much time spent waiting. Note that only
* the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
@@ -368,8 +242,6 @@ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *
*bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
}
if (cpu >= 0) {
- if (static_branch_unlikely(&csdlock_debug_extended))
- csd_lock_print_extended(csd, cpu);
dump_cpu_task(cpu);
if (!cpu_cur_csd) {
pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
@@ -412,27 +284,7 @@ static __always_inline void csd_lock_wait(struct __call_single_data *csd)
smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
-
-static void __smp_call_single_queue_debug(int cpu, struct llist_node *node)
-{
- unsigned int this_cpu = smp_processor_id();
- struct cfd_seq_local *seq = this_cpu_ptr(&cfd_seq_local);
- struct call_function_data *cfd = this_cpu_ptr(&cfd_data);
- struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
-
- cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
- if (llist_add(node, &per_cpu(call_single_queue, cpu))) {
- cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
- cfd_seq_store(seq->ping, this_cpu, cpu, CFD_SEQ_PING);
- send_call_function_single_ipi(cpu);
- cfd_seq_store(seq->pinged, this_cpu, cpu, CFD_SEQ_PINGED);
- } else {
- cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
- }
-}
#else
-#define cfd_seq_store(var, src, dst, type)
-
static void csd_lock_record(struct __call_single_data *csd)
{
}
@@ -470,23 +322,29 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
-#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
- if (static_branch_unlikely(&csdlock_debug_extended)) {
- unsigned int type;
-
- type = CSD_TYPE(container_of(node, call_single_data_t,
- node.llist));
- if (type == CSD_TYPE_SYNC || type == CSD_TYPE_ASYNC) {
- __smp_call_single_queue_debug(cpu, node);
- return;
- }
+ /*
+ * We have to check the type of the CSD before queueing it, because
+ * once queued it can have its flags cleared by
+ * flush_smp_call_function_queue()
+ * even if we haven't sent the smp_call IPI yet (e.g. the stopper
+ * executes migration_cpu_stop() on the remote CPU).
+ */
+ if (trace_ipi_send_cpu_enabled()) {
+ call_single_data_t *csd;
+ smp_call_func_t func;
+
+ csd = container_of(node, call_single_data_t, node.llist);
+ func = CSD_TYPE(csd) == CSD_TYPE_TTWU ?
+ sched_ttwu_pending : csd->func;
+
+ trace_ipi_send_cpu(cpu, _RET_IP_, func);
}
-#endif
/*
- * The list addition should be visible before sending the IPI
- * handler locks the list to pull the entry off it because of
- * normal cache coherency rules implied by spinlocks.
+ * The list addition should be visible to the target CPU when it pops
+ * the head of the list to pull the entry off it in the IPI handler
+ * because of normal cache coherency rules implied by the underlying
+ * llist ops.
*
* If IPIs can go out of order to the cache coherency protocol
* in an architecture, sufficient synchronisation should be added
@@ -541,8 +399,6 @@ static int generic_exec_single(int cpu, struct __call_single_data *csd)
*/
void generic_smp_call_function_single_interrupt(void)
{
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU,
- smp_processor_id(), CFD_SEQ_GOTIPI);
__flush_smp_call_function_queue(true);
}
@@ -570,13 +426,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
lockdep_assert_irqs_disabled();
head = this_cpu_ptr(&call_single_queue);
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->handle, CFD_SEQ_NOCPU,
- smp_processor_id(), CFD_SEQ_HANDLE);
entry = llist_del_all(head);
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->dequeue,
- /* Special meaning of source cpu: 0 == queue empty */
- entry ? CFD_SEQ_NOCPU : 0,
- smp_processor_id(), CFD_SEQ_DEQUEUE);
entry = llist_reverse_order(entry);
/* There shouldn't be any pending callbacks on an offline CPU. */
@@ -635,12 +485,8 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
}
}
- if (!entry) {
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend,
- 0, smp_processor_id(),
- CFD_SEQ_HDLEND);
+ if (!entry)
return;
- }
/*
* Second; run all !SYNC callbacks.
@@ -678,9 +524,6 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline)
*/
if (entry)
sched_ttwu_pending(entry);
-
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, CFD_SEQ_NOCPU,
- smp_processor_id(), CFD_SEQ_HDLEND);
}
@@ -704,8 +547,6 @@ void flush_smp_call_function_queue(void)
if (llist_empty(this_cpu_ptr(&call_single_queue)))
return;
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
- smp_processor_id(), CFD_SEQ_IDLE);
local_irq_save(flags);
/* Get the already pending soft interrupts for RT enabled kernels */
was_pending = local_softirq_pending();
@@ -887,9 +728,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
int cpu, last_cpu, this_cpu = smp_processor_id();
struct call_function_data *cfd;
bool wait = scf_flags & SCF_WAIT;
+ int nr_cpus = 0, nr_queued = 0;
bool run_remote = false;
bool run_local = false;
- int nr_cpus = 0;
lockdep_assert_preemption_disabled();
@@ -929,11 +770,12 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
cpumask_clear(cfd->cpumask_ipi);
for_each_cpu(cpu, cfd->cpumask) {
- struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu);
- call_single_data_t *csd = &pcpu->csd;
+ call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);
- if (cond_func && !cond_func(cpu, info))
+ if (cond_func && !cond_func(cpu, info)) {
+ __cpumask_clear_cpu(cpu, cfd->cpumask);
continue;
+ }
csd_lock(csd);
if (wait)
@@ -944,19 +786,20 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
csd->node.src = smp_processor_id();
csd->node.dst = cpu;
#endif
- cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
nr_cpus++;
last_cpu = cpu;
-
- cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
- } else {
- cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
}
+ nr_queued++;
}
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PING);
+ /*
+ * Trace each smp_function_call_*() as an IPI, actual IPIs
+ * will be traced with func==generic_smp_call_function_single_ipi().
+ */
+ if (nr_queued)
+ trace_ipi_send_cpumask(cfd->cpumask, _RET_IP_, func);
/*
* Choose the most efficient way to send an IPI. Note that the
@@ -966,9 +809,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
if (nr_cpus == 1)
send_call_function_single_ipi(last_cpu);
else if (likely(nr_cpus > 1))
- arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
-
- cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED);
+ send_call_function_ipi_mask(cfd->cpumask_ipi);
}
if (run_local && (!cond_func || cond_func(this_cpu, info))) {
@@ -983,7 +824,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
for_each_cpu(cpu, cfd->cpumask) {
call_single_data_t *csd;
- csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd;
+ csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock_wait(csd);
}
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c8a6913c067d..1b725510dd0f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -793,10 +793,15 @@ static void tasklet_action_common(struct softirq_action *a,
if (tasklet_trylock(t)) {
if (!atomic_read(&t->count)) {
if (tasklet_clear_sched(t)) {
- if (t->use_callback)
+ if (t->use_callback) {
+ trace_tasklet_entry(t, t->callback);
t->callback(t);
- else
+ trace_tasklet_exit(t, t->callback);
+ } else {
+ trace_tasklet_entry(t, t->func);
t->func(t->data);
+ trace_tasklet_exit(t, t->func);
+ }
}
tasklet_unlock(t);
continue;
diff --git a/kernel/sys.c b/kernel/sys.c
index 495cd87d9bf4..72cdb16e2636 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -15,6 +15,7 @@
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/kmod.h>
+#include <linux/ksm.h>
#include <linux/perf_event.h>
#include <linux/resource.h>
#include <linux/kernel.h>
@@ -664,6 +665,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
struct cred *new;
int retval;
kuid_t kruid, keuid, ksuid;
+ bool ruid_new, euid_new, suid_new;
kruid = make_kuid(ns, ruid);
keuid = make_kuid(ns, euid);
@@ -678,25 +680,29 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if ((suid != (uid_t) -1) && !uid_valid(ksuid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((ruid == (uid_t) -1 || uid_eq(kruid, old->uid)) &&
+ (euid == (uid_t) -1 || (uid_eq(keuid, old->euid) &&
+ uid_eq(keuid, old->fsuid))) &&
+ (suid == (uid_t) -1 || uid_eq(ksuid, old->suid)))
+ return 0;
+
+ ruid_new = ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
+ !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid);
+ euid_new = euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
+ !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid);
+ suid_new = suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
+ !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid);
+ if ((ruid_new || euid_new || suid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETUID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable_setid(old->user_ns, CAP_SETUID)) {
- if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) &&
- !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid))
- goto error;
- if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) &&
- !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid))
- goto error;
- if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) &&
- !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid))
- goto error;
- }
-
if (ruid != (uid_t) -1) {
new->uid = kruid;
if (!uid_eq(kruid, old->uid)) {
@@ -761,6 +767,7 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
struct cred *new;
int retval;
kgid_t krgid, kegid, ksgid;
+ bool rgid_new, egid_new, sgid_new;
krgid = make_kgid(ns, rgid);
kegid = make_kgid(ns, egid);
@@ -773,23 +780,28 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
return -EINVAL;
+ old = current_cred();
+
+ /* check for no-op */
+ if ((rgid == (gid_t) -1 || gid_eq(krgid, old->gid)) &&
+ (egid == (gid_t) -1 || (gid_eq(kegid, old->egid) &&
+ gid_eq(kegid, old->fsgid))) &&
+ (sgid == (gid_t) -1 || gid_eq(ksgid, old->sgid)))
+ return 0;
+
+ rgid_new = rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
+ !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid);
+ egid_new = egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
+ !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid);
+ sgid_new = sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
+ !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid);
+ if ((rgid_new || egid_new || sgid_new) &&
+ !ns_capable_setid(old->user_ns, CAP_SETGID))
+ return -EPERM;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
- old = current_cred();
-
- retval = -EPERM;
- if (!ns_capable_setid(old->user_ns, CAP_SETGID)) {
- if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) &&
- !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid))
- goto error;
- if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) &&
- !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid))
- goto error;
- if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) &&
- !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid))
- goto error;
- }
if (rgid != (gid_t) -1)
new->gid = krgid;
@@ -2377,6 +2389,16 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3,
PR_MDWE_REFUSE_EXEC_GAIN : 0;
}
+static int prctl_get_auxv(void __user *addr, unsigned long len)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long size = min_t(unsigned long, sizeof(mm->saved_auxv), len);
+
+ if (size && copy_to_user(addr, mm->saved_auxv, size))
+ return -EFAULT;
+ return sizeof(mm->saved_auxv);
+}
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2507,6 +2529,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
else
return -EINVAL;
break;
+ case PR_GET_AUXV:
+ if (arg4 || arg5)
+ return -EINVAL;
+ error = prctl_get_auxv((void __user *)arg2, arg3);
+ break;
default:
return -EINVAL;
}
@@ -2661,6 +2688,32 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
+#ifdef CONFIG_KSM
+ case PR_SET_MEMORY_MERGE:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ if (mmap_write_lock_killable(me->mm))
+ return -EINTR;
+
+ if (arg2) {
+ error = ksm_enable_merge_any(me->mm);
+ } else {
+ /*
+ * TODO: we might want disable KSM on all VMAs and
+ * trigger unsharing to completely disable KSM.
+ */
+ clear_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+ error = 0;
+ }
+ mmap_write_unlock(me->mm);
+ break;
+ case PR_GET_MEMORY_MERGE:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+ break;
+#endif
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1c240d2c99bc..bfe53e835524 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -42,7 +42,6 @@
#include <linux/highuid.h>
#include <linux/writeback.h>
#include <linux/ratelimit.h>
-#include <linux/compaction.h>
#include <linux/hugetlb.h>
#include <linux/initrd.h>
#include <linux/key.h>
@@ -746,27 +745,6 @@ int proc_dointvec(struct ctl_table *table, int write, void *buffer,
return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
}
-#ifdef CONFIG_COMPACTION
-static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table,
- int write, void *buffer, size_t *lenp, loff_t *ppos)
-{
- int ret, old;
-
- if (!IS_ENABLED(CONFIG_PREEMPT_RT) || !write)
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- old = *(int *)table->data;
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (ret)
- return ret;
- if (old != *(int *)table->data)
- pr_warn_once("sysctl attribute %s changed by %s[%d]\n",
- table->procname, current->comm,
- task_pid_nr(current));
- return ret;
-}
-#endif
-
/**
* proc_douintvec - read a vector of unsigned integers
* @table: the sysctl table
@@ -2141,38 +2119,6 @@ static struct ctl_table vm_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
-#ifdef CONFIG_HUGETLB_PAGE
- {
- .procname = "nr_hugepages",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = hugetlb_sysctl_handler,
- },
-#ifdef CONFIG_NUMA
- {
- .procname = "nr_hugepages_mempolicy",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = &hugetlb_mempolicy_sysctl_handler,
- },
-#endif
- {
- .procname = "hugetlb_shm_group",
- .data = &sysctl_hugetlb_shm_group,
- .maxlen = sizeof(gid_t),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "nr_overcommit_hugepages",
- .data = NULL,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = hugetlb_overcommit_handler,
- },
-#endif
{
.procname = "lowmem_reserve_ratio",
.data = &sysctl_lowmem_reserve_ratio,
@@ -2189,43 +2135,6 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ONE,
.extra2 = SYSCTL_FOUR,
},
-#ifdef CONFIG_COMPACTION
- {
- .procname = "compact_memory",
- .data = NULL,
- .maxlen = sizeof(int),
- .mode = 0200,
- .proc_handler = sysctl_compaction_handler,
- },
- {
- .procname = "compaction_proactiveness",
- .data = &sysctl_compaction_proactiveness,
- .maxlen = sizeof(sysctl_compaction_proactiveness),
- .mode = 0644,
- .proc_handler = compaction_proactiveness_sysctl_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- {
- .procname = "extfrag_threshold",
- .data = &sysctl_extfrag_threshold,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_THOUSAND,
- },
- {
- .procname = "compact_unevictable_allowed",
- .data = &sysctl_compact_unevictable_allowed,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax_warn_RT_change,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-
-#endif /* CONFIG_COMPACTION */
{
.procname = "min_free_kbytes",
.data = &min_free_kbytes,
@@ -2383,26 +2292,6 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ZERO,
},
#endif
-#ifdef CONFIG_MEMORY_FAILURE
- {
- .procname = "memory_failure_early_kill",
- .data = &sysctl_memory_failure_early_kill,
- .maxlen = sizeof(sysctl_memory_failure_early_kill),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
- {
- .procname = "memory_failure_recovery",
- .data = &sysctl_memory_failure_recovery,
- .maxlen = sizeof(sysctl_memory_failure_recovery),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
{
.procname = "user_reserve_kbytes",
.data = &sysctl_user_reserve_kbytes,
@@ -2439,17 +2328,6 @@ static struct ctl_table vm_table[] = {
.extra2 = (void *)&mmap_rnd_compat_bits_max,
},
#endif
-#ifdef CONFIG_USERFAULTFD
- {
- .procname = "unprivileged_userfaultfd",
- .data = &sysctl_unprivileged_userfaultfd,
- .maxlen = sizeof(sysctl_unprivileged_userfaultfd),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
{ }
};
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 7e5dff602585..82b28ab0f328 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -81,8 +81,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
}
EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
-static int alarmtimer_rtc_add_device(struct device *dev,
- struct class_interface *class_intf)
+static int alarmtimer_rtc_add_device(struct device *dev)
{
unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 2f5e9b34022c..e9c6f9d0e42c 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -846,6 +846,8 @@ static u64 collect_timerqueue(struct timerqueue_head *head,
return expires;
ctmr->firing = 1;
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(ctmr->handling, current);
cpu_timer_dequeue(ctmr);
list_add_tail(&ctmr->elist, firing);
}
@@ -1161,7 +1163,49 @@ static void handle_posix_cpu_timers(struct task_struct *tsk);
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
static void posix_cpu_timers_work(struct callback_head *work)
{
+ struct posix_cputimers_work *cw = container_of(work, typeof(*cw), work);
+
+ mutex_lock(&cw->mutex);
handle_posix_cpu_timers(current);
+ mutex_unlock(&cw->mutex);
+}
+
+/*
+ * Invoked from the posix-timer core when a cancel operation failed because
+ * the timer is marked firing. The caller holds rcu_read_lock(), which
+ * protects the timer and the task which is expiring it from being freed.
+ */
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ struct task_struct *tsk = rcu_dereference(timr->it.cpu.handling);
+
+ /* Has the handling task completed expiry already? */
+ if (!tsk)
+ return;
+
+ /* Ensure that the task cannot go away */
+ get_task_struct(tsk);
+ /* Now drop the RCU protection so the mutex can be locked */
+ rcu_read_unlock();
+ /* Wait on the expiry mutex */
+ mutex_lock(&tsk->posix_cputimers_work.mutex);
+ /* Release it immediately again. */
+ mutex_unlock(&tsk->posix_cputimers_work.mutex);
+ /* Drop the task reference. */
+ put_task_struct(tsk);
+ /* Relock RCU so the callsite is balanced */
+ rcu_read_lock();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ /* Ensure that timr->it.cpu.handling task cannot go away */
+ rcu_read_lock();
+ spin_unlock_irq(&timr->it_lock);
+ posix_cpu_timer_wait_running(timr);
+ rcu_read_unlock();
+ /* @timr is on stack and is valid */
+ spin_lock_irq(&timr->it_lock);
}
/*
@@ -1177,6 +1221,7 @@ void clear_posix_cputimers_work(struct task_struct *p)
sizeof(p->posix_cputimers_work.work));
init_task_work(&p->posix_cputimers_work.work,
posix_cpu_timers_work);
+ mutex_init(&p->posix_cputimers_work.mutex);
p->posix_cputimers_work.scheduled = false;
}
@@ -1255,6 +1300,18 @@ static inline void __run_posix_cpu_timers(struct task_struct *tsk)
lockdep_posixtimer_exit();
}
+static void posix_cpu_timer_wait_running(struct k_itimer *timr)
+{
+ cpu_relax();
+}
+
+static void posix_cpu_timer_wait_running_nsleep(struct k_itimer *timr)
+{
+ spin_unlock_irq(&timr->it_lock);
+ cpu_relax();
+ spin_lock_irq(&timr->it_lock);
+}
+
static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
{
return false;
@@ -1363,6 +1420,8 @@ static void handle_posix_cpu_timers(struct task_struct *tsk)
*/
if (likely(cpu_firing >= 0))
cpu_timer_fire(timer);
+ /* See posix_cpu_timer_wait_running() */
+ rcu_assign_pointer(timer->it.cpu.handling, NULL);
spin_unlock(&timer->it_lock);
}
}
@@ -1497,23 +1556,16 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
expires = cpu_timer_getexpires(&timer.it.cpu);
error = posix_cpu_timer_set(&timer, 0, &zero_it, &it);
if (!error) {
- /*
- * Timer is now unarmed, deletion can not fail.
- */
+ /* Timer is now unarmed, deletion can not fail. */
posix_cpu_timer_del(&timer);
+ } else {
+ while (error == TIMER_RETRY) {
+ posix_cpu_timer_wait_running_nsleep(&timer);
+ error = posix_cpu_timer_del(&timer);
+ }
}
- spin_unlock_irq(&timer.it_lock);
- while (error == TIMER_RETRY) {
- /*
- * We need to handle case when timer was or is in the
- * middle of firing. In other cases we already freed
- * resources.
- */
- spin_lock_irq(&timer.it_lock);
- error = posix_cpu_timer_del(&timer);
- spin_unlock_irq(&timer.it_lock);
- }
+ spin_unlock_irq(&timer.it_lock);
if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
/*
@@ -1623,6 +1675,7 @@ const struct k_clock clock_posix_cpu = {
.timer_del = posix_cpu_timer_del,
.timer_get = posix_cpu_timer_get,
.timer_rearm = posix_cpu_timer_rearm,
+ .timer_wait_running = posix_cpu_timer_wait_running,
};
const struct k_clock clock_process = {
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 0c8a87a11b39..808a247205a9 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -846,6 +846,10 @@ static struct k_itimer *timer_wait_running(struct k_itimer *timer,
rcu_read_lock();
unlock_timer(timer, *flags);
+ /*
+ * kc->timer_wait_running() might drop RCU lock. So @timer
+ * cannot be touched anymore after the function returns!
+ */
if (!WARN_ON_ONCE(!kc->timer_wait_running))
kc->timer_wait_running(timer);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 46789356f856..65b8658da829 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -218,9 +218,19 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
+ ktime_t next_p;
+ u32 rem;
+
tick_do_timer_cpu = cpu;
- tick_next_period = ktime_get();
+ next_p = ktime_get();
+ div_u64_rem(next_p, TICK_NSEC, &rem);
+ if (rem) {
+ next_p -= rem;
+ next_p += TICK_NSEC;
+ }
+
+ tick_next_period = next_p;
#ifdef CONFIG_NO_HZ_FULL
/*
* The boot CPU may be nohz_full, in which case set
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b0e3c9205946..52254679ec48 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -281,6 +281,11 @@ static bool check_tick_dependency(atomic_t *dep)
return true;
}
+ if (val & TICK_DEP_MASK_RCU_EXP) {
+ trace_tick_stop(0, TICK_DEP_MASK_RCU_EXP);
+ return true;
+ }
+
return false;
}
@@ -527,7 +532,7 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
tick_nohz_full_running = true;
}
-static int tick_nohz_cpu_down(unsigned int cpu)
+bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
/*
* The tick_do_timer_cpu CPU handles housekeeping duty (unbound
@@ -535,8 +540,13 @@ static int tick_nohz_cpu_down(unsigned int cpu)
* CPUs. It must remain online when nohz full is enabled.
*/
if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
- return -EBUSY;
- return 0;
+ return false;
+ return true;
+}
+
+static int tick_nohz_cpu_down(unsigned int cpu)
+{
+ return tick_nohz_cpu_hotpluggable(cpu) ? 0 : -EBUSY;
}
void __init tick_nohz_init(void)
@@ -637,43 +647,67 @@ static void tick_nohz_update_jiffies(ktime_t now)
touch_softlockup_watchdog_sched();
}
-/*
- * Updates the per-CPU time idle statistics counters
- */
-static void
-update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
+static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
ktime_t delta;
- if (ts->idle_active) {
- delta = ktime_sub(now, ts->idle_entrytime);
- if (nr_iowait_cpu(cpu) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
- else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
- ts->idle_entrytime = now;
- }
+ if (WARN_ON_ONCE(!ts->idle_active))
+ return;
- if (last_update_time)
- *last_update_time = ktime_to_us(now);
+ delta = ktime_sub(now, ts->idle_entrytime);
-}
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
+ if (nr_iowait_cpu(smp_processor_id()) > 0)
+ ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ else
+ ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
-static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
-{
- update_ts_time_stats(smp_processor_id(), ts, now, NULL);
+ ts->idle_entrytime = now;
ts->idle_active = 0;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
sched_clock_idle_wakeup_event();
}
static void tick_nohz_start_idle(struct tick_sched *ts)
{
+ write_seqcount_begin(&ts->idle_sleeptime_seq);
ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
+ write_seqcount_end(&ts->idle_sleeptime_seq);
+
sched_clock_idle_sleep_event();
}
+static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
+ bool compute_delta, u64 *last_update_time)
+{
+ ktime_t now, idle;
+ unsigned int seq;
+
+ if (!tick_nohz_active)
+ return -1;
+
+ now = ktime_get();
+ if (last_update_time)
+ *last_update_time = ktime_to_us(now);
+
+ do {
+ seq = read_seqcount_begin(&ts->idle_sleeptime_seq);
+
+ if (ts->idle_active && compute_delta) {
+ ktime_t delta = ktime_sub(now, ts->idle_entrytime);
+
+ idle = ktime_add(*sleeptime, delta);
+ } else {
+ idle = *sleeptime;
+ }
+ } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
+
+ return ktime_to_us(idle);
+
+}
+
/**
* get_cpu_idle_time_us - get the total idle time of a CPU
* @cpu: CPU number to query
@@ -681,7 +715,10 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
* counters if NULL.
*
* Return the cumulative idle time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -691,27 +728,9 @@ static void tick_nohz_start_idle(struct tick_sched *ts)
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, idle;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- idle = ts->idle_sleeptime;
- } else {
- if (ts->idle_active && !nr_iowait_cpu(cpu)) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
-
- idle = ktime_add(ts->idle_sleeptime, delta);
- } else {
- idle = ts->idle_sleeptime;
- }
- }
-
- return ktime_to_us(idle);
+ return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
+ !nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
@@ -722,7 +741,10 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
* counters if NULL.
*
* Return the cumulative iowait time (since boot) for a given
- * CPU, in microseconds.
+ * CPU, in microseconds. Note this is partially broken due to
+ * the counter of iowait tasks that can be remotely updated without
+ * any synchronization. Therefore it is possible to observe backward
+ * values within two consecutive reads.
*
* This time is measured via accounting rather than sampling,
* and is as accurate as ktime_get() is.
@@ -732,26 +754,9 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t now, iowait;
-
- if (!tick_nohz_active)
- return -1;
-
- now = ktime_get();
- if (last_update_time) {
- update_ts_time_stats(cpu, ts, now, last_update_time);
- iowait = ts->iowait_sleeptime;
- } else {
- if (ts->idle_active && nr_iowait_cpu(cpu) > 0) {
- ktime_t delta = ktime_sub(now, ts->idle_entrytime);
- iowait = ktime_add(ts->iowait_sleeptime, delta);
- } else {
- iowait = ts->iowait_sleeptime;
- }
- }
-
- return ktime_to_us(iowait);
+ return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
+ nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
@@ -1084,10 +1089,16 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
return true;
}
-static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
+/**
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ */
+void tick_nohz_idle_stop_tick(void)
{
- ktime_t expires;
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
int cpu = smp_processor_id();
+ ktime_t expires;
/*
* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
@@ -1119,16 +1130,6 @@ static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
}
}
-/**
- * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- */
-void tick_nohz_idle_stop_tick(void)
-{
- __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
-}
-
void tick_nohz_idle_retain_tick(void)
{
tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 504649513399..5ed5a9d41d5a 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -22,65 +22,82 @@ enum tick_nohz_mode {
/**
* struct tick_sched - sched tick emulation and no idle tick control/stats
- * @sched_timer: hrtimer to schedule the periodic tick in high
- * resolution mode
- * @check_clocks: Notification mechanism about clocksource changes
- * @nohz_mode: Mode - one state of tick_nohz_mode
+ *
* @inidle: Indicator that the CPU is in the tick idle mode
* @tick_stopped: Indicator that the idle tick has been stopped
* @idle_active: Indicator that the CPU is actively in the tick idle mode;
* it is reset during irq handling phases.
- * @do_timer_lst: CPU was the last one doing do_timer before going idle
+ * @do_timer_last: CPU was the last one doing do_timer before going idle
* @got_idle_tick: Tick timer function has run with @inidle set
+ * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @last_tick_jiffies: Value of jiffies seen on last tick
+ * @sched_timer: hrtimer to schedule the periodic tick in high
+ * resolution mode
* @last_tick: Store the last tick expiry time when the tick
* timer is modified for nohz sleeps. This is necessary
* to resume the tick timer operation in the timeline
* when the CPU returns from nohz sleep.
* @next_tick: Next tick to be fired when in dynticks mode.
* @idle_jiffies: jiffies at the entry to idle for idle time accounting
+ * @idle_waketime: Time when the idle was interrupted
+ * @idle_entrytime: Time when the idle call was entered
+ * @nohz_mode: Mode - one state of tick_nohz_mode
+ * @last_jiffies: Base jiffies snapshot when next event was last computed
+ * @timer_expires_base: Base time clock monotonic for @timer_expires
+ * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
+ * @next_timer: Expiry time of next expiring timer for debugging purpose only
+ * @idle_expires: Next tick in idle, for debugging purpose only
* @idle_calls: Total number of idle calls
* @idle_sleeps: Number of idle calls, where the sched tick was stopped
- * @idle_entrytime: Time when the idle call was entered
- * @idle_waketime: Time when the idle was interrupted
* @idle_exittime: Time when the idle state was left
* @idle_sleeptime: Sum of the time slept in idle with sched tick stopped
* @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding
- * @timer_expires: Anticipated timer expiration time (in case sched tick is stopped)
- * @timer_expires_base: Base time clock monotonic for @timer_expires
- * @next_timer: Expiry time of next expiring timer for debugging purpose only
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
- * @last_tick_jiffies: Value of jiffies seen on last tick
- * @stalled_jiffies: Number of stalled jiffies detected across ticks
+ * @check_clocks: Notification mechanism about clocksource changes
*/
struct tick_sched {
- struct hrtimer sched_timer;
- unsigned long check_clocks;
- enum tick_nohz_mode nohz_mode;
-
+ /* Common flags */
unsigned int inidle : 1;
unsigned int tick_stopped : 1;
unsigned int idle_active : 1;
unsigned int do_timer_last : 1;
unsigned int got_idle_tick : 1;
+ /* Tick handling: jiffies stall check */
+ unsigned int stalled_jiffies;
+ unsigned long last_tick_jiffies;
+
+ /* Tick handling */
+ struct hrtimer sched_timer;
ktime_t last_tick;
ktime_t next_tick;
unsigned long idle_jiffies;
- unsigned long idle_calls;
- unsigned long idle_sleeps;
- ktime_t idle_entrytime;
ktime_t idle_waketime;
- ktime_t idle_exittime;
- ktime_t idle_sleeptime;
- ktime_t iowait_sleeptime;
+
+ /* Idle entry */
+ seqcount_t idle_sleeptime_seq;
+ ktime_t idle_entrytime;
+
+ /* Tick stop */
+ enum tick_nohz_mode nohz_mode;
unsigned long last_jiffies;
- u64 timer_expires;
u64 timer_expires_base;
+ u64 timer_expires;
u64 next_timer;
ktime_t idle_expires;
+ unsigned long idle_calls;
+ unsigned long idle_sleeps;
+
+ /* Idle exit */
+ ktime_t idle_exittime;
+ ktime_t idle_sleeptime;
+ ktime_t iowait_sleeptime;
+
+ /* Full dynticks handling */
atomic_t tick_dep_mask;
- unsigned long last_tick_jiffies;
- unsigned int stalled_jiffies;
+
+ /* Clocksource changes */
+ unsigned long check_clocks;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5579ead449f2..09d594900ee0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -526,7 +526,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
* partially updated. Since the tk->offs_boot update is a rare event, this
* should be a rare occurrence which postprocessing should be able to handle.
*
- * The caveats vs. timestamp ordering as documented for ktime_get_fast_ns()
+ * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
* apply as well.
*/
u64 notrace ktime_get_boot_fast_ns(void)
@@ -576,7 +576,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
/**
* ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
*
- * See ktime_get_fast_ns() for documentation of the time stamp ordering.
+ * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
*/
u64 ktime_get_real_fast_ns(void)
{
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a856d4a34c67..8cf97fa4a4b3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -257,7 +257,7 @@ config DYNAMIC_FTRACE_WITH_REGS
config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
def_bool y
- depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
config DYNAMIC_FTRACE_WITH_CALL_OPS
@@ -792,15 +792,15 @@ config USER_EVENTS
bool "User trace events"
select TRACING
select DYNAMIC_EVENTS
- depends on BROKEN || COMPILE_TEST # API needs to be straighten out
help
User trace events are user-defined trace events that
can be used like an existing kernel trace event. User trace
events are generated by writing to a tracefs file. User
processes can determine if their tracing events should be
- generated by memory mapping a tracefs file and checking for
- an associated byte being non-zero.
+ generated by registering a value and bit with the kernel
+ that reflects when it is enabled or not.
+ See Documentation/trace/user_events.rst.
If in doubt, say N.
config HIST_TRIGGERS
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e8da032bb6fc..9a050e36dc6c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1453,10 +1453,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
NULL : &bpf_probe_read_compat_str_proto;
#endif
#ifdef CONFIG_CGROUPS
- case BPF_FUNC_get_current_cgroup_id:
- return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_get_current_ancestor_cgroup_id:
- return &bpf_get_current_ancestor_cgroup_id_proto;
case BPF_FUNC_cgrp_storage_get:
return &bpf_cgrp_storage_get_proto;
case BPF_FUNC_cgrp_storage_delete:
@@ -2644,9 +2640,20 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
return err;
}
-static void
+static int
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
- struct pt_regs *regs)
+ struct pt_regs *regs, void *data)
+{
+ struct bpf_kprobe_multi_link *link;
+
+ link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+ kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs);
+ return 0;
+}
+
+static void
+kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
+ struct pt_regs *regs, void *data)
{
struct bpf_kprobe_multi_link *link;
@@ -2848,7 +2855,7 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
goto error;
if (flags & BPF_F_KPROBE_MULTI_RETURN)
- link->fp.exit_handler = kprobe_multi_link_handler;
+ link->fp.exit_handler = kprobe_multi_link_exit_handler;
else
link->fp.entry_handler = kprobe_multi_link_handler;
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index e8143e368074..9abb3905bc8e 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -17,15 +17,17 @@
struct fprobe_rethook_node {
struct rethook_node node;
unsigned long entry_ip;
+ char data[];
};
static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
struct fprobe_rethook_node *fpr;
- struct rethook_node *rh;
+ struct rethook_node *rh = NULL;
struct fprobe *fp;
- int bit;
+ void *entry_data = NULL;
+ int bit, ret;
fp = container_of(ops, struct fprobe, ops);
if (fprobe_disabled(fp))
@@ -37,9 +39,6 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
return;
}
- if (fp->entry_handler)
- fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
-
if (fp->exit_handler) {
rh = rethook_try_get(fp->rethook);
if (!rh) {
@@ -48,9 +47,20 @@ static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
}
fpr = container_of(rh, struct fprobe_rethook_node, node);
fpr->entry_ip = ip;
- rethook_hook(rh, ftrace_get_regs(fregs), true);
+ if (fp->entry_data_size)
+ entry_data = fpr->data;
}
+ if (fp->entry_handler)
+ ret = fp->entry_handler(fp, ip, ftrace_get_regs(fregs), entry_data);
+
+ /* If entry_handler returns !0, nmissed is not counted. */
+ if (rh) {
+ if (ret)
+ rethook_recycle(rh);
+ else
+ rethook_hook(rh, ftrace_get_regs(fregs), true);
+ }
out:
ftrace_test_recursion_unlock(bit);
}
@@ -81,7 +91,8 @@ static void fprobe_exit_handler(struct rethook_node *rh, void *data,
fpr = container_of(rh, struct fprobe_rethook_node, node);
- fp->exit_handler(fp, fpr->entry_ip, regs);
+ fp->exit_handler(fp, fpr->entry_ip, regs,
+ fp->entry_data_size ? (void *)fpr->data : NULL);
}
NOKPROBE_SYMBOL(fprobe_exit_handler);
@@ -136,7 +147,10 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
}
/* Initialize rethook if needed */
- size = num * num_possible_cpus() * 2;
+ if (fp->nr_maxactive)
+ size = fp->nr_maxactive;
+ else
+ size = num * num_possible_cpus() * 2;
if (size < 0)
return -E2BIG;
@@ -146,7 +160,7 @@ static int fprobe_init_rethook(struct fprobe *fp, int num)
for (i = 0; i < size; i++) {
struct fprobe_rethook_node *node;
- node = kzalloc(sizeof(*node), GFP_KERNEL);
+ node = kzalloc(sizeof(*node) + fp->entry_data_size, GFP_KERNEL);
if (!node) {
rethook_free(fp->rethook);
fp->rethook = NULL;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 29baa97d0d53..76973a70ab9d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -45,6 +45,9 @@
#include "trace_output.h"
#include "trace_stat.h"
+/* Flags that do not get reset */
+#define FTRACE_NOCLEAR_FLAGS (FTRACE_FL_DISABLED | FTRACE_FL_TOUCHED)
+
#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__"
#define FTRACE_WARN_ON(cond) \
@@ -1564,7 +1567,8 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end)
key.flags = end; /* overload flags, as it is unsigned long */
for (pg = ftrace_pages_start; pg; pg = pg->next) {
- if (end < pg->records[0].ip ||
+ if (pg->index == 0 ||
+ end < pg->records[0].ip ||
start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
continue;
rec = bsearch(&key, pg->records, pg->index,
@@ -2255,7 +2259,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
flag ^= rec->flags & FTRACE_FL_ENABLED;
if (update) {
- rec->flags |= FTRACE_FL_ENABLED;
+ rec->flags |= FTRACE_FL_ENABLED | FTRACE_FL_TOUCHED;
if (flag & FTRACE_FL_REGS) {
if (rec->flags & FTRACE_FL_REGS)
rec->flags |= FTRACE_FL_REGS_EN;
@@ -2325,7 +2329,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
if (update) {
/* If there's no more users, clear all flags */
if (!ftrace_rec_count(rec))
- rec->flags &= FTRACE_FL_DISABLED;
+ rec->flags &= FTRACE_NOCLEAR_FLAGS;
else
/*
* Just disable the record, but keep the ops TRAMP
@@ -2582,28 +2586,13 @@ ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
- unsigned long addr;
+ unsigned long addr = READ_ONCE(ops->direct_call);
- addr = ftrace_find_rec_direct(ip);
if (!addr)
return;
arch_ftrace_set_direct_caller(fregs, addr);
}
-
-struct ftrace_ops direct_ops = {
- .func = call_direct_funcs,
- .flags = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
- | FTRACE_OPS_FL_PERMANENT,
- /*
- * By declaring the main trampoline as this trampoline
- * it will never have one allocated for it. Allocated
- * trampolines should not call direct functions.
- * The direct_ops should only be called by the builtin
- * ftrace_regs_caller trampoline.
- */
- .trampoline = FTRACE_REGS_ADDR,
-};
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -3161,7 +3150,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
struct dyn_ftrace *rec;
do_for_each_ftrace_rec(pg, rec) {
- if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_FL_DISABLED))
+ if (FTRACE_WARN_ON_ONCE(rec->flags & ~FTRACE_NOCLEAR_FLAGS))
pr_warn(" %pS flags:%lx\n",
(void *)rec->ip, rec->flags);
} while_for_each_ftrace_rec();
@@ -3612,7 +3601,10 @@ t_func_next(struct seq_file *m, loff_t *pos)
!ftrace_lookup_ip(iter->hash, rec->ip)) ||
((iter->flags & FTRACE_ITER_ENABLED) &&
- !(rec->flags & FTRACE_FL_ENABLED))) {
+ !(rec->flags & FTRACE_FL_ENABLED)) ||
+
+ ((iter->flags & FTRACE_ITER_TOUCHED) &&
+ !(rec->flags & FTRACE_FL_TOUCHED))) {
rec = NULL;
goto retry;
@@ -3871,7 +3863,7 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
- if (iter->flags & FTRACE_ITER_ENABLED) {
+ if (iter->flags & (FTRACE_ITER_ENABLED | FTRACE_ITER_TOUCHED)) {
struct ftrace_ops *ops;
seq_printf(m, " (%ld)%s%s%s%s",
@@ -3973,6 +3965,31 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
return 0;
}
+static int
+ftrace_touched_open(struct inode *inode, struct file *file)
+{
+ struct ftrace_iterator *iter;
+
+ /*
+ * This shows us what functions have ever been enabled
+ * (traced, direct, patched, etc). Not sure if we want lockdown
+ * to hide such critical information for an admin.
+ * Although, perhaps it can show information we don't
+ * want people to see, but if something had traced
+ * something, we probably want to know about it.
+ */
+
+ iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter));
+ if (!iter)
+ return -ENOMEM;
+
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_TOUCHED;
+ iter->ops = &global_ops;
+
+ return 0;
+}
+
/**
* ftrace_regex_open - initialize function tracer filter files
* @ops: The ftrace_ops that hold the hash filters
@@ -5300,388 +5317,9 @@ struct ftrace_direct_func {
static LIST_HEAD(ftrace_direct_funcs);
-/**
- * ftrace_find_direct_func - test an address if it is a registered direct caller
- * @addr: The address of a registered direct caller
- *
- * This searches to see if a ftrace direct caller has been registered
- * at a specific address, and if so, it returns a descriptor for it.
- *
- * This can be used by architecture code to see if an address is
- * a direct caller (trampoline) attached to a fentry/mcount location.
- * This is useful for the function_graph tracer, as it may need to
- * do adjustments if it traced a location that also has a direct
- * trampoline attached to it.
- */
-struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
-{
- struct ftrace_direct_func *entry;
- bool found = false;
-
- /* May be called by fgraph trampoline (protected by rcu tasks) */
- list_for_each_entry_rcu(entry, &ftrace_direct_funcs, next) {
- if (entry->addr == addr) {
- found = true;
- break;
- }
- }
- if (found)
- return entry;
-
- return NULL;
-}
-
-static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
-{
- struct ftrace_direct_func *direct;
-
- direct = kmalloc(sizeof(*direct), GFP_KERNEL);
- if (!direct)
- return NULL;
- direct->addr = addr;
- direct->count = 0;
- list_add_rcu(&direct->next, &ftrace_direct_funcs);
- ftrace_direct_func_count++;
- return direct;
-}
-
static int register_ftrace_function_nolock(struct ftrace_ops *ops);
-/**
- * register_ftrace_direct - Call a custom trampoline directly
- * @ip: The address of the nop at the beginning of a function
- * @addr: The address of the trampoline to call at @ip
- *
- * This is used to connect a direct call from the nop location (@ip)
- * at the start of ftrace traced functions. The location that it calls
- * (@addr) must be able to handle a direct call, and save the parameters
- * of the function being traced, and restore them (or inject new ones
- * if needed), before returning.
- *
- * Returns:
- * 0 on success
- * -EBUSY - Another direct function is already attached (there can be only one)
- * -ENODEV - @ip does not point to a ftrace nop location (or not supported)
- * -ENOMEM - There was an allocation failure.
- */
-int register_ftrace_direct(unsigned long ip, unsigned long addr)
-{
- struct ftrace_direct_func *direct;
- struct ftrace_func_entry *entry;
- struct ftrace_hash *free_hash = NULL;
- struct dyn_ftrace *rec;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- ip = ftrace_location(ip);
- if (!ip)
- goto out_unlock;
-
- /* See if there's a direct function at @ip already */
- ret = -EBUSY;
- if (ftrace_find_rec_direct(ip))
- goto out_unlock;
-
- ret = -ENODEV;
- rec = lookup_rec(ip, ip);
- if (!rec)
- goto out_unlock;
-
- /*
- * Check if the rec says it has a direct call but we didn't
- * find one earlier?
- */
- if (WARN_ON(rec->flags & FTRACE_FL_DIRECT))
- goto out_unlock;
-
- /* Make sure the ip points to the exact record */
- if (ip != rec->ip) {
- ip = rec->ip;
- /* Need to check this ip for a direct. */
- if (ftrace_find_rec_direct(ip))
- goto out_unlock;
- }
-
- ret = -ENOMEM;
- direct = ftrace_find_direct_func(addr);
- if (!direct) {
- direct = ftrace_alloc_direct_func(addr);
- if (!direct)
- goto out_unlock;
- }
-
- entry = ftrace_add_rec_direct(ip, addr, &free_hash);
- if (!entry)
- goto out_unlock;
-
- ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
-
- if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
- ret = register_ftrace_function_nolock(&direct_ops);
- if (ret)
- ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
- }
-
- if (ret) {
- remove_hash_entry(direct_functions, entry);
- kfree(entry);
- if (!direct->count) {
- list_del_rcu(&direct->next);
- synchronize_rcu_tasks();
- kfree(direct);
- if (free_hash)
- free_ftrace_hash(free_hash);
- free_hash = NULL;
- ftrace_direct_func_count--;
- }
- } else {
- direct->count++;
- }
- out_unlock:
- mutex_unlock(&direct_mutex);
-
- if (free_hash) {
- synchronize_rcu_tasks();
- free_ftrace_hash(free_hash);
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(register_ftrace_direct);
-
-static struct ftrace_func_entry *find_direct_entry(unsigned long *ip,
- struct dyn_ftrace **recp)
-{
- struct ftrace_func_entry *entry;
- struct dyn_ftrace *rec;
-
- rec = lookup_rec(*ip, *ip);
- if (!rec)
- return NULL;
-
- entry = __ftrace_lookup_ip(direct_functions, rec->ip);
- if (!entry) {
- WARN_ON(rec->flags & FTRACE_FL_DIRECT);
- return NULL;
- }
-
- WARN_ON(!(rec->flags & FTRACE_FL_DIRECT));
-
- /* Passed in ip just needs to be on the call site */
- *ip = rec->ip;
-
- if (recp)
- *recp = rec;
-
- return entry;
-}
-
-int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
-{
- struct ftrace_direct_func *direct;
- struct ftrace_func_entry *entry;
- struct ftrace_hash *hash;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- ip = ftrace_location(ip);
- if (!ip)
- goto out_unlock;
-
- entry = find_direct_entry(&ip, NULL);
- if (!entry)
- goto out_unlock;
-
- hash = direct_ops.func_hash->filter_hash;
- if (hash->count == 1)
- unregister_ftrace_function(&direct_ops);
-
- ret = ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
-
- WARN_ON(ret);
-
- remove_hash_entry(direct_functions, entry);
-
- direct = ftrace_find_direct_func(addr);
- if (!WARN_ON(!direct)) {
- /* This is the good path (see the ! before WARN) */
- direct->count--;
- WARN_ON(direct->count < 0);
- if (!direct->count) {
- list_del_rcu(&direct->next);
- synchronize_rcu_tasks();
- kfree(direct);
- kfree(entry);
- ftrace_direct_func_count--;
- }
- }
- out_unlock:
- mutex_unlock(&direct_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
-
-static struct ftrace_ops stub_ops = {
- .func = ftrace_stub,
-};
-
-/**
- * ftrace_modify_direct_caller - modify ftrace nop directly
- * @entry: The ftrace hash entry of the direct helper for @rec
- * @rec: The record representing the function site to patch
- * @old_addr: The location that the site at @rec->ip currently calls
- * @new_addr: The location that the site at @rec->ip should call
- *
- * An architecture may overwrite this function to optimize the
- * changing of the direct callback on an ftrace nop location.
- * This is called with the ftrace_lock mutex held, and no other
- * ftrace callbacks are on the associated record (@rec). Thus,
- * it is safe to modify the ftrace record, where it should be
- * currently calling @old_addr directly, to call @new_addr.
- *
- * This is called with direct_mutex locked.
- *
- * Safety checks should be made to make sure that the code at
- * @rec->ip is currently calling @old_addr. And this must
- * also update entry->direct to @new_addr.
- */
-int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
- struct dyn_ftrace *rec,
- unsigned long old_addr,
- unsigned long new_addr)
-{
- unsigned long ip = rec->ip;
- int ret;
-
- lockdep_assert_held(&direct_mutex);
-
- /*
- * The ftrace_lock was used to determine if the record
- * had more than one registered user to it. If it did,
- * we needed to prevent that from changing to do the quick
- * switch. But if it did not (only a direct caller was attached)
- * then this function is called. But this function can deal
- * with attached callers to the rec that we care about, and
- * since this function uses standard ftrace calls that take
- * the ftrace_lock mutex, we need to release it.
- */
- mutex_unlock(&ftrace_lock);
-
- /*
- * By setting a stub function at the same address, we force
- * the code to call the iterator and the direct_ops helper.
- * This means that @ip does not call the direct call, and
- * we can simply modify it.
- */
- ret = ftrace_set_filter_ip(&stub_ops, ip, 0, 0);
- if (ret)
- goto out_lock;
-
- ret = register_ftrace_function_nolock(&stub_ops);
- if (ret) {
- ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
- goto out_lock;
- }
-
- entry->direct = new_addr;
-
- /*
- * By removing the stub, we put back the direct call, calling
- * the @new_addr.
- */
- unregister_ftrace_function(&stub_ops);
- ftrace_set_filter_ip(&stub_ops, ip, 1, 0);
-
- out_lock:
- mutex_lock(&ftrace_lock);
-
- return ret;
-}
-
-/**
- * modify_ftrace_direct - Modify an existing direct call to call something else
- * @ip: The instruction pointer to modify
- * @old_addr: The address that the current @ip calls directly
- * @new_addr: The address that the @ip should call
- *
- * This modifies a ftrace direct caller at an instruction pointer without
- * having to disable it first. The direct call will switch over to the
- * @new_addr without missing anything.
- *
- * Returns: zero on success. Non zero on error, which includes:
- * -ENODEV : the @ip given has no direct caller attached
- * -EINVAL : the @old_addr does not match the current direct caller
- */
-int modify_ftrace_direct(unsigned long ip,
- unsigned long old_addr, unsigned long new_addr)
-{
- struct ftrace_direct_func *direct, *new_direct = NULL;
- struct ftrace_func_entry *entry;
- struct dyn_ftrace *rec;
- int ret = -ENODEV;
-
- mutex_lock(&direct_mutex);
-
- mutex_lock(&ftrace_lock);
-
- ip = ftrace_location(ip);
- if (!ip)
- goto out_unlock;
-
- entry = find_direct_entry(&ip, &rec);
- if (!entry)
- goto out_unlock;
-
- ret = -EINVAL;
- if (entry->direct != old_addr)
- goto out_unlock;
-
- direct = ftrace_find_direct_func(old_addr);
- if (WARN_ON(!direct))
- goto out_unlock;
- if (direct->count > 1) {
- ret = -ENOMEM;
- new_direct = ftrace_alloc_direct_func(new_addr);
- if (!new_direct)
- goto out_unlock;
- direct->count--;
- new_direct->count++;
- } else {
- direct->addr = new_addr;
- }
-
- /*
- * If there's no other ftrace callback on the rec->ip location,
- * then it can be changed directly by the architecture.
- * If there is another caller, then we just need to change the
- * direct caller helper to point to @new_addr.
- */
- if (ftrace_rec_count(rec) == 1) {
- ret = ftrace_modify_direct_caller(entry, rec, old_addr, new_addr);
- } else {
- entry->direct = new_addr;
- ret = 0;
- }
-
- if (unlikely(ret && new_direct)) {
- direct->count++;
- list_del_rcu(&new_direct->next);
- synchronize_rcu_tasks();
- kfree(new_direct);
- ftrace_direct_func_count--;
- }
-
- out_unlock:
- mutex_unlock(&ftrace_lock);
- mutex_unlock(&direct_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct);
-
-#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_ARGS)
static int check_direct_multi(struct ftrace_ops *ops)
{
@@ -5710,7 +5348,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
}
/**
- * register_ftrace_direct_multi - Call a custom trampoline directly
+ * register_ftrace_direct - Call a custom trampoline directly
* for multiple functions registered in @ops
* @ops: The address of the struct ftrace_ops object
* @addr: The address of the trampoline to call at @ops functions
@@ -5731,7 +5369,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
* -ENODEV - @ip does not point to a ftrace nop location (or not supported)
* -ENOMEM - There was an allocation failure.
*/
-int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
struct ftrace_hash *hash, *free_hash = NULL;
struct ftrace_func_entry *entry, *new;
@@ -5773,6 +5411,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
ops->func = call_direct_funcs;
ops->flags = MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
+ ops->direct_call = addr;
err = register_ftrace_function_nolock(ops);
@@ -5789,11 +5428,11 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
}
return err;
}
-EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(register_ftrace_direct);
/**
- * unregister_ftrace_direct_multi - Remove calls to custom trampoline
- * previously registered by register_ftrace_direct_multi for @ops object.
+ * unregister_ftrace_direct - Remove calls to custom trampoline
+ * previously registered by register_ftrace_direct for @ops object.
* @ops: The address of the struct ftrace_ops object
*
* This is used to remove a direct calls to @addr from the nop locations
@@ -5804,7 +5443,8 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
* 0 on success
* -EINVAL - The @ops object was not properly registered.
*/
-int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr,
+ bool free_filters)
{
struct ftrace_hash *hash = ops->func_hash->filter_hash;
int err;
@@ -5822,12 +5462,15 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
/* cleanup for possible another register call */
ops->func = NULL;
ops->trampoline = 0;
+
+ if (free_filters)
+ ftrace_free_filter(ops);
return err;
}
-EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(unregister_ftrace_direct);
static int
-__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+__modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
struct ftrace_hash *hash;
struct ftrace_func_entry *entry, *iter;
@@ -5843,6 +5486,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;
+ tmp_ops.direct_call = addr;
err = register_ftrace_function_nolock(&tmp_ops);
if (err)
@@ -5864,6 +5508,8 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
entry->direct = addr;
}
}
+ /* Prevent store tearing if a trampoline concurrently accesses the value */
+ WRITE_ONCE(ops->direct_call, addr);
mutex_unlock(&ftrace_lock);
@@ -5874,7 +5520,7 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
}
/**
- * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call
+ * modify_ftrace_direct_nolock - Modify an existing direct 'multi' call
* to call something else
* @ops: The address of the struct ftrace_ops object
* @addr: The address of the new trampoline to call at @ops functions
@@ -5891,19 +5537,19 @@ __modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
* Returns: zero on success. Non zero on error, which includes:
* -EINVAL - The @ops object was not properly registered.
*/
-int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr)
+int modify_ftrace_direct_nolock(struct ftrace_ops *ops, unsigned long addr)
{
if (check_direct_multi(ops))
return -EINVAL;
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return -EINVAL;
- return __modify_ftrace_direct_multi(ops, addr);
+ return __modify_ftrace_direct(ops, addr);
}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_nolock);
/**
- * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
+ * modify_ftrace_direct - Modify an existing direct 'multi' call
* to call something else
* @ops: The address of the struct ftrace_ops object
* @addr: The address of the new trampoline to call at @ops functions
@@ -5917,7 +5563,7 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
* Returns: zero on success. Non zero on error, which includes:
* -EINVAL - The @ops object was not properly registered.
*/
-int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+int modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
int err;
@@ -5927,11 +5573,11 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
return -EINVAL;
mutex_lock(&direct_mutex);
- err = __modify_ftrace_direct_multi(ops, addr);
+ err = __modify_ftrace_direct(ops, addr);
mutex_unlock(&direct_mutex);
return err;
}
-EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi);
+EXPORT_SYMBOL_GPL(modify_ftrace_direct);
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
/**
@@ -6257,6 +5903,13 @@ static const struct file_operations ftrace_enabled_fops = {
.release = seq_release_private,
};
+static const struct file_operations ftrace_touched_fops = {
+ .open = ftrace_touched_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
@@ -6721,6 +6374,9 @@ static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer)
trace_create_file("enabled_functions", TRACE_MODE_READ,
d_tracer, NULL, &ftrace_enabled_fops);
+ trace_create_file("touched_functions", TRACE_MODE_READ,
+ d_tracer, NULL, &ftrace_touched_fops);
+
ftrace_create_filter_files(&global_ops, d_tracer);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -8391,8 +8047,7 @@ struct kallsyms_data {
* and returns 1 in case we resolved all the requested symbols,
* 0 otherwise.
*/
-static int kallsyms_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int kallsyms_callback(void *data, const char *name, unsigned long addr)
{
struct kallsyms_data *args = data;
const char **sym;
diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c
index 4850fdfe27f1..5a4b722b5045 100644
--- a/kernel/trace/kprobe_event_gen_test.c
+++ b/kernel/trace/kprobe_event_gen_test.c
@@ -146,7 +146,7 @@ static int __init test_gen_kprobe_cmd(void)
if (trace_event_file_is_valid(gen_kprobe_test))
gen_kprobe_test = NULL;
/* We got an error after creating the event, delete it */
- ret = kprobe_event_delete("gen_kprobe_test");
+ kprobe_event_delete("gen_kprobe_test");
goto out;
}
@@ -211,7 +211,7 @@ static int __init test_gen_kretprobe_cmd(void)
if (trace_event_file_is_valid(gen_kretprobe_test))
gen_kretprobe_test = NULL;
/* We got an error after creating the event, delete it */
- ret = kprobe_event_delete("gen_kretprobe_test");
+ kprobe_event_delete("gen_kretprobe_test");
goto out;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index af50d931b020..834b361a4a66 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -163,7 +163,7 @@ enum {
#define extended_time(event) \
(event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
-static inline int rb_null_event(struct ring_buffer_event *event)
+static inline bool rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
}
@@ -354,10 +354,6 @@ static void rb_init_page(struct buffer_data_page *bpage)
local_set(&bpage->commit, 0);
}
-/*
- * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
- * this issue out.
- */
static void free_buffer_page(struct buffer_page *bpage)
{
free_page((unsigned long)bpage->page);
@@ -367,11 +363,9 @@ static void free_buffer_page(struct buffer_page *bpage)
/*
* We need to fit the time_stamp delta into 27 bits.
*/
-static inline int test_time_stamp(u64 delta)
+static inline bool test_time_stamp(u64 delta)
{
- if (delta & TS_DELTA_TEST)
- return 1;
- return 0;
+ return !!(delta & TS_DELTA_TEST);
}
#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
@@ -700,7 +694,7 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
return ret == expect;
}
-static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
{
unsigned long cnt, top, bottom, msb;
unsigned long cnt2, top2, bottom2, msb2;
@@ -1490,7 +1484,7 @@ rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
return NULL;
}
-static int rb_head_page_replace(struct buffer_page *old,
+static bool rb_head_page_replace(struct buffer_page *old,
struct buffer_page *new)
{
unsigned long *ptr = (unsigned long *)&old->list.prev->next;
@@ -1569,15 +1563,12 @@ static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
}
}
-static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
+static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage)
{
unsigned long val = (unsigned long)bpage;
- if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
- return 1;
-
- return 0;
+ RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK);
}
/**
@@ -1587,30 +1578,28 @@ static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
* As a safety measure we check to make sure the data pages have not
* been corrupted.
*/
-static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *head = rb_list_head(cpu_buffer->pages);
struct list_head *tmp;
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(head->next)->prev) != head))
- return -1;
+ return;
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(head->prev)->next) != head))
- return -1;
+ return;
for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
- return -1;
+ return;
if (RB_WARN_ON(cpu_buffer,
rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
- return -1;
+ return;
}
-
- return 0;
}
static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1778,6 +1767,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
struct list_head *head = cpu_buffer->pages;
struct buffer_page *bpage, *tmp;
+ irq_work_sync(&cpu_buffer->irq_work.work);
+
free_buffer_page(cpu_buffer->reader_page);
if (head) {
@@ -1884,6 +1875,8 @@ ring_buffer_free(struct trace_buffer *buffer)
cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
+ irq_work_sync(&buffer->irq_work.work);
+
for_each_buffer_cpu(buffer, cpu)
rb_free_cpu_buffer(buffer->buffers[cpu]);
@@ -1922,7 +1915,7 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage)
return local_read(&bpage->write) & RB_WRITE_MASK;
}
-static int
+static bool
rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
{
struct list_head *tail_page, *to_remove, *next_page;
@@ -2035,12 +2028,13 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
return nr_removed == 0;
}
-static int
+static bool
rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
struct list_head *pages = &cpu_buffer->new_pages;
- int retries, success;
unsigned long flags;
+ bool success;
+ int retries;
/* Can be called at early boot up, where interrupts must not been enabled */
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
@@ -2059,15 +2053,16 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* spinning.
*/
retries = 10;
- success = 0;
+ success = false;
while (retries--) {
struct list_head *head_page, *prev_page, *r;
struct list_head *last_page, *first_page;
struct list_head *head_page_with_bit;
+ struct buffer_page *hpage = rb_set_head_page(cpu_buffer);
- head_page = &rb_set_head_page(cpu_buffer)->list;
- if (!head_page)
+ if (!hpage)
break;
+ head_page = &hpage->list;
prev_page = head_page->prev;
first_page = pages->next;
@@ -2088,7 +2083,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
* pointer to point to end of list
*/
head_page->prev = last_page;
- success = 1;
+ success = true;
break;
}
}
@@ -2116,7 +2111,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer)
static void rb_update_pages(struct ring_buffer_per_cpu *cpu_buffer)
{
- int success;
+ bool success;
if (cpu_buffer->nr_pages_to_update > 0)
success = rb_insert_pages(cpu_buffer);
@@ -2999,7 +2994,7 @@ static u64 rb_time_delta(struct ring_buffer_event *event)
}
}
-static inline int
+static inline bool
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
@@ -3020,7 +3015,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
delta = rb_time_delta(event);
if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
- return 0;
+ return false;
/* Make sure the write stamp is read before testing the location */
barrier();
@@ -3033,7 +3028,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
/* Something came in, can't discard */
if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
write_stamp, write_stamp - delta))
- return 0;
+ return false;
/*
* It's possible that the event time delta is zero
@@ -3066,12 +3061,12 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
if (index == old_index) {
/* update counters */
local_sub(event_length, &cpu_buffer->entries_bytes);
- return 1;
+ return true;
}
}
/* could not discard */
- return 0;
+ return false;
}
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
@@ -3102,6 +3097,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
if (RB_WARN_ON(cpu_buffer,
rb_is_reader_page(cpu_buffer->tail_page)))
return;
+ /*
+ * No need for a memory barrier here, as the update
+ * of the tail_page did it for this page.
+ */
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(&cpu_buffer->commit_page);
@@ -3111,6 +3110,8 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
while (rb_commit_index(cpu_buffer) !=
rb_page_write(cpu_buffer->commit_page)) {
+ /* Make sure the readers see the content of what is committed. */
+ smp_wmb();
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
RB_WARN_ON(cpu_buffer,
@@ -3286,7 +3287,7 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* Note: The TRANSITION bit only handles a single transition between context.
*/
-static __always_inline int
+static __always_inline bool
trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned int val = cpu_buffer->current_context;
@@ -3303,14 +3304,14 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
bit = RB_CTX_TRANSITION;
if (val & (1 << (bit + cpu_buffer->nest))) {
do_ring_buffer_record_recursion();
- return 1;
+ return true;
}
}
val |= (1 << (bit + cpu_buffer->nest));
cpu_buffer->current_context = val;
- return 0;
+ return false;
}
static __always_inline void
@@ -4067,10 +4068,10 @@ void ring_buffer_record_off(struct trace_buffer *buffer)
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd | RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_off);
@@ -4090,10 +4091,10 @@ void ring_buffer_record_on(struct trace_buffer *buffer)
unsigned int rd;
unsigned int new_rd;
+ rd = atomic_read(&buffer->record_disabled);
do {
- rd = atomic_read(&buffer->record_disabled);
new_rd = rd & ~RB_BUFFER_OFF;
- } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+ } while (!atomic_try_cmpxchg(&buffer->record_disabled, &rd, new_rd));
}
EXPORT_SYMBOL_GPL(ring_buffer_record_on);
@@ -4500,7 +4501,6 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
default:
RB_WARN_ON(cpu_buffer, 1);
}
- return;
}
static void
@@ -4531,7 +4531,6 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
default:
RB_WARN_ON(iter->cpu_buffer, 1);
}
- return;
}
static struct buffer_page *
@@ -4541,7 +4540,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
- int ret;
+ bool ret;
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4688,7 +4687,12 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
/*
* Make sure we see any padding after the write update
- * (see rb_reset_tail())
+ * (see rb_reset_tail()).
+ *
+ * In addition, a writer may be writing on the reader page
+ * if the page has not been fully filled, so the read barrier
+ * is also needed to make sure we see the content of what is
+ * committed by the writer (see rb_set_commit_to_write()).
*/
smp_rmb();
@@ -4946,7 +4950,6 @@ rb_reader_unlock(struct ring_buffer_per_cpu *cpu_buffer, bool locked)
{
if (likely(locked))
raw_spin_unlock(&cpu_buffer->reader_lock);
- return;
}
/**
@@ -5338,6 +5341,9 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
+/* Flag to ensure proper resetting of atomic variables */
+#define RESET_BIT (1 << 30)
+
/**
* ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer
* @buffer: The ring buffer to reset a per cpu buffer of
@@ -5354,20 +5360,27 @@ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
for_each_online_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
- atomic_inc(&cpu_buffer->resize_disabled);
+ atomic_add(RESET_BIT, &cpu_buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
}
/* Make sure all commits have finished */
synchronize_rcu();
- for_each_online_buffer_cpu(buffer, cpu) {
+ for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
+ /*
+ * If a CPU came online during the synchronize_rcu(), then
+ * ignore it.
+ */
+ if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT))
+ continue;
+
reset_disabled_cpu_buffer(cpu_buffer);
atomic_dec(&cpu_buffer->record_disabled);
- atomic_dec(&cpu_buffer->resize_disabled);
+ atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
}
mutex_unlock(&buffer->mutex);
@@ -5417,8 +5430,8 @@ bool ring_buffer_empty(struct trace_buffer *buffer)
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
+ bool ret;
int cpu;
- int ret;
/* yes this is racy, but if you don't like the race, lock the buffer */
for_each_buffer_cpu(buffer, cpu) {
@@ -5447,7 +5460,7 @@ bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
bool dolock;
- int ret;
+ bool ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return true;
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
index d65f6c25a87c..0186ff4cbd0b 100644
--- a/kernel/trace/rv/reactor_panic.c
+++ b/kernel/trace/rv/reactor_panic.c
@@ -38,6 +38,5 @@ static void __exit unregister_react_panic(void)
module_init(register_react_panic);
module_exit(unregister_react_panic);
-MODULE_LICENSE("GPL");
MODULE_AUTHOR("Daniel Bristot de Oliveira");
MODULE_DESCRIPTION("panic rv reactor: panic if an exception is found.");
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
index 4b6b7106a477..178759dbf89f 100644
--- a/kernel/trace/rv/reactor_printk.c
+++ b/kernel/trace/rv/reactor_printk.c
@@ -37,6 +37,5 @@ static void __exit unregister_react_printk(void)
module_init(register_react_printk);
module_exit(unregister_react_printk);
-MODULE_LICENSE("GPL");
MODULE_AUTHOR("Daniel Bristot de Oliveira");
MODULE_DESCRIPTION("printk rv reactor: printk if an exception is hit.");
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 7e9061828c24..2f68e93fff0b 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -290,8 +290,6 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
if (retval)
return retval;
- retval = count;
-
mutex_lock(&rv_interface_lock);
if (val)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45551c7b4c36..427da2341bf0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1149,22 +1149,22 @@ static void tracing_snapshot_instance_cond(struct trace_array *tr,
unsigned long flags;
if (in_nmi()) {
- internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
- internal_trace_puts("*** snapshot is being ignored ***\n");
+ trace_array_puts(tr, "*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ trace_array_puts(tr, "*** snapshot is being ignored ***\n");
return;
}
if (!tr->allocated_snapshot) {
- internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
- internal_trace_puts("*** stopping trace here! ***\n");
- tracing_off();
+ trace_array_puts(tr, "*** SNAPSHOT NOT ALLOCATED ***\n");
+ trace_array_puts(tr, "*** stopping trace here! ***\n");
+ tracer_tracing_off(tr);
return;
}
/* Note, snapshot can not be used when the tracer uses it */
if (tracer->use_max_tr) {
- internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
- internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
+ trace_array_puts(tr, "*** LATENCY TRACER ACTIVE ***\n");
+ trace_array_puts(tr, "*** Can not use snapshot (sorry) ***\n");
return;
}
@@ -3726,7 +3726,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
#define STATIC_FMT_BUF_SIZE 128
static char static_fmt_buf[STATIC_FMT_BUF_SIZE];
-static char *trace_iter_expand_format(struct trace_iterator *iter)
+char *trace_iter_expand_format(struct trace_iterator *iter)
{
char *tmp;
@@ -4446,8 +4446,11 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
if (trace_seq_has_overflowed(s))
return TRACE_TYPE_PARTIAL_LINE;
- if (event)
+ if (event) {
+ if (tr->trace_flags & TRACE_ITER_FIELDS)
+ return print_event_fields(iter, event);
return event->funcs->trace(iter, sym_flags, event);
+ }
trace_seq_printf(s, "Unknown type %d\n", entry->type);
@@ -5167,6 +5170,8 @@ loff_t tracing_lseek(struct file *file, loff_t offset, int whence)
static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
+ .read_iter = seq_read_iter,
+ .splice_read = generic_file_splice_read,
.write = tracing_write_stub,
.llseek = tracing_lseek,
.release = tracing_release,
@@ -9514,6 +9519,7 @@ static int __remove_instance(struct trace_array *tr)
tracefs_remove(tr->dir);
free_percpu(tr->last_func_repeats);
free_trace_buffers(tr);
+ clear_tracing_err_log(tr);
for (i = 0; i < tr->nr_topts; i++) {
kfree(tr->topts[i].topts);
@@ -10391,19 +10397,20 @@ out:
void __init ftrace_boot_snapshot(void)
{
+#ifdef CONFIG_TRACER_MAX_TRACE
struct trace_array *tr;
- if (snapshot_at_boot) {
- tracing_snapshot();
- internal_trace_puts("** Boot snapshot taken **\n");
- }
+ if (!snapshot_at_boot)
+ return;
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr == &global_trace)
+ if (!tr->allocated_snapshot)
continue;
- trace_array_puts(tr, "** Boot snapshot taken **\n");
+
tracing_snapshot_instance(tr);
+ trace_array_puts(tr, "** Boot snapshot taken **\n");
}
+#endif
}
void __init early_trace_init(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616e1aa1c4da..79bdefe9261b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -619,6 +619,7 @@ bool trace_is_tracepoint_string(const char *str);
const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
va_list ap) __printf(2, 0);
+char *trace_iter_expand_format(struct trace_iterator *iter);
int trace_empty(struct trace_iterator *iter);
@@ -1199,6 +1200,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(HEX, "hex"), \
C(BIN, "bin"), \
C(BLOCK, "block"), \
+ C(FIELDS, "fields"), \
C(PRINTK, "trace_printk"), \
C(ANNOTATE, "annotate"), \
C(USERSTACKTRACE, "userstacktrace"), \
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 89877a18f933..486cca3c2b75 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1331,6 +1331,9 @@ static const char *hist_field_name(struct hist_field *field,
{
const char *field_name = "";
+ if (WARN_ON_ONCE(!field))
+ return field_name;
+
if (level > 1)
return field_name;
@@ -4235,6 +4238,15 @@ static int __create_val_field(struct hist_trigger_data *hist_data,
goto out;
}
+ /* Some types cannot be a value */
+ if (hist_field->flags & (HIST_FIELD_FL_GRAPH | HIST_FIELD_FL_PERCENT |
+ HIST_FIELD_FL_BUCKET | HIST_FIELD_FL_LOG2 |
+ HIST_FIELD_FL_SYM | HIST_FIELD_FL_SYM_OFFSET |
+ HIST_FIELD_FL_SYSCALL | HIST_FIELD_FL_STACKTRACE)) {
+ hist_err(file->tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(field_str));
+ ret = -EINVAL;
+ }
+
hist_data->fields[val_idx] = hist_field;
++hist_data->n_vals;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 46d0abb32d0f..d6a70aff2410 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -44,14 +44,21 @@ enum { ERRORS };
static const char *err_text[] = { ERRORS };
+static DEFINE_MUTEX(lastcmd_mutex);
static char *last_cmd;
static int errpos(const char *str)
{
+ int ret = 0;
+
+ mutex_lock(&lastcmd_mutex);
if (!str || !last_cmd)
- return 0;
+ goto out;
- return err_pos(last_cmd, str);
+ ret = err_pos(last_cmd, str);
+ out:
+ mutex_unlock(&lastcmd_mutex);
+ return ret;
}
static void last_cmd_set(const char *str)
@@ -59,18 +66,22 @@ static void last_cmd_set(const char *str)
if (!str)
return;
+ mutex_lock(&lastcmd_mutex);
kfree(last_cmd);
-
last_cmd = kstrdup(str, GFP_KERNEL);
+ mutex_unlock(&lastcmd_mutex);
}
static void synth_err(u8 err_type, u16 err_pos)
{
+ mutex_lock(&lastcmd_mutex);
if (!last_cmd)
- return;
+ goto out;
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
+ out:
+ mutex_unlock(&lastcmd_mutex);
}
static int create_synth_event(const char *raw_command);
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 908e8a13c675..b1ecd7677642 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -19,14 +19,12 @@
#include <linux/tracefs.h>
#include <linux/types.h>
#include <linux/uaccess.h>
-/* Reminder to move to uapi when everything works */
-#ifdef CONFIG_COMPILE_TEST
+#include <linux/highmem.h>
+#include <linux/init.h>
#include <linux/user_events.h>
-#else
-#include <uapi/linux/user_events.h>
-#endif
-#include "trace.h"
#include "trace_dynevent.h"
+#include "trace_output.h"
+#include "trace.h"
#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
@@ -34,35 +32,12 @@
#define FIELD_DEPTH_NAME 1
#define FIELD_DEPTH_SIZE 2
-/*
- * Limits how many trace_event calls user processes can create:
- * Must be a power of two of PAGE_SIZE.
- */
-#define MAX_PAGE_ORDER 0
-#define MAX_PAGES (1 << MAX_PAGE_ORDER)
-#define MAX_BYTES (MAX_PAGES * PAGE_SIZE)
-#define MAX_EVENTS (MAX_BYTES * 8)
-
/* Limit how long of an event name plus args within the subsystem. */
#define MAX_EVENT_DESC 512
#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
#define MAX_FIELD_ARRAY_SIZE 1024
/*
- * The MAP_STATUS_* macros are used for taking a index and determining the
- * appropriate byte and the bit in the byte to set/reset for an event.
- *
- * The lower 3 bits of the index decide which bit to set.
- * The remaining upper bits of the index decide which byte to use for the bit.
- *
- * This is used when an event has a probe attached/removed to reflect live
- * status of the event wanting tracing or not to user-programs via shared
- * memory maps.
- */
-#define MAP_STATUS_BYTE(index) ((index) >> 3)
-#define MAP_STATUS_MASK(index) BIT((index) & 7)
-
-/*
* Internal bits (kernel side only) to keep track of connected probes:
* These are used when status is requested in text form about an event. These
* bits are compared against an internal byte on the event to determine which
@@ -75,25 +50,25 @@
#define EVENT_STATUS_OTHER BIT(7)
/*
- * Stores the pages, tables, and locks for a group of events.
- * Each logical grouping of events has its own group, with a
- * matching page for status checks within user programs. This
- * allows for isolation of events to user programs by various
- * means.
+ * Stores the system name, tables, and locks for a group of events. This
+ * allows isolation for events by various means.
*/
struct user_event_group {
- struct page *pages;
- char *register_page_data;
- char *system_name;
- struct hlist_node node;
- struct mutex reg_mutex;
+ char *system_name;
+ struct hlist_node node;
+ struct mutex reg_mutex;
DECLARE_HASHTABLE(register_table, 8);
- DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
};
/* Group for init_user_ns mapping, top-most group */
static struct user_event_group *init_group;
+/* Max allowed events for the whole system */
+static unsigned int max_user_events = 32768;
+
+/* Current number of events on the whole system */
+static unsigned int current_user_events;
+
/*
* Stores per-event properties, as users register events
* within a file a user_event might be created if it does not
@@ -102,21 +77,61 @@ static struct user_event_group *init_group;
* refcnt reaches one.
*/
struct user_event {
- struct user_event_group *group;
- struct tracepoint tracepoint;
- struct trace_event_call call;
- struct trace_event_class class;
- struct dyn_event devent;
- struct hlist_node node;
- struct list_head fields;
- struct list_head validators;
- refcount_t refcnt;
- int index;
- int flags;
- int min_size;
- char status;
+ struct user_event_group *group;
+ struct tracepoint tracepoint;
+ struct trace_event_call call;
+ struct trace_event_class class;
+ struct dyn_event devent;
+ struct hlist_node node;
+ struct list_head fields;
+ struct list_head validators;
+ refcount_t refcnt;
+ int min_size;
+ char status;
+};
+
+/*
+ * Stores per-mm/event properties that enable an address to be
+ * updated properly for each task. As tasks are forked, we use
+ * these to track enablement sites that are tied to an event.
+ */
+struct user_event_enabler {
+ struct list_head link;
+ struct user_event *event;
+ unsigned long addr;
+
+ /* Track enable bit, flags, etc. Aligned for bitops. */
+ unsigned int values;
+};
+
+/* Bits 0-5 are for the bit to update upon enable/disable (0-63 allowed) */
+#define ENABLE_VAL_BIT_MASK 0x3F
+
+/* Bit 6 is for faulting status of enablement */
+#define ENABLE_VAL_FAULTING_BIT 6
+
+/* Bit 7 is for freeing status of enablement */
+#define ENABLE_VAL_FREEING_BIT 7
+
+/* Only duplicate the bit value */
+#define ENABLE_VAL_DUP_MASK ENABLE_VAL_BIT_MASK
+
+#define ENABLE_BITOPS(e) ((unsigned long *)&(e)->values)
+
+/* Used for asynchronous faulting in of pages */
+struct user_event_enabler_fault {
+ struct work_struct work;
+ struct user_event_mm *mm;
+ struct user_event_enabler *enabler;
+ int attempt;
};
+static struct kmem_cache *fault_cache;
+
+/* Global list of memory descriptors using user_events */
+static LIST_HEAD(user_event_mms);
+static DEFINE_SPINLOCK(user_event_mms_lock);
+
/*
* Stores per-file events references, as users register events
* within a file this structure is modified and freed via RCU.
@@ -124,23 +139,23 @@ struct user_event {
* These are not shared and only accessible by the file that created it.
*/
struct user_event_refs {
- struct rcu_head rcu;
- int count;
- struct user_event *events[];
+ struct rcu_head rcu;
+ int count;
+ struct user_event *events[];
};
struct user_event_file_info {
- struct user_event_group *group;
- struct user_event_refs *refs;
+ struct user_event_group *group;
+ struct user_event_refs *refs;
};
#define VALIDATOR_ENSURE_NULL (1 << 0)
#define VALIDATOR_REL (1 << 1)
struct user_event_validator {
- struct list_head link;
- int offset;
- int flags;
+ struct list_head link;
+ int offset;
+ int flags;
};
typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
@@ -150,33 +165,17 @@ static int user_event_parse(struct user_event_group *group, char *name,
char *args, char *flags,
struct user_event **newuser);
+static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm);
+static struct user_event_mm *user_event_mm_get_all(struct user_event *user);
+static void user_event_mm_put(struct user_event_mm *mm);
+
static u32 user_event_key(char *name)
{
return jhash(name, strlen(name), 0);
}
-static void set_page_reservations(char *pages, bool set)
-{
- int page;
-
- for (page = 0; page < MAX_PAGES; ++page) {
- void *addr = pages + (PAGE_SIZE * page);
-
- if (set)
- SetPageReserved(virt_to_page(addr));
- else
- ClearPageReserved(virt_to_page(addr));
- }
-}
-
static void user_event_group_destroy(struct user_event_group *group)
{
- if (group->register_page_data)
- set_page_reservations(group->register_page_data, false);
-
- if (group->pages)
- __free_pages(group->pages, MAX_PAGE_ORDER);
-
kfree(group->system_name);
kfree(group);
}
@@ -247,19 +246,6 @@ static struct user_event_group
if (!group->system_name)
goto error;
- group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
-
- if (!group->pages)
- goto error;
-
- group->register_page_data = page_address(group->pages);
-
- set_page_reservations(group->register_page_data, true);
-
- /* Zero all bits beside 0 (which is reserved for failures) */
- bitmap_zero(group->page_bitmap, MAX_EVENTS);
- set_bit(0, group->page_bitmap);
-
mutex_init(&group->reg_mutex);
hash_init(group->register_table);
@@ -271,20 +257,514 @@ error:
return NULL;
};
-static __always_inline
-void user_event_register_set(struct user_event *user)
+static void user_event_enabler_destroy(struct user_event_enabler *enabler)
+{
+ list_del_rcu(&enabler->link);
+
+ /* No longer tracking the event via the enabler */
+ refcount_dec(&enabler->event->refcnt);
+
+ kfree(enabler);
+}
+
+static int user_event_mm_fault_in(struct user_event_mm *mm, unsigned long uaddr,
+ int attempt)
+{
+ bool unlocked;
+ int ret;
+
+ /*
+ * Normally this is low, ensure that it cannot be taken advantage of by
+ * bad user processes to cause excessive looping.
+ */
+ if (attempt > 10)
+ return -EFAULT;
+
+ mmap_read_lock(mm->mm);
+
+ /* Ensure MM has tasks, cannot use after exit_mm() */
+ if (refcount_read(&mm->tasks) == 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = fixup_user_fault(mm->mm, uaddr, FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ &unlocked);
+out:
+ mmap_read_unlock(mm->mm);
+
+ return ret;
+}
+
+static int user_event_enabler_write(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ bool fixup_fault, int *attempt);
+
+static void user_event_enabler_fault_fixup(struct work_struct *work)
+{
+ struct user_event_enabler_fault *fault = container_of(
+ work, struct user_event_enabler_fault, work);
+ struct user_event_enabler *enabler = fault->enabler;
+ struct user_event_mm *mm = fault->mm;
+ unsigned long uaddr = enabler->addr;
+ int attempt = fault->attempt;
+ int ret;
+
+ ret = user_event_mm_fault_in(mm, uaddr, attempt);
+
+ if (ret && ret != -ENOENT) {
+ struct user_event *user = enabler->event;
+
+ pr_warn("user_events: Fault for mm: 0x%pK @ 0x%llx event: %s\n",
+ mm->mm, (unsigned long long)uaddr, EVENT_NAME(user));
+ }
+
+ /* Prevent state changes from racing */
+ mutex_lock(&event_mutex);
+
+ /* User asked for enabler to be removed during fault */
+ if (test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))) {
+ user_event_enabler_destroy(enabler);
+ goto out;
+ }
+
+ /*
+ * If we managed to get the page, re-issue the write. We do not
+ * want to get into a possible infinite loop, which is why we only
+ * attempt again directly if the page came in. If we couldn't get
+ * the page here, then we will try again the next time the event is
+ * enabled/disabled.
+ */
+ clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!ret) {
+ mmap_read_lock(mm->mm);
+ user_event_enabler_write(mm, enabler, true, &attempt);
+ mmap_read_unlock(mm->mm);
+ }
+out:
+ mutex_unlock(&event_mutex);
+
+ /* In all cases we no longer need the mm or fault */
+ user_event_mm_put(mm);
+ kmem_cache_free(fault_cache, fault);
+}
+
+static bool user_event_enabler_queue_fault(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ int attempt)
+{
+ struct user_event_enabler_fault *fault;
+
+ fault = kmem_cache_zalloc(fault_cache, GFP_NOWAIT | __GFP_NOWARN);
+
+ if (!fault)
+ return false;
+
+ INIT_WORK(&fault->work, user_event_enabler_fault_fixup);
+ fault->mm = user_event_mm_get(mm);
+ fault->enabler = enabler;
+ fault->attempt = attempt;
+
+ /* Don't try to queue in again while we have a pending fault */
+ set_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!schedule_work(&fault->work)) {
+ /* Allow another attempt later */
+ clear_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler));
+
+ user_event_mm_put(mm);
+ kmem_cache_free(fault_cache, fault);
+
+ return false;
+ }
+
+ return true;
+}
+
+static int user_event_enabler_write(struct user_event_mm *mm,
+ struct user_event_enabler *enabler,
+ bool fixup_fault, int *attempt)
+{
+ unsigned long uaddr = enabler->addr;
+ unsigned long *ptr;
+ struct page *page;
+ void *kaddr;
+ int ret;
+
+ lockdep_assert_held(&event_mutex);
+ mmap_assert_locked(mm->mm);
+
+ *attempt += 1;
+
+ /* Ensure MM has tasks, cannot use after exit_mm() */
+ if (refcount_read(&mm->tasks) == 0)
+ return -ENOENT;
+
+ if (unlikely(test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)) ||
+ test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler))))
+ return -EBUSY;
+
+ ret = pin_user_pages_remote(mm->mm, uaddr, 1, FOLL_WRITE | FOLL_NOFAULT,
+ &page, NULL, NULL);
+
+ if (unlikely(ret <= 0)) {
+ if (!fixup_fault)
+ return -EFAULT;
+
+ if (!user_event_enabler_queue_fault(mm, enabler, *attempt))
+ pr_warn("user_events: Unable to queue fault handler\n");
+
+ return -EFAULT;
+ }
+
+ kaddr = kmap_local_page(page);
+ ptr = kaddr + (uaddr & ~PAGE_MASK);
+
+ /* Update bit atomically, user tracers must be atomic as well */
+ if (enabler->event && enabler->event->status)
+ set_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr);
+ else
+ clear_bit(enabler->values & ENABLE_VAL_BIT_MASK, ptr);
+
+ kunmap_local(kaddr);
+ unpin_user_pages_dirty_lock(&page, 1, true);
+
+ return 0;
+}
+
+static bool user_event_enabler_exists(struct user_event_mm *mm,
+ unsigned long uaddr, unsigned char bit)
{
- int i = user->index;
+ struct user_event_enabler *enabler;
+ struct user_event_enabler *next;
- user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i);
+ list_for_each_entry_safe(enabler, next, &mm->enablers, link) {
+ if (enabler->addr == uaddr &&
+ (enabler->values & ENABLE_VAL_BIT_MASK) == bit)
+ return true;
+ }
+
+ return false;
}
-static __always_inline
-void user_event_register_clear(struct user_event *user)
+static void user_event_enabler_update(struct user_event *user)
{
- int i = user->index;
+ struct user_event_enabler *enabler;
+ struct user_event_mm *mm = user_event_mm_get_all(user);
+ struct user_event_mm *next;
+ int attempt;
+
+ while (mm) {
+ next = mm->next;
+ mmap_read_lock(mm->mm);
+ rcu_read_lock();
- user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i);
+ list_for_each_entry_rcu(enabler, &mm->enablers, link) {
+ if (enabler->event == user) {
+ attempt = 0;
+ user_event_enabler_write(mm, enabler, true, &attempt);
+ }
+ }
+
+ rcu_read_unlock();
+ mmap_read_unlock(mm->mm);
+ user_event_mm_put(mm);
+ mm = next;
+ }
+}
+
+static bool user_event_enabler_dup(struct user_event_enabler *orig,
+ struct user_event_mm *mm)
+{
+ struct user_event_enabler *enabler;
+
+ /* Skip pending frees */
+ if (unlikely(test_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(orig))))
+ return true;
+
+ enabler = kzalloc(sizeof(*enabler), GFP_NOWAIT | __GFP_ACCOUNT);
+
+ if (!enabler)
+ return false;
+
+ enabler->event = orig->event;
+ enabler->addr = orig->addr;
+
+ /* Only dup part of value (ignore future flags, etc) */
+ enabler->values = orig->values & ENABLE_VAL_DUP_MASK;
+
+ refcount_inc(&enabler->event->refcnt);
+ list_add_rcu(&enabler->link, &mm->enablers);
+
+ return true;
+}
+
+static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm)
+{
+ refcount_inc(&mm->refcnt);
+
+ return mm;
+}
+
+static struct user_event_mm *user_event_mm_get_all(struct user_event *user)
+{
+ struct user_event_mm *found = NULL;
+ struct user_event_enabler *enabler;
+ struct user_event_mm *mm;
+
+ /*
+ * We do not want to block fork/exec while enablements are being
+ * updated, so we use RCU to walk the current tasks that have used
+ * user_events ABI for 1 or more events. Each enabler found in each
+ * task that matches the event being updated has a write to reflect
+ * the kernel state back into the process. Waits/faults must not occur
+ * during this. So we scan the list under RCU for all the mm that have
+ * the event within it. This is needed because mm_read_lock() can wait.
+ * Each user mm returned has a ref inc to handle remove RCU races.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(mm, &user_event_mms, link)
+ list_for_each_entry_rcu(enabler, &mm->enablers, link)
+ if (enabler->event == user) {
+ mm->next = found;
+ found = user_event_mm_get(mm);
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return found;
+}
+
+static struct user_event_mm *user_event_mm_create(struct task_struct *t)
+{
+ struct user_event_mm *user_mm;
+ unsigned long flags;
+
+ user_mm = kzalloc(sizeof(*user_mm), GFP_KERNEL_ACCOUNT);
+
+ if (!user_mm)
+ return NULL;
+
+ user_mm->mm = t->mm;
+ INIT_LIST_HEAD(&user_mm->enablers);
+ refcount_set(&user_mm->refcnt, 1);
+ refcount_set(&user_mm->tasks, 1);
+
+ spin_lock_irqsave(&user_event_mms_lock, flags);
+ list_add_rcu(&user_mm->link, &user_event_mms);
+ spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+ t->user_event_mm = user_mm;
+
+ /*
+ * The lifetime of the memory descriptor can slightly outlast
+ * the task lifetime if a ref to the user_event_mm is taken
+ * between list_del_rcu() and call_rcu(). Therefore we need
+ * to take a reference to it to ensure it can live this long
+ * under this corner case. This can also occur in clones that
+ * outlast the parent.
+ */
+ mmgrab(user_mm->mm);
+
+ return user_mm;
+}
+
+static struct user_event_mm *current_user_event_mm(void)
+{
+ struct user_event_mm *user_mm = current->user_event_mm;
+
+ if (user_mm)
+ goto inc;
+
+ user_mm = user_event_mm_create(current);
+
+ if (!user_mm)
+ goto error;
+inc:
+ refcount_inc(&user_mm->refcnt);
+error:
+ return user_mm;
+}
+
+static void user_event_mm_destroy(struct user_event_mm *mm)
+{
+ struct user_event_enabler *enabler, *next;
+
+ list_for_each_entry_safe(enabler, next, &mm->enablers, link)
+ user_event_enabler_destroy(enabler);
+
+ mmdrop(mm->mm);
+ kfree(mm);
+}
+
+static void user_event_mm_put(struct user_event_mm *mm)
+{
+ if (mm && refcount_dec_and_test(&mm->refcnt))
+ user_event_mm_destroy(mm);
+}
+
+static void delayed_user_event_mm_put(struct work_struct *work)
+{
+ struct user_event_mm *mm;
+
+ mm = container_of(to_rcu_work(work), struct user_event_mm, put_rwork);
+ user_event_mm_put(mm);
+}
+
+void user_event_mm_remove(struct task_struct *t)
+{
+ struct user_event_mm *mm;
+ unsigned long flags;
+
+ might_sleep();
+
+ mm = t->user_event_mm;
+ t->user_event_mm = NULL;
+
+ /* Clone will increment the tasks, only remove if last clone */
+ if (!refcount_dec_and_test(&mm->tasks))
+ return;
+
+ /* Remove the mm from the list, so it can no longer be enabled */
+ spin_lock_irqsave(&user_event_mms_lock, flags);
+ list_del_rcu(&mm->link);
+ spin_unlock_irqrestore(&user_event_mms_lock, flags);
+
+ /*
+ * We need to wait for currently occurring writes to stop within
+ * the mm. This is required since exit_mm() snaps the current rss
+ * stats and clears them. On the final mmdrop(), check_mm() will
+ * report a bug if these increment.
+ *
+ * All writes/pins are done under mmap_read lock, take the write
+ * lock to ensure in-progress faults have completed. Faults that
+ * are pending but yet to run will check the task count and skip
+ * the fault since the mm is going away.
+ */
+ mmap_write_lock(mm->mm);
+ mmap_write_unlock(mm->mm);
+
+ /*
+ * Put for mm must be done after RCU delay to handle new refs in
+ * between the list_del_rcu() and now. This ensures any get refs
+ * during rcu_read_lock() are accounted for during list removal.
+ *
+ * CPU A | CPU B
+ * ---------------------------------------------------------------
+ * user_event_mm_remove() | rcu_read_lock();
+ * list_del_rcu() | list_for_each_entry_rcu();
+ * call_rcu() | refcount_inc();
+ * . | rcu_read_unlock();
+ * schedule_work() | .
+ * user_event_mm_put() | .
+ *
+ * mmdrop() cannot be called in the softirq context of call_rcu()
+ * so we use a work queue after call_rcu() to run within.
+ */
+ INIT_RCU_WORK(&mm->put_rwork, delayed_user_event_mm_put);
+ queue_rcu_work(system_wq, &mm->put_rwork);
+}
+
+void user_event_mm_dup(struct task_struct *t, struct user_event_mm *old_mm)
+{
+ struct user_event_mm *mm = user_event_mm_create(t);
+ struct user_event_enabler *enabler;
+
+ if (!mm)
+ return;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(enabler, &old_mm->enablers, link)
+ if (!user_event_enabler_dup(enabler, mm))
+ goto error;
+
+ rcu_read_unlock();
+
+ return;
+error:
+ rcu_read_unlock();
+ user_event_mm_remove(t);
+}
+
+static bool current_user_event_enabler_exists(unsigned long uaddr,
+ unsigned char bit)
+{
+ struct user_event_mm *user_mm = current_user_event_mm();
+ bool exists;
+
+ if (!user_mm)
+ return false;
+
+ exists = user_event_enabler_exists(user_mm, uaddr, bit);
+
+ user_event_mm_put(user_mm);
+
+ return exists;
+}
+
+static struct user_event_enabler
+*user_event_enabler_create(struct user_reg *reg, struct user_event *user,
+ int *write_result)
+{
+ struct user_event_enabler *enabler;
+ struct user_event_mm *user_mm;
+ unsigned long uaddr = (unsigned long)reg->enable_addr;
+ int attempt = 0;
+
+ user_mm = current_user_event_mm();
+
+ if (!user_mm)
+ return NULL;
+
+ enabler = kzalloc(sizeof(*enabler), GFP_KERNEL_ACCOUNT);
+
+ if (!enabler)
+ goto out;
+
+ enabler->event = user;
+ enabler->addr = uaddr;
+ enabler->values = reg->enable_bit;
+retry:
+ /* Prevents state changes from racing with new enablers */
+ mutex_lock(&event_mutex);
+
+ /* Attempt to reflect the current state within the process */
+ mmap_read_lock(user_mm->mm);
+ *write_result = user_event_enabler_write(user_mm, enabler, false,
+ &attempt);
+ mmap_read_unlock(user_mm->mm);
+
+ /*
+ * If the write works, then we will track the enabler. A ref to the
+ * underlying user_event is held by the enabler to prevent it going
+ * away while the enabler is still in use by a process. The ref is
+ * removed when the enabler is destroyed. This means a event cannot
+ * be forcefully deleted from the system until all tasks using it
+ * exit or run exec(), which includes forks and clones.
+ */
+ if (!*write_result) {
+ refcount_inc(&enabler->event->refcnt);
+ list_add_rcu(&enabler->link, &user_mm->enablers);
+ }
+
+ mutex_unlock(&event_mutex);
+
+ if (*write_result) {
+ /* Attempt to fault-in and retry if it worked */
+ if (!user_event_mm_fault_in(user_mm, uaddr, attempt))
+ goto retry;
+
+ kfree(enabler);
+ enabler = NULL;
+ }
+out:
+ user_event_mm_put(user_mm);
+
+ return enabler;
}
static __always_inline __must_check
@@ -449,7 +929,7 @@ static int user_event_add_field(struct user_event *user, const char *type,
struct ftrace_event_field *field;
int validator_flags = 0;
- field = kmalloc(sizeof(*field), GFP_KERNEL);
+ field = kmalloc(sizeof(*field), GFP_KERNEL_ACCOUNT);
if (!field)
return -ENOMEM;
@@ -468,7 +948,7 @@ add_validator:
if (strstr(type, "char") != NULL)
validator_flags |= VALIDATOR_ENSURE_NULL;
- validator = kmalloc(sizeof(*validator), GFP_KERNEL);
+ validator = kmalloc(sizeof(*validator), GFP_KERNEL_ACCOUNT);
if (!validator) {
kfree(field);
@@ -489,6 +969,9 @@ add_field:
field->is_signed = is_signed;
field->filter_type = filter_type;
+ if (filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(type);
+
list_add(&field->link, &user->fields);
/*
@@ -754,7 +1237,7 @@ static int user_event_create_print_fmt(struct user_event *user)
len = user_event_set_print_fmt(user, NULL, 0);
- print_fmt = kmalloc(len, GFP_KERNEL);
+ print_fmt = kmalloc(len, GFP_KERNEL_ACCOUNT);
if (!print_fmt)
return -ENOMEM;
@@ -770,11 +1253,7 @@ static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
int flags,
struct trace_event *event)
{
- /* Unsafe to try to decode user provided print_fmt, use hex */
- trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
- 1, iter->ent, iter->ent_size, true);
-
- return trace_handle_return(&iter->seq);
+ return print_event_fields(iter, event);
}
static struct trace_event_functions user_event_funcs = {
@@ -820,6 +1299,8 @@ static int destroy_user_event(struct user_event *user)
{
int ret = 0;
+ lockdep_assert_held(&event_mutex);
+
/* Must destroy fields before call removal */
user_event_destroy_fields(user);
@@ -829,9 +1310,6 @@ static int destroy_user_event(struct user_event *user)
return ret;
dyn_event_remove(&user->devent);
-
- user_event_register_clear(user);
- clear_bit(user->index, user->group->page_bitmap);
hash_del(&user->node);
user_event_destroy_validators(user);
@@ -839,6 +1317,11 @@ static int destroy_user_event(struct user_event *user)
kfree(EVENT_NAME(user));
kfree(user);
+ if (current_user_events > 0)
+ current_user_events--;
+ else
+ pr_alert("BUG: Bad current_user_events\n");
+
return ret;
}
@@ -977,9 +1460,9 @@ discard:
#endif
/*
- * Update the register page that is shared between user processes.
+ * Update the enabled bit among all user processes.
*/
-static void update_reg_page_for(struct user_event *user)
+static void update_enable_bit_for(struct user_event *user)
{
struct tracepoint *tp = &user->tracepoint;
char status = 0;
@@ -1010,12 +1493,9 @@ static void update_reg_page_for(struct user_event *user)
rcu_read_unlock_sched();
}
- if (status)
- user_event_register_set(user);
- else
- user_event_register_clear(user);
-
user->status = status;
+
+ user_event_enabler_update(user);
}
/*
@@ -1072,10 +1552,10 @@ static int user_event_reg(struct trace_event_call *call,
return ret;
inc:
refcount_inc(&user->refcnt);
- update_reg_page_for(user);
+ update_enable_bit_for(user);
return 0;
dec:
- update_reg_page_for(user);
+ update_enable_bit_for(user);
refcount_dec(&user->refcnt);
return 0;
}
@@ -1093,7 +1573,7 @@ static int user_event_create(const char *raw_command)
raw_command += USER_EVENTS_PREFIX_LEN;
raw_command = skip_spaces(raw_command);
- name = kstrdup(raw_command, GFP_KERNEL);
+ name = kstrdup(raw_command, GFP_KERNEL_ACCOUNT);
if (!name)
return -ENOMEM;
@@ -1271,7 +1751,6 @@ static int user_event_parse(struct user_event_group *group, char *name,
struct user_event **newuser)
{
int ret;
- int index;
u32 key;
struct user_event *user;
@@ -1290,12 +1769,7 @@ static int user_event_parse(struct user_event_group *group, char *name,
return 0;
}
- index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS);
-
- if (index == MAX_EVENTS)
- return -EMFILE;
-
- user = kzalloc(sizeof(*user), GFP_KERNEL);
+ user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT);
if (!user)
return -ENOMEM;
@@ -1335,20 +1809,23 @@ static int user_event_parse(struct user_event_group *group, char *name,
mutex_lock(&event_mutex);
+ if (current_user_events >= max_user_events) {
+ ret = -EMFILE;
+ goto put_user_lock;
+ }
+
ret = user_event_trace_register(user);
if (ret)
goto put_user_lock;
- user->index = index;
-
/* Ensure we track self ref and caller ref (2) */
refcount_set(&user->refcnt, 2);
dyn_event_init(&user->devent, &user_event_dops);
dyn_event_add(&user->devent, &user->call);
- set_bit(user->index, group->page_bitmap);
hash_add(group->register_table, &user->node, key);
+ current_user_events++;
mutex_unlock(&event_mutex);
@@ -1398,6 +1875,9 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
return -EFAULT;
+ if (idx < 0)
+ return -EINVAL;
+
rcu_read_lock_sched();
refs = rcu_dereference_sched(info->refs);
@@ -1468,7 +1948,7 @@ static int user_events_open(struct inode *node, struct file *file)
if (!group)
return -ENOENT;
- info = kzalloc(sizeof(*info), GFP_KERNEL);
+ info = kzalloc(sizeof(*info), GFP_KERNEL_ACCOUNT);
if (!info)
return -ENOMEM;
@@ -1521,7 +2001,7 @@ static int user_events_ref_add(struct user_event_file_info *info,
size = struct_size(refs, events, count + 1);
- new_refs = kzalloc(size, GFP_KERNEL);
+ new_refs = kzalloc(size, GFP_KERNEL_ACCOUNT);
if (!new_refs)
return -ENOMEM;
@@ -1564,6 +2044,37 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
if (ret)
return ret;
+ /* Ensure no flags, since we don't support any yet */
+ if (kreg->flags != 0)
+ return -EINVAL;
+
+ /* Ensure supported size */
+ switch (kreg->enable_size) {
+ case 4:
+ /* 32-bit */
+ break;
+#if BITS_PER_LONG >= 64
+ case 8:
+ /* 64-bit */
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ /* Ensure natural alignment */
+ if (kreg->enable_addr % kreg->enable_size)
+ return -EINVAL;
+
+ /* Ensure bit range for size */
+ if (kreg->enable_bit > (kreg->enable_size * BITS_PER_BYTE) - 1)
+ return -EINVAL;
+
+ /* Ensure accessible */
+ if (!access_ok((const void __user *)(uintptr_t)kreg->enable_addr,
+ kreg->enable_size))
+ return -EFAULT;
+
kreg->size = size;
return 0;
@@ -1578,14 +2089,26 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
struct user_reg __user *ureg = (struct user_reg __user *)uarg;
struct user_reg reg;
struct user_event *user;
+ struct user_event_enabler *enabler;
char *name;
long ret;
+ int write_result;
ret = user_reg_get(ureg, &reg);
if (ret)
return ret;
+ /*
+ * Prevent users from using the same address and bit multiple times
+ * within the same mm address space. This can cause unexpected behavior
+ * for user processes that is far easier to debug if this is explictly
+ * an error upon registering.
+ */
+ if (current_user_event_enabler_exists((unsigned long)reg.enable_addr,
+ reg.enable_bit))
+ return -EADDRINUSE;
+
name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
MAX_EVENT_DESC);
@@ -1610,8 +2133,28 @@ static long user_events_ioctl_reg(struct user_event_file_info *info,
if (ret < 0)
return ret;
+ /*
+ * user_events_ref_add succeeded:
+ * At this point we have a user_event, it's lifetime is bound by the
+ * reference count, not this file. If anything fails, the user_event
+ * still has a reference until the file is released. During release
+ * any remaining references (from user_events_ref_add) are decremented.
+ *
+ * Attempt to create an enabler, which too has a lifetime tied in the
+ * same way for the event. Once the task that caused the enabler to be
+ * created exits or issues exec() then the enablers it has created
+ * will be destroyed and the ref to the event will be decremented.
+ */
+ enabler = user_event_enabler_create(&reg, user, &write_result);
+
+ if (!enabler)
+ return -ENOMEM;
+
+ /* Write failed/faulted, give error back to caller */
+ if (write_result)
+ return write_result;
+
put_user((u32)ret, &ureg->write_index);
- put_user(user->index, &ureg->status_bit);
return 0;
}
@@ -1641,6 +2184,114 @@ static long user_events_ioctl_del(struct user_event_file_info *info,
return ret;
}
+static long user_unreg_get(struct user_unreg __user *ureg,
+ struct user_unreg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ if (size < offsetofend(struct user_unreg, disable_addr))
+ return -EINVAL;
+
+ ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+
+ /* Ensure no reserved values, since we don't support any yet */
+ if (kreg->__reserved || kreg->__reserved2)
+ return -EINVAL;
+
+ return ret;
+}
+
+static int user_event_mm_clear_bit(struct user_event_mm *user_mm,
+ unsigned long uaddr, unsigned char bit)
+{
+ struct user_event_enabler enabler;
+ int result;
+ int attempt = 0;
+
+ memset(&enabler, 0, sizeof(enabler));
+ enabler.addr = uaddr;
+ enabler.values = bit;
+retry:
+ /* Prevents state changes from racing with new enablers */
+ mutex_lock(&event_mutex);
+
+ /* Force the bit to be cleared, since no event is attached */
+ mmap_read_lock(user_mm->mm);
+ result = user_event_enabler_write(user_mm, &enabler, false, &attempt);
+ mmap_read_unlock(user_mm->mm);
+
+ mutex_unlock(&event_mutex);
+
+ if (result) {
+ /* Attempt to fault-in and retry if it worked */
+ if (!user_event_mm_fault_in(user_mm, uaddr, attempt))
+ goto retry;
+ }
+
+ return result;
+}
+
+/*
+ * Unregisters an enablement address/bit within a task/user mm.
+ */
+static long user_events_ioctl_unreg(unsigned long uarg)
+{
+ struct user_unreg __user *ureg = (struct user_unreg __user *)uarg;
+ struct user_event_mm *mm = current->user_event_mm;
+ struct user_event_enabler *enabler, *next;
+ struct user_unreg reg;
+ long ret;
+
+ ret = user_unreg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ if (!mm)
+ return -ENOENT;
+
+ ret = -ENOENT;
+
+ /*
+ * Flags freeing and faulting are used to indicate if the enabler is in
+ * use at all. When faulting is set a page-fault is occurring asyncly.
+ * During async fault if freeing is set, the enabler will be destroyed.
+ * If no async fault is happening, we can destroy it now since we hold
+ * the event_mutex during these checks.
+ */
+ mutex_lock(&event_mutex);
+
+ list_for_each_entry_safe(enabler, next, &mm->enablers, link)
+ if (enabler->addr == reg.disable_addr &&
+ (enabler->values & ENABLE_VAL_BIT_MASK) == reg.disable_bit) {
+ set_bit(ENABLE_VAL_FREEING_BIT, ENABLE_BITOPS(enabler));
+
+ if (!test_bit(ENABLE_VAL_FAULTING_BIT, ENABLE_BITOPS(enabler)))
+ user_event_enabler_destroy(enabler);
+
+ /* Removed at least one */
+ ret = 0;
+ }
+
+ mutex_unlock(&event_mutex);
+
+ /* Ensure bit is now cleared for user, regardless of event status */
+ if (!ret)
+ ret = user_event_mm_clear_bit(mm, reg.disable_addr,
+ reg.disable_bit);
+
+ return ret;
+}
+
/*
* Handles the ioctl from user mode to register or alter operations.
*/
@@ -1663,6 +2314,12 @@ static long user_events_ioctl(struct file *file, unsigned int cmd,
ret = user_events_ioctl_del(info, uarg);
mutex_unlock(&group->reg_mutex);
break;
+
+ case DIAG_IOCSUNREG:
+ mutex_lock(&group->reg_mutex);
+ ret = user_events_ioctl_unreg(uarg);
+ mutex_unlock(&group->reg_mutex);
+ break;
}
return ret;
@@ -1718,45 +2375,13 @@ out:
}
static const struct file_operations user_data_fops = {
- .open = user_events_open,
- .write = user_events_write,
- .write_iter = user_events_write_iter,
+ .open = user_events_open,
+ .write = user_events_write,
+ .write_iter = user_events_write_iter,
.unlocked_ioctl = user_events_ioctl,
- .release = user_events_release,
+ .release = user_events_release,
};
-static struct user_event_group *user_status_group(struct file *file)
-{
- struct seq_file *m = file->private_data;
-
- if (!m)
- return NULL;
-
- return m->private;
-}
-
-/*
- * Maps the shared page into the user process for checking if event is enabled.
- */
-static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
-{
- char *pages;
- struct user_event_group *group = user_status_group(file);
- unsigned long size = vma->vm_end - vma->vm_start;
-
- if (size != MAX_BYTES)
- return -EINVAL;
-
- if (!group)
- return -EINVAL;
-
- pages = group->register_page_data;
-
- return remap_pfn_range(vma, vma->vm_start,
- virt_to_phys(pages) >> PAGE_SHIFT,
- size, vm_get_page_prot(VM_READ));
-}
-
static void *user_seq_start(struct seq_file *m, loff_t *pos)
{
if (*pos)
@@ -1780,7 +2405,7 @@ static int user_seq_show(struct seq_file *m, void *p)
struct user_event_group *group = m->private;
struct user_event *user;
char status;
- int i, active = 0, busy = 0, flags;
+ int i, active = 0, busy = 0;
if (!group)
return -EINVAL;
@@ -1789,11 +2414,10 @@ static int user_seq_show(struct seq_file *m, void *p)
hash_for_each(group->register_table, i, user, node) {
status = user->status;
- flags = user->flags;
- seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+ seq_printf(m, "%s", EVENT_NAME(user));
- if (flags != 0 || status != 0)
+ if (status != 0)
seq_puts(m, " #");
if (status != 0) {
@@ -1816,16 +2440,15 @@ static int user_seq_show(struct seq_file *m, void *p)
seq_puts(m, "\n");
seq_printf(m, "Active: %d\n", active);
seq_printf(m, "Busy: %d\n", busy);
- seq_printf(m, "Max: %ld\n", MAX_EVENTS);
return 0;
}
static const struct seq_operations user_seq_ops = {
- .start = user_seq_start,
- .next = user_seq_next,
- .stop = user_seq_stop,
- .show = user_seq_show,
+ .start = user_seq_start,
+ .next = user_seq_next,
+ .stop = user_seq_stop,
+ .show = user_seq_show,
};
static int user_status_open(struct inode *node, struct file *file)
@@ -1851,11 +2474,10 @@ static int user_status_open(struct inode *node, struct file *file)
}
static const struct file_operations user_status_fops = {
- .open = user_status_open,
- .mmap = user_status_mmap,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
+ .open = user_status_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
};
/*
@@ -1873,8 +2495,7 @@ static int create_user_tracefs(void)
goto err;
}
- /* mmap with MAP_SHARED requires writable fd */
- emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+ emmap = tracefs_create_file("user_events_status", TRACE_MODE_READ,
NULL, NULL, &user_status_fops);
if (!emmap) {
@@ -1888,20 +2509,53 @@ err:
return -ENODEV;
}
+static int set_max_user_events_sysctl(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ ret = proc_douintvec(table, write, buffer, lenp, ppos);
+
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static struct ctl_table user_event_sysctls[] = {
+ {
+ .procname = "user_events_max",
+ .data = &max_user_events,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = set_max_user_events_sysctl,
+ },
+ {}
+};
+
static int __init trace_events_user_init(void)
{
int ret;
+ fault_cache = KMEM_CACHE(user_event_enabler_fault, 0);
+
+ if (!fault_cache)
+ return -ENOMEM;
+
init_group = user_event_group_create(&init_user_ns);
- if (!init_group)
+ if (!init_group) {
+ kmem_cache_destroy(fault_cache);
return -ENOMEM;
+ }
ret = create_user_tracefs();
if (ret) {
pr_warn("user_events could not register with tracefs\n");
user_event_group_destroy(init_group);
+ kmem_cache_destroy(fault_cache);
init_group = NULL;
return ret;
}
@@ -1909,6 +2563,8 @@ static int __init trace_events_user_init(void)
if (dyn_event_register(&user_event_dops))
pr_warn("user_events could not register with dyn_events\n");
+ register_sysctl_init("kernel", user_event_sysctls);
+
return 0;
}
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index d440ddd5fd8b..2f37a6e68aa9 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -339,7 +339,7 @@ static void move_to_next_cpu(void)
cpumask_clear(current_mask);
cpumask_set_cpu(next_cpu, current_mask);
- sched_setaffinity(0, current_mask);
+ set_cpus_allowed_ptr(current, current_mask);
return;
change_mode:
@@ -446,7 +446,7 @@ static int start_single_kthread(struct trace_array *tr)
}
- sched_setaffinity(kthread->pid, current_mask);
+ set_cpus_allowed_ptr(kthread, current_mask);
kdata->kthread = kthread;
wake_up_process(kthread);
@@ -492,6 +492,10 @@ static int start_cpu_kthread(unsigned int cpu)
{
struct task_struct *kthread;
+ /* Do not start a new hwlatd thread if it is already running */
+ if (per_cpu(hwlat_per_cpu_data, cpu).kthread)
+ return 0;
+
kthread = kthread_run_on_cpu(kthread_fn, NULL, cpu, "hwlatd/%u");
if (IS_ERR(kthread)) {
pr_err(BANNER "could not start sampling thread\n");
@@ -584,9 +588,6 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
*/
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
- for_each_online_cpu(cpu)
- per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;
-
for_each_cpu(cpu, current_mask) {
retval = start_cpu_kthread(cpu);
if (retval)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 04f0fdae19a1..efbbec2caff8 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -159,7 +159,7 @@ static void osnoise_unregister_instance(struct trace_array *tr)
if (!found)
return;
- kvfree_rcu(inst);
+ kvfree_rcu_mightsleep(inst);
}
/*
@@ -217,7 +217,7 @@ struct osnoise_variables {
/*
* Per-cpu runtime information.
*/
-DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var);
+static DEFINE_PER_CPU(struct osnoise_variables, per_cpu_osnoise_var);
/*
* this_cpu_osn_var - Return the per-cpu osnoise_variables on its relative CPU
@@ -240,7 +240,7 @@ struct timerlat_variables {
u64 count;
};
-DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var);
+static DEFINE_PER_CPU(struct timerlat_variables, per_cpu_timerlat_var);
/*
* this_cpu_tmr_var - Return the per-cpu timerlat_variables on its relative CPU
@@ -332,7 +332,7 @@ struct timerlat_sample {
/*
* Protect the interface.
*/
-struct mutex interface_lock;
+static struct mutex interface_lock;
/*
* Tracer data.
@@ -1296,7 +1296,7 @@ static void notify_new_max_latency(u64 latency)
rcu_read_lock();
list_for_each_entry_rcu(inst, &osnoise_instances, list) {
tr = inst->tr;
- if (tr->max_latency < latency) {
+ if (tracer_tracing_is_on(tr) && tr->max_latency < latency) {
tr->max_latency = latency;
latency_fsnotify(tr);
}
@@ -1738,6 +1738,8 @@ static int timerlat_main(void *data)
trace_timerlat_sample(&s);
+ notify_new_max_latency(diff);
+
timerlat_dump_stack(time_to_us(diff));
tlat->tracing_thread = false;
@@ -2239,8 +2241,8 @@ static struct trace_min_max_param osnoise_print_stack = {
/*
* osnoise/timerlat_period: min 100 us, max 1 s
*/
-u64 timerlat_min_period = 100;
-u64 timerlat_max_period = 1000000;
+static u64 timerlat_min_period = 100;
+static u64 timerlat_max_period = 1000000;
static struct trace_min_max_param timerlat_period = {
.lock = &interface_lock,
.val = &osnoise_data.timerlat_period,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bd475a00f96d..15f05faaae44 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -221,8 +221,11 @@ trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
const char *ret = trace_seq_buffer_ptr(p);
const char *fmt = concatenate ? "%*phN" : "%*ph";
- for (i = 0; i < buf_len; i += 16)
+ for (i = 0; i < buf_len; i += 16) {
+ if (!concatenate && i != 0)
+ trace_seq_putc(p, ' ');
trace_seq_printf(p, fmt, min(buf_len - i, 16), &buf[i]);
+ }
trace_seq_putc(p, 0);
return ret;
@@ -808,6 +811,176 @@ EXPORT_SYMBOL_GPL(unregister_trace_event);
* Standard events
*/
+static void print_array(struct trace_iterator *iter, void *pos,
+ struct ftrace_event_field *field)
+{
+ int offset;
+ int len;
+ int i;
+
+ offset = *(int *)pos & 0xffff;
+ len = *(int *)pos >> 16;
+
+ if (field)
+ offset += field->offset + sizeof(int);
+
+ if (offset + len > iter->ent_size) {
+ trace_seq_puts(&iter->seq, "<OVERFLOW>");
+ return;
+ }
+
+ pos = (void *)iter->ent + offset;
+
+ for (i = 0; i < len; i++, pos++) {
+ if (i)
+ trace_seq_putc(&iter->seq, ',');
+ trace_seq_printf(&iter->seq, "%02x", *(unsigned char *)pos);
+ }
+}
+
+static void print_fields(struct trace_iterator *iter, struct trace_event_call *call,
+ struct list_head *head)
+{
+ struct ftrace_event_field *field;
+ int offset;
+ int len;
+ int ret;
+ void *pos;
+
+ list_for_each_entry(field, head, link) {
+ trace_seq_printf(&iter->seq, " %s=", field->name);
+ if (field->offset + field->size > iter->ent_size) {
+ trace_seq_puts(&iter->seq, "<OVERFLOW>");
+ continue;
+ }
+ pos = (void *)iter->ent + field->offset;
+
+ switch (field->filter_type) {
+ case FILTER_COMM:
+ case FILTER_STATIC_STRING:
+ trace_seq_printf(&iter->seq, "%.*s", field->size, (char *)pos);
+ break;
+ case FILTER_RDYN_STRING:
+ case FILTER_DYN_STRING:
+ offset = *(int *)pos & 0xffff;
+ len = *(int *)pos >> 16;
+
+ if (field->filter_type == FILTER_RDYN_STRING)
+ offset += field->offset + sizeof(int);
+
+ if (offset + len > iter->ent_size) {
+ trace_seq_puts(&iter->seq, "<OVERFLOW>");
+ break;
+ }
+ pos = (void *)iter->ent + offset;
+ trace_seq_printf(&iter->seq, "%.*s", len, (char *)pos);
+ break;
+ case FILTER_PTR_STRING:
+ if (!iter->fmt_size)
+ trace_iter_expand_format(iter);
+ pos = *(void **)pos;
+ ret = strncpy_from_kernel_nofault(iter->fmt, pos,
+ iter->fmt_size);
+ if (ret < 0)
+ trace_seq_printf(&iter->seq, "(0x%px)", pos);
+ else
+ trace_seq_printf(&iter->seq, "(0x%px:%s)",
+ pos, iter->fmt);
+ break;
+ case FILTER_TRACE_FN:
+ pos = *(void **)pos;
+ trace_seq_printf(&iter->seq, "%pS", pos);
+ break;
+ case FILTER_CPU:
+ case FILTER_OTHER:
+ switch (field->size) {
+ case 1:
+ if (isprint(*(char *)pos)) {
+ trace_seq_printf(&iter->seq, "'%c'",
+ *(unsigned char *)pos);
+ }
+ trace_seq_printf(&iter->seq, "(%d)",
+ *(unsigned char *)pos);
+ break;
+ case 2:
+ trace_seq_printf(&iter->seq, "0x%x (%d)",
+ *(unsigned short *)pos,
+ *(unsigned short *)pos);
+ break;
+ case 4:
+ /* dynamic array info is 4 bytes */
+ if (strstr(field->type, "__data_loc")) {
+ print_array(iter, pos, NULL);
+ break;
+ }
+
+ if (strstr(field->type, "__rel_loc")) {
+ print_array(iter, pos, field);
+ break;
+ }
+
+ trace_seq_printf(&iter->seq, "0x%x (%d)",
+ *(unsigned int *)pos,
+ *(unsigned int *)pos);
+ break;
+ case 8:
+ trace_seq_printf(&iter->seq, "0x%llx (%lld)",
+ *(unsigned long long *)pos,
+ *(unsigned long long *)pos);
+ break;
+ default:
+ trace_seq_puts(&iter->seq, "<INVALID-SIZE>");
+ break;
+ }
+ break;
+ default:
+ trace_seq_puts(&iter->seq, "<INVALID-TYPE>");
+ }
+ }
+ trace_seq_putc(&iter->seq, '\n');
+}
+
+enum print_line_t print_event_fields(struct trace_iterator *iter,
+ struct trace_event *event)
+{
+ struct trace_event_call *call;
+ struct list_head *head;
+
+ /* ftrace defined events have separate call structures */
+ if (event->type <= __TRACE_LAST_TYPE) {
+ bool found = false;
+
+ down_read(&trace_event_sem);
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call->event.type == event->type) {
+ found = true;
+ break;
+ }
+ /* No need to search all events */
+ if (call->event.type > __TRACE_LAST_TYPE)
+ break;
+ }
+ up_read(&trace_event_sem);
+ if (!found) {
+ trace_seq_printf(&iter->seq, "UNKNOWN TYPE %d\n", event->type);
+ goto out;
+ }
+ } else {
+ call = container_of(event, struct trace_event_call, event);
+ }
+ head = trace_get_fields(call);
+
+ trace_seq_printf(&iter->seq, "%s:", trace_event_name(call));
+
+ if (head && !list_empty(head))
+ print_fields(iter, call, head);
+ else
+ trace_seq_puts(&iter->seq, "No fields found\n");
+
+ out:
+ return trace_handle_return(&iter->seq);
+}
+
enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 4c954636caf0..dca40f1f1da4 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -19,6 +19,8 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset);
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
+extern enum print_line_t print_event_fields(struct trace_iterator *iter,
+ struct trace_event *event);
extern void trace_event_read_lock(void);
extern void trace_event_read_unlock(void);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 20d0c4a97633..2d2616678295 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1172,7 +1172,7 @@ int trace_probe_remove_file(struct trace_probe *tp,
return -ENOENT;
list_del_rcu(&link->list);
- kvfree_rcu(link);
+ kvfree_rcu_mightsleep(link);
if (list_empty(&tp->event->files))
trace_probe_clear_flag(tp, TP_FLAG_TRACE);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index ff0536cea968..a931d9aaea26 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -785,14 +785,7 @@ static struct fgraph_ops fgraph_ops __initdata = {
};
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-#ifndef CALL_DEPTH_ACCOUNT
-#define CALL_DEPTH_ACCOUNT ""
-#endif
-
-noinline __noclone static void trace_direct_tramp(void)
-{
- asm(CALL_DEPTH_ACCOUNT);
-}
+static struct ftrace_ops direct;
#endif
/*
@@ -870,8 +863,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* Register direct function together with graph tracer
* and make sure we get graph trace.
*/
- ret = register_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME,
- (unsigned long) trace_direct_tramp);
+ ftrace_set_filter_ip(&direct, (unsigned long)DYN_FTRACE_TEST_NAME, 0, 0);
+ ret = register_ftrace_direct(&direct,
+ (unsigned long)ftrace_stub_direct_tramp);
if (ret)
goto out;
@@ -891,8 +885,9 @@ trace_selftest_startup_function_graph(struct tracer *trace,
unregister_ftrace_graph(&fgraph_ops);
- ret = unregister_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME,
- (unsigned long) trace_direct_tramp);
+ ret = unregister_ftrace_direct(&direct,
+ (unsigned long)ftrace_stub_direct_tramp,
+ true);
if (ret)
goto out;
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index f50398cb790d..019e3a1566cf 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -123,15 +123,6 @@ static struct ctl_table uts_kern_table[] = {
{}
};
-static struct ctl_table uts_root_table[] = {
- {
- .procname = "kernel",
- .mode = 0555,
- .child = uts_kern_table,
- },
- {}
-};
-
#ifdef CONFIG_PROC_SYSCTL
/*
* Notify userspace about a change in a certain entry of uts_kern_table,
@@ -147,7 +138,7 @@ void uts_proc_notify(enum uts_proc proc)
static int __init utsname_sysctl_init(void)
{
- register_sysctl_table(uts_root_table);
+ register_sysctl("kernel", uts_kern_table);
return 0;
}
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
new file mode 100644
index 000000000000..b7cbd66f889e
--- /dev/null
+++ b/kernel/vhost_task.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Oracle Corporation
+ */
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/sched/task.h>
+#include <linux/sched/vhost_task.h>
+#include <linux/sched/signal.h>
+
+enum vhost_task_flags {
+ VHOST_TASK_FLAGS_STOP,
+};
+
+static int vhost_task_fn(void *data)
+{
+ struct vhost_task *vtsk = data;
+ int ret;
+
+ ret = vtsk->fn(vtsk->data);
+ complete(&vtsk->exited);
+ do_exit(ret);
+}
+
+/**
+ * vhost_task_stop - stop a vhost_task
+ * @vtsk: vhost_task to stop
+ *
+ * Callers must call vhost_task_should_stop and return from their worker
+ * function when it returns true;
+ */
+void vhost_task_stop(struct vhost_task *vtsk)
+{
+ pid_t pid = vtsk->task->pid;
+
+ set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+ wake_up_process(vtsk->task);
+ /*
+ * Make sure vhost_task_fn is no longer accessing the vhost_task before
+ * freeing it below. If userspace crashed or exited without closing,
+ * then the vhost_task->task could already be marked dead so
+ * kernel_wait will return early.
+ */
+ wait_for_completion(&vtsk->exited);
+ /*
+ * If we are just closing/removing a device and the parent process is
+ * not exiting then reap the task.
+ */
+ kernel_wait4(pid, NULL, __WCLONE, NULL);
+ kfree(vtsk);
+}
+EXPORT_SYMBOL_GPL(vhost_task_stop);
+
+/**
+ * vhost_task_should_stop - should the vhost task return from the work function
+ * @vtsk: vhost_task to stop
+ */
+bool vhost_task_should_stop(struct vhost_task *vtsk)
+{
+ return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+}
+EXPORT_SYMBOL_GPL(vhost_task_should_stop);
+
+/**
+ * vhost_task_create - create a copy of a process to be used by the kernel
+ * @fn: thread stack
+ * @arg: data to be passed to fn
+ * @name: the thread's name
+ *
+ * This returns a specialized task for use by the vhost layer or NULL on
+ * failure. The returned task is inactive, and the caller must fire it up
+ * through vhost_task_start().
+ */
+struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+ const char *name)
+{
+ struct kernel_clone_args args = {
+ .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+ .exit_signal = 0,
+ .fn = vhost_task_fn,
+ .name = name,
+ .user_worker = 1,
+ .no_files = 1,
+ .ignore_signals = 1,
+ };
+ struct vhost_task *vtsk;
+ struct task_struct *tsk;
+
+ vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL);
+ if (!vtsk)
+ return NULL;
+ init_completion(&vtsk->exited);
+ vtsk->data = arg;
+ vtsk->fn = fn;
+
+ args.fn_arg = vtsk;
+
+ tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
+ if (IS_ERR(tsk)) {
+ kfree(vtsk);
+ return NULL;
+ }
+
+ vtsk->task = tsk;
+ return vtsk;
+}
+EXPORT_SYMBOL_GPL(vhost_task_create);
+
+/**
+ * vhost_task_start - start a vhost_task created with vhost_task_create
+ * @vtsk: vhost_task to wake up
+ */
+void vhost_task_start(struct vhost_task *vtsk)
+{
+ wake_up_new_task(vtsk->task);
+}
+EXPORT_SYMBOL_GPL(vhost_task_start);
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index f10f403104e7..e91cb4c2833f 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -29,7 +29,6 @@
MODULE_DESCRIPTION("Watch queue");
MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
#define WATCH_QUEUE_NOTE_SIZE 128
#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b8b541caed48..4666a1a92a31 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -49,6 +49,7 @@
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
+#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
@@ -141,6 +142,8 @@ enum {
* WR: wq->mutex protected for writes. RCU protected for reads.
*
* MD: wq_mayday_lock protected.
+ *
+ * WD: Used internally by the watchdog.
*/
/* struct worker is defined in workqueue_internal.h */
@@ -153,6 +156,7 @@ struct worker_pool {
unsigned int flags; /* X: flags */
unsigned long watchdog_ts; /* L: watchdog timestamp */
+ bool cpu_stall; /* WD: stalled cpu bound pool */
/*
* The counter is incremented in a process context on the associated CPU
@@ -1392,15 +1396,13 @@ static bool is_chained_work(struct workqueue_struct *wq)
*/
static int wq_select_unbound_cpu(int cpu)
{
- static bool printed_dbg_warning;
int new_cpu;
if (likely(!wq_debug_force_rr_cpu)) {
if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
return cpu;
- } else if (!printed_dbg_warning) {
- pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
- printed_dbg_warning = true;
+ } else {
+ pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
}
if (cpumask_empty(wq_unbound_cpumask))
@@ -1938,12 +1940,17 @@ static struct worker *create_worker(struct worker_pool *pool)
/* ID is needed to determine kthread name */
id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
- if (id < 0)
+ if (id < 0) {
+ pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
+ ERR_PTR(id));
return NULL;
+ }
worker = alloc_worker(pool->node);
- if (!worker)
+ if (!worker) {
+ pr_err_once("workqueue: Failed to allocate a worker\n");
goto fail;
+ }
worker->id = id;
@@ -1955,8 +1962,16 @@ static struct worker *create_worker(struct worker_pool *pool)
worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
"kworker/%s", id_buf);
- if (IS_ERR(worker->task))
+ if (IS_ERR(worker->task)) {
+ if (PTR_ERR(worker->task) == -EINTR) {
+ pr_err("workqueue: Interrupted when creating a worker thread \"kworker/%s\"\n",
+ id_buf);
+ } else {
+ pr_err_once("workqueue: Failed to create a worker thread: %pe",
+ worker->task);
+ }
goto fail;
+ }
set_user_nice(worker->task, pool->attrs->nice);
kthread_bind_mask(worker->task, pool->attrs->cpumask);
@@ -4380,13 +4395,18 @@ static int init_rescuer(struct workqueue_struct *wq)
return 0;
rescuer = alloc_worker(NUMA_NO_NODE);
- if (!rescuer)
+ if (!rescuer) {
+ pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
+ wq->name);
return -ENOMEM;
+ }
rescuer->rescue_wq = wq;
rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name);
if (IS_ERR(rescuer->task)) {
ret = PTR_ERR(rescuer->task);
+ pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe",
+ wq->name, ERR_PTR(ret));
kfree(rescuer);
return ret;
}
@@ -5002,10 +5022,16 @@ static void show_one_worker_pool(struct worker_pool *pool)
struct worker *worker;
bool first = true;
unsigned long flags;
+ unsigned long hung = 0;
raw_spin_lock_irqsave(&pool->lock, flags);
if (pool->nr_workers == pool->nr_idle)
goto next_pool;
+
+ /* How long the first pending work is waiting for a worker. */
+ if (!list_empty(&pool->worklist))
+ hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
+
/*
* Defer printing to avoid deadlocks in console drivers that
* queue work while holding locks also taken in their write
@@ -5014,9 +5040,7 @@ static void show_one_worker_pool(struct worker_pool *pool)
printk_deferred_enter();
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
- pr_cont(" hung=%us workers=%d",
- jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
- pool->nr_workers);
+ pr_cont(" hung=%lus workers=%d", hung, pool->nr_workers);
if (pool->manager)
pr_cont(" manager: %d",
task_pid_nr(pool->manager->task));
@@ -5041,8 +5065,7 @@ next_pool:
/**
* show_all_workqueues - dump workqueue state
*
- * Called from a sysrq handler or try_to_freeze_tasks() and prints out
- * all busy workqueues and pools.
+ * Called from a sysrq handler and prints out all busy workqueues and pools.
*/
void show_all_workqueues(void)
{
@@ -5063,6 +5086,29 @@ void show_all_workqueues(void)
rcu_read_unlock();
}
+/**
+ * show_freezable_workqueues - dump freezable workqueue state
+ *
+ * Called from try_to_freeze_tasks() and prints out all freezable workqueues
+ * still busy.
+ */
+void show_freezable_workqueues(void)
+{
+ struct workqueue_struct *wq;
+
+ rcu_read_lock();
+
+ pr_info("Showing freezable workqueues that are still busy:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list) {
+ if (!(wq->flags & WQ_FREEZABLE))
+ continue;
+ show_one_workqueue(wq);
+ }
+
+ rcu_read_unlock();
+}
+
/* used to show worker information through /proc/PID/{comm,stat,status} */
void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
{
@@ -5826,13 +5872,19 @@ static struct device_attribute wq_sysfs_cpumask_attr =
static int __init wq_sysfs_init(void)
{
+ struct device *dev_root;
int err;
err = subsys_virtual_register(&wq_subsys, NULL);
if (err)
return err;
- return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
+ dev_root = bus_get_dev_root(&wq_subsys);
+ if (dev_root) {
+ err = device_create_file(dev_root, &wq_sysfs_cpumask_attr);
+ put_device(dev_root);
+ }
+ return err;
}
core_initcall(wq_sysfs_init);
@@ -5956,6 +6008,57 @@ static struct timer_list wq_watchdog_timer;
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
+/*
+ * Show workers that might prevent the processing of pending work items.
+ * The only candidates are CPU-bound workers in the running state.
+ * Pending work items should be handled by another idle worker
+ * in all other situations.
+ */
+static void show_cpu_pool_hog(struct worker_pool *pool)
+{
+ struct worker *worker;
+ unsigned long flags;
+ int bkt;
+
+ raw_spin_lock_irqsave(&pool->lock, flags);
+
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (task_is_running(worker->task)) {
+ /*
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
+ */
+ printk_deferred_enter();
+
+ pr_info("pool %d:\n", pool->id);
+ sched_show_task(worker->task);
+
+ printk_deferred_exit();
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void show_cpu_pools_hogs(void)
+{
+ struct worker_pool *pool;
+ int pi;
+
+ pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
+
+ rcu_read_lock();
+
+ for_each_pool(pool, pi) {
+ if (pool->cpu_stall)
+ show_cpu_pool_hog(pool);
+
+ }
+
+ rcu_read_unlock();
+}
+
static void wq_watchdog_reset_touched(void)
{
int cpu;
@@ -5969,6 +6072,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
{
unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
bool lockup_detected = false;
+ bool cpu_pool_stall = false;
unsigned long now = jiffies;
struct worker_pool *pool;
int pi;
@@ -5981,6 +6085,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
for_each_pool(pool, pi) {
unsigned long pool_ts, touched, ts;
+ pool->cpu_stall = false;
if (list_empty(&pool->worklist))
continue;
@@ -6005,11 +6110,17 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
/* did we stall? */
if (time_after(now, ts + thresh)) {
lockup_detected = true;
+ if (pool->cpu >= 0) {
+ pool->cpu_stall = true;
+ cpu_pool_stall = true;
+ }
pr_emerg("BUG: workqueue lockup - pool");
pr_cont_pool_info(pool);
pr_cont(" stuck for %us!\n",
jiffies_to_msecs(now - pool_ts) / 1000);
}
+
+
}
rcu_read_unlock();
@@ -6017,6 +6128,9 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
if (lockup_detected)
show_all_workqueues();
+ if (cpu_pool_stall)
+ show_cpu_pools_hogs();
+
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);
}